/*
 * Complex matrix-vector driver, "transposed" form: for every one of the n
 * columns of a, the column is multiplied element-wise with x, summed, scaled
 * by alpha and added to the matching element of y.
 *
 * All complex data is stored as interleaved (real, imaginary) FLOAT pairs.
 *
 *   m, n              number of rows / columns of a; nothing is done when
 *                     either is < 1.
 *   dummy1            unused.
 *   alpha_r, alpha_i  real and imaginary part of the scale factor.
 *   a, lda            matrix data and distance between columns, in complex
 *                     elements (doubled internally to count FLOATs).
 *   x, inc_x          input vector (m entries) and its stride in complex
 *                     elements.
 *   y, inc_y          vector that is updated in place (n entries) and its
 *                     stride in complex elements.
 *   buffer            scratch space handed to copy_x() when x is not
 *                     contiguous.  NOTE(review): presumably must hold at
 *                     least NBMAX complex values - confirm against copy_x.
 *
 * The compile-time macros CONJ / XCONJ select the product form:
 *   - neither or both defined: plain complex product a * x,
 *   - exactly one defined:     (ar*xr + ai*xi, ar*xi - ai*xr),
 *   - XCONJ defined:           the scaled sum is added with its imaginary
 *                              part negated before scaling by alpha.
 *
 * Rows are consumed in blocks of NBMAX (the last block holds the remaining
 * multiple of four rows) through zgemv_kernel_4x4 / 4x2 / 4x1; the final
 * m & 3 rows are handled by the scalar code at the end.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    BLASLONG j;
    BLASLONG k;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[8];            /* only entries 0..3 are set by this driver */
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4;
    FLOAT ybuffer[8], *xbuffer;
    FLOAT alpha[2];

    if ( m < 1 ) return(0);
    if ( n < 1 ) return(0);

    /* Convert strides from complex elements to FLOATs. */
    inc_x <<= 1;
    inc_y <<= 1;
    lda   <<= 1;
    lda4    = lda << 2;

    xbuffer = buffer;

    n1 = n >> 2;                      /* groups of four columns           */
    n2 = n &  3;                      /* leftover columns                 */

    m3 = m & 3;                       /* rows left for the scalar tail    */
    m1 = m - m3;                      /* rows handled by the kernels      */
    m2 = (m & (NBMAX-1)) - m3;        /* size of the final partial block  */

    alpha[0] = alpha_r;
    alpha[1] = alpha_i;

    BLASLONG NB = NBMAX;

    /* The loop ends after the first block that is smaller than NBMAX. */
    while ( NB == NBMAX )
    {
        m1 -= NB;
        if ( m1 < 0 )
        {
            if ( m2 == 0 ) break;
            NB = m2;
        }

        y_ptr = y;
        a_ptr = a;
        x_ptr = x;

        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        /* The kernels are always given a unit-stride x block. */
        if ( inc_x != 2 )
            copy_x(NB,x_ptr,xbuffer,inc_x);
        else
            xbuffer = x_ptr;

        if ( inc_y == 2 )
        {
            /* Contiguous y: the kernels are pointed directly at y. */
            for( i = 0; i < n1 ; i++)
            {
                zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
                y_ptr += 8;
            }

            if ( n2 & 2 )
            {
                zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha);
                a_ptr += lda * 2;
                y_ptr += 4;
            }

            if ( n2 & 1 )
            {
                zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha);
                a_ptr += lda;
                y_ptr += 2;
            }
        }
        else
        {
            /*
             * Strided y: run the kernels on a zeroed local buffer and add
             * its contents to y element by element.
             */
            for( i = 0; i < n1 ; i++)
            {
                /* sizeof keeps the clear in step with the array's real size */
                memset(ybuffer,0,sizeof(ybuffer));
                zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;

                for( k = 0; k < 4; k++ )
                {
                    y_ptr[0] += ybuffer[2 * k];
                    y_ptr[1] += ybuffer[2 * k + 1];
                    y_ptr    += inc_y;
                }
            }

            for( i = 0; i < n2 ; i++)
            {
                memset(ybuffer,0,sizeof(ybuffer));
                zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
                a_ptr += lda;
                y_ptr[0] += ybuffer[0];
                y_ptr[1] += ybuffer[1];
                y_ptr += inc_y;
            }
        }

        /* Advance to the next block of rows. */
        a += 2 * NB;
        x += NB * inc_x;
    }

    if ( m3 == 0 ) return(0);

    /* Scalar tail: the last m3 (1..3) rows, one pass over all n columns. */
    x_ptr = x;
    j = 0;
    a_ptr = a;
    y_ptr = y;

    if ( m3 == 3 )
    {
        FLOAT temp_r;
        FLOAT temp_i;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x2 = x_ptr[0];
        FLOAT x3 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x4 = x_ptr[0];
        FLOAT x5 = x_ptr[1];

        while ( j < n )
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r  = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i  = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
            temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
            temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else
            temp_r  = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i  = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
            temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
            temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif

#if !defined(XCONJ)
            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }
        return(0);
    }

    if ( m3 == 2 )
    {
        FLOAT temp_r;
        FLOAT temp_i;
        FLOAT temp_r1;
        FLOAT temp_i1;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x2 = x_ptr[0];
        FLOAT x3 = x_ptr[1];
        FLOAT ar = alpha[0];
        FLOAT ai = alpha[1];

        /* Two columns per iteration. */
        while ( j < ( n & -2 ))
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r   = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i   = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r  += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i  += a_ptr[2] * x3 + a_ptr[3] * x2;
            a_ptr   += lda;
            temp_r1  = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i1  = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
            temp_r   = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i   = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r  += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i  += a_ptr[2] * x3 - a_ptr[3] * x2;
            a_ptr   += lda;
            temp_r1  = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i1  = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
            y_ptr[0] += ar * temp_r - ai * temp_i;
            y_ptr[1] += ar * temp_i + ai * temp_r;
            y_ptr    += inc_y;
            y_ptr[0] += ar * temp_r1 - ai * temp_i1;
            y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
            y_ptr[0] += ar * temp_r + ai * temp_i;
            y_ptr[1] -= ar * temp_i - ai * temp_r;
            y_ptr    += inc_y;
            y_ptr[0] += ar * temp_r1 + ai * temp_i1;
            y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j += 2;
        }

        /* Odd final column, if any. */
        while ( j < n )
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r  = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i  = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
            temp_r  = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i  = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
            y_ptr[0] += ar * temp_r - ai * temp_i;
            y_ptr[1] += ar * temp_i + ai * temp_r;
#else
            y_ptr[0] += ar * temp_r + ai * temp_i;
            y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }
        return(0);
    }

    if ( m3 == 1 )
    {
        FLOAT temp_r;
        FLOAT temp_i;
        FLOAT temp_r1;
        FLOAT temp_i1;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        FLOAT ar = alpha[0];
        FLOAT ai = alpha[1];

        /* Two columns per iteration. */
        while ( j < ( n & -2 ))
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r  = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i  = a_ptr[0] * x1 + a_ptr[1] * x0;
            a_ptr  += lda;
            temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
            temp_r  = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i  = a_ptr[0] * x1 - a_ptr[1] * x0;
            a_ptr  += lda;
            temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
            y_ptr[0] += ar * temp_r - ai * temp_i;
            y_ptr[1] += ar * temp_i + ai * temp_r;
            y_ptr    += inc_y;
            y_ptr[0] += ar * temp_r1 - ai * temp_i1;
            y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
            y_ptr[0] += ar * temp_r + ai * temp_i;
            y_ptr[1] -= ar * temp_i - ai * temp_r;
            y_ptr    += inc_y;
            y_ptr[0] += ar * temp_r1 + ai * temp_i1;
            y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j += 2;
        }

        /* Odd final column, if any. */
        while ( j < n )
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
            y_ptr[0] += ar * temp_r - ai * temp_i;
            y_ptr[1] += ar * temp_i + ai * temp_r;
#else
            y_ptr[0] += ar * temp_r + ai * temp_i;
            y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }
        return(0);
    }

    return(0);
}
/*
 * Complex matrix-vector driver, "non-transposed" form: every one of the m
 * rows of a is multiplied element-wise with x, summed over the n columns,
 * scaled by alpha and added to the matching element of y.
 *
 * All complex data is stored as interleaved (real, imaginary) FLOAT pairs.
 *
 *   m, n              number of rows / columns of a; nothing is done when
 *                     either is < 1.
 *   dummy1            unused.
 *   alpha_r, alpha_i  real and imaginary part of the scale factor.
 *   a, lda            matrix data and distance between columns, in complex
 *                     elements (doubled internally to count FLOATs).
 *   x, inc_x          input vector (n entries) and its stride in complex
 *                     elements.
 *   y, inc_y          vector that is updated in place (m entries) and its
 *                     stride in complex elements.
 *   buffer            scratch space used as the per-block accumulator; the
 *                     first NB complex values are cleared for every block.
 *                     NOTE(review): presumably must hold at least NBMAX
 *                     complex values - confirm against the callers.
 *
 * The compile-time macros CONJ / XCONJ select the product form used by the
 * scalar tail:
 *   - neither or both defined: plain complex product a * x,
 *   - exactly one defined:     (ar*xr + ai*xi, ar*xi - ai*xr),
 *   - XCONJ defined:           the sum is added with its imaginary part
 *                              negated before scaling by alpha.
 *
 * Rows are consumed in blocks of NBMAX (the last block holds the remaining
 * multiple of four rows) through zgemv_kernel_4x4 / 4x2 / 4x1 followed by
 * add_y(); the final m % 4 rows are handled by the scalar code at the end.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4;
    FLOAT xbuffer[8], *ybuffer;

    if ( m < 1 ) return(0);
    if ( n < 1 ) return(0);

    ybuffer = buffer;

    /* Convert strides from complex elements to FLOATs. */
    inc_x *= 2;
    inc_y *= 2;
    lda   *= 2;
    lda4   = 4 * lda;

    n1 = n / 4;                        /* groups of four columns          */
    n2 = n % 4;                        /* leftover columns                */

    m3 = m % 4;                        /* rows left for the scalar tail   */
    m1 = m - ( m % 4 );                /* rows handled by the kernels     */
    m2 = (m % NBMAX) - (m % 4);        /* size of the final partial block */

    y_ptr = y;

    BLASLONG NB = NBMAX;

    /* The loop ends after the first block that is smaller than NBMAX. */
    while ( NB == NBMAX )
    {
        m1 -= NB;
        if ( m1 < 0 )
        {
            if ( m2 == 0 ) break;
            NB = m2;
        }

        a_ptr = a;
        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;
        x_ptr = x;

        /* Clear NB complex accumulators (two FLOATs each). */
        memset(ybuffer,0,NB * 2 * sizeof(FLOAT));

        if ( inc_x == 2 )
        {
            /* Contiguous x: the kernels read x directly. */
            for( i = 0; i < n1 ; i++)
            {
                zgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
                x_ptr += 8;
            }

            if ( n2 & 2 )
            {
                zgemv_kernel_4x2(NB,ap,x_ptr,ybuffer);
                x_ptr += 4;
                a_ptr += 2 * lda;
            }

            if ( n2 & 1 )
            {
                /* Last column of the block: no pointer needs advancing. */
                zgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer);
            }
        }
        else
        {
            /* Strided x: gather the needed entries into xbuffer first. */
            for( i = 0; i < n1 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                xbuffer[3] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[4] = x_ptr[0];
                xbuffer[5] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[6] = x_ptr[0];
                xbuffer[7] = x_ptr[1];
                x_ptr += inc_x;

                zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
            }

            for( i = 0; i < n2 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
                a_ptr += 1 * lda;
            }
        }

        /* Fold the block's accumulator into y together with alpha. */
        add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);

        /* Advance to the next block of rows. */
        a     += 2 * NB;
        y_ptr += NB * inc_y;
    }

    if ( m3 == 0 ) return(0);

    /*
     * Scalar tail: the last m3 (1..3) rows.  Each case has a fast path for
     * the layout in which the column stride equals the tail height and x is
     * contiguous, so constant strides can be used; the general path performs
     * the same accumulation with the run-time strides.
     */
    if ( m3 == 1 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r = 0.0;
        FLOAT temp_i = 0.0;

        if ( lda == 2 && inc_x == 2 )
        {
            /* Two columns per iteration. */
            for( i = 0; i < (n & -2); i += 2 )
            {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
                temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
                temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif
                a_ptr += 4;
                x_ptr += 4;
            }

            /* Odd final column, if any. */
            for( ; i < n; i++ )
            {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
                a_ptr += 2;
                x_ptr += 2;
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
        y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
        y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
        y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
        return(0);
    }

    if ( m3 == 2 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;

        if ( lda == 4 && inc_x == 2 )
        {
            /* Two columns per iteration. */
            for( i = 0; i < (n & -2); i += 2 )
            {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
                temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
                temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
                temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
                temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
                temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif
                a_ptr += 8;
                x_ptr += 4;
            }

            /* Odd final column, if any. */
            for( ; i < n; i++ )
            {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
                a_ptr += 4;
                x_ptr += 2;
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
        y_ptr    += inc_y;
        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
        y_ptr    += inc_y;
        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
        return(0);
    }

    if ( m3 == 3 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;
        FLOAT temp_r2 = 0.0;
        FLOAT temp_i2 = 0.0;

        if ( lda == 6 && inc_x == 2 )
        {
            for( i = 0; i < n; i++ )
            {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
                a_ptr += 6;
                x_ptr += 2;
            }
        }
        else
        {
            for( i = 0; i < n; i++ )
            {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
        y_ptr    += inc_y;
        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
        y_ptr    += inc_y;
        y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
        y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
        y_ptr    += inc_y;
        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
        y_ptr    += inc_y;
        y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
        y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
        return(0);
    }

    return(0);
}