/*
 * Upper-triangle SYMV column sweep.
 *
 * For every column j in [m - offset, m) the routine reads the entries
 * a[j*lda + i] with i <= j.  Each off-diagonal entry is applied twice:
 * once to y[i] (scaled by alpha * x[j]) and once to y[j] (scaled by
 * alpha * x[i]); the diagonal entry a[j*lda + j] is applied to y[j] only.
 * The net effect is y += alpha * A * x for the part of a symmetric A that
 * is held in those columns.
 *
 *   m       one past the last column processed
 *   offset  number of trailing columns to process (first column is m - offset)
 *   alpha   scalar multiplier
 *   a       matrix storage; column j starts at a + j*lda
 *   lda     distance, in elements, between consecutive columns of a
 *   x,inc_x input vector and its stride
 *   y,inc_y vector updated in place and its stride
 *   buffer  not used by this routine
 *
 * Always returns 0.
 *
 * NOTE(review): the dsymv_kernel_* helpers are defined outside this block;
 * the comments on their calls describe what the surrounding code expects of
 * them, not what has been verified -- confirm against their definitions.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i, j, k;
    BLASLONG ix, iy;
    BLASLONG jx, jy;
    BLASLONG j1;
    BLASLONG m2;
    FLOAT temp1, temp2;
    FLOAT at0;
    FLOAT *a0, *a1, *a2, *a3;
    FLOAT *col;
    FLOAT tmp1[4];
    FLOAT tmp2[4];

    /* First column handled by this call. */
    BLASLONG m1 = m - offset;

    /*
     * Plain scalar path: taken for non-unit strides, or when fewer than 16
     * columns are requested.
     */
    if ((inc_x != 1) || (inc_y != 1) || (offset < 16)) {
        jx = m1 * inc_x;
        jy = m1 * inc_y;
        for (j = m1; j < m; j++) {
            col = &a[j * lda];
            temp1 = alpha * x[jx];
            temp2 = 0.0;
            ix = 0;
            iy = 0;
            for (i = 0; i < j; i++) {
                y[iy] += temp1 * col[i];
                temp2 += col[i] * x[ix];
                ix += inc_x;
                iy += inc_y;
            }
            y[jy] += temp1 * col[j] + alpha * temp2;
            jx += inc_x;
            jy += inc_y;
        }
        return 0;
    }

    /* Unit-stride path.  Columns [m1, m2) are processed four at a time. */
    m2 = m - (offset % 4);

    for (j = m1; j < m2; j += 4) {
        for (k = 0; k < 4; k++) {
            tmp1[k] = alpha * x[j + k];
            tmp2[k] = 0.0;
        }

        a0 = &a[j * lda];
        a1 = a0 + lda;
        a2 = a1 + lda;
        a3 = a2 + lda;

        /* Rows above the 4x4 diagonal block: [0, j1) then [j1, j). */
        j1 = (j / 8) * 8;
        if (j1) {
            /* presumably handles rows [0, j1) of the four columns */
            dsymv_kernel_4x4(j1, a0, a1, a2, a3, x, y, tmp1, tmp2);
        }
        if (j1 < j) {
            /* presumably handles the remaining rows [j1, j) */
            dsymv_kernel_1x4(j1, j, a0, a1, a2, a3, x, y, tmp1, tmp2);
        }

        /* 4x4 diagonal block: column j+k contributes rows j .. j+k. */
        for (k = 0; k < 4; k++) {
            col = &a[(j + k) * lda];
            temp1 = tmp1[k];
            temp2 = tmp2[k];
            for (i = j; i < j + k; i++) {
                y[i] += temp1 * col[i];
                temp2 += col[i] * x[i];
            }
            y[j + k] += temp1 * col[j + k] + alpha * temp2;
        }
    }

    /* Up to three leftover columns, one at a time (j continues from above). */
    for (; j < m; j++) {
        temp1 = alpha * x[j];
        temp2 = 0.0;
        a0 = &a[j * lda];

        j1 = (j / 8) * 8;
        if (j1) {
            /* presumably handles rows [0, j1) and accumulates into temp2 */
            dsymv_kernel_8x1(j1, a0, x, y, &temp1, &temp2);
        }

        for (i = j1; i < j; i++) {
            at0 = a0[i];
            y[i] += temp1 * at0;
            temp2 += at0 * x[i];
        }

        y[j] += temp1 * a0[j] + alpha * temp2;
    }

    return 0;
}
/*
 * Lower-triangle SYMV column sweep.
 *
 * For every column j in [0, offset) the routine reads the entries
 * a[j*lda + i] with j <= i < m.  The diagonal entry is applied to y[j];
 * each entry below it is applied twice: once to y[i] (scaled by
 * alpha * x[j]) and once to y[j] (scaled by alpha * x[i]).  The net effect
 * is y += alpha * A * x for the part of a symmetric A that is held in those
 * columns.
 *
 *   m       number of rows (row indices run up to m - 1)
 *   offset  number of leading columns to process
 *   alpha   scalar multiplier
 *   a       matrix storage; column j starts at a + j*lda
 *   lda     distance, in elements, between consecutive columns of a
 *   x,inc_x input vector and its stride
 *   y,inc_y vector updated in place and its stride
 *   buffer  not used by this routine
 *
 * Always returns 0.
 *
 * NOTE(review): dsymv_kernel_4x4 is defined outside this block; the comment
 * on its call describes what the surrounding code expects of it, not what
 * has been verified -- confirm against its definition.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i, j, k;
    BLASLONG ix, iy;
    BLASLONG jx, jy;
    FLOAT temp1, temp2;
    FLOAT tmp1[4];
    FLOAT tmp2[4];
    FLOAT *ap[4];
    FLOAT *col;

    /* Plain scalar path for non-unit strides. */
    if ((inc_x != 1) || (inc_y != 1)) {
        jx = 0;
        jy = 0;
        for (j = 0; j < offset; j++) {
            col = &a[j * lda];
            temp1 = alpha * x[jx];
            temp2 = 0.0;
            y[jy] += temp1 * col[j];
            ix = jx;
            iy = jy;
            for (i = j + 1; i < m; i++) {
                ix += inc_x;
                iy += inc_y;
                y[iy] += temp1 * col[i];
                temp2 += col[i] * x[ix];
            }
            y[jy] += alpha * temp2;
            jx += inc_x;
            jy += inc_y;
        }
        return 0;
    }

    /* Unit-stride path.  Columns [0, offset1) are processed four at a time. */
    BLASLONG offset1 = (offset / 4) * 4;

    for (j = 0; j < offset1; j += 4) {
        for (k = 0; k < 4; k++) {
            tmp1[k] = alpha * x[j + k];
            tmp2[k] = 0.0;
        }

        ap[0] = &a[j * lda];
        ap[1] = ap[0] + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        /* Diagonal entries of the 4x4 block. */
        for (k = 0; k < 4; k++) {
            y[j + k] += tmp1[k] * ap[k][j + k];
        }

        /*
         * Strictly-lower part of the 4x4 diagonal block: column k covers
         * rows j+k+1 .. j+3 (nothing for the last column).
         */
        for (k = 0; k < 3; k++) {
            for (i = j + k + 1; i < j + 4; i++) {
                y[i] += tmp1[k] * ap[k][i];
                tmp2[k] += ap[k][i] * x[i];
            }
        }

        /*
         * Rows below the diagonal block.  When at least 12 rows follow the
         * first diagonal entry, the bulk [j+4, m2) goes to the 4x4 kernel and
         * only rows [m2, m) are left for the scalar loop; otherwise the
         * scalar loop covers everything from j+4.
         */
        BLASLONG rest = j + 4;
        if (m - (j + 1) >= 12) {
            BLASLONG m2 = (m / 4) * 4;
            if (m2 > j + 4) {
                /* presumably handles rows [j+4, m2) of the four columns */
                dsymv_kernel_4x4(j + 4, m2, ap, x, y, tmp1, tmp2);
            }
            rest = m2;
        }

        for (i = rest; i < m; i++) {
            for (k = 0; k < 4; k++) {
                y[i] += tmp1[k] * ap[k][i];
                tmp2[k] += ap[k][i] * x[i];
            }
        }

        for (k = 0; k < 4; k++) {
            y[j + k] += alpha * tmp2[k];
        }
    }

    /* Up to three leftover columns, one at a time. */
    for (j = offset1; j < offset; j++) {
        col = &a[j * lda];
        temp1 = alpha * x[j];
        temp2 = 0.0;

        y[j] += temp1 * col[j];

        for (i = j + 1; i < m; i++) {
            y[i] += temp1 * col[i];
            temp2 += col[i] * x[i];
        }

        y[j] += alpha * temp2;
    }

    return 0;
}