/*
 * gemvT2x16: for i in [0,M), computes
 *    Y[i] = y_init(i) + sum_{j=0}^{N-1} A[i*lda + j] * X[j]
 * i.e. the dot product of X with the length-N strip of A that starts at
 * A + i*lda.  y_init(i) depends on compile-time macros:
 *    BETA0 defined : ATL_rzero (Y is not read)
 *    BETAX defined : beta * Y[i]
 *    otherwise     : Y[i] unscaled (beta is only forwarded to the cleanup)
 *
 * Strips are handled two at a time (A0, A1) with the inner product unrolled
 * by 16.  An odd trailing strip (M odd) is delegated to gemvT_Msmall.
 *
 * Parameters:
 *    M, N  - number of strips / length of each strip and of X
 *    A     - matrix data; consecutive strips are lda elements apart
 *    lda   - distance, in elements, between the starts of adjacent strips
 *    X     - input vector, N contiguous elements
 *    beta  - scalar applied to Y when BETAX is defined
 *    Y     - output vector, M contiguous elements, updated in place
 *
 * NOTE(review): the code assumes M >= 0 and N >= 0; negative sizes are not
 * guarded against here - confirm callers guarantee this.
 */
static void gemvT2x16(const int M, const int N, const TYPE *A, const int lda,
                      const TYPE *X, const SCALAR beta, TYPE *Y)
{
   int i;
   const int M2 = (M>>1)<<1, N16 = (N>>4)<<4;
   /*
    * The unrolled loop deliberately stops one 16-block early (it preloads
    * x[16], x[17] for the next pass), so it advances A0/A1 by N16-16 and
    * leaves the last full block plus the N%16 tail to the scalar loop:
    *    nr    = elements left for the scalar loop
    *    incAm = what must still be added to reach the strip two below
    */
   const int incAm = (N16 ? (lda<<1) - N16 + 16 : (lda<<1));
   const int nr = (N16 ? N-N16+16 : N);
   TYPE *stY = Y + M2;
   /*
    * Loop sentinel for the unrolled loop.  It is only consulted when
    * N16 > 16; computing X + N16 - 16 unconditionally would form a pointer
    * 16 elements before X when N < 16, which is undefined behaviour even
    * if the pointer is never dereferenced.
    */
   const TYPE *stX = (N16 > 16) ? X + (N16-16) : X;
   const TYPE *x;
   const TYPE *A0 = A, *A1 = A + lda;
   register TYPE y0, y1, x0, x1;
   /* Look-ahead distance (in elements) passed to ATL_pfl1R below. */
   #ifndef ATL_PFD
      #define ATL_PFD 12
   #endif

   if (M2)
   {
      do
      {
         #ifdef BETA0
            y0 = y1 = ATL_rzero;
         #elif defined(BETAX)
            y0 = *Y * beta;
            y1 = beta * Y[1];
         #else
            y0 = *Y;
            y1 = Y[1];
         #endif
         x = X;
         if (N16 > 16)
         {
            x0 = *x; x1 = x[1];
            do
            {
               /*
                * ATL_pfl1R is defined outside this block - presumably a
                * prefetch-for-read hint; statement order is kept exactly
                * as hand-scheduled.  x0/x1 alternate so that the next X
                * element is loaded while the previous one is still in use.
                */
               y0 += x0 * *A0;
               ATL_pfl1R(A0+ATL_PFD);
               ATL_pfl1R(A1+ATL_PFD);
               ATL_pfl1R(A0+ATL_PFD+8);
               ATL_pfl1R(A1+ATL_PFD+8);
               y1 += x0 * *A1; x0 = x[2];
               y0 += x1 * A0[1]; y1 += x1 * A1[1]; x1 = x[3];
               y0 += x0 * A0[2]; y1 += x0 * A1[2]; x0 = x[4];
               y0 += x1 * A0[3]; y1 += x1 * A1[3]; x1 = x[5];
               y0 += x0 * A0[4]; y1 += x0 * A1[4]; x0 = x[6];
               y0 += x1 * A0[5]; y1 += x1 * A1[5]; x1 = x[7];
               y0 += x0 * A0[6]; y1 += x0 * A1[6]; x0 = x[8];
               y0 += x1 * A0[7]; y1 += x1 * A1[7]; x1 = x[9];
               y0 += x0 * A0[8];
               ATL_pfl1R(A0+ATL_PFD+4);
               ATL_pfl1R(A1+ATL_PFD+4);
               ATL_pfl1R(A0+ATL_PFD+12);
               ATL_pfl1R(A1+ATL_PFD+12);
               y1 += x0 * A1[8]; x0 = x[10];
               y0 += x1 * A0[9]; y1 += x1 * A1[9]; x1 = x[11];
               y0 += x0 * A0[10]; y1 += x0 * A1[10]; x0 = x[12];
               y0 += x1 * A0[11]; y1 += x1 * A1[11]; x1 = x[13];
               y0 += x0 * A0[12]; y1 += x0 * A1[12]; x0 = x[14];
               y0 += x1 * A0[13]; y1 += x1 * A1[13]; x1 = x[15];
               /* Advance x, then preload the first two X values of the
                  next block (always in range: at most X[N16-15]). */
               y0 += x0 * A0[14]; x += 16; y1 += x0 * A1[14]; x0 = *x;
               y0 += x1 * A0[15]; A0 += 16; y1 += x1 * A1[15]; x1 = x[1];
               A1 += 16;
            }
            while(x != stX);
         }
         /* Scalar cleanup: last full 16-block (if any) plus the N%16 tail. */
         for (i=0; i != nr; i++)
         {
            x0 = x[i];
            y0 += x0 * A0[i];
            y1 += x0 * A1[i];
         }
         A0 += incAm;
         *Y = y0;
         A1 += incAm;
         Y[1] = y1;
         Y += 2;
      }
      while (Y != stY);
   }
   /* Odd M: A0 and Y now address the single remaining strip / output. */
   if (M-M2) gemvT_Msmall(M-M2, N, A0, lda, X, beta, Y);
}
/*
 * gemvT4x4: for i in [0,M), computes
 *    Y[i] = y_init(i) + sum_{j=0}^{N-1} A[i*lda + j] * X[j]
 * i.e. the dot product of X with the length-N strip of A that starts at
 * A + i*lda.  y_init(i) depends on compile-time macros:
 *    BETA0 defined : ATL_rzero (Y is not read)
 *    BETAX defined : beta * Y[i]
 *    otherwise     : Y[i] unscaled
 *
 * Four strips (A0..A3) are processed together, four elements of X at a time,
 * with a software-pipelined inner loop: the 4x4 block of A for the *next*
 * step is loaded into registers while the current block is being multiplied.
 *
 * Dispatch:
 *    N4 <= 4 (i.e. N < 8) : everything is handed to gemvT_Nsmall
 *    M % 4 leftover strips : handed to gemvT_Msmall
 *
 * Parameters:
 *    M, N  - number of strips / length of each strip and of X
 *    A     - matrix data; consecutive strips are lda elements apart
 *    lda   - distance, in elements, between the starts of adjacent strips
 *    X     - input vector, N contiguous elements
 *    beta  - scalar applied to Y when BETAX is defined
 *    Y     - output vector, M contiguous elements, updated in place
 */
static void gemvT4x4(const int M, const int N, const TYPE *A, const int lda,
                     const TYPE *X, const SCALAR beta, TYPE *Y)
{
   const int M4 = (M>>2)<<2, N4 = (N>>2)<<2;
   /* nr: N%4 tail elements; incAm: after walking N4 elements of a strip,
      the step needed to land on the strip four below. */
   const int nr = N-N4, incAm = (lda<<2) - N4;
   const TYPE *stX = X + N4, *x;
   const TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda;
   TYPE *stY = Y + M4;
   /* aCR holds element C (0..3) of the current 4-wide block of strip R. */
   register TYPE a00, a10, a20, a30, a01, a11, a21, a31;
   register TYPE a02, a12, a22, a32, a03, a13, a23, a33;
   /* Two partial sums per strip: yR collects the products at block offsets
      0 and 2, yyR those at offsets 1 and 3 (and carries the initial Y
      value); they are added together just before the store. */
   register TYPE y0, y1, y2, y3, yy0, yy1, yy2, yy3;
   register TYPE x0, x1, x2, x3;

   /* The pipelined do/while starts at x = X+4 and stops when x == X+N4,
      so it needs at least two 4-blocks (N4 >= 8). */
   if (N4 > 4)
   {
      if (M4)
      {
         do
         {
            /* Pipeline prologue: load the first 4x4 block of A. */
            a00 = *A0; a01 = *A1; a02 = *A2; a03 = *A3;
            a10 = A0[1]; a11 = A1[1]; a12 = A2[1]; a13 = A3[1];
            a20 = A0[2]; a21 = A1[2]; a22 = A2[2]; a23 = A3[2];
            a30 = A0[3]; a31 = A1[3]; a32 = A2[3]; a33 = A3[3];
            A0 += 4; A1 += 4; A2 += 4; A3 += 4;
            #ifdef BETA0
               y0 = y1 = y2 = y3 = yy0 = yy1 = yy2 = yy3 = ATL_rzero;
            #elif defined(BETAX)
               /* y0 is used as a scratch register for beta here and is
                  reset to ATL_rzero afterwards. */
               y0 = beta;
               yy0 = *Y; yy1 = Y[1]; yy2 = Y[2]; yy3 = Y[3];
               yy0 *= y0; yy1 *= y0; yy2 *= y0; yy3 *= y0;
               y0 = y1 = y2 = y3 = ATL_rzero;
            #else
               yy0 = *Y; yy1 = Y[1]; yy2 = Y[2]; yy3 = Y[3];
               y0 = y1 = y2 = y3 = ATL_rzero;
            #endif
            /* Prologue for X: first four elements; x points at the next. */
            x0 = *X; x1 = X[1]; x2 = X[2]; x3 = X[3]; x = X + 4;
            /* Steady state: consume the block held in registers while
               reloading each register with the next block's value as soon
               as it has been used. */
            do
            {
               y0 += a00 * x0; a00 = *A0;
               y1 += a01 * x0; a01 = *A1;
               yy0 += a10 * x1; a10 = A0[1];
               yy1 += a11 * x1; a11 = A1[1];
               y2 += a02 * x0; a02 = *A2;
               y3 += a03 * x0; a03 = *A3; x0 = *x;
               yy2 += a12 * x1; a12 = A2[1];
               yy3 += a13 * x1; a13 = A3[1]; x1 = x[1];
               y0 += a20 * x2; a20 = A0[2];
               y1 += a21 * x2; a21 = A1[2];
               yy0 += a30 * x3; a30 = A0[3]; A0 += 4;
               yy1 += a31 * x3; a31 = A1[3]; A1 += 4;
               y2 += a22 * x2; a22 = A2[2];
               y3 += a23 * x2; a23 = A3[2]; x2 = x[2];
               yy2 += a32 * x3; a32 = A2[3]; A2 += 4;
               yy3 += a33 * x3; a33 = A3[3]; A3 += 4; x3 = x[3]; x += 4;
            }
            while (x != stX);
            /* Pipeline drain: multiply the last block that was loaded. */
            y0 += a00 * x0; y1 += a01 * x0; yy0 += a10 * x1; yy1 += a11 * x1;
            y2 += a02 * x0; y3 += a03 * x0; yy2 += a12 * x1; yy3 += a13 * x1;
            y0 += a20 * x2; y1 += a21 * x2; yy0 += a30 * x3; yy1 += a31 * x3;
            y2 += a22 * x2; y3 += a23 * x2; yy2 += a32 * x3; yy3 += a33 * x3;
            /* Remaining N%4 elements; here x == X+N4 and A0..A3 each sit
               N4 elements into their strip. */
            switch(nr)
            {
            case 1:
               x0 = *x;
               y0 += *A0 * x0; y1 += *A1 * x0;
               y2 += *A2 * x0; y3 += *A3 * x0;
               break;
            case 2:
               x0 = *x; x1 = x[1];
               y0 += *A0 * x0; y1 += *A1 * x0;
               yy0 += A0[1] * x1; yy1 += A1[1] * x1;
               y2 += *A2 * x0; y3 += *A3 * x0;
               yy2 += A2[1] * x1; yy3 += A3[1] * x1;
               break;
            case 3:
               x0 = *x; x1 = x[1]; x2 = x[2];
               y0 += *A0 * x0; y1 += *A1 * x0;
               yy0 += A0[1] * x1; yy1 += A1[1] * x1;
               y2 += *A2 * x0; y3 += *A3 * x0;
               yy2 += A2[1] * x1; yy3 += A3[1] * x1;
               y0 += A0[2] * x2; y1 += A1[2] * x2;
               y2 += A2[2] * x2; y3 += A3[2] * x2;
               break;
            default:;
            }
            /* Move to the next group of four strips, merge the two partial
               sums of each strip and store the results. */
            A0 += incAm; y0 += yy0;
            A1 += incAm; y1 += yy1;
            A2 += incAm; y2 += yy2;
            A3 += incAm; y3 += yy3;
            *Y = y0; Y[1] = y1; Y[2] = y2; Y[3] = y3;
            Y += 4;
         }
         while (Y != stY);
      }
      /* M%4 leftover strips: A0 and Y already address the first of them. */
      if (M-M4) gemvT_Msmall(M-M4, N, A0, lda, X, beta, Y);
   }
   /* Too few columns for the pipelined loop: use the small-N routine. */
   else if (M) gemvT_Nsmall(M, N, A, lda, X, beta, Y);
}