/*
 * gemvT2x16: dot-product based matrix-vector kernel.
 *
 * A is treated as M vectors of length N, vector i starting at A + i*lda.
 * For each vector the routine forms its dot product with X and adds it to a
 * starting value derived from Y[i], then stores the result back to Y[i]:
 *    BETA0 defined : start from ATL_rzero (old Y is not read)
 *    BETAX defined : start from beta * Y[i]
 *    otherwise     : start from Y[i] unchanged
 * Vectors are handled two at a time (y0/y1) and the N dimension is unrolled
 * by 16.  If M is odd, the last vector is passed on to gemvT_Msmall.
 *
 * M    : number of vectors of A / elements of Y
 * N    : length of each vector of A and of X
 * A    : first element of the first vector
 * lda  : distance, in elements, between the starts of consecutive vectors
 * X    : input vector, accessed with unit stride
 * beta : scalar applied to Y (only used by this routine's own loops under BETAX)
 * Y    : input/output vector, accessed with unit stride
 */
static void gemvT2x16(const int M, const int N, const TYPE *A, const int lda,
                      const TYPE *X, const SCALAR beta, TYPE *Y)
{
   int i;
   /* M rounded down to a multiple of 2, N rounded down to a multiple of 16 */
   const int M2 = (M>>1)<<1, N16 = (N>>4)<<4;
   /*
    * The unrolled loop deliberately stops 16 elements short of N16, so it
    * advances A0/A1 by N16-16 (or not at all when N16 <= 16).  incAm is what
    * remains to reach the vector two further on, and nr is the number of
    * elements left for the scalar cleanup loop.
    */
   const int incAm = (N16 ? (lda<<1) - N16 + 16 : (lda<<1));
   const int nr = (N16 ? N-N16+16 : N);
   TYPE *stY = Y + M2;
   /*
    * stX terminates the unrolled loop and is only read when N16 > 16.  It is
    * not formed as X + N16-16 unconditionally: for N < 16 that expression
    * would produce a pointer before the start of X, which is undefined
    * behavior even if the pointer is never dereferenced.
    */
   const TYPE *stX = (N16 > 16 ? X + N16-16 : X), *x;
   const TYPE *A0 = A, *A1 = A + lda;
   register TYPE y0, y1, x0, x1;
   /*
    * ATL_PFD is the offset, in elements, ahead of A0/A1 that is handed to
    * ATL_pfl1R.  ATL_pfl1R is defined elsewhere; presumably an L1 read
    * prefetch hint -- confirm against its definition.
    */
   #ifndef ATL_PFD
      #define ATL_PFD 12
   #endif

   if (M2)
   {
      do  /* one pass per pair of vectors */
      {
         #ifdef BETA0
            y0 = y1 = ATL_rzero;
         #elif defined(BETAX)
            y0 = *Y * beta; y1 = beta * Y[1];
         #else
            y0 = *Y; y1 = Y[1];
         #endif
         x = X;
         if (N16 > 16)
         {
            /*
             * x0/x1 are always loaded two elements ahead of their use.  The
             * loads of x[0], x[1] at the bottom of the last iteration stay in
             * bounds because at least 16 elements are left for the cleanup.
             */
            x0 = *x; x1 = x[1];
            do
            {
               y0 += x0 * *A0; ATL_pfl1R(A0+ATL_PFD); ATL_pfl1R(A1+ATL_PFD);
                               ATL_pfl1R(A0+ATL_PFD+8); ATL_pfl1R(A1+ATL_PFD+8);
               y1 += x0 * *A1; x0 = x[2];
               y0 += x1 * A0[1];
               y1 += x1 * A1[1]; x1 = x[3];
               y0 += x0 * A0[2];
               y1 += x0 * A1[2]; x0 = x[4];
               y0 += x1 * A0[3];
               y1 += x1 * A1[3]; x1 = x[5];
               y0 += x0 * A0[4];
               y1 += x0 * A1[4]; x0 = x[6];
               y0 += x1 * A0[5];
               y1 += x1 * A1[5]; x1 = x[7];
               y0 += x0 * A0[6];
               y1 += x0 * A1[6]; x0 = x[8];
               y0 += x1 * A0[7];
               y1 += x1 * A1[7]; x1 = x[9];
               y0 += x0 * A0[8];
                  ATL_pfl1R(A0+ATL_PFD+4); ATL_pfl1R(A1+ATL_PFD+4);
                  ATL_pfl1R(A0+ATL_PFD+12); ATL_pfl1R(A1+ATL_PFD+12);
               y1 += x0 * A1[8]; x0 = x[10];
               y0 += x1 * A0[9];
               y1 += x1 * A1[9]; x1 = x[11];
               y0 += x0 * A0[10];
               y1 += x0 * A1[10]; x0 = x[12];
               y0 += x1 * A0[11];
               y1 += x1 * A1[11]; x1 = x[13];
               y0 += x0 * A0[12];
               y1 += x0 * A1[12]; x0 = x[14];
               y0 += x1 * A0[13];
               y1 += x1 * A1[13]; x1 = x[15];
               y0 += x0 * A0[14]; x += 16;
               y1 += x0 * A1[14]; x0 = *x;
               y0 += x1 * A0[15]; A0 += 16;
               y1 += x1 * A1[15]; x1 = x[1]; A1 += 16;
            }
            while(x != stX);
         }
         /* scalar cleanup: the held-back 16 elements plus the N%16 tail */
         for (i=0; i != nr; i++)
         {
            x0 = x[i];
            y0 += x0 * A0[i];
            y1 += x0 * A1[i];
         }
         /* step both vector pointers to the next pair and store the results */
         A0 += incAm;
         *Y = y0;
         A1 += incAm;
         Y[1] = y1;
         Y += 2;
      }
      while (Y != stY);
   }
   /*
    * Odd M: A0 and Y now address the one remaining vector / output element.
    * gemvT_Msmall is defined elsewhere; presumably it applies the same
    * update to that remainder -- confirm against its definition.
    */
   if (M-M2) gemvT_Msmall(M-M2, N, A0, lda, X, beta, Y);
}
/* Example #2 */
/* 0 */
/*
 * gemvT4x4: dot-product based matrix-vector kernel, four vectors at a time
 * with the N dimension unrolled by 4.
 *
 * A is treated as M vectors of length N, vector i starting at A + i*lda.
 * For each vector the routine forms its dot product with X and adds it to a
 * starting value derived from Y[i], then stores the result back to Y[i]:
 *    BETA0 defined : start from ATL_rzero (old Y is not read)
 *    BETAX defined : start from beta * Y[i]
 *    otherwise     : start from Y[i] unchanged
 *
 * The unrolled path is used only when N4 > 4 (i.e. N >= 8); otherwise the
 * whole problem is handed to gemvT_Nsmall.  When M is not a multiple of 4 the
 * leftover vectors are handed to gemvT_Msmall.  Both helpers are defined
 * elsewhere; presumably they compute the same update -- confirm against their
 * definitions.
 *
 * M    : number of vectors of A / elements of Y
 * N    : length of each vector of A and of X
 * A    : first element of the first vector
 * lda  : distance, in elements, between the starts of consecutive vectors
 * X    : input vector, accessed with unit stride
 * beta : scalar applied to Y (only used by this routine's own loop under BETAX)
 * Y    : input/output vector, accessed with unit stride
 */
static void gemvT4x4(const int M, const int N, const TYPE *A, const int lda,
                     const TYPE *X, const SCALAR beta, TYPE *Y)
{
   /* M and N rounded down to multiples of 4 */
   const int M4 = (M>>2)<<2, N4 = (N>>2)<<2;
   /*
    * nr is the 0..3 element tail of each vector.  By the time incAm is
    * applied every A pointer has advanced by N4 within its vector, so
    * 4*lda - N4 lands on the start of the vector four further on.
    */
   const int nr = N-N4, incAm = (lda<<2) - N4;
   const TYPE *stX = X + N4, *x;
   const TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda;
   TYPE *stY = Y + M4;
   /* aJI holds element J of the current 4-element chunk of vector I */
   register TYPE a00, a10, a20, a30, a01, a11, a21, a31;
   register TYPE a02, a12, a22, a32, a03, a13, a23, a33;
   /*
    * Two partial sums per output: yI collects the x0/x2 terms, yyI collects
    * the x1/x3 terms and is also seeded with the starting Y value.  They are
    * added together just before the store.
    */
   register TYPE y0, y1, y2, y3, yy0, yy1, yy2, yy3;
   register TYPE x0, x1, x2, x3;

   if (N4 > 4)
   {
      if (M4)
      {
         do  /* one pass per group of four vectors */
         {
            /* prime the pipeline: load the first 4x4 chunk of A */
            a00 = *A0;   a01 = *A1;   a02 = *A2;   a03 = *A3;
            a10 = A0[1]; a11 = A1[1]; a12 = A2[1]; a13 = A3[1];
            a20 = A0[2]; a21 = A1[2]; a22 = A2[2]; a23 = A3[2];
            a30 = A0[3]; a31 = A1[3]; a32 = A2[3]; a33 = A3[3];
            A0 += 4;     A1 += 4;     A2 += 4;     A3 += 4;
            #ifdef BETA0
               y0 = y1 = y2 = y3 = yy0 = yy1 = yy2 = yy3 = ATL_rzero;
            #elif defined(BETAX)
               /* y0 is borrowed to hold beta while scaling, then reset */
               y0 = beta;
               yy0 = *Y; yy1 = Y[1]; yy2 = Y[2];  yy3 = Y[3];
               yy0 *= y0; yy1 *= y0; yy2 *= y0; yy3 *= y0;
               y0 = y1 = y2 = y3 = ATL_rzero;
            #else
               yy0 = *Y; yy1 = Y[1]; yy2 = Y[2];  yy3 = Y[3];
               y0 = y1 = y2 = y3 = ATL_rzero;
            #endif
            /* ... and the first four elements of X; x points at the next four */
            x0 = *X; x1 = X[1]; x2 = X[2]; x3 = X[3]; x = X + 4;
            /*
             * Steady state: each statement consumes a value loaded on the
             * previous trip and immediately reloads it for the next one.
             * The loop runs N4/4 - 1 times, leaving the final chunk loaded
             * but not yet accumulated.
             */
            do
            {
               y0  += a00 * x0; a00 = *A0;
               y1  += a01 * x0; a01 = *A1;
               yy0 += a10 * x1; a10 = A0[1];
               yy1 += a11 * x1; a11 = A1[1];
               y2  += a02 * x0; a02 = *A2;
               y3  += a03 * x0; a03 = *A3; x0 = *x;
               yy2 += a12 * x1; a12 = A2[1];
               yy3 += a13 * x1; a13 = A3[1]; x1 = x[1];

               y0  += a20 * x2; a20 = A0[2];
               y1  += a21 * x2; a21 = A1[2];
               yy0 += a30 * x3; a30 = A0[3]; A0 += 4;
               yy1 += a31 * x3; a31 = A1[3]; A1 += 4;
               y2  += a22 * x2; a22 = A2[2];
               y3  += a23 * x2; a23 = A3[2]; x2 = x[2];
               yy2 += a32 * x3; a32 = A2[3]; A2 += 4;
               yy3 += a33 * x3; a33 = A3[3]; A3 += 4; x3 = x[3]; x += 4;
            }
            while (x != stX);
            /* drain the pipeline: accumulate the last chunk that was loaded */
            y0  += a00 * x0;
            y1  += a01 * x0;
            yy0 += a10 * x1;
            yy1 += a11 * x1;
            y2  += a02 * x0;
            y3  += a03 * x0;
            yy2 += a12 * x1;
            yy3 += a13 * x1;

            y0  += a20 * x2;
            y1  += a21 * x2;
            yy0 += a30 * x3;
            yy1 += a31 * x3;
            y2  += a22 * x2;
            y3  += a23 * x2;
            yy2 += a32 * x3;
            yy3 += a33 * x3;
            /*
             * Tail of N%4 elements; here x == stX and A0..A3 point at
             * element N4 of their vectors.  nr == 0 takes the empty default.
             */
            switch(nr)
            {
            case 1:
               x0 = *x;
               y0 += *A0 * x0;
               y1 += *A1 * x0;
               y2 += *A2 * x0;
               y3 += *A3 * x0;
               break;
            case 2:
               x0 = *x;
               x1 = x[1];
               y0  += *A0 * x0;
               y1  += *A1 * x0;
               yy0 += A0[1] * x1;
               yy1 += A1[1] * x1;
               y2  += *A2 * x0;
               y3  += *A3 * x0;
               yy2 += A2[1] * x1;
               yy3 += A3[1] * x1;
               break;
            case 3:
               x0 = *x;
               x1 = x[1];
               x2 = x[2];
               y0  += *A0 * x0;
               y1  += *A1 * x0;
               yy0 += A0[1] * x1;
               yy1 += A1[1] * x1;
               y2  += *A2 * x0;
               y3  += *A3 * x0;
               yy2 += A2[1] * x1;
               yy3 += A3[1] * x1;
               y0  += A0[2] * x2;
               y1  += A1[2] * x2;
               y2  += A2[2] * x2;
               y3  += A3[2] * x2;
               break;
            default:;
            }
            /*
             * Advance to the next group of four vectors while folding the
             * two partial sums of each output together, then store.
             */
            A0 += incAm;
            y0 += yy0;
            A1 += incAm;
            y1 += yy1;
            A2 += incAm;
            y2 += yy2;
            A3 += incAm;
            y3 += yy3;
            *Y = y0;
            Y[1] = y1;
            Y[2] = y2;
            Y[3] = y3;
            Y += 4;
         }
         while (Y != stY);
      }
      /* M%4 leftover vectors: A0 and Y already address the first of them */
      if (M-M4) gemvT_Msmall(M-M4, N, A0, lda, X, beta, Y);
   }
   /* N too short for the pipelined loop: delegate the entire problem */
   else if (M) gemvT_Nsmall(M, N, A, lda, X, beta, Y);
}