コード例 #1
0
ファイル: ATL_ger1_4x4_1.c プロジェクト: certik/vendor
/*
 * Adds the outer product of X and Y into A:
 *    A[i + j*lda] += X[i] * Y[j*incY],   0 <= i < M, 0 <= j < N.
 * alpha and incX are never read: no scaling is applied and X is walked with
 * unit stride.
 *
 * The bulk of the work is done on 4x4 tiles (four entries of X against four
 * entries of Y).  Up to three leftover rows are finished inline for every
 * group of four columns.  Columns left over after the last full group are
 * handed to ger_Nle4, and when M has no full group of four rows the whole
 * update is handed to ger_Mle8.
 */
void Mjoin(PATL,ger1_a1_x1_yX)
   (const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
    const TYPE *Y, const int incY, TYPE *A, const int lda)
{
   const int rowsBlk = (M >> 2) << 2;    /* rows covered by full 4-row tiles */
   const int colsBlk = (N >> 2) << 2;    /* columns covered by full groups   */
   const int rowsTail = M - rowsBlk;     /* leftover rows per column         */
   const int yStep = incY << 2;          /* distance between Y groups        */
   const int panelStep = lda << 2;       /* distance between column groups   */
   const TYPE *yp = Y;                   /* first Y entry of current group   */
   const TYPE *yq;
   const TYPE *xp;
   TYPE *panel = A;                      /* row 0 of current column group    */
   TYPE *c0, *c1, *c2, *c3;
   register TYPE xa, xb, xc, xd, ya, yb, yc, yd;
   int colsLeft, rowsLeft;

   /* Fewer than four rows: nothing for the tiled loop to do. */
   if (!rowsBlk)
   {
      ger_Mle8(M, N, X, Y, incY, A, lda);
      return;
   }

   colsLeft = colsBlk;
   while (colsLeft)
   {
      /* Fetch the four Y entries that scale this group of columns. */
      yq = yp;
      ya = *yq; yq += incY;
      yb = *yq; yq += incY;
      yc = *yq; yq += incY;
      yd = *yq;
      yp += yStep;

      c0 = panel;
      c1 = c0 + lda;
      c2 = c1 + lda;
      c3 = c2 + lda;
      xp = X;

      /* Full 4x4 tiles, walking down the rows. */
      rowsLeft = rowsBlk;
      while (rowsLeft)
      {
         xa = xp[0]; xb = xp[1]; xc = xp[2]; xd = xp[3];
         xp += 4;

         c0[0] += ya * xa;
         c1[0] += yb * xa;
         c2[0] += yc * xa;
         c3[0] += yd * xa;

         c0[1] += ya * xb;
         c1[1] += yb * xb;
         c2[1] += yc * xb;
         c3[1] += yd * xb;

         c0[2] += ya * xc;
         c1[2] += yb * xc;
         c2[2] += yc * xc;
         c3[2] += yd * xc;

         c0[3] += ya * xd;
         c1[3] += yb * xd;
         c2[3] += yc * xd;
         c3[3] += yd * xd;

         c0 += 4; c1 += 4; c2 += 4; c3 += 4;
         rowsLeft -= 4;
      }

      /*
       * One to three leftover rows.  All needed X entries are loaded before
       * any element of A is written, then rows are updated in ascending
       * order.
       */
      if (rowsTail >= 1)
      {
         xa = xp[0];
         if (rowsTail >= 2) xb = xp[1];
         if (rowsTail >= 3) xc = xp[2];

         c0[0] += ya * xa;
         c1[0] += yb * xa;
         c2[0] += yc * xa;
         c3[0] += yd * xa;
         if (rowsTail >= 2)
         {
            c0[1] += ya * xb;
            c1[1] += yb * xb;
            c2[1] += yc * xb;
            c3[1] += yd * xb;
            if (rowsTail >= 3)
            {
               c0[2] += ya * xc;
               c1[2] += yb * xc;
               c2[2] += yc * xc;
               c3[2] += yd * xc;
            }
         }
      }

      panel += panelStep;
      colsLeft -= 4;
   }

   /* Columns beyond the last full group of four go to the helper. */
   if (N - colsBlk) ger_Nle4(M, N - colsBlk, X, yp, incY, panel, lda);
}
コード例 #2
0
/*
 * Adds the outer product of X and Y into A:
 *    A[i + j*lda] += X[i] * Y[j*incY],   0 <= i < M, 0 <= j < N.
 * alpha and incX are never read: no scaling is applied and X is walked with
 * unit stride.
 *
 * Four columns of A are updated together.  The row loop is software
 * pipelined: each product y*x is formed one step before it is added into A,
 * so the product for row 0 is formed ahead of the loops and the product for
 * row M-1 is added after them.  That leaves M-1 steps for the loops: M8 of
 * them in the 8-way unrolled loop and mr (0..7) in the cleanup loop.
 *
 * When M8 is zero (M <= 8, including M <= 0) the whole update is handed to
 * ger_Mle8.  The N-N4 columns left after the last full group of four are
 * handed to ger_Nle4.
 */
void Mjoin(PATL,ger1_a1_x1_yX)
   (const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
    const TYPE *Y, const int incY, TYPE *A, const int lda)
{
   int i, j;
   const int incy = incY<<2;
   const TYPE *Y1= Y + incY, *Y2 = Y1 + incY, *Y3 = Y2 + incY;
   const TYPE *x;
   TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda;
   /*
    * M8 is the largest multiple of 8 not exceeding M-1.  For M <= 0 the
    * shifts would operate on a negative value and could produce a nonzero
    * (negative) M8, which would send the unrolled loop counting away from
    * zero; force M8 to 0 so such calls take the ger_Mle8 path instead.
    */
   const int M8 = (M > 0) ? (((M-1)>>3)<<3) : 0, mr = M-M8-1;
   /* incAn: from row M-1 of one column to row 0 four columns further on. */
   const int N4 = (N>>2)<<2, incAn = (lda<<2) - M + 1;
   register TYPE m0, m1, m2, m3, x0, y0, y1, y2, y3;

   if (M8)
   {
      for (j=N4; j; j -= 4)
      {
         /* Load the four Y entries for this column group and prime the
            pipeline with the products for row 0. */
         y0 = *Y; y1 = *Y1; y2 = *Y2; y3 = *Y3;
         x0 = *X; x = X + 1;
         m0 = y0 * x0; Y += incy;
         m1 = y1 * x0; Y1 += incy;
         m2 = y2 * x0; Y2 += incy;
         m3 = y3 * x0; Y3 += incy;
         /* Each step adds the pending products into A and forms the
            products for the next row. */
         for (i=M8; i; i -= 8)
         {
            x0 = *x;
            *A0 += m0; m0 = y0 * x0;
            *A1 += m1; m1 = y1 * x0;
            *A2 += m2; m2 = y2 * x0;
            *A3 += m3; m3 = y3 * x0;
            x0 = x[1];
            A0[1] += m0; m0 = y0 * x0;
            A1[1] += m1; m1 = y1 * x0;
            A2[1] += m2; m2 = y2 * x0;
            A3[1] += m3; m3 = y3 * x0;
            x0 = x[2];
            A0[2] += m0; m0 = y0 * x0;
            A1[2] += m1; m1 = y1 * x0;
            A2[2] += m2; m2 = y2 * x0;
            A3[2] += m3; m3 = y3 * x0;
            x0 = x[3];
            A0[3] += m0; m0 = y0 * x0;
            A1[3] += m1; m1 = y1 * x0;
            A2[3] += m2; m2 = y2 * x0;
            A3[3] += m3; m3 = y3 * x0;
            x0 = x[4];
            A0[4] += m0; m0 = y0 * x0;
            A1[4] += m1; m1 = y1 * x0;
            A2[4] += m2; m2 = y2 * x0;
            A3[4] += m3; m3 = y3 * x0;
            x0 = x[5];
            A0[5] += m0; m0 = y0 * x0;
            A1[5] += m1; m1 = y1 * x0;
            A2[5] += m2; m2 = y2 * x0;
            A3[5] += m3; m3 = y3 * x0;
            x0 = x[6];
            A0[6] += m0; m0 = y0 * x0;
            A1[6] += m1; m1 = y1 * x0;
            A2[6] += m2; m2 = y2 * x0;
            A3[6] += m3; m3 = y3 * x0;
            x0 = x[7]; x += 8;
            A0[7] += m0; m0 = y0 * x0; A0 += 8;
            A1[7] += m1; m1 = y1 * x0; A1 += 8;
            A2[7] += m2; m2 = y2 * x0; A2 += 8;
            A3[7] += m3; m3 = y3 * x0; A3 += 8;
         }
         /* Same pipelined step, one row at a time, for the mr rows the
            unrolled loop did not reach (runs zero times when mr is 0). */
         for (i=mr; i; i--)
         {
            x0 = *x++;
            *A0++ += m0; m0 = y0 * x0;
            *A1++ += m1; m1 = y1 * x0;
            *A2++ += m2; m2 = y2 * x0;
            *A3++ += m3; m3 = y3 * x0;
         }
         /* Drain the pipeline into row M-1, then move to the next group of
            four columns. */
         *A0 += m0; A0 += incAn;
         *A1 += m1; A1 += incAn;
         *A2 += m2; A2 += incAn;
         *A3 += m3; A3 += incAn;
      }
      /* Y and A0 now address the first column past the last full group. */
      if (N-N4) ger_Nle4(M, N-N4, X, Y, incY, A0, lda);
   }
   else ger_Mle8(M, N, X, Y, incY, A, lda);
}