static void trsmRL_3(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * Side='Right', Uplo='Lower' solve against a 3x3 complex triangle, unrolled
 * into scalars so that every data dependence is explicit for the compiler.
 * Complex values are adjacent (real, imag) pairs.  The three unknowns of a
 * row live in three streams of B that start 2*ldb TYPE elements apart; all M
 * rows are overwritten in place.
 * Per the original contract, A has 1/alpha already applied and its diagonal
 * already inverted, which is why the diagonal is applied by multiplication.
 */
{
   /* lower triangle of A, 3 complex (6 TYPE) per column */
   const TYPE a11r = A[0],  a11i = A[1];
   const TYPE a21r = A[2],  a21i = A[3];
   const TYPE a31r = A[4],  a31i = A[5];
   const TYPE a22r = A[8],  a22i = A[9];
   const TYPE a32r = A[10], a32i = A[11];
   const TYPE a33r = A[16], a33i = A[17];
   const int colgap = ldb + ldb;     /* TYPE elements between streams of B */
   /* offset handed to ATL_pfl1W (defined elsewhere; presumably a prefetch) */
   enum { PF_AHEAD = 8 };
   TYPE *c0 = B;
   TYPE *c1 = c0 + colgap;
   TYPE *c2 = c1 + colgap;
   int left = M;

   while (left)
   {
      TYPE x1r = c0[0], x1i = c0[1];
      TYPE x2r = c1[0], x2i = c1[1];
      TYPE x3r = c2[0], x3i = c2[1];
      TYPE re;   /* new real part, held until the imag part is computed */
/*
 *    Real-arithmetic equivalent of the recurrence:
 *       x3 = x3 * a33;
 *       x2 = (x2 - x3*a32) * a22;
 *       x1 = (x1 - x3*a31 - x2*a21) * a11;
 */
      re  = a33r*x3r - a33i*x3i;
      x3i = a33r*x3i + a33i*x3r;
      x3r = re;

      x2r -= x3r*a32r - x3i*a32i;
      x2i -= x3r*a32i + x3i*a32r;
      re  = x2r*a22r - x2i*a22i;     ATL_pfl1W(c2+PF_AHEAD);
      x2i = x2r*a22i + x2i*a22r;     ATL_pfl1W(c1+PF_AHEAD);
      x2r = re;

      x1r -= x3r*a31r - x3i*a31i;
      x1i -= x3r*a31i + x3i*a31r;
      x1r -= x2r*a21r - x2i*a21i;
      x1i -= x2r*a21i + x2i*a21r;    /* no ATL_pfl1W is issued for c0 here */
      re  = x1r*a11r - x1i*a11i;
      x1i = x1r*a11i + x1i*a11r;
      x1r = re;

      /* commit the row (last unknown first), then step to the next row */
      c2[0] = x3r; c2[1] = x3i;
      c1[0] = x2r; c1[1] = x2i;
      c0[0] = x1r; c0[1] = x1i;
      c2 += 2; c1 += 2; c0 += 2;
      left--;
   }
}
static void trsmRU_3(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * Side='Right', Uplo='Upper' solve against a 3x3 complex triangle, unrolled
 * into scalars so that every data dependence is explicit for the compiler.
 * Complex values are adjacent (real, imag) pairs.  The three unknowns of a
 * row live in three streams of B that start 2*ldb TYPE elements apart; all M
 * rows are overwritten in place.
 * Per the original contract, A has 1/alpha already applied and its diagonal
 * already inverted, which is why the diagonal is applied by multiplication.
 */
{
   /* upper triangle of A, 3 complex (6 TYPE) per column */
   const TYPE a11r = A[0],  a11i = A[1];
   const TYPE a12r = A[6],  a12i = A[7];
   const TYPE a22r = A[8],  a22i = A[9];
   const TYPE a13r = A[12], a13i = A[13];
   const TYPE a23r = A[14], a23i = A[15];
   const TYPE a33r = A[16], a33i = A[17];
   const int colgap = ldb + ldb;     /* TYPE elements between streams of B */
   /* offset handed to ATL_pfl1W (defined elsewhere; presumably a prefetch) */
   enum { PF_AHEAD = 8 };
   TYPE *c0 = B;
   TYPE *c1 = c0 + colgap;
   TYPE *c2 = c1 + colgap;
   int left = M;

   while (left)
   {
      TYPE x1r = c0[0], x1i = c0[1];
      TYPE x2r = c1[0], x2i = c1[1];
      TYPE x3r = c2[0], x3i = c2[1];
      TYPE re;   /* new real part, held until the imag part is computed */
/*
 *    Real-arithmetic equivalent of the recurrence:
 *       x1 = x1 * a11;
 *       x2 = (x2 - x1*a12) * a22;
 *       x3 = (x3 - x1*a13 - x2*a23) * a33;
 */
      re  = x1r*a11r - x1i*a11i;
      x1i = x1r*a11i + x1i*a11r;
      x1r = re;

      x2r -= x1r*a12r - x1i*a12i;
      x2i -= x1r*a12i + x1i*a12r;
      re  = x2r*a22r - x2i*a22i;     ATL_pfl1W(c0+PF_AHEAD);
      x2i = x2r*a22i + x2i*a22r;     ATL_pfl1W(c1+PF_AHEAD);
      x2r = re;

      x3r -= x1r*a13r - x1i*a13i;
      x3i -= x1r*a13i + x1i*a13r;
      x3r -= x2r*a23r - x2i*a23i;
      x3i -= x2r*a23i + x2i*a23r;    ATL_pfl1W(c2+PF_AHEAD);
      re  = x3r*a33r - x3i*a33i;
      x3i = x3r*a33i + x3i*a33r;
      x3r = re;

      /* commit the row (first unknown first), then step to the next row */
      c0[0] = x1r; c0[1] = x1i;
      c1[0] = x2r; c1[1] = x2i;
      c2[0] = x3r; c2[1] = x3i;
      c0 += 2; c1 += 2; c2 += 2;
      left--;
   }
}
static void trsmLU_2(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Upper', with 1 col prefetch, written with all dependencies shown,
 * so that compiler can optimize.
 * A is known to be 2x2, with 1/alpha already applied, diagonals already
 * inverted.
 *
 * Each column of B holds two complex values as (real, imag) pairs in
 * B[0..3]; consecutive columns start 2*ldb TYPE elements apart.  Every one of
 * the N columns is overwritten in place by back substitution:
 *    x2 = x2 * a22;
 *    x1 = (x1 - a12*x2) * a11;
 * N < 1 is a no-op: without the guard the N-1 trip count would start at a
 * negative value and the loop would run far past the end of B.
 */
{
   const TYPE ar11=*A, ai11=A[1], ar12=A[4], ai12=A[5];
   const TYPE ar22=A[6], ai22=A[7];
   TYPE xr1, xi1, xr2, xi2;
   TYPE t0, p0;
   const int ldb2=ldb+ldb;
   TYPE *bn;
   const int pfd=ldb2+ldb2;
   int j;

   if (N < 1)   /* empty panel: touch nothing */
      return;

   bn = B + ldb2;   /* start of the next column */
   p0 = B[2];       /* first value needed by the first column */
   for (j=N-1; j; j--) /* stop 1 iteration early to stop prefetch */
   {
      xr2 = p0  ; xi2 = B[3];
      xr1 = *B  ; xi1 = B[1];

      t0 = xr2;
      xr2 = ar22*xr2 - ai22*xi2;
      xi2 = ar22*xi2 + ai22*t0;     p0 = bn[2];  /* early load for next col */

      xr1 -= ar12*xr2 - ai12*xi2;
      xi1 -= ar12*xi2 + ai12*xr2;   ATL_pfl1W(bn+pfd);
      t0 = xr1;
      xr1 = ar11*xr1 - ai11*xi1;
      xi1 = ar11*xi1 + ai11*t0;

      *B   = xr1; B[1] = xi1;
      B[2] = xr2; B[3] = xi2;
      B = bn;
      bn += ldb2;
   }
/*
 * Last column: same arithmetic, but no load from, or prefetch of, a column
 * beyond the panel.
 */
   xr2 = p0  ; xi2 = B[3];
   xr1 = *B  ; xi1 = B[1];

   t0 = xr2;
   xr2 = ar22*xr2 - ai22*xi2;
   xi2 = ar22*xi2 + ai22*t0;

   xr1 -= ar12*xr2 - ai12*xi2;
   xi1 -= ar12*xi2 + ai12*xr2;
   t0 = xr1;
   xr1 = ar11*xr1 - ai11*xi1;
   xi1 = ar11*xi1 + ai11*t0;

   *B   = xr1; B[1] = xi1;
   B[2] = xr2; B[3] = xi2;
}
static void trsmLL_2(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Lower', with 1 column prefetch, written with all dependencies
 * shown, so that the compiler can optimize.
 * A is known to be 2x2, with 1/alpha already applied, diagonals already
 * inverted.
 *
 * Each column of B holds two complex values as (real, imag) pairs in
 * B[0..3]; consecutive columns start 2*ldb TYPE elements apart.  Every one of
 * the N columns is overwritten in place by forward substitution:
 *    x1 = x1 * a11;
 *    x2 = (x2 - a21*x1) * a22;
 * N < 1 is a no-op: without the guard the N-1 trip count would start at a
 * negative value and the loop would run far past the end of B.
 */
{
   const TYPE ar11=*A, ai11=A[1], ar21=A[2], ai21=A[3];
   const TYPE ar22=A[6], ai22=A[7];
   const int ldb2 = ldb+ldb;
   TYPE xr1, xi1, xr2, xi2;
   TYPE t0, p0;
   TYPE *pBn;
   const int pfd=ldb2+ldb2;
   int j;

   if (N < 1)   /* empty panel: touch nothing */
      return;

   pBn = B + ldb2;   /* start of the next column */
   p0 = *B;          /* first value needed by the first column */
   for (j=N-1; j; j--) /* last column is peeled so nothing past it is read */
   {
      xr1 = p0; xi1 = B[1];
      xr2 = B[2]; xi2 = B[3];

      t0 = xr1;
      xr1 = ar11 * xr1 - ai11 * xi1;
      xi1 = ar11 * xi1 + ai11 * t0;     p0 = *pBn;  /* early load, next col */

      xr2 -= ar21*xr1 - ai21*xi1;
      xi2 -= ar21*xi1 + ai21*xr1;       ATL_pfl1W(pBn+pfd);
      t0 = xr2;
      xr2 = ar22*xr2 - ai22*xi2;
      xi2 = ar22*xi2 + ai22*t0;

      *B   = xr1; B[1] = xi1;
      B[2] = xr2; B[3] = xi2;
      B = pBn;
      pBn += ldb2;
   }
/*
 * Last column: same arithmetic, but no load from, or prefetch of, a column
 * beyond the panel.
 */
   xr1 = p0; xi1 = B[1];
   xr2 = B[2]; xi2 = B[3];

   t0 = xr1;
   xr1 = ar11 * xr1 - ai11 * xi1;
   xi1 = ar11 * xi1 + ai11 * t0;

   xr2 -= ar21*xr1 - ai21*xi1;
   xi2 -= ar21*xi1 + ai21*xr1;
   t0 = xr2;
   xr2 = ar22*xr2 - ai22*xi2;
   xi2 = ar22*xi2 + ai22*t0;

   *B   = xr1; B[1] = xi1;
   B[2] = xr2; B[3] = xi2;
}
static void trsmRL_2(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * Side='Right', Uplo='Lower' solve against a 2x2 complex triangle, unrolled
 * into scalars so that every data dependence is explicit for the compiler.
 * Complex values are adjacent (real, imag) pairs.  The two unknowns of a row
 * live in two streams of B that start 2*ldb TYPE elements apart; all M rows
 * are overwritten in place.
 * Per the original contract, A has 1/alpha already applied and its diagonal
 * already inverted, which is why the diagonal is applied by multiplication.
 */
{
   /* lower triangle of A, 2 complex (4 TYPE) per column */
   const TYPE a11r = A[0], a11i = A[1];
   const TYPE a21r = A[2], a21i = A[3];
   const TYPE a22r = A[6], a22i = A[7];
   /* offset handed to ATL_pfl1W (defined elsewhere; presumably a prefetch) */
   enum { PF_AHEAD = 8 };
   TYPE *c0 = B;
   TYPE *c1 = c0 + (ldb + ldb);
   int left = M;

   while (left)
   {
      TYPE x1r = c0[0], x1i = c0[1];
      TYPE x2r = c1[0], x2i = c1[1];
      TYPE re;   /* new real part, held until the imag part is computed */
/*
 *    Real-arithmetic equivalent of the recurrence:
 *       x2 = x2 * a22;
 *       x1 = (x1 - x2*a21) * a11;
 */
      re  = x2r*a22r - x2i*a22i;
      x2i = x2r*a22i + x2i*a22r;
      x2r = re;

      x1r -= x2r*a21r - x2i*a21i;    ATL_pfl1W(c1+PF_AHEAD);
      x1i -= x2r*a21i + x2i*a21r;    ATL_pfl1W(c0+PF_AHEAD);
      re  = x1r*a11r - x1i*a11i;
      x1i = x1r*a11i + x1i*a11r;
      x1r = re;

      /* commit the row (last unknown first), then step to the next row */
      c1[0] = x2r; c1[1] = x2i;
      c0[0] = x1r; c0[1] = x1i;
      c1 += 2; c0 += 2;
      left--;
   }
}
static void trsmRU_2(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * Side='Right', Uplo='Upper' solve against a 2x2 complex triangle, unrolled
 * into scalars so that every data dependence is explicit for the compiler.
 * Complex values are adjacent (real, imag) pairs.  The two unknowns of a row
 * live in two streams of B that start 2*ldb TYPE elements apart; all M rows
 * are overwritten in place.
 * Per the original contract, A has 1/alpha already applied and its diagonal
 * already inverted, which is why the diagonal is applied by multiplication.
 */
{
   /* upper triangle of A, 2 complex (4 TYPE) per column */
   const TYPE a11r = A[0], a11i = A[1];
   const TYPE a12r = A[4], a12i = A[5];
   const TYPE a22r = A[6], a22i = A[7];
   /* offset handed to ATL_pfl1W (defined elsewhere; presumably a prefetch) */
   enum { PF_AHEAD = 8 };
   TYPE *c0 = B;
   TYPE *c1 = c0 + (ldb + ldb);
   int left = M;

   while (left)
   {
      TYPE x1r = c0[0], x1i = c0[1];
      TYPE x2r = c1[0], x2i = c1[1];
      TYPE re;   /* new real part, held until the imag part is computed */
/*
 *    Real-arithmetic equivalent of the recurrence:
 *       x1 = x1 * a11;
 *       x2 = (x2 - x1*a12) * a22;
 */
      re  = x1r*a11r - x1i*a11i;
      x1i = x1r*a11i + x1i*a11r;
      x1r = re;

      x2r -= x1r*a12r - x1i*a12i;
      x2i -= x1r*a12i + x1i*a12r;    ATL_pfl1W(c0+PF_AHEAD);
      ATL_pfl1W(c1+PF_AHEAD);
      re  = x2r*a22r - x2i*a22i;
      x2i = x2r*a22i + x2i*a22r;
      x2r = re;

      /* commit the row (first unknown first), then step to the next row */
      c0[0] = x1r; c0[1] = x1i;
      c1[0] = x2r; c1[1] = x2i;
      c0 += 2; c1 += 2;
      left--;
   }
}
/* Example #7 */
void ATL_USERMM
   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with TA=T, TB=N, lda=KB, ldb=KB, ldc=0, mu=4, nu=4, ku=8
 *
 * Register-blocked, software-pipelined kernel.  Each pass of the M-loop
 * builds a 4x4 tile of C in rC<i>_<j> (i selects one of four KB-strided
 * streams of A, j one of four KB-strided streams of B and one of the four C
 * column pointers pC<j>) by accumulating products over the K dimension, eight
 * k-steps per unrolled group.
 *
 * What the visible code relies on:
 *  - K, alpha, lda and ldb are never referenced; the K extent and the A/B
 *    strides come from the KB macros.
 *  - KB, KB2..KB7, SHIFT, ATL_rzero, ATL_pfl1R and ATL_pfl1W are defined
 *    elsewhere.  NOTE(review): KBn is presumably n*KB, and SHIFT presumably
 *    scales an index for interleaved complex storage -- confirm.
 *  - Both outer loops are do/while with a != exit test and advance by four
 *    streams per pass, so they run at least once; M and N therefore look
 *    required to be positive multiples of 4, and KB a multiple of 8 (TODO
 *    confirm against the caller).
 *  - beta handling is chosen at compile time: BETA0 starts the tile at
 *    ATL_rzero without reading C; BETAX loads C and scales it by beta;
 *    otherwise C is loaded and accumulated into unscaled.
 *  - With TREAL the tile touches C offsets 0..3 of each column; otherwise it
 *    touches offsets 0,2,4,6 (every second element).
 */
{
   /* end markers: the M-loop stops at stM, the N-loop at stN */
   const TYPE *stM = A + KB*M;
   const TYPE *stN = B + KB*N;
   /*
    * incAm: applied after the peeled last k-group to step pA0 on to the next
    *        four streams of A.
    * incAn: rewinds pA0 by KB*M, i.e. back to A, once a whole sweep over M
    *        is finished.
    */
   const int incAm = KB3+8, incAn = -KB*M;
   /* incBm: rewinds pB0 to the start of the same four streams of B */
   const int incBm = 8-KB;
   /* incBn: step to the next four streams of B after a full sweep over M */
   #define incBn KB4
   /* incCn: from the end of the current four C columns to the next four */
   const int incCn = ((ldc<<2) - M)SHIFT;
   /* trips of the 8-way unrolled k-loop; the final group of 8 is peeled */
   const int Kstart=(KB>>3)-1;
   /* four consecutive columns of C */
   TYPE *pC0=C, *pC1=pC0+(ldc SHIFT), *pC2=pC1+(ldc SHIFT),
        *pC3=pC2+(ldc SHIFT);
   const TYPE *pA0=A;
   const TYPE *pB0=B;
   register int k;
   /* current operands from the four A streams and the four B streams */
   register TYPE rA0, rA1, rA2, rA3;
   register TYPE rB0, rB1, rB2, rB3;
   /* products computed one step ahead of the add that consumes them */
   register TYPE m0, m1, m2, m3;
   /* the 4x4 accumulator tile */
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC0_1, rC1_1, rC2_1, rC3_1,
                   rC0_2, rC1_2, rC2_2, rC3_2, rC0_3, rC1_3, rC2_3, rC3_3;
   do /* N-loop */
   {
      /* touch the first 16 elements of each of the four B streams */
      ATL_pfl1R(pB0); ATL_pfl1R(pB0+KB); ATL_pfl1R(pB0+KB2); ATL_pfl1R(pB0+KB3);
ATL_pfl1R(pB0+8); ATL_pfl1R(pB0+KB+8); ATL_pfl1R(pB0+KB2+8); ATL_pfl1R(pB0+KB3+8);
      do /* M-loop */
      {
         /* initialise the accumulator tile according to the beta case */
         #ifdef BETA0
            rC0_0 = rC1_0 = rC2_0 = rC3_0 =
            rC0_1 = rC1_1 = rC2_1 = rC3_1 =
            rC0_2 = rC1_2 = rC2_2 = rC3_2 =
            rC0_3 = rC1_3 = rC2_3 = rC3_3 = ATL_rzero;
/* ATL_pfl1R(pB0+8); ATL_pfl1R(pB0+KB+8); ATL_pfl1R(pB0+KB2+8); ATL_pfl1R(pB0+KB3+8); */
         #else
            #ifdef TREAL
               rC0_0 = *pC0; rC0_1 = *pC1; rC0_2 = *pC2; rC0_3 = *pC3;
               rC1_0 = pC0[1]; rC1_1 = pC1[1]; rC1_2 = pC2[1]; rC1_3 = pC3[1];
               rC2_0 = pC0[2]; rC2_1 = pC1[2]; rC2_2 = pC2[2]; rC2_3 = pC3[2];
               rC3_0 = pC0[3]; rC3_1 = pC1[3]; rC3_2 = pC2[3]; rC3_3 = pC3[3];
            #else
               rC0_0 = *pC0; rC0_1 = *pC1; rC0_2 = *pC2; rC0_3 = *pC3;
               rC1_0 = pC0[2]; rC1_1 = pC1[2]; rC1_2 = pC2[2]; rC1_3 = pC3[2];
               rC2_0 = pC0[4]; rC2_1 = pC1[4]; rC2_2 = pC2[4]; rC2_3 = pC3[4];
               rC3_0 = pC0[6]; rC3_1 = pC1[6]; rC3_2 = pC2[6]; rC3_3 = pC3[6];
            #endif
            #ifdef BETAX
               /* rB3 is borrowed to hold beta; it is reloaded from B below */
               rB3 = beta;
               rC0_0 *= rB3; rC0_1 *= rB3; rC0_2 *= rB3; rC0_3 *= rB3;
/* ATL_pfl1R(pB0+8); */
               rC1_0 *= rB3; rC1_1 *= rB3; rC1_2 *= rB3; rC1_3 *= rB3;
/* ATL_pfl1R(pB0+KB+8); */
               rC2_0 *= rB3; rC2_1 *= rB3; rC2_2 *= rB3; rC2_3 *= rB3;
/* ATL_pfl1R(pB0+KB2+8); */
               rC3_0 *= rB3; rC3_1 *= rB3; rC3_2 *= rB3; rC3_3 *= rB3;
/* ATL_pfl1R(pB0+KB3+8); */
            #else
/* ATL_pfl1R(pB0+8); ATL_pfl1R(pB0+KB+8); ATL_pfl1R(pB0+KB2+8); ATL_pfl1R(pB0+KB3+8); */
            #endif
         #endif
/*
 *       Start pipeline
 */
         rA0 = *pA0; rB0 = *pB0;
         rA1 = pA0[KB]; rA2 = pA0[KB2];
         m0 = rA0 * rB0; rA3 = pA0[KB3];
         m1 = rA1 * rB0; rB1 = pB0[KB];
         m2 = rA2 * rB0; rB2 = pB0[KB2];
         m3 = rA3 * rB0; rB3 = pB0[KB3];

/*
 *       Steady state: each trip consumes 8 k-steps.  Every statement adds the
 *       previously formed product into the tile and forms the next one, with
 *       the loads for the following k-step interleaved between them.
 */
         for (k=Kstart; k; k--)
         {
            rC0_0 += m0; m0 = rA0 * rB1; rB0 = pB0[1];
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+1];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2;                   ATL_pfl1R(pA0+KB4);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+1];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[1];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+1];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+1];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+1];

            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+1];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[2];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+2];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2;                   ATL_pfl1R(pA0+KB5);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+2];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[2];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+2];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+2];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+2];

            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+2];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[3];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+3];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2;                   ATL_pfl1R(pA0+KB6);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+3];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[3];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+3];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+3];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+3];


            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+3];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[4];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+4];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2;                   ATL_pfl1R(pA0+KB7);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+4];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[4];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+4];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+4];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+4];

            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+4];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[5];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+5];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2;                   ATL_pfl1R(pB0+16);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+5];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[5];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+5];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+5];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+5];

            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+5];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[6];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+6];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2;                   ATL_pfl1R(pB0+KB+16);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+6];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[6];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+6];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+6];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+6];

            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+6];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[7];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+7];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pB0+KB2+16);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+7];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[7];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+7];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+7];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+7];

            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+7];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[8];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+8];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pB0+KB3+16);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+8];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[8];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+8];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+8];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+8];

            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+8];
            rC1_3 += m1; m1 = rA1 * rB0; pA0 += 8;
            rC2_3 += m2; m2 = rA2 * rB0; pB0 += 8;
            rC3_3 += m3; m3 = rA3 * rB0;
         }
/*
 *       Peeled last group of 8 k-steps: same schedule as the loop body, but
 *       it stops loading at offset 7, issues different ATL_pfl1R/ATL_pfl1W
 *       hints, and moves pA0/pB0 on for the next tile before draining.
 */
         rC0_0 += m0; m0 = rA0 * rB1; rB0 = pB0[1];
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+1];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pA0+KB4);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+1];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[1];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+1];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+1];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+1];

         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+1];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[2];
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+2];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pA0+KB5);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+2];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[2];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+2];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+2];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+2];

         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+2];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[3];
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+3];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pA0+KB6);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+3];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[3];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+3];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+3];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+3];

         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+3];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[4];
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+4];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pA0+KB7);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+4];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[4];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+4];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+4];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+4];

         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+4];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[5];
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+5];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pB0-KB+8);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+5];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[5];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+5];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+5];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+5];

         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+5];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[6];
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+6];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pB0+8);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+6];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[6];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+6];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+6];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+6];

         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+6];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[7];
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+7];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2;                  ATL_pfl1R(pB0+KB+8);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+7];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[7];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+7];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+7];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+7];

         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+7];
         rC1_3 += m1; m1 = rA1 * rB0;                 pB0 += incBm;
         rC2_3 += m2; m2 = rA2 * rB0;                 pA0 += incAm;
         rC3_3 += m3; m3 = rA3 * rB0; ATL_pfl1W(pC0);
/*
 *       Drain pipe on last iteration of K-loop
 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1W(pC1);
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; ATL_pfl1W(pC2);
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1W(pC3);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; ATL_pfl1R(pC0+4);
         rC0_2 += m0; m0 = rA0 * rB3;
         rC1_2 += m1; m1 = rA1 * rB3; ATL_pfl1R(pC1+4);
         rC2_2 += m2; m2 = rA2 * rB3;
         rC3_2 += m3; m3 = rA3 * rB3; ATL_pfl1R(pC2+4);
         rC0_3 += m0;
         rC1_3 += m1;                 ATL_pfl1R(pC3+4);
         rC2_3 += m2;
         rC3_3 += m3;                 ATL_pfl1R(pB0+KB3+8);
         /* write the finished tile back and step down the four C columns */
         #ifdef TREAL
         *pC0 = rC0_0; pC0[1] = rC1_0; pC0[2] = rC2_0; pC0[3] = rC3_0; pC0 += 4;
         *pC1 = rC0_1; pC1[1] = rC1_1; pC1[2] = rC2_1; pC1[3] = rC3_1; pC1 += 4;
         *pC2 = rC0_2; pC2[1] = rC1_2; pC2[2] = rC2_2; pC2[3] = rC3_2; pC2 += 4;
         *pC3 = rC0_3; pC3[1] = rC1_3; pC3[2] = rC2_3; pC3[3] = rC3_3; pC3 += 4;
         #else
         *pC0 = rC0_0; pC0[2] = rC1_0; pC0[4] = rC2_0; pC0[6] = rC3_0; pC0 += 8;
         *pC1 = rC0_1; pC1[2] = rC1_1; pC1[4] = rC2_1; pC1[6] = rC3_1; pC1 += 8;
         *pC2 = rC0_2; pC2[2] = rC1_2; pC2[4] = rC2_2; pC2[6] = rC3_2; pC2 += 8;
         *pC3 = rC0_3; pC3[2] = rC1_3; pC3[4] = rC2_3; pC3[6] = rC3_3; pC3 += 8;
         #endif
      }
      while(pA0 != stM);
      /* next four columns of C and B; rewind A to its start */
      pC0 += incCn; pC1 += incCn; pC2 += incCn; pC3 += incCn;
      pA0 += incAn; pB0 += incBn;
   }
   while(pB0 != stN);
}
static void trsmLU_3(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Upper', with 1 col prefetch, written with all dependencies shown,
 * so that compiler can optimize.
 * A is known to be 3x3, with 1/alpha already applied, diagonals already
 * inverted.
 *
 * Each column of B holds three complex values as (real, imag) pairs in
 * B[0..5]; consecutive columns start 2*ldb TYPE elements apart.  Every one of
 * the N columns is overwritten in place by back substitution:
 *    x3 = x3 * a33;
 *    x2 = (x2 - a23*x3) * a22;
 *    x1 = (x1 - a13*x3 - a12*x2) * a11;
 * N < 1 is a no-op: without the guard the N-1 trip count would start at a
 * negative value and the loop would run far past the end of B.
 */
{
   const TYPE ar11=*A, ai11=A[1], ar12=A[6], ai12=A[7], ar13=A[12], ai13=A[13];
   const TYPE ar22=A[ 8], ai22=A[ 9], ar23=A[14], ai23=A[15];
   const TYPE ar33=A[16], ai33=A[17];
   TYPE xr1, xi1, xr2, xi2, xr3, xi3;
   TYPE t0, p0;
   const int ldb2=ldb+ldb;
   TYPE *bn;
   const int pfd=ldb2+ldb2;
   int j;

   if (N < 1)   /* empty panel: touch nothing */
      return;

   bn = B + ldb2;   /* start of the next column */
   p0 = B[4];       /* first value needed by the first column */
   for (j=N-1; j; j--) /* last column is peeled so nothing past it is read */
   {
      xr3 = p0  ; xi3 = B[5];
      xr1 = *B  ; xi1 = B[1];
      xr2 = B[2]; xi2 = B[3];

      t0 = xr3;
      xr3 = ar33*xr3 - ai33*xi3;
      xi3 = ar33*xi3 + ai33*t0;

      xr2 -= ar23*xr3 - ai23*xi3;     p0 = bn[4];  /* early load, next col */
      xi2 -= ar23*xi3 + ai23*xr3;
      t0 = xr2;
      xr2 = ar22*xr2 - ai22*xi2;
      xi2 = ar22*xi2 + ai22*t0;

      xr1 -= ar13*xr3 - ai13*xi3;     ATL_pfl1W(bn+pfd);
      xi1 -= ar13*xi3 + ai13*xr3;     ATL_pfl1W(bn+pfd+4);
      xr1 -= ar12*xr2 - ai12*xi2;
      xi1 -= ar12*xi2 + ai12*xr2;
      t0 = xr1;
      xr1 = ar11*xr1 - ai11*xi1;
      xi1 = ar11*xi1 + ai11*t0;

      *B   = xr1; B[1] = xi1;
      B[2] = xr2; B[3] = xi2;
      B[4] = xr3; B[5] = xi3;
      B = bn;
      bn += ldb2;
   }
/*
 * Last column: same arithmetic, but no load from, or prefetch of, a column
 * beyond the panel.
 */
   xr3 = p0  ; xi3 = B[5];
   xr1 = *B  ; xi1 = B[1];
   xr2 = B[2]; xi2 = B[3];

   t0 = xr3;
   xr3 = ar33*xr3 - ai33*xi3;
   xi3 = ar33*xi3 + ai33*t0;

   xr2 -= ar23*xr3 - ai23*xi3;
   xi2 -= ar23*xi3 + ai23*xr3;
   t0 = xr2;
   xr2 = ar22*xr2 - ai22*xi2;
   xi2 = ar22*xi2 + ai22*t0;

   xr1 -= ar13*xr3 - ai13*xi3;
   xi1 -= ar13*xi3 + ai13*xr3;
   xr1 -= ar12*xr2 - ai12*xi2;
   xi1 -= ar12*xi2 + ai12*xr2;
   t0 = xr1;
   xr1 = ar11*xr1 - ai11*xi1;
   xi1 = ar11*xi1 + ai11*t0;

   *B   = xr1; B[1] = xi1;
   B[2] = xr2; B[3] = xi2;
   B[4] = xr3; B[5] = xi3;
}
static void trsmRL_4(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * Side='Right', Uplo='Lower' solve against a 4x4 complex triangle, unrolled
 * into scalars so that every data dependence is explicit for the compiler.
 * Complex values are adjacent (real, imag) pairs.  The four unknowns of a
 * row live in four streams of B that start 2*ldb TYPE elements apart; all M
 * rows are overwritten in place.
 * Per the original contract, A has 1/alpha already applied and its diagonal
 * already inverted, which is why the diagonal is applied by multiplication.
 */
{
   /* lower triangle of A, 4 complex (8 TYPE) per column */
   const TYPE a11r = A[0],  a11i = A[1];
   const TYPE a21r = A[2],  a21i = A[3];
   const TYPE a31r = A[4],  a31i = A[5];
   const TYPE a41r = A[6],  a41i = A[7];
   const TYPE a22r = A[10], a22i = A[11];
   const TYPE a32r = A[12], a32i = A[13];
   const TYPE a42r = A[14], a42i = A[15];
   const TYPE a33r = A[20], a33i = A[21];
   const TYPE a43r = A[22], a43i = A[23];
   const TYPE a44r = A[30], a44i = A[31];
   const int colgap = ldb + ldb;     /* TYPE elements between streams of B */
   /* offset handed to ATL_pfl1W (defined elsewhere; presumably a prefetch) */
   enum { PF_AHEAD = 8 };
   TYPE *c0 = B;
   TYPE *c1 = c0 + colgap;
   TYPE *c2 = c1 + colgap;
   TYPE *c3 = c2 + colgap;
   int left = M;

   while (left)
   {
      TYPE x1r = c0[0], x1i = c0[1];
      TYPE x2r = c1[0], x2i = c1[1];
      TYPE x3r = c2[0], x3i = c2[1];
      TYPE x4r = c3[0], x4i = c3[1];
      TYPE re;   /* new real part, held until the imag part is computed */
/*
 *    Real-arithmetic equivalent of the recurrence:
 *       x4 = x4 * a44;
 *       x3 = (x3 - x4*a43) * a33;
 *       x2 = (x2 - x4*a42 - x3*a32) * a22;
 *       x1 = (x1 - x4*a41 - x3*a31 - x2*a21) * a11;
 */
      re  = a44r*x4r - a44i*x4i;
      x4i = a44r*x4i + a44i*x4r;
      x4r = re;

      x3r -= x4r*a43r - x4i*a43i;
      x3i -= x4r*a43i + x4i*a43r;
      re  = a33r*x3r - a33i*x3i;
      x3i = a33r*x3i + a33i*x3r;
      x3r = re;

      x2r -= x4r*a42r - x4i*a42i;
      x2i -= x4r*a42i + x4i*a42r;    ATL_pfl1W(c3+PF_AHEAD);
      x2r -= x3r*a32r - x3i*a32i;    ATL_pfl1W(c2+PF_AHEAD);
      x2i -= x3r*a32i + x3i*a32r;
      re  = x2r*a22r - x2i*a22i;
      x2i = x2r*a22i + x2i*a22r;
      x2r = re;

      x1r -= x4r*a41r - x4i*a41i;
      x1i -= x4r*a41i + x4i*a41r;
      x1r -= x3r*a31r - x3i*a31i;
      x1i -= x3r*a31i + x3i*a31r;
      x1r -= x2r*a21r - x2i*a21i;    ATL_pfl1W(c1+PF_AHEAD);
      x1i -= x2r*a21i + x2i*a21r;    ATL_pfl1W(c0+PF_AHEAD);
      re  = x1r*a11r - x1i*a11i;
      x1i = x1r*a11i + x1i*a11r;
      x1r = re;

      /* commit the row (last unknown first), then step to the next row */
      c3[0] = x4r; c3[1] = x4i;
      c2[0] = x3r; c2[1] = x3i;
      c1[0] = x2r; c1[1] = x2i;
      c0[0] = x1r; c0[1] = x1i;
      c3 += 2; c2 += 2; c1 += 2; c0 += 2;
      left--;
   }
}
static void trsmRU_4(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * Side='Right', Uplo='Upper' solve against a 4x4 complex triangle, unrolled
 * into scalars so that every data dependence is explicit for the compiler.
 * Complex values are adjacent (real, imag) pairs.  The four unknowns of a
 * row live in four streams of B that start 2*ldb TYPE elements apart; all M
 * rows are overwritten in place.
 * Per the original contract, A has 1/alpha already applied and its diagonal
 * already inverted, which is why the diagonal is applied by multiplication.
 */
{
   /* upper triangle of A, 4 complex (8 TYPE) per column */
   const TYPE a11r = A[0],  a11i = A[1];
   const TYPE a12r = A[8],  a12i = A[9];
   const TYPE a22r = A[10], a22i = A[11];
   const TYPE a13r = A[16], a13i = A[17];
   const TYPE a23r = A[18], a23i = A[19];
   const TYPE a33r = A[20], a33i = A[21];
   const TYPE a14r = A[24], a14i = A[25];
   const TYPE a24r = A[26], a24i = A[27];
   const TYPE a34r = A[28], a34i = A[29];
   const TYPE a44r = A[30], a44i = A[31];
   const int colgap = ldb + ldb;     /* TYPE elements between streams of B */
   /* offset handed to ATL_pfl1W (defined elsewhere; presumably a prefetch) */
   enum { PF_AHEAD = 8 };
   TYPE *c0 = B;
   TYPE *c1 = c0 + colgap;
   TYPE *c2 = c1 + colgap;
   TYPE *c3 = c2 + colgap;
   int left = M;

   while (left)
   {
      TYPE x1r = c0[0], x1i = c0[1];
      TYPE x2r = c1[0], x2i = c1[1];
      TYPE x3r = c2[0], x3i = c2[1];
      TYPE x4r = c3[0], x4i = c3[1];
      TYPE re;   /* new real part, held until the imag part is computed */
/*
 *    Real-arithmetic equivalent of the recurrence:
 *       x1 = x1 * a11;
 *       x2 = (x2 - x1*a12) * a22;
 *       x3 = (x3 - x1*a13 - x2*a23) * a33;
 *       x4 = (x4 - x1*a14 - x2*a24 - x3*a34) * a44;
 */
      re  = x1r*a11r - x1i*a11i;
      x1i = x1r*a11i + x1i*a11r;
      x1r = re;

      x2r -= x1r*a12r - x1i*a12i;
      x2i -= x1r*a12i + x1i*a12r;
      re  = x2r*a22r - x2i*a22i;
      x2i = x2r*a22i + x2i*a22r;
      x2r = re;

      x3r -= x1r*a13r - x1i*a13i;
      x3i -= x1r*a13i + x1i*a13r;
      x3r -= x2r*a23r - x2i*a23i;    ATL_pfl1W(c0+PF_AHEAD);
      x3i -= x2r*a23i + x2i*a23r;    ATL_pfl1W(c1+PF_AHEAD);
      re  = x3r*a33r - x3i*a33i;
      x3i = x3r*a33i + x3i*a33r;
      x3r = re;

      x4r -= x1r*a14r - x1i*a14i;
      x4i -= x1r*a14i + x1i*a14r;
      x4r -= x2r*a24r - x2i*a24i;
      x4i -= x2r*a24i + x2i*a24r;
      x4r -= x3r*a34r - x3i*a34i;    ATL_pfl1W(c2+PF_AHEAD);
      x4i -= x3r*a34i + x3i*a34r;    ATL_pfl1W(c3+PF_AHEAD);
      re  = x4r*a44r - x4i*a44i;
      x4i = x4r*a44i + x4i*a44r;
      x4r = re;

      /* commit the row (first unknown first), then step to the next row */
      c0[0] = x1r; c0[1] = x1i;
      c1[0] = x2r; c1[1] = x2i;
      c2[0] = x3r; c2[1] = x3i;
      c3[0] = x4r; c3[1] = x4i;
      c0 += 2; c1 += 2; c2 += 2; c3 += 2;
      left--;
   }
}
static void trsmLL_4(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Lower' complex triangular solve done by forward substitution.
 * Each of the N columns of B holds 4 complex values (8 TYPE entries) and is
 * overwritten with its solution.
 * A is known to be 4x4, with 1/alpha already applied and its diagonal already
 * inverted, so every diagonal step below is a complex multiply.
 * While one column is being solved, the first entry of the following column
 * is loaded and ATL_pfl1W is issued two columns beyond that one.  The last
 * column is handled after the loop, without the look-ahead load or prefetch.
 */
{
/*
 * a<row><col> : real part read from A[2*(row-1) + 8*(col-1)], imaginary part
 * from the entry right after it.
 */
   const TYPE ar11=A[0],  ai11=A[1];
   const TYPE ar21=A[2],  ai21=A[3],  ar22=A[10], ai22=A[11];
   const TYPE ar31=A[4],  ai31=A[5],  ar32=A[12], ai32=A[13],
              ar33=A[20], ai33=A[21];
   const TYPE ar41=A[6],  ai41=A[7],  ar42=A[14], ai42=A[15],
              ar43=A[22], ai43=A[23], ar44=A[30], ai44=A[31];
   const int colStride = ldb+ldb;            /* TYPE entries per column of B */
   const int pfDist = colStride+colStride;   /* prefetch offset from pNext */
   TYPE *pNext = B+colStride;
   TYPE x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
   TYPE tmp, head;
   int colsLeft;

/*
 * x *= d  (complex), d being an already-inverted diagonal entry
 */
   #define LL4_SCALE(xr_, xi_, dr_, di_) \
      do { \
         tmp = xr_; \
         xr_ = dr_*xr_ - di_*xi_; \
         xi_ = dr_*xi_ + di_*tmp; \
      } while (0)
/*
 * y -= a * x  (complex)
 */
   #define LL4_ELIM(yr_, yi_, ar_, ai_, xr_, xi_) \
      do { \
         yr_ -= ar_*xr_ - ai_*xi_; \
         yi_ -= ar_*xi_ + ai_*xr_; \
      } while (0)

   head = B[0];
   for (colsLeft=N-1; colsLeft != 0; colsLeft--)
   {
      x1r = head; x1i = B[1];
      x2r = B[2]; x2i = B[3];
      x3r = B[4]; x3i = B[5];
      x4r = B[6]; x4i = B[7];

      LL4_SCALE(x1r, x1i, ar11, ai11);

      LL4_ELIM(x2r, x2i, ar21, ai21, x1r, x1i);
      LL4_SCALE(x2r, x2i, ar22, ai22);

      LL4_ELIM(x3r, x3i, ar31, ai31, x1r, x1i);
      head = pNext[0];                 /* look-ahead load of next column */
      LL4_ELIM(x3r, x3i, ar32, ai32, x2r, x2i);
      LL4_SCALE(x3r, x3i, ar33, ai33);

      ATL_pfl1W(pNext+pfDist);
      ATL_pfl1W(pNext+pfDist+4);
      LL4_ELIM(x4r, x4i, ar41, ai41, x1r, x1i);
      LL4_ELIM(x4r, x4i, ar42, ai42, x2r, x2i);
      LL4_ELIM(x4r, x4i, ar43, ai43, x3r, x3i);
      LL4_SCALE(x4r, x4i, ar44, ai44);

      B[0] = x1r; B[1] = x1i;
      B[2] = x2r; B[3] = x2i;
      B[4] = x3r; B[5] = x3i;
      B[6] = x4r; B[7] = x4i;

      B = pNext;
      pNext += colStride;
   }
/*
 * Last column: same solve, nothing further to load or prefetch
 */
   x1r = head; x1i = B[1];
   x2r = B[2]; x2i = B[3];
   x3r = B[4]; x3i = B[5];
   x4r = B[6]; x4i = B[7];

   LL4_SCALE(x1r, x1i, ar11, ai11);

   LL4_ELIM(x2r, x2i, ar21, ai21, x1r, x1i);
   LL4_SCALE(x2r, x2i, ar22, ai22);

   LL4_ELIM(x3r, x3i, ar31, ai31, x1r, x1i);
   LL4_ELIM(x3r, x3i, ar32, ai32, x2r, x2i);
   LL4_SCALE(x3r, x3i, ar33, ai33);

   LL4_ELIM(x4r, x4i, ar41, ai41, x1r, x1i);
   LL4_ELIM(x4r, x4i, ar42, ai42, x2r, x2i);
   LL4_ELIM(x4r, x4i, ar43, ai43, x3r, x3i);
   LL4_SCALE(x4r, x4i, ar44, ai44);

   B[0] = x1r; B[1] = x1i;
   B[2] = x2r; B[3] = x2i;
   B[4] = x3r; B[5] = x3i;
   B[6] = x4r; B[7] = x4i;

   #undef LL4_ELIM
   #undef LL4_SCALE
}
static void trsmLU_4(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Upper' complex triangular solve done by backward substitution.
 * Each of the N columns of B holds 4 complex values (8 TYPE entries) and is
 * overwritten with its solution, computed from the 4th entry up to the 1st.
 * A is known to be 4x4, with 1/alpha already applied and its diagonal already
 * inverted, so every diagonal step below is a complex multiply.
 * While one column is being solved, the real part of the 4th entry of the
 * following column is loaded and ATL_pfl1W is issued two columns beyond that
 * one.  The last column is handled after the loop, without the look-ahead
 * load or prefetch.
 */
{
/*
 * a<row><col> : real part read from A[2*(row-1) + 8*(col-1)], imaginary part
 * from the entry right after it.
 */
   const TYPE ar44=A[30], ai44=A[31];
   const TYPE ar33=A[20], ai33=A[21], ar34=A[28], ai34=A[29];
   const TYPE ar22=A[10], ai22=A[11], ar23=A[18], ai23=A[19],
              ar24=A[26], ai24=A[27];
   const TYPE ar11=A[0],  ai11=A[1],  ar12=A[8],  ai12=A[9],
              ar13=A[16], ai13=A[17], ar14=A[24], ai14=A[25];
   const int colStride = ldb+ldb;            /* TYPE entries per column of B */
   const int pfDist = colStride+colStride;   /* prefetch offset from pNext */
   TYPE *pNext = B+colStride;
   TYPE x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
   TYPE tmp, tail;
   int colsLeft;

/*
 * x *= d  (complex), d being an already-inverted diagonal entry
 */
   #define LU4_SCALE(xr_, xi_, dr_, di_) \
      do { \
         tmp = xr_; \
         xr_ = dr_*xr_ - di_*xi_; \
         xi_ = dr_*xi_ + di_*tmp; \
      } while (0)
/*
 * y -= a * x  (complex)
 */
   #define LU4_ELIM(yr_, yi_, ar_, ai_, xr_, xi_) \
      do { \
         yr_ -= ar_*xr_ - ai_*xi_; \
         yi_ -= ar_*xi_ + ai_*xr_; \
      } while (0)

   tail = B[6];
   for (colsLeft=N-1; colsLeft != 0; colsLeft--)
   {
      x4r = tail; x4i = B[7];
      x3r = B[4]; x3i = B[5];
      x2r = B[2]; x2i = B[3];
      x1r = B[0]; x1i = B[1];

      LU4_SCALE(x4r, x4i, ar44, ai44);

      LU4_ELIM(x3r, x3i, ar34, ai34, x4r, x4i);
      LU4_SCALE(x3r, x3i, ar33, ai33);

      tail = pNext[6];                 /* look-ahead load of next column */
      LU4_ELIM(x2r, x2i, ar24, ai24, x4r, x4i);
      LU4_ELIM(x2r, x2i, ar23, ai23, x3r, x3i);
      LU4_SCALE(x2r, x2i, ar22, ai22);
      ATL_pfl1W(pNext+pfDist);
      ATL_pfl1W(pNext+pfDist+4);

      LU4_ELIM(x1r, x1i, ar14, ai14, x4r, x4i);
      LU4_ELIM(x1r, x1i, ar13, ai13, x3r, x3i);
      LU4_ELIM(x1r, x1i, ar12, ai12, x2r, x2i);
      LU4_SCALE(x1r, x1i, ar11, ai11);

      B[0] = x1r; B[1] = x1i;
      B[2] = x2r; B[3] = x2i;
      B[4] = x3r; B[5] = x3i;
      B[6] = x4r; B[7] = x4i;

      B = pNext;
      pNext += colStride;
   }
/*
 * Last column: same solve, nothing further to load or prefetch
 */
   x4r = tail; x4i = B[7];
   x3r = B[4]; x3i = B[5];
   x2r = B[2]; x2i = B[3];
   x1r = B[0]; x1i = B[1];

   LU4_SCALE(x4r, x4i, ar44, ai44);

   LU4_ELIM(x3r, x3i, ar34, ai34, x4r, x4i);
   LU4_SCALE(x3r, x3i, ar33, ai33);

   LU4_ELIM(x2r, x2i, ar24, ai24, x4r, x4i);
   LU4_ELIM(x2r, x2i, ar23, ai23, x3r, x3i);
   LU4_SCALE(x2r, x2i, ar22, ai22);

   LU4_ELIM(x1r, x1i, ar14, ai14, x4r, x4i);
   LU4_ELIM(x1r, x1i, ar13, ai13, x3r, x3i);
   LU4_ELIM(x1r, x1i, ar12, ai12, x2r, x2i);
   LU4_SCALE(x1r, x1i, ar11, ai11);

   B[0] = x1r; B[1] = x1i;
   B[2] = x2r; B[3] = x2i;
   B[4] = x3r; B[5] = x3i;
   B[6] = x4r; B[7] = x4i;

   #undef LU4_ELIM
   #undef LU4_SCALE
}
void ATL_USERMM
   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with TA=T, TB=N, muladd=0, lat=4 lda=ldb=MB=KB=NB, ldc=0,
 * mu=4, nu=3, ku=2, and register prefetch
 *
 * Each pass of the M-loop updates a 4x3 tile of C held in the twelve rC*_*
 * registers: rows come from 4 consecutive KB-long vectors of A (pA0, pA0+KB,
 * pA0+KB2, pA0+KB3), columns from 3 consecutive KB-long vectors of B
 * (pB0, pB0+KB, pB0+KB2).  The K dimension is fully determined by the
 * compile-time KB; it is processed 8 values at a time (startK loop passes
 * plus one peeled pass that drains the pipeline).
 *
 * The arguments K, alpha, lda and ldb are not referenced in the body; beta is
 * only read when BETAX is defined.
 *
 * NOTE(review): the loops end on pointer equality (pA0 != stM, pB0 != stN),
 * so this presumably requires M to be a multiple of 4, N a multiple of 3 and
 * KB a multiple of 8 with KB >= 8 -- confirm against the caller.
 * NOTE(review): KB2..KB7 are presumably 2*KB..7*KB, and ATL_pfl1R/ATL_pfl1W
 * presumably read/write prefetch hints; they are defined outside this view.
 */
{
   /* end-of-data sentinels for the M-loop (A) and the N-loop (B) */
   const TYPE *stM = A + M*KB;
   const TYPE *stN = B + KB*N;
   /* applied to pA0 once per M-loop pass, on top of the 8 added per k-loop pass */
   const int incAm = KB3+8;
   #define incBn KB3
   /* number of passes of the unrolled-by-8 k loop; the final 8 k are peeled */
   const int startK = (KB>>3)-1;
   /* incAn rewinds pA0 by M*KB after an M sweep; incBm is applied to pB0 once
      per M-loop pass, on top of the 8 added per k-loop pass */
   const int incAn = -KB*M, incBm = 8-KB;
   /* added to the C pointers after each M sweep */
   const int incCn = (3*ldc - M)SHIFT;
   /* one pointer per column of the current 4x3 C tile */
   TYPE *pC0=C, *pC1=pC0+(ldc SHIFT), *pC2=pC1+(ldc SHIFT);
   const TYPE *pA0=A;
   const TYPE *pB0=B;
   register int k;
   /* rA*/rB* carry the even-k operands, ra*/rb* the odd-k operands */
   register TYPE rA0, rA1, rA2, rA3, ra0, ra1, ra2, ra3;
   register TYPE rB0, rB1, rB2, rb0, rb1, rb2;
   /* products computed one step ahead of the accumulation into rC*_* */
   register TYPE m0, m1, m2, m3;
   /* rC<row>_<col> : accumulators for the 4x3 tile of C */
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC0_1, rC1_1, rC2_1, rC3_1,
                   rC0_2, rC1_2, rC2_2, rC3_2;

   do /* N-loop */
   {
      ATL_pfl1R(pB0); ATL_pfl1R(pB0+KB); ATL_pfl1R(pB0+KB2);
      ATL_pfl1R(pB0+8); ATL_pfl1R(pB0+KB+8); ATL_pfl1R(pB0+KB2+8);

      do /* M-loop */
      {
/*
 *       Initialize the accumulators: zero for BETA0, otherwise the current
 *       C tile (scaled by beta when BETAX is defined).  In the non-TREAL
 *       case only every second TYPE entry of C is touched.
 */
         #ifdef BETA0
            rC0_0 = rC1_0 = rC2_0 = rC3_0 =
            rC0_1 = rC1_1 = rC2_1 = rC3_1 =
            rC0_2 = rC1_2 = rC2_2 = rC3_2 = ATL_rzero;
         #else
            #ifdef TREAL
               rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];
               rC0_1 = *pC1; rC1_1 = pC1[1]; rC2_1 = pC1[2]; rC3_1 = pC1[3];
               rC0_2 = *pC2; rC1_2 = pC2[1]; rC2_2 = pC2[2]; rC3_2 = pC2[3];
            #else
               rC0_0 = *pC0; rC1_0 = pC0[2]; rC2_0 = pC0[4]; rC3_0 = pC0[6];
               rC0_1 = *pC1; rC1_1 = pC1[2]; rC2_1 = pC1[4]; rC3_1 = pC1[6];
               rC0_2 = *pC2; rC1_2 = pC2[2]; rC2_2 = pC2[4]; rC3_2 = pC2[6];
            #endif
            #ifdef BETAX
               /* ra3 is only a scratch holder for beta here; it is reloaded
                  from A when the pipeline is started below */
               ra3 = beta;
               rC0_0 *= ra3; rC1_0 *= ra3; rC2_0 *= ra3; rC3_0 *= ra3;
               rC0_1 *= ra3; rC1_1 *= ra3; rC2_1 *= ra3; rC3_1 *= ra3;
               rC0_2 *= ra3; rC1_2 *= ra3; rC2_2 *= ra3; rC3_2 *= ra3;
            #endif
         #endif
/*
 *       Start pipeline: load the k=0 operands (rA*, rB*) and the k=1
 *       operands (ra*, rb*), and form the first four products.
 */
         rA0 = *pA0; rB0 = *pB0;
         rA1 = pA0[KB]; rA2 = pA0[KB2]; rA3 = pA0[KB3];
         rB1 = pB0[KB]; rB2 = pB0[KB2];

         rb0 = pB0[1]; rb1 = pB0[KB+1]; rb2 = pB0[KB2+1];

         m0 = rA0 * rB0; ra0 = pA0[1]; ra1 = pA0[KB+1];
         m1 = rA1 * rB0; ra2 = pA0[KB2+1];
         m2 = rA2 * rB0; ra3 = pA0[KB3+1];
         m3 = rA3 * rB0; rB0 = pB0[2];

/*
 *       Steady state: each pass consumes k = 0..7 (relative to pA0/pB0) and
 *       loads operands up to k = 9 (B column 0 up to k = 10), so statement
 *       order matters: a register is reloaded only after its last use.
 */
         for (k=startK; k; k--) /* easy loop to unroll */
         {
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;                  ATL_pfl1R(pA0+KB4);
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +2];
            rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[    2];
            rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +2];
            rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+2];
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+2];
            rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+2];
            rC1_2 += m1; m1 = ra1 * rb0;
            rC2_2 += m2; m2 = ra2 * rb0;
            rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[3];

            rC0_0 += m0; m0 = ra0 * rb1;
            rC1_0 += m1; m1 = ra1 * rb1;                  ATL_pfl1R(pA0+KB5);
            rC2_0 += m2; m2 = ra2 * rb1;
            rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +3];
            rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[3];
            rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +3];
            rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+3];
            rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+3];
            rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+3];
            rC1_2 += m1; m1 = rA1 * rB0;
            rC2_2 += m2; m2 = rA2 * rB0;
            rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[4];

            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;                  ATL_pfl1R(pA0+KB6);
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +4];
            rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[    4];
            rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +4];
            rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+4];
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+4];
            rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+4];
            rC1_2 += m1; m1 = ra1 * rb0;
            rC2_2 += m2; m2 = ra2 * rb0;
            rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[5];

            rC0_0 += m0; m0 = ra0 * rb1;
            rC1_0 += m1; m1 = ra1 * rb1;                  ATL_pfl1R(pA0+KB7);
            rC2_0 += m2; m2 = ra2 * rb1;
            rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +5];
            rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[5];
            rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +5];
            rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+5];
            rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+5];
            rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+5];
            rC1_2 += m1; m1 = rA1 * rB0;
            rC2_2 += m2; m2 = rA2 * rB0;
            rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[6];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;                  ATL_pfl1R(pB0+16);
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +6];
            rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[    6];
            rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +6];
            rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+6];
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+6];
            rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+6];
            rC1_2 += m1; m1 = ra1 * rb0;
            rC2_2 += m2; m2 = ra2 * rb0;
            rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[7];

            rC0_0 += m0; m0 = ra0 * rb1;
            rC1_0 += m1; m1 = ra1 * rb1;                  ATL_pfl1R(pB0+KB+16);
            rC2_0 += m2; m2 = ra2 * rb1;
            rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +7];
            rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[7];
            rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +7];
            rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+7];
            rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+7];
            rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+7];
            rC1_2 += m1; m1 = rA1 * rB0;
            rC2_2 += m2; m2 = rA2 * rB0;
            rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[8];
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;                  ATL_pfl1R(pB0+KB2+16);
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +8];
            rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[    8];
            rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +8];
            rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+8];
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+8];
            rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+8];
            rC1_2 += m1; m1 = ra1 * rb0;
            rC2_2 += m2; m2 = ra2 * rb0;
            rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[9];

            rC0_0 += m0; m0 = ra0 * rb1;
            rC1_0 += m1; m1 = ra1 * rb1;                  ATL_pfl1R(pB0+KB3+16);
            rC2_0 += m2; m2 = ra2 * rb1;
            rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +9];
            rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[9];
            rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +9];
            rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+9];
            rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+9];
            rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+9]; pA0 += 8;
            rC1_2 += m1; m1 = rA1 * rB0;
            rC2_2 += m2; m2 = rA2 * rB0;
            rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[10]; pB0 += 8;
         }
/*
 *       Drain pipeline: the last 8 k values.  Same sequence as one loop
 *       pass, except that no operand beyond offset 7 is loaded; instead the
 *       A/B pointers are stepped (incAm, incBm) and prefetches are issued on
 *       the C tile and on B while the remaining products are accumulated.
 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;                  ATL_pfl1R(pA0+KB4);
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +2];
         rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[    2];
         rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +2];
         rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+2];
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+2];
         rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+2];
         rC1_2 += m1; m1 = ra1 * rb0;
         rC2_2 += m2; m2 = ra2 * rb0;
         rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[3];

         rC0_0 += m0; m0 = ra0 * rb1;
         rC1_0 += m1; m1 = ra1 * rb1;                  ATL_pfl1R(pA0+KB5);
         rC2_0 += m2; m2 = ra2 * rb1;
         rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +3];
         rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[3];
         rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +3];
         rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+3];
         rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+3];
         rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+3];
         rC1_2 += m1; m1 = rA1 * rB0;
         rC2_2 += m2; m2 = rA2 * rB0;
         rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[4];

         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;                  ATL_pfl1R(pA0+KB6);
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +4];
         rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[    4];
         rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +4];
         rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+4];
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+4];
         rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+4];
         rC1_2 += m1; m1 = ra1 * rb0;
         rC2_2 += m2; m2 = ra2 * rb0;
         rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[5];

         rC0_0 += m0; m0 = ra0 * rb1;
         rC1_0 += m1; m1 = ra1 * rb1;                  ATL_pfl1R(pA0+KB7);
         rC2_0 += m2; m2 = ra2 * rb1;
         rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +5];
         rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[5];
         rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +5];
         rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+5];
         rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+5];
         rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+5];
         rC1_2 += m1; m1 = rA1 * rB0;
         rC2_2 += m2; m2 = rA2 * rB0;
         rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[6];
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;                  ATL_pfl1R(pB0-KB+8);
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +6];
         rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[    6];
         rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +6];
         rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+6];
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+6];
         rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+6];
         rC1_2 += m1; m1 = ra1 * rb0;
         rC2_2 += m2; m2 = ra2 * rb0;
         rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[7];

         rC0_0 += m0; m0 = ra0 * rb1;
         rC1_0 += m1; m1 = ra1 * rb1;                  ATL_pfl1R(pB0+8);
         rC2_0 += m2; m2 = ra2 * rb1;
         rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +7];
         rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[7];
         rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +7];
         rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+7];
         rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+7];
         rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+7];
         /* all operands are now in registers: step A and B for the next tile */
         rC1_2 += m1; m1 = rA1 * rB0;  pA0 += incAm;
         rC2_2 += m2; m2 = rA2 * rB0;                  ATL_pfl1R(pB0+KB+8);
         rC3_2 += m3; m3 = rA3 * rB0; pB0 += incBm;
         rC0_0 += m0; m0 = rA0 * rB1;                  ATL_pfl1W(pC0);
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;                  ATL_pfl1W(pC1);
         rC3_0 += m3; m3 = rA3 * rB1;
         rC0_1 += m0; m0 = rA0 * rB2;                  ATL_pfl1W(pC2);
         rC1_1 += m1; m1 = rA1 * rB2;
         rC2_1 += m2; m2 = rA2 * rB2;                  ATL_pfl1R(pC0+(4 SHIFT));
         rC3_1 += m3; m3 = rA3 * rB2;
         rC0_2 += m0; m0 = ra0 * rb0;                  ATL_pfl1R(pC1+(4 SHIFT));
         rC1_2 += m1; m1 = ra1 * rb0;
         rC2_2 += m2; m2 = ra2 * rb0;                  ATL_pfl1R(pC2+(4 SHIFT));
         rC3_2 += m3; m3 = ra3 * rb0;

         rC0_0 += m0; m0 = ra0 * rb1;                  ATL_pfl1R(pB0+8);
         rC1_0 += m1; m1 = ra1 * rb1;
         rC2_0 += m2; m2 = ra2 * rb1;
         rC3_0 += m3; m3 = ra3 * rb1;
         rC0_1 += m0; m0 = ra0 * rb2;
         rC1_1 += m1; m1 = ra1 * rb2;                  ATL_pfl1R(pB0+KB2+8);
         rC2_1 += m2; m2 = ra2 * rb2;
         rC3_1 += m3; m3 = ra3 * rb2;
         rC0_2 += m0;
         rC1_2 += m1;
         rC2_2 += m2;
         rC3_2 += m3;                                  ATL_pfl1R(pB0+KB2+8);

/*
 *       Write the finished 4x3 tile back and step to the next 4 rows of C
 *       (stride 1 for TREAL, stride 2 otherwise).
 */
         #ifdef TREAL
         *pC0 = rC0_0; pC0[1] = rC1_0; pC0[2] = rC2_0; pC0[3] = rC3_0; pC0 += 4;
         *pC1 = rC0_1; pC1[1] = rC1_1; pC1[2] = rC2_1; pC1[3] = rC3_1; pC1 += 4;
         *pC2 = rC0_2; pC2[1] = rC1_2; pC2[2] = rC2_2; pC2[3] = rC3_2; pC2 += 4;
         #else
         *pC0 = rC0_0; pC0[2] = rC1_0; pC0[4] = rC2_0; pC0[6] = rC3_0; pC0 += 8;
         *pC1 = rC0_1; pC1[2] = rC1_1; pC1[4] = rC2_1; pC1[6] = rC3_1; pC1 += 8;
         *pC2 = rC0_2; pC2[2] = rC1_2; pC2[4] = rC2_2; pC2[6] = rC3_2; pC2 += 8;
         #endif
      }
      while(pA0 != stM);
      /* next group of 3 columns: advance C and B, rewind A to its start */
      pC0 += incCn; pC1 += incCn; pC2 += incCn;
      pA0 += incAn; pB0 += incBn;
   }
   while(pB0 != stN);
}