static void trsmRL_3(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Right', 'Lower' triangular-solve kernel for a 3x3 complex A, applied in
 * place to M rows of B.  A is expected to arrive with 1/alpha already
 * applied and its diagonal already inverted, so the diagonal is applied by
 * multiplication.
 *
 * Storage as used below: real and imaginary parts are interleaved, A has
 * three complex entries per column, and consecutive columns of B are
 * ldb+ldb TYPE values apart.
 *
 * Per row, in complex arithmetic:
 *    x3 = x3 * a33
 *    x2 = (x2 - x3*a32) * a22
 *    x1 = (x1 - x3*a31 - x2*a21) * a11
 *
 * ATL_pfl1W is defined elsewhere (presumably a write-prefetch hint); it is
 * issued PF_AHEAD TYPE values ahead in columns 3 and 2 only.
 */
{
   const TYPE a11_re = A[0],  a11_im = A[1];
   const TYPE a21_re = A[2],  a21_im = A[3];
   const TYPE a31_re = A[4],  a31_im = A[5];
   const TYPE a22_re = A[8],  a22_im = A[9];
   const TYPE a32_re = A[10], a32_im = A[11];
   const TYPE a33_re = A[16], a33_im = A[17];
   const int colgap = ldb + ldb;
   TYPE *c1 = B;
   TYPE *c2 = c1 + colgap;
   TYPE *c3 = c2 + colgap;
   int left;
   enum { PF_AHEAD = 8 };

   for (left = M; left != 0; --left)
   {
      TYPE x3_re = c3[0], x2_re = c2[0], x1_re = c1[0];
      TYPE x3_im = c3[1], x2_im = c2[1], x1_im = c1[1];
      TYPE keep;   /* real part saved across a complex multiply */

      /* x3 *= a33 */
      keep  = x3_re;
      x3_re = a33_re*keep  - a33_im*x3_im;
      x3_im = a33_re*x3_im + a33_im*keep;

      /* x2 = (x2 - x3*a32) * a22 */
      x2_re = x2_re - (x3_re*a32_re - x3_im*a32_im);
      x2_im = x2_im - (x3_re*a32_im + x3_im*a32_re);
      keep  = x2_re;
      x2_re = keep*a22_re - x2_im*a22_im;
      ATL_pfl1W(c3 + PF_AHEAD);
      x2_im = keep*a22_im + x2_im*a22_re;
      ATL_pfl1W(c2 + PF_AHEAD);

      /* x1 = (x1 - x3*a31 - x2*a21) * a11 */
      x1_re = x1_re - (x3_re*a31_re - x3_im*a31_im);
      x1_im = x1_im - (x3_re*a31_im + x3_im*a31_re);
      x1_re = x1_re - (x2_re*a21_re - x2_im*a21_im);
      x1_im = x1_im - (x2_re*a21_im + x2_im*a21_re);
      keep  = x1_re;
      x1_re = keep*a11_re - x1_im*a11_im;
      x1_im = keep*a11_im + x1_im*a11_re;

      /* write back in the order column 3, 2, 1 and step one row down */
      c3[0] = x3_re;  c3[1] = x3_im;  c3 += 2;
      c2[0] = x2_re;  c2[1] = x2_im;  c2 += 2;
      c1[0] = x1_re;  c1[1] = x1_im;  c1 += 2;
   }
}
static void trsmRU_3(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Right', 'Upper' triangular-solve kernel for a 3x3 complex A, applied in
 * place to M rows of B.  A is expected to arrive with 1/alpha already
 * applied and its diagonal already inverted, so the diagonal is applied by
 * multiplication.
 *
 * Storage as used below: real and imaginary parts are interleaved, A has
 * three complex entries per column, and consecutive columns of B are
 * ldb+ldb TYPE values apart.
 *
 * Per row, in complex arithmetic:
 *    x1 = x1 * a11
 *    x2 = (x2 - x1*a12) * a22
 *    x3 = (x3 - x1*a13 - x2*a23) * a33
 *
 * ATL_pfl1W is defined elsewhere (presumably a write-prefetch hint); it is
 * issued PF_AHEAD TYPE values ahead in each of the three columns.
 */
{
   const TYPE a11_re = A[0],  a11_im = A[1];
   const TYPE a12_re = A[6],  a12_im = A[7];
   const TYPE a13_re = A[12], a13_im = A[13];
   const TYPE a22_re = A[8],  a22_im = A[9];
   const TYPE a23_re = A[14], a23_im = A[15];
   const TYPE a33_re = A[16], a33_im = A[17];
   const int colgap = ldb + ldb;
   TYPE *c1 = B;
   TYPE *c2 = c1 + colgap;
   TYPE *c3 = c2 + colgap;
   int left;
   enum { PF_AHEAD = 8 };

   for (left = M; left != 0; --left)
   {
      TYPE x1_re = c1[0], x2_re = c2[0], x3_re = c3[0];
      TYPE x1_im = c1[1], x2_im = c2[1], x3_im = c3[1];
      TYPE keep;   /* real part saved across a complex multiply */

      /* x1 *= a11 */
      keep  = x1_re;
      x1_re = keep*a11_re - x1_im*a11_im;
      x1_im = keep*a11_im + x1_im*a11_re;

      /* x2 = (x2 - x1*a12) * a22 */
      x2_re = x2_re - (x1_re*a12_re - x1_im*a12_im);
      x2_im = x2_im - (x1_re*a12_im + x1_im*a12_re);
      keep  = x2_re;
      x2_re = keep*a22_re - x2_im*a22_im;
      ATL_pfl1W(c1 + PF_AHEAD);
      x2_im = keep*a22_im + x2_im*a22_re;
      ATL_pfl1W(c2 + PF_AHEAD);

      /* x3 = (x3 - x1*a13 - x2*a23) * a33 */
      x3_re = x3_re - (x1_re*a13_re - x1_im*a13_im);
      x3_im = x3_im - (x1_re*a13_im + x1_im*a13_re);
      x3_re = x3_re - (x2_re*a23_re - x2_im*a23_im);
      x3_im = x3_im - (x2_re*a23_im + x2_im*a23_re);
      ATL_pfl1W(c3 + PF_AHEAD);
      keep  = x3_re;
      x3_re = keep*a33_re - x3_im*a33_im;
      x3_im = keep*a33_im + x3_im*a33_re;

      /* write back in the order column 1, 2, 3 and step one row down */
      c1[0] = x1_re;  c1[1] = x1_im;  c1 += 2;
      c2[0] = x2_re;  c2[1] = x2_im;  c2 += 2;
      c3[0] = x3_re;  c3[1] = x3_im;  c3 += 2;
   }
}
static void trsmLU_2(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Upper' triangular-solve kernel for a 2x2 complex A, applied in
 * place to N columns of B (two complex entries per column).  A is expected
 * to arrive with 1/alpha already applied and its diagonal already inverted.
 *
 * Per column, in complex arithmetic:
 *    x2 = x2 * a22
 *    x1 = (x1 - a12*x2) * a11
 *
 * The first N-1 columns are handled in the loop, which also reads one value
 * of the following column early and issues ATL_pfl1W (defined elsewhere,
 * presumably a write-prefetch hint) two column strides past that following
 * column.  The last column is handled after the loop without either, so
 * nothing beyond the final column is touched.  Because that last column is
 * processed unconditionally, N must be at least 1.
 */
{
   const TYPE a11_re = A[0], a11_im = A[1];
   const TYPE a12_re = A[4], a12_im = A[5];
   const TYPE a22_re = A[6], a22_im = A[7];
   const int colgap = ldb + ldb;          /* TYPE values between columns */
   const int pfgap  = colgap + colgap;
   TYPE *cur = B;
   TYPE *nxt = cur + colgap;
   TYPE x1_re, x1_im, x2_re, x2_im;
   TYPE keep, ahead;
   int todo;

   ahead = cur[2];
   for (todo = N - 1; todo != 0; --todo)
   {
      x2_re = ahead;   x2_im = cur[3];
      x1_re = cur[0];  x1_im = cur[1];

      /* x2 *= a22 */
      keep  = x2_re;
      x2_re = a22_re*keep  - a22_im*x2_im;
      x2_im = a22_re*x2_im + a22_im*keep;
      ahead = nxt[2];   /* early read of the next column's x2 real part */

      /* x1 = (x1 - a12*x2) * a11 */
      x1_re = x1_re - (a12_re*x2_re - a12_im*x2_im);
      x1_im = x1_im - (a12_re*x2_im + a12_im*x2_re);
      ATL_pfl1W(nxt + pfgap);
      keep  = x1_re;
      x1_re = a11_re*keep  - a11_im*x1_im;
      x1_im = a11_re*x1_im + a11_im*keep;

      cur[0] = x1_re;  cur[1] = x1_im;
      cur[2] = x2_re;  cur[3] = x2_im;
      cur = nxt;
      nxt += colgap;
   }

   /* final column: same arithmetic, no look-ahead */
   x2_re = ahead;   x2_im = cur[3];
   x1_re = cur[0];  x1_im = cur[1];

   keep  = x2_re;
   x2_re = a22_re*keep  - a22_im*x2_im;
   x2_im = a22_re*x2_im + a22_im*keep;

   x1_re = x1_re - (a12_re*x2_re - a12_im*x2_im);
   x1_im = x1_im - (a12_re*x2_im + a12_im*x2_re);
   keep  = x1_re;
   x1_re = a11_re*keep  - a11_im*x1_im;
   x1_im = a11_re*x1_im + a11_im*keep;

   cur[0] = x1_re;  cur[1] = x1_im;
   cur[2] = x2_re;  cur[3] = x2_im;
}
static void trsmLL_2(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Lower' triangular-solve kernel for a 2x2 complex A, applied in
 * place to N columns of B (two complex entries per column).  A is expected
 * to arrive with 1/alpha already applied and its diagonal already inverted.
 *
 * Per column, in complex arithmetic:
 *    x1 = x1 * a11
 *    x2 = (x2 - a21*x1) * a22
 *
 * The first N-1 columns are handled in the loop, which also reads one value
 * of the following column early and issues ATL_pfl1W (defined elsewhere,
 * presumably a write-prefetch hint) two column strides past that following
 * column.  The last column is handled after the loop without either.
 * Because that last column is processed unconditionally, N must be at
 * least 1.
 */
{
   const TYPE a11_re = A[0], a11_im = A[1];
   const TYPE a21_re = A[2], a21_im = A[3];
   const TYPE a22_re = A[6], a22_im = A[7];
   const int colgap = ldb + ldb;          /* TYPE values between columns */
   const int pfgap  = colgap + colgap;
   TYPE *cur = B;
   TYPE *nxt = cur + colgap;
   TYPE x1_re, x1_im, x2_re, x2_im;
   TYPE keep, ahead;
   int todo;

   ahead = cur[0];
   for (todo = N - 1; todo != 0; --todo)
   {
      x1_re = ahead;   x1_im = cur[1];
      x2_re = cur[2];  x2_im = cur[3];

      /* x1 *= a11 */
      keep  = x1_re;
      x1_re = a11_re * keep  - a11_im * x1_im;
      x1_im = a11_re * x1_im + a11_im * keep;
      ahead = nxt[0];   /* early read of the next column's x1 real part */

      /* x2 = (x2 - a21*x1) * a22 */
      x2_re = x2_re - (a21_re*x1_re - a21_im*x1_im);
      x2_im = x2_im - (a21_re*x1_im + a21_im*x1_re);
      ATL_pfl1W(nxt + pfgap);
      keep  = x2_re;
      x2_re = a22_re*keep  - a22_im*x2_im;
      x2_im = a22_re*x2_im + a22_im*keep;

      cur[0] = x1_re;  cur[1] = x1_im;
      cur[2] = x2_re;  cur[3] = x2_im;
      cur = nxt;
      nxt += colgap;
   }

   /* final column: same arithmetic, no look-ahead */
   x1_re = ahead;   x1_im = cur[1];
   x2_re = cur[2];  x2_im = cur[3];

   keep  = x1_re;
   x1_re = a11_re * keep  - a11_im * x1_im;
   x1_im = a11_re * x1_im + a11_im * keep;

   x2_re = x2_re - (a21_re*x1_re - a21_im*x1_im);
   x2_im = x2_im - (a21_re*x1_im + a21_im*x1_re);
   keep  = x2_re;
   x2_re = a22_re*keep  - a22_im*x2_im;
   x2_im = a22_re*x2_im + a22_im*keep;

   cur[0] = x1_re;  cur[1] = x1_im;
   cur[2] = x2_re;  cur[3] = x2_im;
}
static void trsmRL_2(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Right', 'Lower' triangular-solve kernel for a 2x2 complex A, applied in
 * place to M rows of B.  A is expected to arrive with 1/alpha already
 * applied and its diagonal already inverted.
 *
 * Storage as used below: real and imaginary parts are interleaved, A has
 * two complex entries per column, and the two columns of B are ldb+ldb
 * TYPE values apart.
 *
 * Per row, in complex arithmetic:
 *    x2 = x2 * a22
 *    x1 = (x1 - x2*a21) * a11
 *
 * ATL_pfl1W is defined elsewhere (presumably a write-prefetch hint); it is
 * issued PF_AHEAD TYPE values ahead in both columns.
 */
{
   const TYPE a11_re = A[0], a11_im = A[1];
   const TYPE a21_re = A[2], a21_im = A[3];
   const TYPE a22_re = A[6], a22_im = A[7];
   const int colgap = ldb + ldb;
   TYPE *c1 = B;
   TYPE *c2 = c1 + colgap;
   int left;
   enum { PF_AHEAD = 8 };

   for (left = M; left != 0; --left)
   {
      TYPE x2_re = c2[0], x1_re = c1[0];
      TYPE x2_im = c2[1], x1_im = c1[1];
      TYPE keep;   /* real part saved across a complex multiply */

      /* x2 *= a22 */
      keep  = x2_re;
      x2_re = keep*a22_re - x2_im*a22_im;
      x2_im = keep*a22_im + x2_im*a22_re;

      /* x1 = (x1 - x2*a21) * a11 */
      x1_re = x1_re - (x2_re*a21_re - x2_im*a21_im);
      ATL_pfl1W(c2 + PF_AHEAD);
      x1_im = x1_im - (x2_re*a21_im + x2_im*a21_re);
      ATL_pfl1W(c1 + PF_AHEAD);
      keep  = x1_re;
      x1_re = keep*a11_re - x1_im*a11_im;
      x1_im = keep*a11_im + x1_im*a11_re;

      /* write back column 2 first, then column 1, and step one row down */
      c2[0] = x2_re;  c2[1] = x2_im;  c2 += 2;
      c1[0] = x1_re;  c1[1] = x1_im;  c1 += 2;
   }
}
static void trsmRU_2(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Right', 'Upper' triangular-solve kernel for a 2x2 complex A, applied in
 * place to M rows of B.  A is expected to arrive with 1/alpha already
 * applied and its diagonal already inverted.
 *
 * Storage as used below: real and imaginary parts are interleaved, A has
 * two complex entries per column, and the two columns of B are ldb+ldb
 * TYPE values apart.
 *
 * Per row, in complex arithmetic:
 *    x1 = x1 * a11
 *    x2 = (x2 - x1*a12) * a22
 *
 * ATL_pfl1W is defined elsewhere (presumably a write-prefetch hint); it is
 * issued PF_AHEAD TYPE values ahead in both columns.
 */
{
   const TYPE a11_re = A[0], a11_im = A[1];
   const TYPE a12_re = A[4], a12_im = A[5];
   const TYPE a22_re = A[6], a22_im = A[7];
   const int colgap = ldb + ldb;
   TYPE *c1 = B;
   TYPE *c2 = c1 + colgap;
   int left;
   enum { PF_AHEAD = 8 };

   for (left = M; left != 0; --left)
   {
      TYPE x1_re = c1[0], x2_re = c2[0];
      TYPE x1_im = c1[1], x2_im = c2[1];
      TYPE keep;   /* real part saved across a complex multiply */

      /* x1 *= a11 */
      keep  = x1_re;
      x1_re = keep*a11_re - x1_im*a11_im;
      x1_im = keep*a11_im + x1_im*a11_re;

      /* x2 = (x2 - x1*a12) * a22 */
      x2_re = x2_re - (x1_re*a12_re - x1_im*a12_im);
      x2_im = x2_im - (x1_re*a12_im + x1_im*a12_re);
      ATL_pfl1W(c1 + PF_AHEAD);
      keep  = x2_re;
      ATL_pfl1W(c2 + PF_AHEAD);
      x2_re = keep*a22_re - x2_im*a22_im;
      x2_im = keep*a22_im + x2_im*a22_re;

      /* write back column 1 first, then column 2, and step one row down */
      c1[0] = x1_re;  c1[1] = x1_im;  c1 += 2;
      c2[0] = x2_re;  c2[1] = x2_im;  c2 += 2;
   }
}
void ATL_USERMM (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with TA=T, TB=N, lda=KB, ldb=KB, ldc=0, mu=4, nu=4, ku=8
 *
 * Register-blocked kernel: each pass of the M-loop updates one 4x4 tile of
 * C held in rC<i>_<j>.  rC<i>_<j> accumulates products rA<i> * rB<j>, where
 * rA0..rA3 are read from pA0[k], pA0[KB+k], pA0[KB2+k], pA0[KB3+k] and
 * rB0..rB3 from pB0[k], pB0[KB+k], pB0[KB2+k], pB0[KB3+k].
 *
 * The multiply and the add are split (m0..m3 hold products that are added
 * on the following group of statements), so loads, multiplies and adds of
 * neighbouring steps are interleaved by hand.  Statement order is therefore
 * significant for scheduling.
 *
 * Compile-time switches visible in this body:
 *   BETA0   : the tile starts from ATL_rzero and C is not read.
 *   BETAX   : the tile is loaded from C and multiplied by beta.
 *   neither : the tile is loaded from C unscaled.
 *   TREAL   : the four tile rows are 1 TYPE apart in C; otherwise they are
 *             2 TYPEs apart (only every other slot of C is touched).
 *
 * K, alpha, lda and ldb are not referenced in this body; the K extent and
 * the A/B strides come from the KB-family macros instead.
 *
 * NOTE(review): KB, KB2..KB7, SHIFT, ATL_rzero, ATL_pfl1R and ATL_pfl1W are
 * defined outside this view.  KBn is presumably n*KB and ATL_pfl1R/W are
 * presumably read/write prefetch hints -- confirm.
 * NOTE(review): both outer loops are do/while with != exit tests, and pA0
 * moves by KB3+8 plus 8 per K-loop pass for each tile, so M and N
 * presumably must be positive multiples of 4 and KB a multiple of 8 --
 * confirm against the caller.
 */
{
   /* loop sentinels: M-loop ends when pA0 reaches stM, N-loop when pB0 reaches stN */
   const TYPE *stM = A + KB*M;
   const TYPE *stN = B + KB*N;
   /* incAm: added to pA0 at the end of each tile; incAn: added after the M-loop
    * (pA0 equals stM there, so -KB*M brings it back to A) */
   const int incAm = KB3+8, incAn = -KB*M;
   /* incBm: added to pB0 at the end of each tile */
   const int incBm = 8-KB;
   /* incBn: added to pB0 after each full M-loop */
   #define incBn KB4
   /* incCn: added to every C column pointer after each full M-loop */
   const int incCn = ((ldc<<2) - M)SHIFT;
   /* the K-loop runs Kstart times, 8 steps per pass; one further group of
    * 7 steps plus the drain is peeled off after it */
   const int Kstart=(KB>>3)-1;
   /* one pointer per column of the 4-column C tile */
   TYPE *pC0=C, *pC1=pC0+(ldc SHIFT), *pC2=pC1+(ldc SHIFT), *pC3=pC2+(ldc SHIFT);
   const TYPE *pA0=A;
   const TYPE *pB0=B;
   register int k;
   register TYPE rA0, rA1, rA2, rA3;
   register TYPE rB0, rB1, rB2, rB3;
   register TYPE m0, m1, m2, m3;
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC0_1, rC1_1, rC2_1, rC3_1, rC0_2, rC1_2, rC2_2, rC3_2, rC0_3, rC1_3, rC2_3, rC3_3;

   do /* N-loop */
   {
      /* touch the start of the four B columns used by this N-pass */
      ATL_pfl1R(pB0);
      ATL_pfl1R(pB0+KB);
      ATL_pfl1R(pB0+KB2);
      ATL_pfl1R(pB0+KB3);
      ATL_pfl1R(pB0+8);
      ATL_pfl1R(pB0+KB+8);
      ATL_pfl1R(pB0+KB2+8);
      ATL_pfl1R(pB0+KB3+8);
      do /* M-loop */
      {
/*
 *       Initialise the 4x4 accumulator tile according to the beta case
 */
         #ifdef BETA0
            rC0_0 = rC1_0 = rC2_0 = rC3_0 = rC0_1 = rC1_1 = rC2_1 = rC3_1 = rC0_2 = rC1_2 = rC2_2 = rC3_2 = rC0_3 = rC1_3 = rC2_3 = rC3_3 = ATL_rzero;
            /* ATL_pfl1R(pB0+8); ATL_pfl1R(pB0+KB+8); ATL_pfl1R(pB0+KB2+8); ATL_pfl1R(pB0+KB3+8); */
         #else
            #ifdef TREAL
               /* tile rows are adjacent TYPE values in each C column */
               rC0_0 = *pC0; rC0_1 = *pC1; rC0_2 = *pC2; rC0_3 = *pC3;
               rC1_0 = pC0[1]; rC1_1 = pC1[1]; rC1_2 = pC2[1]; rC1_3 = pC3[1];
               rC2_0 = pC0[2]; rC2_1 = pC1[2]; rC2_2 = pC2[2]; rC2_3 = pC3[2];
               rC3_0 = pC0[3]; rC3_1 = pC1[3]; rC3_2 = pC2[3]; rC3_3 = pC3[3];
            #else
               /* tile rows are every second TYPE value in each C column */
               rC0_0 = *pC0; rC0_1 = *pC1; rC0_2 = *pC2; rC0_3 = *pC3;
               rC1_0 = pC0[2]; rC1_1 = pC1[2]; rC1_2 = pC2[2]; rC1_3 = pC3[2];
               rC2_0 = pC0[4]; rC2_1 = pC1[4]; rC2_2 = pC2[4]; rC2_3 = pC3[4];
               rC3_0 = pC0[6]; rC3_1 = pC1[6]; rC3_2 = pC2[6]; rC3_3 = pC3[6];
            #endif
            #ifdef BETAX
               /* rB3 is borrowed to hold beta; it is reloaded from B below */
               rB3 = beta;
               rC0_0 *= rB3; rC0_1 *= rB3; rC0_2 *= rB3; rC0_3 *= rB3; /* ATL_pfl1R(pB0+8); */
               rC1_0 *= rB3; rC1_1 *= rB3; rC1_2 *= rB3; rC1_3 *= rB3; /* ATL_pfl1R(pB0+KB+8); */
               rC2_0 *= rB3; rC2_1 *= rB3; rC2_2 *= rB3; rC2_3 *= rB3; /* ATL_pfl1R(pB0+KB2+8); */
               rC3_0 *= rB3; rC3_1 *= rB3; rC3_2 *= rB3; rC3_3 *= rB3; /* ATL_pfl1R(pB0+KB3+8); */
            #else
               /* ATL_pfl1R(pB0+8); ATL_pfl1R(pB0+KB+8); ATL_pfl1R(pB0+KB2+8); ATL_pfl1R(pB0+KB3+8); */
            #endif
         #endif
/*
 *       Start pipeline: load the k=0 operands and form the first four
 *       products (A rows times B column 0); they are added in the next step
 */
         rA0 = *pA0; rB0 = *pB0;
         rA1 = pA0[KB];
         rA2 = pA0[KB2];
         m0 = rA0 * rB0;
         rA3 = pA0[KB3];
         m1 = rA1 * rB0;
         rB1 = pB0[KB];
         m2 = rA2 * rB0;
         rB2 = pB0[KB2];
         m3 = rA3 * rB0;
         rB3 = pB0[KB3];
         for (k=Kstart; k; k--)
         {
            /* step 1 of 8: finish the current k, load operands at offset 1 */
            rC0_0 += m0; m0 = rA0 * rB1; rB0 = pB0[1];
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+1];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pA0+KB4);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+1];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[1];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+1];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+1];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+1];
            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+1];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[2];
            /* step 2 of 8: operands at offset 2 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+2];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pA0+KB5);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+2];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[2];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+2];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+2];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+2];
            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+2];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[3];
            /* step 3 of 8: operands at offset 3 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+3];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pA0+KB6);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+3];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[3];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+3];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+3];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+3];
            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+3];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[4];
            /* step 4 of 8: operands at offset 4 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+4];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pA0+KB7);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+4];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[4];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+4];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+4];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+4];
            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+4];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[5];
            /* step 5 of 8: operands at offset 5; prefetch switches from A to B */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+5];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pB0+16);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+5];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[5];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+5];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+5];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+5];
            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+5];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[6];
            /* step 6 of 8: operands at offset 6 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+6];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pB0+KB+16);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+6];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[6];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+6];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+6];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+6];
            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+6];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[7];
            /* step 7 of 8: operands at offset 7 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+7];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pB0+KB2+16);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+7];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[7];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+7];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+7];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+7];
            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+7];
            rC1_3 += m1; m1 = rA1 * rB0;
            rC2_3 += m2; m2 = rA2 * rB0;
            rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[8];
            /* step 8 of 8: operands at offset 8, i.e. offset 0 of the next
             * pass once pA0 and pB0 have been advanced by 8 below */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1;
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+8];
            rC0_1 += m0; m0 = rA0 * rB2;
            rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pB0+KB3+16);
            rC2_1 += m2; m2 = rA2 * rB2;
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+8];
            rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[8];
            rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+8];
            rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+8];
            rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+8];
            rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+8];
            rC1_3 += m1; m1 = rA1 * rB0; pA0 += 8;
            rC2_3 += m2; m2 = rA2 * rB0; pB0 += 8;
            rC3_3 += m3; m3 = rA3 * rB0;
         }
/*
 *       Peeled last group: 7 steps (offsets 1..7) that load no operand
 *       beyond offset 7, followed by the drain
 */
         /* peeled step 1: operands at offset 1 */
         rC0_0 += m0; m0 = rA0 * rB1; rB0 = pB0[1];
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+1];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pA0+KB4);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+1];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[1];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+1];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+1];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+1];
         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+1];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[2];
         /* peeled step 2: operands at offset 2 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+2];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pA0+KB5);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+2];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[2];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+2];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+2];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+2];
         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+2];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[3];
         /* peeled step 3: operands at offset 3 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+3];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pA0+KB6);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+3];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[3];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+3];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+3];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+3];
         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+3];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[4];
         /* peeled step 4: operands at offset 4 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+4];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pA0+KB7);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+4];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[4];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+4];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+4];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+4];
         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+4];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[5];
         /* peeled step 5: operands at offset 5; the B prefetches in steps
          * 5..7 use -KB+8, +8 and +KB+8 instead of the +16 family above */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+5];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pB0-KB+8);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+5];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[5];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+5];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+5];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+5];
         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+5];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[6];
         /* peeled step 6: operands at offset 6 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+6];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pB0+8);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+6];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[6];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+6];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+6];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+6];
         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+6];
         rC1_3 += m1; m1 = rA1 * rB0;
         rC2_3 += m2; m2 = rA2 * rB0;
         rC3_3 += m3; m3 = rA3 * rB0; rB0 = pB0[7];
         /* peeled step 7: operands at offset 7 (the last loads of this
          * tile); pB0 and pA0 are then moved on by incBm / incAm */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB+7];
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1R(pB0+KB+8);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+7];
         rC0_2 += m0; m0 = rA0 * rB3; rA0 = pA0[7];
         rC1_2 += m1; m1 = rA1 * rB3; rA1 = pA0[KB+7];
         rC2_2 += m2; m2 = rA2 * rB3; rA2 = pA0[KB2+7];
         rC3_2 += m3; m3 = rA3 * rB3; rA3 = pA0[KB3+7];
         rC0_3 += m0; m0 = rA0 * rB0; rB3 = pB0[KB3+7];
         rC1_3 += m1; m1 = rA1 * rB0; pB0 += incBm;
         rC2_3 += m2; m2 = rA2 * rB0; pA0 += incAm;
         rC3_3 += m3; m3 = rA3 * rB0; ATL_pfl1W(pC0);
/*
 *       Drain pipe on last iteration of K-loop: no further loads, only
 *       the outstanding products are formed and accumulated
 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1W(pC1);
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; ATL_pfl1W(pC2);
         rC0_1 += m0; m0 = rA0 * rB2;
         rC1_1 += m1; m1 = rA1 * rB2; ATL_pfl1W(pC3);
         rC2_1 += m2; m2 = rA2 * rB2;
         rC3_1 += m3; m3 = rA3 * rB2; ATL_pfl1R(pC0+4);
         rC0_2 += m0; m0 = rA0 * rB3;
         rC1_2 += m1; m1 = rA1 * rB3; ATL_pfl1R(pC1+4);
         rC2_2 += m2; m2 = rA2 * rB3;
         rC3_2 += m3; m3 = rA3 * rB3; ATL_pfl1R(pC2+4);
         rC0_3 += m0;
         rC1_3 += m1; ATL_pfl1R(pC3+4);
         rC2_3 += m2;
         rC3_3 += m3; ATL_pfl1R(pB0+KB3+8);
/*
 *       Write the finished tile back and move each C column pointer down
 *       by four tile rows
 */
         #ifdef TREAL
            *pC0 = rC0_0; pC0[1] = rC1_0; pC0[2] = rC2_0; pC0[3] = rC3_0; pC0 += 4;
            *pC1 = rC0_1; pC1[1] = rC1_1; pC1[2] = rC2_1; pC1[3] = rC3_1; pC1 += 4;
            *pC2 = rC0_2; pC2[1] = rC1_2; pC2[2] = rC2_2; pC2[3] = rC3_2; pC2 += 4;
            *pC3 = rC0_3; pC3[1] = rC1_3; pC3[2] = rC2_3; pC3[3] = rC3_3; pC3 += 4;
         #else
            *pC0 = rC0_0; pC0[2] = rC1_0; pC0[4] = rC2_0; pC0[6] = rC3_0; pC0 += 8;
            *pC1 = rC0_1; pC1[2] = rC1_1; pC1[4] = rC2_1; pC1[6] = rC3_1; pC1 += 8;
            *pC2 = rC0_2; pC2[2] = rC1_2; pC2[4] = rC2_2; pC2[6] = rC3_2; pC2 += 8;
            *pC3 = rC0_3; pC3[2] = rC1_3; pC3[4] = rC2_3; pC3[6] = rC3_3; pC3 += 8;
         #endif
      }
      while(pA0 != stM);
      /* next group of four C columns; rewind A, advance B */
      pC0 += incCn;
      pC1 += incCn;
      pC2 += incCn;
      pC3 += incCn;
      pA0 += incAn;
      pB0 += incBn;
   }
   while(pB0 != stN);
}
static void trsmLU_3(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Upper' triangular-solve kernel for a 3x3 complex A, applied in
 * place to N columns of B (three complex entries per column).  A is
 * expected to arrive with 1/alpha already applied and its diagonal already
 * inverted.
 *
 * Per column, in complex arithmetic:
 *    x3 = x3 * a33
 *    x2 = (x2 - a23*x3) * a22
 *    x1 = (x1 - a13*x3 - a12*x2) * a11
 *
 * The first N-1 columns are handled in the loop, which also reads one value
 * of the following column early and issues two ATL_pfl1W calls (defined
 * elsewhere, presumably write-prefetch hints) two column strides past that
 * following column.  The last column is handled after the loop without
 * either.  Because that last column is processed unconditionally, N must be
 * at least 1.
 */
{
   const TYPE a11_re = A[0],  a11_im = A[1];
   const TYPE a12_re = A[6],  a12_im = A[7];
   const TYPE a13_re = A[12], a13_im = A[13];
   const TYPE a22_re = A[8],  a22_im = A[9];
   const TYPE a23_re = A[14], a23_im = A[15];
   const TYPE a33_re = A[16], a33_im = A[17];
   const int colgap = ldb + ldb;          /* TYPE values between columns */
   const int pfgap  = colgap + colgap;
   TYPE *cur = B;
   TYPE *nxt = cur + colgap;
   TYPE x1_re, x1_im, x2_re, x2_im, x3_re, x3_im;
   TYPE keep, ahead;
   int todo;

   ahead = cur[4];
   for (todo = N - 1; todo != 0; --todo)
   {
      x3_re = ahead;   x3_im = cur[5];
      x1_re = cur[0];  x1_im = cur[1];
      x2_re = cur[2];  x2_im = cur[3];

      /* x3 *= a33 */
      keep  = x3_re;
      x3_re = a33_re*keep  - a33_im*x3_im;
      x3_im = a33_re*x3_im + a33_im*keep;

      /* x2 = (x2 - a23*x3) * a22 */
      x2_re = x2_re - (a23_re*x3_re - a23_im*x3_im);
      ahead = nxt[4];   /* early read of the next column's x3 real part */
      x2_im = x2_im - (a23_re*x3_im + a23_im*x3_re);
      keep  = x2_re;
      x2_re = a22_re*keep  - a22_im*x2_im;
      x2_im = a22_re*x2_im + a22_im*keep;

      /* x1 = (x1 - a13*x3 - a12*x2) * a11 */
      x1_re = x1_re - (a13_re*x3_re - a13_im*x3_im);
      ATL_pfl1W(nxt + pfgap);
      x1_im = x1_im - (a13_re*x3_im + a13_im*x3_re);
      ATL_pfl1W(nxt + pfgap + 4);
      x1_re = x1_re - (a12_re*x2_re - a12_im*x2_im);
      x1_im = x1_im - (a12_re*x2_im + a12_im*x2_re);
      keep  = x1_re;
      x1_re = a11_re*keep  - a11_im*x1_im;
      x1_im = a11_re*x1_im + a11_im*keep;

      cur[0] = x1_re;  cur[1] = x1_im;
      cur[2] = x2_re;  cur[3] = x2_im;
      cur[4] = x3_re;  cur[5] = x3_im;
      cur = nxt;
      nxt += colgap;
   }

   /* final column: same arithmetic, no look-ahead */
   x3_re = ahead;   x3_im = cur[5];
   x1_re = cur[0];  x1_im = cur[1];
   x2_re = cur[2];  x2_im = cur[3];

   keep  = x3_re;
   x3_re = a33_re*keep  - a33_im*x3_im;
   x3_im = a33_re*x3_im + a33_im*keep;

   x2_re = x2_re - (a23_re*x3_re - a23_im*x3_im);
   x2_im = x2_im - (a23_re*x3_im + a23_im*x3_re);
   keep  = x2_re;
   x2_re = a22_re*keep  - a22_im*x2_im;
   x2_im = a22_re*x2_im + a22_im*keep;

   x1_re = x1_re - (a13_re*x3_re - a13_im*x3_im);
   x1_im = x1_im - (a13_re*x3_im + a13_im*x3_re);
   x1_re = x1_re - (a12_re*x2_re - a12_im*x2_im);
   x1_im = x1_im - (a12_re*x2_im + a12_im*x2_re);
   keep  = x1_re;
   x1_re = a11_re*keep  - a11_im*x1_im;
   x1_im = a11_re*x1_im + a11_im*keep;

   cur[0] = x1_re;  cur[1] = x1_im;
   cur[2] = x2_re;  cur[3] = x2_im;
   cur[4] = x3_re;  cur[5] = x3_im;
}
static void trsmRL_4(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Right', 'Lower' triangular-solve kernel for a 4x4 complex A, applied in
 * place to M rows of B.  A is expected to arrive with 1/alpha already
 * applied and its diagonal already inverted, so the diagonal is applied by
 * multiplication.
 *
 * Storage as used below: real and imaginary parts are interleaved, A has
 * four complex entries per column, and consecutive columns of B are
 * ldb+ldb TYPE values apart.
 *
 * Per row, in complex arithmetic:
 *    x4 = x4 * a44
 *    x3 = (x3 - x4*a43) * a33
 *    x2 = (x2 - x4*a42 - x3*a32) * a22
 *    x1 = (x1 - x4*a41 - x3*a31 - x2*a21) * a11
 *
 * ATL_pfl1W is defined elsewhere (presumably a write-prefetch hint); it is
 * issued PF_AHEAD TYPE values ahead in each of the four columns.
 */
{
   const TYPE a11_re = A[0],  a11_im = A[1];
   const TYPE a21_re = A[2],  a21_im = A[3];
   const TYPE a31_re = A[4],  a31_im = A[5];
   const TYPE a41_re = A[6],  a41_im = A[7];
   const TYPE a22_re = A[10], a22_im = A[11];
   const TYPE a32_re = A[12], a32_im = A[13];
   const TYPE a42_re = A[14], a42_im = A[15];
   const TYPE a33_re = A[20], a33_im = A[21];
   const TYPE a43_re = A[22], a43_im = A[23];
   const TYPE a44_re = A[30], a44_im = A[31];
   const int colgap = ldb + ldb;
   TYPE *c1 = B;
   TYPE *c2 = c1 + colgap;
   TYPE *c3 = c2 + colgap;
   TYPE *c4 = c3 + colgap;
   int left;
   enum { PF_AHEAD = 8 };

   for (left = M; left != 0; --left)
   {
      TYPE x4_re = c4[0], x3_re = c3[0], x2_re = c2[0], x1_re = c1[0];
      TYPE x4_im = c4[1], x3_im = c3[1], x2_im = c2[1], x1_im = c1[1];
      TYPE keep;   /* real part saved across a complex multiply */

      /* x4 *= a44 */
      keep  = x4_re;
      x4_re = a44_re*keep  - a44_im*x4_im;
      x4_im = a44_re*x4_im + a44_im*keep;

      /* x3 = (x3 - x4*a43) * a33 */
      x3_re = x3_re - (x4_re*a43_re - x4_im*a43_im);
      x3_im = x3_im - (x4_re*a43_im + x4_im*a43_re);
      keep  = x3_re;
      x3_re = a33_re*keep  - a33_im*x3_im;
      x3_im = a33_re*x3_im + a33_im*keep;

      /* x2 = (x2 - x4*a42 - x3*a32) * a22 */
      x2_re = x2_re - (x4_re*a42_re - x4_im*a42_im);
      x2_im = x2_im - (x4_re*a42_im + x4_im*a42_re);
      ATL_pfl1W(c4 + PF_AHEAD);
      x2_re = x2_re - (x3_re*a32_re - x3_im*a32_im);
      ATL_pfl1W(c3 + PF_AHEAD);
      x2_im = x2_im - (x3_re*a32_im + x3_im*a32_re);
      keep  = x2_re;
      x2_re = keep*a22_re - x2_im*a22_im;
      x2_im = keep*a22_im + x2_im*a22_re;

      /* x1 = (x1 - x4*a41 - x3*a31 - x2*a21) * a11 */
      x1_re = x1_re - (x4_re*a41_re - x4_im*a41_im);
      x1_im = x1_im - (x4_re*a41_im + x4_im*a41_re);
      x1_re = x1_re - (x3_re*a31_re - x3_im*a31_im);
      x1_im = x1_im - (x3_re*a31_im + x3_im*a31_re);
      x1_re = x1_re - (x2_re*a21_re - x2_im*a21_im);
      ATL_pfl1W(c2 + PF_AHEAD);
      x1_im = x1_im - (x2_re*a21_im + x2_im*a21_re);
      ATL_pfl1W(c1 + PF_AHEAD);
      keep  = x1_re;
      x1_re = keep*a11_re - x1_im*a11_im;
      x1_im = keep*a11_im + x1_im*a11_re;

      /* write back in the order column 4, 3, 2, 1 and step one row down */
      c4[0] = x4_re;  c4[1] = x4_im;  c4 += 2;
      c3[0] = x3_re;  c3[1] = x3_im;  c3 += 2;
      c2[0] = x2_re;  c2[1] = x2_im;  c2 += 2;
      c1[0] = x1_re;  c1[1] = x1_im;  c1 += 2;
   }
}
static void trsmRU_4(const int M, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Right', 'Upper' triangular-solve kernel for a 4x4 complex A, applied in
 * place to M rows of B.  A is expected to arrive with 1/alpha already
 * applied and its diagonal already inverted, so the diagonal is applied by
 * multiplication.
 *
 * Storage as used below: real and imaginary parts are interleaved, A has
 * four complex entries per column, and consecutive columns of B are
 * ldb+ldb TYPE values apart.
 *
 * Per row, in complex arithmetic:
 *    x1 = x1 * a11
 *    x2 = (x2 - x1*a12) * a22
 *    x3 = (x3 - x1*a13 - x2*a23) * a33
 *    x4 = (x4 - x1*a14 - x2*a24 - x3*a34) * a44
 *
 * ATL_pfl1W is defined elsewhere (presumably a write-prefetch hint); it is
 * issued PF_AHEAD TYPE values ahead in each of the four columns.
 */
{
   const TYPE a11_re = A[0],  a11_im = A[1];
   const TYPE a12_re = A[8],  a12_im = A[9];
   const TYPE a13_re = A[16], a13_im = A[17];
   const TYPE a14_re = A[24], a14_im = A[25];
   const TYPE a22_re = A[10], a22_im = A[11];
   const TYPE a23_re = A[18], a23_im = A[19];
   const TYPE a24_re = A[26], a24_im = A[27];
   const TYPE a33_re = A[20], a33_im = A[21];
   const TYPE a34_re = A[28], a34_im = A[29];
   const TYPE a44_re = A[30], a44_im = A[31];
   const int colgap = ldb + ldb;
   TYPE *c1 = B;
   TYPE *c2 = c1 + colgap;
   TYPE *c3 = c2 + colgap;
   TYPE *c4 = c3 + colgap;
   int left;
   enum { PF_AHEAD = 8 };

   for (left = M; left != 0; --left)
   {
      TYPE x1_re = c1[0], x2_re = c2[0], x3_re = c3[0], x4_re = c4[0];
      TYPE x1_im = c1[1], x2_im = c2[1], x3_im = c3[1], x4_im = c4[1];
      TYPE keep;   /* real part saved across a complex multiply */

      /* x1 *= a11 */
      keep  = x1_re;
      x1_re = keep*a11_re - x1_im*a11_im;
      x1_im = keep*a11_im + x1_im*a11_re;

      /* x2 = (x2 - x1*a12) * a22 */
      x2_re = x2_re - (x1_re*a12_re - x1_im*a12_im);
      x2_im = x2_im - (x1_re*a12_im + x1_im*a12_re);
      keep  = x2_re;
      x2_re = keep*a22_re - x2_im*a22_im;
      x2_im = keep*a22_im + x2_im*a22_re;

      /* x3 = (x3 - x1*a13 - x2*a23) * a33 */
      x3_re = x3_re - (x1_re*a13_re - x1_im*a13_im);
      x3_im = x3_im - (x1_re*a13_im + x1_im*a13_re);
      x3_re = x3_re - (x2_re*a23_re - x2_im*a23_im);
      ATL_pfl1W(c1 + PF_AHEAD);
      x3_im = x3_im - (x2_re*a23_im + x2_im*a23_re);
      ATL_pfl1W(c2 + PF_AHEAD);
      keep  = x3_re;
      x3_re = keep*a33_re - x3_im*a33_im;
      x3_im = keep*a33_im + x3_im*a33_re;

      /* x4 = (x4 - x1*a14 - x2*a24 - x3*a34) * a44 */
      x4_re = x4_re - (x1_re*a14_re - x1_im*a14_im);
      x4_im = x4_im - (x1_re*a14_im + x1_im*a14_re);
      x4_re = x4_re - (x2_re*a24_re - x2_im*a24_im);
      x4_im = x4_im - (x2_re*a24_im + x2_im*a24_re);
      x4_re = x4_re - (x3_re*a34_re - x3_im*a34_im);
      ATL_pfl1W(c3 + PF_AHEAD);
      x4_im = x4_im - (x3_re*a34_im + x3_im*a34_re);
      ATL_pfl1W(c4 + PF_AHEAD);
      keep  = x4_re;
      x4_re = keep*a44_re - x4_im*a44_im;
      x4_im = keep*a44_im + x4_im*a44_re;

      /* write back in the order column 1, 2, 3, 4 and step one row down */
      c1[0] = x1_re;  c1[1] = x1_im;  c1 += 2;
      c2[0] = x2_re;  c2[1] = x2_im;  c2 += 2;
      c3[0] = x3_re;  c3[1] = x3_im;  c3 += 2;
      c4[0] = x4_re;  c4[1] = x4_im;  c4 += 2;
   }
}
static void trsmLL_4(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Lower' triangular-solve kernel for a 4x4 complex A, applied in
 * place to N columns of B (four complex entries per column).  A is
 * expected to arrive with 1/alpha already applied and its diagonal already
 * inverted.
 *
 * Per column, in complex arithmetic:
 *    x1 = x1 * a11
 *    x2 = (x2 - a21*x1) * a22
 *    x3 = (x3 - a31*x1 - a32*x2) * a33
 *    x4 = (x4 - a41*x1 - a42*x2 - a43*x3) * a44
 *
 * The first N-1 columns are handled in the loop, which also reads one value
 * of the following column early and issues two ATL_pfl1W calls (defined
 * elsewhere, presumably write-prefetch hints) two column strides past that
 * following column.  The last column is handled after the loop without
 * either.  Because that last column is processed unconditionally, N must be
 * at least 1.
 */
{
   const TYPE a11_re = A[0],  a11_im = A[1];
   const TYPE a21_re = A[2],  a21_im = A[3];
   const TYPE a31_re = A[4],  a31_im = A[5];
   const TYPE a41_re = A[6],  a41_im = A[7];
   const TYPE a22_re = A[10], a22_im = A[11];
   const TYPE a32_re = A[12], a32_im = A[13];
   const TYPE a42_re = A[14], a42_im = A[15];
   const TYPE a33_re = A[20], a33_im = A[21];
   const TYPE a43_re = A[22], a43_im = A[23];
   const TYPE a44_re = A[30], a44_im = A[31];
   const int colgap = ldb + ldb;          /* TYPE values between columns */
   const int pfgap  = colgap + colgap;
   TYPE *cur = B;
   TYPE *nxt = cur + colgap;
   TYPE x1_re, x1_im, x2_re, x2_im, x3_re, x3_im, x4_re, x4_im;
   TYPE keep, ahead;
   int todo;

   ahead = cur[0];
   for (todo = N - 1; todo != 0; --todo)
   {
      x1_re = ahead;   x1_im = cur[1];
      x3_re = cur[4];  x3_im = cur[5];
      x2_re = cur[2];  x2_im = cur[3];
      x4_re = cur[6];  x4_im = cur[7];

      /* x1 *= a11 */
      keep  = x1_re;
      x1_re = a11_re * keep  - a11_im * x1_im;
      x1_im = a11_re * x1_im + a11_im * keep;

      /* x2 = (x2 - a21*x1) * a22 */
      x2_re = x2_re - (a21_re*x1_re - a21_im*x1_im);
      x2_im = x2_im - (a21_re*x1_im + a21_im*x1_re);
      keep  = x2_re;
      x2_re = a22_re*keep  - a22_im*x2_im;
      x2_im = a22_re*x2_im + a22_im*keep;

      /* x3 = (x3 - a31*x1 - a32*x2) * a33 */
      x3_re = x3_re - (a31_re*x1_re - a31_im*x1_im);
      x3_im = x3_im - (a31_re*x1_im + a31_im*x1_re);
      x3_re = x3_re - (a32_re*x2_re - a32_im*x2_im);
      ahead = nxt[0];   /* early read of the next column's x1 real part */
      x3_im = x3_im - (a32_re*x2_im + a32_im*x2_re);
      keep  = x3_re;
      x3_re = a33_re*keep  - a33_im*x3_im;
      x3_im = a33_re*x3_im + a33_im*keep;

      /* x4 = (x4 - a41*x1 - a42*x2 - a43*x3) * a44 */
      x4_re = x4_re - (a41_re*x1_re - a41_im*x1_im);
      ATL_pfl1W(nxt + pfgap);
      x4_im = x4_im - (a41_re*x1_im + a41_im*x1_re);
      ATL_pfl1W(nxt + pfgap + 4);
      x4_re = x4_re - (a42_re*x2_re - a42_im*x2_im);
      x4_im = x4_im - (a42_re*x2_im + a42_im*x2_re);
      x4_re = x4_re - (a43_re*x3_re - a43_im*x3_im);
      x4_im = x4_im - (a43_re*x3_im + a43_im*x3_re);
      keep  = x4_re;
      x4_re = a44_re*keep  - a44_im*x4_im;
      x4_im = a44_re*x4_im + a44_im*keep;

      cur[0] = x1_re;  cur[1] = x1_im;
      cur[2] = x2_re;  cur[3] = x2_im;
      cur[4] = x3_re;  cur[5] = x3_im;
      cur[6] = x4_re;  cur[7] = x4_im;
      cur = nxt;
      nxt += colgap;
   }

   /* final column: same arithmetic, no look-ahead */
   x1_re = ahead;   x1_im = cur[1];
   x3_re = cur[4];  x3_im = cur[5];
   x2_re = cur[2];  x2_im = cur[3];
   x4_re = cur[6];  x4_im = cur[7];

   keep  = x1_re;
   x1_re = a11_re * keep  - a11_im * x1_im;
   x1_im = a11_re * x1_im + a11_im * keep;

   x2_re = x2_re - (a21_re*x1_re - a21_im*x1_im);
   x2_im = x2_im - (a21_re*x1_im + a21_im*x1_re);
   keep  = x2_re;
   x2_re = a22_re*keep  - a22_im*x2_im;
   x2_im = a22_re*x2_im + a22_im*keep;

   x3_re = x3_re - (a31_re*x1_re - a31_im*x1_im);
   x3_im = x3_im - (a31_re*x1_im + a31_im*x1_re);
   x3_re = x3_re - (a32_re*x2_re - a32_im*x2_im);
   x3_im = x3_im - (a32_re*x2_im + a32_im*x2_re);
   keep  = x3_re;
   x3_re = a33_re*keep  - a33_im*x3_im;
   x3_im = a33_re*x3_im + a33_im*keep;

   x4_re = x4_re - (a41_re*x1_re - a41_im*x1_im);
   x4_im = x4_im - (a41_re*x1_im + a41_im*x1_re);
   x4_re = x4_re - (a42_re*x2_re - a42_im*x2_im);
   x4_im = x4_im - (a42_re*x2_im + a42_im*x2_re);
   x4_re = x4_re - (a43_re*x3_re - a43_im*x3_im);
   x4_im = x4_im - (a43_re*x3_im + a43_im*x3_re);
   keep  = x4_re;
   x4_re = a44_re*keep  - a44_im*x4_im;
   x4_im = a44_re*x4_im + a44_im*keep;

   cur[0] = x1_re;  cur[1] = x1_im;
   cur[2] = x2_re;  cur[3] = x2_im;
   cur[4] = x3_re;  cur[5] = x3_im;
   cur[6] = x4_re;  cur[7] = x4_im;
}
static void trsmLU_4(const int N, const TYPE *A, TYPE *B, const int ldb)
/*
 * 'Left', 'Upper' triangular-solve kernel for a 4x4 complex A, applied in
 * place to N columns of B (four complex entries per column).  A is
 * expected to arrive with 1/alpha already applied and its diagonal already
 * inverted.
 *
 * Per column, in complex arithmetic:
 *    x4 = x4 * a44
 *    x3 = (x3 - a34*x4) * a33
 *    x2 = (x2 - a24*x4 - a23*x3) * a22
 *    x1 = (x1 - a14*x4 - a13*x3 - a12*x2) * a11
 *
 * The first N-1 columns are handled in the loop, which also reads one value
 * of the following column early and issues two ATL_pfl1W calls (defined
 * elsewhere, presumably write-prefetch hints) two column strides past that
 * following column.  The last column is handled after the loop without
 * either.  Because that last column is processed unconditionally, N must be
 * at least 1.
 */
{
   const TYPE a11_re = A[0],  a11_im = A[1];
   const TYPE a12_re = A[8],  a12_im = A[9];
   const TYPE a13_re = A[16], a13_im = A[17];
   const TYPE a14_re = A[24], a14_im = A[25];
   const TYPE a22_re = A[10], a22_im = A[11];
   const TYPE a23_re = A[18], a23_im = A[19];
   const TYPE a24_re = A[26], a24_im = A[27];
   const TYPE a33_re = A[20], a33_im = A[21];
   const TYPE a34_re = A[28], a34_im = A[29];
   const TYPE a44_re = A[30], a44_im = A[31];
   const int colgap = ldb + ldb;          /* TYPE values between columns */
   const int pfgap  = colgap + colgap;
   TYPE *cur = B;
   TYPE *nxt = cur + colgap;
   TYPE x1_re, x1_im, x2_re, x2_im, x3_re, x3_im, x4_re, x4_im;
   TYPE keep, ahead;
   int todo;

   ahead = cur[6];
   for (todo = N - 1; todo != 0; --todo)
   {
      x4_re = ahead;   x4_im = cur[7];
      x1_re = cur[0];  x1_im = cur[1];
      x3_re = cur[4];  x3_im = cur[5];
      x2_re = cur[2];  x2_im = cur[3];

      /* x4 *= a44 */
      keep  = x4_re;
      x4_re = a44_re*keep  - a44_im*x4_im;
      x4_im = a44_re*x4_im + a44_im*keep;

      /* x3 = (x3 - a34*x4) * a33 */
      x3_re = x3_re - (a34_re*x4_re - a34_im*x4_im);
      x3_im = x3_im - (a34_re*x4_im + a34_im*x4_re);
      keep  = x3_re;
      x3_re = a33_re*keep  - a33_im*x3_im;
      x3_im = a33_re*x3_im + a33_im*keep;

      /* x2 = (x2 - a24*x4 - a23*x3) * a22 */
      x2_re = x2_re - (a24_re*x4_re - a24_im*x4_im);
      ahead = nxt[6];   /* early read of the next column's x4 real part */
      x2_im = x2_im - (a24_re*x4_im + a24_im*x4_re);
      x2_re = x2_re - (a23_re*x3_re - a23_im*x3_im);
      x2_im = x2_im - (a23_re*x3_im + a23_im*x3_re);
      keep  = x2_re;
      x2_re = a22_re*keep  - a22_im*x2_im;
      ATL_pfl1W(nxt + pfgap);
      x2_im = a22_re*x2_im + a22_im*keep;
      ATL_pfl1W(nxt + pfgap + 4);

      /* x1 = (x1 - a14*x4 - a13*x3 - a12*x2) * a11 */
      x1_re = x1_re - (a14_re*x4_re - a14_im*x4_im);
      x1_im = x1_im - (a14_re*x4_im + a14_im*x4_re);
      x1_re = x1_re - (a13_re*x3_re - a13_im*x3_im);
      x1_im = x1_im - (a13_re*x3_im + a13_im*x3_re);
      x1_re = x1_re - (a12_re*x2_re - a12_im*x2_im);
      x1_im = x1_im - (a12_re*x2_im + a12_im*x2_re);
      keep  = x1_re;
      x1_re = a11_re*keep  - a11_im*x1_im;
      x1_im = a11_re*x1_im + a11_im*keep;

      cur[0] = x1_re;  cur[1] = x1_im;
      cur[2] = x2_re;  cur[3] = x2_im;
      cur[4] = x3_re;  cur[5] = x3_im;
      cur[6] = x4_re;  cur[7] = x4_im;
      cur = nxt;
      nxt += colgap;
   }

   /* final column: same arithmetic, no look-ahead */
   x4_re = ahead;   x4_im = cur[7];
   x1_re = cur[0];  x1_im = cur[1];
   x3_re = cur[4];  x3_im = cur[5];
   x2_re = cur[2];  x2_im = cur[3];

   keep  = x4_re;
   x4_re = a44_re*keep  - a44_im*x4_im;
   x4_im = a44_re*x4_im + a44_im*keep;

   x3_re = x3_re - (a34_re*x4_re - a34_im*x4_im);
   x3_im = x3_im - (a34_re*x4_im + a34_im*x4_re);
   keep  = x3_re;
   x3_re = a33_re*keep  - a33_im*x3_im;
   x3_im = a33_re*x3_im + a33_im*keep;

   x2_re = x2_re - (a24_re*x4_re - a24_im*x4_im);
   x2_im = x2_im - (a24_re*x4_im + a24_im*x4_re);
   x2_re = x2_re - (a23_re*x3_re - a23_im*x3_im);
   x2_im = x2_im - (a23_re*x3_im + a23_im*x3_re);
   keep  = x2_re;
   x2_re = a22_re*keep  - a22_im*x2_im;
   x2_im = a22_re*x2_im + a22_im*keep;

   x1_re = x1_re - (a14_re*x4_re - a14_im*x4_im);
   x1_im = x1_im - (a14_re*x4_im + a14_im*x4_re);
   x1_re = x1_re - (a13_re*x3_re - a13_im*x3_im);
   x1_im = x1_im - (a13_re*x3_im + a13_im*x3_re);
   x1_re = x1_re - (a12_re*x2_re - a12_im*x2_im);
   x1_im = x1_im - (a12_re*x2_im + a12_im*x2_re);
   keep  = x1_re;
   x1_re = a11_re*keep  - a11_im*x1_im;
   x1_im = a11_re*x1_im + a11_im*keep;

   cur[0] = x1_re;  cur[1] = x1_im;
   cur[2] = x2_re;  cur[3] = x2_im;
   cur[4] = x3_re;  cur[5] = x3_im;
   cur[6] = x4_re;  cur[7] = x4_im;
}
void ATL_USERMM (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with TA=T, TB=N, muladd=0, lat=4 lda=ldb=MB=KB=NB, ldc=0,
 * mu=4, nu=3, ku=2, and register prefetch
 *
 * Register-blocked kernel: every pass of the M-loop accumulates a tile of
 * 4 entries (index i, taken from A) by 3 columns (pC0/pC1/pC2, taken from B)
 * of C in the twelve rCi_j accumulators.  A is walked as pA0[i*KB + k] and B
 * as pB0[j*KB + k].
 *
 * Compile-time variants:
 *    BETA0 : accumulators start at ATL_rzero, C is not read;
 *    BETAX : C is read and scaled by beta before accumulation;
 *    neither: C is read and accumulated into unscaled;
 *    TREAL : C entries are adjacent; otherwise every second element of C is
 *            touched (presumably one component of interleaved complex
 *            storage -- confirm against the caller).
 *
 * K, alpha, lda and ldb are not referenced in the body; the inner dimension
 * and the strides of A and B all come from the KB macros.
 *
 * Both outer loops are do/while loops terminated by pointer equality, so they
 * execute at least once.  Assumes M is a positive multiple of 4, N a positive
 * multiple of 3, and KB a multiple of 8 with KB >= 8 -- TODO confirm; also
 * presumes KB2..KB7 expand to 2*KB..7*KB.
 *
 * NOTE(review): the header above says ku=2, but each pass of the k loop below
 * consumes eight k indices (startK = (KB>>3)-1) -- confirm which is intended.
 */
{
   const TYPE *stM = A + M*KB;       /* M-loop ends when pA0 reaches this */
   const TYPE *stN = B + KB*N;       /* N-loop ends when pB0 reaches this */
   const int incAm = KB3+8;          /* applied after the k loop has moved pA0 by 8 per pass */
   #define incBn KB3
   const int startK = (KB>>3)-1;     /* k-loop trip count; the last 8 k indices are handled by the drain */
   const int incAn = -KB*M, incBm = 8-KB;
   const int incCn = (3*ldc - M)SHIFT;
   TYPE *pC0=C, *pC1=pC0+(ldc SHIFT), *pC2=pC1+(ldc SHIFT);
   const TYPE *pA0=A;
   const TYPE *pB0=B;
   register int k;
   /* Two operand sets (upper-case / lower-case names) alternate between
      successive k indices so loads can be issued ahead of their use. */
   register TYPE rA0, rA1, rA2, rA3, ra0, ra1, ra2, ra3;
   register TYPE rB0, rB1, rB2, rb0, rb1, rb2;
   register TYPE m0, m1, m2, m3;     /* products waiting to be accumulated */
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC0_1, rC1_1, rC2_1, rC3_1, rC0_2, rC1_2, rC2_2, rC3_2;

   do /* N-loop */
   {
      ATL_pfl1R(pB0); ATL_pfl1R(pB0+KB); ATL_pfl1R(pB0+KB2);
      ATL_pfl1R(pB0+8); ATL_pfl1R(pB0+KB+8); ATL_pfl1R(pB0+KB2+8);
      do /* M-loop */
      {
         /* Initialise the twelve accumulators according to the beta variant */
         #ifdef BETA0
            rC0_0 = rC1_0 = rC2_0 = rC3_0 = rC0_1 = rC1_1 = rC2_1 = rC3_1 = rC0_2 = rC1_2 = rC2_2 = rC3_2 = ATL_rzero;
         #else
            #ifdef TREAL
               rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];
               rC0_1 = *pC1; rC1_1 = pC1[1]; rC2_1 = pC1[2]; rC3_1 = pC1[3];
               rC0_2 = *pC2; rC1_2 = pC2[1]; rC2_2 = pC2[2]; rC3_2 = pC2[3];
            #else
               rC0_0 = *pC0; rC1_0 = pC0[2]; rC2_0 = pC0[4]; rC3_0 = pC0[6];
               rC0_1 = *pC1; rC1_1 = pC1[2]; rC2_1 = pC1[4]; rC3_1 = pC1[6];
               rC0_2 = *pC2; rC1_2 = pC2[2]; rC2_2 = pC2[4]; rC3_2 = pC2[6];
            #endif
            #ifdef BETAX
               ra3 = beta;   /* ra3 is borrowed as scratch; it is reloaded below */
               rC0_0 *= ra3; rC1_0 *= ra3; rC2_0 *= ra3; rC3_0 *= ra3;
               rC0_1 *= ra3; rC1_1 *= ra3; rC2_1 *= ra3; rC3_1 *= ra3;
               rC0_2 *= ra3; rC1_2 *= ra3; rC2_2 *= ra3; rC3_2 *= ra3;
            #endif
         #endif
/*
 *       Start pipeline: load operands for k=0 and k=1, rB0 for k=2, and form
 *       the first four products (k=0, column 0 of the tile)
 */
         rA0 = *pA0; rB0 = *pB0; rA1 = pA0[KB]; rA2 = pA0[KB2];
         rA3 = pA0[KB3]; rB1 = pB0[KB]; rB2 = pB0[KB2];
         rb0 = pB0[1]; rb1 = pB0[KB+1]; rb2 = pB0[KB2+1];
         m0 = rA0 * rB0; ra0 = pA0[1];
         ra1 = pA0[KB+1];
         m1 = rA1 * rB0;
         ra2 = pA0[KB2+1];
         m2 = rA2 * rB0;
         ra3 = pA0[KB3+1];
         m3 = rA3 * rB0;
         rB0 = pB0[2];
/*
 *       Steady state: each pass retires 8 k indices.  Every 12-line group
 *       finishes one k index (accumulate column 0, then multiply/accumulate
 *       columns 1 and 2), reloads that operand set two indices ahead, and
 *       starts the column-0 products of the next k index.
 */
         for (k=startK; k; k--) /* easy loop to unroll */
         {
            /* k+0 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1R(pA0+KB4);
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +2];
            rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[ 2];
            rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +2];
            rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+2];
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+2];
            rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+2];
            rC1_2 += m1; m1 = ra1 * rb0;
            rC2_2 += m2; m2 = ra2 * rb0;
            rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[3];

            /* k+1 */
            rC0_0 += m0; m0 = ra0 * rb1;
            rC1_0 += m1; m1 = ra1 * rb1; ATL_pfl1R(pA0+KB5);
            rC2_0 += m2; m2 = ra2 * rb1;
            rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +3];
            rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[3];
            rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +3];
            rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+3];
            rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+3];
            rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+3];
            rC1_2 += m1; m1 = rA1 * rB0;
            rC2_2 += m2; m2 = rA2 * rB0;
            rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[4];

            /* k+2 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1R(pA0+KB6);
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +4];
            rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[ 4];
            rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +4];
            rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+4];
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+4];
            rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+4];
            rC1_2 += m1; m1 = ra1 * rb0;
            rC2_2 += m2; m2 = ra2 * rb0;
            rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[5];

            /* k+3 */
            rC0_0 += m0; m0 = ra0 * rb1;
            rC1_0 += m1; m1 = ra1 * rb1; ATL_pfl1R(pA0+KB7);
            rC2_0 += m2; m2 = ra2 * rb1;
            rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +5];
            rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[5];
            rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +5];
            rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+5];
            rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+5];
            rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+5];
            rC1_2 += m1; m1 = rA1 * rB0;
            rC2_2 += m2; m2 = rA2 * rB0;
            rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[6];

            /* k+4 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1R(pB0+16);
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +6];
            rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[ 6];
            rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +6];
            rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+6];
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+6];
            rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+6];
            rC1_2 += m1; m1 = ra1 * rb0;
            rC2_2 += m2; m2 = ra2 * rb0;
            rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[7];

            /* k+5 */
            rC0_0 += m0; m0 = ra0 * rb1;
            rC1_0 += m1; m1 = ra1 * rb1; ATL_pfl1R(pB0+KB+16);
            rC2_0 += m2; m2 = ra2 * rb1;
            rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +7];
            rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[7];
            rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +7];
            rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+7];
            rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+7];
            rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+7];
            rC1_2 += m1; m1 = rA1 * rB0;
            rC2_2 += m2; m2 = rA2 * rB0;
            rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[8];

            /* k+6: loads now reach into the next group of 8 */
            rC0_0 += m0; m0 = rA0 * rB1;
            rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1R(pB0+KB2+16);
            rC2_0 += m2; m2 = rA2 * rB1;
            rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +8];
            rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[ 8];
            rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +8];
            rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+8];
            rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+8];
            rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+8];
            rC1_2 += m1; m1 = ra1 * rb0;
            rC2_2 += m2; m2 = ra2 * rb0;
            rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[9];

            /* k+7: last loads of the pass, then advance pA0 and pB0 by 8 */
            rC0_0 += m0; m0 = ra0 * rb1;
            rC1_0 += m1; m1 = ra1 * rb1; ATL_pfl1R(pB0+KB3+16);
            rC2_0 += m2; m2 = ra2 * rb1;
            rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +9];
            rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[9];
            rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +9];
            rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+9];
            rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+9];
            rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+9]; pA0 += 8;
            rC1_2 += m1; m1 = rA1 * rB0;
            rC2_2 += m2; m2 = rA2 * rB0;
            rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[10]; pB0 += 8;
         }
/*
 *       Drain pipeline: the final 8 k indices.  Operands are loaded only up
 *       to index 7, so nothing past the current A and B panels is read (the
 *       prefetch hints aside).
 */
         /* k+0 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1R(pA0+KB4);
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +2];
         rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[ 2];
         rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +2];
         rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+2];
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+2];
         rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+2];
         rC1_2 += m1; m1 = ra1 * rb0;
         rC2_2 += m2; m2 = ra2 * rb0;
         rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[3];

         /* k+1 */
         rC0_0 += m0; m0 = ra0 * rb1;
         rC1_0 += m1; m1 = ra1 * rb1; ATL_pfl1R(pA0+KB5);
         rC2_0 += m2; m2 = ra2 * rb1;
         rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +3];
         rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[3];
         rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +3];
         rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+3];
         rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+3];
         rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+3];
         rC1_2 += m1; m1 = rA1 * rB0;
         rC2_2 += m2; m2 = rA2 * rB0;
         rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[4];

         /* k+2 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1R(pA0+KB6);
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +4];
         rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[ 4];
         rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +4];
         rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+4];
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+4];
         rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+4];
         rC1_2 += m1; m1 = ra1 * rb0;
         rC2_2 += m2; m2 = ra2 * rb0;
         rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[5];

         /* k+3 */
         rC0_0 += m0; m0 = ra0 * rb1;
         rC1_0 += m1; m1 = ra1 * rb1; ATL_pfl1R(pA0+KB7);
         rC2_0 += m2; m2 = ra2 * rb1;
         rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +5];
         rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[5];
         rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +5];
         rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+5];
         rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+5];
         rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+5];
         rC1_2 += m1; m1 = rA1 * rB0;
         rC2_2 += m2; m2 = rA2 * rB0;
         rC3_2 += m3; m3 = rA3 * rB0; rB0 = pB0[6];

         /* k+4 */
         rC0_0 += m0; m0 = rA0 * rB1;
         rC1_0 += m1; m1 = rA1 * rB1; ATL_pfl1R(pB0-KB+8);
         rC2_0 += m2; m2 = rA2 * rB1;
         rC3_0 += m3; m3 = rA3 * rB1; rB1 = pB0[KB +6];
         rC0_1 += m0; m0 = rA0 * rB2; rA0 = pA0[ 6];
         rC1_1 += m1; m1 = rA1 * rB2; rA1 = pA0[KB +6];
         rC2_1 += m2; m2 = rA2 * rB2; rA2 = pA0[KB2+6];
         rC3_1 += m3; m3 = rA3 * rB2; rB2 = pB0[KB2+6];
         rC0_2 += m0; m0 = ra0 * rb0; rA3 = pA0[KB3+6];
         rC1_2 += m1; m1 = ra1 * rb0;
         rC2_2 += m2; m2 = ra2 * rb0;
         rC3_2 += m3; m3 = ra3 * rb0; rb0 = pB0[7];

         /* k+5: final operand loads; then step pA0 on by incAm and pB0 by incBm */
         rC0_0 += m0; m0 = ra0 * rb1;
         rC1_0 += m1; m1 = ra1 * rb1; ATL_pfl1R(pB0+8);
         rC2_0 += m2; m2 = ra2 * rb1;
         rC3_0 += m3; m3 = ra3 * rb1; rb1 = pB0[KB +7];
         rC0_1 += m0; m0 = ra0 * rb2; ra0 = pA0[7];
         rC1_1 += m1; m1 = ra1 * rb2; ra1 = pA0[KB +7];
         rC2_1 += m2; m2 = ra2 * rb2; ra2 = pA0[KB2+7];
         rC3_1 += m3; m3 = ra3 * rb2; rb2 = pB0[KB2+7];
         rC0_2 += m0; m0 = rA0 * rB0; ra3 = pA0[KB3+7];
         rC1_2 += m1; m1 = rA1 * rB0; pA0 += incAm;
         rC2_2 += m2; m2 = rA2 * rB0; ATL_pfl1R(pB0+KB+8);
         rC3_2 += m3; m3 = rA3 * rB0; pB0 += incBm;

         /* k+6: no further loads; hints issued on this tile of C and on the
            locations 4 entries further along each of the three columns */
         rC0_0 += m0; m0 = rA0 * rB1; ATL_pfl1W(pC0);
         rC1_0 += m1; m1 = rA1 * rB1;
         rC2_0 += m2; m2 = rA2 * rB1; ATL_pfl1W(pC1);
         rC3_0 += m3; m3 = rA3 * rB1;
         rC0_1 += m0; m0 = rA0 * rB2; ATL_pfl1W(pC2);
         rC1_1 += m1; m1 = rA1 * rB2;
         rC2_1 += m2; m2 = rA2 * rB2; ATL_pfl1R(pC0+(4 SHIFT));
         rC3_1 += m3; m3 = rA3 * rB2;
         rC0_2 += m0; m0 = ra0 * rb0; ATL_pfl1R(pC1+(4 SHIFT));
         rC1_2 += m1; m1 = ra1 * rb0;
         rC2_2 += m2; m2 = ra2 * rb0; ATL_pfl1R(pC2+(4 SHIFT));
         rC3_2 += m3; m3 = ra3 * rb0;

         /* k+7: last products and accumulations.
            NOTE(review): pB0+KB2+8 is hinted twice below while pB0+KB+8 is
            not; possibly a copy/paste slip -- confirm. */
         rC0_0 += m0; m0 = ra0 * rb1; ATL_pfl1R(pB0+8);
         rC1_0 += m1; m1 = ra1 * rb1;
         rC2_0 += m2; m2 = ra2 * rb1;
         rC3_0 += m3; m3 = ra3 * rb1;
         rC0_1 += m0; m0 = ra0 * rb2;
         rC1_1 += m1; m1 = ra1 * rb2; ATL_pfl1R(pB0+KB2+8);
         rC2_1 += m2; m2 = ra2 * rb2;
         rC3_1 += m3; m3 = ra3 * rb2;
         rC0_2 += m0;
         rC1_2 += m1;
         rC2_2 += m2;
         rC3_2 += m3; ATL_pfl1R(pB0+KB2+8);

         /* Write the finished 4x3 tile back and step 4 entries along each C column */
         #ifdef TREAL
            *pC0 = rC0_0; pC0[1] = rC1_0; pC0[2] = rC2_0; pC0[3] = rC3_0;
            pC0 += 4;
            *pC1 = rC0_1; pC1[1] = rC1_1; pC1[2] = rC2_1; pC1[3] = rC3_1;
            pC1 += 4;
            *pC2 = rC0_2; pC2[1] = rC1_2; pC2[2] = rC2_2; pC2[3] = rC3_2;
            pC2 += 4;
         #else
            *pC0 = rC0_0; pC0[2] = rC1_0; pC0[4] = rC2_0; pC0[6] = rC3_0;
            pC0 += 8;
            *pC1 = rC0_1; pC1[2] = rC1_1; pC1[4] = rC2_1; pC1[6] = rC3_1;
            pC1 += 8;
            *pC2 = rC0_2; pC2[2] = rC1_2; pC2[4] = rC2_2; pC2[6] = rC3_2;
            pC2 += 8;
         #endif
      }
      while(pA0 != stM);
      /* Move on to the next three columns of C and B; rewind A to its start */
      pC0 += incCn;
      pC1 += incCn;
      pC2 += incCn;
      pA0 += incAn;
      pB0 += incBn;
   }
   while(pB0 != stN);
}