int mmtst(void) { char fnam[80]; #if defined(LDA) && LDA != 0 const int lda=LDA; #else const int lda=2*LDA2; #endif #if defined(LDB) && LDB != 0 const int ldb=LDB; #else const int ldb=2*LDB2; #endif #if defined(LDC) && LDC != 0 const int ldc=LDC; #else const int ldc=2*LDC2; #endif int nA, nB; #ifdef TCPLX int inca, incb, incc; const TYPE one=1.0, none=(-1.0); #if (ALPHA == 1) TYPE alpha[2] = {1.0, 0.0}; #elif (ALPHA == -1) TYPE alpha[2] = {-1.0, 0.0}; #else TYPE alpha[2] = {2.3, 0.0}; #endif #if (BETA == 1) TYPE beta[2] = {1.0, 0.0}; #elif (BETA == -1) TYPE beta[2] = {-1.0, 0.0}; #elif (BETA == 0) TYPE beta[2] = {0.0, 0.0}; #else TYPE beta[2] = {1.3, 0.0}; #endif #else #ifdef ALPHA TYPE alpha=ALPHA; #else TYPE alpha=1.0; #endif #ifdef BETA TYPE beta=BETA; #else TYPE beta=1.0; #endif #endif const TYPE rone=1.0, rnone=(-1.0); void *va=NULL, *vb=NULL, *vc=NULL; TYPE *C0, *C1, *A, *B; TYPE diff, tmp; int i, j, k, n, nerr; int M=MB, N=NB, K=KB; TYPE ErrBound; if (!M) M = MB0; if (!N) N = NB0; if (!K) K = KB0; #ifdef TREAL ErrBound = 2.0 * (Mabs(alpha) * 2.0*K*EPS + Mabs(beta) * EPS) + EPS; #else diff = Mabs(*alpha) + Mabs(alpha[1]); tmp = Mabs(*beta) + Mabs(beta[1]); ErrBound = 2.0 * (diff*8.0*K*EPS + tmp*EPS) + EPS; #endif #ifdef NoTransA nA = K; #else nA = M; #endif #ifdef NoTransB nB = N; #else nB = K; #endif #ifdef TCPLX inca = lda*nA; incb = ldb*nB; #endif #ifdef ATL_MinMMAlign va = malloc(ATL_MinMMAlign + lda*nA*ATL_sizeof); vb = malloc(ATL_MinMMAlign + ldb*nB*ATL_sizeof); vc = C0 = malloc(2*ldc*N*ATL_sizeof); assert(va && vb && C0); A = (TYPE *) ( ( ((size_t) va)/ATL_MinMMAlign ) * ATL_MinMMAlign + ATL_MinMMAlign ); B = (TYPE *) ( ( ((size_t) vb)/ATL_MinMMAlign ) * ATL_MinMMAlign + ATL_MinMMAlign ); #else C0 = vc = malloc( (2*ldc*N + lda*nA + ldb*nB) * ATL_sizeof); assert(vc); A = C1 + (ldc * N SHIFT); B = A + (lda * nA SHIFT); #endif C1 = C0 + (ldc * N SHIFT); for (n=lda*nA SHIFT, i=0; i < n; i++) A[i] = dumb_rand(); for (n=ldb*nB SHIFT, i=0; i < n; i++) B[i] = dumb_rand(); for (n=ldc*N SHIFT, i=0; i < n; i++) C0[i] = C1[i] = dumb_rand(); tst_mm(M, N, K, alpha, A, lda, B, ldb, beta, C0, ldc); NBmm(M, N, K, alpha, A, lda, B, ldb, beta, C1, ldc); nerr = 0; for (j=0; j < N; j++) { for (i=0; i < M SHIFT; i++) { k = i + j*(ldc SHIFT); diff = C0[k] - C1[k]; if (diff < 0.0) diff = -diff; if (diff > ErrBound) { fprintf(stderr, "C(%d,%d) : expected=%f, got=%f\n", i, j, C0[k], C1[k]); nerr++; } } } free(vc); if (va) free(va); if (vb) free(vb); return(nerr); }
void Mjoin(PATL,mmIJK2)(int K, int nMb, int nNb, int nKb, int ib, int jb, int kb, const SCALAR alpha, const TYPE *A, int lda, TYPE *pA0, int incA, MAT2BLK A2blk, const TYPE *pB0, const SCALAR beta, TYPE *C, int ldc, TYPE *pC, PUTBLK putblk, NBMM0 NBmm0) /* * Outer three loops for matmul with outer loop over rows of A */ { int i, j, ldpc; const int ZEROC = ((putblk == NULL) && SCALAR_IS_ZERO(beta)); const int incK = ATL_MulByNB(K), incC = ATL_MulByNB(ldc); TYPE *pA=pA0, *stA=pA0+ATL_MulByNBNB(nKb); const TYPE *pB=pB0; const TYPE cubeta = ( (putblk) ? ATL_rzero : beta ); TYPE *c; if (putblk) { ldpc = NB; if (!nKb && kb) Mjoin(PATL,gezero)(MB, NB, pC, MB); } else ldpc = ldc; for (i=nMb; i; i--) /* loop over full row panels of A */ { if (A) { A2blk(K, NB, A, lda, pA, alpha); /* get 1 row panel of A */ A += incA; } if (!putblk) pC = C; c = C; C += NB; for (j=nNb; j; j--) /* full column panels of B */ { if (nKb) { NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, beta, pC, ldpc); pA += NBNB; pB += NBNB; if (nKb != 1) { do { NBmm(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, pC, ldpc); pA += NBNB; pB += NBNB; } while (pA != stA); } if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, pC, ldpc); pB += kb*NB; } } else { if (ZEROC) Mjoin(PATL,gezero)(MB, NB, pC, ldpc); if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, cubeta, pC, ldpc); pB += kb*NB; } } pA = pA0; if (putblk) putblk(NB, NB, pC, c, ldc, beta); else pC += incC; c += incC; } if (jb) { NBJBmm(jb, K, pA, pB, cubeta, pC, ldpc); if (putblk) putblk(NB, jb, pC, c, ldc, beta); } pB = pB0; if (!A) { pA0 += incK; pA = pA0; stA += incK; } } if (ib) { c = C; if (A) A2blk(K, ib, A, lda, pA, alpha); /* get last row panel of A */ for (j=nNb; j; j--) /* full column panels of B */ { if (putblk) { IBNBmm(ib, K, pA, pB, ATL_rzero, pC, ib); putblk(ib, NB, pC, c, ldc, beta); } else IBNBmm(ib, K, pA, pB, beta, c, ldc); pB += incK; c += incC; } if (jb) { if (putblk) { IBJBmm(ib, jb, K, pA, pB, ATL_rzero, pC, ib); putblk(ib, jb, pC, c, ldc, beta); } else IBJBmm(ib, jb, K, pA, pB, beta, c, ldc); } } }