void Mjoin(PATL,mmJIK2) (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb, const SCALAR alpha, const TYPE *pA0, const TYPE *B, int ldb, TYPE *pB0, int incB, MAT2BLK B2blk, const SCALAR beta, TYPE *C, int ldc, MATSCAL gescal, NBMM0 NBmm0) { const int incK = ATL_MulByNB(K)SHIFT, incC = ATL_MulByNB(ldc-nMb) SHIFT; const int ZEROC = ((gescal == NULL) && SCALAR_IS_ZERO(beta)); int i, j = nNb; const TYPE *pA=pA0; const TYPE rbeta = ( (gescal) ? ATL_rone : *beta ); TYPE *pB=pB0, *stB=pB0+(ATL_MulByNBNB(nKb)SHIFT); if (nNb) { do /* Loop over full column panels of B */ { if (B) { B2blk(K, NB, B, ldb, pB, alpha); B += incB; } if (nMb) { i = nMb; do /* loop over full row panels of A */ { if (gescal) gescal(NB, NB, beta, C, ldc); if (nKb) /* loop over full blocks in panels */ { NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, rbeta, C, ldc); pA += NBNB2; pB += NBNB2; if (nKb != 1) { do { NBmm_b1(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, C, ldc); pA += NBNB2; pB += NBNB2; } while (pB != stB); } if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, C, ldc); pA += ATL_MulByNB(kb)<<1; } } else if (kb) { if (ZEROC) Mjoin(PATL,gezero)(MB, NB, C, ldc); KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, rbeta, C, ldc); pA += ATL_MulByNB(kb)<<1; } pB = pB0; C += NB2; } while (--i); } if (ib) { if (gescal) gescal(ib, NB, beta, C, ldc); IBNBmm(ib, K, pA, pB, rbeta, C, ldc); } if (!B) { pB0 += incK; pB = pB0; stB += incK; } C += incC; pA = pA0; } while (--j); } if (jb) { if (B) B2blk(K, jb, B, ldb, pB, alpha); for (i=nMb; i; i--) { if (gescal) gescal(NB, jb, beta, C, ldc); NBJBmm(jb, K, pA, pB, rbeta, C, ldc); pA += incK; C += NB2; } if (ib) { if (gescal) gescal(ib, jb, beta, C, ldc); IBJBmm(ib, jb, K, pA, pB, rbeta, C, ldc); } } }
void Mjoin(PATL,mmIJK2)(int K, int nMb, int nNb, int nKb, int ib, int jb, int kb, const SCALAR alpha, const TYPE *A, int lda, TYPE *pA0, int incA, MAT2BLK A2blk, const TYPE *pB0, const SCALAR beta, TYPE *C, int ldc, TYPE *pC, PUTBLK putblk, NBMM0 NBmm0) /* * Outer three loops for matmul with outer loop over rows of A */ { int i, j, ldpc; const int ZEROC = ((putblk == NULL) && SCALAR_IS_ZERO(beta)); const int incK = ATL_MulByNB(K), incC = ATL_MulByNB(ldc); TYPE *pA=pA0, *stA=pA0+ATL_MulByNBNB(nKb); const TYPE *pB=pB0; const TYPE cubeta = ( (putblk) ? ATL_rzero : beta ); TYPE *c; if (putblk) { ldpc = NB; if (!nKb && kb) Mjoin(PATL,gezero)(MB, NB, pC, MB); } else ldpc = ldc; for (i=nMb; i; i--) /* loop over full row panels of A */ { if (A) { A2blk(K, NB, A, lda, pA, alpha); /* get 1 row panel of A */ A += incA; } if (!putblk) pC = C; c = C; C += NB; for (j=nNb; j; j--) /* full column panels of B */ { if (nKb) { NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, beta, pC, ldpc); pA += NBNB; pB += NBNB; if (nKb != 1) { do { NBmm(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, pC, ldpc); pA += NBNB; pB += NBNB; } while (pA != stA); } if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, pC, ldpc); pB += kb*NB; } } else { if (ZEROC) Mjoin(PATL,gezero)(MB, NB, pC, ldpc); if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, cubeta, pC, ldpc); pB += kb*NB; } } pA = pA0; if (putblk) putblk(NB, NB, pC, c, ldc, beta); else pC += incC; c += incC; } if (jb) { NBJBmm(jb, K, pA, pB, cubeta, pC, ldpc); if (putblk) putblk(NB, jb, pC, c, ldc, beta); } pB = pB0; if (!A) { pA0 += incK; pA = pA0; stA += incK; } } if (ib) { c = C; if (A) A2blk(K, ib, A, lda, pA, alpha); /* get last row panel of A */ for (j=nNb; j; j--) /* full column panels of B */ { if (putblk) { IBNBmm(ib, K, pA, pB, ATL_rzero, pC, ib); putblk(ib, NB, pC, c, ldc, beta); } else IBNBmm(ib, K, pA, pB, beta, c, ldc); pB += incK; c += incC; } if (jb) { if (putblk) { IBJBmm(ib, jb, K, pA, pB, ATL_rzero, pC, ib); putblk(ib, jb, pC, c, ldc, beta); } else IBJBmm(ib, jb, K, pA, pB, beta, c, ldc); } } }
void Mjoin(PATL,mmIJK2) (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb, const SCALAR alpha, const TYPE *A, const int lda, TYPE *pA0, const int incA, MAT2BLK A2blk, TYPE *pB0, const SCALAR beta, TYPE *C, int ldc, MATSCAL gescal, NBMM0 NBmm0) { const int incK = ATL_MulByNB(K)<<1; const int incCn = ATL_MulByNB(ldc)<<1, incCm = (MB<<1) - nNb*incCn; const int ZEROC = ((gescal == NULL) && SCALAR_IS_ZERO(beta)); int i, j, k; const TYPE *pB=pB0; const TYPE rbeta = ( (gescal) ? ATL_rone : *beta ); TYPE *pA=pA0; for (i=nMb; i; i--) { if (A) { A2blk(K, NB, A, lda, pA, alpha); /* get 1 row panel of A */ A += incA; } for (j=nNb; j; j--) { if (gescal) gescal(MB, NB, beta, C, ldc); if (nKb) { NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, rbeta, C, ldc); pA += NBNB2; pB += NBNB2; if (nKb != 1) { for (k=nKb-1; k; k--, pA += NBNB2, pB += NBNB2) NBmm_b1(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, C, ldc); } if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, C, ldc); pB += ATL_MulByNB(kb)<<1; } } else { if (ZEROC) Mjoin(PATL,gezero)(MB, NB, C, ldc); if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, rbeta, C, ldc); pB += ATL_MulByNB(kb)<<1; } } pA = pA0; C += incCn; } if (jb) { if (gescal) gescal(MB, jb, beta, C, ldc); MBJBmm(jb, K, pA, pB, rbeta, C, ldc); } pB = pB0; if (!A) { pA0 += incK; pA = pA0; } C += incCm; } if (ib) { if (A) A2blk(K, ib, A, lda, pA, alpha); /* get last row panel of A */ for(j=nNb; j; j--) /* full column panels of B */ { if (gescal) gescal(ib, NB, beta, C, ldc); IBNBmm(ib, K, pA, pB, rbeta, C, ldc); pB += incK; C += incCn; } if (jb) { if (gescal) gescal(ib, jb, beta, C, ldc); IBJBmm(ib, jb, K, pA, pB, rbeta, C, ldc); } } }