/*
 * Matrix-multiply driver for the variant in which alpha and beta are tested
 * with the SCALAR_IS_* macros (presumably the real-precision build; the
 * preprocessor conditionals selecting it are not visible here -- confirm).
 *
 * The routine picks the block kernel (NBmm0), the optional C write-back
 * routine (putblk) and the copy routines (A2blk, B2blk), allocates the
 * workspace, and hands the actual computation to mmIJK2.  If the full
 * workspace (all of B plus one panel of A) cannot be obtained, B is processed
 * in groups of column blocks with a smaller workspace.
 *
 * Parameters follow the usual GEMM naming (TA/TB transpose settings, M/N0/K
 * dimensions, alpha/beta scalars, A/B/C with leading dimensions
 * lda0/ldb0/ldc0); the arithmetic contract itself is implemented by mmIJK2
 * and the NBmm kernels, which are defined elsewhere.
 *
 * RETURNS:  0 on success,
 *          -1 if a required workspace could not be allocated,
 *           1 if the full-size workspace could not be allocated and both
 *             TA and TB are AtlasNoTrans (no reduced-workspace attempt is
 *             made in that case).
 */
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
{
   size_t incA, incB, incC;
   size_t nbytes;   /* workspace size in bytes; kept in size_t so that large */
                    /* N*K cannot truncate to a small, wrongly accepted value */
   const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
   const size_t incK = ATL_MulByNB((size_t)K);
   int N = N0;
   int nMb, nNb, nKb, ib, jb, kb, jb2, j, k, n;
   void *vA=NULL, *vC=NULL;
   TYPE *pA, *pB, *pC;
   MAT2BLK A2blk, B2blk;
   PUTBLK putblk;
   NBMM0 NBmm0;

/*
 * Number of full NB-sized blocks along each dimension, and the remainders
 */
   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

/*
 * If K sufficiently large, write to temporary C as safety measure; otherwise
 * write directly to C
 */
   if (nKb < 12)
   {
      putblk = NULL;
      pC = C;
      if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
      else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
      else NBmm0 = NBmm_bX;
   }
   else
   {
      NBmm0 = NBmm_b0;
      vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
      if (!vC) return(-1);
      pC = ATL_AlignPtr(vC);
      if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
      else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
      else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
      else putblk = Mjoin(PATL,putblk_bX);
   }

/*
 * Special case if we don't need to copy one or more input matrix
 */
   if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
   {
      if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
          ATL_DataIsMinAligned(A))
      {
/*
 *       A is used in place: no copy routine, no source pointer, no stride
 */
         pA = (TYPE *) A;
         A = NULL;
         A2blk = NULL;
         incA = 0;
      }
      else
      {
/*
 *       B is used in place, but A needs a one-panel copy buffer
 */
         vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
         if (!vA)
         {
            free(vC);
            return(-1);
         }
         pA = ATL_AlignPtr(vA);
         if (TA == AtlasNoTrans)
         {
            incA = NB;
            if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
            else A2blk = Mjoin(PATL,row2blkT_aX);
         }
         else
         {
            incA = ATL_MulByNB(lda);
            if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
            else A2blk = Mjoin(PATL,col2blk_aX);
         }
      }
      Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                         incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
      free(vA);   /* free(NULL) is a no-op */
      free(vC);
      return(0);
   }

/*
 * General case: try for a workspace holding all of B plus one panel of A.
 * The product is formed in size_t so it cannot overflow int arithmetic.
 */
   nbytes = ATL_Cachelen + ATL_MulBySize((size_t)N*K + incK);
   if (nbytes <= ATL_MaxMalloc) vA = malloc(nbytes);
   if (!vA)
   {
      if (TA == AtlasNoTrans && TB == AtlasNoTrans)
      {
         free(vC);
         return(1);
      }
/*
 *    Retry with room for one A panel plus k column blocks of B, where
 *    k = ceil(n/j) for j = 2, 3, ... until an allocation succeeds
 */
      if (jb) n = nNb + 1;
      else n = nNb;
      for (j=2; !vA; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         nbytes = ATL_Cachelen + ATL_MulBySize((size_t)(k+1)*incK);
         if (nbytes <= ATL_MaxMalloc) vA = malloc(nbytes);
      }
      if (!vA)
      {
         free(vC);
         return(-1);
      }
      n = ATL_MulByNB(k);
      jb2 = 0;
   }
   else
   {
      jb2 = jb;
      k = nNb;
      n = N;
   }
   pA = ATL_AlignPtr(vA);

/*
 * Choose the B copy routine and how far B advances per group of n columns
 */
   if (TB == AtlasNoTrans)
   {
      incB = ldb*n;
      if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
      else B2blk = Mjoin(PATL,col2blk2_aX);
   }
   else
   {
      incB = n;
      if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
      else B2blk = Mjoin(PATL,row2blkT2_aX);
   }
/*
 * alpha is handed to the B copy, so A is always copied with the _a1 routines
 */
   if (TA == AtlasNoTrans)
   {
      incA = NB;
      A2blk = Mjoin(PATL,row2blkT_a1);
   }
   else
   {
      incA = ATL_MulByNB(lda);
      A2blk = Mjoin(PATL,col2blk_a1);
   }
   incC = ldc*n;
   pB = pA + incK;   /* B copy lives right after the one-panel A buffer */

   do
   {
      if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
      else B2blk(n, K, B, ldb, pB, alpha);
      Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                         incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
      N -= n;
      nNb -= k;
      if (N < n)   /* last group: remaining full blocks plus the partial one */
      {
         jb2 = jb;
         n = N;
         k = nNb;
      }
      C += incC;
      B += incB;
      if (!putblk) pC = C;   /* writing directly to C: follow the C pointer */
   }
   while (N);

   free(vC);
   free(vA);
   return(0);
}
/*
 * Matrix-multiply driver for the variant in which alpha and beta are indexed
 * as arrays (alpha[1], beta[1]) and all element strides are doubled
 * (presumably the complex-precision build with two TYPE values per element;
 * the preprocessor conditionals selecting it are not visible here -- confirm).
 *
 * The routine picks the block kernel (NBmm0), an optional scaling routine for
 * C (gescal), and the copy routines (A2blk, B2blk), allocates the workspace,
 * and hands the actual computation to mmIJK2.  If the full workspace (all of
 * B plus one panel of A) cannot be obtained, B is processed in groups of
 * column blocks with a smaller workspace.
 *
 * Parameters follow the usual GEMM naming (TA/TB transpose settings, M/N0/K
 * dimensions, alpha/beta scalars, A/B/C with leading dimensions lda/ldb/ldc);
 * the arithmetic contract itself is implemented by mmIJK2 and the CNBmm
 * kernels, which are defined elsewhere.
 *
 * RETURNS:  0 on success,
 *          -1 if no workspace could be allocated,
 *           1 if the full-size workspace could not be allocated and both
 *             TA and TB are AtlasNoTrans (no reduced-workspace attempt is
 *             made in that case).
 */
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
{
   int N = N0;
   int nMb, nNb, nKb, ib, jb, kb, jb2, j, k, n;
   int incA;   /* NOTE(review): left as int because it is passed to mmIJK2, */
               /* whose prototype is not visible here                       */
/*
 * Pointer strides and byte counts are kept in size_t so that large problem
 * sizes neither overflow int arithmetic nor truncate an allocation size
 */
   size_t incB, incC, nbytes;
   const size_t incK = ATL_MulByNB((size_t)K);
   void *vA=NULL;
   TYPE *pA, *pB;
   MAT2BLK A2blk, B2blk;
   MATSCAL gescal;
   NBMM0 NBmm0;

/*
 * Number of full NB-sized blocks along each dimension, and the remainders
 */
   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

/*
 * When beta[1] is zero the kernel is chosen from beta[0] alone; otherwise C
 * is scaled through gescal and the beta=1 kernel is used
 */
   if (beta[1] == ATL_rzero)
   {
      gescal = NULL;
      if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
      else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
      else NBmm0 = Mjoin(PATL,CNBmm_bX);
   }
   else
   {
      gescal = Mjoin(PATL,gescal_bX);
      NBmm0 = Mjoin(PATL,CNBmm_b1);
   }

/*
 * Try for a workspace holding all of B plus one panel of A
 */
   nbytes = ATL_Cachelen + ATL_MulBySize((size_t)N*K + incK);
   if (nbytes <= ATL_MaxMalloc) vA = malloc(nbytes);
   if (!vA)
   {
      if (TA == AtlasNoTrans && TB == AtlasNoTrans) return(1);
/*
 *    Retry with room for one A panel plus k column blocks of B, where
 *    k = ceil(n/j) for j = 2, 3, ... until an allocation succeeds
 */
      if (jb) n = nNb + 1;
      else n = nNb;
      for (j=2; !vA; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         nbytes = ATL_Cachelen + ATL_MulBySize((size_t)(k+1)*incK);
         if (nbytes <= ATL_MaxMalloc) vA = malloc(nbytes);
      }
      if (!vA) return(-1);
      n = ATL_MulByNB(k);
      jb2 = 0;
   }
   else
   {
      jb2 = jb;
      k = nNb;
      n = N;
   }
   pA = ATL_AlignPtr(vA);

/*
 * Choose the B copy routine from TB and from the form of alpha, and compute
 * how far B advances (in TYPE units) per group of n columns
 */
   if (TB == AtlasNoTrans)
   {
      incB = ((size_t)ldb*n) << 1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,col2blk2_a1);
         else B2blk = Mjoin(PATL,col2blk2_aXi0);
      }
      else B2blk = Mjoin(PATL,col2blk2_aX);
   }
   else if (TB == AtlasConjTrans)
   {
      incB = (size_t)n << 1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkC2_a1);
         else B2blk = Mjoin(PATL,row2blkC2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkC2_aX);
   }
   else
   {
      incB = (size_t)n << 1;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkT2_a1);
         else B2blk = Mjoin(PATL,row2blkT2_aXi0);
      }
      else B2blk = Mjoin(PATL,row2blkT2_aX);
   }
/*
 * alpha is handed to the B copy, so A is always copied with the _a1 routines
 */
   if (TA == AtlasNoTrans)
   {
      incA = NB<<1;
      A2blk = Mjoin(PATL,row2blkT_a1);
   }
   else if (TA == AtlasConjTrans)
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blkConj_a1);
   }
   else
   {
      incA = ATL_MulByNB(lda)<<1;
      A2blk = Mjoin(PATL,col2blk_a1);
   }
   incC = ((size_t)ldc*n) << 1;
   pB = pA + (incK<<1);   /* B copy lives right after the one-panel A buffer */

   do
   {
      if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
      else B2blk(n, K, B, ldb, pB, alpha);
      Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                         incA, A2blk, pB, beta, C, ldc, gescal, NBmm0);
      N -= n;
      nNb -= k;
      if (N < n)   /* last group: remaining full blocks plus the partial one */
      {
         jb2 = jb;
         n = N;
         k = nNb;
      }
      C += incC;
      B += incB;
   }
   while (N);

   free(vA);
   return(0);
}