void Mjoin(PATL,gpmm)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum PACK_UPLO UB, const enum PACK_TRANS TB,
    const enum PACK_UPLO UC, const int M, const int N, const int K,
    const SCALAR alpha, const TYPE *A, const int IA, const int JA,
    const int lda, const TYPE *B, const int IB, const int JB, const int ldb,
    const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc)
{
   int j;
#ifdef CacheEdge
   static const int CE_K = ((ATL_DivBySize(CacheEdge)-(NBNB SHIFT)) /
                            (NB*(NB+NB)))*NB;
#else
   #define CE_K K
#endif

   if (!M || !N)
      return;
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      for (j=0; j != N; j++)
         Mjoin(PATL,scal)(M, beta, C+MindexP(UC,IC,JC+j,ldc), 1);
      return;
   }
/*
 * Packed gpmm not yet implemented for complex, so die if not really a
 * dense gemm
 */
#ifdef TCPLX
   ATL_assert(UA == PackGen && UB == PackGen && UC == PackGen);
   Mjoin(PATL,gemm)(TA, TB, M, N, K, alpha, A+((JA*lda+IA)SHIFT), lda,
                    B+((JB*ldb+IB)SHIFT), ldb, beta,
                    C+((JC*ldc+IC)SHIFT), ldc);
#else
   Mjoin(PATL,prankK)(UA, TA, UB, TB, M, N, K, CE_K, alpha,
                      A+MindexP(UA,IA,JA,lda), Mpld(UA,JA,lda),
                      B+MindexP(UB,IB,JB,ldb), Mpld(UB,JB,ldb), beta,
                      UC, C+MindexP(UC,IC,JC,ldc), Mpld(UC,JC,ldc));
#endif
}
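/*
 * Illustrative sketch (not ATLAS code): how a CE_K-style constant is
 * derived.  The idea above is to pick the largest multiple of NB such
 * that an NB x CE_K panel of A, an NB x CE_K panel of B, and an NB x NB
 * block of C together fit in CacheEdge bytes (the SHIFT on NBNB doubles
 * the C term for complex types).  The cache size, blocking factor, and
 * element type below are made-up example values, not tuned ATLAS
 * parameters.
 */
#include <stdio.h>

int main(void)
{
   const long CacheEdge = 262144;                  /* hypothetical 256KB */
   const int NB = 40;                              /* hypothetical block */
   const long elems = CacheEdge / sizeof(double);  /* ATL_DivBySize */
   const long CE_K = ((elems - NB*NB) / (NB*(NB+NB))) * NB;

   printf("CE_K = %ld: A+B panels use %ld elems, C block uses %d\n",
          CE_K, 2*NB*CE_K, NB*NB);
   return(0);
}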
static void ATL_rk_recUT
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP, const int N, const int K,
    const SCALAR alpha, const TYPE *A, int lda, const SCALAR beta,
    TYPE *C, const int ldc)
/*
 * For upper trans matrix, use recursion to reduce N until enough memory
 * can be allocated
 */
{
   int Nright, Nleft;
   const enum PACK_UPLO UC2 = (CP ? UC : PackGen);

   if (Mjoin(PATL,prk_kmm)(UC, UA, TA, N, K, alpha, A, lda, beta, CP, C, ldc))
   {
      Nleft = N >> 1;
#ifdef NB
      if (Nleft > NB)
         Nleft = ATL_MulByNB(ATL_DivByNB(Nleft));
#endif
      Nright = N - Nleft;
      ATL_rk_recUT(UA, TA, UC, CP, Nleft, K, alpha, A, lda, beta, C, ldc);
      Mjoin(PATL,gpmm)(PackGen, TA, PackGen, AtlasNoTrans,
                       CP ? PackUpper : PackGen, Nleft, Nright, K, alpha,
                       A, 0, 0, lda, A+Nleft*(lda SHIFT), 0, 0, lda, beta,
                       C+MindexP(UC2,0,Nleft,ldc), 0, 0,
                       Mpld(UC2,Nleft,ldc));
      ATL_rk_recUT(UA, TA, UC, CP, Nright, K, alpha, A+(Nleft SHIFT)*lda,
                   lda, beta, C+MindexP(UC2,Nleft,Nleft,ldc),
                   Mpld(UC2,Nleft,ldc));
   }
}
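/*
 * Illustrative sketch (not ATLAS code): the N-splitting rule used by the
 * recursive case above.  N is halved, and the left half is rounded down
 * to a multiple of NB (when bigger than NB) so that all cleanup fringe
 * lands in the right half.  In the real routine the split happens only
 * when prk_kmm fails for lack of workspace; this sketch just traces the
 * splitting itself, with NB = 40 as a made-up example value.
 */
#include <stdio.h>

static void split(int N, int NB, int depth)
{
   int Nleft = N >> 1, Nright;

   if (N <= NB)                   /* recursion bottoms out at one block */
   {
      printf("%*sleaf N=%d\n", depth, "", N);
      return;
   }
   if (Nleft > NB)
      Nleft = (Nleft/NB)*NB;      /* ATL_MulByNB(ATL_DivByNB(Nleft)) */
   Nright = N - Nleft;
   printf("%*sN=%d -> Nleft=%d, Nright=%d\n", depth, "", N, Nleft, Nright);
   split(Nleft, NB, depth+2);
   split(Nright, NB, depth+2);
}

int main(void)
{
   split(250, 40, 0);             /* e.g. 250 -> 120 + 130, and so on */
   return(0);
}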
void Mjoin(PATL,pcol2blkF)
   (const int M, const int N, const SCALAR alpha, const TYPE *A, int lda,
    const int ldainc, TYPE *V)
/*
 * Copies entire MxN matrix to block major format
 */
{
   int j, jb;
   const int incV = ATL_MulByNB(M);
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
                             ((ldainc == -1) ? PackLower : PackGen);
   void (*col2blk)(const int M, const int N, const SCALAR alpha,
                   const TYPE *A, int lda, const int ldainc, TYPE *V);

   if (ldainc)
   {
      if (alpha == ATL_rone)
         col2blk = Mjoin(PATL,pcol2blk_a1);
      else
         col2blk = Mjoin(PATL,pcol2blk_aX);
      for (j=0; j < N; j += NB)
      {
         jb = N - j;
         jb = Mmin(jb, NB);
         col2blk(M, jb, alpha, A+MindexP(UA,0,j,lda), Mpld(UA,j,lda),
                 ldainc, V);
         V += incV;
      }
   }
   else if (alpha == ATL_rone)
      Mjoin(PATL,col2blk2_a1)(M, N, A, lda, V, alpha);
   else
      Mjoin(PATL,col2blk2_aX)(M, N, A, lda, V, alpha);
}
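/*
 * Illustrative sketch (not ATLAS code): the storage produced by the copy
 * above.  The M x N matrix becomes ceil(N/NB) column panels laid out
 * back-to-back, each panel occupying ATL_MulByNB(M) elements (the incV
 * stride above), so panel p starts at V + p*NB*M.  Within a panel this
 * sketch stores NB-row blocks contiguously, each block column-major; the
 * real copy kernels assume their own tuned in-panel ordering.  Dense
 * (ldainc == 0) case only; NB_ is a runtime stand-in for the
 * compile-time NB.
 */
#include <stdio.h>

static void col2blk_sketch(int M, int N, int NB_, const double *A, int lda,
                           double *V)
{
   int i, j, p, q, jb, ib;

   for (p=0; p < N; p += NB_)            /* loop over column panels */
   {
      double *v = V + p*M;               /* panel p/NB_ starts here */
      jb = N - p;
      if (jb > NB_) jb = NB_;
      for (q=0; q < M; q += NB_)         /* NB-row block within panel */
      {
         ib = M - q;
         if (ib > NB_) ib = NB_;
         for (j=0; j < jb; j++)          /* block stored column-major */
            for (i=0; i < ib; i++)
               *v++ = A[q+i + (p+j)*lda];
      }
   }
}

int main(void)
{
   double A[6*5], V[6*5];   /* 6x5 matrix, NB_=2: 3 panels of 6*2 elems */
   int i;
   for (i=0; i < 30; i++) A[i] = i;
   col2blk_sketch(6, 5, 2, A, 6, V);
   printf("second panel starts with %g (A(0,2) = %g)\n", V[12], A[12]);
   return(0);
}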
void Mjoin(PATL,prow2blkTF)(const int M, const int N, const SCALAR alpha,
                            const TYPE *A, int lda, const int ldainc,
                            TYPE *V)
{
   const int mb = Mmin(NB,M), nMb = ATL_DivByNB(M);
   const int m = ATL_MulByNB(nMb), n = ATL_MulByNB(ATL_DivByNB(N));
   const int nr = N - n, mr = M - m;
   const int incVm = ATL_MulByNB(N), incVV = ATL_MulByNB(mr);
   int i, j, ib, jb;
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
                             ((ldainc == -1) ? PackLower : PackGen);
   TYPE *v, *vv = V + nMb*incVm;
   void (*row2blk)(const int M, const int N, const TYPE alpha,
                   const TYPE *A, int lda, const int ldainc, TYPE *V);

   if (ldainc)
   {
      if (alpha == ATL_rone)
         row2blk = ATL_prow2blk_KB_a1;
      else
         row2blk = ATL_prow2blk_KB_aX;
      for (j=0; j < n; j += NB)
      {
         for (v=V, i=0; i < m; i += NB, v += incVm)
            row2blk(NB, NB, alpha, A+MindexP(UA,i,j,lda), Mpld(UA,j,lda),
                    ldainc, v);
         if (mr)
         {
            row2blk(mr, NB, alpha, A+MindexP(UA,m,j,lda), Mpld(UA,j,lda),
                    ldainc, vv);
            vv += incVV;
         }
         V += NBNB;
      }
      if (nr)
      {
         for (v=V, i=0; i < m; i += NB, v += incVm)
            row2blk(NB, nr, alpha, A+MindexP(UA,i,n,lda), Mpld(UA,n,lda),
                    ldainc, v);
         if (mr)
            row2blk(mr, nr, alpha, A+MindexP(UA,m,n,lda), Mpld(UA,n,lda),
                    ldainc, vv);
      }
   }
   else if (SCALAR_IS_ONE(alpha))
      Mjoin(PATL,row2blkT2_a1)(M, N, A, lda, V, alpha);
   else
      Mjoin(PATL,row2blkT2_aX)(M, N, A, lda, V, alpha);
}
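/*
 * Illustrative sketch (not ATLAS code): the fringe arithmetic used above.
 * M and N are split into full NB blocks plus remainders, and the partial
 * row blocks (mr rows) of every column panel are appended after all full
 * panels, which is why vv above starts at V + nMb*incVm.  The sizes here
 * are made-up examples.
 */
#include <stdio.h>

int main(void)
{
   const int NB = 40, M = 250, N = 130;
   const int nMb = M/NB;                    /* ATL_DivByNB(M) */
   const int m = nMb*NB, mr = M - m;        /* full rows / fringe rows */
   const int n = (N/NB)*NB, nr = N - n;     /* full cols / fringe cols */
   const int incVm = NB*N;                  /* stride between row panels */

   printf("m=%d mr=%d n=%d nr=%d: fringe rows stored from offset %d\n",
          m, mr, n, nr, nMb*incVm);
   return(0);
}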
void Mjoin(Mjoin(PATL,pcol2blkF),_blk)
   (const int blk, const int M, const int N, const SCALAR alpha,
    const TYPE *A, int lda, const int ldainc, TYPE *V)
/*
 * Copies entire MxN matrix to block major format
 */
{
   int j, jb;
   const int incV = blk*(M+M);
   const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper :
                             ((ldainc == -1) ? PackLower : PackGen);
   void (*col2blk)(const int blk, const int M, const int N,
                   const SCALAR alpha, const TYPE *A, int lda,
                   const int ldainc, TYPE *V);

#ifdef Conj_
   if (alpha[1] == ATL_rzero)
   {
      if (*alpha == ATL_rone)
         col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_a1),_blk);
      else
         col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_aXi0),_blk);
   }
   else
      col2blk = Mjoin(Mjoin(PATL,pcol2blkConj_aX),_blk);
#else
   if (alpha[1] == ATL_rzero)
   {
      if (*alpha == ATL_rone)
         col2blk = Mjoin(Mjoin(PATL,pcol2blk_a1),_blk);
      else
         col2blk = Mjoin(Mjoin(PATL,pcol2blk_aXi0),_blk);
   }
   else
      col2blk = Mjoin(Mjoin(PATL,pcol2blk_aX),_blk);
#endif
   for (j=0; j < N; j += blk)
   {
      jb = N - j;
      jb = Mmin(jb, blk);
      col2blk(blk, M, jb, alpha, A+MindexP(UA,0,j,lda), Mpld(UA,j,lda),
              ldainc, V);
      V += incV;
   }
}
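/*
 * Illustrative sketch (not ATLAS code): why the copy above dispatches
 * among three kernels per call rather than testing alpha per element.
 * Complex scaling costs differ sharply by case: alpha == 1 is a pure
 * copy (_a1), alpha with zero imaginary part needs two real multiplies
 * per element (_aXi0), and general alpha needs a full complex multiply
 * (_aX).  scale_elem is a hypothetical stand-in for one element of
 * those kernels.
 */
#include <stdio.h>

static void scale_elem(double *dst, const double *src, const double *alpha)
{
   if (alpha[1] == 0.0 && alpha[0] == 1.0)   /* _a1: pure copy */
   {
      dst[0] = src[0];
      dst[1] = src[1];
   }
   else if (alpha[1] == 0.0)                 /* _aXi0: real scaling */
   {
      dst[0] = alpha[0]*src[0];
      dst[1] = alpha[0]*src[1];
   }
   else                                      /* _aX: complex multiply */
   {
      dst[0] = alpha[0]*src[0] - alpha[1]*src[1];
      dst[1] = alpha[0]*src[1] + alpha[1]*src[0];
   }
}

int main(void)
{
   const double a[2] = {0.0, 1.0}, s[2] = {3.0, 4.0};   /* alpha = i */
   double d[2];
   scale_elem(d, s, a);
   printf("i*(3+4i) = %g%+gi\n", d[0], d[1]);           /* -4+3i */
   return(0);
}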
int ATL_pmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
               const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
               const int M, const int N, const int K, const SCALAR alpha,
               const TYPE *A, const int lda, const TYPE *B, const int ldb,
               const SCALAR beta, const enum PACK_UPLO UC, TYPE *C,
               const int ldc)
/*
 * Special packed matmul, calls dense gemm kernel using at most
 * K*NB + 2*NB*NB space.  B is copied only once, but A is copied
 * ceil(N/NB) times.  However, A should start in-cache for the kernel call.
 */
{
   const int nKb = ATL_DivByNB(K), kb = K - ATL_MulByNB(nKb);
   const int incK = ATL_MulByNB(K);
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldbinc = (UB == AtlasUpper) ? 1 : ((UB == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC == AtlasUpper) ? 1 : ((UC == AtlasLower) ? -1 : 0);
   int ib, jb, i, j, k;
   void *vC;
   TYPE *pC, *pA, *pB;
   NBMM0 pNBmm, pNBmm0;

   vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB+NBNB+incK));
   if (!vC)
      return(-1);
   pC = ATL_AlignPtr(vC);
   pA = pC + NBNB;
   pB = pA + NBNB;
/*
 * Loop over column panels of B
 */
   for (j=0; j < N; j += NB)
   {
      jb = N - j;
      jb = Mmin(jb, NB);
/*
 *    Copy column-panel of B to block-major storage
 */
      if (alpha == ATL_rone)
      {
         if (TB == AtlasNoTrans)
            ATL_pcol2blk(K, jb, alpha, B+MindexP(UB,0,j,ldb),
                         Mpld(UB,j,ldb), ldbinc, pB);
         else /* TB == AtlasTrans */
            ATL_prow2blkT(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb,
                          ldbinc, pB);
      }
      else if (TB == AtlasNoTrans)
         ATL_pcol2blk_aX(K, jb, alpha, B+MindexP(UB,0,j,ldb),
                         Mpld(UB,j,ldb), ldbinc, pB);
      else /* TB == AtlasTrans */
         ATL_prow2blkT_aX(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb,
                          ldbinc, pB);
/*
 *    Loop over row-panels of A
 */
      for (i=0; i < M; i += MB)
      {
         ib = M - i;
         ib = Mmin(ib, MB);
         if (jb != NB || ib != MB)
         {
            pNBmm0 = pNBmm = ATL_gNBmm;
            if (ib != NB && jb != NB)
               Mjoin(PATL,gezero)(MB, NB, pC, MB);
         }
         else
         {
            pNBmm = NBmm;
            pNBmm0 = NBmm_b0;
         }
/*
 *       Handle full blocks of K
 */
         if (nKb)
         {
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, NB, ATL_rone, A+MindexP(UA,i,0,lda), lda,
                             ldainc, pA);
            else
               ATL_pcol2blk(NB, ib, ATL_rone, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
            pNBmm0(ib, jb, NB, ATL_rone, pA, NB, pB, NB, ATL_rzero, pC, ib);
            for (k=1; k != nKb; k++)
            {
               if (TA == AtlasNoTrans)
                  ATL_prow2blkT(ib, NB, ATL_rone,
                                A+MindexP(UA,i,ATL_MulByNB(k),lda),
                                Mpld(UA,ATL_MulByNB(k),lda), ldainc, pA);
               else
                  ATL_pcol2blk(NB, ib, ATL_rone,
                               A+MindexP(UA,ATL_MulByNB(k),i,lda),
                               Mpld(UA,i,lda), ldainc, pA);
               pNBmm(ib, jb, NB, ATL_rone, pA, NB, pB+jb*NB*k, NB,
                     ATL_rone, pC, ib);
            }
            if (kb)
            {
               if (TA == AtlasNoTrans)
                  ATL_prow2blkT(ib, kb, ATL_rone,
                                A+MindexP(UA,i,ATL_MulByNB(nKb),lda),
                                Mpld(UA,ATL_MulByNB(nKb),lda), ldainc, pA);
               else
                  ATL_pcol2blk(kb, ib, ATL_rone,
                               A+MindexP(UA,ATL_MulByNB(nKb),i,lda),
                               Mpld(UA,i,lda), ldainc, pA);
               ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB+jb*NB*nKb, kb,
                         ATL_rone, pC, ib);
            }
         }
         else if (kb)
         {
            Mjoin(PATL,gezero)(ib, jb, pC, ib);
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, kb, ATL_rone, A+MindexP(UA,i,0,lda), lda,
                             ldainc, pA);
            else
               ATL_pcol2blk(kb, ib, ATL_rone, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
            ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB, kb, ATL_rzero,
                      pC, ib);
         }
         ATL_pputblk(ib, jb, pC, C+MindexP(UC,i,j,ldc), Mpld(UC,j,ldc),
                     ldcinc, beta);
      }
   }
   free(vC);
   return(0);
}
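/*
 * Illustrative sketch (not ATLAS code): the JIK loop order above, on
 * plain dense column-major operands with all packing and uplo handling
 * stripped out.  The routine above gets its K*NB + 2*NB*NB bound from
 * holding one block-major K x NB panel of B, one NB x NB block of A
 * (recopied per (i,k)), and one NB x NB block of C; here the blocks are
 * read in place so only the loop structure remains.
 */
#include <stdio.h>

/* C += A*B on an ib x jb block with kb inner dim; column-major */
static void block_mm(int ib, int jb, int kb, const double *A, int lda,
                     const double *B, int ldb, double *C, int ldc)
{
   int i, j, k;
   for (j=0; j < jb; j++)
      for (k=0; k < kb; k++)
         for (i=0; i < ib; i++)
            C[i+j*ldc] += A[i+k*lda] * B[k+j*ldb];
}

/* JIK-blocked C += A*B, M x N x K, block size nb */
static void gemm_jik(int M, int N, int K, const double *A, int lda,
                     const double *B, int ldb, double *C, int ldc, int nb)
{
   int i, j, k, ib, jb, kb;
   for (j=0; j < N; j += nb)          /* J: column panels of B and C */
   {
      jb = N-j; if (jb > nb) jb = nb;
      for (i=0; i < M; i += nb)       /* I: row panels of A and C */
      {
         ib = M-i; if (ib > nb) ib = nb;
         for (k=0; k < K; k += nb)    /* K: accumulate into one C block */
         {
            kb = K-k; if (kb > nb) kb = nb;
            block_mm(ib, jb, kb, A+i+k*lda, lda, B+k+j*ldb, ldb,
                     C+i+j*ldc, ldc);
         }
      }
   }
}

int main(void)
{
   double A[15], B[12], C[20] = {0};  /* 5x4 = 5x3 * 3x4, nb = 2 */
   int i;
   for (i=0; i < 15; i++) A[i] = i+1;
   for (i=0; i < 12; i++) B[i] = i+1;
   gemm_jik(5, 4, 3, A, 5, B, 3, C, 5, 2);
   printf("C[0] = %g (expect 1*1 + 6*2 + 11*3 = 46)\n", C[0]);
   return(0);
}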
int ATL_pmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
                const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
                const int M, const int N, const int K, const SCALAR alpha,
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                const SCALAR beta, const enum PACK_UPLO UC, TYPE *C,
                const int ldc)
/*
 * Special packed matmul, calls dense gemm kernel using at most
 * M*K + K*NB + NB*NB space.  If this exceeds ATL_pkMaxMalloc or fails,
 * operates using at most 2*K*NB + NB*NB.  If this fails, returns non-zero.
 * If full space is malloced, both matrices are copied exactly once.  If
 * the smaller space is used, A will be copied ceil(N/NB) times.
 */
{
   const int nKb = ATL_DivByNB(K), kb = K - ATL_MulByNB(nKb);
   const int incK = ATL_MulByNB(K);
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldbinc = (UB == AtlasUpper) ? 1 : ((UB == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC == AtlasUpper) ? 1 : ((UC == AtlasLower) ? -1 : 0);
   int ib, jb, i, j, k;
   void *vC=NULL;
   TYPE *pC, *pA, *pB, *pA0;
   NBMM0 pNBmm, pNBmm0;
   void (*A2blk)(const int M, const int N, const TYPE alpha, const TYPE *A,
                 int lda, const int ldainc, TYPE *V);

   i = ATL_Cachelen + ATL_MulBySize(NBNB + ATL_MulByNB(K) + M*K);
   if (i <= ATL_pkMaxMalloc)
      vC = malloc(i);
   if (!vC)
   {
      vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB + ATL_MulByNB(K+K)));
      if (TA == AtlasNoTrans)
         A2blk = ATL_prow2blkT;
      else
         A2blk = ATL_pcol2blk;
   }
   else
      A2blk = NULL;
   if (!vC)
      return(-1);
   pC = ATL_AlignPtr(vC);
   pB = pC + NBNB;
   pA = pB + ATL_MulByNB(K);
/*
 * If we've got the space, copy all of A up front
 */
   if (!A2blk)
   {
      if (TA == AtlasNoTrans)
         ATL_prow2blkTF(M, K, ATL_rone, A, lda, ldainc, pA);
      else
         ATL_pcol2blkF(K, M, ATL_rone, A, lda, ldainc, pA);
      pA -= ATL_MulByNB(K);  /* back off one panel; i-loop advances first */
   }
   pA0 = pA;
/*
 * Loop over column panels of B
 */
   for (j=0; j < N; j += NB)
   {
      jb = N - j;
      jb = Mmin(jb, NB);
/*
 *    Copy column-panel of B to block-major storage
 */
      if (alpha == ATL_rone)
      {
         if (TB == AtlasNoTrans)
            ATL_pcol2blk(K, jb, alpha, B+MindexP(UB,0,j,ldb),
                         Mpld(UB,j,ldb), ldbinc, pB);
         else /* TB == AtlasTrans */
            ATL_prow2blkT(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb,
                          ldbinc, pB);
      }
      else if (TB == AtlasNoTrans)
         ATL_pcol2blk_aX(K, jb, alpha, B+MindexP(UB,0,j,ldb),
                         Mpld(UB,j,ldb), ldbinc, pB);
      else /* TB == AtlasTrans */
         ATL_prow2blkT_aX(jb, K, alpha, B+MindexP(UB,j,0,ldb), ldb,
                          ldbinc, pB);
/*
 *    Loop over row-panels of A
 */
      for (i=0; i < M; i += MB)
      {
         ib = M - i;
         ib = Mmin(ib, MB);
         if (A2blk)
         {
            if (TA == AtlasNoTrans)
               ATL_prow2blkT(ib, K, ATL_rone, A+MindexP(UA,i,0,lda), lda,
                             ldainc, pA);
            else /* TA == AtlasTrans */
               ATL_pcol2blk(K, ib, ATL_rone, A+MindexP(UA,0,i,lda),
                            Mpld(UA,i,lda), ldainc, pA);
         }
         else
            pA += ATL_MulByNB(K);
         if (jb != NB || ib != MB)
         {
            pNBmm0 = pNBmm = ATL_gNBmm;
            if (ib != NB && jb != NB)
               Mjoin(PATL,gezero)(MB, NB, pC, MB);
         }
         else
         {
            pNBmm = NBmm;
            pNBmm0 = NBmm_b0;
         }
/*
 *       Handle full blocks of K
 */
         if (nKb)
         {
            pNBmm0(ib, jb, NB, ATL_rone, pA, NB, pB, NB, ATL_rzero, pC, ib);
            for (k=1; k != nKb; k++)
               pNBmm(ib, jb, NB, ATL_rone, pA+ib*NB*k, NB, pB+jb*NB*k, NB,
                     ATL_rone, pC, ib);
            if (kb)
               ATL_gNBmm(ib, jb, kb, ATL_rone, pA+ib*NB*nKb, kb,
                         pB+jb*NB*nKb, kb, ATL_rone, pC, ib);
         }
         else if (kb)
         {
            Mjoin(PATL,gezero)(ib, jb, pC, ib);
            ATL_gNBmm(ib, jb, kb, ATL_rone, pA, kb, pB, kb, ATL_rzero,
                      pC, ib);
         }
         ATL_pputblk(ib, jb, pC, C+MindexP(UC,i,j,ldc), Mpld(UC,j,ldc),
                     ldcinc, beta);
      }
      pA = pA0;
   }
   free(vC);
   return(0);
}
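/*
 * Illustrative sketch (not ATLAS code): the two-tier workspace strategy
 * above.  Try to allocate room to pack all of A once, capped by a
 * maximum-malloc policy; if that is refused or fails, fall back to a
 * smaller buffer and accept recopying A for every column panel of B.
 * max_elems plays the role of ATL_pkMaxMalloc; sizes are in elements
 * (not bytes) for brevity, and pkspace/get_workspace are hypothetical
 * names, not ATLAS API.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
   double *buf;
   int copy_A_per_panel;   /* 1: small buffer, A recopied per B panel */
} pkspace;

static int get_workspace(long M, long K, long nb, long max_elems,
                         pkspace *w)
{
   long big = nb*nb + nb*K + M*K;   /* C block + B panel + all of A */
   long small = nb*nb + 2*nb*K;     /* C block + B panel + one A panel */

   w->buf = NULL;
   if (big <= max_elems)
      w->buf = malloc(big * sizeof(double));
   w->copy_A_per_panel = (w->buf == NULL);
   if (!w->buf)
      w->buf = malloc(small * sizeof(double));
   return (w->buf == NULL) ? -1 : 0;   /* non-zero like ATL_pmmJIKF */
}

int main(void)
{
   pkspace w;
   if (!get_workspace(1000, 500, 40, 200000, &w))
   {
      printf("copy_A_per_panel = %d\n", w.copy_A_per_panel);
      free(w.buf);
   }
   return(0);
}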
void Mjoin(PATL,sprk_rK)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP, const int N, const int K, int R,
    const SCALAR alpha, const TYPE *A, int lda, const SCALAR beta0,
    TYPE *C, const int ldc)
/*
 * This routine does the packed symmetric rank-K update by doing ceil(K/R)
 * rank-R updates of C.  This is primarily done for CacheEdge, but is also
 * useful to auto-reduce R until enough workspace may be allocated.
 */
{
   const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen);
   int k=0, kb, ierr;
   const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0);
   const int ldcinc = (UC2 == AtlasUpper) ? 1 : ((UC2 == AtlasLower) ? -1:0);
#ifdef TREAL
   TYPE beta = beta0;
#else
   TYPE beta[2];
   *beta = *beta0;
   beta[1] = beta0[1];
#endif

   if (R < NB)
      R = NB<<4;
   if ((K - R) < 2*NB)
      R = K;
   do
   {
      kb = K - k;
      if (kb - R < 2*NB)
         R = kb;
      kb = Mmin(R, kb);
/*
 *    If we can't do the rank-R update, reduce R until we can, or R = 1
 */
      ierr = Mjoin(PATL,prk_kmm)(UC, UA, TA, N, kb, alpha, A, lda, beta,
                                 CP, C, ldc);
      if (ierr && R <= NB*8)
      {
         if (UC == AtlasUpper)
         {
            if (TA == AtlasNoTrans)
               ATL_rk_recUN(UA, TA, UC, CP, N, kb, alpha, A, lda, beta,
                            C, ldc);
            else
               ATL_rk_recUT(UA, TA, UC, CP, N, kb, alpha, A, lda, beta,
                            C, ldc);
         }
         else
         {
            if (TA == AtlasNoTrans)
               ATL_rk_recLN(UA, TA, UC, CP, N, kb, alpha, A, lda, beta,
                            C, ldc);
            else
               ATL_rk_recLT(UA, TA, UC, CP, N, kb, alpha, A, lda, beta,
                            C, ldc);
         }
         ierr = 0;
      }
      if (ierr)
      {
         R = Mmin(NB*8, R>>1);
         ATL_assert(R);
      }
/*
 *    Subsequent updates use beta = 1
 */
      else
      {
#ifdef TREAL
         beta = ATL_rone;
#else
         *beta = ATL_rone;
         beta[1] = ATL_rzero;
#endif
         if (TA == AtlasNoTrans)
         {
            A += MindexP(UA, 0, R, lda);
            lda = Mpld(UA, R, lda);
         }
         else
            A += MindexP(UA, R, 0, lda);
         k += R;
      }
   }
   while (k < K);
}
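/*
 * Illustrative sketch (not ATLAS code): the K-chunking and beta handling
 * used above.  The first rank-R update applies the caller's beta to C;
 * every later chunk must use beta = 1 so that earlier contributions are
 * kept.  The real routine also adapts R on allocation failure; this
 * sketch shows only the chunk/beta pattern, and rank_r_update is a
 * hypothetical stand-in for ATL_prk_kmm.
 */
#include <stdio.h>

static void rank_r_update(int n, int kb, double beta)
{  /* placeholder: a real version would update C from kb columns of A */
   printf("rank-%d update of %dx%d C with beta=%g\n", kb, n, n, beta);
}

static void sprk_chunks(int N, int K, int R, double beta)
{
   int k = 0, kb;
   while (k < K)
   {
      kb = K - k;
      if (kb > R)
         kb = R;
      rank_r_update(N, kb, beta);
      beta = 1.0;        /* subsequent updates accumulate into C */
      k += kb;
   }
}

int main(void)
{
   sprk_chunks(100, 1000, 384, 0.0);   /* made-up sizes: 384+384+232 */
   return(0);
}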