int Mjoin(PATL,syr2kLT) #endif (const int N, const int K, const void *valpha, const void *A, const int lda, const void *B, const int ldb, const void *vbeta, void *C, const int ldc) { int i; void *vc=NULL; TYPE *c; #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR beta =*( (const SCALAR *)vbeta ); const SCALAR one=1.0, zero=0.0; #else #define alpha valpha const TYPE *beta=vbeta; const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0}; #endif i = ATL_MulBySize(N)*N; if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i); if (vc == NULL) return(1); c = ATL_AlignPtr(vc); CgemmTN(N, N, K, alpha, A, lda, B, ldb, zero, c, N); if ( SCALAR_IS_ONE(beta) ) Mjoin(syr2k_put,_b1)(N, c, beta, C, ldc); else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr2k_put,_b0)(N, c, beta, C, ldc); #ifdef TCPLX else if (SCALAR_IS_NONE(beta)) Mjoin(syr2k_put,_bn1)(N, c, beta, C, ldc); else if (beta[1] == *zero) Mjoin(syr2k_put,_bXi0)(N, c, beta, C, ldc); #endif else Mjoin(syr2k_put,_bX)(N, c, beta, C, ldc); free(vc); return(0); }
void Mjoin(Mjoin(Mjoin(PATL,syrk),UploNM),T)
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *vbeta, void *C, const int ldc)
/*
 * Rank-K update driver that chooses between two strategies on K:
 *   K >  SYRK_Xover : form an N x N product in a temporary workspace with one
 *                     CgemmTN call (A passed as both operands, zero beta),
 *                     then write it into C with the syr_put variant selected
 *                     by the value of beta.
 *   K <= SYRK_Xover : call the reference routine refsyrk directly on C.
 *
 * valpha, vbeta : pointers to the scalars.  For real types they are
 *                 dereferenced; for complex types they are used as pointers.
 * Allocation failure is checked with ATL_assert.
 */
{
   void *vc;
   TYPE *c;
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta  );
   const SCALAR one=1.0, zero=0.0;
#else
   #define alpha valpha
   const TYPE *beta=vbeta;
   const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
#endif

   if (K > SYRK_Xover)
   {
/*
 *    Compute the byte count in size_t.  Done in int, ATL_MulBySize(N)*N
 *    overflows for large N and would request a buffer smaller than the
 *    N x N matrix that CgemmTN writes.
 */
      vc = malloc(ATL_Cachelen+ATL_MulBySize((size_t)N)*N);
      ATL_assert(vc);
      c = ATL_AlignPtr(vc);
      CgemmTN(N, N, K, alpha, A, lda, A, lda, zero, c, N);
      if ( SCALAR_IS_ONE(beta) ) Mjoin(syr_put,_b1)(N, c, beta, C, ldc);
      else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr_put,_b0)(N, c, beta, C, ldc);
#ifdef TCPLX
      else if ( SCALAR_IS_NONE(beta) )
         Mjoin(syr_put,_bn1)(N, c, beta, C, ldc);
      /* complex beta with zero imaginary part */
      else if (beta[1] == *zero) Mjoin(syr_put,_bXi0)(N, c, beta, C, ldc);
#endif
      else Mjoin(syr_put,_bX)(N, c, beta, C, ldc);
      free(vc);
   }
   else Mjoin(PATL,refsyrk)(Uplo_, AtlasTrans, N, K, alpha, A, lda,
                            beta, C, ldc);
}
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
/*
 * Blocked matrix-multiply driver.  It splits M, N and K into NB-sized blocks
 * plus remainders, selects copy routines for A and B and an output strategy
 * for C, allocates the workspace, and calls mmIJK2 to do the computation.
 * B is copied into the workspace either all at once or, when that much memory
 * cannot be obtained, one column panel at a time.
 *
 * RETURNS: 0 on success;
 *         -1 if a workspace allocation fails;
 *          1 if the full workspace cannot be obtained and both TA and TB are
 *            AtlasNoTrans.
 */
{
   size_t incA, incB, incC, wrkBytes;
   const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
   const size_t incK = ATL_MulByNB((size_t)K);
   int N = N0;
   int nMb, nNb, nKb, ib, jb, kb, jb2, j, k, n;
   void *vA=NULL, *vC=NULL;
   TYPE *pA, *pB, *pC;
   MAT2BLK A2blk, B2blk;
   PUTBLK putblk;
   NBMM0 NBmm0;

/*
 * Number of full NB blocks in each dimension, and the leftover sizes
 */
   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

/*
 * If K sufficiently large, write to temporary C as safety measure;  otherwise
 * write directly to C
 */
   if (nKb < 12)
   {
      putblk = NULL;
      pC = C;
      if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
      else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
      else NBmm0 = NBmm_bX;
   }
   else
   {
      NBmm0 = NBmm_b0;
      vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
      if (!vC) return(-1);
      pC = ATL_AlignPtr(vC);
      if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
      else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
      else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
      else putblk = Mjoin(PATL,putblk_bX);
   }

/*
 * Special case if we don't need to copy one or more input matrix
 */
   if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
   {
      if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
          ATL_DataIsMinAligned(A))
      {
/*
 *       A is used in place: no copy routine and no workspace for A
 */
         pA = (TYPE *) A;
         A = NULL;
         A2blk = NULL;
         incA = 0;
      }
      else
      {
         vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
         if (!vA)
         {
            free(vC);
            return(-1);
         }
         pA = ATL_AlignPtr(vA);
         if (TA == AtlasNoTrans)
         {
            incA = NB;
            if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
            else A2blk = Mjoin(PATL,row2blkT_aX);
         }
         else
         {
            incA = ATL_MulByNB(lda);
            if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
            else A2blk = Mjoin(PATL,col2blk_aX);
         }
      }
      Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                         incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
      free(vA);   /* free(NULL) is a no-op when A was used in place */
      free(vC);
      return(0);
   }

/*
 * Try for a workspace holding all of B plus one row-panel of A.  The byte
 * count is computed in size_t: N*K in int can overflow, and storing the
 * total in an int could truncate it so that it passes the ATL_MaxMalloc test
 * and under-allocates.
 */
   wrkBytes = ATL_Cachelen + ATL_MulBySize((size_t)N*K + incK);
   if (wrkBytes <= ATL_MaxMalloc) vA = malloc(wrkBytes);
   if (!vA)
   {
      if (TA == AtlasNoTrans && TB == AtlasNoTrans)
      {
         if (vC) free(vC);
         return(1);
      }
/*
 *    Retry with progressively smaller panels of B: for j = 2, 3, ... ask for
 *    room for ceil(n/j) column blocks, plus one block's worth for A
 */
      if (jb) n = nNb + 1;
      else n = nNb;
      for (j=2; !vA; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         wrkBytes = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
         if (wrkBytes <= ATL_MaxMalloc) vA = malloc(wrkBytes);
      }
      if (!vA)
      {
         if (vC) free(vC);
         return(-1);
      }
      n = ATL_MulByNB(k);
      jb2 = 0;
   }
   else
   {
      jb2 = jb;
      k = nNb;
      n = N;
   }
   pA = ATL_AlignPtr(vA);

/*
 * alpha is applied while copying B, so the A copy routines are the _a1 forms
 */
   if (TB == AtlasNoTrans)
   {
      incB = ldb*n;
      if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
      else B2blk = Mjoin(PATL,col2blk2_aX);
   }
   else
   {
      incB = n;
      if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
      else B2blk = Mjoin(PATL,row2blkT2_aX);
   }

   if (TA == AtlasNoTrans)
   {
      incA = NB;
      A2blk = Mjoin(PATL,row2blkT_a1);
   }
   else
   {
      incA = ATL_MulByNB(lda);
      A2blk = Mjoin(PATL,col2blk_a1);
   }
   incC = ldc*n;
   pB = pA + incK;   /* copied B follows the A panel in the workspace */

/*
 * Process n columns of B and C per pass.  The last pass takes whatever is
 * left, including the partial column block jb.
 */
   do
   {
      if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
      else B2blk(n, K, B, ldb, pB, alpha);
      Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                         incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
      N -= n;
      nNb -= k;
      if (N < n)
      {
         jb2 = jb;
         n = N;
         k = nNb;
      }
      C += incC;
      B += incB;
      if (!putblk) pC = C;   /* writing directly to C: follow the panel */
   }
   while (N);

   if (vC) free(vC);
   free(vA);
   return(0);
}
void Mjoin(PATL,pputblk_diag)
   (const int M, const int N, const TYPE *V, const enum ATLAS_UPLO UC,
    TYPE *C, int ldc, int ldcinc, const SCALAR alpha, const SCALAR beta)
/*
 * Applies C <- beta*C + alpha*V to one triangle of a block, column by column.
 * V is read with a fixed column stride of M.  C is advanced by ldc after each
 * column, and ldc itself changes by ldcinc from one column to the next.
 *
 * UC == AtlasUpper : column j touches rows 0..j; ldc is bumped after C has
 *                    been advanced.
 * otherwise        : column j touches rows j..M-1; ldc is bumped before C is
 *                    advanced.
 *
 * beta == 0 overwrites C without reading it, beta == 1 accumulates, and
 * alpha == 1 / alpha == -1 skip the multiply by alpha.
 */
{
   enum
   {
      PD_SET_POS, PD_SET_NEG, PD_SET_SCAL,   /* beta == 0  */
      PD_ADD_POS, PD_ADD_NEG, PD_ADD_SCAL,   /* beta == 1  */
      PD_MIX_POS, PD_MIX_NEG, PD_MIX_SCAL    /* any other beta */
   };
   const int isUpper = (UC == AtlasUpper);
   int aSel, bSel, op, col, row, rowEnd;

/*
 * Classify the scalars once; op indexes the enum above as 3*bSel + aSel
 */
   if (SCALAR_IS_ONE(alpha)) aSel = 0;
   else if (SCALAR_IS_NONE(alpha)) aSel = 1;
   else aSel = 2;

   if (SCALAR_IS_ZERO(beta)) bSel = 0;
   else if (SCALAR_IS_ONE(beta)) bSel = 1;
   else bSel = 2;

   op = 3*bSel + aSel;

   for (col=0; col < N; col++)
   {
/*
 *    Row range [row, rowEnd) of this column that lies in the triangle
 */
      if (isUpper)
      {
         row = 0;
         rowEnd = col + 1;
      }
      else
      {
         row = col;
         rowEnd = M;
      }

      switch (op)
      {
      case PD_SET_POS:
         for (; row < rowEnd; row++) C[row] = V[row];
         break;
      case PD_SET_NEG:
         for (; row < rowEnd; row++) C[row] = -V[row];
         break;
      case PD_SET_SCAL:
         for (; row < rowEnd; row++) C[row] = alpha * V[row];
         break;
      case PD_ADD_POS:
         for (; row < rowEnd; row++) C[row] += V[row];
         break;
      case PD_ADD_NEG:
         for (; row < rowEnd; row++) C[row] -= V[row];
         break;
      case PD_ADD_SCAL:
         for (; row < rowEnd; row++) C[row] += alpha * V[row];
         break;
      case PD_MIX_POS:
         for (; row < rowEnd; row++) C[row] = beta*C[row] + V[row];
         break;
      case PD_MIX_NEG:
         for (; row < rowEnd; row++) C[row] = beta*C[row] - V[row];
         break;
      default:  /* PD_MIX_SCAL */
         for (; row < rowEnd; row++) C[row] = beta*C[row] + alpha * V[row];
         break;
      }

/*
 *    Step to the next column; the order of the ldc update depends on UC
 */
      if (isUpper)
      {
         C += ldc;
         ldc += ldcinc;
      }
      else
      {
         ldc += ldcinc;
         C += ldc;
      }
      V += M;
   }
}