void Mjoin(PATL,sprk) (const enum PACK_UPLO UA, const enum PACK_TRANS TA, const enum ATLAS_UPLO UC, const int CP, const int N, const int K, const SCALAR alpha, const TYPE *A, const int IA, const int JA, const int lda, const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc) { const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen); int j; #ifdef CacheEdge static const int CE_K = ((ATL_DivBySize(CacheEdge SHIFT)-(NBNB SHIFT)) / (NB*(NB+NB)))*NB; #else #define CE_K K #endif if ((!N) || ((SCALAR_IS_ZERO(alpha) || (!K)) && (SCALAR_IS_ONE(beta)))) return; if (!K || SCALAR_IS_ZERO(alpha)) { if (UC == CblasLower) { for (j=0; j != N; j++) Mjoin(PATL,scal)(N-j, beta, C+MindexP(UC2,IC+j,JC+j,ldc), 1); } else /* UC == CblasUpper */ { for (j=0; j != N; j++) Mjoin(PATL,scal)(j+1, beta, C+MindexP(UC2,IC,JC+j,ldc), 1); } return; } Mjoin(PATL,sprk_rK)(UA, TA, UC, CP, N, K, CE_K, alpha, A, lda, beta, C, ldc); }
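/*
 * Illustrative sketch (not ATLAS code): the arithmetic behind a K blocking
 * factor such as CE_K above.  An NB x NB block of C is kept resident in the
 * cache; each further step of K then adds one NB-long column of the A panel
 * and one of the B panel (2*NB elements), and the largest K that still fits
 * is rounded down to a multiple of NB.  The function and parameter names
 * here are assumptions made for illustration only.
 */
static int example_kblock(unsigned long cache_bytes, int nb,
                          unsigned long elt_size)
{
   unsigned long elts = cache_bytes / elt_size;       /* cache in elements */
   unsigned long left = elts - (unsigned long)nb*nb;  /* minus the C block */
   unsigned long kfit = left / (2UL*nb);              /* K steps that fit  */
   return (int)((kfit / nb) * nb);          /* round down to multiple of NB */
}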
int Mjoin(PATL,syr2kLT)
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
   int i;
   void *vc=NULL;
   TYPE *c;
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta );
   const SCALAR one=1.0, zero=0.0;
#else
   #define alpha valpha
   const TYPE *beta=vbeta;
   const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
#endif

   i = ATL_MulBySize(N)*N;
   if (i <= ATL_MaxMalloc)
      vc = malloc(ATL_Cachelen+i);
   if (vc == NULL)
      return(1);
   c = ATL_AlignPtr(vc);
   CgemmTN(N, N, K, alpha, A, lda, B, ldb, zero, c, N);
   if ( SCALAR_IS_ONE(beta) )
      Mjoin(syr2k_put,_b1)(N, c, beta, C, ldc);
   else if ( SCALAR_IS_ZERO(beta) )
      Mjoin(syr2k_put,_b0)(N, c, beta, C, ldc);
#ifdef TCPLX
   else if (SCALAR_IS_NONE(beta))
      Mjoin(syr2k_put,_bn1)(N, c, beta, C, ldc);
   else if (beta[1] == *zero)
      Mjoin(syr2k_put,_bXi0)(N, c, beta, C, ldc);
#endif
   else
      Mjoin(syr2k_put,_bX)(N, c, beta, C, ldc);
   free(vc);
   return(0);
}
void Mjoin(Mjoin(PATL,symmR),UploNM) (const int M, const int N, const void *valpha, const void *A, const int lda, const void *B, const int ldb, const void *vbeta, void *C, const int ldc) { #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR beta =*( (const SCALAR *)vbeta ); const SCALAR one=1.0; #else #define alpha valpha #define beta vbeta #endif void *va; TYPE *a; if (M > SYMM_Xover) { va = malloc(ATL_Cachelen + ATL_MulBySize(N)*N); ATL_assert(va); a = ATL_AlignPtr(va); #ifdef TREAL if ( SCALAR_IS_ONE(alpha) ) Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(N, alpha, A, lda, a); else Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(N, alpha, A, lda, a); ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, one, B, ldb, a, N, beta, C, ldc); #else Mjoin(Mjoin(PATL,sycopy),UploNM)(N, A, lda, a); ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, valpha, B, ldb, a, N, vbeta, C, ldc); #endif free(va); } else Mjoin(PATL,refsymm)(AtlasRight, Uplo_, M, N, alpha, A, lda, B, ldb, beta, C, ldc); }
void Mjoin(Mjoin(Mjoin(PATL,syrk),UploNM),T) (const int N, const int K, const void *valpha, const void *A, const int lda, const void *vbeta, void *C, const int ldc) { void *vc; TYPE *c; #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR beta =*( (const SCALAR *)vbeta ); const SCALAR one=1.0, zero=0.0; #else #define alpha valpha const TYPE *beta=vbeta; const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0}; #endif if (K > SYRK_Xover) { vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N); ATL_assert(vc); c = ATL_AlignPtr(vc); CgemmTN(N, N, K, alpha, A, lda, A, lda, zero, c, N); if ( SCALAR_IS_ONE(beta) ) Mjoin(syr_put,_b1)(N, c, beta, C, ldc); else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr_put,_b0)(N, c, beta, C, ldc); #ifdef TCPLX else if ( SCALAR_IS_NONE(beta) ) Mjoin(syr_put,_bn1)(N, c, beta, C, ldc); else if (beta[1] == *zero) Mjoin(syr_put,_bXi0)(N, c, beta, C, ldc); #endif else Mjoin(syr_put,_bX)(N, c, beta, C, ldc); free(vc); } else Mjoin(PATL,refsyrk)(Uplo_, AtlasTrans, N, K, alpha, A, lda, beta, C, ldc); }
void Mjoin(Mjoin(PATL,trmmL),ATLP) (const int M, const int N, const void *valpha, const void *A, const int lda, void *C, const int ldc) { #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR one=1.0, zero=0.0; #else const TYPE zero[2]={0.0,0.0}; #define alpha valpha #endif void *va; TYPE *a; if (N > TRMM_Xover) { va = malloc(ATL_Cachelen + ATL_MulBySize(M)*M); ATL_assert(va); a = ATL_AlignPtr(va); #ifdef TREAL if ( SCALAR_IS_ONE(alpha) ) Mjoin(ATL_trcopy,_a1)(M, alpha, A, lda, a); else Mjoin(ATL_trcopy,_aX)(M, alpha, A, lda, a); CAgemmTN(M, N, M, one, a, M, C, ldc, zero, C, ldc); #else ATL_trcopy(M, A, lda, a); CAgemmTN(M, N, M, valpha, a, M, C, ldc, zero, C, ldc); #endif free(va); } else Mjoin(PATL,reftrmm)(AtlasLeft, Uplo_, Trans_, Unit_, M, N, alpha, A, lda, C, ldc); }
void Mjoin(Mjoin(PATL,symmL),UploNM) (const int M, const int N, const void *valpha, const void *A, const int lda, const void *B, const int ldb, const void *vbeta, void *C, const int ldc) { #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR beta =*( (const SCALAR *)vbeta ); const SCALAR one=1.0; #else #define alpha valpha #define beta vbeta #endif TYPE *a; void *va; if (N > SYMM_Xover) { va = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M)); ATL_assert(va); a = ATL_AlignPtr(va); #ifdef TREAL if ( SCALAR_IS_ONE(alpha) ) Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(M, alpha, A, lda, a); else Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(M, alpha, A, lda, a); CgemmTN(M, N, M, one, a, M, B, ldb, beta, C, ldc); #else Mjoin(Mjoin(PATL,sycopy),UploNM)(M, A, lda, a); CgemmTN(M, N, M, valpha, a, M, B, ldb, vbeta, C, ldc); #endif free(va); } else Mjoin(PATL,refsymm)(AtlasLeft, Uplo_, M, N, alpha, A, lda, B, ldb, beta, C, ldc); }
void Mjoin(PATL,mvnk_Mlt16) (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX, const SCALAR beta, TYPE *Y, ATL_CINT incY) /* * y = alpha*A*x + beta*y */ { #ifdef TREAL const static ATL_MVFUNC mvfunc[15] = {ATL_mvn_Meq1, ATL_mvn_Meq2, ATL_mvn_Meq3, ATL_mvn_Meq4, ATL_mvn_Meq5, ATL_mvn_Meq6, ATL_mvn_Meq7, ATL_mvn_Meq8, ATL_mvn_Meq9, ATL_mvn_Meq10, ATL_mvn_Meq11, ATL_mvn_Meq12, ATL_mvn_Meq13, ATL_mvn_Meq14, ATL_mvn_Meq15 }; if ( M < 1 || N < 1 || (SCALAR_IS_ZERO(alpha) && SCALAR_IS_ONE(beta)) ) return; /* * Base max unrolling we use on how many regs we think we have */ #ifdef ATL_GAS_x8664 if (M > 14) #elif defined(ATL_GAS_x8632) if (M > 6) #else if (M > 15) #endif { Mjoin(PATL,mvnk_smallN)(M, N, alpha, A, lda, X, incX, beta, Y, incY); return; } mvfunc[M-1](M, N, alpha, A, lda, X, incX, beta, Y, incY); #else #ifndef TUNING if (M <= 8) Mjoin(PATL,refgemv)(AtlasNoTrans, M, N, alpha, A, lda, X, incX, beta, Y, incY); else #endif Mjoin(PATL,mvnk_smallN)(M, N, alpha, A, lda, X, incX, beta, Y, incY); #endif }
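/*
 * Illustrative sketch (not ATLAS code): the fixed-M dispatch table used by
 * the routine above.  Each table entry is a kernel specialized for one value
 * of M (so its M loop can be fully unrolled), the table is indexed by M-1,
 * and anything outside the table falls through to a general routine.  All
 * names and types below are assumptions made for illustration only.
 */
typedef void (*small_mvn_t)(int N, const double *A, int lda,
                            const double *x, double *y);
static void mvn_m1(int N, const double *A, int lda, const double *x, double *y)
{  /* M == 1: y[0] += sum over j of A[0,j]*x[j] */
   int j;
   for (j=0; j < N; j++)
      y[0] += A[j*lda]*x[j];
}
static void mvn_m2(int N, const double *A, int lda, const double *x, double *y)
{  /* M == 2: two fully unrolled rows */
   int j;
   for (j=0; j < N; j++)
   {
      y[0] += A[j*lda]*x[j];
      y[1] += A[1+j*lda]*x[j];
   }
}
static const small_mvn_t small_mvn[2] = {mvn_m1, mvn_m2};
/* caller: if (M <= 2) small_mvn[M-1](N, A, lda, x, y); else general routine */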
void Mjoin( PATL, gbmv ) ( const enum ATLAS_TRANS TRANS, const int M, const int N, const int KL, const int KU, const SCALAR ALPHA, const TYPE * A, const int LDA, const TYPE * X, const int INCX, const SCALAR BETA, TYPE * Y, const int INCY ) { /* * .. Local Variables .. */ /* .. * .. Executable Statements .. * */ if( ( M == 0 ) || ( N == 0 ) || ( ( SCALAR_IS_ZERO( ALPHA ) ) && ( SCALAR_IS_ONE( BETA ) ) ) ) return; if( SCALAR_IS_ZERO( ALPHA ) ) { if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( M, BETA, Y, INCY ); return; } Mjoin( PATL, refgbmv )( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ); /* * End of Mjoin( PATL, gbmv ) */ }
void Mjoin(PATL,prow2blkTF)(const int M, const int N, const SCALAR alpha, const TYPE *A, int lda, const int ldainc, TYPE *V) { const int mb = Mmin(NB,M), nMb = ATL_DivByNB(M); const int m = ATL_MulByNB(nMb), n = ATL_MulByNB(ATL_DivByNB(N)); const int nr = N - n, mr = M - m; const int incVm = ATL_MulByNB(N), incVV = ATL_MulByNB(mr); int i, j, ib, jb; const enum PACK_UPLO UA = (ldainc == 1) ? PackUpper : ( (ldainc == -1) ? PackLower : PackGen ); TYPE *v, *vv = V+nMb*incVm; void (*row2blk)(const int M, const int N, const TYPE alpha, const TYPE *A, int lda, const int ldainc, TYPE *V); if (ldainc) { if (alpha == ATL_rone) row2blk = ATL_prow2blk_KB_a1; else row2blk = ATL_prow2blk_KB_aX; for (j=0; j < n; j += NB) { for (v=V, i=0; i < m; i += NB, v += incVm) row2blk(NB, NB, alpha, A+MindexP(UA,i,j,lda), Mpld(UA,j,lda), ldainc, v); if (mr) { row2blk(mr, NB, alpha, A+MindexP(UA,m,j,lda), Mpld(UA,j,lda), ldainc, vv); vv += incVV; } V += NBNB; } if (nr) { for (v=V, i=0; i < m; i += NB, v += incVm) row2blk(NB, nr, alpha, A+MindexP(UA,i,n,lda), Mpld(UA,n,lda), ldainc, v); if (mr) row2blk(mr, nr, alpha, A+MindexP(UA,m,n,lda), Mpld(UA,n,lda), ldainc, vv); } } else if (SCALAR_IS_ONE(alpha)) Mjoin(PATL,row2blkT2_a1)(M, N, A, lda, V, alpha); else Mjoin(PATL,row2blkT2_aX)(M, N, A, lda, V, alpha); }
void Mjoin( PATL, symm ) ( const enum ATLAS_SIDE SIDE, const enum ATLAS_UPLO UPLO, const int M, const int N, const SCALAR ALPHA, const TYPE * A, const int LDA, const TYPE * B, const int LDB, const SCALAR BETA, TYPE * C, const int LDC ) { /* * Purpose * ======= * * Mjoin( PATL, symm ) performs one of the matrix-matrix operations * * C := alpha * A * B + beta * C, * * or * * C := alpha * B * A + beta * C, * * where alpha and beta are scalars, A is a symmetric matrix and B and * C are m by n matrices. * * This is a recursive version of the algorithm. For a more detailed * description of the arguments of this function, see the reference im- * plementation in the ATLAS/src/blas/reference directory. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ #ifdef TREAL TYPE alpha0 = (TYPE)(ALPHA), beta0 = (TYPE)(BETA); const TYPE one = ATL_rone; #else TYPE one[2] = { ATL_rone, ATL_rzero }; #endif TYPE * alpha, * beta; RC3_FUN_SYMM_T ATL_rsymm; RC3_SYMM_T type; /* .. * .. Executable Statements .. * */ if( ( M == 0 ) || ( N == 0 ) || ( ( SCALAR_IS_ZERO( ALPHA ) ) && ( SCALAR_IS_ONE( BETA ) ) ) ) return; if( SCALAR_IS_ZERO( ALPHA ) ) { Mjoin( PATL, gescal )( M, N, BETA, C, LDC ); return; } #ifdef TREAL type.size = sizeof( TYPE ); type.one = (void *)(&one); type.TgemmNN = Mjoin( PATL, gemmNN_RB ); alpha = &alpha0; beta = &beta0; #else type.size = sizeof( TYPE[2] ); type.one = (void *)(one); type.TgemmNN = Mjoin( PATL, gemmNN_RB ); alpha = (TYPE *)(ALPHA); beta = (TYPE *)(BETA); #endif if( SIDE == AtlasLeft ) { type.Tgemm = Mjoin( PATL, gemmTN_RB ); if( UPLO == AtlasUpper ) { type.Tsymm = Mjoin( PATL, symmLU ); ATL_rsymm = ATL_rsymmLU; } else { type.Tsymm = Mjoin( PATL, symmLL ); ATL_rsymm = ATL_rsymmLL; } } else { type.Tgemm = Mjoin( PATL, gemmNT_RB ); if( UPLO == AtlasUpper ) { type.Tsymm = Mjoin( PATL, symmRU ); ATL_rsymm = ATL_rsymmRU; } else { type.Tsymm = Mjoin( PATL, symmRL ); ATL_rsymm = ATL_rsymmRL; } } ATL_rsymm( &type, M, N, ((void *)alpha), ((void *)A), LDA, ((void *)B), LDB, ((void *)beta), ((void *)C), LDC, SYMM_NB ); /* * End of Mjoin( PATL, symm ) */ }
void ATL_gemv
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
    const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY)
/*
 * y = alpha*A*x + beta*y, A is MxN, len(X) = N, len(Y) = M
 */
{
   ATL_mvkern_t mvnk, mvnk_b1, mvnk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y, *p;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNY2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN, DOTBASED;
#ifdef TREAL
   #define one ATL_rone
   #define Zero ATL_rzero
   TYPE alpha = alpha0, beta = beta0;
   const int ALPHA_IS_ONE = (alpha0 == ATL_rone);
#else
   TYPE one[2] = {ATL_rone, ATL_rzero}, *alpha=(TYPE*)alpha0;
   TYPE Zero[2] = {ATL_rzero, ATL_rzero};
   TYPE *beta = (TYPE*) beta0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha[1] == ATL_rzero);
#endif

   if (M < 1 || N < 1)            /* F77BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha))     /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(M, Y, incY);
         else
            Mjoin(PATL,scal)(M, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's mvn kernels loop over M in inner loop, which is bad news if M is
 * very small.  Call code that requires no copy of X & Y for these degenerate
 * cases
 */
   if (M < 16)
   {
      Mjoin(PATL,mvnk_Mlt16)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
/*
 * Get mvnk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 * For no transpose, X alignment args really apply to Y, and vice versa.
 */
   mvnk_b1 = ATL_GetMVNKern(M, N, A, lda, &mvnk_b0, &DOTBASED, &mu, &nu,
                            &minM, &minN, &alignY, &ALIGNY2A, &alignX,
                            &ForceNU, &CacheElts);
/*
 * Set up to handle case where kernel requires N to be a multiple of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
/*
 *****************************************************************************
 Figure out whether vecs need be copied, and which one will be scaled by alpha
 *****************************************************************************
 */
   COPYX = (incX != 1);
   if (!COPYX && alignX)
   {
      t1 = (size_t) X;
      COPYX = ((t1/alignX)*alignX != t1);
   }
   COPYY = (incY != 1);
   if (!COPYY)    /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNY2A)
      {
         t1 = (size_t) A;
         t2 = (size_t) Y;
         COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignY)
      {
         t1 = (size_t) Y;
         COPYY = ((t1/alignY)*alignY != t1);
      }
   }
   if (COPYX != COPYY)        /* if only one of them is already being copied */
      APPLYALPHAX = COPYX;    /* apply alpha to that one */
   else if (!COPYY && !COPYX) /* nobody currently being copied means */
   {                          /* we'll need to force a copy to apply alpha */
      APPLYALPHAX = (M < N);  /* apply alpha to vector requiring least */
      if (!ALPHA_IS_ONE)      /* workspace if alpha != 1.0 */
      {
         COPYX = APPLYALPHAX;
         COPYY = !APPLYALPHAX;
      }
   }
   else                       /* if both are being copied anyway */
      APPLYALPHAX = 0;        /* apply alpha during update of Y */
   if (COPYX | COPYY)         /* if I need to copy either vector */
   {                          /* allocate & align them */
      vp = malloc(ATL_MulBySize(COPYY*mb+COPYX*N) + 2*ATL_Cachelen);
/*
 *    If we cannot allocate enough space to copy the vectors, give up and
 *    call the simple loop-based implementation
 */
      if (!vp)
      {
         Mjoin(PATL,mvnk_smallN)(M, N, alpha0, A, lda, X, incX, beta0, Y, incY);
         return;
      }
      if (COPYX)
      {
         x = ATL_AlignPtr(vp);
         if (APPLYALPHAX && !ALPHA_IS_ONE)
            Mjoin(PATL,cpsc)(N, alpha, X, incX, x, 1);
         else
            Mjoin(PATL,copy)(N, X, incX, x, 1);
         if (COPYY)
            y = x + (N SHIFT);
      }
      else   /* if (COPYY) known true by surrounding if */
         y = vp;
      if (COPYY)
      {
         y = (ALIGNY2A) ? ATL_Align2Ptr(y, A) : ATL_AlignPtr(y);
         beta = Zero;
         alpha = one;
      }
   }
/*
 * Apply beta to Y if we aren't copying Y
 */
   if (!COPYY && !SCALAR_IS_ONE(beta0))
   {
      if (SCALAR_IS_ZERO(beta0))
         beta = Zero;
      else
      {
         Mjoin(PATL,scal)(M, beta0, Y, incY);
         beta = one;
      }
   }
   mvnk = (COPYY || SCALAR_IS_ZERO(beta)) ? mvnk_b0 : mvnk_b1;
   m = M;
   do
   {
      imb = Mmin(mb, m);
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvnk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvnk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvnk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda,
                                 x+(Nm SHIFT), 1, one, y, 1);
/*
 *    If we are copying Y, we have formed A*x into y, so scale it by the
 *    original alpha, by using axpby: Y = beta0*Y + alpha0*y
 */
      if (COPYY)
         Mjoin(PATL,axpby)(imb, alpha0, y, 1, beta0, Y, incY);
      else
         y += imb SHIFT;
      A += imb SHIFT;
      Y += (imb*incY)SHIFT;
      m -= imb;
   } while(m);
   if (vp)
      free(vp);
}
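/*
 * Illustrative sketch (not ATLAS code): the two pointer-alignment tests used
 * in the gemv driver above.  A pointer is misaligned to 'align' when rounding
 * its address down to a multiple of 'align' changes it, and two pointers
 * share cache-line alignment when their addresses agree modulo the line size.
 * The 64-byte line size and the names are assumptions for illustration only.
 */
static int misaligned(const void *p, unsigned long align)
{
   unsigned long a = (unsigned long)p;
   return (a/align)*align != a;        /* nonzero remainder => must copy */
}
static int same_line_offset(const void *p, const void *q)
{
   const unsigned long LINE = 64;      /* assumed cache-line size */
   return ((unsigned long)p % LINE) == ((unsigned long)q % LINE);
}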
void Mjoin(PATL,tgemv) (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX, const SCALAR beta, TYPE *Y, ATL_CINT incY) { static size_t ALb=0, ALe=0; size_t at = (size_t) A; ATL_INT n, P, ldaP; ATL_TGEMV_t pd; /* * quick return if possible. */ if (M < 1 || N < 1) return; if (SCALAR_IS_ZERO(alpha)) /* No contrib from alpha*A*x */ { ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M; if (!SCALAR_IS_ONE(beta)) { if (SCALAR_IS_ZERO(beta)) Mjoin(PATL,zero)(NY, Y, incY); else Mjoin(PATL,scal)(NY, beta, Y, incY); } return; } pd.flg = (at >= ALb && at <= ALe) ? 1 : 0; ALb = (size_t)A; ALe = (size_t)(A+(M SHIFT)); #ifdef TREAL pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0; #else if (TA != AtlasNoTrans) { if (TA == AtlasConj) pd.flg |= 4; else if (TA == AtlasTrans) pd.flg |= 2; else /* if (TA == AtlasConjTrans) */ pd.flg |= (2|4); } #endif P = ATL_DivBySize(CacheEdge); P = ((size_t)M*N+P-1) / P; /* add more procs only when cache is full */ P = (P&1 && P > 1)?P+1 : P; /* don't use odd P; it hurts alignment */ P = Mmin(ATL_NTHREADS, P); if (TA == AtlasNoTrans || TA == AtlasConj) P=1; //fprintf(stderr, "P=%d, TA=%d, M=%d, N=%d\n", P, (TA==AtlasTrans), M, N); /* * Make sure we don't overflow 32-bit integer lda */ ldaP = P * lda; while ((size_t)ldaP != ((size_t)lda)*P) { P--; ldaP = P * lda; } if (P > 1) { pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda; pd.alpha = alpha; pd.beta = beta; pd.X = X; pd.Y = Y; pd.A = A; pd.P = P; n = N / P; pd.n = n; pd.nr = N - n*P; if (pd.flg & 2) /* Transpose case */ { ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL); return; } /* * For gemvN, everyone needs a private M-length y. Don't do this unless * we are sure the combine cost is likely dominated by the parallelism */ else if (n > Mmax(P,8)) { int vrank; const TYPE *a; TYPE *y, *y0; #ifdef TCPLX TYPE one[2] = {ATL_rone, ATL_rzero}; TYPE zero[2] = {ATL_rzero, ATL_rzero}; #endif y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M))); ATL_assert(y); pd.Y = y; pd.incY = 1; #ifdef TREAL pd.alpha = ATL_rone; pd.beta = ATL_rzero; #else pd.alpha = one; pd.beta = zero; #endif ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd, Mjoin(PATL,CombineMVN)); /* * goparallel reduces all node's Ys to node 0's. Extract his from the * work array, and combine it with input array, applying both alpha * and beta in the process */ vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1; a = A + (lda SHIFT)*vrank; y = ATL_Align2Ptr(y, a); Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY); free(y0); return; } } /* * If we haven't parallelized this thing, just do it serial */ Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY); }
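/*
 * Illustrative sketch (not ATLAS code): the column partitioning implied by
 * pd.n = N/P and pd.nr = N - n*P in the threaded driver above.  A balanced
 * split gives each of P workers n columns and hands one extra column to the
 * first nr workers, so every column has exactly one owner.  The names are
 * assumptions made for illustration only.
 */
static void owner_cols(int N, int P, int rank, int *j0, int *nj)
{
   int n = N / P, nr = N - n*P;
   *nj = n + (rank < nr ? 1 : 0);           /* this worker's column count */
   *j0 = rank*n + (rank < nr ? rank : nr);  /* first column it owns       */
}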
void Mjoin( PATL, syr2k )
   ( const enum ATLAS_UPLO UPLO, const enum ATLAS_TRANS TRANS, const int N,
     const int K, const SCALAR ALPHA, const TYPE * A, const int LDA,
     const TYPE * B, const int LDB, const SCALAR BETA, TYPE * C,
     const int LDC )
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, syr2k ) performs one of the symmetric rank 2k operations
 *
 *    C := alpha * A * B' + alpha * B * A' + beta * C,
 *
 * or
 *
 *    C := alpha * A' * B + alpha * B' * A + beta * C,
 *
 * where alpha and beta are scalars, C is an n by n symmetric matrix and
 * A and B are n by k matrices in the first case and k by n matrices in
 * the second case.
 *
 * This is a recursive version of the algorithm.  For a more detailed
 * description of the arguments of this function, see the reference im-
 * plementation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
#ifdef TREAL
   TYPE alpha0 = (TYPE)(ALPHA), beta0 = (TYPE)(BETA);
   TYPE one = ATL_rone;
   TYPE * alpha, * beta;
#else
   TYPE one[2] = { ATL_rone, ATL_rzero };
   TYPE * alpha, * beta;
#endif
   RC3_FUN_SYR2K_T ATL_rsyr2k;
   RC3_SYR2K_T type;
/* ..
 * .. Executable Statements ..
 *
 */
   if( ( N == 0 ) ||
       ( ( SCALAR_IS_ZERO( ALPHA ) || ( K == 0 ) ) && SCALAR_IS_ONE( BETA ) ) )
      return;

   if( ( SCALAR_IS_ZERO( ALPHA ) ) || ( K == 0 ) )
   { Mjoin( PATL, trscal )( UPLO, N, N, BETA, C, LDC ); return; }

#ifdef TREAL
   type.size = sizeof( TYPE );      type.one = (void *)(&one);
   alpha = &alpha0;  beta = &beta0;
#else
   type.size = sizeof( TYPE[2] );   type.one = (void *)one;
   alpha = (TYPE *)(ALPHA);  beta = (TYPE *)(BETA);
#endif

   if( TRANS == AtlasNoTrans )
   {
      type.Tgemm = Mjoin( PATL, gemmNT_RB );
      if( UPLO == AtlasUpper )
      { type.Tsyr2k = Mjoin( PATL, syr2kUN ); ATL_rsyr2k = ATL_rsyr2kUN; }
      else
      { type.Tsyr2k = Mjoin( PATL, syr2kLN ); ATL_rsyr2k = ATL_rsyr2kLN; }
   }
   else
   {
      type.Tgemm = Mjoin( PATL, gemmTN_RB );
      if( UPLO == AtlasUpper )
      { type.Tsyr2k = Mjoin( PATL, syr2kUT ); ATL_rsyr2k = ATL_rsyr2kUT; }
      else
      { type.Tsyr2k = Mjoin( PATL, syr2kLT ); ATL_rsyr2k = ATL_rsyr2kLT; }
   }

   ATL_rsyr2k( &type, N, K, (void *)(alpha), (void *)(A), LDA, (void *)(B),
               LDB, (void *)(beta), (void *)(C), LDC, SYR2K_NB );
/*
 * End of Mjoin( PATL, syr2k )
 */
}
void Mjoin( PATL, hpmv )
   ( const enum ATLAS_UPLO UPLO, const int N, const SCALAR ALPHA,
     const TYPE * A, const TYPE * X, const int INCX, const SCALAR BETA,
     TYPE * Y, const int INCY )
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hpmv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix, supplied in packed form.
 *
 * This is a blocked version of the algorithm.  For a more detailed des-
 * cription of the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   void (*gpmv0)( const int, const int, const SCALAR, const TYPE *, const int,
                  const TYPE *, const int, const SCALAR, TYPE *, const int );
   void (*gpmv1)( const int, const int, const SCALAR, const TYPE *, const int,
                  const TYPE *, const int, const SCALAR, TYPE *, const int );
   void (*gpmvN)( const int, const int, const SCALAR, const TYPE *, const int,
                  const TYPE *, const int, const SCALAR, TYPE *, const int );
#ifdef TREAL
   TYPE alphaY, beta0;
   #define one  ATL_rone
   #define zero ATL_rzero
#else
   const TYPE * alphaY, * beta0;
   const TYPE one [2] = { ATL_rone, ATL_rzero },
              zero[2] = { ATL_rzero, ATL_rzero };
#endif
   void * vx = NULL, * vy = NULL;
   TYPE * A0, * A1, * x, * x0, * x1, * y, * y00, * y0, * y1;
   int incXY, incXY1, j, jb, lda, lda0, lda1, mb, mb1, n, nb;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY );
      return;
   }

   if( ( INCX != 1 ) || ( ( INCY == 1 ) && !( SCALAR_IS_ONE( ALPHA ) ) ) )
   {
      vx = (void *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vx );
      x = ATL_AlignPtr( vx );
      Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 );
      alphaY = one;
   }
   else { x = (TYPE *)(X); alphaY = ALPHA; }

   if( ( INCY != 1 ) || !( SCALAR_IS_ONE( alphaY ) ) )
   {
      vy = malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vy );
      y00 = y = ATL_AlignPtr( vy );
      beta0 = zero;
   }
   else { y00 = y = (TYPE *)(Y); beta0 = BETA; }

   ATL_GetPartSPMV( A, N, &mb, &nb );

   mb1 = N - ( ( N - 1 ) / mb ) * mb;
   incXY1 = (nb SHIFT);

   if( UPLO == AtlasUpper )
   {
      if(      SCALAR_IS_ZERO( beta0 ) ) gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) ) gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      else                               gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 );

      lda = 1; lda0 = lda; A0 = (TYPE *)(A); MUpnext( mb, A0, lda0 );
      incXY = (mb SHIFT); x0 = x + incXY; y0 = y + incXY;

      for( n = N - mb; n > 0; n -= mb, x0 += incXY, x += incXY,
           y0 += incXY, y += incXY )
      {
         Mjoin( PATL, hpmvU )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 - (mb SHIFT), x1 = x0, y1 = y0;
              j < n; j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MUpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1;
         lda = lda0; A = A0; MUpnext( mb, A0, lda0 );
      }
      Mjoin( PATL, hpmvU )( mb1, A, lda, x, beta0, y );
   }
   else
   {
      if(      SCALAR_IS_ZERO( beta0 ) ) gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) ) gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      else                               gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvLN_a1_x1_b1_y1 );

      lda = N; lda0 = lda; A0 = (TYPE *)(A); MLpnext( N, A, lda );
      incXY = (mb SHIFT); x0 = x; y0 = y;

      for( n = N - mb, x += ((N-mb) SHIFT), y += ((N-mb) SHIFT); n > 0;
           n -= mb, x -= incXY, y -= incXY )
      {
         MLpprev( mb, A, lda );
         Mjoin( PATL, hpmvL )( mb, A, lda, x, beta0, y );

         for( j = 0, lda1 = lda0, A1 = A0 + (n SHIFT), x1 = x0, y1 = y0;
              j < n; j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MLpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1;
      }
      Mjoin( PATL, hpmvL )( mb1, A0, lda0, x0, beta0, y0 );
   }

   if( vx ) free( vx );
   if( vy )
   { Mjoin( PATL, axpby )( N, alphaY, y00, 1, BETA, Y, INCY ); free( vy ); }
/*
 * End of Mjoin( PATL, hpmv )
 */
}
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M, const int N0, const int K, const SCALAR alpha, const TYPE *A, const int lda0, const TYPE *B, const int ldb0, const SCALAR beta, TYPE *C, const int ldc0) { size_t incA, incB, incC; const size_t lda=lda0, ldb=ldb0, ldc=ldc0; const size_t incK = ATL_MulByNB((size_t)K); int N = N0; int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n; void *vA=NULL, *vC=NULL; TYPE *pA, *pB, *pC; MAT2BLK A2blk, B2blk; PUTBLK putblk; NBMM0 NBmm0; nMb = ATL_DivByNB(M); nNb = ATL_DivByNB(N); nKb = ATL_DivByNB(K); ib = M - ATL_MulByNB(nMb); jb = N - ATL_MulByNB(nNb); kb = K - ATL_MulByNB(nKb); /* * If K sufficiently large, write to temporary C as safety measure; otherwise * write directly to C */ if (nKb < 12) { putblk = NULL; pC = C; if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1; else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0; else NBmm0 = NBmm_bX; } else { NBmm0 = NBmm_b0; vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB)); if (!vC) return(-1); pC = ATL_AlignPtr(vC); if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1); else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0); else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1); else putblk = Mjoin(PATL,putblk_bX); } /* * Special case if we don't need to copy one or more input matrix */ if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B)) { if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) && ATL_DataIsMinAligned(A)) { i = NBNB; pA = (TYPE *) A; A = NULL; A2blk = NULL; incA = 0; } else { vA = malloc(ATL_Cachelen + ATL_MulBySize(incK)); if (!vA) { free(vC); return(-1); } pA = ATL_AlignPtr(vA); if (TA == AtlasNoTrans) { incA = NB; if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1); else A2blk = Mjoin(PATL,row2blkT_aX); } else { incA = ATL_MulByNB(lda); if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1); else A2blk = Mjoin(PATL,col2blk_aX); } } Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA, incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0); if (vA) free(vA); if (vC) free(vC); return(0); } i = ATL_Cachelen + ATL_MulBySize(N*K + incK); if (i <= ATL_MaxMalloc) vA = malloc(i); if (!vA) { if (TA == AtlasNoTrans && TB == AtlasNoTrans) { if (vC) free(vC); return(1); } if (jb) n = nNb + 1; else n = nNb; for (j=2; !vA; j++) { k = n / j; if (k < 1) break; if (k*j < n) k++; h = ATL_Cachelen + ATL_MulBySize((k+1)*incK); if (h <= ATL_MaxMalloc) vA = malloc(h); } if (!vA) { if (vC) free(vC); return(-1); } n = ATL_MulByNB(k); jb2 = 0; } else { jb2 = jb; k = nNb; n = N; } pA = ATL_AlignPtr(vA); if (TB == AtlasNoTrans) { incB = ldb*n; if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1); else B2blk = Mjoin(PATL,col2blk2_aX); } else { incB = n; if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1); else B2blk = Mjoin(PATL,row2blkT2_aX); } if (TA == AtlasNoTrans) { incA = NB; A2blk = Mjoin(PATL,row2blkT_a1); } else { incA = ATL_MulByNB(lda); A2blk = Mjoin(PATL,col2blk_a1); } incC = ldc*n; pB = pA + incK; do { if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha); else B2blk(n, K, B, ldb, pB, alpha); Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA, incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0); N -= n; nNb -= k; if (N < n) { jb2 = jb; n = N; k = nNb; } C += incC; B += incB; if (!putblk) pC = C; } while (N); if (vC) free(vC); free(vA); return(0); }
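/*
 * Illustrative sketch (not ATLAS code): the fallback allocation loop used in
 * the routine above when the full copy workspace exceeds ATL_MaxMalloc.  The
 * panel is split into j ever-smaller pieces until one piece (plus one spare
 * block) can be allocated, or the pieces drop below a single block.  The
 * names and the plain malloc usage are assumptions made for illustration.
 */
static void *alloc_split_panel(unsigned long blksz, int nblks,
                               unsigned long maxalloc, int *blks_per_piece)
{
   void *p = NULL;
   int j, k = 0;
   for (j=1; !p; j++)
   {
      k = (nblks + j - 1)/j;           /* blocks in each of j pieces */
      if (k < 1)
         break;                        /* cannot split any further   */
      if ((k+1)*blksz <= maxalloc)     /* piece plus one spare block */
         p = malloc((k+1)*blksz);
   }
   *blks_per_piece = p ? k : 0;
   return p;
}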
void Mjoin(PATL,pputblk_diag) (const int M, const int N, const TYPE *V, const enum ATLAS_UPLO UC, TYPE *C, int ldc, int ldcinc, const SCALAR alpha, const SCALAR beta) /* * Copies only the Upper or Lower portion of V to C */ { int i, j; if (UC == AtlasUpper) { if (SCALAR_IS_ZERO(beta)) { if (SCALAR_IS_ONE(alpha)) { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] = V[i]; C += ldc; V += M; ldc += ldcinc; } } else if (SCALAR_IS_NONE(alpha)) { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] = -V[i]; C += ldc; V += M; ldc += ldcinc; } } else { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] = alpha * V[i]; C += ldc; V += M; ldc += ldcinc; } } } else if (SCALAR_IS_ONE(beta)) { if (SCALAR_IS_ONE(alpha)) { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] += V[i]; C += ldc; V += M; ldc += ldcinc; } } else if (SCALAR_IS_NONE(alpha)) { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] -= V[i]; C += ldc; V += M; ldc += ldcinc; } } else { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] += alpha * V[i]; C += ldc; V += M; ldc += ldcinc; } } } else { if (SCALAR_IS_ONE(alpha)) { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] = beta*C[i] + V[i]; C += ldc; V += M; ldc += ldcinc; } } else if (SCALAR_IS_NONE(alpha)) { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] = beta*C[i] - V[i]; C += ldc; V += M; ldc += ldcinc; } } else { for (j=0; j < N; j++) { for (i=0; i <= j; i++) C[i] = beta*C[i] + alpha * V[i]; C += ldc; V += M; ldc += ldcinc; } } } } else { if (SCALAR_IS_ZERO(beta)) { if (SCALAR_IS_NONE(alpha)) { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] = -V[i]; C += ldc; V += M; } } else if (SCALAR_IS_ONE(alpha)) { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] = V[i]; C += ldc; V += M; } } else { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] = alpha * V[i]; C += ldc; V += M; } } } else if (SCALAR_IS_ONE(beta)) { if (SCALAR_IS_NONE(alpha)) { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] -= V[i]; C += ldc; V += M; } } else if (SCALAR_IS_ONE(alpha)) { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] += V[i]; C += ldc; V += M; } } else { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] += alpha * V[i]; C += ldc; V += M; } } } else { if (SCALAR_IS_NONE(alpha)) { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] = beta*C[i] - V[i]; C += ldc; V += M; } } else if (SCALAR_IS_ONE(alpha)) { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] = beta*C[i] + V[i]; C += ldc; V += M; } } else { for (j=0; j < N; j++) { ldc += ldcinc; for (i=j; i < M; i++) C[i] = beta*C[i] + alpha * V[i]; C += ldc; V += M; } } } } }
void Mjoin( PATL, hpmvU ) ( const int N, const TYPE * A, const int LDA, const TYPE * X, const SCALAR BETA, TYPE * Y ) { /* * Purpose * ======= * * Mjoin( PATL, hpmvU ) performs the matrix-vector operation * * y := alpha * A * x + beta * y, * * where alpha and beta are scalars, x and y are n-element vectors and A * is an n by n Hermitian matrix, supplied in packed form. * * This is a recursive version of the algorithm. For a more detailed * description of the arguments of this function, see the reference im- * plementation in the ATLAS/src/blas/reference directory. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ void (*gpmvT)( const int, const int, const SCALAR, const TYPE *, const int, const TYPE *, const int, const SCALAR, TYPE *, const int ); void (*gpmvN)( const int, const int, const SCALAR, const TYPE *, const int, const TYPE *, const int, const SCALAR, TYPE *, const int ); #ifdef TREAL #define one ATL_rone TYPE beta0; #else const TYPE * beta0, one[2] = { ATL_rone, ATL_rzero }; #endif TYPE * A0, * x0, * y0; int j, jb, jbs, lda = LDA, m, mb, nb; /* .. * .. Executable Statements .. * */ ATL_GetPartSPMV( A, N, &mb, &nb ); beta0 = BETA; if( SCALAR_IS_ZERO( beta0 ) ) { gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b0_y1 ); gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 ); } else if( SCALAR_IS_ONE ( beta0 ) ) { gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 ); gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 ); } else { gpmvN = Mjoin( PATL, gpmvUN_a1_x1_bX_y1 ); gpmvT = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 ); } MUpnext( N, A, lda ); x0 = (TYPE *)(X); X += (N SHIFT); y0 = (TYPE *)(Y); Y += (N SHIFT); for( j = 0; j < N; j += nb ) { jb = N - j; jb = Mmin( jb, nb ); jbs = (jb SHIFT); MUpprev( jb, A, lda ); X -= jbs; Y -= jbs; if( ( m = N-j-jb ) != 0 ) { A0 = (TYPE *)(A) - (m SHIFT); gpmvT( jb, m, one, A0, lda, x0, 1, beta0, Y, 1 ); gpmvN( m, jb, one, A0, lda, X, 1, beta0, y0, 1 ); beta0 = one; } Mjoin( PATL, refhpmvU )( jb, one, A, lda, X, 1, beta0, Y, 1 ); gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 ); beta0 = one; gpmvT = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 ); } /* * End of Mjoin( PATL, hpmvU ) */ }
int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M0, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
/*
 * Outer three loops for matmul with outer loop over columns of B
 */
{
   int M = M0;
   int nMb, nNb, nKb, ib, jb, kb, ib2, h, i, j, k, m, n;
   const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
   size_t incA, incB, incC;
   int AlphaIsOne;
   const size_t incK = ATL_MulByNB((size_t)K);
   void *vB=NULL, *vC=NULL;
   TYPE *pA, *pB, *pC;
   const TYPE one[2] = {1.0,0.0}, zero[2] = {0.0,0.0};
   MAT2BLK A2blk, B2blk;
   MATSCAL gescal;
   NBMM0 NBmm0;

   nMb = ATL_DivByNB(M);
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);

   pC = C;
   if (beta[1] == ATL_rzero)
   {
      gescal = NULL;
      if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1);
      else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0);
      else NBmm0 = Mjoin(PATL,CNBmm_bX);
   }
   else
   {
      NBmm0 = Mjoin(PATL,CNBmm_b1);
      gescal = Mjoin(PATL,gescal_bX);
   }
/*
 * Special case for when what we are really doing is
 *    C <- beta*C + alpha * A * A' or C <- beta*C + alpha * A' * A
 */
   if ( A == B && M == N && TA != TB && (SCALAR_IS_ONE(alpha) || M <= NB) &&
        TA != AtlasConjTrans && TB != AtlasConjTrans && lda == ldb )
   {
      AlphaIsOne = SCALAR_IS_ONE(alpha);
      i = ATL_MulBySize(M * K);
      if (!AlphaIsOne && pC == C && !SCALAR_IS_ZERO(beta))
         i += ATL_MulBySize(M*N);
      if (i <= ATL_MaxMalloc) vB = malloc(i + ATL_Cachelen);
      if (vB)
      {
         pA = ATL_AlignPtr(vB);
         if (TA == AtlasNoTrans)
            Mjoin(PATL,row2blkT2_a1)(M, K, A, lda, pA, alpha);
         else
            Mjoin(PATL,col2blk2_a1)(K, M, A, lda, pA, alpha);
/*
 *       Can't write directly to C if alpha is not one
 */
         if (!AlphaIsOne)
         {
            if (SCALAR_IS_ZERO(beta)) h = ldc;
            else if (pC == C)
            {
               pC = pA + (((size_t)M) * K SHIFT);
               h = M;
            }
            else h = NB;
            Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, one, pA, NULL,
                               ldb, pA, 0, NULL, zero, pC, h,
                               Mjoin(PATL,gescal_b0), Mjoin(PATL,CNBmm_b0));
            if (alpha[1] == ATL_rzero)
               Mjoin(PATL,gescal_bXi0)(M, N, alpha, pC, h);
            else
               Mjoin(PATL,gescal_bX)(M, N, alpha, pC, h);
            if (C != pC)
            {
               if (beta[1] == ATL_rzero)
               {
                  if (*beta == ATL_rone)
                     Mjoin(PATL,putblk_b1)(M, N, pC, C, ldc, beta);
                  else if (*beta == ATL_rnone)
                     Mjoin(PATL,putblk_bn1)(M, N, pC, C, ldc, beta);
                  else if (*beta == ATL_rzero)
                     Mjoin(PATL,putblk_b0)(M, N, pC, C, ldc, beta);
                  else
                     Mjoin(PATL,putblk_bXi0)(M, N, pC, C, ldc, beta);
               }
               else Mjoin(PATL,putblk_bX)(M, N, pC, C, ldc, beta);
            }
         }
         else
            Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, pA, NULL,
                               ldb, pA, 0, NULL, beta, C, ldc, gescal, NBmm0);
         free(vB);
         if (vC) free(vC);
         return(0);
      }
   }

   i = ATL_Cachelen + ATL_MulBySize(M*K + incK);
   if (i <= ATL_MaxMalloc) vB = malloc(i);
   if (!vB)
   {
      if (TA != AtlasNoTrans && TB != AtlasNoTrans) return(1);
      if (ib) n = nMb + 1;
      else n = nMb;
      for (j=2; !vB; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
         if (h <= ATL_MaxMalloc) vB = malloc(h);
      }
      if (!vB) return(-1);
      n = k;
      m = ATL_MulByNB(n);
      ib2 = 0;
   }
   else
   {
      n = nMb;
      m = M;
      ib2 = ib;
   }
   pB = ATL_AlignPtr(vB);

   if (TA == AtlasNoTrans)
   {
      incA = m SHIFT;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) A2blk = Mjoin(PATL,row2blkT2_a1);
         else A2blk = Mjoin(PATL,row2blkT2_aXi0);
      }
      else A2blk = Mjoin(PATL,row2blkT2_aX);
   }
   else if (TA == AtlasConjTrans)
   {
      incA = lda*m SHIFT;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blkConj2_a1);
         else A2blk = Mjoin(PATL,col2blkConj2_aXi0);
      }
      else A2blk = Mjoin(PATL,col2blkConj2_aX);
   }
   else
   {
      incA = lda*m SHIFT;
      if (alpha[1] == ATL_rzero)
      {
         if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blk2_a1);
         else A2blk = Mjoin(PATL,col2blk2_aXi0);
      }
      else A2blk = Mjoin(PATL,col2blk2_aX);
   }
   if (TB == AtlasNoTrans)
   {
      incB = ATL_MulByNB(ldb) SHIFT;
      B2blk = Mjoin(PATL,col2blk_a1);
   }
   else if (TB == AtlasConjTrans)
   {
      incB = NB2;
      B2blk = Mjoin(PATL,row2blkC_a1);
   }
   else
   {
      incB = NB2;
      B2blk = Mjoin(PATL,row2blkT_a1);
   }
   incC = m SHIFT;
   pA = pB + (incK SHIFT);

   do
   {
      if (TA == AtlasNoTrans) A2blk(m, K, A, lda, pA, alpha);
      else A2blk(K, m, A, lda, pA, alpha);
      Mjoin(PATL,mmJIK2)(K, n, nNb, nKb, ib2, jb, kb, alpha, pA, B, ldb, pB,
                         incB, B2blk, beta, C, ldc, gescal, NBmm0);
      M -= m;
      nMb -= n;
      if (M <= m)
      {
         ib2 = ib;
         m = M;
         n = nMb;
      }
      C += incC;
      A += incA;
   } while (M);

   free(vB);
   if (vC) free(vC);
   return(0);
}
int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                        const int M0, const int N, const int K,
                        const SCALAR alpha, const TYPE *A, const int lda,
                        const TYPE *B, const int ldb, const SCALAR beta,
                        TYPE *C, const int ldc)
/*
 * Copy matmul algorithm, copies A and B on-the-fly
 * If M < 0, allocates only (MB+NB)*KB workspace
 */
{
   void *v=NULL;
   const TYPE *a=A;
   TYPE *pA, *pB, *pB0;
   MAT2BLK2 A2blk, B2blk;
   NBMM0 NBmm0, NBmm1, pNBmm0;
   const int M = (M0 >= 0) ? M0 : -M0;
   int nkblks, nmblks, nnblks, mr, nr, kr, KR, bigK, h, i, j, ZEROC;
   size_t incAk, incBk, incAm, incBn, incAW, incAWp, incBW, incBWp, incW;
/*
 * If both M and N <= NB, and one of them is not full, call BPP, which
 * can sometimes avoid doing cleanup for all cases
 */
   if (M <= MB && N <= NB && (M != MB || N != NB))
      return(Mjoin(PATL,mmBPP)(TA, TB, M, N, K, alpha, A, lda, B, ldb,
                               beta, C, ldc));
/*
 * If these workspace increments are 0, we do JIT NBxNB copies instead of
 * copying entire array/panel.  Don't copy mat if you can't reuse it.
 */
   if (M0 > 0)
   {
      incAW = (N > NB) ? KB*MB : 0;
      incBW = (M > NB) ? KB*NB : 0;
   }
   else   /* allocate in minimal space */
      incAW = incBW = 0;

   nmblks = M/MB;
   nnblks = N/NB;
   nkblks = K/KB;
   mr = M - nmblks*MB;
   nr = N - nnblks*NB;
   kr = K - nkblks*KB;
/*
 * K-loop is special, in that we don't call user cleanup, must explicitly zero,
 * and K-cleanup is typically slower even for generated kernels.  Therefore,
 * allow extra leeway for doing extra flops.  Note error is unaffected by
 * any of these extra flops: K-loop has elts zeroed, and multiplying zeros
 * and adding in zeros doesn't add to error
 */
   KR = (kr && kr+4 >= KB) ? KB : kr;
   bigK = nkblks*KB+KR;
   if (incAW)
   {
      i = MB*bigK;
      incAWp = KB*mr;
   }
   else
   {
      i = MB*KB;
      incAWp = 0;
   }
   if (incBW)
   {
      incBWp = KB*nr;
      incW = bigK*NB;
      i += N*bigK;
   }
   else
   {
      incBWp = incW = 0;
      i += NB*KB;
   }
   i *= sizeof(TYPE);
   if (i <= ATL_MaxMalloc || !(incAW | incBW)) v = malloc(ATL_Cachelen+i);
   if (!v) return(-1);
   pA = ATL_AlignPtr(v);
   pB0 = pA + (incAW ? bigK*MB : KB*MB);

   if (TA == AtlasNoTrans)
   {
      A2blk = Mjoin(PATL,gemoveT);
      incAk = lda*KB;
      incAm = MB;
   }
   else
   {
      A2blk = Mjoin(PATL,gemove);
      incAk = KB;
      incAm = MB*lda;
   }
   if (TB == AtlasNoTrans)
   {
      B2blk = Mjoin(PATL,gemove);
      incBk = KB;
      incBn = NB*ldb;
   }
   else
   {
      B2blk = Mjoin(PATL,gemoveT);
      incBk = ldb*KB;
      incBn = NB;
   }
/*
 * See what kernel we're calling
 */
   if ( SCALAR_IS_ONE(beta) )
   {
      NBmm0 = NBmm_b1;
      pNBmm0 = Mjoin(PATL,pNBmm_b1);
   }
   else if ( SCALAR_IS_ZERO(beta) )
   {
      NBmm0 = NBmm_b0;
      pNBmm0 = Mjoin(PATL,pNBmm_b0);
   }
   else
   {
      NBmm0 = NBmm_bX;
      pNBmm0 = Mjoin(PATL,pNBmm_bX);
   }
   KR = (KR == KB) ? KB : 0;
   ZEROC = !KR && SCALAR_IS_ZERO(beta);

   for (i=0; i < nmblks; i++)
   {
      a = A+i*incAm;
      pB = pB0;        /* foreach row-panel of A, start at B's copy space */
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(MB, MB, NB, NB, nkblks, kr, KR, ATL_rone, alpha,
                         beta, a, lda, incAk, pA, incAW, B, ldb, incBk, pB,
                         incBW, C, ldc, A2blk, B2blk, NBmm0, NBmm_b1);
         B += incBn;               /* copy next col panel of B */
         pB += incW;               /* to next col panel of pB */
         a = (incAW ? NULL : a);   /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)
      {
         if (ZEROC) Mjoin(PATL,gezero)(MB, nr, C, ldc);
         Mjoin(PATL,mmK)(MB, MB, nr, nr, nkblks, kr, KR, ATL_rone, alpha,
                         beta, a, lda, incAk, pA, incAW, B, ldb, incBk, pB,
                         incBWp, C, ldc, A2blk, B2blk, pNBmm0,
                         Mjoin(PATL,pNBmm_b1));
      }
      C += MB - nnblks*ldc*NB;
      if (incBW)
      {
         B = NULL;       /* finished copying B */
         incBn = 0;
      }
      else B -= nnblks*incBn;
   }
   if (mr)
   {
      a = A + nmblks*incAm;
      pB = pB0;
      if ( SCALAR_IS_ONE(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b1);
      else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b0);
      else NBmm0 = Mjoin(PATL,pMBmm_bX);
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(mr, mr, NB, NB, nkblks, kr, KR, ATL_rone, alpha,
                         beta, a, lda, incAk, pA, incAWp, B, ldb, incBk, pB,
                         incBW, C, ldc, A2blk, B2blk, NBmm0,
                         Mjoin(PATL,pMBmm_b1));
         B += incBn;               /* copy next col panel of B */
         pB += incW;               /* to next col panel of pB */
         a = (incAW ? NULL : a);   /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)
      {
         if ( SCALAR_IS_ZERO(beta) ) Mjoin(PATL,gezero)(mr, nr, C, ldc);
         Mjoin(PATL,mmK)(mr, mr, nr, nr, nkblks, kr, (incAW | incBW) ? KR:0,
                         ATL_rone, alpha, beta, a, lda, incAk, pA, incAWp, B,
                         ldb, incBk, pB, incBWp, C, ldc, A2blk, B2blk,
                         Mjoin(PATL,pKBmm), Mjoin(PATL,pKBmm));
      }
   }
   free(v);
   return(0);
}
void Mjoin(PATL,mm_axpy) (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M, const int N, const int K, const SCALAR alpha, const TYPE *A, const int lda0, const TYPE *B, const int ldb0, const SCALAR beta, TYPE *C, const int ldc0) /* * TA == AtlasNoTrans * GEMM implemented by calling axpy, with any M partitioning already done */ { int i, j, k, incBk, incBn; const size_t lda=((size_t)lda0)+lda0, ldc=((size_t)ldc0)+ldc0, incAn = lda*K; const int ALONE=SCALAR_IS_ONE(alpha), BEONE=SCALAR_IS_ONE(beta); TYPE alph[2], BC[2]; register TYPE r1, r2, i1, i2; if (TB == AtlasNoTrans) { incBk = 2; incBn = ldb0 - K; } else { incBk = ldb0+ldb0; incBn = 1 - ldb0*K; } incBn += incBn; if (TB == AtlasConjTrans) { if (BEONE) { if (ALONE) { for(j=0; j < N; j++) { for (k=0; k < K; k++, B += incBk, A += lda) { BC[0] = B[0]; BC[1] = -B[1]; Mjoin(PATL,axpy)(M, BC, A, 1, C, 1); } C += ldc; B += incBn; A -= incAn; } } else { for(j=0; j < N; j++) { BC[0] = B[0]; BC[1] = -B[1]; ATL_cplxmul(alph, alpha, BC); Mjoin(PATL,axpby)(M, alph, A, 1, beta, C, 1); B += incBk; A += lda; for (k=1; k < K; k++, B += incBk, A += lda) { BC[0] = B[0]; BC[1] = -B[1]; ATL_cplxmul(alph, alpha, BC); Mjoin(PATL,axpy)(M, alph, A, 1, C, 1); } C += ldc; B += incBn; A -= incAn; } } } else /* BETA != 1.0 */ { if (ALONE) { for(j=0; j < N; j++) { BC[0] = B[0]; BC[1] = -B[1]; Mjoin(PATL,axpby)(M, BC, A, 1, beta, C, 1); B += incBk; A += lda; for (k=1; k < K; k++, B += incBk, A += lda) { BC[0] = B[0]; BC[1] = -B[1]; Mjoin(PATL,axpy)(M, BC, A, 1, C, 1); } C += ldc; B += incBn; A -= incAn; } } else { for(j=0; j < N; j++) { BC[0] = B[0]; BC[1] = -B[1]; ATL_cplxmul(alph, alpha, BC); Mjoin(PATL,axpby)(M, alph, A, 1, beta, C, 1); B += incBk; A += lda; for (k=1; k < K; k++, B += incBk, A += lda) { BC[0] = B[0]; BC[1] = -B[1]; ATL_cplxmul(alph, alpha, BC); Mjoin(PATL,axpy)(M, alph, A, 1, C, 1); } C += ldc; B += incBn; A -= incAn; } } } } else /* B is not conjugated */ { if (BEONE) { if (ALONE) { for(j=0; j < N; j++) { for (k=0; k < K; k++, B += incBk, A += lda) Mjoin(PATL,axpy)(M, B, A, 1, C, 1); C += ldc; B += incBn; A -= incAn; } } else { for(j=0; j < N; j++) { ATL_cplxmul(alph, alpha, B); Mjoin(PATL,axpby)(M, alph, A, 1, beta, C, 1); B += incBk; A += lda; for (k=1; k < K; k++, B += incBk, A += lda) { ATL_cplxmul(alph, alpha, B); Mjoin(PATL,axpy)(M, alph, A, 1, C, 1); } C += ldc; B += incBn; A -= incAn; } } } else /* BETA != 1.0 */ { if (ALONE) { for(j=0; j < N; j++) { Mjoin(PATL,axpby)(M, B, A, 1, beta, C, 1); B += incBk; A += lda; for (k=1; k < K; k++, B += incBk, A += lda) Mjoin(PATL,axpy)(M, B, A, 1, C, 1); C += ldc; B += incBn; A -= incAn; } } else { for(j=0; j < N; j++) { ATL_cplxmul(alph, alpha, B); Mjoin(PATL,axpby)(M, alph, A, 1, beta, C, 1); B += incBk; A += lda; for (k=1; k < K; k++, B += incBk, A += lda) { ATL_cplxmul(alph, alpha, B); Mjoin(PATL,axpy)(M, alph, A, 1, C, 1); } C += ldc; B += incBn; A -= incAn; } } } } }
int Mjoin(PATL,mmJKI)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
/*
 * This gemm is for small K, so we build gemm out of AXPY (outer product)
 * rather than dot (inner product).
 */
{
   int Mp, mp, m, k, ldaa=lda;
   void *vA=NULL;
   TYPE *pA;
   const TYPE CONE[2]={ATL_rone, ATL_rzero}, CNONE[2]={ATL_rnone, ATL_rzero};
   SCALAR alp=alpha;
/*
 * Compute M partition necessary to promote reuse in the L1 cache.  Check
 * NB^2 in addition to L1elts, to catch machines where L1 is not used by FPU.
 * If this gives a small Mp, use CacheEdge instead (reuse in L2 instead of L1).
 */
   Mp = NB*NB;
   m = ATL_L1elts >> 1;
   Mp = (m > Mp) ? m : Mp;
   Mp /= ((K+2)<<1);
   if (Mp < 128)
   {
      #if !defined(CacheEdge) || CacheEdge == 0
         Mp = M;
      #else
         Mp = (CacheEdge) / ((K+2)*ATL_sizeof);
         if (Mp < 128)
            Mp = M;
      #endif
   }
   if (Mp > M)
      Mp = M;
/*
 * Change Mp if remainder is very small
 */
   else
   {
      Mp -= 16;   /* small safety margin on filling cache */
      mp = M / Mp;
      m = M - mp*Mp;
      if (m && m < 32)
         Mp += (m+mp-1)/mp;
   }
/*
 * If A not in NoTrans format, need to copy so it can use axpy wt stride=1.
 * NOTE: this routine should not be called when you can't afford this copy
 */
   if (TA != AtlasNoTrans)
   {
      vA = malloc(ATL_Cachelen + Mp*ATL_MulBySize(K));
      if (!vA)
         return(-1);
      pA = ATL_AlignPtr(vA);
      alp = CONE;
      ldaa = Mp;
      pA += Mp+Mp;
   }
   else
      pA = (TYPE *) A;

   for (m=0; m < M; m += Mp)
   {
      mp = M - m;
      if (mp > Mp)
         mp = Mp;
/*
 *    If the thing is in Trans format, copy to NoTrans
 */
      if (vA)
      {
         pA -= (Mp+Mp);
         if (TA == AtlasConjTrans)
         {
            for (k=0; k < K; k++)
            {
               Mjoin(PATL,copy)(mp, A+k+k, lda, pA+((k*ldaa)<<1), 1);
               Mjoin(PATLU,scal)(mp, ATL_rnone, pA+1+((k*ldaa)<<1), 2);
               if (!SCALAR_IS_ONE(alpha))
                  Mjoin(PATL,scal)(mp, alpha, pA+((k*ldaa)<<1), 1);
            }
         }
         else
         {
            for (k=0; k < K; k++)
               Mjoin(PATL,cpsc)(mp, alpha, A+k+k, lda, pA+((k*ldaa)<<1), 1);
         }
         A += mp*(lda+lda);
      }
      Mjoin(PATL,mm_axpy)(AtlasNoTrans, TB, mp, N, K, alp, pA, ldaa,
                          B, ldb, beta, C, ldc);
      pA += mp+mp;
      C += mp+mp;
   }
   if (vA)
      free(vA);
   return(0);
}
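/*
 * Illustrative sketch (not ATLAS code): the AXPY (outer-product) formulation
 * of GEMM used by the two routines above, shown for real data and
 * no-transpose inputs.  Column j of C is built as a sum of K scaled columns
 * of A, so every access is stride-1, which is attractive when K is small.
 */
static void gemm_by_axpy(int M, int N, int K, double alpha,
                         const double *A, int lda, const double *B, int ldb,
                         double beta, double *C, int ldc)
{
   int i, j, k;
   for (j=0; j < N; j++)
   {
      for (i=0; i < M; i++)        /* apply beta to column j of C */
         C[i+j*(long)ldc] *= beta;
      for (k=0; k < K; k++)        /* axpy: C(:,j) += (alpha*B(k,j))*A(:,k) */
      {
         const double scal = alpha*B[k+j*(long)ldb];
         for (i=0; i < M; i++)
            C[i+j*(long)ldc] += scal*A[i+k*(long)lda];
      }
   }
}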
int Mjoin(PATL,tsyrk_amm_K)
   ( const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, ATL_CINT N,
     ATL_CINT K, const SCALAR alpha, const TYPE *A, ATL_CINT lda,
     const SCALAR beta, TYPE *C, ATL_CINT ldc )
{
   amminfo_t mminfo;
   ATL_tsyrk_ammK_t pd;
   ablk2cmat_t Mjoin(PATL,tGetSyammInfo_K)
      (amminfo_t *out, const int P, enum ATLAS_TRANS TA, ATL_CSZT N,ATL_CSZT K);
   int kb=ATL_AMM_MAXKB, nkb = K / ATL_AMM_MAXKB, P = ATL_NTHREADS;
   int ku, kr, mb, nb, mu, nu;
   size_t sz;
   void *vp=NULL;

   if (nkb < P)
   {
      kb = ATL_AMM_98KB;
      nkb = K / ATL_AMM_98KB;
      if (nkb < P)
      {
         nkb = K / ATL_AMM_66KB;
         kb = ATL_AMM_66KB;
      }
   }
   if (nkb < P)
   {
      if (nkb < 2)
      {
         Mjoin(PATL,syrk)(Uplo, Trans, N, K, alpha, A, lda, beta, C, ldc);
         return(0);
      }
      P = nkb;
   }
   pd.blk2c_b0 = Mjoin(PATL,tGetSyammInfo_K)(&mminfo, P, Trans, N, kb);
   kb = mminfo.kb;
   nkb = K / kb;
   mu = mminfo.mu;
   nu = mminfo.nu;
   pd.nmu = (N+mu-1) / mu;
   pd.nnu = (N+nu-1) / nu;
   pd.mb = mb = pd.nmu*mu;
   pd.nb = nb = pd.nnu*nu;
   pd.kb = mminfo.kb;
   sz = ((((size_t)mb)*nb)<<1) + (mb+nb)*kb;
   pd.wsz = sz;
   sz = ATL_MulBySize(sz)*P;
   vp = malloc(sz+ATL_Cachelen);
   if (!vp)
      return(1);
   pd.w = ATL_AlignPtr(vp);
   kr = K - nkb*kb;
   pd.kb0 = pd.KB0 = kr;
   ku = mminfo.ku;
   if (!kr)
   {
      pd.kb0 = pd.KB0 = kb;
      pd.ammK_b0 = mminfo.amm_b0;
      pd.ammK_b1 = mminfo.amm_b1;
   }
   else
   {
      #if ATL_AMM_MAXKMAJ > 1
      if (ATL_AMMFLG_KMAJOR(mminfo.flag))
      {
         pd.KB0 = ((kr+ku-1)/ku)*ku;
         if (ATL_AMMFLG_KRUNTIME(mminfo.flag))
         {
            pd.ammK_b0 = mminfo.amm_b0;
            pd.ammK_b1 = mminfo.amm_b1;
         }
         else
         {
            pd.ammK_b0 = mminfo.amm_k1_b0;
            pd.ammK_b1 = mminfo.amm_k1_b1;
         }
      }
      else
      #endif
      {
         if (ATL_AMMFLG_KRUNTIME(mminfo.flag) && kr == (kr/ku)*ku &&
             kr > mminfo.kbmin)
         {
            pd.ammK_b0 = mminfo.amm_b0;
            pd.ammK_b1 = mminfo.amm_b1;
         }
         else
         {
            pd.ammK_b0 = mminfo.amm_k1_b0;
            pd.ammK_b1 = mminfo.amm_k1_b1;
         }
      }
   }
   pd.amm_b0 = mminfo.amm_b0;
   pd.amm_b1 = mminfo.amm_b1;
   pd.blk2c_b1 = mminfo.Cblk2cm;
   pd.a2blk = mminfo.a2blk;
   pd.b2blk = mminfo.b2blk;
   pd.A = A;
   pd.C = C;
   pd.alpha = &alpha;
   pd.beta = &beta;
   pd.nkblks = (kr) ? nkb+1 : nkb;
   pd.KbCtr = ATL_SetGlobalAtomicCount(ATL_EstNctr(pd.nkblks, P), pd.nkblks, 0);
   pd.Cmut = ATL_mutex_init();
   pd.BETA_APPLIED = SCALAR_IS_ONE(beta);
   pd.LOWER = (Uplo == AtlasLower);
   pd.TA = (Trans == AtlasTrans);
   pd.N = N;
   pd.lda = lda;
   pd.ldc = ldc;
// #define DEBUG1 1
#ifdef DEBUG1
   {
      ATL_LAUNCHSTRUCT_t ls;
      ATL_thread_t ts;
      ts.rank = 0;
      ts.P = 1;
      ls.opstruct = &pd;
      Mjoin(PATL,DoWork_syrk_amm_K)(&ls, &ts);
   }
#else
   ATL_goparallel(P, Mjoin(PATL,DoWork_syrk_amm_K), &pd,
                  Mjoin(PATL,CombSyrk_ammK));
#endif
/*
 * Answer is written back to rank0's workspace, extract it & write to C
 */
   {
      TYPE *wC = pd.w+kb*(mb+nb), *w = wC + mb*nb, *c = C;
/*
 *    Put it into block-major storage in w
 */
      pd.blk2c_b0(N, N, ATL_rone, wC, ATL_rzero, w, N);
/*
 *    Now copy out only upper or lower portion
 */
      if (pd.LOWER)
      {
         int j;
         for (j=0; j < N; j++, c += ldc+1, w += N+1)
            Mjoin(PATL,axpby)(N-j, alpha, w, 1, beta, c, 1);
      }
      else
      {
         int j;
         for (j=0; j < N; j++, c += ldc, w += N)
            Mjoin(PATL,axpby)(j+1, alpha, w, 1, beta, c, 1);
      }
   }
   free(vp);
   ATL_mutex_free(pd.Cmut);
   ATL_FreeGlobalAtomicCount(pd.KbCtr);
   return(0);
}
void Mjoin( PATL, ptgeadd ) ( const int M, const int N, const SCALAR ALPHA, const TYPE * A, const int LDA, const SCALAR BETA, TYPE * C, const int LDC ) { /* * Purpose * ======= * * Mjoin( PATL, ptgeadd ) adds an m-by-n matrix A to the matrix B. * * This is a multi-threaded version of the algorithm. * * Arguments * ========= * * PTYPE (input) const PT_MISC_TYPE_T * * On entry, PTYPE points to the data structure containing the * type information. * * NODE (input) const unsigned int * On entry, NODE specifies the current node number. * * THREADS (input) const unsigned int * On entry, THREADS specifies the number of threads to be used * for the current operation. * * ATTR (input) pthread_attr_t * * On entry, ATTR specifies the thread attribute object to be * used for the node functions to be threaded. * * NB (input) const int * On entry, NB specifies the blocksize to be used for the * problem size partitioning. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ pthread_attr_t attr; PT_TREE_T root = NULL; #ifdef TREAL TYPE alpha0 = (TYPE)(ALPHA), beta0 = (TYPE)(BETA); #endif void * alpha, * beta; /* .. * .. Executable Statements .. * */ if( ( M <= 0 ) || ( N <= 0 ) || ( SCALAR_IS_ZERO( ALPHA ) && SCALAR_IS_ONE( BETA ) ) ) return; #ifdef TREAL alpha = (void *)(&alpha0); beta = (void *)(&beta0); #else alpha = (void *)(ALPHA); beta = (void *)(BETA); #endif ATL_thread_init( &attr ); root = Mjoin( PATL, ptgeadd_nt )( ATL_NTHREADS, &attr, M, N, alpha, (void *)(A), LDA, beta, (void *)(C), LDC ); ATL_join_tree ( root ); ATL_free_tree ( root ); ATL_thread_exit( &attr ); /* * End of Mjoin( PATL, ptgeadd ) */ }
void Mjoin(PATL,tsymm) (const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda, const TYPE *B, ATL_CINT ldb, const SCALAR beta, TYPE *C, ATL_CINT ldc) { ATL_INT n, nblks, tblks, nr, minblks, extrablks, p, i, j; ATL_thread_t tp[ATL_NTHREADS]; ATL_TSYMM_t symms[ATL_NTHREADS]; ATL_LAUNCHSTRUCT_t ls; const TYPE *b; TYPE *c; static int nb=0; if (M < 1 || N < 1) return; if (SCALAR_IS_ZERO(alpha)) { if (!SCALAR_IS_ONE(beta)) Mjoin(PATL,gescal)(M, N, beta, C, ldc); return; } if (!nb) nb = Mjoin(PATL,GetNB()); if (Side == AtlasLeft) { nblks = N / nb; nr = N - nblks*nb; tblks = ((double)(M*N)) / ( (double)nb * nb ); p = (nblks+ATL_TSYMM_ADDP-1)/ATL_TSYMM_ADDP; if (p < ATL_NTHREADS) /* N not big enough to give blk to each proc */ { /* * If I can't split N, and M is the dominant cost, use recursion to * decompose symmetric matrix; parallelism will come from TGEMM calls */ if (M > (N<<(ATL_NTHRPOW2+2))) { ATL_tsymm_SYsplit(Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, nb); return; } } else p = ATL_NTHREADS; if (p < 2) goto SERIAL; /* * Distribute N over the processors */ b = B; c = C; minblks = nblks / p; extrablks = nblks - minblks*p; for (i=0; i < p; i++) { if (i < extrablks) n = (minblks+1)*nb; else if (i == extrablks) n = minblks*nb + nr; else n = minblks*nb; symms[i].A = A; symms[i].B = b; symms[i].alpha = SADD alpha; symms[i].beta = SADD beta; symms[i].C = c; symms[i].M = M; symms[i].N = n; symms[i].lda = lda; symms[i].ldb = ldb; symms[i].ldc = ldc; symms[i].side = Side; symms[i].uplo = Uplo; b = MindxT(b, ATL_MulBySize((size_t)ldb)*n); c = MindxT(c, ATL_MulBySize((size_t)ldc)*n); } for (; i < ATL_NTHREADS; i++) /* flag rest of struct as uninitialized */ symms[i].M = 0; } else /* Side == AtlasRight */ { nblks = M / nb; nr = M - nblks*nb; tblks = ((double)(M*N)) / ( (double)nb * nb ); p = (nblks+ATL_TSYMM_ADDP-1)/ATL_TSYMM_ADDP; if (p < ATL_NTHREADS) /* N not big enough to give blk to each proc */ { /* * If I can't split M, and N is the dominant cost, use recursion to * decompose symmetric matrix; parallelism will come from TGEMM calls */ if (N > (M<<(ATL_NTHRPOW2+2))) { ATL_tsymm_SYsplit(Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc, nb); return; } } else p = ATL_NTHREADS; if (p < 2) goto SERIAL; /* * Distribute M over the processors */ b = B; c = C; minblks = nblks / p; extrablks = nblks - minblks*p; for (i=0; i < p; i++) { if (i < extrablks) n = (minblks+1)*nb; else if (i == extrablks) n = minblks*nb + nr; else n = minblks*nb; symms[i].A = A; symms[i].B = b; symms[i].alpha = SADD alpha; symms[i].beta = SADD beta; symms[i].C = c; symms[i].M = n; symms[i].N = N; symms[i].lda = lda; symms[i].ldb = ldb; symms[i].ldc = ldc; symms[i].side = Side; symms[i].uplo = Uplo; b = MindxT(b, ATL_MulBySize((size_t)n)); c = MindxT(c, ATL_MulBySize((size_t)n)); } for (; i < ATL_NTHREADS; i++) /* flag rest of struct as uninitialized */ symms[i].M = 0; } if (p < 2) { SERIAL: Mjoin(PATL,symm)(Side, Uplo, M, N, alpha, A, lda, B, ldb, beta, C, ldc); return; } ATL_goparallel(p, Mjoin(PATL,DoWorkSYMM), symms, NULL); }
/* * This is special-case code that handles rank-2 update by calling GER2 */ int Mjoin(PATL,ammm_rk2) ( enum ATLAS_TRANS TA, enum ATLAS_TRANS TB, ATL_CSZT M, ATL_CSZT N, const SCALAR alpha, const TYPE *A, ATL_CSZT lda, const TYPE *B, ATL_CSZT ldb, const SCALAR beta, TYPE *C, ATL_CSZT ldc ) { #ifdef DREAL const int MB=512, NB=32; #else /* SREAL */ const int MB=512, NB=64; #endif void *vp; TYPE *x, *y, *w, *z; size_t j; ATL_CSZT incC = NB*ldc; /* * If beta is one, can handle by one call to ger2 */ if (SCALAR_IS_ONE(beta)) { if (TA == AtlasNoTrans) { if (TB == AtlasNoTrans) Mjoin(PATL,ger2)(M, N, alpha, A, 1, B, ldb, alpha, A+lda, 1, B+1, ldb, C, ldc); else Mjoin(PATL,ger2)(M, N, alpha, A, 1, B, 1, alpha, A+lda, 1, B+ldb, 1, C, ldc); } else if (TB == AtlasNoTrans) Mjoin(PATL,ger2)(M, N, alpha, A, lda, B, ldb, alpha, A+1, lda, B+1, ldb, C, ldc); else Mjoin(PATL,ger2)(M, N, alpha, A, lda, B, 1, alpha, A+1, lda, B+ldb, 1, C, ldc); return(0); } /* * Later on, do smart think like copy only MB/NB at a time, and don't copy * at all if vectors are contiguous, but right now, always do copy up-front * so loop does not have to worry about TA/TB; this is a O(N) cost in N^2 alg */ vp = malloc(2*ATL_MulBySize(M+N)+4*ATL_Cachelen); if (!vp) return(1); x = ATL_AlignPtr(vp); y = x + M; y = ATL_AlignPtr(y); w = y + N; w = ATL_AlignPtr(w); z = w + M; z = ATL_AlignPtr(z); if (TA == AtlasNoTrans) { Mjoin(PATL,copy)(M, A, 1, x, 1); Mjoin(PATL,copy)(M, A+lda, 1, w, 1); } else { Mjoin(PATL,copy)(M, A, lda, x, 1); Mjoin(PATL,copy)(M, A+1, lda, w, 1); } if (SCALAR_IS_ONE(alpha)) { if (TB == AtlasNoTrans) { Mjoin(PATL,copy)(N, B, ldb, y, 1); Mjoin(PATL,copy)(N, B+1, ldb, z, 1); } else { Mjoin(PATL,copy)(N, B, 1, y, 1); Mjoin(PATL,copy)(N, B+ldb, 1, z, 1); } } else { if (TB == AtlasNoTrans) { Mjoin(PATL,cpsc)(N, alpha, B, ldb, y, 1); Mjoin(PATL,cpsc)(N, alpha, B+1, ldb, z, 1); } else { Mjoin(PATL,cpsc)(N, alpha, B, 1, y, 1); Mjoin(PATL,cpsc)(N, alpha, B+ldb, 1, z, 1); } } for (j=0; j < N; j += NB, C += incC) { size_t i, nb = N-j; nb = (nb >= NB) ? NB : nb; for (i=0; i < M; i += MB) { size_t mb = M-i; mb = (mb >= MB) ? MB : mb; Mjoin(PATL,gescal)(mb, nb, beta, C+i, ldc); Mjoin(PATL,ger2)(mb, nb, ATL_rone, x+i, 1, y+j, 1, ATL_rone, w+i, 1, z+j, 1, C+i, ldc); } } free(vp); return(0); }
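/*
 * Illustrative sketch (not ATLAS code): the rank-2 update that GER2 performs
 * in the routine above, written as a reference loop for real data with unit
 * strides: C += alpha1*x*y^T + alpha2*w*z^T.  The blocked code above simply
 * scales a panel of C by beta first and then calls the update with unit
 * scalars.  The names here are assumptions made for illustration only.
 */
static void ger2_ref(int M, int N, double alpha1, const double *x,
                     const double *y, double alpha2, const double *w,
                     const double *z, double *C, int ldc)
{
   int i, j;
   for (j=0; j < N; j++)
      for (i=0; i < M; i++)
         C[i+j*(long)ldc] += alpha1*x[i]*y[j] + alpha2*w[i]*z[j];
}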
void Mjoin( PATL, sbmv )
(
   const enum ATLAS_UPLO      UPLO,
   const int                  N,
   const int                  K,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const int                  LDA,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, sbmv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n symmetric band matrix, with k super-diagonals.
 *
 * After handling the quick-return cases, this version forwards to the
 * reference implementation; the blocked path is not used here.  For a
 * more detailed description of the arguments of this function, see the
 * reference implementation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 * (these declarations are left over from the blocked algorithm and are
 * unused by the current code path)
 */
   void (*gbmv0)( const int, const int, const int, const int, const SCALAR,
                  const TYPE *, const int, const TYPE *, const int,
                  const SCALAR, TYPE *, const int );
   void (*gbmv1)( const int, const int, const int, const int, const SCALAR,
                  const TYPE *, const int, const TYPE *, const int,
                  const SCALAR, TYPE *, const int );
   void (*gbmvN)( const int, const int, const int, const int, const SCALAR,
                  const TYPE *, const int, const TYPE *, const int,
                  const SCALAR, TYPE *, const int );
#ifdef TREAL
   TYPE alphaY, beta0;
#define lda2 LDA
#define one  ATL_rone
#define zero ATL_rzero
   void * vx = NULL, * vy = NULL;
   TYPE * x, * y, * y00;
#else
   const TYPE one [2] = { ATL_rone,  ATL_rzero },
              zero[2] = { ATL_rzero, ATL_rzero };
   const TYPE * alphaY, * beta0;
   void * vx = NULL, * vy = NULL;
   TYPE * x, * y, * y00;
   const int lda2 = ( LDA SHIFT );
#endif
   int ian, ia, j, ja, jan, k, kb, kl, ku, ma, mb, mb1, n, na, nb;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

   if( SCALAR_IS_ZERO( ALPHA ) )
   {
      if( !( SCALAR_IS_ONE( BETA ) ) )
         Mjoin( PATL, scal )( N, BETA, Y, INCY );
      return;
   }
   Mjoin(PATL,refsbmv)(UPLO, N, K, ALPHA, A, LDA, X, INCX, BETA, Y, INCY);
/*
 * End of Mjoin( PATL, sbmv )
 */
}
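/*
 * Since this routine defers to the reference code, it is worth recalling the
 * band storage it consumes: for UPLO='U', element A(i,j) with i <= j of the
 * n x n symmetric matrix is kept at AB[k + i - j + j*ldab], so the diagonal
 * sits in row k of the band array.  The sketch below is a minimal standalone
 * reference in plain C (no ATLAS headers); sbmv_upper_ref is an illustrative
 * name, not an ATLAS entry point.
 */
#include <stdio.h>

/* y := alpha*A*x + beta*y for an n x n symmetric band matrix with k
 * super-diagonals in upper band storage (column-major AB, leading dim ldab) */
static void sbmv_upper_ref(int n, int k, double alpha, const double *AB,
                           int ldab, const double *x, double beta, double *y)
{
   int i, j;
   for (i=0; i < n; i++)
      y[i] *= beta;
   for (j=0; j < n; j++)
   {
      double temp1 = alpha*x[j], temp2 = 0.0;
      int i0 = (j-k > 0) ? j-k : 0;
      for (i=i0; i < j; i++)            /* strictly upper band + its mirror */
      {
         double aij = AB[k + i - j + (size_t)j*ldab];
         y[i]  += temp1 * aij;
         temp2 += aij * x[i];
      }
      y[j] += temp1 * AB[k + (size_t)j*ldab] + alpha*temp2;
   }
}

int main(void)
{  /* 4x4 symmetric tridiagonal (k=1): 2 on the diagonal, 1 off the diagonal */
   double AB[8] = {0,2, 1,2, 1,2, 1,2};   /* ldab = k+1 = 2; AB[0] unused */
   double x[4] = {1,1,1,1}, y[4] = {0,0,0,0};
   int i;
   sbmv_upper_ref(4, 1, 1.0, AB, 2, x, 0.0, y);
   for (i=0; i < 4; i++)
      printf("y[%d] = %g\n", i, y[i]);    /* expect 3 4 4 3 */
   return 0;
}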
int Mjoin(PATU,usergemm)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                         const int M, const int N, const int K,
                         const SCALAR alpha, const TYPE *A, const int lda0,
                         const TYPE *B, const int ldb0, const SCALAR beta,
                         TYPE *C, const int ldc0)
{
   TYPE *buffer, *a_buf;
#ifdef TREAL
#define lda lda0
#define ldb ldb0
#define ldc ldc0
#else
   const int lda = lda0<<1, ldb = ldb0<<1, ldc = ldc0<<1;
#endif
/*
 * NOTE: this function pointer is critical.  If you call goto's assembly
 * matmul directly, you get a seg fault for complex on the EV6, but it works
 * fine as long as you use a function pointer. -- RCW
 */
   int (*gemm_xx)(const int, const int, const int, const SCALAR,
                  const TYPE *, const int, const TYPE *, const int,
                  TYPE *, const int, TYPE *);

   if ( !SCALAR_IS_ONE(beta) )
      Mjoin(PATL,gescal)(M, N, beta, C, ldc0);
#ifdef ATL_OS_OSF1
   buffer = (TYPE *) mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   if (buffer == (TYPE *) MAP_FAILED)  /* mmap signals failure w/ MAP_FAILED */
      return(-1);
#else
   buffer = (TYPE *) malloc(BUFFER_SIZE);
   if (!buffer)
      return(-1);
#endif
   a_buf = (TYPE *)(((size_t)buffer + ALIGN_SIZE) & ~ALIGN_SIZE);

   if (TA == AtlasNoTrans)
   {
      if (TB == AtlasNoTrans)
         gemm_xx = Mjoin(PATU,gotogemm_nn);
#ifdef TCPLX
      else if (TB == AtlasConjTrans)
         gemm_xx = Mjoin(PATU,gotogemm_nc);
#endif
      else
         gemm_xx = Mjoin(PATU,gotogemm_nt);
   }
#ifdef TCPLX
   else if (TA == AtlasConjTrans)
   {
      if (TB == AtlasNoTrans)
         gemm_xx = Mjoin(PATU,gotogemm_cn);
      else if (TB == AtlasConjTrans)
         gemm_xx = Mjoin(PATU,gotogemm_cc);
      else
         gemm_xx = Mjoin(PATU,gotogemm_ct);
   }
#endif
   else
   {
      if (TB == AtlasNoTrans)
         gemm_xx = Mjoin(PATU,gotogemm_tn);
#ifdef TCPLX
      else if (TB == AtlasConjTrans)
         gemm_xx = Mjoin(PATU,gotogemm_tc);
#endif
      else
         gemm_xx = Mjoin(PATU,gotogemm_tt);
   }
   gemm_xx(M, N, K, alpha, A, lda, B, ldb, C, ldc, a_buf);

#ifdef ATL_OS_OSF1
   munmap((void*)buffer, BUFFER_SIZE);
#else
   free(buffer);
#endif
   return(0);
}
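/*
 * The a_buf computation above assumes ALIGN_SIZE acts as an "alignment minus
 * one" mask, so that (p + ALIGN_SIZE) & ~ALIGN_SIZE rounds p up to the next
 * boundary.  The standalone sketch below shows the same idiom with an
 * explicit 64-byte mask (ALIGN_MASK is an illustrative name, not an ATLAS
 * macro); if ALIGN_SIZE were instead the alignment itself, the expression
 * would need the usual "+ size - 1" / "~(size - 1)" form.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define ALIGN_MASK ((uintptr_t)63)   /* 64-byte alignment: mask = 64 - 1 */

int main(void)
{
   size_t bytes = 1024;
   void *raw = malloc(bytes + ALIGN_MASK);           /* slack for alignment */
   double *a;
   if (!raw)
      return 1;
   a = (double *)(((uintptr_t)raw + ALIGN_MASK) & ~ALIGN_MASK);
   printf("raw = %p  aligned = %p  (offset within 64B line = %lu)\n",
          raw, (void *)a, (unsigned long)((uintptr_t)a & ALIGN_MASK));
   free(raw);                                        /* free the raw pointer */
   return 0;
}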
void ATL_gemv(ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A,
              ATL_CINT lda, const TYPE *X, ATL_CINT incX, const SCALAR beta0,
              TYPE *Y, ATL_CINT incY)
/*
 * Y = alpha * A^H * X + beta*Y
 * For the conjugate transpose, first form x = conj(X) and y = A^T * conj(X),
 * then use axpbyConj to add this to the original Y in the operation
 * Y = beta*Y + alpha*conj(y) = beta*Y + alpha*conj(A^T * conj(X)), which is
 * Y = beta*Y + alpha * A^H * X.
 */
{
   ATL_mvkern_t mvtk, mvtk_b1, mvtk_b0;
   void *vp=NULL;
   TYPE *x = (TYPE*)X, *y = (TYPE*)Y;
   size_t t1, t2;
   ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1;
   int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX;
   int minM, minN;
   TYPE one[2] = {ATL_rone, ATL_rzero};
   TYPE Zero[2] = {ATL_rzero, ATL_rzero};
   TYPE *beta = (TYPE*) beta0;
   const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero);

   if (M < 1 || N < 1)           /* F77 BLAS doesn't scale in either case */
      return;
   if (SCALAR_IS_ZERO(alpha0))   /* No contrib from alpha*A*x */
   {
      if (!SCALAR_IS_ONE(beta0))
      {
         if (SCALAR_IS_ZERO(beta0))
            Mjoin(PATL,zero)(N, Y, incY);
         else
            Mjoin(PATL,scal)(N, beta, Y, incY);
      }
      return;
   }
/*
 * ATLAS's MVT kernels loop over M in the inner loop, which is bad news if M
 * is very small.  Call code that requires no copy of X & Y for these
 * degenerate cases.
 */
   if (M < 16)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
/*
 * Get mvtk kernel pointer along with any usage guidelines, and use the
 * optimized CacheElts to compute the correct blocking factor
 */
   mvtk_b1 = ATL_GetMVTKern(M, N, A, lda, &mvtk_b0, &mu, &nu, &minM, &minN,
                            &alignX, &ALIGNX2A, &alignY, &ForceNU,
                            &CacheElts);
/*
 * Set up to handle the case where the kernel requires N to be a multiple
 * of NU
 */
   if (ForceNU)
   {
      Nm = (N/nu)*nu;
      nr = N - Nm;
   }
   else
   {
      Nm = N;
      nr = 0;
   }
/*
 * For very small N, we can't afford the data copy, so call special case code
 */
   if (N < 4 || Nm < 1)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   if (CacheElts)
   {
      mb = (CacheElts - 2*nu) / (2*(nu+1));
      mb = (mb > mu) ? (mb/mu)*mu : M;
      mb = (mb > M) ? M : mb;
   }
   else
      mb = M;
   vp = malloc(ATL_MulBySize(mb+N) + 2*ATL_Cachelen);
/*
 * If we cannot allocate enough space to copy the vectors, give up and
 * call the simple loop-based implementation
 */
   if (!vp)
   {
      Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX,
                          beta0, Y, incY);
      return;
   }
   y = ATL_AlignPtr(vp);
   x = y + (N SHIFT);
   x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
   beta = Zero;
/*
 * In this step, we form y = A^T * conj(X)
 */
   mvtk = mvtk_b0;
   m = M;
   do
   {
      imb = Mmin(mb, m);
      Mjoin(PATL,copyConj)(imb, X, incX, x, 1);   /* x = conj(X) */
/*
 *    Call optimized kernel (can be restricted or general)
 */
      if (imb >= minM)
         mvtk(imb, Nm, A, lda, x, y);
      else
         Mjoin(PATL,mvtk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1);
/*
 *    Some kernels require N%NU == 0; if so, nr is the remainder, and we do
 *    the cleanup with axpy
 */
      if (nr)
         Mjoin(PATL,mvtk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT),
                                 lda, x, 1, beta, y+(Nm SHIFT), 1);
      beta = one;
      mvtk = mvtk_b1;
      A += imb SHIFT;
      X += (imb*incX) SHIFT;
      m -= imb;
      imb = Mmin(m, mb);
   }
   while (m);
/*
 * Given y = A^T * conj(X) from above, now do:
 *    Y = beta*Y + alpha*conj(y) = beta*Y + alpha*conj(A^T * conj(X)),
 * which is Y = beta*Y + alpha * A^H * X.
 */
   Mjoin(PATL,axpbyConj)(N, alpha0, y, 1, beta0, Y, incY);
   free(vp);
}
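/*
 * The whole routine above rests on the identity
 *    A^H * X = conj( A^T * conj(X) ),
 * which lets a transpose-only kernel plus two conjugation passes (copyConj
 * on input, axpbyConj on output) produce the conjugate-transpose result.
 * The standalone sketch below verifies the identity on a small column-major
 * matrix using C99 complex arithmetic; it does not use any ATLAS symbols.
 */
#include <stdio.h>
#include <complex.h>

int main(void)
{
   enum { M = 2, N = 3 };
   double complex A[M*N] = { 1+2*I, 3-1*I,     /* column 0 */
                             0+1*I, 2+0*I,     /* column 1 */
                            -1+1*I, 4+2*I };   /* column 2 */
   double complex x[M] = { 1+1*I, 2-3*I };
   int i, j;

   for (j=0; j < N; j++)
   {
      double complex h = 0.0, t = 0.0;
      for (i=0; i < M; i++)
      {
         h += conj(A[i + j*M]) * x[i];         /* direct A^H * x  */
         t += A[i + j*M] * conj(x[i]);         /* A^T * conj(x)   */
      }
      printf("col %d: direct = (% .1f,% .1f)  via conj = (% .1f,% .1f)\n",
             j, creal(h), cimag(h), creal(conj(t)), cimag(conj(t)));
   }
   return 0;   /* the two columns of output should match element-wise */
}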