void Mjoin( PATLF77WRAP, hemm ) ( F77_INTEGER * ISIDE, F77_INTEGER * IUPLO, F77_INTEGER * M, F77_INTEGER * N, TYPE * ALPHA, TYPE * A, F77_INTEGER * LDA, TYPE * B, F77_INTEGER * LDB, TYPE * BETA, TYPE * C, F77_INTEGER * LDC ) { /* * Purpose * ======= * * ATL_F77wrap_hemm performs one of the matrix-matrix operations * * C := alpha * A * B + beta * C, * * or * * C := alpha * B * A + beta * C, * * where alpha and beta are scalars, A is a Hermitian matrix and B and * C are m by n matrices. * * Notes * ===== * * This routine is an internal wrapper function written in C called by * the corresponding Fortran 77 user callable subroutine. It calls the * appropriate ATLAS routine performing the actual computation. * * This wrapper layer resolves the following portability issues: * * - the routines' name scheme translation imposed by the Fortran / C * compilers of your target computer, * - the translation of Fortran characters into the corresponding ATLAS * C enumerated type (in cooperation with the Fortran user callable * subroutine), * - the translation of Fortran integers into the corresponding proper * C native type; * * and the following ease-of-programming issue: * * - a pointer to the first entry of vector operands (when applicable) * is passed to the ATLAS computational routine even if the * corresponding input increment value is negative. This allows for * a more natural expression in C of the computation performed by * these ATLAS functions. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ Mjoin( PATL, hemm )( *ISIDE, *IUPLO, *M, *N, SVVAL ALPHA, A, *LDA, B, *LDB, SVVAL BETA, C, *LDC ); /* * End of Mjoin( PATLF77WRAP, hemm ) */ }
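/*
 * Illustrative sketch (hypothetical names, not part of ATLAS) of the
 * argument translation the wrapper above performs: Fortran passes every
 * argument by reference, so the wrapper dereferences the integer pointers
 * and treats the option flags, already mapped to small integers by the
 * Fortran-callable layer, as the corresponding C enumerated type before
 * invoking the computational routine.
 */
#include <stdio.h>

enum DEMO_SIDE {DemoLeft=111, DemoRight=112};    /* stand-in for ATLAS_SIDE */

static void demo_compute(enum DEMO_SIDE side, int m, int n)
{  /* stand-in for the ATLAS computational routine */
   printf("side=%d, m=%d, n=%d\n", (int)side, m, n);
}

void demo_f77wrap(int *ISIDE, int *M, int *N)    /* Fortran-style interface */
{
   demo_compute((enum DEMO_SIDE)*ISIDE, *M, *N);
}

int main(void)
{
   int iside = DemoLeft, m = 4, n = 3;
   demo_f77wrap(&iside, &m, &n);     /* Fortran passes addresses like this */
   return 0;
}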
int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M0, const int N, const int K, const SCALAR alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const SCALAR beta, TYPE *C, const int ldc) /* * Outer three loops for matmul with outer loop over columns of B */ { int M = M0; int nMb, nNb, nKb, ib, jb, kb, ib2, h, i, j, k, m, n, incA, incB, incC; int AlphaIsOne; const int incK = ATL_MulByNB(K); void *vB=NULL, *vC=NULL; TYPE *pA, *pB, *pC; const TYPE one[2] = {1.0,0.0}, zero[2] = {0.0,0.0}; MAT2BLK A2blk, B2blk; MATSCAL gescal; NBMM0 NBmm0; nMb = ATL_DivByNB(M); nNb = ATL_DivByNB(N); nKb = ATL_DivByNB(K); ib = M - ATL_MulByNB(nMb); jb = N - ATL_MulByNB(nNb); kb = K - ATL_MulByNB(nKb); pC = C; if (beta[1] == ATL_rzero) { gescal = NULL; if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1); else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0); else NBmm0 = Mjoin(PATL,CNBmm_bX); } else { NBmm0 = Mjoin(PATL,CNBmm_b1); gescal = Mjoin(PATL,gescal_bX); } /* * Special case for when what we are really doing is * C <- beta*C + alpha * A * A' or C <- beta*C + alpha * A' * A */ if ( A == B && M == N && TA != TB && (SCALAR_IS_ONE(alpha) || M <= NB) && TA != AtlasConjTrans && TB != AtlasConjTrans && lda == ldb ) { AlphaIsOne = SCALAR_IS_ONE(alpha); i = ATL_MulBySize(M * K); if (!AlphaIsOne && pC == C && !SCALAR_IS_ZERO(beta)) i += ATL_MulBySize(M*N); if (i <= ATL_MaxMalloc) vB = malloc(i + ATL_Cachelen); if (vB) { pA = ATL_AlignPtr(vB); if (TA == AtlasNoTrans) Mjoin(PATL,row2blkT2_a1)(M, K, A, lda, pA, alpha); else Mjoin(PATL,col2blk2_a1)(K, M, A, lda, pA, alpha); /* * Can't write directly to C if alpha is not one */ if (!AlphaIsOne) { if (SCALAR_IS_ZERO(beta)) h = ldc; else if (pC == C) { pC = pA + (M * K SHIFT); h = M; } else h = NB; Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, one, pA, NULL, ldb, pA, 0, NULL, zero, pC, h, Mjoin(PATL,gescal_b0), Mjoin(PATL,CNBmm_b0)); if (alpha[1] == ATL_rzero) Mjoin(PATL,gescal_bXi0)(M, N, alpha, pC, h); else Mjoin(PATL,gescal_bX)(M, N, alpha, pC, h); if (C != pC) { if (beta[1] == ATL_rzero) { if (*beta == ATL_rone) Mjoin(PATL,putblk_b1)(M, N, pC, C, ldc, beta); else if (*beta == ATL_rnone) Mjoin(PATL,putblk_bn1)(M, N, pC, C, ldc, beta); else if (*beta == ATL_rzero) Mjoin(PATL,putblk_b0)(M, N, pC, C, ldc, beta); else Mjoin(PATL,putblk_bXi0)(M, N, pC, C, ldc, beta); } else Mjoin(PATL,putblk_bX)(M, N, pC, C, ldc, beta); } } else Mjoin(PATL,mmJIK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, pA, NULL, ldb, pA, 0, NULL, beta, C, ldc, gescal, NBmm0); free(vB); if (vC) free(vC); return(0); } } i = ATL_Cachelen + ATL_MulBySize(M*K + incK); if (i <= ATL_MaxMalloc) vB = malloc(i); if (!vB) { if (TA != AtlasNoTrans && TB != AtlasNoTrans) return(1); if (ib) n = nMb + 1; else n = nMb; for (j=2; !vB; j++) { k = n / j; if (k < 1) break; if (k*j < n) k++; h = ATL_Cachelen + ATL_MulBySize((k+1)*incK); if (h <= ATL_MaxMalloc) vB = malloc(h); } if (!vB) return(-1); n = k; m = ATL_MulByNB(n); ib2 = 0; } else { n = nMb; m = M; ib2 = ib; } pB = ATL_AlignPtr(vB); if (TA == AtlasNoTrans) { incA = m SHIFT; if (alpha[1] == ATL_rzero) { if (*alpha == ATL_rone) A2blk = Mjoin(PATL,row2blkT2_a1); else A2blk = Mjoin(PATL,row2blkT2_aXi0); } else A2blk = Mjoin(PATL,row2blkT2_aX); } else if (TA == AtlasConjTrans) { incA = lda*m SHIFT; if (alpha[1] == ATL_rzero) { if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blkConj2_a1); else A2blk = Mjoin(PATL,col2blkConj2_aXi0); } else A2blk = Mjoin(PATL,col2blkConj2_aX); } else { incA = lda*m SHIFT; if (alpha[1] == 
ATL_rzero) { if (*alpha == ATL_rone) A2blk = Mjoin(PATL,col2blk2_a1); else A2blk = Mjoin(PATL,col2blk2_aXi0); } else A2blk = Mjoin(PATL,col2blk2_aX); } if (TB == AtlasNoTrans) { incB = ATL_MulByNB(ldb) SHIFT; B2blk = Mjoin(PATL,col2blk_a1); } else if (TB == AtlasConjTrans) { incB = NB2; B2blk = Mjoin(PATL,row2blkC_a1); } else { incB = NB2; B2blk = Mjoin(PATL,row2blkT_a1); } incC = m SHIFT; pA = pB + (incK SHIFT); do { if (TA == AtlasNoTrans) A2blk(m, K, A, lda, pA, alpha); else A2blk(K, m, A, lda, pA, alpha); Mjoin(PATL,mmJIK2)(K, n, nNb, nKb, ib2, jb, kb, alpha, pA, B, ldb, pB, incB, B2blk, beta, C, ldc, gescal, NBmm0); M -= m; nMb -= n; if (M <= m) { ib2 = ib; m = M; n = nMb; } C += incC; A += incA; } while (M); free(vB); if (vC) free(vC); return(0); }
void Mjoin(PATL,mmJIK2) (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb, const SCALAR alpha, const TYPE *pA0, const TYPE *B, int ldb, TYPE *pB0, int incB, MAT2BLK B2blk, const SCALAR beta, TYPE *C, int ldc, MATSCAL gescal, NBMM0 NBmm0) { const int incK = ATL_MulByNB(K)SHIFT, incC = ATL_MulByNB(ldc-nMb) SHIFT; const int ZEROC = ((gescal == NULL) && SCALAR_IS_ZERO(beta)); int i, j = nNb; const TYPE *pA=pA0; const TYPE rbeta = ( (gescal) ? ATL_rone : *beta ); TYPE *pB=pB0, *stB=pB0+(ATL_MulByNBNB(nKb)SHIFT); if (nNb) { do /* Loop over full column panels of B */ { if (B) { B2blk(K, NB, B, ldb, pB, alpha); B += incB; } if (nMb) { i = nMb; do /* loop over full row panels of A */ { if (gescal) gescal(NB, NB, beta, C, ldc); if (nKb) /* loop over full blocks in panels */ { NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, rbeta, C, ldc); pA += NBNB2; pB += NBNB2; if (nKb != 1) { do { NBmm_b1(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, C, ldc); pA += NBNB2; pB += NBNB2; } while (pB != stB); } if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, C, ldc); pA += ATL_MulByNB(kb)<<1; } } else if (kb) { if (ZEROC) Mjoin(PATL,gezero)(MB, NB, C, ldc); KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, rbeta, C, ldc); pA += ATL_MulByNB(kb)<<1; } pB = pB0; C += NB2; } while (--i); } if (ib) { if (gescal) gescal(ib, NB, beta, C, ldc); IBNBmm(ib, K, pA, pB, rbeta, C, ldc); } if (!B) { pB0 += incK; pB = pB0; stB += incK; } C += incC; pA = pA0; } while (--j); } if (jb) { if (B) B2blk(K, jb, B, ldb, pB, alpha); for (i=nMb; i; i--) { if (gescal) gescal(NB, jb, beta, C, ldc); NBJBmm(jb, K, pA, pB, rbeta, C, ldc); pA += incK; C += NB2; } if (ib) { if (gescal) gescal(ib, jb, beta, C, ldc); IBJBmm(ib, jb, K, pA, pB, rbeta, C, ldc); } } }
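/*
 * Illustrative sketch (plain real double precision, column-major, not an
 * ATLAS kernel) of the JIK blocking order used by the two routines above:
 * the outer loop walks NB-wide block columns of B/C, the middle loop walks
 * block rows of A/C, and the innermost loops accumulate over NB-wide panels
 * of the K dimension, scaling each C block by beta exactly once.
 */
#include <stddef.h>

#define DEMO_NB 4

/* C = beta*C + alpha*A*B; A is MxK, B is KxN, C is MxN, leading dim = rows */
static void demo_mmJIK(int M, int N, int K, double alpha, const double *A,
                       const double *B, double beta, double *C)
{
   int ib, jb, kb, i, j, k;
   for (jb = 0; jb < N; jb += DEMO_NB)             /* block cols of B and C */
      for (ib = 0; ib < M; ib += DEMO_NB)          /* block rows of A and C */
      {
         const int mb = (M-ib < DEMO_NB) ? M-ib : DEMO_NB;
         const int nb = (N-jb < DEMO_NB) ? N-jb : DEMO_NB;
         for (j = 0; j < nb; j++)                  /* apply beta to C block */
            for (i = 0; i < mb; i++)
               C[ib+i + (size_t)(jb+j)*M] *= beta;
         for (kb = 0; kb < K; kb += DEMO_NB)       /* accumulate over K     */
         {
            const int kk = (K-kb < DEMO_NB) ? K-kb : DEMO_NB;
            for (j = 0; j < nb; j++)
               for (i = 0; i < mb; i++)
               {
                  double c = 0.0;
                  for (k = 0; k < kk; k++)
                     c += A[ib+i + (size_t)(kb+k)*M] * B[kb+k + (size_t)(jb+j)*K];
                  C[ib+i + (size_t)(jb+j)*M] += alpha * c;
               }
         }
      }
}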
static TYPE trtritest(enum ATLAS_ORDER Order, enum ATLAS_UPLO Uplo, enum ATLAS_DIAG Diag, int CacheSize, int N, int lda, double *tim) { TYPE *A, *Acompare; int i; double t0, t1; TYPE normA, eps, resid; /*int ierr;*/ #ifdef TCPLX const TYPE one[2]={ATL_rone, ATL_rzero}; #else const TYPE one = ATL_rone; #endif eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*N); Acompare = malloc(ATL_MulBySize(lda)*N); if (A == NULL || Acompare == NULL) { if (A) free(A); if (Acompare) free(Acompare); return(-1); } t0 = ATL_flushcache(CacheSize); /* Create a random, diagonally dominant matrix with a magic value in unused places. The last number is just the random seed. */ trigen(Order, Uplo, Diag, N, A, lda, PADVAL, N*1029+lda); /* Create a backup to calculate the residual. This one has to be used as a full matrix, so it has zero fills and a correct diagonal. */ trigen(Order, Uplo, Diag, N, Acompare, lda, ATL_rzero, N*1029+lda); if (Diag==AtlasUnit) for (i=0; i < N; i++) Acompare[(i*(lda+1)) SHIFT] = ATL_rone; normA = trinrm1(Order,Uplo, Diag, N, A, lda); #ifdef DEBUG Mjoin(PATL,geprint)("A0", N, N, A, lda); #endif t0 = ATL_flushcache(-1); /* Calculate and time a solution */ t0 = time00(); test_trtri(Order, Uplo, Diag, N, A, lda); t1 = time00() - t0; *tim = t1; /* if (ierr != 0) { fprintf(stderr, "Return values != 0 : %d \n",ierr); return(9999.9999); }*/ t0 = ATL_flushcache(0); /* Introduce a padding error. */ /* A[(5+5*lda)SHIFT]=114.0; */ #ifdef DEBUG Mjoin(PATL,geprint)("L", N, N, A, lda); #endif ATL_checkpad(Order, Uplo, Diag, N, A, lda); /* Calculate A^{-1}*A */ cblas_trmm(Order,CblasLeft,Uplo,AtlasNoTrans,Diag, N,N,one,A,lda,Acompare,lda); #ifdef DEBUG Mjoin(PATL,geprint)("A^{-1}*A", N, N, Acompare, N); #endif /* Subtract the diagonal */ for (i=0; i < N; i++) Acompare[i*((lda+1) SHIFT)] -= ATL_rone; /* resid = trinrm1(Order, Uplo,AtlasNonUnit,N,Acompare,lda); fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid); */ resid = Mjoin(PATL,genrm1)(N, N, Acompare, lda); #ifdef DEBUG if (resid/(normA*eps*N) > 10.0) fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid); #endif resid /= (normA * eps * N); free(Acompare); free(A); return(resid); }
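/*
 * Sketch (plain real double precision, not ATLAS code) of the residual test
 * used by trtritest above: after the computed inverse is multiplied back
 * against the original matrix, the identity is subtracted and the 1-norm of
 * the remainder is scaled by ||A||_1, machine epsilon and N, so a correct
 * inverse yields a residual of order one.
 */
#include <stddef.h>
#include <float.h>
#include <math.h>

/* 1-norm (maximum absolute column sum) of a column-major N x N matrix */
static double demo_nrm1(int N, const double *A, int lda)
{
   double best = 0.0;
   int i, j;
   for (j = 0; j < N; j++)
   {
      double sum = 0.0;
      for (i = 0; i < N; i++)
         sum += fabs(A[i + (size_t)j*lda]);
      if (sum > best)
         best = sum;
   }
   return best;
}

/* AinvA holds A^{-1}*A on entry; normA is ||A||_1 of the original matrix */
static double demo_trtri_resid(int N, double *AinvA, int lda, double normA)
{
   int i;
   for (i = 0; i < N; i++)            /* subtract the identity in place */
      AinvA[i + (size_t)i*lda] -= 1.0;
   return demo_nrm1(N, AinvA, lda) / (normA * DBL_EPSILON * N);
}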
void Mjoin( PATL, trmv ) ( const enum ATLAS_UPLO UPLO, const enum ATLAS_TRANS TRANS, const enum ATLAS_DIAG DIAG, const int N, const TYPE * A, const int LDA, TYPE * X, const int INCX ) { /* * Purpose * ======= * * Mjoin( PATL, trmv ) performs one of the matrix-vector operations * * x := A * x, or x := conjg( A ) * x, or * * x := A'* x, or x := conjg( A' ) * x, * * where x is an n-element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix. * * This is a blocked version of the algorithm. For a more detailed des- * cription of the arguments of this function, see the reference imple- * mentation in the ATLAS/src/blas/reference directory. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ void * vx = NULL; TYPE * x; /* .. * .. Executable Statements .. * */ if( N == 0 ) return; if( INCX == 1 ) { x = X; } else { vx = (TYPE *)malloc( ATL_Cachelen + ATL_MulBySize( N ) ); ATL_assert( vx ); x = ATL_AlignPtr( vx ); Mjoin( PATL, copy )( N, X, INCX, x, 1 ); } #ifdef TREAL if( ( TRANS == AtlasNoTrans ) || ( TRANS == AtlasConj ) ) #else if( TRANS == AtlasNoTrans ) #endif { if( UPLO == AtlasUpper ) Mjoin( PATL, trmvUN )( DIAG, N, A, LDA, x ); else Mjoin( PATL, trmvLN )( DIAG, N, A, LDA, x ); } #ifdef TCPLX else if( TRANS == AtlasConj ) { if( UPLO == AtlasUpper ) Mjoin( PATL, trmvUC )( DIAG, N, A, LDA, x ); else Mjoin( PATL, trmvLC )( DIAG, N, A, LDA, x ); } #endif #ifdef TREAL else #else else if( TRANS == AtlasTrans ) #endif { if( UPLO == AtlasUpper ) Mjoin( PATL, trmvUT )( DIAG, N, A, LDA, x ); else Mjoin( PATL, trmvLT )( DIAG, N, A, LDA, x ); } #ifdef TCPLX else { if( UPLO == AtlasUpper ) Mjoin( PATL, trmvUH )( DIAG, N, A, LDA, x ); else Mjoin( PATL, trmvLH )( DIAG, N, A, LDA, x ); } #endif if( vx ) { Mjoin( PATL, copy )( N, x, 1, X, INCX ); free( vx ); } /* * End of Mjoin( PATL, trmv ) */ }
void Mjoin(PATL,ttrmm)(const enum ATLAS_SIDE side, const enum ATLAS_UPLO uplo, const enum ATLAS_TRANS TA, const enum ATLAS_DIAG diag, ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb) { ATL_TTRSM_t trsms[ATL_NTHREADS]; TYPE *b; ATL_INT n, nblks, minblks; double tblks; int nr, p, i, j, extrablks; static int nb=0; if (M < 1 || N < 1) return; if (SCALAR_IS_ZERO(alpha)) { Mjoin(PATL,gezero)(M, N, B, ldb); return; } /* * Distribute RHS over the processors */ if (!nb) nb = Mjoin(PATL,GetNB)(); if (side == AtlasLeft) { nblks = N/nb; nr = N - nblks*nb; tblks = ((double)(M*N)) / ( (double)nb * nb ); p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER; p = Mmin(p, ATL_NTHREADS); p = p ? p : 1; b = B; minblks = nblks / p; extrablks = nblks - minblks*p; for (i=0; i < p; i++) { if (i < extrablks) n = (minblks+1)*nb; else if (i == extrablks) n = minblks*nb + nr; else n = minblks*nb; trsms[i].A = A; trsms[i].M = M; trsms[i].N = n; trsms[i].lda = lda; trsms[i].ldb = ldb; trsms[i].B = b; trsms[i].alpha = SADD alpha; trsms[i].side = side; trsms[i].uplo = uplo; trsms[i].TA = TA; trsms[i].diag = diag; n *= (ldb << Mjoin(PATL,shift)); b = MindxT(b, n); } } else /* Side == AtlasRight */ { nblks = M/nb; nr = M - nblks*nb; tblks = (N/nb)*nblks; p = (tblks+ATL_TTRSM_XOVER-1)/ATL_TTRSM_XOVER; p = Mmin(p, ATL_NTHREADS); p = p ? p : 1; b = B; minblks = nblks / p; extrablks = nblks - minblks*p; for (i=0; i < p; i++) { if (i < extrablks) n = (minblks+1)*nb; else if (i == extrablks) n = minblks*nb + nr; else n = minblks*nb; trsms[i].A = A; trsms[i].M = n; trsms[i].N = N; trsms[i].lda = lda; trsms[i].ldb = ldb; trsms[i].B = b; trsms[i].alpha = SADD alpha; trsms[i].side = side; trsms[i].uplo = uplo; trsms[i].TA = TA; trsms[i].diag = diag; n <<= Mjoin(PATL,shift); b = MindxT(b, n); } } if (p < 2) { Mjoin(PATL,trmm)(side, uplo, TA, diag, M, N, alpha, A, lda, B, ldb); return; } for (; i < ATL_NTHREADS; i++) /* flag rest of struct as uninitialized */ trsms[i].B = NULL; ATL_goparallel(p, Mjoin(PATL,DoWorkTRMM), trsms, NULL); }
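/*
 * Standalone sketch (hypothetical demo, not the ATLAS struct setup) of the
 * block distribution used above: nblks full blocks of width nb plus nr
 * leftover columns are split over p workers; the first extrablks workers
 * take one extra block and the worker at index extrablks absorbs the nr
 * remainder columns.
 */
#include <stdio.h>

static void demo_split(int N, int nb, int p)
{
   const int nblks = N / nb, nr = N - nblks*nb;
   const int minblks = nblks / p, extrablks = nblks - minblks*p;
   int i, n, start = 0;
   for (i = 0; i < p; i++)
   {
      if (i < extrablks)       n = (minblks+1)*nb;
      else if (i == extrablks) n = minblks*nb + nr;
      else                     n = minblks*nb;
      printf("worker %d: columns [%d, %d)\n", i, start, start+n);
      start += n;
   }
}

int main(void)
{
   demo_split(1000, 48, 4);   /* example: 1000 columns, nb=48, 4 workers */
   return 0;
}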
void Mjoin( PATLF77WRAP, ger ) ( F77_INTEGER * M, F77_INTEGER * N, TYPE * ALPHA, TYPE * X, F77_INTEGER * INCX, TYPE * Y, F77_INTEGER * INCY, TYPE * A, F77_INTEGER * LDA ) { /* * Purpose * ======= * * ATL_F77wrap_ger performs the rank 1 operation * * A := alpha * x * y' + A, * * where alpha is a scalar, x is an m-element vector, y is an n-element * vector and A is an m by n matrix. * * Notes * ===== * * This routine is an internal wrapper function written in C called by * the corresponding Fortran 77 user callable subroutine. It calls the * appropriate ATLAS routine performing the actual computation. * * This wrapper layer resolves the following portability issues: * * - the routines' name scheme translation imposed by the Fortran / C * compilers of your target computer, * - the translation of Fortran characters into the corresponding ATLAS * C enumerated type (in cooperation with the Fortran user callable * subroutine), * - the translation of Fortran integers into the corresponding proper * C native type; * * and the following ease-of-programming issue: * * - a pointer to the first entry of vector operands (when applicable) * is passed to the ATLAS computational routine even if the * corresponding input increment value is negative. This allows for * a more natural expression in C of the computation performed by * these ATLAS functions. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ Mjoin( PATL, ger )( *M, *N, SVVAL ALPHA, W1N( M, X, INCX ), *INCX, W1N( N, Y, INCY ), *INCY, A, *LDA ); /* * End of Mjoin( PATLF77WRAP, ger ) */ }
void Mjoin( PATLF77WRAP, gbmv ) ( F77_INTEGER * ITRANS, F77_INTEGER * M, F77_INTEGER * N, F77_INTEGER * KL, F77_INTEGER * KU, TYPE * ALPHA, TYPE * A, F77_INTEGER * LDA, TYPE * X, F77_INTEGER * INCX, TYPE * BETA, TYPE * Y, F77_INTEGER * INCY ) { /* * Purpose * ======= * * ATL_F77wrap_gbmv performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or * * y := alpha*conjg( A' )*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an m * by n band matrix, with kl sub-diagonals and ku super-diagonals. * * Notes * ===== * * This routine is an internal wrapper function written in C called by * the corresponding Fortran 77 user callable subroutine. It calls the * appropriate ATLAS routine performing the actual computation. * * This wrapper layer resolves the following portability issues: * * - the routines' name scheme translation imposed by the Fortran / C * compilers of your target computer, * - the translation of Fortran characters into the corresponding ATLAS * C enumerated type (in cooperation with the Fortran user callable * subroutine), * - the translation of Fortran integers into the corresponding proper * C native type; * * and the following ease-of-programming issue: * * - a pointer to the first entry of vector operands (when applicable) * is passed to the ATLAS computational routine even if the * corresponding input increment value is negative. This allows for * a more natural expression in C of the computation performed by * these ATLAS functions. * * --------------------------------------------------------------------- */ /* .. * .. Executable Statements .. * */ if( (*ITRANS) == F77_INOTRAN ) { Mjoin( PATL, gbmv )( *ITRANS, *M, *N, *KL, *KU, SVVAL ALPHA, A, *LDA, W1N( N, X, INCX ), *INCX, SVVAL BETA, W1N( M, Y, INCY ), *INCY); } else { Mjoin( PATL, gbmv )( *ITRANS, *N, *M, *KL, *KU, SVVAL ALPHA, A, *LDA, W1N( M, X, INCX ), *INCX, SVVAL BETA, W1N( N, Y, INCY ), *INCY); } /* * End of Mjoin( PATLF77WRAP, gbmv ) */ }
/* * This is special-case code that handles the rank-2 update by calling GER2 */ int Mjoin(PATL,ammm_rk2) ( enum ATLAS_TRANS TA, enum ATLAS_TRANS TB, ATL_CSZT M, ATL_CSZT N, const SCALAR alpha, const TYPE *A, ATL_CSZT lda, const TYPE *B, ATL_CSZT ldb, const SCALAR beta, TYPE *C, ATL_CSZT ldc ) { #ifdef DREAL const int MB=512, NB=32; #else /* SREAL */ const int MB=512, NB=64; #endif void *vp; TYPE *x, *y, *w, *z; size_t j; ATL_CSZT incC = NB*ldc; /* * If beta is one, can handle by one call to ger2 */ if (SCALAR_IS_ONE(beta)) { if (TA == AtlasNoTrans) { if (TB == AtlasNoTrans) Mjoin(PATL,ger2)(M, N, alpha, A, 1, B, ldb, alpha, A+lda, 1, B+1, ldb, C, ldc); else Mjoin(PATL,ger2)(M, N, alpha, A, 1, B, 1, alpha, A+lda, 1, B+ldb, 1, C, ldc); } else if (TB == AtlasNoTrans) Mjoin(PATL,ger2)(M, N, alpha, A, lda, B, ldb, alpha, A+1, lda, B+1, ldb, C, ldc); else Mjoin(PATL,ger2)(M, N, alpha, A, lda, B, 1, alpha, A+1, lda, B+ldb, 1, C, ldc); return(0); } /* * Later on, do smart things like copying only MB/NB at a time, and don't * copy at all if the vectors are contiguous; but right now, always do the * copy up-front so the loop does not have to worry about TA/TB; this is an * O(N) cost in an N^2 algorithm */ vp = malloc(2*ATL_MulBySize(M+N)+4*ATL_Cachelen); if (!vp) return(1); x = ATL_AlignPtr(vp); y = x + M; y = ATL_AlignPtr(y); w = y + N; w = ATL_AlignPtr(w); z = w + M; z = ATL_AlignPtr(z); if (TA == AtlasNoTrans) { Mjoin(PATL,copy)(M, A, 1, x, 1); Mjoin(PATL,copy)(M, A+lda, 1, w, 1); } else { Mjoin(PATL,copy)(M, A, lda, x, 1); Mjoin(PATL,copy)(M, A+1, lda, w, 1); } if (SCALAR_IS_ONE(alpha)) { if (TB == AtlasNoTrans) { Mjoin(PATL,copy)(N, B, ldb, y, 1); Mjoin(PATL,copy)(N, B+1, ldb, z, 1); } else { Mjoin(PATL,copy)(N, B, 1, y, 1); Mjoin(PATL,copy)(N, B+ldb, 1, z, 1); } } else { if (TB == AtlasNoTrans) { Mjoin(PATL,cpsc)(N, alpha, B, ldb, y, 1); Mjoin(PATL,cpsc)(N, alpha, B+1, ldb, z, 1); } else { Mjoin(PATL,cpsc)(N, alpha, B, 1, y, 1); Mjoin(PATL,cpsc)(N, alpha, B+ldb, 1, z, 1); } } for (j=0; j < N; j += NB, C += incC) { size_t i, nb = N-j; nb = (nb >= NB) ? NB : nb; for (i=0; i < M; i += MB) { size_t mb = M-i; mb = (mb >= MB) ? MB : mb; Mjoin(PATL,gescal)(mb, nb, beta, C+i, ldc); Mjoin(PATL,ger2)(mb, nb, ATL_rone, x+i, 1, y+j, 1, ATL_rone, w+i, 1, z+j, 1, C+i, ldc); } } free(vp); return(0); }
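/*
 * Small numerical sketch (plain real C, not the ATLAS ger2 kernel) of the
 * identity exploited above: a GEMM with K=2 is exactly the sum of two
 * rank-1 updates, C = beta*C + alpha*(a1*b1^T) + alpha*(a2*b2^T), where
 * a1, a2 are the two columns of op(A) and b1, b2 the two rows of op(B).
 */
#include <stddef.h>

static void demo_rk2(int M, int N, double alpha,
                     const double *a1, const double *a2,  /* columns of A */
                     const double *b1, const double *b2,  /* rows of B    */
                     double beta, double *C, int ldc)     /* column-major */
{
   int i, j;
   for (j = 0; j < N; j++)
      for (i = 0; i < M; i++)
         C[i + (size_t)j*ldc] = beta*C[i + (size_t)j*ldc]
                              + alpha*(a1[i]*b1[j] + a2[i]*b2[j]);
}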
void ATL_gemv (ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY) /* * Y = alpha * A^H * X + beta*Y (conjugate-transpose GEMV). * For the conjugate transpose, first form x = conj(X), y = A^T * conj(X), * then use axpbyConj to add this to the original Y in the operation * Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * X), which is * Y = beta*Y + alpha * A^H * X. */ { ATL_mvkern_t mvtk, mvtk_b1, mvtk_b0; void *vp=NULL; TYPE *x = (TYPE*)X, *y = (TYPE*)Y; size_t t1, t2; ATL_INT m, Nm, nr, CacheElts, mb, imb, incy=1; int mu, nu, alignX, alignY, ALIGNX2A, ForceNU, COPYX, COPYY, APPLYALPHAX; int minM, minN; TYPE one[2] = {ATL_rone, ATL_rzero}; TYPE Zero[2] = {ATL_rzero, ATL_rzero}; TYPE *beta = (TYPE*) beta0; const int ALPHA_IS_ONE = (alpha0[0] == ATL_rone && alpha0[1] == ATL_rzero); if (M < 1 || N < 1) /* F77 BLAS doesn't scale in either case */ return; if (SCALAR_IS_ZERO(alpha0)) /* No contrib from alpha*A*x */ { if (!SCALAR_IS_ONE(beta0)) { if (SCALAR_IS_ZERO(beta0)) Mjoin(PATL,zero)(N, Y, incY); else Mjoin(PATL,scal)(N, beta, Y, incY); } return; } /* * ATLAS's MVT kernels loop over M in inner loop, which is bad news if M is * very small. Call code that requires no copy of X & Y for these degenerate * cases */ if (M < 16) { Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX, beta0, Y, incY); return; } /* * Get mvtk kernel pointer along with any usage guidelines, and use the * optimized CacheElts to compute the correct blocking factor */ mvtk_b1 = ATL_GetMVTKern(M, N, A, lda, &mvtk_b0, &mu, &nu, &minM, &minN, &alignX, &ALIGNX2A, &alignY, &ForceNU, &CacheElts); /* * Set up to handle case where kernel requires N to be a multiple of NU */ if (ForceNU) { Nm = (N/nu)*nu; nr = N - Nm; } else { Nm = N; nr = 0; } /* * For very small N, we can't afford the data copy, so call special case code */ if (N < 4 || Nm < 1) { Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX, beta0, Y, incY); return; } if (CacheElts) { mb = (CacheElts - 2*nu) / (2*(nu+1)); mb = (mb > mu) ? (mb/mu)*mu : M; mb = (mb > M) ? M : mb; } else mb = M; vp = malloc(ATL_MulBySize(mb+N) + 2*ATL_Cachelen); /* * If we cannot allocate enough space to copy the vectors, give up and * call the simple loop-based implementation */ if (!vp) { Mjoin(PATL,refgemv)(AtlasConjTrans, N, M, alpha0, A, lda, X, incX, beta0, Y, incY); return; } y = ATL_AlignPtr(vp); x = y + (N SHIFT); x = (ALIGNX2A) ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x); beta = Zero; /* * In this step, we form y = A^T * conj(X) */ mvtk = mvtk_b0; m = M; do { imb = Mmin(mb, m); Mjoin(PATL,copyConj)(imb, X, incX, x, 1); /* x = conj(X) */ /* * Call optimized kernel (can be restricted or general) */ if (imb >= minM) mvtk(imb, Nm, A, lda, x, y); else Mjoin(PATL,mvtk_Mlt16)(imb, Nm, one, A, lda, x, 1, beta, y, 1); /* * Some kernels require N%NU=0; if so nr is remainder, do cleanup with axpy */ if (nr) Mjoin(PATL,mvtk_smallN)(imb, nr, one, A+((size_t)lda)*(Nm SHIFT), lda, x, 1, beta, y+(Nm SHIFT), 1); beta = one; mvtk = mvtk_b1; A += imb SHIFT; X += (imb*incX)SHIFT; m -= imb; imb = Mmin(m,mb); } while(m); /* * Given y = A^T * conj(X) from above, now do: * Y = beta*Y + alpha*conj(y) = beta*Y + alpha*(A^H * x), which is * Y = beta*Y + alpha * A^H * x. */ Mjoin(PATL,axpbyConj)(N, alpha0, y, 1, beta0, Y, incY); free(vp); }
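/*
 * Tiny C99-complex check (illustration only, hypothetical demo) of the
 * identity the routine above relies on: conj(A^T * conj(x)) == A^H * x,
 * which is why forming y = A^T*conj(x) with a transpose kernel and then
 * combining with axpbyConj reproduces Y = beta*Y + alpha*A^H*X.
 */
#include <complex.h>
#include <stdio.h>

int main(void)
{
   /* 2x2 column-major complex matrix and a test vector */
   double complex A[4] = {1+2*I, 3-1*I, -2+0.5*I, 4+4*I};
   double complex x[2] = {0.5-1*I, 2+3*I}, lhs[2], rhs[2];
   int i, k;
   for (i = 0; i < 2; i++)
   {
      double complex tt = 0, th = 0;
      for (k = 0; k < 2; k++)
      {
         tt += A[k + 2*i] * conj(x[k]);     /* (A^T * conj(x))_i */
         th += conj(A[k + 2*i]) * x[k];     /* (A^H * x)_i       */
      }
      lhs[i] = conj(tt);
      rhs[i] = th;
   }
   for (i = 0; i < 2; i++)                  /* the two results agree */
      printf("%g%+gi  vs  %g%+gi\n", creal(lhs[i]), cimag(lhs[i]),
             creal(rhs[i]), cimag(rhs[i]));
   return 0;
}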
/* * This routine handles N <= MAXN, K & M large (left-looking shape) */ int Mjoin(PATL,ammm_tN) ( enum ATLAS_TRANS TA, enum ATLAS_TRANS TB, ATL_CSZT M, ATL_CSZT N, ATL_CSZT K, const SCALAR alpha, const TYPE *A, ATL_CSZT lda, const TYPE *B, ATL_CSZT ldb, const SCALAR beta, TYPE *C, ATL_CSZT ldc ) { ablk2cmat_t blk2c; cm2am_t a2blk, b2blk; ammkern_t ammK0_b0, ammK0_b1, ammK0_bn, amm_b1, amm_bn; amminfo_t mminfo; const TYPE ONE[2] = {ATL_rone, ATL_rzero}; const TYPE *alpA=ONE, *alpB=ONE, *alpC=ONE; TYPE *rA, *iA, *pB0, *rC, *iC; int mu, nu, ku, nnu, NN, MB, NMU, KB, KB0, kb0, incBw, incBw0; size_t incAk0, incAk, mulAm, incBk0, incBk, nkb, k, nmblks, nsmblks, i, m; void *vp; ATL_assert(N <= ATL_AMM_MAXNB); mu = Mjoin(PATL,GetAmmmInfo)(&mminfo, TA, TB, M, N, K, alpha, beta); if (!mu) alpA = alpha; else if (mu == 1) alpB = alpha; else alpC = alpha; mu = mminfo.mu; nu = mminfo.nu; ku = mminfo.ku; MB = ATL_ComputeB(M, mu, MY_MAXMB, &nsmblks, &nmblks); NMU = MB / mu; nnu = (N+nu-1)/nu; KB = mminfo.kb; NN = nnu * nu; nkb = K/KB; /* * kb0: K remainder, KB0 is CEIL(kb0/ku)*ku for k-vector kerns, and * same as kb0 for M-vector kerns */ KB0 = kb0 = K - nkb*KB; if (!kb0) { KB0 = kb0 = KB; nkb--; } #if ATL_AMM_MAXKMAJ > 1 if (ATL_AMMFLG_KMAJOR(mminfo.flag)) { KB0 = ((kb0+ku-1)/ku)*ku; if (ATL_AMMFLG_KRUNTIME(mminfo.flag)) ammK0_b0 = mminfo.amm_b0; else ammK0_b0 = (mminfo.kb==KB0) ? mminfo.amm_b0 : mminfo.amm_k1_b0; } else #endif { ammK0_b0 = (kb0 == KB) ? mminfo.amm_b0 : mminfo.amm_k1_b0; if (ATL_AMMFLG_KRUNTIME(mminfo.flag) && (kb0/ku)*ku == kb0 && kb0 > mminfo.kbmin) ammK0_b0 = mminfo.amm_b0; } amm_b1 = mminfo.amm_b1; amm_bn = mminfo.amm_bn; if (ammK0_b0 == mminfo.amm_b0) { ammK0_b1 = mminfo.amm_b1; ammK0_bn = mminfo.amm_bn; } else { ammK0_b1 = mminfo.amm_k1_b1; ammK0_bn = mminfo.amm_k1_bn; } a2blk = mminfo.a2blk; b2blk = mminfo.b2blk; blk2c = mminfo.Cblk2cm; /* * Do memory allocation, setup pointers */ { const int szA = MB*KB, szC=MB*NN; size_t szB = (nkb*KB+KB0)*NN; const size_t tsz = ATL_MulBySize(szA+szB+szC+mu*nu*ku) + 3*ATL_Cachelen; if (tsz > ATL_MaxMalloc) return(2); vp = malloc(tsz); if (!vp) return(1); iA = ATL_AlignPtr(vp); rA = iA + szA; iC = rA + szA; iC = ATL_AlignPtr(iC); rC = iC + szC; pB0 = rC + szC; pB0 = ATL_AlignPtr(pB0); } if (TA == AtlasNoTrans) { incAk = (lda*KB)SHIFT; incAk0 = (lda*kb0)SHIFT; mulAm = 2; } else { incAk = KB SHIFT; incAk0 = kb0 SHIFT; mulAm = lda SHIFT; } if (TB == AtlasNoTrans) { incBk = KB SHIFT; incBk0 = kb0 SHIFT; } else { incBk = (KB*ldb)SHIFT; incBk0 = (kb0*ldb)SHIFT; } incBw0 = KB0*NN; incBw = KB*NN; for (m=M,i=0; i < nmblks; i++) { const TYPE *An; TYPE *iB=pB0, *rB=pB0+incBw0, *pBn=rB+incBw0; int mb, nmu, mm; if (i < nsmblks) { mb = MB-mu; nmu = NMU-1; } else { mb = MB; nmu = NMU; } mm = Mmin(m, mb); m -= mm; An = A + mm*mulAm; /* * Do first (possibly partial) K-block */ a2blk(kb0, mm, alpA, A, lda, rA, iA); A += incAk0; if (!i) { b2blk(kb0, N, alpB, B, ldb, rB, iB); B += incBk0; } rB = iB + incBw0; pBn = rB + incBw0; ammK0_b0(nmu, nnu, KB0, iA, iB, rC, rA, iB, iC); ammK0_b0(nmu, nnu, KB0, rA, iB, iC, rA, rB, rC); ammK0_bn(nmu, nnu, KB0, rA, rB, rC, iA, rB, iC); ammK0_b1(nmu, nnu, KB0, iA, rB, iC, rA, pBn, iC); iB = pBn; /* * Loop over all full-sized blocks */ for (k=0; k < nkb; k++) { a2blk(KB, mm, alpA, A, lda, rA, iA); A += incAk; rB = iB + incBw; if (!i) { b2blk(KB, N, alpB, B, ldb, rB, iB); B += incBk; } pBn = (k < nkb-1) ? 
rB+incBw : pB0; amm_bn(nmu, nnu, KB, iA, iB, rC, rA, iB, iC); amm_b1(nmu, nnu, KB, rA, iB, iC, rA, rB, rC); amm_bn(nmu, nnu, KB, rA, rB, rC, iA, rB, iC); amm_b1(nmu, nnu, KB, iA, rB, iC, rA, pBn, iC); iB = pBn; } blk2c(mm, N, alpC, rC, iC, beta, C, ldc); C += mm+mm; A = An; } free(vp); return(0); }
void Mjoin( PATL, tpmvUC ) ( const enum ATLAS_DIAG DIAG, const int N, /* N > 0 assumed */ const TYPE * A, const int LDA, TYPE * X ) { /* * Purpose * ======= * * Mjoin( PATL, tpmvUC ) performs the following matrix-vector operation * * x := conjg( A ) * x, * * where x is an n-element vector and A is an n by n unit or non-unit, * upper triangular matrix supplied in packed form. * * This is a blocked version of the algorithm. For a more detailed des- * cription of the arguments of this function, see the reference imple- * mentation in the ATLAS/src/blas/reference directory. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ #ifdef TREAL #define one ATL_rone #else TYPE one[2] = { ATL_rone, ATL_rzero }; #endif #ifdef ATL_AXPYMV TYPE * A0, * x0; int incX, lda, lda0, mb, mb1, n, nb; #else TYPE * x0; int incX, lda, m, mb, nb, nb1; #endif void (*tpmv0)( const int, const TYPE *, const int, TYPE * ); #define gpmv0 Mjoin( PATL, gpmvUNc_a1_x1_b1_y1 ) /* .. * .. Executable Statements .. * */ ATL_GetPartMVN( A, LDA, &mb, &nb ); if( DIAG == AtlasNonUnit ) tpmv0 = Mjoin( PATL, tpmvUCN ); else tpmv0 = Mjoin( PATL, tpmvUCU ); #ifdef ATL_AXPYMV mb1 = N - ( ( N - 1 ) / mb ) * mb; incX = (mb SHIFT); lda = lda0 = LDA; A0 = (TYPE *)(A); MUpnext( mb, A0, lda0 ); for( n = N - mb, x0 = X + incX; n > 0; n -= mb, X += incX, x0 += incX ) { tpmv0( mb, A, lda, X ); gpmv0( mb, n, one, A0 - incX, lda0, x0, 1, one, X, 1 ); lda = lda0; A = A0; MUpnext( mb, A0, lda0 ); } tpmv0( mb1, A, lda, X ); #else nb1 = N - ( ( N - 1 ) / nb ) * nb; incX = (nb SHIFT); x0 = X; lda = LDA; tpmv0( nb1, A, lda, X ); X += (nb1 SHIFT); MUpnext( nb1, A, lda ); for( m = nb1; m < N; m += nb, X += incX ) { gpmv0( m, nb, one, A - (m SHIFT), lda, X, 1, one, x0, 1 ); tpmv0( nb, A, lda, X ); MUpnext( nb, A, lda ); } #endif /* * End of Mjoin( PATL, tpmvUC ) */ }
void Mjoin(PATL,sprk_rK) (const enum PACK_UPLO UA, const enum PACK_TRANS TA, const enum ATLAS_UPLO UC, const int CP, const int N, const int K, int R, const SCALAR alpha, const TYPE *A, int lda, const SCALAR beta0, TYPE *C, const int ldc) /* * This routine does the packed symmetric rank-K update by doing ceil(K/R) * rank-R updates of C. This is primarily done for CacheEdge, but is also * useful to auto-reduce R until enough workspace can be allocated. */ { const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen); int k=0, kb, ierr; const int ldainc = (UA == AtlasUpper) ? 1 : ((UA == AtlasLower) ? -1 : 0); const int ldcinc = (UC2 == AtlasUpper) ? 1 : ((UC2 == AtlasLower) ? -1 : 0); #ifdef TREAL TYPE beta = beta0; #else TYPE beta[2]; *beta = *beta0; beta[1] = beta0[1]; #endif if (R < NB) R = NB<<4; if ((K - R) < 2*NB) R = K; do { kb = K - k; if (kb - R < 2*NB) R = kb; kb = Mmin(R, kb); /* * If we can't do the rank-R update, reduce R until we can, or R = 1 */ ierr = Mjoin(PATL,prk_kmm)(UC, UA, TA, N, kb, alpha, A, lda, beta, CP, C, ldc); if (ierr && R <= NB*8) { if (UC == AtlasUpper) { if (TA == AtlasNoTrans) ATL_rk_recUN(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc); else ATL_rk_recUT(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc); } else { if (TA == AtlasNoTrans) ATL_rk_recLN(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc); else ATL_rk_recLT(UA, TA, UC, CP, N, kb, alpha, A, lda, beta, C, ldc); } ierr = 0; } if (ierr) { R = Mmin(NB*8, R>>1); ATL_assert(R); } /* * Subsequent updates use beta = 1 */ else { #ifdef TREAL beta = ATL_rone; #else *beta = ATL_rone; beta[1] = ATL_rzero; #endif if (TA == AtlasNoTrans) { A += MindexP(UA, 0, R, lda); lda = Mpld(UA, R, lda); } else A += MindexP(UA, R, 0, lda); k += R; } } while (k < K); }
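/*
 * Plain-C sketch (real, unpacked, NoTrans case only; not the ATLAS routine)
 * of the rank-R panel idea used above: C = beta*C + alpha*A*A^T over an NxK
 * matrix A is computed as a sequence of rank-R updates on K-slices, with the
 * caller's beta applied only to the first slice and beta=1 thereafter.
 */
#include <stddef.h>

static void demo_sprk_rK(int N, int K, int R, double alpha, const double *A,
                         int lda, double beta, double *C, int ldc)
{
   int i, j, k, kk;
   for (k = 0; k < K; k += R)
   {
      const int kb = (K-k < R) ? K-k : R;
      for (j = 0; j < N; j++)
         for (i = 0; i <= j; i++)            /* upper triangle of C only */
         {
            double t = 0.0;
            for (kk = 0; kk < kb; kk++)
               t += A[i + (size_t)(k+kk)*lda] * A[j + (size_t)(k+kk)*lda];
            C[i + (size_t)j*ldc] = beta*C[i + (size_t)j*ldc] + alpha*t;
         }
      beta = 1.0;                             /* subsequent panels accumulate */
   }
}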
int clapack_zgels(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TA, ATL_CINT M, ATL_CINT N, ATL_CINT NRHS, void *A, ATL_CINT lda, void *B, const int ldb) /* * GELS solves overdetermined or underdetermined linear systems * involving an M-by-N matrix A, or its conjugate-transpose, using a QR * or LQ factorization of A. It is assumed that A has full rank. */ { int ierr = 0; if (Order != CblasRowMajor && Order != CblasColMajor) { ierr = -1; cblas_xerbla(1, "clapack_zgels", "Order must be %d or %d, but is set to %d.\n", CblasRowMajor, CblasColMajor, Order); } if (TA != AtlasNoTrans && TA != mytrans) { ierr = -2; cblas_xerbla(2, "clapack_zgels", "Trans must be %d or %d, but is set to %d.\n", CblasNoTrans, mytrans, TA); } if (M < 0) { ierr = -3; cblas_xerbla(3, "clapack_zgels", "M cannot be less than zero; it is set to %d.\n", M); } if (N < 0) { ierr = -4; cblas_xerbla(4, "clapack_zgels", "N cannot be less than zero; it is set to %d.\n", N); } if (NRHS < 0) { ierr = -5; cblas_xerbla(5, "clapack_zgels", "NRHS cannot be less than zero; it is set to %d.\n", NRHS); } if (lda < M || lda < 1) { ierr = -7; cblas_xerbla(7, "clapack_zgels", "lda must be >= MAX(M,1): lda=%d M=%d\n", lda, M); } if (ldb < Mmax(M,N) || ldb < 1) { ierr = -9; cblas_xerbla(9, "clapack_zgels", "ldb must be >= MAX(M,N,1): ldb=%d M=%d N=%d\n", ldb, M, N); } if (Order == CblasColMajor) ierr = ATL_zgels(TA, M, N, NRHS, A, lda, B, ldb, NULL, 0); else /* row-major array */ { enum CBLAS_TRANSPOSE TAr = (TA == AtlasNoTrans) ? mytrans : CblasNoTrans; TYPE *a=((TYPE*)A)+1; ATL_CINT lda2 = lda+lda; ATL_INT j; /* * Complex takes only the conjugate transpose, so conjugate entries before * calling with the reversed transpose setting */ for (j=0; j < N; j++, a += lda2) Mjoin(PATLU,scal)(N, ATL_rnone, a, 2); ierr = ATL_zgels(TAr, N, M, NRHS, A, lda, B, ldb, NULL, 0); } return(ierr); }
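/*
 * Hedged usage sketch for the column-major interface above: solving a small
 * overdetermined least-squares problem, assuming the usual LAPACK GELS
 * convention that B holds the M right-hand-side entries on input and the N
 * solution entries on output (so ldb must cover MAX(M,N)).  The cblas.h
 * include is assumed to provide the CBLAS_ORDER/CBLAS_TRANSPOSE enums.
 */
#include <stdio.h>
#include <cblas.h>

int clapack_zgels(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TA,
                  const int M, const int N, const int NRHS, void *A,
                  const int lda, void *B, const int ldb);

int main(void)
{
   /* 3x2 complex column-major matrix, interleaved (re,im) doubles:
      fit y = c0 + c1*t at t = 0, 1, 2 against y = 1, 2, 4 */
   double A[2*3*2] = {1,0, 1,0, 1,0,     /* column 0: all ones */
                      0,0, 1,0, 2,0};    /* column 1: t values */
   double B[2*3]   = {1,0, 2,0, 4,0};    /* RHS, room for MAX(M,N)=3 entries */
   int info = clapack_zgels(CblasColMajor, CblasNoTrans, 3, 2, 1, A, 3, B, 3);
   if (!info)   /* first N=2 complex entries of B now hold the LS solution */
      printf("c0 = %g%+gi, c1 = %g%+gi\n", B[0], B[1], B[2], B[3]);
   return info;
}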
void Mjoin(PATL,mmIJK2) (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb, const SCALAR alpha, const TYPE *A, const int lda, TYPE *pA0, const int incA, MAT2BLK A2blk, TYPE *pB0, const SCALAR beta, TYPE *C, int ldc, MATSCAL gescal, NBMM0 NBmm0) { const int incK = ATL_MulByNB(K)<<1; const int incCn = ATL_MulByNB(ldc)<<1, incCm = (MB<<1) - nNb*incCn; const int ZEROC = ((gescal == NULL) && SCALAR_IS_ZERO(beta)); int i, j, k; const TYPE *pB=pB0; const TYPE rbeta = ( (gescal) ? ATL_rone : *beta ); TYPE *pA=pA0; for (i=nMb; i; i--) { if (A) { A2blk(K, NB, A, lda, pA, alpha); /* get 1 row panel of A */ A += incA; } for (j=nNb; j; j--) { if (gescal) gescal(MB, NB, beta, C, ldc); if (nKb) { NBmm0(MB, NB, KB, ATL_rone, pA, KB, pB, KB, rbeta, C, ldc); pA += NBNB2; pB += NBNB2; if (nKb != 1) { for (k=nKb-1; k; k--, pA += NBNB2, pB += NBNB2) NBmm_b1(MB, NB, KB, ATL_rone, pA, KB, pB, KB, ATL_rone, C, ldc); } if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, ATL_rone, C, ldc); pB += ATL_MulByNB(kb)<<1; } } else { if (ZEROC) Mjoin(PATL,gezero)(MB, NB, C, ldc); if (kb) { KBmm(MB, NB, kb, ATL_rone, pA, kb, pB, kb, rbeta, C, ldc); pB += ATL_MulByNB(kb)<<1; } } pA = pA0; C += incCn; } if (jb) { if (gescal) gescal(MB, jb, beta, C, ldc); MBJBmm(jb, K, pA, pB, rbeta, C, ldc); } pB = pB0; if (!A) { pA0 += incK; pA = pA0; } C += incCm; } if (ib) { if (A) A2blk(K, ib, A, lda, pA, alpha); /* get last row panel of A */ for(j=nNb; j; j--) /* full column panels of B */ { if (gescal) gescal(ib, NB, beta, C, ldc); IBNBmm(ib, K, pA, pB, rbeta, C, ldc); pB += incK; C += incCn; } if (jb) { if (gescal) gescal(ib, jb, beta, C, ldc); IBJBmm(ib, jb, K, pA, pB, rbeta, C, ldc); } } }
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M, const int N0, const int K, const SCALAR alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const SCALAR beta, TYPE *C, const int ldc) { int N = N0; int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n, incA, incB, incC; const int incK = ATL_MulByNB(K); void *vA=NULL; TYPE *pA, *pB; MAT2BLK A2blk, B2blk; MATSCAL gescal; NBMM0 NBmm0; nMb = ATL_DivByNB(M); nNb = ATL_DivByNB(N); nKb = ATL_DivByNB(K); ib = M - ATL_MulByNB(nMb); jb = N - ATL_MulByNB(nNb); kb = K - ATL_MulByNB(nKb); if (beta[1] == ATL_rzero) { gescal = NULL; if (*beta == ATL_rone) NBmm0 = Mjoin(PATL,CNBmm_b1); else if (*beta == ATL_rzero) NBmm0 = Mjoin(PATL,CNBmm_b0); else NBmm0 = Mjoin(PATL,CNBmm_bX); } else { gescal = Mjoin(PATL,gescal_bX); NBmm0 = Mjoin(PATL,CNBmm_b1); } i = ATL_Cachelen + ATL_MulBySize(N*K + incK); if (i <= ATL_MaxMalloc) vA = malloc(i); if (!vA) { if (TA == AtlasNoTrans && TB == AtlasNoTrans) return(1); if (jb) n = nNb + 1; else n = nNb; for (j=2; !vA; j++) { k = n / j; if (k < 1) break; if (k*j < n) k++; h = ATL_Cachelen + ATL_MulBySize((k+1)*incK); if (h <= ATL_MaxMalloc) vA = malloc(h); } if (!vA) return(-1); n = ATL_MulByNB(k); jb2 = 0; } else { jb2 = jb; k = nNb; n = N; } pA = ATL_AlignPtr(vA); if (TB == AtlasNoTrans) { incB = ldb*n<<1; if (alpha[1] == ATL_rzero) { if (*alpha == ATL_rone) B2blk = Mjoin(PATL,col2blk2_a1); else B2blk = Mjoin(PATL,col2blk2_aXi0); } else B2blk = Mjoin(PATL,col2blk2_aX); } else if (TB == AtlasConjTrans) { incB = n<<1; if (alpha[1] == ATL_rzero) { if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkC2_a1); else B2blk = Mjoin(PATL,row2blkC2_aXi0); } else B2blk = Mjoin(PATL,row2blkC2_aX); } else { incB = n<<1; if (alpha[1] == ATL_rzero) { if (*alpha == ATL_rone) B2blk = Mjoin(PATL,row2blkT2_a1); else B2blk = Mjoin(PATL,row2blkT2_aXi0); } else B2blk = Mjoin(PATL,row2blkT2_aX); } if (TA == AtlasNoTrans) { incA = NB<<1; A2blk = Mjoin(PATL,row2blkT_a1); } else if (TA == AtlasConjTrans) { incA = ATL_MulByNB(lda)<<1; A2blk = Mjoin(PATL,col2blkConj_a1); } else { incA = ATL_MulByNB(lda)<<1; A2blk = Mjoin(PATL,col2blk_a1); } incC = ldc*n<<1; pB = pA + (incK<<1); do { if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha); else B2blk(n, K, B, ldb, pB, alpha); Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA, incA, A2blk, pB, beta, C, ldc, gescal, NBmm0); N -= n; nNb -= k; if (N < n) { jb2 = jb; n = N; k = nNb; } C += incC; B += incB; } while (N); free(vA); return(0); }
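/*
 * Sketch (illustrative only, hypothetical names) of the workspace fallback
 * strategy used by mmIJK/mmJIK above: first try to hold the full K-panel
 * copy of the larger operand; if that exceeds the malloc ceiling or fails,
 * retry with ceil(panels/j) panels per pass for j = 2, 3, ... until a
 * buffer is obtained or the partition cannot shrink further.
 */
#include <stdlib.h>

static void *demo_getws(size_t panels, size_t panel_bytes, size_t maxmalloc,
                        size_t *panels_got)
{
   void *vp = NULL;
   size_t j, k;
   if (panels * panel_bytes <= maxmalloc)
      vp = malloc(panels * panel_bytes);
   for (j = 2; !vp; j++)            /* shrink: handle ceil(panels/j) per pass */
   {
      k = panels / j;
      if (k < 1)                    /* cannot shrink any further: give up     */
         break;
      if (k*j < panels)
         k++;
      if (k * panel_bytes <= maxmalloc)
         vp = malloc(k * panel_bytes);
      if (vp)
         panels = k;
   }
   *panels_got = vp ? panels : 0;
   return vp;      /* NULL tells the caller to fall back to another algorithm */
}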