/*
 * cblas_cher2: single-complex Hermitian rank-2 update
 *    A <- alpha*x*conj(y)' + conj(alpha)*y*conj(x)' + A
 * CBLAS wrapper: validates arguments, then dispatches to ATL_cher2.
 * The row-major case is handled by updating the transposed (col-major)
 * matrix with conjugated, alpha-scaled copies of the vectors and the
 * opposite triangle.
 */
void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const int N, const void *alpha, const void *X, const int incX,
                 const void *Y, const int incY, void *A, const int lda)
{
   int info = 2000;                 /* sentinel: "no error found yet" */
   void *vx, *vy;                   /* malloc handles for aligned workspaces */
   float *x0, *y0;                  /* aligned conjugated vector copies */
   const float *x=X, *y=Y, *alp=alpha;
   const float one[2]={ATL_rone, ATL_rzero};   /* complex 1.0 */
#ifndef NoCblasErrorChecks
   if (Order != CblasColMajor && Order != CblasRowMajor)
      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (Uplo != CblasUpper && Uplo != CblasLower)
      info = cblas_errprn(2, info, "UPLO must be %d or %d, but is set to %d",
                          CblasUpper, CblasLower, Uplo);
   if (N < 0)
      info = cblas_errprn(3, info, "N cannot be less than zero; is set to %d.",
                          N);
   if (!incX)
      info = cblas_errprn(6, info, "incX cannot be zero; is set to %d.", incX);
   if (!incY)
      info = cblas_errprn(8, info, "incY cannot be zero; is set to %d.", incY);
   if (lda < N || lda < 1)
      info = cblas_errprn(10, info, "lda must be >= MAX(N,1): lda=%d N=%d",
                          lda, N);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_cher2", "");
      return;
   }
#endif
/*
 * For negative increments, move the pointer to the logical start of the
 * vector.  Note << binds looser than *, so this is ((1-N)*incX)*2 float
 * elements (complex elements occupy two floats).
 */
   if (incX < 0) x += (1-N)*incX<<1;
   if (incY < 0) y += (1-N)*incY<<1;
   if (Order == CblasColMajor)
      ATL_cher2(Uplo, N, alpha, x, incX, y, incY, A, lda);
   else if (alp[0] != ATL_rzero || alp[1] != ATL_rzero)
   {
   /* row-major w/ nonzero alpha: build conjugated copies, swap triangle */
      vx = malloc(ATL_Cachelen + ATL_MulBySize(N));
      vy = malloc(ATL_Cachelen + ATL_MulBySize(N));
      ATL_assert(vx != NULL && vy != NULL);
      x0 = ATL_AlignPtr(vx);
      y0 = ATL_AlignPtr(vy);
      ATL_cmoveConj(N, alpha, y, incY, y0, 1);  /* y0 = conj(alpha*y) */
      ATL_ccopyConj(N, x, incX, x0, 1);         /* x0 = conj(x) */
      ATL_cher2(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N, one,
                y0, 1, x0, 1, A, lda);
      free(vx);
      free(vy);
   }
   else  /* alpha == 0: rank-2 term vanishes; call still made for symmetry */
      ATL_cher2(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N, alpha,
                y, incY, x, incX, A, lda);
}
/*
 * Hermitian rank-K update, NoTrans variant: C <- alpha*A*conj(A)' + beta*C.
 * For large K, computes the full N x N product with CgemmNC into an aligned
 * scratch matrix, then copies the relevant triangle into C with a beta-
 * specialized "put" routine; otherwise falls back to the reference herk.
 * NOTE: alpha/beta are Hermitian-real scalars (imaginary part assumed 0).
 */
void Mjoin(Mjoin(Mjoin(PATL,herk),UploNM),N)
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *vbeta, void *C, const int ldc)
{
   void *vc;                  /* malloc handle for scratch C */
   TYPE *c;                   /* aligned scratch C (N x N, dense) */
   TYPE alpha[2];             /* alpha promoted to a complex scalar */
   const TYPE beta = *( (const TYPE *)vbeta );
   const TYPE zero[2] = {0.0, 0.0};
   alpha[0] = *( (const TYPE *)valpha );
   if (K > HERK_Xover)        /* crossover: gemm-based path pays off */
   {
      alpha[1] = 0.0;         /* only needed on the gemm path */
      vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N);
      ATL_assert(vc);
      c = ATL_AlignPtr(vc);
      CgemmNC(N, N, K, alpha, A, lda, A, lda, zero, c, N);
   /* copy triangle back into C, specialized on beta value */
      if ( beta == 1.0 ) Mjoin(her_put,_b1)(N, c, vbeta, C, ldc);
      else if ( beta == 0.0 ) Mjoin(her_put,_b0)(N, c, vbeta, C, ldc);
      else Mjoin(her_put,_bXi0)(N, c, vbeta, C, ldc);
      free(vc);
   }
   else
      Mjoin(PATL,refherk)(Uplo_, AtlasNoTrans, N, K, *alpha, A, lda, beta,
                          C, ldc);
}
/*
 * Hermitian rank-2K update building block (Lower/NoTrans name shown; the
 * actual routine name is selected by preprocessor conditionals opened
 * before this point -- the two #endif below close them).
 * Computes the full N x N product into aligned scratch via ATL_ammm, then
 * copies the triangle into C with a beta-specialized put routine.
 * Returns 0 on success, 1 if the N x N workspace cannot be allocated
 * (caller must then use a fallback algorithm).
 */
int Mjoin(PATL,her2kLN)
#endif
#endif
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
   int i;                     /* workspace size in bytes */
   void *vc=NULL;
   TYPE *c;                   /* aligned dense scratch for C */
   const TYPE beta =*( (const TYPE *)vbeta );
   const TYPE zero[2]={0.0, 0.0};
   i = ATL_MulBySize(N)*N;
   if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i);
   if (vc == NULL) return(1);       /* too big: tell caller to block it */
   c = ATL_AlignPtr(vc);
/* which operand is conjugate-transposed depends on the Transpose_ variant */
#ifdef Transpose_
   ATL_ammm(AtlasConjTrans, AtlasNoTrans, N, N, K, valpha, A, lda, B, ldb,
#else
   ATL_ammm(AtlasNoTrans, AtlasConjTrans, N, N, K, valpha, A, lda, B, ldb,
#endif
            zero, c, N);
   if ( beta == 1.0 ) Mjoin(her2k_put,_b1)(N, c, vbeta, C, ldc);
   else if ( beta == 0.0 ) Mjoin(her2k_put,_b0)(N, c, vbeta, C, ldc);
   else Mjoin(her2k_put,_bXi0)(N, c, vbeta, C, ldc);
   free(vc);
   return(0);
}
/*
 * Symmetric matrix-matrix multiply, Side=Left: C <- alpha*A*B + beta*C,
 * A symmetric M x M (triangle given by UploNM).  For large N, expands the
 * stored triangle of A into a dense aligned copy and calls gemm; otherwise
 * uses the reference symm.
 */
void Mjoin(Mjoin(PATL,symmL),UploNM)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta );
   const SCALAR one=1.0;
#else
/* complex: scalars stay as opaque pointers */
   #define alpha valpha
   #define beta vbeta
#endif
   TYPE *a;                   /* aligned dense copy of symmetric A */
   void *va;
   if (N > SYMM_Xover)
   {
      va = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M));
      ATL_assert(va);
      a = ATL_AlignPtr(va);
#ifdef TREAL
   /* real: alpha can be folded into the copy of A */
      if ( SCALAR_IS_ONE(alpha) )
         Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(M, alpha, A, lda, a);
      else
         Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(M, alpha, A, lda, a);
      CgemmTN(M, N, M, one, a, M, B, ldb, beta, C, ldc);
#else
      Mjoin(Mjoin(PATL,sycopy),UploNM)(M, A, lda, a);
      CgemmTN(M, N, M, valpha, a, M, B, ldb, vbeta, C, ldc);
#endif
      free(va);
   }
   else
      Mjoin(PATL,refsymm)(AtlasLeft, Uplo_, M, N, alpha, A, lda, B, ldb,
                          beta, C, ldc);
}
/*
 * Symmetric rank-2K update building block (Lower/Trans name shown; the
 * actual name is chosen by a preprocessor conditional opened before this
 * point -- the #endif below closes it).  Computes the full N x N product
 * into aligned scratch with gemm, then copies the triangle into C with a
 * beta-specialized put.  Returns 0 on success, 1 if the N x N workspace
 * cannot be allocated (caller must block the problem instead).
 */
int Mjoin(PATL,syr2kLT)
#endif
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
   int i;                     /* workspace size in bytes */
   void *vc=NULL;
   TYPE *c;
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta );
   const SCALAR one=1.0, zero=0.0;
#else
   #define alpha valpha
   const TYPE *beta=vbeta;
   const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
#endif
   i = ATL_MulBySize(N)*N;
   if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i);
   if (vc == NULL) return(1);
   c = ATL_AlignPtr(vc);
   CgemmTN(N, N, K, alpha, A, lda, B, ldb, zero, c, N);
/* triangle write-back, specialized on beta (extra cases for complex) */
   if ( SCALAR_IS_ONE(beta) ) Mjoin(syr2k_put,_b1)(N, c, beta, C, ldc);
   else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr2k_put,_b0)(N, c, beta, C, ldc);
#ifdef TCPLX
   else if (SCALAR_IS_NONE(beta)) Mjoin(syr2k_put,_bn1)(N, c, beta, C, ldc);
   else if (beta[1] == *zero) Mjoin(syr2k_put,_bXi0)(N, c, beta, C, ldc);
#endif
   else Mjoin(syr2k_put,_bX)(N, c, beta, C, ldc);
   free(vc);
   return(0);
}
/*
 * Symmetric rank-K update, Trans variant: C <- alpha*A'*A + beta*C.
 * For large K, computes the full N x N product into aligned scratch and
 * copies the triangle back with a beta-specialized put; otherwise calls
 * the reference syrk.
 */
void Mjoin(Mjoin(Mjoin(PATL,syrk),UploNM),T)
   (const int N, const int K, const void *valpha, const void *A, const int lda,
    const void *vbeta, void *C, const int ldc)
{
   void *vc;
   TYPE *c;                   /* aligned dense scratch for C */
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta );
   const SCALAR one=1.0, zero=0.0;
#else
   #define alpha valpha
   const TYPE *beta=vbeta;
   const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0};
#endif
   if (K > SYRK_Xover)        /* crossover: gemm-based path pays off */
   {
      vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N);
      ATL_assert(vc);
      c = ATL_AlignPtr(vc);
      CgemmTN(N, N, K, alpha, A, lda, A, lda, zero, c, N);
   /* triangle write-back, specialized on beta (extra cases for complex) */
      if ( SCALAR_IS_ONE(beta) ) Mjoin(syr_put,_b1)(N, c, beta, C, ldc);
      else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr_put,_b0)(N, c, beta, C, ldc);
#ifdef TCPLX
      else if ( SCALAR_IS_NONE(beta) ) Mjoin(syr_put,_bn1)(N, c, beta, C, ldc);
      else if (beta[1] == *zero) Mjoin(syr_put,_bXi0)(N, c, beta, C, ldc);
#endif
      else Mjoin(syr_put,_bX)(N, c, beta, C, ldc);
      free(vc);
   }
   else
      Mjoin(PATL,refsyrk)(Uplo_, AtlasTrans, N, K, alpha, A, lda, beta,
                          C, ldc);
}
/*
 * Symmetric matrix-matrix multiply, Side=Right: C <- alpha*B*A + beta*C,
 * A symmetric N x N (triangle given by UploNM).  For large M, expands the
 * stored triangle of A into a dense aligned copy and calls ATL_ammm;
 * otherwise uses the reference symm.
 */
void Mjoin(Mjoin(PATL,symmR),UploNM)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    const void *B, const int ldb, const void *vbeta, void *C, const int ldc)
{
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR beta =*( (const SCALAR *)vbeta );
   const SCALAR one=1.0;
#else
/* complex: scalars stay as opaque pointers */
   #define alpha valpha
   #define beta vbeta
#endif
   void *va;
   TYPE *a;                   /* aligned dense copy of symmetric A */
   if (M > SYMM_Xover)
   {
      va = malloc(ATL_Cachelen + ATL_MulBySize(N)*N);
      ATL_assert(va);
      a = ATL_AlignPtr(va);
#ifdef TREAL
   /* real: alpha can be folded into the copy of A */
      if ( SCALAR_IS_ONE(alpha) )
         Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(N, alpha, A, lda, a);
      else
         Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(N, alpha, A, lda, a);
      ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, one, B, ldb, a, N,
               beta, C, ldc);
#else
      Mjoin(Mjoin(PATL,sycopy),UploNM)(N, A, lda, a);
      ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, valpha, B, ldb, a, N,
               vbeta, C, ldc);
#endif
      free(va);
   }
   else
      Mjoin(PATL,refsymm)(AtlasRight, Uplo_, M, N, alpha, A, lda, B, ldb,
                          beta, C, ldc);
}
/*
 * Triangular matrix-matrix multiply, Side=Left: C <- alpha*op(A)*C, with
 * A triangular M x M.  For large N, expands the triangle of A into a dense
 * aligned copy and uses a gemm-like kernel (CAgemmTN) writing C in place;
 * otherwise calls the reference trmm.
 * NOTE(review): the inner #ifdef TREAL is nested inside an outer
 * #ifdef TREAL and so its #else arm looks unreachable -- confirm against
 * the generator that emits this file.
 */
void Mjoin(Mjoin(PATL,trmmL),ATLP)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    void *C, const int ldc)
{
#ifdef TREAL
   const SCALAR alpha=*( (const SCALAR *)valpha );
   const SCALAR one=1.0, zero=0.0;
#else
   const TYPE zero[2]={0.0,0.0};
   #define alpha valpha
#endif
   void *va;
   TYPE *a;                   /* aligned dense copy of triangular A */
   if (N > TRMM_Xover)
   {
      va = malloc(ATL_Cachelen + ATL_MulBySize(M)*M);
      ATL_assert(va);
      a = ATL_AlignPtr(va);
#ifdef TREAL
   /* real: fold alpha into the copy when it is not 1 */
      if ( SCALAR_IS_ONE(alpha) ) Mjoin(ATL_trcopy,_a1)(M, alpha, A, lda, a);
      else Mjoin(ATL_trcopy,_aX)(M, alpha, A, lda, a);
      CAgemmTN(M, N, M, one, a, M, C, ldc, zero, C, ldc);
#else
      ATL_trcopy(M, A, lda, a);
      CAgemmTN(M, N, M, valpha, a, M, C, ldc, zero, C, ldc);
#endif
      free(va);
   }
   else
      Mjoin(PATL,reftrmm)(AtlasLeft, Uplo_, Trans_, Unit_, M, N, alpha,
                          A, lda, C, ldc);
}
void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda) { int info = 2000; const double *x = X, *y = Y; void *vy; double *y0; double one[2] = {ATL_rone, ATL_rzero}; #ifndef NoCblasErrorChecks if (M < 0) info = cblas_errprn(2, info, "M cannot be less than zero; is set to %d.", M); if (N < 0) info = cblas_errprn(3, info, "N cannot be less than zero; is set to %d.", N); if (!incX) info = cblas_errprn(6, info, "incX cannot be zero; is set to %d.", incX); if (!incY) info = cblas_errprn(8, info, "incY cannot be zero; is set to %d.", incY); if (Order == CblasColMajor) { if (lda < M || lda < 1) info = cblas_errprn(10, info, "lda must be >= MAX(M,1): lda=%d M=%d", lda, M); } else if (Order == CblasRowMajor) { if (lda < N || lda < 1) info = cblas_errprn(10, info, "lda must be >= MAX(N,1): lda=%d M=%d", lda, N); } else info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d", CblasRowMajor, CblasColMajor, Order); if (info != 2000) { cblas_xerbla(info, "cblas_zgerc", ""); return; } #endif if (incX < 0) x += (1-M)*incX<<1; if (incY < 0) y += (1-N)*incY<<1; if (Order == CblasColMajor) ATL_zgerc(M, N, alpha, x, incX, y, incY, A, lda); else { vy = malloc(ATL_Cachelen + ATL_MulBySize(N)); ATL_assert(vy); y0 = ATL_AlignPtr(vy); ATL_zmoveConj(N, alpha, y, incY, y0, 1); ATL_zgeru(N, M, one, y0, 1, x, incX, A, lda); free(vy); } }
void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A, const int lda) { int info = 2000; void *vx; double one[2] = {ATL_rone, ATL_rzero}; double *x0; const double *x=X; #ifndef NoCblasErrorChecks if (Order != CblasColMajor && Order != CblasRowMajor) info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d", CblasRowMajor, CblasColMajor, Order); if (Uplo != CblasUpper && Uplo != CblasLower) info = cblas_errprn(2, info, "UPLO must be %d or %d, but is set to %d", CblasUpper, CblasLower, Uplo); if (N < 0) info = cblas_errprn(3, info, "N cannot be less than zero; is set to %d.", N); if (!incX) info = cblas_errprn(6, info, "incX cannot be zero; is set to %d.", incX); if (lda < N || lda < 1) info = cblas_errprn(8, info, "lda must be >= MAX(N,1): lda=%d N=%d", lda, N); if (info != 2000) { cblas_xerbla(info, "cblas_zher", ""); return; } #endif if (incX < 0) x += (1-N)*incX<<1; if (Order == CblasColMajor) ATL_zher(Uplo, N, alpha, x, incX, A, lda); else if (alpha != ATL_rzero) { vx = malloc(ATL_Cachelen + ATL_MulBySize(N)); ATL_assert(vx); x0 = ATL_AlignPtr(vx); ATL_zmoveConj(N, one, x, incX, x0, 1); ATL_zher(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N, alpha, x0, 1, A, lda); free(vx); } else ATL_zher(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N, ATL_rzero, x, incX, A, lda); }
double ATL_flushcache(long long size)
/*
 * Cache flusher with three modes keyed on sign of size:
 *   size > 0 : allocate and zero a flush area of `size` bytes
 *   size < 0 : read every double in the area (evicts other cache lines);
 *              returns the accumulated sum so the reads can't be optimized
 *              out
 *   size == 0: release the area
 * State is kept in statics; if the compiler gets too clever about proving
 * the buffer unmodified, it may need to become a global (see original
 * ATLAS note).  ATL_dzero being external already confuses most compilers.
 */
{
   static void *buf = NULL;         /* malloc handle for flush area */
   static long long nelts = 0;      /* number of doubles in flush area */
   double sum = 0.0;

   if (size > 0)                    /* initialize flush area */
   {
      buf = malloc(ATL_Cachelen + size);
      ATL_assert(buf);
      nelts = size / sizeof(double);
      ATL_dzero(nelts, ATL_AlignPtr(buf), 1);
   }
   else if (size < 0)               /* flush: touch every element */
   {
      double *area;
      size_t i;
      ATL_assert(buf);
      area = ATL_AlignPtr(buf);
      for (i=0; (long long)i < nelts; i++)
         sum += area[i];
   }
   else                             /* size == 0: free flush area */
   {
      if (buf) free(buf);
      buf = NULL;
      nelts = 0;
   }
   return(sum);
}
/*
 * Hermitian matrix-matrix multiply, Side=Left: C <- alpha*A*B + beta*C,
 * A Hermitian M x M (stored triangle given by UploNM).  Above the
 * crossover point, expands the stored triangle of A to a dense aligned
 * copy and calls the generic ATL_ammm; otherwise uses the reference hemm.
 */
void Mjoin(Mjoin(PATL,hemmL),UploNM)
   (const int M, const int N, const void *alpha, const void *A, const int lda,
    const void *B, const int ldb, const void *beta, void *C, const int ldc)
{
   void *vwrk;                /* malloc handle */
   TYPE *ac;                  /* aligned dense expansion of A */

   if (N <= HEMM_Xover)       /* small problem: reference code wins */
   {
      Mjoin(PATL,refhemm)(AtlasLeft, Uplo_, M, N, alpha, A, lda, B, ldb,
                          beta, C, ldc);
      return;
   }
   vwrk = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M));
   ATL_assert(vwrk);
   ac = ATL_AlignPtr(vwrk);
   Mjoin(Mjoin(PATL,hecopy),UploNM)(M, A, lda, ac);
   ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, M, alpha, ac, M, B, ldb,
            beta, C, ldc);
   free(vwrk);
}
double ATL_ptflushcache(long long size)
/*
 * Threaded variant of ATL_flushcache: each of ATL_NTHREADS threads reads
 * its own size-byte slice of a shared flush area in parallel via
 * ATL_goparallel.  Modes keyed on sign of size:
 *   size > 0 : allocate/zero ATL_NTHREADS*size bytes; size < 0 : flush;
 *   size == 0: free.
 * flush cache by reading enough mem; note that if the compiler gets
 * really smart, may be necessary to make vp a global variable so it
 * can't figure out it's not being modified other than during setup;
 * the fact that ATL_dzero is external will confuse most compilers
 */
{
   static void *vp=NULL;            /* malloc handle */
   static double *cache=NULL;       /* aligned flush area (all threads) */
   double dret=0.0;
   static long long i, N = 0;       /* N = doubles per thread slice */
   ATL_FC fct[ATL_NTHREADS];        /* per-thread work descriptors */
   if (size < 0) /* flush cache */
   {
      ATL_assert(cache);
      for (i=0; i < ATL_NTHREADS; i++)
      {
      /* thread i reads its own contiguous slice of N doubles */
         fct[i].N = N;
         fct[i].dp = cache+i*N;
      }
      ATL_goparallel(ATL_NTHREADS, ATL_DoWorkFC, fct, NULL);
   }
   else if (size > 0) /* initialize */
   {
      vp = malloc(ATL_Cachelen + (size * ATL_NTHREADS));
      ATL_assert(vp);
      cache = ATL_AlignPtr(vp);
      N = size / sizeof(double);
      ATL_dzero(N*ATL_NTHREADS, cache, 1);
   }
   else if (size == 0) /* free cache */
   {
      if (vp) free(vp);
      vp = cache = NULL;
      N = 0;
   }
   return(dret);
}
/*
 * Triangular solve, Side=Right: solve X*op(A) = alpha*B, result overwrites
 * C.  Real transposed cases with M much larger than N copy the triangle to
 * a compact aligned buffer and call the specialized kernel; other real
 * cases go straight to the kernel, and complex uses a small-N kernel or
 * the reference trsm.
 * NOTE(review): the inner #ifdef TREAL is nested inside an outer
 * #ifdef TREAL, so its #else arm appears unreachable -- confirm against
 * the generator for this file.
 */
void Mjoin(Mjoin(PATL,trsmR),ATLP)
   (const int M, const int N, const void *valpha, const void *A, const int lda,
    void *C, const int ldc)
{
   const TYPE *alpha=valpha;
#ifdef TREAL
   #if defined(Transpose_) || defined(ConjTrans_)
   /* many rhs per triangle entry (M > 4N): compact copy pays off */
      if ( M > (N<<2) )
      {
         void *va;
         TYPE *a;              /* aligned compact copy of triangular A */
         va = malloc(ATL_Cachelen + (ATL_MulBySize(N*N)));
         ATL_assert(va);
         a = ATL_AlignPtr(va);
         #ifdef TREAL
            Mjoin(ATL_trcopy,_a1)(N, ATL_rone, A, lda, a);
         #else
            ATL_trcopy(N, A, lda, a);
         #endif
         Mjoin(Mjoin(PATL,trsmKR),ATLPt)(M, N, *alpha, a, N, C, ldc);
         free(va);
      }
      else
         Mjoin(PATL,reftrsm)(AtlasRight, Uplo_, Trans_, Unit_, M, N, *alpha,
                             A, lda, C, ldc);
   #else
      Mjoin(Mjoin(PATL,trsmKR),ATLP)(M, N, *alpha, A, lda, C, ldc);
   #endif
#else
/* complex: kernel only profitable for very small N */
   if (M > (N<<2) && N <= 4)
      Mjoin(PATL,CtrsmKR)(Uplo_, Trans_, Unit_, M, N, valpha, A, lda, C, ldc);
   else
      Mjoin(PATL,reftrsm)(AtlasRight, Uplo_, Trans_, Unit_, M, N, alpha,
                          A, lda, C, ldc);
#endif
}
/*
 * Convenience driver for the workspaced ormrq (apply Q from an RQ
 * factorization to C).  Queries the _wrk routine for its optimal
 * workspace size, allocates an aligned buffer of that size, runs the
 * real computation, and releases the buffer.
 * Returns whatever the underlying _wrk routine returns.
 */
int Mjoin(PC2F,ormrq)
   (const enum CBLAS_SIDE Side, const enum CBLAS_TRANSPOSE TA, ATL_CINT M,
    ATL_CINT N, ATL_CINT K, TYPE *A, ATL_CINT lda, TYPE *TAU, TYPE *C,
    ATL_CINT ldc)
{
   TYPE qwrk[2];              /* receives optimal lwork from the query */
   void *vwork;               /* malloc handle */
   TYPE *wp;                  /* aligned workspace */
   ATL_INT lwrk;
   int err;

/* workspace query: lwork = -1 asks the routine for its optimal size */
   ATL_assert(!Mjoin(PC2F,ormrq_wrk)(Side, TA, M, N, K, A, lda, TAU, C, ldc,
                                     qwrk, -1));
   lwrk = qwrk[0];
   vwork = malloc(ATL_MulBySize(lwrk) + ATL_Cachelen);
   ATL_assert(vwork);
   wp = ATL_AlignPtr(vwork);
   err = Mjoin(PC2F,ormrq_wrk)(Side, TA, M, N, K, A, lda, TAU, C, ldc,
                               wp, lwrk);
   free(vwork);
   return(err);
}
int clapack_sgetri(const enum CBLAS_ORDER Order, const int N, float *A, const int lda, const int *ipiv) { int ierr=0, lwrk; int Mjoin(PATL,GetNB)(); void *vp; lwrk = Mjoin(PATL,GetNB)(); if (lwrk <= N) lwrk *= N; else lwrk = N*N; vp = malloc(ATL_Cachelen + ATL_MulBySize(lwrk)); if (vp) { ierr = ATL_getri(Order, N, A, lda, ipiv, ATL_AlignPtr(vp), &lwrk); free(vp); } else { cblas_xerbla(7, "clapack_sgetri", "Cannot allocate workspace of %d\n", lwrk); return(-7); } return(ierr); }
/*
 * cblas_cger2c: single-complex conjugated rank-2 update
 *    A <- alpha*x*conj(y)' + beta*w*conj(z)' + A
 * Validates arguments, then dispatches to ATL_cger2c for column-major
 * storage.  Row-major is handled by updating the transposed matrix with
 * an unconjugated ger2 on conjugated, scaled copies of y and z.
 *
 * FIXES (row-major branch):
 *  - z0 was placed at y0+N, overlapping y0's second half (N complex
 *    elements occupy 2N floats); it is now placed at y0 + N+N, matching
 *    the ATL_MulBySize(N+N) allocation.
 *  - the second ATL_cmoveConj scaled z by alpha instead of beta.
 *  - ATL_cger2u was called with (beta, w, incW, z, incZ), leaving the
 *    freshly computed z0 unused and passing vectors of the wrong lengths;
 *    the second rank-1 term must be (one, z0, 1, w, incW).
 */
void cblas_cger2c(const enum CBLAS_ORDER Order, ATL_CINT M, ATL_CINT N,
                  const void *alpha, const void *X, ATL_CINT incX,
                  const void *Y, ATL_CINT incY, const void *beta,
                  const void *W, ATL_CINT incW, const void *Z, ATL_CINT incZ,
                  void *A, ATL_CINT lda)
{
   int info = 2000;                 /* sentinel: no error found yet */
   const float *x = X, *y = Y, *w = W, *z = Z;
   void *vy;                        /* malloc handle for both copies */
   float *y0, *z0;                  /* aligned conj-scaled copies of y, z */
   float one[2] = {ATL_rone, ATL_rzero};
#ifndef NoCblasErrorChecks
   if (M < 0)
      info = cblas_errprn(2, info,
                          "M cannot be less than zero; is set to %d.", M);
   if (N < 0)
      info = cblas_errprn(3, info,
                          "N cannot be less than zero; is set to %d.", N);
   if (!incX)
      info = cblas_errprn(6, info, "incX cannot be zero; is set to %d.", incX);
   if (!incY)
      info = cblas_errprn(8, info, "incY cannot be zero; is set to %d.", incY);
   if (!incW)
      info = cblas_errprn(11, info, "incW cannot be zero; is set to %d.",
                          incW);
   if (!incZ)
      info = cblas_errprn(13, info, "incZ cannot be zero; is set to %d.",
                          incZ);
   if (Order == CblasColMajor)
   {
      if (lda < M || lda < 1)
         info = cblas_errprn(15, info, "lda must be >= MAX(M,1): lda=%d M=%d",
                             lda, M);
   }
   else if (Order == CblasRowMajor)
   {
      if (lda < N || lda < 1)
         info = cblas_errprn(15, info, "lda must be >= MAX(N,1): lda=%d M=%d",
                             lda, N);
   }
   else
      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
                          CblasRowMajor, CblasColMajor, Order);
   if (info != 2000)
   {
      cblas_xerbla(info, "cblas_cger2c", "");
      return;
   }
#endif
/* negative increments: shift to the logical start (2 floats per element) */
   if (incX < 0) x += (1-M)*incX<<1;
   if (incY < 0) y += (1-N)*incY<<1;
   if (incW < 0) w += (1-M)*incW<<1;
   if (incZ < 0) z += (1-N)*incZ<<1;
   if (Order == CblasColMajor)
      ATL_cger2c(M, N, alpha, x, incX, y, incY, beta, w, incW, z, incZ,
                 A, lda);
   else
   {
   /* room for two N-length complex vectors, each cacheline-aligned */
      vy = malloc(ATL_Cachelen+ATL_Cachelen + ATL_MulBySize(N+N));
      ATL_assert(vy);
      y0 = ATL_AlignPtr(vy);
      z0 = y0 + N+N;           /* N complex elts == 2N floats past y0 */
      z0 = ATL_AlignPtr(z0);
      ATL_cmoveConj(N, alpha, y, incY, y0, 1);   /* y0 = conj(alpha*y) */
      ATL_cmoveConj(N, beta, z, incZ, z0, 1);    /* z0 = conj(beta*z)  */
   /* A^T (N x M) += y0*x^T + z0*w^T */
      ATL_cger2u(N, M, one, y0, 1, x, incX, one, z0, 1, w, incW, A, lda);
      free(vy);
   }
}
int ATL_ormqr
   (const enum CBLAS_SIDE SIDE, const enum CBLAS_TRANSPOSE TRANS,
    ATL_CINT M, ATL_CINT N, ATL_CINT K, TYPE *A, ATL_CINT lda,
    const TYPE *TAU, TYPE *C, ATL_CINT ldc, TYPE *WORK, ATL_CINT LWORK)
/*
 * This is the C translation of the standard LAPACK Fortran routine:
 *     SUBROUTINE DORMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
 *                        WORK, LWORK, INFO )
 * NOTE: this file gets compiled to four precisions (s, d, c, z).
 *
 * Purpose
 * =======
 * ATL_ormqr overwrites the general M-by-N matrix C with
 *
 *                 SIDE = 'L'     SIDE = 'R'
 * TRANS = 'N':      Q * C          C * Q
 * TRANS = 'T':      Q**T * C       C * Q**T
 *
 * where Q is an orthogonal (real) or unitary (complex) matrix defined as
 * the product of k elementary reflectors
 *
 *       Q = H(1) H(2) . . . H(k)
 *
 * as returned by ATL_geqrf.  Q is of order M if SIDE = 'L' and of order N
 * if SIDE = 'R'.
 *
 * Arguments
 * =========
 * SIDE   (input) 'L': apply Q or Q**T from the Left;
 *                'R': apply Q or Q**T from the Right.
 * TRANS  (input) 'N': no transpose, apply Q; 'T': apply Q**T.
 * M      (input) Number of rows of C.  M >= 0.
 * N      (input) Number of columns of C.  N >= 0.
 * K      (input) Number of elementary reflectors defining Q.
 *                If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0.
 * A      (input) array, dimension (LDA,K).  The i-th column contains the
 *                vector defining H(i), as returned by geqrf in the first
 *                k columns of A.  Modified internally but restored on exit.
 * lda    (input) Leading dimension of A.  SIDE='L': LDA >= max(1,M);
 *                SIDE='R': LDA >= max(1,N).
 * TAU    (input) array (K); TAU(i) is the scalar factor of H(i).
 * C      (input/output) array (LDC,N).  On entry the M-by-N matrix C; on
 *                exit overwritten by Q*C, Q**T*C, C*Q**T or C*Q.
 * ldc    (input) Leading dimension of C.  LDC >= max(1,M).
 * WORK   (workspace/output) array (MAX(1,LWORK)); WORK(1) returns the
 *                optimal LWORK on a workspace query.
 * LWORK  (input) Dimension of WORK.  SIDE='L': LWORK >= max(1,N);
 *                SIDE='R': LWORK >= max(1,M); optimal is N*NB resp. M*NB.
 *                If LWORK = -1 this is a workspace query: only the optimal
 *                size is computed and stored in WORK(1).
 * Return: 0 on success, < 0 if an argument had an illegal value
 *                (-7 also signals workspace-allocation failure).
 */
{
   ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N);
   ATL_INT n, nb, j, ib, mi, ni, ic, jc ;
   TYPE *ws_QR2, *ws_T, *ws_larfb;     /* workspaces for QR2, T, larfb */
   void *vp=NULL;                      /* set if we must allocate WORK */
   nb = clapack_ilaenv(LAIS_OPT_NB, LAormqr, MYOPT+LARight+LAUpper,
                       M, N, K,-1);    /* tuned blocking factor */
/*
 * If it is a workspace query, return the size of work required:
 *    wrksz = wrksz of ATL_larfb + ATL_larft + ATL_geqr2
 */
   if (LWORK < 0)
   {
      if(SIDE == CblasLeft)
      {
         *WORK = ( N*nb + nb*nb + maxMN ) ;
      }
      else
      {
         *WORK = ( M*nb + nb*nb + maxMN ) ;
      }
      return(0);
   }
   else if (M < 1 || N < 1) /* quick return if no work to do */
      return(0);
/*
 * If the user gives us too little space, see if we can allocate it ourselves
 */
   else
   {
      if(SIDE == CblasLeft)
      {
         if (LWORK < (N*nb + nb*nb + maxMN))
         {
            vp = malloc(ATL_MulBySize(N*nb + nb*nb + maxMN) + ATL_Cachelen);
            if (!vp)
               return(-7);
            WORK = ATL_AlignPtr(vp);
         }
      }
      else
      {
         if (LWORK < (M*nb + nb*nb + maxMN))
         {
            vp = malloc(ATL_MulBySize(M*nb + nb*nb + maxMN) + ATL_Cachelen);
            if (!vp)
               return(-7);
            WORK = ATL_AlignPtr(vp);
         }
      } /* if CblasRight */
   }
/*
 * Assign workspace areas for ATL_larft, ATL_geqr2, ATL_larfb
 * (SHIFT doubles the element count for complex types)
 */
   ws_T = WORK;                        /* T at beginning of work */
   ws_QR2 = WORK +(nb SHIFT)*nb;       /* after T workspace */
   ws_larfb = ws_QR2 + (maxMN SHIFT);  /* after workspace for T and QR2 */
   if (SIDE == CblasLeft)
   {
      if ( TRANS == CblasNoTrans )
      {
      /* apply blocks in reverse order; start j at last partial/full block */
         j = (K/nb)*nb;
         if (j == K)
         {
            j=K -nb;
         }
         for (j; j >= 0; j = j - nb)
         {
            ib = nb;
            if ((j+nb) > K)
            {
               ib = K - j;   /* trailing partial block */
            }
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, M-j, ib,
                      A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(i:m,1:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore, (M-j), N, ib,
                      A+(j SHIFT)*(lda+1), lda, ws_T, ib, C+(j SHIFT), ldc,
                      ws_larfb, N);
         } /* for */
      } /* CblasNoTrans */
      else
      {
      /* transpose case: apply blocks in forward order */
         for (j = 0 ; j < K; j = j + nb)
         {
            ib = Mmin(K-j, nb);
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, M-j, ib,
                      A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(i:m,1:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore, (M-j), N, ib,
                      A+(j SHIFT)*(lda+1), lda, ws_T, ib, C+(j SHIFT), ldc,
                      ws_larfb, N);
         } /* for */
      } /* CblasNoTran */
   } /* cblasLeft */
   else
   {
      if ( TRANS == CblasNoTrans )
      {
         for (j = 0 ; j < K; j = j + nb)
         {
            ib = Mmin(K-j, nb);
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, N-j, ib,
                      A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(1:m,i:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore, M, N-j, ib,
                      A+(j SHIFT)*(lda+1), lda, ws_T, ib, C+((j SHIFT)*ldc),
                      ldc, ws_larfb, M);
         } /* for */
      }
      else
      {
      /* right/transpose: apply blocks in reverse order */
         j = (K/nb)*nb;
         if (j == K)
         {
            j=K -nb;
         }
         for (j; j >= 0; j = j - nb)
         {
            ib = nb;
            if ((j+nb) > K)
            {
               ib = K - j;   /* trailing partial block */
            }
/*
 *          Form the triangular factor of the block reflector
 *          H = H(i) H(i+1) . . . H(i+ib-1)
 */
            ATL_larft(LAForward, LAColumnStore, N-j, ib,
                      A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_T, ib);
/*
 *          H or H' is applied to C(1:m,i:n)
 */
            ATL_larfb(SIDE, TRANS, LAForward, LAColumnStore, M, N-j , ib,
                      A+(j SHIFT)*(lda+1), lda, ws_T, ib, C+((j SHIFT)*ldc) ,
                      ldc, ws_larfb, M);
         } /* for */
      } /* Cblas Tran on Right */
   }
   if (vp)
      free(vp);
   return(0);
} /* END ATL_ormqr */
int Mjoin(PATL,mmJKI)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda,
                      const TYPE *B, const int ldb, const SCALAR beta,
                      TYPE *C, const int ldc)
/*
 * This gemm is for small K, so we build gemm out of AXPY (outer product)
 * rather than dot (inner product).  Complex version: the <<1 / +k+k
 * arithmetic below converts complex-element counts to float counts.
 * Returns 0 on success, -1 if the transpose-copy workspace cannot be
 * allocated.
 */
{
   int Mp, mp, m, k, ldaa=lda;   /* Mp = M-panel height; ldaa = panel ld */
   void *vA=NULL;                /* workspace for NoTrans copy of A */
   TYPE *pA;
   const TYPE CONE[2]={ATL_rone, ATL_rzero}, CNONE[2]={ATL_rnone, ATL_rzero};
   const SCALAR alp=alpha;       /* alpha to pass on; CONE once folded in */
/*
 * Compute M partition necessary to promote reuse in the L1 cache.  Check
 * NB^2 in addition to L1elts, to catch machines where L1 is not used by FPU.
 * If this gives a small Mp, use CacheEdge instead (reuse in L2 instead of
 * L1).
 */
   Mp = NB*NB;
   m = ATL_L1elts >> 1;
   Mp = (m > Mp) ? m : Mp;
   Mp /= ((K+2)<<1);
   if (Mp < 128)
   {
#if !defined(CacheEdge) || CacheEdge == 0
      Mp = M;
#else
      Mp = (CacheEdge) / ((K+2)*ATL_sizeof);
      if (Mp < 128) Mp = M;
#endif
   }
   if (Mp > M) Mp = M;
/*
 * Change Mp if remainder is very small
 */
   else
   {
      Mp -= 16; /* small safety margin on filling cache */
      mp = M / Mp;
      m = M - mp*Mp;
      if (m && m < 32) Mp += (m+mp-1)/mp;
   }
/*
 * If A not in NoTrans format, need to copy so it can use axpy wt stride=1.
 * NOTE: this routine should not be called when you can't afford this copy.
 * alpha is folded into the copy, so CONE is passed to the kernel instead.
 */
   if (TA != AtlasNoTrans)
   {
      vA = malloc(ATL_Cachelen + Mp*ATL_MulBySize(K));
      if (!vA) return(-1);
      pA = ATL_AlignPtr(vA);
      alp = CONE;
      ldaa = Mp;
      pA += Mp+Mp;    /* pre-bias: loop below starts by subtracting Mp+Mp */
   }
   else pA = (TYPE *) A;
   for (m=0; m < M; m += Mp)
   {
      mp = M - m;
      if (mp > Mp) mp = Mp;
/*
 *    If the thing is in Trans format, copy to NoTrans
 */
      if (vA)
      {
         pA -= (Mp+Mp);
         if (TA == AtlasConjTrans)
         {
            for (k=0; k < K; k++)
            {
            /* copy row k, negate imaginaries (conjugate), then scale */
               Mjoin(PATL,copy)(mp, A+k+k, lda, pA+((k*ldaa)<<1), 1);
               Mjoin(PATLU,scal)(mp, ATL_rnone, pA+1+((k*ldaa)<<1), 2);
               if (!SCALAR_IS_ONE(alpha))
                  Mjoin(PATL,scal)(mp, alpha, pA+((k*ldaa)<<1), 1);
            }
         }
         else
         {
            for (k=0; k < K; k++)
               Mjoin(PATL,cpsc)(mp, alpha, A+k+k, lda, pA+((k*ldaa)<<1), 1);
         }
         A += mp*(lda+lda);   /* advance to next M-panel of A^T */
      }
      Mjoin(PATL,mm_axpy)(AtlasNoTrans, TB, mp, N, K, alp, pA, ldaa, B, ldb,
                          beta, C, ldc);
      pA += mp+mp;
      C += mp+mp;
   }
   if (vA) free(vA);
   return(0);
}
/*
 * ATL_her: Hermitian rank-1 update A <- alpha*x*conj(x)' + A, alpha real.
 * Selects a tuned rank-1 (GER) kernel and builds aligned working copies
 * of x (plain, possibly alignment-copied) and conj(alpha*x), then runs
 * the triangle-specialized driver.  Falls back to the reference code for
 * tiny problems or when workspace cannot be allocated.
 *
 * Change: dropped the redundant `if (vp)` guard before the final free --
 * vp is provably non-NULL at that point (the !vp case returned earlier),
 * and free(NULL) is a no-op regardless.
 */
void ATL_her(const enum ATLAS_UPLO Uplo, ATL_CINT N, const TYPE alpha,
             const TYPE *X, ATL_CINT incX, TYPE *A, ATL_CINT lda)
{
   const TYPE calpha[2] = {alpha, ATL_rzero};  /* alpha as complex scalar */
   void *vp=NULL;
   TYPE *x, *xt;              /* x: working X; xt: conj(alpha*X) */
   ATL_r1kern_t gerk;         /* selected rank-1 kernel */
   ATL_INT CacheElts;
   const int ALP1 = (alpha == ATL_rone);
   int COPYX, COPYXt;
   int mu, nu, minM, minN, alignX, alignXt, FNU, ALIGNX2A;

   if (N < 1 || (alpha == ATL_rzero))   /* quick return: nothing to do */
      return;
/*
 * For very small problems, avoid overhead of func calls & data copy
 */
   if (N < 50)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
/*
 * Determine the GER kernel to use, and its parameters
 */
   gerk = ATL_GetR1Kern(N-ATL_s1L_NU, ATL_s1L_NU, A, lda, &mu, &nu,
                        &minM, &minN, &alignX, &ALIGNX2A, &alignXt,
                        &FNU, &CacheElts);
/*
 * Determine if we need to copy the vectors
 */
   COPYX = (incX != 1);
   if (!COPYX)  /* may still need to copy due to alignment issues */
   {
/*
 *    ATL_Cachelen is the highest alignment that can be requested, so
 *    make X's % with Cachelen match that of A if you want A & X to have
 *    the same alignment
 */
      if (ALIGNX2A)
      {
         size_t t1 = (size_t) A, t2 = (size_t) X;
         COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
                 (t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
      }
      else if (alignX)
      {
         size_t t1 = (size_t) X;
         COPYX = ((t1/alignX)*alignX != t1);
      }
   }
/* one N-length buffer for xt, plus another for x when it must be copied */
   vp = malloc((ATL_Cachelen+ATL_MulBySize(N))*(1+COPYX));
/*
 * If we cannot allocate enough space, call the simple loop-based HER
 */
   if (!vp)
   {
      Mjoin(PATL,refher)(Uplo, N, alpha, X, incX, A, lda);
      return;
   }
   xt = ATL_AlignPtr(vp);
   if (COPYX)
   {
      x = xt + N+N;                    /* past xt's 2N floats */
      x = ALIGNX2A ? ATL_Align2Ptr(x, A) : ATL_AlignPtr(x);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
   }
   else
      x = (TYPE*) X;
/* xt = conj(alpha*X); skip the multiply when alpha == 1 */
   if (ALP1)
      Mjoin(PATL,copyConj)(N, X, incX, xt, 1);
   else
      Mjoin(PATL,moveConj)(N, calpha, X, incX, xt, 1);
   if (Uplo == AtlasUpper)
      Mjoin(PATL,her_kU)(gerk, N, alpha, x, xt, A, lda);
   else
      Mjoin(PATL,her_kL)(gerk, N, alpha, x, xt, A, lda);
   free(vp);   /* vp is non-NULL here (allocation failure returned above) */
}
int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                        const int M0, const int N, const int K,
                        const SCALAR alpha, const TYPE *A, const int lda,
                        const TYPE *B, const int ldb, const SCALAR beta,
                        TYPE *C, const int ldc)
/*
 * Copy matmul algorithm, copies A and B on-the-fly.
 * If M < 0, allocates only (MB+NB)*KB workspace (minimal-memory mode:
 * blocks are recopied instead of cached).
 * Returns 0 on success, -1 if workspace cannot be allocated.
 */
{
   void *v=NULL;                 /* single malloc for all block workspace */
   const TYPE *a=A;
   TYPE *pA, *pB, *pB0;          /* block-copy areas for A and B */
   MAT2BLK2 A2blk, B2blk;        /* matrix -> block-major copy routines */
   NBMM0 NBmm0, NBmm1, pNBmm0;   /* kernel pointers (full/partial blocks) */
   const int M = (M0 >= 0) ? M0 : -M0;
   int nkblks, nmblks, nnblks, mr, nr, kr, KR, bigK, h, i, j, ZEROC;
   size_t incAk, incBk, incAm, incBn, incAW, incAWp, incBW, incBWp, incW;
/*
 * If both M and N <= NB, and one of them is not full, call BPP, which
 * can sometimes avoid doing cleanup forall cases
 */
   if (M <= MB && N <= NB && (M != MB || N != NB))
      return(Mjoin(PATL,mmBPP)(TA, TB, M, N, K, alpha, A, lda, B, ldb,
                               beta, C, ldc));
/*
 * If these workspace increments are 0, we do JIT NBxNB copies instead of
 * copying entire array/panel.  Don't copy mat if you can't reuse it.
 */
   if (M0 > 0)
   {
      incAW = (N > NB) ? KB*MB : 0;
      incBW = (M > NB) ? KB*NB : 0;
   }
   else /* allocate in minimal space */
      incAW = incBW = 0;
   nmblks = M/MB;
   nnblks = N/NB;
   nkblks = K/KB;
   mr = M - nmblks*MB;    /* partial-block remainders in each dimension */
   nr = N - nnblks*NB;
   kr = K - nkblks*KB;
/*
 * K-loop is special, in that we don't call user cleanup, must explicitly
 * zero, and K-cleanup is typically slower even for generated kernels.
 * Therefore, allow extra leaway for doing extra flops.  Note error is
 * unaffected by any of these extra flops: K-loop has elts zeroed, and
 * multiplying zeros and adding in zeros doesn't add to error
 */
   KR = (kr && kr+4 >= KB) ? KB : kr;
   bigK = nkblks*KB+KR;   /* K rounded up when padding is worthwhile */
/* size the workspace: full panels when reused, single blocks otherwise */
   if (incAW)
   {
      i = MB*bigK;
      incAWp = KB*mr;
   }
   else
   {
      i = MB*KB;
      incAWp = 0;
   }
   if (incBW)
   {
      incBWp = KB*nr;
      incW = bigK*NB;
      i += N*bigK;
   }
   else
   {
      incBWp = incW = 0;
      i += NB*KB;
   }
   i *= sizeof(TYPE);
   if (i <= ATL_MaxMalloc || !(incAW | incBW)) v = malloc(ATL_Cachelen+i);
   if (!v) return(-1);
   pA = ATL_AlignPtr(v);
   pB0 = pA + (incAW ? bigK*MB : KB*MB);
/* choose copy routines & strides from the transpose settings */
   if (TA == AtlasNoTrans)
   {
      A2blk = Mjoin(PATL,gemoveT);
      incAk = lda*KB;
      incAm = MB;
   }
   else
   {
      A2blk = Mjoin(PATL,gemove);
      incAk = KB;
      incAm = MB*lda;
   }
   if (TB == AtlasNoTrans)
   {
      B2blk = Mjoin(PATL,gemove);
      incBk = KB;
      incBn = NB*ldb;
   }
   else
   {
      B2blk = Mjoin(PATL,gemoveT);
      incBk = ldb*KB;
      incBn = NB;
   }
/*
 * See what kernel we're calling
 */
   if ( SCALAR_IS_ONE(beta) )
   {
      NBmm0 = NBmm_b1;
      pNBmm0 = Mjoin(PATL,pNBmm_b1);
   }
   else if ( SCALAR_IS_ZERO(beta) )
   {
      NBmm0 = NBmm_b0;
      pNBmm0 = Mjoin(PATL,pNBmm_b0);
   }
   else
   {
      NBmm0 = NBmm_bX;
      pNBmm0 = Mjoin(PATL,pNBmm_bX);
   }
   KR = (KR == KB) ? KB : 0;   /* KR now: KB if padded, else 0 */
   ZEROC = !KR && SCALAR_IS_ZERO(beta);
   for (i=0; i < nmblks; i++)  /* loop over full M-blocks */
   {
      a = A+i*incAm;
      pB = pB0; /* foreach row-panel of A, start at B's copy space */
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(MB, MB, NB, NB, nkblks, kr, KR, ATL_rone, alpha,
                         beta, a, lda, incAk, pA, incAW, B, ldb, incBk, pB,
                         incBW, C, ldc, A2blk, B2blk, NBmm0, NBmm_b1);
         B += incBn;                /* copy next col panel of B */
         pB += incW;                /* to next col panel of pB */
         a = (incAW ? NULL : a);    /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)  /* partial N-block */
      {
         if (ZEROC) Mjoin(PATL,gezero)(MB, nr, C, ldc);
         Mjoin(PATL,mmK)(MB, MB, nr, nr, nkblks, kr, KR, ATL_rone, alpha,
                         beta, a, lda, incAk, pA, incAW, B, ldb, incBk, pB,
                         incBWp, C, ldc, A2blk, B2blk, pNBmm0,
                         Mjoin(PATL,pNBmm_b1));
      }
      C += MB - nnblks*ldc*NB;      /* next block-row, first block-col */
      if (incBW)
      {
         B = NULL; /* finished copying B */
         incBn = 0;
      }
      else B -= nnblks*incBn;
   }
   if (mr)  /* partial M-block row */
   {
      a = A + nmblks*incAm;
      pB = pB0;
      if ( SCALAR_IS_ONE(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b1);
      else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = Mjoin(PATL,pMBmm_b0);
      else NBmm0 = Mjoin(PATL,pMBmm_bX);
      for (j=nnblks; j; j--)
      {
         Mjoin(PATL,mmK)(mr, mr, NB, NB, nkblks, kr, KR, ATL_rone, alpha,
                         beta, a, lda, incAk, pA, incAWp, B, ldb, incBk, pB,
                         incBW, C, ldc, A2blk, B2blk, NBmm0,
                         Mjoin(PATL,pMBmm_b1));
         B += incBn;                /* copy next col panel of B */
         pB += incW;                /* to next col panel of pB */
         a = (incAW ? NULL : a);    /* reuse row-panel of A if copied */
         C += ldc*NB;
      }
      if (nr)  /* bottom-right partial block */
      {
         if ( SCALAR_IS_ZERO(beta) ) Mjoin(PATL,gezero)(mr, nr, C, ldc);
         Mjoin(PATL,mmK)(mr, mr, nr, nr, nkblks, kr,
                         (incAW | incBW) ? KR:0, ATL_rone, alpha, beta,
                         a, lda, incAk, pA, incAWp, B, ldb, incBk, pB,
                         incBWp, C, ldc, A2blk, B2blk, Mjoin(PATL,pKBmm),
                         Mjoin(PATL,pKBmm));
      }
   }
   free(v);
   return(0);
}
/*
 * Blocked GEMM, IJK loop order: C = alpha*op(A)*op(B) + beta*C, with op()
 * selected by TA/TB.  Operands are copied to NB-blocked storage before the
 * kernel calls; B may be copied in column-panels when memory is scarce.
 * RETURNS: 0 on success, -1 if workspace cannot be allocated,
 *          1 if the operation cannot be done here (caller must fall back;
 *          only possible for the NoTrans/NoTrans low-memory case).
 */
int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
                      const int M, const int N0, const int K,
                      const SCALAR alpha, const TYPE *A, const int lda0,
                      const TYPE *B, const int ldb0, const SCALAR beta,
                      TYPE *C, const int ldc0)
{
   size_t incA, incB, incC;
   const size_t lda=lda0, ldb=ldb0, ldc=ldc0;
   const size_t incK = ATL_MulByNB((size_t)K);  /* elts in one K-panel of blocks */
   int N = N0;                    /* columns of C still to be computed */
   int nMb, nNb, nKb, ib, jb, kb, jb2, h, i, j, k, n;
   void *vA=NULL, *vC=NULL;       /* unaligned malloc ptrs (A ws, C ws) */
   TYPE *pA, *pB, *pC;            /* cache-aligned workspaces */
   MAT2BLK A2blk, B2blk;          /* copy-to-block-format routines */
   PUTBLK putblk;                 /* write-back routine for C workspace */
   NBMM0 NBmm0;                   /* kernel used for the first K-block */

   nMb = ATL_DivByNB(M);          /* # of full NB-blocks in each dimension */
   nNb = ATL_DivByNB(N);
   nKb = ATL_DivByNB(K);
   ib = M - ATL_MulByNB(nMb);     /* partial-block remainders */
   jb = N - ATL_MulByNB(nNb);
   kb = K - ATL_MulByNB(nKb);
/*
 * If K sufficiently large, write to temporary C as safety measure; otherwise
 * write directly to C
 */
   if (nKb < 12)
   {
      putblk = NULL;              /* NULL putblk => kernels write C directly */
      pC = C;
      if ( SCALAR_IS_ONE(beta) ) NBmm0 = NBmm_b1;
      else if ( SCALAR_IS_ZERO(beta) ) NBmm0 = NBmm_b0;
      else NBmm0 = NBmm_bX;
   }
   else
   {
      NBmm0 = NBmm_b0;            /* accumulate into pC with beta=0 ... */
      vC = malloc(ATL_Cachelen + ATL_MulBySize(NBNB));
      if (!vC) return(-1);
      pC = ATL_AlignPtr(vC);
      /* ... and apply the user's beta only during write-back */
      if ( SCALAR_IS_ONE(beta) ) putblk = Mjoin(PATL,putblk_b1);
      else if ( SCALAR_IS_ZERO(beta) ) putblk = Mjoin(PATL,putblk_b0);
      else if ( SCALAR_IS_NONE(beta) ) putblk = Mjoin(PATL,putblk_bn1);
      else putblk = Mjoin(PATL,putblk_bX);
   }
/*
 * Special case if we don't need to copy one or more input matrix
 */
   if (K == NB && TB == AtlasNoTrans && ldb == NB && ATL_DataIsMinAligned(B))
   {
      if (lda == NB && TA == AtlasTrans && SCALAR_IS_ONE(alpha) &&
          ATL_DataIsMinAligned(A))
      {  /* A already in acceptable form: use it in place (A=NULL flags this) */
         i = NBNB;                /* NOTE(review): dead store, value unused */
         pA = (TYPE *) A;
         A = NULL;
         A2blk = NULL;
         incA = 0;
      }
      else
      {  /* must copy A one K-panel at a time */
         vA = malloc(ATL_Cachelen + ATL_MulBySize(incK));
         if (!vA)
         {
            free(vC);             /* free(NULL) is safe when vC unallocated */
            return(-1);
         }
         pA = ATL_AlignPtr(vA);
         if (TA == AtlasNoTrans)
         {
            incA = NB;
            if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,row2blkT_a1);
            else A2blk = Mjoin(PATL,row2blkT_aX);
         }
         else
         {
            incA = ATL_MulByNB(lda);
            if ( SCALAR_IS_ONE(alpha) ) A2blk = Mjoin(PATL,col2blk_a1);
            else A2blk = Mjoin(PATL,col2blk_aX);
         }
      }
      Mjoin(PATL,mmIJK2)(K, nMb, nNb, nKb, ib, jb, kb, alpha, A, lda, pA,
                         incA, A2blk, B, beta, C, ldc, pC, putblk, NBmm0);
      if (vA) free(vA);
      if (vC) free(vC);
      return(0);
   }
/*
 * General case: try to grab space for all of B plus one K-panel of A
 */
   i = ATL_Cachelen + ATL_MulBySize(N*K + incK);
   if (i <= ATL_MaxMalloc) vA = malloc(i);
   if (!vA)
   {  /* not enough memory for all of B: copy B in n-column panels instead */
      if (TA == AtlasNoTrans && TB == AtlasNoTrans)
      {
         if (vC) free(vC);
         return(1);               /* caller should use a different algorithm */
      }
      if (jb) n = nNb + 1;
      else n = nNb;
      /* halve/partition the panel count until an allocation succeeds */
      for (j=2; !vA; j++)
      {
         k = n / j;
         if (k < 1) break;
         if (k*j < n) k++;
         h = ATL_Cachelen + ATL_MulBySize((k+1)*incK);
         if (h <= ATL_MaxMalloc) vA = malloc(h);
      }
      if (!vA)
      {
         if (vC) free(vC);
         return(-1);
      }
      n = ATL_MulByNB(k);         /* columns per B-panel */
      jb2 = 0;                    /* partial N-block deferred to last panel */
   }
   else
   {
      jb2 = jb;
      k = nNb;
      n = N;
   }
   pA = ATL_AlignPtr(vA);
   if (TB == AtlasNoTrans)
   {
      incB = ldb*n;
      if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,col2blk2_a1);
      else B2blk = Mjoin(PATL,col2blk2_aX);
   }
   else
   {
      incB = n;
      if ( SCALAR_IS_ONE(alpha) ) B2blk = Mjoin(PATL,row2blkT2_a1);
      else B2blk = Mjoin(PATL,row2blkT2_aX);
   }
   /* alpha was applied during the B copy, so A is always copied with a1 */
   if (TA == AtlasNoTrans)
   {
      incA = NB;
      A2blk = Mjoin(PATL,row2blkT_a1);
   }
   else
   {
      incA = ATL_MulByNB(lda);
      A2blk = Mjoin(PATL,col2blk_a1);
   }
   incC = ldc*n;
   pB = pA + incK;                /* B workspace follows A's K-panel */
/*
 * Loop over column-panels of B/C until all N columns are done
 */
   do
   {
      if (TB == AtlasNoTrans) B2blk(K, n, B, ldb, pB, alpha);
      else B2blk(n, K, B, ldb, pB, alpha);
      Mjoin(PATL,mmIJK2)(K, nMb, k, nKb, ib, jb2, kb, alpha, A, lda, pA,
                         incA, A2blk, pB, beta, C, ldc, pC, putblk, NBmm0);
      N -= n;
      nNb -= k;
      if (N < n)                  /* last panel picks up partial N-block */
      {
         jb2 = jb;
         n = N;
         k = nNb;
      }
      C += incC;
      B += incB;
      if (!putblk) pC = C;        /* direct-write mode tracks C */
   }
   while (N);
   if (vC) free(vC);
   free(vA);
   return(0);
}
int ATL_gelqf(ATL_CINT M, ATL_CINT N, TYPE *A, ATL_CINT lda, TYPE *TAU, TYPE *WORK, ATL_CINT LWORK) /* * This is the C translation of the standard LAPACK Fortran routine: * SUBROUTINE gelqf( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * * ATL_gelqf.c : * int ATL_gelqf(int M, int N, TYPE *A, int LDA, TYPE *TAU, * TYPE *WORK, int LWORK) * * Purpose * ======= * * ATL_gelqf computes an LQ factorization of a real/complex M-by-N matrix A: * A = L * Q. * * Compared to LAPACK, here, a recursive panel factorization is implemented. * Refer to ATL_gelqr.c andd ATL_larft.c for details. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) array, dimension (LDA,N) * On entry, the M-by-N matrix A. * On exit, the elements on and above the diagonal of the array * contain the min(M,N)-by-N upper trapezoidal matrix R (R is * upper triangular if m >= n); the elements below the diagonal, * with the array TAU, represent the orthogonal matrix Q as a * product of min(m,n) elementary reflectors (see Further * Details). * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * TAU (output) array, dimension (min(M,N)) * The scalar factors of the elementary reflectors (see Further * Details). * * WORK (workspace/output) array, dimension (MAX(1,LWORK)) * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. * * LWORK (input) INTEGER * The dimension of the array WORK. LWORK >= max(1,N). * For optimum performance LWORK >= N*NB, where NB is * the optimal blocksize. * * If LWORK = -1, then a workspace query is assumed; the routine * only calculates the optimal size of the WORK array, returns * this value as the first entry of the WORK array, and no error * message related to LWORK is issued . 
* * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Further Details * =============== * * The matrix Q is represented as a product of elementary reflectors * * Q = H(1) H(2) . . . H(k), where k = min(m,n). * * Each H(i) has the form * * H(i) = I - tau * v * v' (For Real precision) * H(i) = I - tau * v * conjugate(v)' (For Complex precision) * * where tau is a real/complex scalar, and v is a real/complex vector with * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), * and tau in TAU(i). * */ { ATL_CINT minMN = Mmin(M, N), maxMN = Mmax(M, N); ATL_INT n, nb, j; TYPE *ws_LQ2, *ws_T, *ws_larfb; /* Workspace level 2, T, larfb. */ void *vp=NULL; /* For transpose function, may need type-appropriate 'ONE' for alpha. */ #ifdef TREAL const TYPE ONE = ATL_rone; #else const TYPE ONE[2] = {ATL_rone, ATL_rzero}; #endif TYPE *ws_CP=NULL, *ws_CPRaw=NULL; ATL_INT ldCP; #if defined(ATL_TUNING) /*-------------------------------------------------------------------------*/ /* For tuning recursion crossover points, the blocking factor is set by */ /* la2xover, the tuning program for that purpose. */ /*-------------------------------------------------------------------------*/ if (ATL_PanelTune) nb=ATL_PanelTune; else #endif /* ATL_TUNING */ nb = clapack_ilaenv(LAIS_OPT_NB, LAgeqrf, MYOPT+LALeft+LALower, M, N,-1,-1); /* * If it is a workspace query, return the size of work required. * wrksz = wrksz of ATL_larfb + ATL_larft + ATL_gelq2 */ if (LWORK < 0) { *WORK = ( maxMN*nb + nb*nb + maxMN ) ; return(0); } else if (M < 1 || N < 1) /* quick return if no work to do */ return(0); /* * LQ is the transpose of QR: We use this to go from row-major LQ to * col-major QR, typically faster. Here, if we are square and large, * we transpose the whole matrix in-place and then transpose it back. * This should be a tunable parameter; perhaps if the matrix fits in * L1 or L2? (Note by Tony C, short on time to conduct tuning). 
*/ if (M == N && N >= 128) { Mjoin(PATL,sqtrans)(N, A, lda); n = ATL_geqrf(M, N, A, lda, TAU, WORK, LWORK); Mjoin(PATL,sqtrans)(N, A, lda); /* Take the conjugate for Complex TAU. */ #ifdef TCPLX ATL_INT i; for (i=1; i<(minMN<<1); i+=2) *(TAU+i) = 0.-*(TAU+i); /* Negate imaginary part. */ #endif return(n); } /* * If the user gives us too little space, see if we can allocate it ourselves */ else if (LWORK < (maxMN*nb + nb*nb + maxMN)) { vp = malloc(ATL_MulBySize(maxMN*nb + nb*nb + maxMN) + ATL_Cachelen); if (!vp) return(-7); WORK = ATL_AlignPtr(vp); } /* * Assign workspace areas for ATL_larft, ATL_gelq2, ATL_larfb */ ws_T = WORK; /* T at begining of work */ ws_LQ2 = WORK +(nb SHIFT)*nb; /* After T Work space */ ws_larfb = ws_LQ2 + (maxMN SHIFT); /* After workspace for T and LQ2 */ /* * Leave one iteration to be done outside loop, so we don't build T * Any loop iterations are therefore known to be of size nb (no partial blocks) */ n = (minMN / nb) * nb; if (n == minMN) n -= Mmin(nb, minMN); /* when n is a multiple of nb, reduce by nb */ #if !defined(ATL_USEPTHREADS) /* If no PCA, try to copy up front. */ j = M - n; j = Mmax(nb, j); ldCP = (N&7) ? (((N+7)>>3)<<3) : N; ws_CPRaw = malloc(ATL_MulBySize(ldCP)*j + ATL_Cachelen); if (ws_CPRaw) ws_CP=ATL_AlignPtr(ws_CPRaw); /* Align if malloced. */ #endif /* Serial Mode */ for (j=0; j < n; j += nb) { #if !defined(ATL_USEPTHREADS) /* If no PCA it won't copy. Try it here. */ /* If we got our copy workspace, transpose panel before recursion. */ if (ws_CP) /* If workspace exists. */ { int ci, cj; /* for conjugation. */ ldCP = N-j; if (ldCP&7) ldCP = ((ldCP+7)>>3)<<3; Mjoin(PATL,gemoveT)(N-j, nb, ONE, A+(j SHIFT)*(lda+1), lda, ws_CP, ldCP); ATL_assert(!ATL_geqrr(N-j, nb, ws_CP, ldCP, TAU+(j SHIFT), ws_LQ2, ws_T, nb, ws_larfb, 1)); Mjoin(PATL,gemoveT)(nb, N-j, ONE, ws_CP, ldCP, A+(j SHIFT)*(lda+1), lda); #if defined(TCPLX) /* conj upTri T, TAU. */ for (cj=0; cj<nb; cj++) /* column loop... 
*/ { TAU[((j+cj) SHIFT)+1] = 0.-TAU[((j+cj) SHIFT)+1]; for (ci=0; ci<=cj; ci++) /* row loop... */ ws_T[((ci+cj*nb) SHIFT)+1] = 0.-ws_T[((ci+cj*nb) SHIFT)+1]; } #endif /* defined(TCPLX) */ } else /* copy workspace was not allocated, use native. */ #endif /* Serial Mode (No PCA) */ { ATL_assert(!ATL_gelqr(nb, N-j, A+(j SHIFT)*(lda+1), lda, TAU+(j SHIFT), ws_LQ2, ws_T, nb, ws_larfb, 1)); } if (j+nb < M) /* if there are more cols left to bottom, update them */ { /* * ====================================================================== * Form the triangular factor of the block reflector * After gelqr, ws_T contains 'T', the nb x nb triangular factor 'T' * of the block reflector. It is an output used in the next call, dlarfb. * H = Id - Y'*T*Y, with Id=(N-j)x(N-j), Y=(N-j)xNB. * * The ws_T array used above is an input to dlarfb; it is 'T' in * that routine, and LDT x K (translates here to LDWORK x NB). * WORK is an LDWORK x NB workspace (not input or output). * ====================================================================== */ ATL_larfb(CblasRight, CblasNoTrans, LAForward, LARowStore, M-j-nb, N-j, nb, A+(j SHIFT)*(lda+1), lda, ws_T, nb, A+((j SHIFT)*(lda+1))+(nb SHIFT), lda, ws_larfb, M); } }
void Mjoin( PATL, tbsv )
(
   const enum ATLAS_UPLO      UPLO,
   const enum ATLAS_TRANS     TRANS,
   const enum ATLAS_DIAG      DIAG,
   const int                  N,
   const int                  K,
   const TYPE                 * A,
   const int                  LDA,
   TYPE                       * X,
   const int                  INCX
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, tbsv ) solves one of the systems of equations
 *
 *    A * x = b,   or   conjg( A  ) * x = b,   or
 *
 *    A'* x = b,   or   conjg( A' ) * x = b,
 *
 * where b and x are n-element vectors and  A is an n by n unit, or non-
 * unit, upper or lower triangular band matrix, with (k+1) diagonals.
 *
 * No test for singularity or near-singularity is included in this
 * routine. Such tests must be performed before calling this routine.
 *
 * For a more detailed description of the arguments of this function,
 * see the reference implementation in the ATLAS/src/blas/reference
 * directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Executable Statements ..
 */
   if( N == 0 ) return;
/*
 * NOTE(review): this routine previously contained a full blocked
 * implementation (increment-1 copy of X plus dispatch to the
 * tbsvUN/LN/UC/LC/UT/LT/UH/LH kernels) that was unreachable dead code:
 * an unconditional "return" directly after the reftbsv call below meant
 * it could never execute.  The dead code has been removed; the routine
 * simply delegates to the reference implementation.  Recover the blocked
 * path from version control if it is ever to be re-enabled.
 */
   Mjoin(PATL,reftbsv)(UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX);
   return;
/*
 * End of Mjoin( PATL, tbsv )
 */
}
void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const int K, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY) { int info = 2000; const enum CBLAS_UPLO ruplo = (Uplo == CblasUpper) ? CblasLower : CblasUpper; void *vx; double *X0, *x = (double*) X; double *y = Y; const double *alp=alpha; const double *bet=beta; double calpha[2], cbeta[2]; const double one[2] = {ATL_rone, ATL_rzero}; calpha[0] = *alp; calpha[1] = -alp[1]; cbeta[0] = *bet; cbeta[1] = -bet[1]; #ifndef NoCblasErrorChecks if (Order != CblasColMajor && Order != CblasRowMajor) info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d", CblasRowMajor, CblasColMajor, Order); if (Uplo != CblasUpper && Uplo != CblasLower) info = cblas_errprn(2, info, "Uplo must be %d or %d, but is set to %d", CblasUpper, CblasLower, Uplo); if (N < 0) info = cblas_errprn(3, info, "N cannot be less than zero; is set to %d.", N); if (K < 0) info = cblas_errprn(4, info, "Valid K: 0 < K < N; K=%d, N=%d.", K, N); if (lda < K+1) info = cblas_errprn(7, info, "lda cannot be less than K+1; K=%d, lda=%d\n", K, lda); if (!incX) info = cblas_errprn(9, info, "incX cannot be zero; is set to %d.", incX); if (!incY) info = cblas_errprn(12, info, "incY cannot be zero; is set to %d.", incY); if (info != 2000) { cblas_xerbla(info, "cblas_zhbmv", ""); return; } #endif if (incX < 0) x += (1-N)*incX<<1; if (incY < 0) y += (1-N)*incY<<1; if (Order == CblasColMajor) ATL_zhbmv(Uplo, N, K, alpha, A, lda, x, incX, beta, y, incY); else { vx = malloc(ATL_Cachelen + 2*N*sizeof(double)); ATL_assert(vx); X0 = x; x = ATL_AlignPtr(vx); ATL_zmoveConj(N, calpha, X0, incX, x, 1); if (*bet != ATL_rzero || bet[1] != ATL_rzero) { ATL_zscalConj(N, cbeta, y, incY); ATL_zhbmv(ruplo, N, K, one, A, lda, x, 1, one, y, incY); } else ATL_zhbmv(ruplo, N, K, one, A, lda, x, 1, beta, y, incY); free(vx); ATL_zscalConj(N, one, y, incY); } }
void Mjoin( PATL, hpmv )
(
   const enum ATLAS_UPLO      UPLO,
   const int                  N,
   const SCALAR               ALPHA,
   const TYPE                 * A,
   const TYPE                 * X,
   const int                  INCX,
   const SCALAR               BETA,
   TYPE                       * Y,
   const int                  INCY
)
{
/*
 * Purpose
 * =======
 *
 * Mjoin( PATL, hpmv ) performs the matrix-vector operation
 *
 *    y := alpha * A * x + beta * y,
 *
 * where alpha and beta are scalars, x and y are n-element vectors and A
 * is an n by n Hermitian matrix, supplied in packed form.
 *
 * This is a blocked version of the algorithm. For a more detailed des-
 * cription of the arguments of this function, see the reference imple-
 * mentation in the ATLAS/src/blas/reference directory.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   /* gpmv0: diag-block-adjacent panel kernel (carries beta on first pass);
    * gpmv1: same kernel with beta fixed to one (used after first pass);
    * gpmvN: NoTrans panel kernel accumulating into the diagonal block's y. */
   void (*gpmv0)( const int, const int, const SCALAR, const TYPE *,
                  const int, const TYPE *, const int, const SCALAR,
                  TYPE *, const int );
   void (*gpmv1)( const int, const int, const SCALAR, const TYPE *,
                  const int, const TYPE *, const int, const SCALAR,
                  TYPE *, const int );
   void (*gpmvN)( const int, const int, const SCALAR, const TYPE *,
                  const int, const TYPE *, const int, const SCALAR,
                  TYPE *, const int );
#ifdef TREAL
   TYPE alphaY, beta0;
#define one ATL_rone
#define zero ATL_rzero
#else
   const TYPE * alphaY, * beta0;
   const TYPE one [2] = { ATL_rone,  ATL_rzero },
              zero[2] = { ATL_rzero, ATL_rzero };
#endif
   void * vx = NULL, * vy = NULL;   /* unaligned ptrs for x/y workspaces */
   TYPE * A0, * A1, * x, * x0, * x1, * y, * y00, * y0, * y1;
   int incXY, incXY1, j, jb, lda, lda0, lda1, mb, mb1, n, nb;
/* ..
 * .. Executable Statements ..
 *
 */
   if( N == 0 ) return;

   if( SCALAR_IS_ZERO( ALPHA ) )
   {  /* alpha == 0: result is just the y scaling */
      if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY );
      return;
   }
/*
 * If x is strided (or alpha must be folded in), copy alpha*x to unit-stride
 * workspace so the inner kernels always see inc=1 vectors.
 */
   if( ( INCX != 1 ) || ( ( INCY == 1 ) && !( SCALAR_IS_ONE( ALPHA ) ) ) )
   {
      vx = (void *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vx );
      x = ATL_AlignPtr( vx );
      Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 );
      alphaY = one;   /* alpha already applied to x */
   }
   else { x = (TYPE *)(X); alphaY = ALPHA; }
/*
 * If y is strided or alpha still pending, accumulate into a zeroed
 * workspace and combine with the real y via axpby at the end.
 */
   if( ( INCY != 1 ) || !( SCALAR_IS_ONE( alphaY ) ) )
   {
      vy = malloc( ATL_Cachelen + ATL_MulBySize( N ) );
      ATL_assert( vy );
      y00 = y = ATL_AlignPtr( vy );
      beta0 = zero;
   }
   else { y00 = y = (TYPE *)(Y); beta0 = BETA; }

   ATL_GetPartSPMV( A, N, &mb, &nb );   /* blocking factors for packed walk */
   mb1    = N - ( ( N - 1 ) / mb ) * mb;   /* size of the final diag block */
   incXY1 = (nb SHIFT);

   if( UPLO == AtlasUpper )
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvUC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvUC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvUN_a1_x1_b1_y1 );

      lda = 1; lda0 = lda; A0 = (TYPE *)(A);
      MUpnext( mb, A0, lda0 );   /* A0/lda0 track the next diagonal block */
      incXY = (mb SHIFT);
      x0 = x + incXY; y0 = y + incXY;
      /* walk diagonal blocks top-down; for each, apply the Hermitian
       * diagonal block then the off-diagonal panels to its right */
      for( n  = N - mb; n > 0; n -= mb,
           x0 += incXY, x += incXY, y0 += incXY, y += incXY )
      {
         Mjoin( PATL, hpmvU )( mb, A, lda, x, beta0, y );
         for( j = 0, lda1 = lda0, A1 = A0 - (mb SHIFT), x1 = x0, y1 = y0;
              j < n; j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            /* conj-transpose contribution to the panel's y1, and the
             * NoTrans contribution back into this block's y */
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MUpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1;   /* y already initialized */
         lda = lda0; A = A0; MUpnext( mb, A0, lda0 );
      }
      Mjoin( PATL, hpmvU )( mb1, A, lda, x, beta0, y );   /* last diag block */
   }
   else
   {
      if(      SCALAR_IS_ZERO( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b0_y1 );
      else if( SCALAR_IS_ONE ( beta0 ) )
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      else
         gpmv0 = Mjoin( PATL, gpmvLC_a1_x1_bX_y1 );
      gpmv1 = Mjoin( PATL, gpmvLC_a1_x1_b1_y1 );
      gpmvN = Mjoin( PATL, gpmvLN_a1_x1_b1_y1 );

      lda = N; lda0 = lda; A0 = (TYPE *)(A);
      MLpnext( N, A, lda );   /* advance A past the whole lower packing */
      incXY = (mb SHIFT);
      x0 = x; y0 = y;
      /* walk diagonal blocks bottom-up */
      for( n  = N - mb, x += ((N-mb) SHIFT), y += ((N-mb) SHIFT); n > 0;
           n -= mb, x -= incXY, y -= incXY )
      {
         MLpprev( mb, A, lda );
         Mjoin( PATL, hpmvL )( mb, A, lda, x, beta0, y );
         for( j = 0, lda1 = lda0, A1 = A0 + (n SHIFT), x1 = x0, y1 = y0;
              j < n; j += nb, x1 += incXY1, y1 += incXY1 )
         {
            jb = n - j; jb = Mmin( jb, nb );
            gpmv0( jb, mb, one, A1, lda1, x,  1, beta0, y1, 1 );
            gpmvN( mb, jb, one, A1, lda1, x1, 1, one,   y,  1 );
            MLpnext( jb, A1, lda1 ); A1 -= (jb SHIFT);
         }
         beta0 = one; gpmv0 = gpmv1;
      }
      Mjoin( PATL, hpmvL )( mb1, A0, lda0, x0, beta0, y0 );
   }

   if( vx ) free( vx );
   if( vy )
   {  /* fold pending alpha and the user's beta into the real y */
      Mjoin( PATL, axpby )( N, alphaY, y00, 1, BETA, Y, INCY );
      free( vy );
   }
/*
 * End of Mjoin( PATL, hpmv )
 */
}
int Mjoin(PATL,NCmmJIK_c)
   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
    const int M, const int N, const int K, const SCALAR alpha,
    const TYPE *A, const int lda, const TYPE *B, const int ldb,
    const SCALAR beta, TYPE *C, const int ldc)
/*
 * JIK loop-ordered matmul with no matrix copy.  Each MBxNB block of C is
 * accumulated in an aligned scratch block cp and then combined into C by
 * geadd (which applies alpha and beta).  Partial blocks in M/N/K are
 * handled by cleanup kernels.  RETURNS: 0 on success.
 */
{
   const int Mb = M / MB, Nb = N / NB, Kb = K / KB;   /* full block counts */
   const int mr = M - Mb*MB, nr = N - Nb*NB, kr = K - Kb*KB; /* remainders */
   int incAk, incAm, incAn, incBk, incBm, incBn;   /* ptr strides per loop */
   #define incCm MB
   const int incCn = ldc*NB - M + mr;
   int i, j, k;
   const TYPE *a=A, *b=B;
   TYPE *c=C;
   TYPE btmp;      /* NOTE(review): unused local */
   void *vp;       /* unaligned malloc ptr for cp */
   TYPE *cp;       /* aligned MBxNB accumulation block */
   /* geadd: C-block update; mm_bX/mm_b1: full-block kernels (beta=0/1);
    * mmcu: K-cleanup kernel; mm_fixedKcu: fixed-K cleanup kernel */
   void (*geadd)(const int M, const int N, const SCALAR scalar,
                 const TYPE *A, const int lda, const SCALAR beta,
                 TYPE *C, const int ldc);
   void (*mm_bX)(const int M, const int N, const int K, const SCALAR alpha,
                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
                 const SCALAR beta, TYPE *C, const int ldc);
   void (*mm_b1)(const int M, const int N, const int K, const SCALAR alpha,
                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
                 const SCALAR beta, TYPE *C, const int ldc);
   void (*mmcu) (const int M, const int N, const int K, const SCALAR alpha,
                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
                 const SCALAR beta, TYPE *C, const int ldc);
   void (*mm_fixedKcu)(const int M, const int N, const int K,
                       const SCALAR alpha, const TYPE *A, const int lda,
                       const TYPE *B, const int ldb, const SCALAR beta,
                       TYPE *C, const int ldc);
/*
 * Pick kernels and pointer strides from the transpose settings
 */
   if (TA == AtlasNoTrans)
   {
      if (TB == AtlasNoTrans)
      {
         mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,NN),0x0x0),_a1_b0);
         mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,NN),0x0x0),_a1_b1);
         mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),NN),0x0x0_aX_bX);
         mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),NN),0x0x0_aX_bX);
      }
      else
      {
         mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_b0);
         mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_b1);
         mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),NT),0x0x0_aX_bX);
         mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),NT),0x0x0_aX_bX);
      }
      incAk = lda * KB;
      incAm = MB - Kb * incAk;   /* rewinds K, advances one M-block */
      incAn = -Mb * MB;          /* rewinds M for the next N-panel */
   }
   else
   {
      if (TB == AtlasNoTrans)
      {
         mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b0);
         mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b1);
         mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TN),0x0x0_aX_bX);
         mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TN),0x0x0_aX_bX);
      }
      else
      {
         mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b0);
         mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1);
         mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TT),0x0x0_aX_bX);
         mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TT),0x0x0_aX_bX);
      }
      incAk = KB;
      incAm = lda*MB - Kb*KB;
      incAn = -lda*MB*Mb;
   }
   if (TB == AtlasNoTrans)
   {
      incBk = KB;
      incBm = -KB*Kb;            /* rewinds K for the next M-block */
      incBn = ldb*NB;
   }
   else
   {
      incBk = KB*ldb;
      incBm = -Kb * incBk;
      incBn = NB;
   }
/*
 * geadd variant applies user's alpha/beta when merging cp into C
 */
   if (alpha == ATL_rone)
   {
      if (beta == ATL_rzero) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_b0);
      else if (beta == ATL_rone)
         geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_b1);
      else geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_bX);
   }
   else if (beta == ATL_rzero)
      geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_b0);
   else if (beta == ATL_rone) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_b1);
   else geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_bX);

   vp = malloc(ATL_Cachelen + ATL_MulBySize(MB * NB));
   ATL_assert(vp);
   cp = ATL_AlignPtr(vp);
   if (mr || nr || kr)   /* zero scratch so partial blocks read clean data */
      for (j=MB*NB, i=0; i != j; i++) cp[i] = ATL_rzero;
/*
 * Main JIK loops over full MBxNB blocks of C
 */
   for (j=Nb; j; j--, a += incAn, b += incBn, c += incCn)
   {
      for (i=Mb; i; i--, a += incAm, b += incBm, c += incCm)
      {
         if (Kb)
         {
            mm_bX(MB, NB, KB, ATL_rone, a, lda, b, ldb, ATL_rzero, cp, MB);
            a += incAk;
            b += incBk;
            for (k=Kb-1; k; k--, a += incAk, b += incBk)
               mm_b1(MB, NB, KB, ATL_rone, a, lda, b, ldb, ATL_rone, cp, MB);
            if (kr)
               mmcu(MB, NB, kr, ATL_rone, a, lda, b, ldb, ATL_rone, cp, MB);
         }
         else if (kr)
         {
            Mjoin(PATL,zero)(MB*NB, cp, 1); /* kill NaN/INF from last time */
            mmcu(MB, NB, kr, ATL_rone, a, lda, b, ldb, ATL_rzero, cp, MB);
         }
         geadd(MB, NB, alpha, cp, MB, beta, c, ldc);
      }
   }
/*
 * Partial-M rows (excluding the mr x nr corner) go through NCmmIJK
 */
   if (mr && N != nr)
      ATL_assert(Mjoin(PATL,NCmmIJK)(TA, TB, mr, N-nr, K, alpha,
                                     A+Mb*(incAm+Kb*incAk), lda, B, ldb,
                                     beta, C+Mb*MB, ldc) ==0);
   if (nr)   /* partial-N column panel */
   {
      for (i=Mb; i; i--, a += incAm, b += incBm, c += incCm)
      {
         Mjoin(PATL,zero)(MB*nr, cp, 1); /* kill NaN and INF from last time */
         if (Kb)
         {
            mm_fixedKcu(MB, nr, KB, ATL_rone, a, lda, b, ldb, ATL_rzero,
                        cp, MB);
            a += incAk;
            b += incBk;
            for (k=Kb-1; k; k--, a += incAk, b += incBk)
               mm_fixedKcu(MB, nr, KB, ATL_rone, a, lda, b, ldb, ATL_rone,
                           cp, MB);
            if (kr)
               mmcu(MB, nr, kr, ATL_rone, a, lda, b, ldb, ATL_rone, cp, MB);
         }
         else if (kr)
            mmcu(MB, nr, kr, ATL_rone, a, lda, b, ldb, ATL_rzero, cp, MB);
         geadd(MB, nr, alpha, cp, MB, beta, c, ldc);
      }
      if (mr)  /* cleanup small mr x nr block of C */
      {
         c = C + Mb*MB + ldc*Nb*NB;
         a = A + Mb*(incAm+Kb*incAk);
         b = B + Nb*( incBn+(Mb*(incBm+Kb*incBk)) );
         Mjoin(PATL,zero)(MB*nr, cp, 1); /* kill NaN and INF from last time */
         if (Kb)
         {
            mm_fixedKcu(mr, nr, KB, ATL_rone, a, lda, b, ldb, ATL_rzero,
                        cp, MB);
            a += incAk;
            b += incBk;
            for (k=Kb-1; k; k--, a += incAk, b += incBk)
               mm_fixedKcu(mr, nr, KB, ATL_rone, a, lda, b, ldb, ATL_rone,
                           cp, MB);
            if (kr)
               mmcu(mr, nr, kr, ATL_rone, a, lda, b, ldb, ATL_rone, cp, MB);
         }
         else if (kr)
            mmcu(mr, nr, kr, ATL_rone, a, lda, b, ldb, ATL_rzero, cp, MB);
         geadd(mr, nr, alpha, cp, MB, beta, c, ldc);
      }
   }
   free(vp);
   return(0);
}
void Mjoin(PATL,CtrsmKL) (enum ATLAS_UPLO Uplo, enum ATLAS_TRANS Trans, enum ATLAS_DIAG Diag, const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda, TYPE *B, const int ldb) #endif { TYPE tmp[2], ra, ia; void *vp; TYPE *a; if (N > 0) { if (M > 1) { vp = malloc(ATL_Cachelen + ATL_MulBySize(M)*M); ATL_assert(vp); a = ATL_AlignPtr(vp); Diag = trsmcopy(Uplo, Trans, Diag, M, alpha, A, lda, a); if (Trans != AtlasNoTrans) { if (Uplo == AtlasLower) Uplo = AtlasUpper; else Uplo = AtlasLower; } switch(M) { case 2: if (Uplo == AtlasLower) trsmLL_2(N, a, B, ldb); else trsmLU_2(N, a, B, ldb); break; case 3: if (Uplo == AtlasLower) trsmLL_3(N, a, B, ldb); else trsmLU_3(N, a, B, ldb); break; case 4: if (Uplo == AtlasLower) trsmLL_4(N, a, B, ldb); else trsmLU_4(N, a, B, ldb); break; default: /* this crap should never be used */ tmp[0] = ATL_rone; tmp[1] = ATL_rzero; Mjoin(PATL,cplxinvert)(M, a, M+M+2, a, M+M+2); Mjoin(PATL,reftrsm)(AtlasLeft, Uplo, AtlasNoTrans, Diag, M, N, tmp, a, M, B, ldb); } free(vp); } else if (M == 1) { if (Diag == AtlasUnit) #ifdef Right_ Mjoin(PATL,scal)(N, alpha, B, 1); #else Mjoin(PATL,scal)(N, alpha, B, ldb); #endif else { tmp[0] = A[0]; if (Trans != AtlasConjTrans) tmp[1] = A[1]; else tmp[1] = -A[1]; Mjoin(PATL,cplxinvert)(1, tmp, 2, tmp, 2); /* safe cplx invers */ ra = tmp[0]; ia = tmp[1]; tmp[0] = *alpha * ra - alpha[1] * ia; tmp[1] = *alpha * ia + alpha[1] * ra; #ifdef Right_ Mjoin(PATL,scal)(N, tmp, B, 1); #else Mjoin(PATL,scal)(N, tmp, B, ldb); #endif } } }
/*
 * Times one GEMM test case: runs the trusted and the test GEMM nreps times
 * over a ring of matrices sized to defeat the cache, prints a timing line
 * for each, and returns 1.  beta is flipped between beta and 1/beta on
 * each ring wrap so repeated accumulation into C stays bounded.
 * NOTE(review): this routine only times; it does not verify results
 * (several result-checking locals are declared but unused).
 */
int mmcase0(int MFLOP, int CACHESIZE, char TA, char TB, int M, int N, int K,
            SCALAR alpha, int lda, int ldb, SCALAR beta, int ldc)
{
   char *pc;
#ifdef TREAL
   char *form="%4d   %c   %c %4d %4d %4d  %5.1f  %5.1f %6.2f %5.1f  %5.2f  %3s\n";
   #define MALPH alpha
   #define MBETA beta
   TYPE betinv, bet=beta;
#else
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d   %c   %c %4d %4d %4d  %5.1f %5.1f  %5.1f %5.1f %6.2f %6.1f  %4.2f  %3s\n";
   TYPE betinv[2], *bet=beta;
#endif
   int nreps, incA, incB, incC, inc, nmat, k;
   TYPE *c, *C, *a, *A, *b, *B, *st;
   int ii, jj, i, j=0, PASSED, nerrs;   /* NOTE(review): several unused */
   double t0, t1, t2, t3, mflop, mf, mops;
   TYPE maxval, f1, ferr;               /* NOTE(review): unused */
   static TYPE feps=0.0;
   static int itst=1;                   /* running test number across calls */
   enum ATLAS_TRANS TAc, TBc;
   void *vp;

/*
 * betinv = 1/beta (complex division done stably by scaling with the
 * larger-magnitude component)
 */
#ifdef TCPLX
   if (*beta == 0.0 && beta[1] == 0.0) betinv[0] = betinv[1] = 0.0;
   else if (beta[1] == 0.0) { betinv[0] = 1 / *beta; betinv[1] = 0.0; }
   else
   {
      t0 = *beta;
      t1 = beta[1];
      if (Mabs(t1) <= Mabs(t0))
      {
         t2 = t1 / t0;
         betinv[0] = t0 = 1.0 / (t0 + t1*t2);
         betinv[1] = -t0 * t2;
      }
      else
      {
         t2 = t0 / t1;
         betinv[1] = t0 = -1.0 / (t1 + t0*t2);
         betinv[0] = -t2 * t0;
      }
   }
   mops = ( ((8.0*M)*N)*K ) / 1000000.0;   /* complex GEMM flop count */
#else
   if (beta != 0.0) betinv = 1.0 / beta;
   else betinv = beta;
   mops = ( ((2.0*M)*N)*K ) / 1000000.0;   /* real GEMM flop count */
#endif
   nreps = MFLOP / mops;   /* repetitions needed to reach ~MFLOP Mflops */
   if (nreps < 1) nreps = 1;

   if (TA == 'n' || TA == 'N') { TAc = AtlasNoTrans; incA = lda * K; }
   else
   {
      if (TA == 'c' || TA == 'C') TAc = AtlasConjTrans;
      else TAc = AtlasTrans;
      incA = lda * M;
   }
   if (TB == 'n' || TB == 'N') { incB = ldb * N; TBc = AtlasNoTrans; }
   else
   {
      incB = ldb * K;
      if (TB == 'c' || TB == 'C') TBc = AtlasConjTrans;
      else TBc = AtlasTrans;
   }
   incC = ldc*N;
   inc = incA + incB + incC;
   i = M*K + K*N + M*N;  /* amount of inc actually referenced */
/*
 * nmat operand sets are cycled through so each rep starts cache-cold.
 * This is a hack; change to use of flushcache instead.
 */
   nmat = ((CACHESIZE/ATL_sizeof) + i)/i;
   vp = malloc(ATL_MulBySize(nmat*inc)+ATL_Cachelen);
   ATL_assert(vp);
   C = c = ATL_AlignPtr(vp);
   a = A = C + incC;
   b = B = A + incA;
   st = C + nmat*inc;   /* one past the last operand set */
   matgen(inc, nmat, C, inc, M*N);
#ifdef DEBUG
   printmat("A0", M, K, A, lda);
   printmat("B0", K, N, B, ldb);
   printmat("C0", M, N, C, ldc);
#endif
/*
 * Time the trusted GEMM
 */
   t0 = time00();
   for (k=nreps; k; k--)
   {
      trusted_gemm(TAc, TBc, M, N, K, alpha, a, lda, b, ldb, bet, c, ldc);
      c += inc; a += inc; b += inc;
      if (c == st)   /* wrapped the ring: flip beta <-> 1/beta */
      {
         c = C; a = A; b = B;
         if (bet == beta) bet = betinv;
         else bet = beta;
      }
   }
   t1 = time00() - t0;
   t1 /= nreps;
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else  /* flop rates actually 8MNK+12MN & 2MNK + 2MN, resp */
      mflop = mops / t1;
   /* form is a compile-time-selected format string, not user input */
   printf(form, itst, TA, TB, M, N, K, MALPH, MBETA, t1, mflop, 1.0, "---");
#ifdef DEBUG
   printmat("C", M, N, C, ldc);
#endif
/*
 * Regenerate the operands and time the GEMM under test the same way
 */
   matgen(inc, nmat, C, inc, M*N);
   t0 = time00();
   for (k=nreps; k; k--)
   {
      test_gemm(TAc, TBc, M, N, K, alpha, a, lda, b, ldb, bet, c, ldc);
      c += inc; a += inc; b += inc;
      if (c == st)
      {
         c = C; a = A; b = B;
         if (bet == beta) bet = betinv;
         else bet = beta;
      }
   }
   t2 = time00() - t0;
   t2 /= nreps;
   if (t2 <= 0.0) t2 = mflop = 0.0;
   else mflop = mops / t2;
   pc = "---";
   /* t3 = speedup of test over trusted */
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, itst++, TA, TB, M, N, K, MALPH, MBETA, t2, mflop, t3, pc);
   free(vp);
   return(1);
}
static int ATL_trmvLT ( const enum ATLAS_DIAG Diag, const int nb, ATL_CINT N, const TYPE *A, ATL_CINT lda, TYPE *X, ATL_CINT incX ) /* * RETURNS: 0 if TRMV was performed, non-zero if nothing done */ { static void (*trmvK)(ATL_CINT, const TYPE*, ATL_CINT, const TYPE*, TYPE*); void (*gemv)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT, const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT); void *vp; TYPE *x, *y; const size_t opsize = (N*N+N+N)*sizeof(TYPE)SHIFT; size_t t0; #ifdef TCPLX size_t N2=N+N, lda2 = lda+lda; TYPE one[2] = {ATL_rone, ATL_rzero}; #else #define N2 N #define lda2 lda #define one ATL_rone #endif const size_t incA = ((size_t)lda+1)*(nb SHIFT); ATL_CINT Nnb = ((N-1)/nb)*nb, Nr = N-Nnb; ATL_INT j; if (N < nb+nb) return(1); if (opsize > MY_CE) gemv = Mjoin(PATL,gemvT); else gemv = (opsize <= ATL_MulBySize(ATL_L1elts)) ? Mjoin(PATL,gemvT_L1) : Mjoin(PATL,gemvT_L2); trmvK = (Diag == AtlasNonUnit) ? ATL_trmvLTNk : ATL_trmvLTUk; /* * If X is aligned to Cachelen wt inc=1, use it as y */ t0 = (size_t) X; if (incX == 1 && (ATL_MulByCachelen(ATL_DivByCachelen(t0)) == t0)) { ATL_INT i; vp = malloc(ATL_Cachelen+ATL_MulBySize(N)); if (!vp) return(2); x = ATL_AlignPtr(vp); y = X; for (i=0; i < N2; i++) { x[i] = X[i]; X[i] = ATL_rzero; } } else /* allocate both X and Y */ { vp = malloc((ATL_Cachelen+ATL_MulBySize(N))<<1); if (!vp) return(3); x = ATL_AlignPtr(vp); y = x + N2; y = ATL_AlignPtr(y); Mjoin(PATL,copy)(N, X, incX, x, 1); Mjoin(PATL,zero)(N, y, 1); } for (j=0; j < Nnb; j += nb, A += incA) { #ifdef TCPLX const register size_t j2=j+j, nb2=nb+nb; #else #define j2 j #define nb2 nb #endif trmvK(nb, A, lda, x+j2, y+j2); gemv(N-j-nb, nb, one, A+nb2, lda, x+j2+nb2, 1, one, y+j2, 1); #ifndef TCPLX #undef j2 #undef nb2 #endif } #ifdef TCPLX j += j; #endif trmvK(Nr, A, lda, x+j, y+j); if (y != X) Mjoin(PATL,copy)(N, y, 1, X, incX); free(vp); return(0); }