static void geinv (const enum CBLAS_ORDER Order, const int N, TYPE *A, const int lda) { int *ipiv; TYPE *wrk; int lwrk; ipiv = malloc(sizeof(int)*N); ATL_assert(ipiv); #ifdef TimeF77 lwrk = N * Mjoin(PATL,GetNB)(); wrk = malloc(ATL_MulBySize(lwrk)); if (Order == AtlasRowMajor) Mjoin(PATL,tstsqtran)(N, A, lda); ATL_assert(Mjoin(PATL,f77getrf)(AtlasColMajor, N, N, A, lda, ipiv) == 0); ATL_assert(Mjoin(PATL,f77getri) (AtlasColMajor, N, A, lda, ipiv, wrk, &lwrk) == 0); if (Order == AtlasRowMajor) Mjoin(PATL,tstsqtran)(N, A, lda); free(wrk); #elif defined(TimeC) ATL_assert(Mjoin(CLP,getrf)(Order, N, N, A, lda, ipiv) == 0); ATL_assert(Mjoin(CLP,getri)(Order, N, A, lda, ipiv) == 0); #else lwrk = N * Mjoin(PATL,GetNB)(); wrk = malloc(ATL_MulBySize(lwrk)); ATL_assert(Mjoin(PATL,getrf)(Order, N, N, A, lda, ipiv) == 0); ATL_assert(Mjoin(PATL,getri)(Order, N, A, lda, ipiv, wrk, &lwrk) == 0); free(wrk); #endif free(ipiv); }
static TYPE *DupMat(enum ATLAS_ORDER Order, int M, int N, TYPE *A, int lda, int ldc) /* * returns a duplicate of the A matrix, with new leading dimension */ { int i, j, M2; const int ldc2 = (ldc SHIFT), lda2 = (lda SHIFT); TYPE *C; if (Order == CblasRowMajor) { i = M; M = N; N = i; } M2 = M SHIFT; ATL_assert(ldc >= M); C = malloc(ATL_MulBySize(ldc)*N); ATL_assert(C); #if defined(ATL_USEPTHREADS) && !defined(ATL_NONUMATOUCH) ATL_NumaTouchSpread(ATL_MulBySize(ldc)*N, C); #endif for (j=0; j != N; j++) { for (i=0; i != M2; i++) C[i] = A[i]; C += ldc2; A += lda2; } return(C-N*ldc2); }
void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda) { int info = 2000; void *vx, *vy; float *x0, *y0; const float *x=X, *y=Y, *alp=alpha; const float one[2]={ATL_rone, ATL_rzero}; #ifndef NoCblasErrorChecks if (Order != CblasColMajor && Order != CblasRowMajor) info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d", CblasRowMajor, CblasColMajor, Order); if (Uplo != CblasUpper && Uplo != CblasLower) info = cblas_errprn(2, info, "UPLO must be %d or %d, but is set to %d", CblasUpper, CblasLower, Uplo); if (N < 0) info = cblas_errprn(3, info, "N cannot be less than zero; is set to %d.", N); if (!incX) info = cblas_errprn(6, info, "incX cannot be zero; is set to %d.", incX); if (!incY) info = cblas_errprn(8, info, "incY cannot be zero; is set to %d.", incY); if (lda < N || lda < 1) info = cblas_errprn(10, info, "lda must be >= MAX(N,1): lda=%d N=%d", lda, N); if (info != 2000) { cblas_xerbla(info, "cblas_cher2", ""); return; } #endif if (incX < 0) x += (1-N)*incX<<1; if (incY < 0) y += (1-N)*incY<<1; if (Order == CblasColMajor) ATL_cher2(Uplo, N, alpha, x, incX, y, incY, A, lda); else if (alp[0] != ATL_rzero || alp[1] != ATL_rzero) { vx = malloc(ATL_Cachelen + ATL_MulBySize(N)); vy = malloc(ATL_Cachelen + ATL_MulBySize(N)); ATL_assert(vx != NULL && vy != NULL); x0 = ATL_AlignPtr(vx); y0 = ATL_AlignPtr(vy); ATL_cmoveConj(N, alpha, y, incY, y0, 1); ATL_ccopyConj(N, x, incX, x0, 1); ATL_cher2(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N, one, y0, 1, x0, 1, A, lda); free(vx); free(vy); } else ATL_cher2(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N, alpha, y, incY, x, incX, A, lda); }
static TYPE uumtest(enum ATLAS_ORDER Order, enum ATLAS_UPLO Uplo, int CacheSize, int N, int lda, double *tim) { TYPE *A, *Ag, *LmLt; double t0, t1; TYPE normA, eps, resid; enum ATLAS_UPLO MyUplo = Uplo; if (Order == CblasRowMajor) { if (Uplo == CblasUpper) MyUplo = CblasLower; else MyUplo = CblasUpper; } eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*N + ATL_MulBySize(N)*N); if (A == NULL) return(-1); Ag = A + lda*(N SHIFT); t0 = ATL_flushcache(CacheSize); lltgen(MyUplo, N, A, lda, N*1029+lda); lltgen(MyUplo, N, Ag, N, N*1029+lda); normA = lltnrm1(MyUplo, N, A, lda); #ifdef DEBUG Mjoin(PATL,geprint)("A", N, N, A, lda); Mjoin(PATL,geprint)("Ag", N, N, Ag, N); #endif t0 = ATL_flushcache(-1); t0 = time00(); test_lauum(Order, Uplo, N, A, lda); t1 = time00() - t0; *tim = t1; t0 = ATL_flushcache(0); ATL_checkpad(MyUplo, N, A, lda); if (Uplo == CblasUpper) LmLt = ATL_UmulUt(Order, N, Ag, N); else LmLt = ATL_LtmulL(Order, N, Ag, N); #ifdef DEBUG Mjoin(PATL,geprint)("A", N, N, A, lda); Mjoin(PATL,geprint)("Ag", N, N, LmLt, N); #endif lltdiff(MyUplo, N, A, lda, LmLt, N); #ifdef DEBUG Mjoin(PATL,geprint)("A-L*Lt", N, N, LmLt, N); #endif resid = lltnrm1(MyUplo, N, LmLt, N) / (normA * eps * N); if (resid > 10.0 || resid != resid) fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid); free(LmLt); free(A); return(resid); }
TYPE *GetGE(int M, int N, int lda) { TYPE *A; A = malloc(ATL_MulBySize(lda)*N); if (A) { #if defined(ATL_USEPTHREADS) && !defined(ATL_NONUMATOUCH) ATL_NumaTouchSpread(ATL_MulBySize(lda)*N, A); #endif Mjoin(PATL,gegen)(M, N, A, lda, M*N+lda); } return(A); }
void Mjoin(Mjoin(Mjoin(PATL,syrk),UploNM),T) (const int N, const int K, const void *valpha, const void *A, const int lda, const void *vbeta, void *C, const int ldc) { void *vc; TYPE *c; #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR beta =*( (const SCALAR *)vbeta ); const SCALAR one=1.0, zero=0.0; #else #define alpha valpha const TYPE *beta=vbeta; const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0}; #endif if (K > SYRK_Xover) { vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N); ATL_assert(vc); c = ATL_AlignPtr(vc); CgemmTN(N, N, K, alpha, A, lda, A, lda, zero, c, N); if ( SCALAR_IS_ONE(beta) ) Mjoin(syr_put,_b1)(N, c, beta, C, ldc); else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr_put,_b0)(N, c, beta, C, ldc); #ifdef TCPLX else if ( SCALAR_IS_NONE(beta) ) Mjoin(syr_put,_bn1)(N, c, beta, C, ldc); else if (beta[1] == *zero) Mjoin(syr_put,_bXi0)(N, c, beta, C, ldc); #endif else Mjoin(syr_put,_bX)(N, c, beta, C, ldc); free(vc); } else Mjoin(PATL,refsyrk)(Uplo_, AtlasTrans, N, K, alpha, A, lda, beta, C, ldc); }
TYPE *GetGE(int M, int N, int lda) { TYPE *A; A = malloc(ATL_MulBySize(lda)*N); if (A) Mjoin(PATL,gegen)(M, N, A, lda, M*N+lda); return(A); }
void Mjoin(Mjoin(PATL,trmmL),ATLP) (const int M, const int N, const void *valpha, const void *A, const int lda, void *C, const int ldc) { #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR one=1.0, zero=0.0; #else const TYPE zero[2]={0.0,0.0}; #define alpha valpha #endif void *va; TYPE *a; if (N > TRMM_Xover) { va = malloc(ATL_Cachelen + ATL_MulBySize(M)*M); ATL_assert(va); a = ATL_AlignPtr(va); #ifdef TREAL if ( SCALAR_IS_ONE(alpha) ) Mjoin(ATL_trcopy,_a1)(M, alpha, A, lda, a); else Mjoin(ATL_trcopy,_aX)(M, alpha, A, lda, a); CAgemmTN(M, N, M, one, a, M, C, ldc, zero, C, ldc); #else ATL_trcopy(M, A, lda, a); CAgemmTN(M, N, M, valpha, a, M, C, ldc, zero, C, ldc); #endif free(va); } else Mjoin(PATL,reftrmm)(AtlasLeft, Uplo_, Trans_, Unit_, M, N, alpha, A, lda, C, ldc); }
int Mjoin(PATL,her2kLN) #endif #endif (const int N, const int K, const void *valpha, const void *A, const int lda, const void *B, const int ldb, const void *vbeta, void *C, const int ldc) { int i; void *vc=NULL; TYPE *c; const TYPE beta =*( (const TYPE *)vbeta ); const TYPE zero[2]={0.0, 0.0}; i = ATL_MulBySize(N)*N; if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i); if (vc == NULL) return(1); c = ATL_AlignPtr(vc); #ifdef Transpose_ ATL_ammm(AtlasConjTrans, AtlasNoTrans, N, N, K, valpha, A, lda, B, ldb, #else ATL_ammm(AtlasNoTrans, AtlasConjTrans, N, N, K, valpha, A, lda, B, ldb, #endif zero, c, N); if ( beta == 1.0 ) Mjoin(her2k_put,_b1)(N, c, vbeta, C, ldc); else if ( beta == 0.0 ) Mjoin(her2k_put,_b0)(N, c, vbeta, C, ldc); else Mjoin(her2k_put,_bXi0)(N, c, vbeta, C, ldc); free(vc); return(0); }
void Mjoin(Mjoin(PATL,symmL),UploNM) (const int M, const int N, const void *valpha, const void *A, const int lda, const void *B, const int ldb, const void *vbeta, void *C, const int ldc) { #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR beta =*( (const SCALAR *)vbeta ); const SCALAR one=1.0; #else #define alpha valpha #define beta vbeta #endif TYPE *a; void *va; if (N > SYMM_Xover) { va = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M)); ATL_assert(va); a = ATL_AlignPtr(va); #ifdef TREAL if ( SCALAR_IS_ONE(alpha) ) Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(M, alpha, A, lda, a); else Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(M, alpha, A, lda, a); CgemmTN(M, N, M, one, a, M, B, ldb, beta, C, ldc); #else Mjoin(Mjoin(PATL,sycopy),UploNM)(M, A, lda, a); CgemmTN(M, N, M, valpha, a, M, B, ldb, vbeta, C, ldc); #endif free(va); } else Mjoin(PATL,refsymm)(AtlasLeft, Uplo_, M, N, alpha, A, lda, B, ldb, beta, C, ldc); }
void Mjoin(Mjoin(PATL,symmR),UploNM) (const int M, const int N, const void *valpha, const void *A, const int lda, const void *B, const int ldb, const void *vbeta, void *C, const int ldc) { #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR beta =*( (const SCALAR *)vbeta ); const SCALAR one=1.0; #else #define alpha valpha #define beta vbeta #endif void *va; TYPE *a; if (M > SYMM_Xover) { va = malloc(ATL_Cachelen + ATL_MulBySize(N)*N); ATL_assert(va); a = ATL_AlignPtr(va); #ifdef TREAL if ( SCALAR_IS_ONE(alpha) ) Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_a1)(N, alpha, A, lda, a); else Mjoin(Mjoin(Mjoin(PATL,sycopy),UploNM),_aX)(N, alpha, A, lda, a); ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, one, B, ldb, a, N, beta, C, ldc); #else Mjoin(Mjoin(PATL,sycopy),UploNM)(N, A, lda, a); ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, N, valpha, B, ldb, a, N, vbeta, C, ldc); #endif free(va); } else Mjoin(PATL,refsymm)(AtlasRight, Uplo_, M, N, alpha, A, lda, B, ldb, beta, C, ldc); }
static TYPE geresid(enum CBLAS_ORDER Order, int N, TYPE *A, int lda, TYPE *AI, int ldi) /* * returns ||A - AI|| / (N * eps * ||A|| * ||AI||); * for row-major, we are not using 1-norm, since we are adding rows instead * of cols, but it should be an equally good norm, so don't worry about it. */ { TYPE numer, denom, eps; const int ldcp1 = (N+1)SHIFT; TYPE *C; int i; #ifdef TREAL TYPE one = ATL_rone, zero = ATL_rzero; #else TYPE one[2] = {ATL_rone, ATL_rzero}, zero[2] = {ATL_rzero, ATL_rzero}; #endif eps = Mjoin(PATL,epsilon)(); C = malloc(N*ATL_MulBySize(N)); ATL_assert(C); cblas_gemm(Order, CblasNoTrans, CblasNoTrans, N, N, N, one, A, lda, AI, ldi, zero, C, N); /* C now has A*inv(A) */ for (i=0; i != N; i++) C[i*ldcp1] -= ATL_rone; /* C now has A*inv(A)-I */ numer = Mjoin(PATL,genrm1)(N, N, C, N); denom = Mjoin(PATL,genrm1)(N, N, A, lda) * Mjoin(PATL,genrm1)(N, N, AI, ldi) * N * eps; free(C); return(numer/denom); }
static TYPE *ATL_LmulLt(const int N, const TYPE *L, const int ldl) /* * A = L * L^H */ { const int incA = 1 SHIFT, incL = (ldl+1) SHIFT; TYPE *A; int i, j; #ifdef TCPLX int i1, i2; TYPE tmp; #endif A = malloc(N*ATL_MulBySize(N)); ATL_assert(A); for (j=0; j < N; j++) { for (i=j; i < N; i++) { #ifdef TREAL A[i+j*N] = L[i+j*ldl] * L[j+j*ldl] + Mjoin(PATL,dot)(j, L+i, ldl, L+j, ldl); #else tmp = L[(j+j*ldl)<<1]; i1 = (i + j * N)<<1; i2 = (i + j * ldl)<<1; Mjoin(PATL,dotc_sub)(j, L+(j<<1), ldl, L+(i<<1), ldl, A+i1); A[i1] += L[i2] * tmp; if (i != j) A[i1+1] += tmp * L[i2+1]; #endif } } return(A); }
int Mjoin(PATL,syr2kLT) #endif (const int N, const int K, const void *valpha, const void *A, const int lda, const void *B, const int ldb, const void *vbeta, void *C, const int ldc) { int i; void *vc=NULL; TYPE *c; #ifdef TREAL const SCALAR alpha=*( (const SCALAR *)valpha ); const SCALAR beta =*( (const SCALAR *)vbeta ); const SCALAR one=1.0, zero=0.0; #else #define alpha valpha const TYPE *beta=vbeta; const TYPE one[2]={1.0,0.0}, zero[2]={0.0,0.0}; #endif i = ATL_MulBySize(N)*N; if (i <= ATL_MaxMalloc) vc = malloc(ATL_Cachelen+i); if (vc == NULL) return(1); c = ATL_AlignPtr(vc); CgemmTN(N, N, K, alpha, A, lda, B, ldb, zero, c, N); if ( SCALAR_IS_ONE(beta) ) Mjoin(syr2k_put,_b1)(N, c, beta, C, ldc); else if ( SCALAR_IS_ZERO(beta) ) Mjoin(syr2k_put,_b0)(N, c, beta, C, ldc); #ifdef TCPLX else if (SCALAR_IS_NONE(beta)) Mjoin(syr2k_put,_bn1)(N, c, beta, C, ldc); else if (beta[1] == *zero) Mjoin(syr2k_put,_bXi0)(N, c, beta, C, ldc); #endif else Mjoin(syr2k_put,_bX)(N, c, beta, C, ldc); free(vc); return(0); }
void Mjoin(Mjoin(Mjoin(PATL,herk),UploNM),N) (const int N, const int K, const void *valpha, const void *A, const int lda, const void *vbeta, void *C, const int ldc) { void *vc; TYPE *c; TYPE alpha[2]; const TYPE beta = *( (const TYPE *)vbeta ); const TYPE zero[2] = {0.0, 0.0}; alpha[0] = *( (const TYPE *)valpha ); if (K > HERK_Xover) { alpha[1] = 0.0; vc = malloc(ATL_Cachelen+ATL_MulBySize(N)*N); ATL_assert(vc); c = ATL_AlignPtr(vc); CgemmNC(N, N, K, alpha, A, lda, A, lda, zero, c, N); if ( beta == 1.0 ) Mjoin(her_put,_b1)(N, c, vbeta, C, ldc); else if ( beta == 0.0 ) Mjoin(her_put,_b0)(N, c, vbeta, C, ldc); else Mjoin(her_put,_bXi0)(N, c, vbeta, C, ldc); free(vc); } else Mjoin(PATL,refherk)(Uplo_, AtlasNoTrans, N, K, *alpha, A, lda, beta, C, ldc); }
static double RunTiming (enum CBLAS_ORDER Order, enum TEST_UPLO Uplo, int N, int lda, int CacheSize, int nreps) { TYPE *A, *a; const int incA = N*lda; int i, k; double t0, t1=0.0; if (nreps < 1) nreps = 1; i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL; k = i = (i + N*N-1) / (N*N); if (nreps > i) k = i = nreps; a = A = malloc(i * ATL_MulBySize(incA)); if (A) { if (Uplo == TestGE) for (i=0; i < k; i++) Mjoin(PATL,gegen)(N, N, A+i*incA, lda, N+lda); else for (i=0; i < k; i++) hegen(Order, Uplo, N, A+i*incA, lda); t0 = time00(); for (i=nreps; i; i--, a += incA) test_inv(Order, Uplo, N, a, lda); t1 = time00() - t0; free(A); } else fprintf(stderr, " WARNING: not enough mem to run timings!\n"); return(t1/nreps); }
int ATL_getrfC(const int M, const int N, TYPE *A, const int lda, int *ipiv) /* * Column-major factorization of form * A = P * L * U * where P is a row-permutation matrix, L is lower triangular with unit diagonal * elements (lower trapazoidal if M > N), and U is upper triangular (upper * trapazoidal if M < N). This is the recursive Level 3 BLAS version. */ { const int MN = Mmin(M, N); int Nleft, Nright, k, i, ierr=0; #ifdef TCPLX const TYPE one[2] = {ATL_rone, ATL_rzero}; const TYPE none[2] = {ATL_rnone, ATL_rzero}; TYPE inv[2], tmp[2]; #else #define one ATL_rone #define none ATL_rnone TYPE tmp; #endif TYPE *Ac, *An; if (((size_t)M)*N <= ATL_L1elts) return(Mjoin(PATL,getf2)(M, N, A, lda, ipiv)); #if defined(ATL_USEPTHREADS) && defined(ATL_USEPCA) if (N <= (NB<<2) && N >= 16 && M-N >= ATL_PCAMin && ((size_t)ATL_MulBySize(M)*N) <= CacheEdge*ATL_NTHREADS) { if (N >= 16) ierr = Mjoin(PATL,tgetf2)(M, N, A, lda, ipiv); else ierr = Mjoin(PATL,tgetf2_nocp)(M, N, A, lda, ipiv); return(ierr); } #endif if (MN > ATL_luMmin) { Nleft = MN >> 1; #ifdef NB if (Nleft > NB) Nleft = ATL_MulByNB(ATL_DivByNB(Nleft)); #endif Nright = N - Nleft; i = ATL_getrfC(M, Nleft, A, lda, ipiv); /* factor left to L & U */ if (i) if (!ierr) ierr = i; /* * Update trailing submatrix */ Ac = A + (Nleft * lda SHIFT); An = Ac + (Nleft SHIFT); ATL_laswp(Nright, Ac, lda, 0, Nleft, ipiv, 1); cblas_trsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, Nleft, Nright, one, A, lda, Ac, lda); cblas_gemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M-Nleft, Nright, Nleft, none, A+(Nleft SHIFT), lda, Ac, lda, one, An, lda); i = ATL_getrfC(M-Nleft, Nright, An, lda, ipiv+Nleft); if (i) if (!ierr) ierr = i + Nleft; for (i=Nleft; i != MN; i++) ipiv[i] += Nleft; ATL_laswp(Nleft, A, lda, Nleft, MN, ipiv, 1); }
void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N, const void *alpha, const void *X, const int incX, const void *Y, const int incY, void *A, const int lda) { int info = 2000; const double *x = X, *y = Y; void *vy; double *y0; double one[2] = {ATL_rone, ATL_rzero}; #ifndef NoCblasErrorChecks if (M < 0) info = cblas_errprn(2, info, "M cannot be less than zero; is set to %d.", M); if (N < 0) info = cblas_errprn(3, info, "N cannot be less than zero; is set to %d.", N); if (!incX) info = cblas_errprn(6, info, "incX cannot be zero; is set to %d.", incX); if (!incY) info = cblas_errprn(8, info, "incY cannot be zero; is set to %d.", incY); if (Order == CblasColMajor) { if (lda < M || lda < 1) info = cblas_errprn(10, info, "lda must be >= MAX(M,1): lda=%d M=%d", lda, M); } else if (Order == CblasRowMajor) { if (lda < N || lda < 1) info = cblas_errprn(10, info, "lda must be >= MAX(N,1): lda=%d M=%d", lda, N); } else info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d", CblasRowMajor, CblasColMajor, Order); if (info != 2000) { cblas_xerbla(info, "cblas_zgerc", ""); return; } #endif if (incX < 0) x += (1-M)*incX<<1; if (incY < 0) y += (1-N)*incY<<1; if (Order == CblasColMajor) ATL_zgerc(M, N, alpha, x, incX, y, incY, A, lda); else { vy = malloc(ATL_Cachelen + ATL_MulBySize(N)); ATL_assert(vy); y0 = ATL_AlignPtr(vy); ATL_zmoveConj(N, alpha, y, incY, y0, 1); ATL_zgeru(N, M, one, y0, 1, x, incX, A, lda); free(vy); } }
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order, enum ATLAS_UPLO Uplo, int N, int lda) { char *Ups, *Ord; TYPE resid = 0.0; double mflop, mflops, t0, tim=0.0; int nreps=1, passed, i, imem; const int incA = lda*N; TYPE *a, *A; mflops = N; mflops = (mflops*mflops*mflops) / 4.0; #ifdef TCPLX mflops *= 4.0; #endif mflops /= 1000000.0; if (thresh > ATL_rzero) resid = uumtest(Order, Uplo, CacheSize, N, lda, &tim); else resid = -1.0; if (MFLOP > mflops || thresh <= ATL_rzero) /* need to time repetitively */ { nreps = (mflops * 1000000); nreps = (MFLOP*1000000 + nreps-1) / nreps; if (nreps < 1) nreps = 1; imem = ATL_DivBySize(CacheSize) ATL_PTCACHEMUL; imem = (imem + 2*N*N-1) / (N*N); if (imem < nreps) imem = nreps; a = A = malloc(imem * ATL_MulBySize(incA)); if (A != NULL) { for (i=0; i < imem; i++) lltgen(Uplo, N, A+i*incA, lda, N*1029+lda); t0 = time00(); for (i=nreps; i; i--, a += incA) test_lauum(Order, Uplo, N, a, lda); tim = time00() - t0; tim /= nreps; free(A); } else fprintf(stderr, " WARNING: not enough mem to run timings!\n"); } if (tim > 0.0) mflop = mflops / tim; else mflop = 0.0; if (Uplo == AtlasUpper) Ups = "Upper"; else Ups = "Lower"; if (Order == CblasColMajor) Ord = "Col"; else Ord = "Row"; fprintf(stdout, "%5d %3s %5s %6d %6d %12.5f %12.3f %12e\n", nreps, Ord, Ups, N, lda, tim, mflop, resid); if (resid > thresh || resid != resid) passed = 0; else if (resid < 0.0) passed = -1; else passed = 1; return(passed); }
static TYPE lutestR(int CacheSize, int M, int N, int lda, int *npiv, double *tim) { TYPE *A, *LmU; int *ipiv; const int MN = Mmin(M,N); int i; double t0, t1; TYPE normA, eps, resid; eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*M); if (A == NULL) return(-1); ipiv = malloc( MN * sizeof(int) ); if (ipiv == NULL) { free(A); return(-1); } t0 = ATL_flushcache(CacheSize); Mjoin(PATL,gegen)(N, M, A, lda, M*N+lda); #ifdef DEBUG Mjoin(PATL,geprint)("A0", N, M, A, lda); #endif normA = Mjoin(PATL,genrm1)(N, M, A, lda); /* actually infnrm, but OK */ t0 = ATL_flushcache(-1); t0 = time00(); test_getrf(CblasRowMajor, M, N, A, lda, ipiv); t1 = time00() - t0; *tim = t1; t0 = ATL_flushcache(0); #ifdef DEBUG Mjoin(PATL,geprint)("LU", N, M, A, lda); #endif LmU = ATL_LmulUR(M, N, A, lda); /* LmU contains L * U */ #ifdef DEBUG Mjoin(PATL,geprint)("L*U", N, M, LmU, N); #endif Mjoin(PATL,gegen)(N, M, A, lda, M*N+lda); /* regenerate A, overwriting LU */ ATL_laswp(M, A, lda, 0, MN, ipiv, 1); /* apply swaps to A */ resid = Mjoin(PATL,gediffnrm1)(N, M, A, lda, LmU, N); resid /= (normA * eps * Mmin(M,N)); *npiv = findnpvt(MN, ipiv); free(LmU); free(A); free(ipiv); return(resid); }
static TYPE llttest(enum ATLAS_UPLO Uplo, int CacheSize, int N, int lda, double *tim) { TYPE *A, *LmLt; int i; double t0, t1; TYPE normA, eps, resid; eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*N); if (A == NULL) return(-1); t0 = ATL_flushcache(CacheSize); lltgen(Uplo, N, A, lda, N*1029+lda); normA = lltnrm1(Uplo, N, A, lda); #ifdef DEBUG Mjoin(PATL,geprint)("A0", N, N, A, lda); #endif t0 = ATL_flushcache(-1); t0 = time00(); test_potrf(Uplo, N, A, lda); t1 = time00() - t0; *tim = t1; t0 = ATL_flushcache(0); #ifdef DEBUG Mjoin(PATL,geprint)("L", N, N, A, lda); #endif ATL_checkpad(Uplo, N, A, lda); if (Uplo == AtlasUpper) LmLt = ATL_UtmulU(N, A, lda); else LmLt = ATL_LmulLt(N, A, lda); #ifdef DEBUG Mjoin(PATL,geprint)("L*Lt", N, N, LmLt, N); #endif lltgen(Uplo, N, A, lda, N*1029+lda); /* regen A over LLt */ lltdiff(Uplo, N, A, lda, LmLt, N); #ifdef DEBUG Mjoin(PATL,geprint)("A-L*Lt", N, N, LmLt, N); #endif resid = lltnrm1(Uplo, N, LmLt, N); #ifdef DEBUG if (resid/(normA*eps*N) > 10.0) fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid); #endif resid /= (normA * eps * N); free(LmLt); free(A); return(resid); }
static TYPE *ATL_LmulUC(const int M, const int N, const TYPE *LU, const int ldl) { const int lda = ldl SHIFT, MN = Mmin(M,N); int i, j, m; TYPE *C, *c; #ifdef TREAL const TYPE ONE=ATL_rone; #else const TYPE ONE[2] = {ATL_rone, ATL_rzero}; #endif C = c = malloc(M*ATL_MulBySize(N)); ATL_assert(c); if (M >= N) { for (j=0; j < MN; j++) { m = j SHIFT; for (i=0; i < m; i++) c[i] = ATL_rzero; #ifdef TCPLX c[i++] = ATL_rone; c[i++] = ATL_rzero; #else c[i++] = ATL_rone; #endif for (m=M SHIFT; i < m; i++) c[i] = LU[i]; c += m; LU += lda; } LU -= MN * lda; for (m=M SHIFT; j < N; j++, c += m) Mjoin(PATL,zero)(M, c, 1); cblas_trmm(CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, M, N, ONE, LU, ldl, C, M); } else /* M < N */ { for (j=0; j < M; j++) { m = (j+1) SHIFT; for (i=0; i < m; i++) c[i] = LU[i]; for (m=M SHIFT; i < m; i++) c[i] = ATL_rzero; c += m; LU += lda; } Mjoin(PATL,gecopy)(M, N-M, LU, ldl, c, M); LU -= M * lda; cblas_trmm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, M, N, ONE, LU, ldl, C, M); } return(C); }
void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const void *X, const int incX, void *A, const int lda) { int info = 2000; void *vx; double one[2] = {ATL_rone, ATL_rzero}; double *x0; const double *x=X; #ifndef NoCblasErrorChecks if (Order != CblasColMajor && Order != CblasRowMajor) info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d", CblasRowMajor, CblasColMajor, Order); if (Uplo != CblasUpper && Uplo != CblasLower) info = cblas_errprn(2, info, "UPLO must be %d or %d, but is set to %d", CblasUpper, CblasLower, Uplo); if (N < 0) info = cblas_errprn(3, info, "N cannot be less than zero; is set to %d.", N); if (!incX) info = cblas_errprn(6, info, "incX cannot be zero; is set to %d.", incX); if (lda < N || lda < 1) info = cblas_errprn(8, info, "lda must be >= MAX(N,1): lda=%d N=%d", lda, N); if (info != 2000) { cblas_xerbla(info, "cblas_zher", ""); return; } #endif if (incX < 0) x += (1-N)*incX<<1; if (Order == CblasColMajor) ATL_zher(Uplo, N, alpha, x, incX, A, lda); else if (alpha != ATL_rzero) { vx = malloc(ATL_Cachelen + ATL_MulBySize(N)); ATL_assert(vx); x0 = ATL_AlignPtr(vx); ATL_zmoveConj(N, one, x, incX, x0, 1); ATL_zher(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N, alpha, x0, 1, A, lda); free(vx); } else ATL_zher(( (Uplo == CblasUpper) ? CblasLower : CblasUpper ), N, ATL_rzero, x, incX, A, lda); }
static TYPE *ATL_LtmulL (const enum CBLAS_ORDER Order, const int N, const TYPE *L, const int ldl) { TYPE *C; #ifdef TREAL const TYPE one=ATL_rone, zero=ATL_rzero; #else const TYPE one[2] = {ATL_rone,ATL_rzero}, zero[2] = {ATL_rzero,ATL_rzero}; #endif C = malloc(N*ATL_MulBySize(N)); ATL_assert(C); ATL_L2GE(Order, N, L, ldl, C, N); cblas_trmm(Order, CblasLeft, CblasLower, CblasConjTrans, CblasNonUnit, N, N, one, L, ldl, C, N); return(C); }
static TYPE *ATL_UmulUt (const enum CBLAS_ORDER Order, const int N, const TYPE *U, const int ldu) { TYPE *C; #ifdef TREAL const TYPE one=ATL_rone, zero=ATL_rzero; #else const TYPE one[2] = {ATL_rone,ATL_rzero}, zero[2] = {ATL_rzero,ATL_rzero}; #endif C = malloc(N*ATL_MulBySize(N)); ATL_assert(C); ATL_U2GE(Order, N, U, ldu, C, N); cblas_trmm(Order, CblasRight, CblasUpper, CblasConjTrans, CblasNonUnit, N, N, one, U, ldu, C, N); return(C); }
static TYPE *ATL_LmulUR(const int M, const int N, const TYPE *LU, const int ldl) { const int lda = ldl SHIFT, ldc = N SHIFT, MN = Mmin(M,N); int i, j, m; TYPE *C, *c; #ifdef TREAL const TYPE ONE=ATL_rone; #else const TYPE ONE[2] = {ATL_rone, ATL_rzero}; #endif C = c = malloc(M*ATL_MulBySize(N)); ATL_assert(c); if (M >= N) { for (i=0; i != N; i++, LU += lda, C += ldc) { Mjoin(PATL,copy)(i+1, LU, 1, C, 1); Mjoin(PATL,zero)(N-i-1, C+((i+1)SHIFT), 1); } for(; i != M; i++, LU += lda, C += ldc) Mjoin(PATL,copy)(N, LU, 1, C, 1); LU -= lda * M; C -= ldc * M; cblas_trmm(CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasUnit, M, N, ONE, LU, ldl, C, N); } else /* N > M */ { for (i=0; i != M; i++, C += ldc, LU += lda) { Mjoin(PATL,zero)(i, C, 1); C[i SHIFT] = ATL_rone; #ifdef TCPLX C[(i SHIFT)+1] = ATL_rzero; #endif Mjoin(PATL,copy)(N-i-1, LU+((i+1)SHIFT), 1, C+((i+1)SHIFT), 1); } LU -= lda * M; C -= ldc * M; cblas_trmm(CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, M, N, ONE, LU, ldl, C, N); } return(C); }
int Mjoin(PC2F,gels)(const enum CBLAS_TRANSPOSE TA, ATL_CINT M, ATL_CINT N, ATL_CINT NRHS, TYPE *A, ATL_CINT lda, TYPE *B, ATL_CINT ldb) { TYPE work[2]; TYPE *wrk; ATL_INT lwrk; int iret; /* * Query routine for optimal workspace, allocate it, and call routine with it */ ATL_assert(!Mjoin(PC2F,gels_wrk)(TA, M, N, NRHS, A, lda, B, ldb, work, -1)); lwrk = work[0]; wrk = malloc(ATL_MulBySize(lwrk)); ATL_assert(wrk); iret = Mjoin(PC2F,gels_wrk)(TA, M, N, NRHS, A, lda, B, ldb, wrk, lwrk); free(wrk); return(iret); }
static TYPE poresid(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, int N, TYPE *A, int lda, TYPE *AI, int ldi) /* * returns ||A - AI|| / (N * eps * ||A|| * ||AI||); */ { enum CBLAS_UPLO uplo=Uplo; TYPE numer, denom, eps; const int ldcp1 = (N+1)SHIFT; int i; #ifdef TREAL TYPE one = ATL_rone, zero = ATL_rzero; #else TYPE one[2] = {ATL_rone, ATL_rzero}, zero[2] = {ATL_rzero, ATL_rzero}; #endif TYPE *C, *B; C = malloc(N*ATL_MulBySize(N)); ATL_assert(C); B = DupMat(Order, N, N, AI, ldi, N); ReflectHE(Order, Uplo, N, B, N); #ifdef TREAL cblas_symm(Order, CblasRight, Uplo, N, N, one, A, lda, B, N, zero, C, N); #else cblas_hemm(Order, CblasRight, Uplo, N, N, one, A, lda, B, N, zero, C, N); #endif free(B); eps = Mjoin(PATL,epsilon)(); if (Order == CblasRowMajor) uplo = (Uplo == CblasUpper) ? CblasLower : CblasUpper; for (i=0; i != N; i++) C[i*ldcp1] -= ATL_rone; /* C now has A*inv(A)-I */ numer = Mjoin(PATL,genrm1)(N, N, C, N); #ifdef TREAL denom = Mjoin(PATL,synrm)(uplo, N, A, lda) * Mjoin(PATL,synrm)(uplo, N, AI, ldi) * N * eps; #else denom = Mjoin(PATL,henrm)(uplo, N, A, lda) * Mjoin(PATL,henrm)(uplo, N, AI, ldi) * N * eps; #endif free(C); return(numer/denom); }
void Mjoin(Mjoin(PATL,hemmL),UploNM) (const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc) { TYPE *a; void *va; if (N > HEMM_Xover) { va = malloc(ATL_Cachelen + (ATL_MulBySize(M)*M)); ATL_assert(va); a = ATL_AlignPtr(va); Mjoin(Mjoin(PATL,hecopy),UploNM)(M, A, lda, a); ATL_ammm(AtlasNoTrans, AtlasNoTrans, M, N, M, alpha, a, M, B, ldb, beta, C, ldc); free(va); } else Mjoin(PATL,refhemm)(AtlasLeft, Uplo_, M, N, alpha, A, lda, B, ldb, beta, C, ldc); }
static TYPE *ATL_UtmulU(const int N, const TYPE *U, const int ldu) { TYPE *A; int i, j; #ifdef TCPLX const int ldu2 = ldu<<1; int i1, i2; TYPE tmp; #endif A = malloc(N*ATL_MulBySize(N)); ATL_assert(A); for (j=0; j < N; j++) { #ifdef TREAL for (i=0; i <= j; i++) A[i+j*N] = Mjoin(PATL,dot)(i+1, U+i*ldu, 1, U+ldu*j, 1); #else for (i=0; i <= j; i++) { i1 = (i+j*N)<<1; i2 = (i+j*ldu)<<1; tmp = U[(i+i*ldu)<<1]; Mjoin(PATL,dotc_sub)(i, U+i*ldu2, 1, U+j*ldu2, 1, A+i1); if (i != j) { A[i1] += U[i2] * tmp; A[i1+1] += U[i2+1] * tmp; } else { A[i1] += tmp * tmp; A[i1+1] += ATL_rzero; } } #endif } return(A); }