/*
 * Generates one test matrix, inverts a copy of it with test_inv while timing
 * the call, and stores the residual reported by GetResid in *res.
 *
 * Order, Uplo : storage order and matrix kind, forwarded to the helpers
 * N, lda      : matrix order and leading dimension
 * CacheSize   : forwarded to ATL_flushcache before the timed call
 * res         : OUTPUT, residual computed by GetResid(A, inverse)
 *
 * RETURNS: wall time (as measured by time00) spent inside test_inv.
 */
static double RunTest(enum CBLAS_ORDER Order, enum TEST_UPLO Uplo, int N,
                      int lda, int CacheSize, TYPE *res)
{
   TYPE *orig, *inv;
   double tstart, elapsed;

   orig = GetMat(Order, Uplo, N, lda);
#ifdef DEBUG
   Mjoin(PATL,geprint)("A0", N, N, orig, lda);
#endif
   inv = DupMat(Order, N, N, orig, lda, lda);

/*
 * Flush-cache calls bracket the timed region; their return values are unused
 */
   ATL_flushcache(CacheSize);
   ATL_flushcache(-1);

   tstart = time00();
   test_inv(Order, Uplo, N, inv, lda);   /* inv should now hold inverse(A) */
   elapsed = time00() - tstart;

   ATL_flushcache(0);
#ifdef DEBUG
   Mjoin(PATL,geprint)("A ", N, N, orig, lda);
   Mjoin(PATL,geprint)("AI", N, N, inv, lda);
#endif

   *res = GetResid(Order, Uplo, N, orig, lda, inv, lda);
   free(inv);
   free(orig);
   return(elapsed);
}
static double RunTiming (enum CBLAS_ORDER Order, enum TEST_UPLO Uplo, int N, int lda, int CacheSize, int nreps) { TYPE *A, *a; const int incA = N*lda; int i, k; double t0, t1=0.0; if (nreps < 1) nreps = 1; i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL; k = i = (i + N*N-1) / (N*N); if (nreps > i) k = i = nreps; a = A = malloc(i * ATL_MulBySize(incA)); if (A) { if (Uplo == TestGE) for (i=0; i < k; i++) Mjoin(PATL,gegen)(N, N, A+i*incA, lda, N+lda); else for (i=0; i < k; i++) hegen(Order, Uplo, N, A+i*incA, lda); t0 = time00(); for (i=nreps; i; i--, a += incA) test_inv(Order, Uplo, N, a, lda); t1 = time00() - t0; free(A); } else fprintf(stderr, " WARNING: not enough mem to run timings!\n"); return(t1/nreps); }
double GetTimeWithReps_LU (int mflopF, int lda, int M, int N, int nb, int Uplo, int Side, int flsizeKB) { double mflop, t0, t1, drep; char *wrksets; /* working sets for kernel calls */ #ifdef TCPLX const int lda2 = lda+lda; #else const int lda2 = lda; #endif size_t setsz, setszT; /* work set size in memory, and amnt of it touched */ size_t nrep; /* # of reps required to force mflopF flops */ size_t nset; /* # of working sets allocated */ int i; /* * Keep setsz a multiple of TYPE size for alignment reasons. LU only accesses * M*N of matrix and all of IPIV. */ setsz = lda*N*ATL_sizeof + ((M*sizeof(int)+ATL_sizeof-1)/ATL_sizeof)*ATL_sizeof; setszT = M*N*ATL_sizeof + M*sizeof(int); mflop = GetFlopCount(LAgetrf, 0, M, N, 0, 0, CAN_NB); /* * Cannot reuse matrices (bogus to factor an already factored matrix), so we * must take as our total memspace MAX(nrep,nset)*setsz */ ATL_assert(mflop > 0.0); drep = (mflopF*1.0e6) / mflop; nrep = (int)(drep+0.999999); /* * If cacheline flush doesn't work, then we must use this method */ #if ATL_LINEFLUSH if (nrep < 2) return(-1.0); /* do wt normal timer */ #else nrep = (nrep >= 1) ? nrep : 1; #endif nset = (flsizeKB*1024+setszT-1)/setszT; if (nset < nrep) nset = nrep; wrksets = malloc(nset * setsz); ATL_assert(wrksets); for (i=0; i < nset; i++) Mjoin(PATL,gegen)(M, N, (TYPE*)(wrksets+i*setsz), lda, M*N+lda); t0 = time00(); for (i=0; i < nrep; i++) { test_getrf(CblasColMajor, M, N, (TYPE*)(wrksets+i*setsz), lda, (int*)(wrksets+i*setsz+lda*N*ATL_sizeof)); } t1 = time00(); free(wrksets); return((t1-t0)/((double)nrep)); }
/*
 * Runs one lauum test case: optionally checks the residual (uumtest), times
 * test_lauum, and prints one result line to stdout.
 *
 * CacheSize : forwarded to uumtest and used to size the timing workspace
 * thresh    : residual threshold; if <= ATL_rzero the residual check is
 *             skipped and only timing is done
 * MFLOP     : minimum amount of work (MFLOP) to time; forces repetitions
 * Order, Uplo, N, lda : problem description
 *
 * RETURNS: 1 if the residual passed, 0 if it exceeded thresh or is NaN,
 *          -1 if no residual was computed (negative residual).
 */
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order,
            enum ATLAS_UPLO Uplo, int N, int lda)
{
   char *Ups, *Ord;
   TYPE resid = 0.0;
   double mflop, mflops, t0, tim=0.0;
   int nreps=1, passed, i, imem;
   const int incA = lda*N;
   TYPE *a, *A;

   mflops = N;
   mflops = (mflops*mflops*mflops) / 4.0;
#ifdef TCPLX
   mflops *= 4.0;
#endif
   mflops /= 1000000.0;

   if (thresh > ATL_rzero)
      resid = uumtest(Order, Uplo, CacheSize, N, lda, &tim);
   else
      resid = -1.0;

   if (MFLOP > mflops || thresh <= ATL_rzero)  /* need to time repetitively */
   {
      nreps = (mflops * 1000000);
/*
 *    For very small N the flop count truncates to 0, which would make the
 *    ceiling division below divide by zero; treat it as one flop instead.
 */
      if (nreps < 1)
         nreps = 1;
      nreps = (MFLOP*1000000 + nreps-1) / nreps;
      if (nreps < 1)
         nreps = 1;
      imem = ATL_DivBySize(CacheSize) ATL_PTCACHEMUL;
      imem = (imem + 2*N*N-1) / (N*N);
      if (imem < nreps)
         imem = nreps;
      a = A = malloc(imem * ATL_MulBySize(incA));
      if (A != NULL)
      {
         for (i=0; i < imem; i++)
            lltgen(Uplo, N, A+i*incA, lda, N*1029+lda);
         t0 = time00();
         for (i=nreps; i; i--, a += incA)
            test_lauum(Order, Uplo, N, a, lda);
         tim = time00() - t0;
         tim /= nreps;
         free(A);
      }
      else
         fprintf(stderr, " WARNING: not enough mem to run timings!\n");
   }

   if (tim > 0.0)
      mflop = mflops / tim;
   else
      mflop = 0.0;
   if (Uplo == AtlasUpper)
      Ups = "Upper";
   else
      Ups = "Lower";
   if (Order == CblasColMajor)
      Ord = "Col";
   else
      Ord = "Row";
   fprintf(stdout, "%5d %3s %5s %6d %6d %12.5f %12.3f %12e\n",
           nreps, Ord, Ups, N, lda, tim, mflop, resid);

   /* resid != resid is true only for NaN */
   if (resid > thresh || resid != resid)
      passed = 0;
   else if (resid < 0.0)
      passed = -1;
   else
      passed = 1;
   return(passed);
}
static TYPE lutestR(int CacheSize, int M, int N, int lda, int *npiv, double *tim) { TYPE *A, *LmU; int *ipiv; const int MN = Mmin(M,N); int i; double t0, t1; TYPE normA, eps, resid; eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*M); if (A == NULL) return(-1); ipiv = malloc( MN * sizeof(int) ); if (ipiv == NULL) { free(A); return(-1); } t0 = ATL_flushcache(CacheSize); Mjoin(PATL,gegen)(N, M, A, lda, M*N+lda); #ifdef DEBUG Mjoin(PATL,geprint)("A0", N, M, A, lda); #endif normA = Mjoin(PATL,genrm1)(N, M, A, lda); /* actually infnrm, but OK */ t0 = ATL_flushcache(-1); t0 = time00(); test_getrf(CblasRowMajor, M, N, A, lda, ipiv); t1 = time00() - t0; *tim = t1; t0 = ATL_flushcache(0); #ifdef DEBUG Mjoin(PATL,geprint)("LU", N, M, A, lda); #endif LmU = ATL_LmulUR(M, N, A, lda); /* LmU contains L * U */ #ifdef DEBUG Mjoin(PATL,geprint)("L*U", N, M, LmU, N); #endif Mjoin(PATL,gegen)(N, M, A, lda, M*N+lda); /* regenerate A, overwriting LU */ ATL_laswp(M, A, lda, 0, MN, ipiv, 1); /* apply swaps to A */ resid = Mjoin(PATL,gediffnrm1)(N, M, A, lda, LmU, N); resid /= (normA * eps * Mmin(M,N)); *npiv = findnpvt(MN, ipiv); free(LmU); free(A); free(ipiv); return(resid); }
static TYPE uumtest(enum ATLAS_ORDER Order, enum ATLAS_UPLO Uplo, int CacheSize, int N, int lda, double *tim) { TYPE *A, *Ag, *LmLt; double t0, t1; TYPE normA, eps, resid; enum ATLAS_UPLO MyUplo = Uplo; if (Order == CblasRowMajor) { if (Uplo == CblasUpper) MyUplo = CblasLower; else MyUplo = CblasUpper; } eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*N + ATL_MulBySize(N)*N); if (A == NULL) return(-1); Ag = A + lda*(N SHIFT); t0 = ATL_flushcache(CacheSize); lltgen(MyUplo, N, A, lda, N*1029+lda); lltgen(MyUplo, N, Ag, N, N*1029+lda); normA = lltnrm1(MyUplo, N, A, lda); #ifdef DEBUG Mjoin(PATL,geprint)("A", N, N, A, lda); Mjoin(PATL,geprint)("Ag", N, N, Ag, N); #endif t0 = ATL_flushcache(-1); t0 = time00(); test_lauum(Order, Uplo, N, A, lda); t1 = time00() - t0; *tim = t1; t0 = ATL_flushcache(0); ATL_checkpad(MyUplo, N, A, lda); if (Uplo == CblasUpper) LmLt = ATL_UmulUt(Order, N, Ag, N); else LmLt = ATL_LtmulL(Order, N, Ag, N); #ifdef DEBUG Mjoin(PATL,geprint)("A", N, N, A, lda); Mjoin(PATL,geprint)("Ag", N, N, LmLt, N); #endif lltdiff(MyUplo, N, A, lda, LmLt, N); #ifdef DEBUG Mjoin(PATL,geprint)("A-L*Lt", N, N, LmLt, N); #endif resid = lltnrm1(MyUplo, N, LmLt, N) / (normA * eps * N); if (resid > 10.0 || resid != resid) fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid); free(LmLt); free(A); return(resid); }
double GetTimeWithReps_LLT (int mflopF, int lda, int M, int N, int nb, int Uplo, int Side, int flsizeKB) { double mflop, t0, t1, drep; char *wrksets; /* working sets for kernel calls */ #ifdef TCPLX const int lda2 = lda+lda; #else const int lda2 = lda; #endif size_t setsz, setszT; /* work set size in memory, and amnt of it touched */ size_t nrep; /* # of reps required to force mflopF flops */ size_t nset; /* # of working sets allocated */ int i; setsz=lda*N*ATL_sizeof; /* matrix is entire working set of LLt */ setszT=N*N*ATL_sizeof; /* only touch N*N portion */ mflop = GetFlopCount(LApotrf, Uplo, M, N, 0, 0, CAN_NB); /* * Cannot reuse matrices (bogus to factor an already factored matrix), so we * must take as our total memspace MAX(nrep,nset)*setsz */ ATL_assert(mflop > 0.0); drep = (mflopF*1.0e6) / mflop; nrep = (int)(drep+0.999999); /* * If cacheline flush doesn't work, then we must use this method */ #if ATL_LINEFLUSH if (nrep < 2) return(-1.0); /* do wt normal timer */ #else nrep = (nrep >= 1) ? nrep : 1; #endif nset = (flsizeKB*1024+setszT-1)/setszT; if (nset < nrep) nset = nrep; wrksets = malloc(nset * setsz); ATL_assert(wrksets); for (i=0; i < nset; i++) PosDefGen(CblasColMajor, Uplo_LA2ATL(Uplo), N, (TYPE*)(wrksets+i*setsz), lda); t0 = time00(); for (i=0; i < nrep; i++) { test_potrf(Uplo, N, (TYPE*)(wrksets+i*setsz), lda); } t1 = time00(); free(wrksets); return((t1-t0)/((double)nrep)); }
static TYPE llttest(enum ATLAS_UPLO Uplo, int CacheSize, int N, int lda, double *tim) { TYPE *A, *LmLt; int i; double t0, t1; TYPE normA, eps, resid; eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*N); if (A == NULL) return(-1); t0 = ATL_flushcache(CacheSize); lltgen(Uplo, N, A, lda, N*1029+lda); normA = lltnrm1(Uplo, N, A, lda); #ifdef DEBUG Mjoin(PATL,geprint)("A0", N, N, A, lda); #endif t0 = ATL_flushcache(-1); t0 = time00(); test_potrf(Uplo, N, A, lda); t1 = time00() - t0; *tim = t1; t0 = ATL_flushcache(0); #ifdef DEBUG Mjoin(PATL,geprint)("L", N, N, A, lda); #endif ATL_checkpad(Uplo, N, A, lda); if (Uplo == AtlasUpper) LmLt = ATL_UtmulU(N, A, lda); else LmLt = ATL_LmulLt(N, A, lda); #ifdef DEBUG Mjoin(PATL,geprint)("L*Lt", N, N, LmLt, N); #endif lltgen(Uplo, N, A, lda, N*1029+lda); /* regen A over LLt */ lltdiff(Uplo, N, A, lda, LmLt, N); #ifdef DEBUG Mjoin(PATL,geprint)("A-L*Lt", N, N, LmLt, N); #endif resid = lltnrm1(Uplo, N, LmLt, N); #ifdef DEBUG if (resid/(normA*eps*N) > 10.0) fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid); #endif resid /= (normA * eps * N); free(LmLt); free(A); return(resid); }
/*
 * Timing-only GEMM case: times trusted_gemm and then test_gemm, each for
 * nreps = MFLOP/mops calls (at least one), rotating through nmat operand
 * sets held in a single allocation, and prints one result line per kernel.
 *
 * Each time the rotation wraps back to the first operand set, bet is switched
 * between beta and betinv (1/beta, or 0 when beta is 0).
 *
 * RETURNS: always 1.
 */
int mmcase0(int MFLOP, int CACHESIZE, char TA, char TB, int M, int N, int K,
            SCALAR alpha, int lda, int ldb, SCALAR beta, int ldc)
{
   char *pc;
#ifdef TREAL
   char *form="%4d %c %c %4d %4d %4d %5.1f %5.1f %6.2f %5.1f %5.2f %3s\n";
   #define MALPH alpha
   #define MBETA beta
   TYPE betinv, bet=beta;
#else
   /* complex scalars are printed as (real, imag) pairs */
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d %c %c %4d %4d %4d %5.1f %5.1f %5.1f %5.1f %6.2f %6.1f %4.2f %3s\n";
   TYPE betinv[2], *bet=beta;
#endif
   int nreps, incA, incB, incC, inc, nmat, k;
   TYPE *c, *C, *a, *A, *b, *B, *st;
   /* NOTE(review): ii, jj, j, PASSED, nerrs, mf, maxval, f1, ferr and feps
      are declared but not used anywhere in this function. */
   int ii, jj, i, j=0, PASSED, nerrs;
   double t0, t1, t2, t3, mflop, mf, mops;
   TYPE maxval, f1, ferr;
   static TYPE feps=0.0;
   static int itst=1;   /* running test number printed in the first column */
   enum ATLAS_TRANS TAc, TBc;
   void *vp;

#ifdef TCPLX
   /* betinv = 1/beta for complex beta; the two branches divide by the
      larger-magnitude component */
   if (*beta == 0.0 && beta[1] == 0.0) betinv[0] = betinv[1] = 0.0;
   else if (beta[1] == 0.0) { betinv[0] = 1 / *beta; betinv[1] = 0.0; }
   else
   {
      t0 = *beta;
      t1 = beta[1];
      if (Mabs(t1) <= Mabs(t0))
      {
         t2 = t1 / t0;
         betinv[0] = t0 = 1.0 / (t0 + t1*t2);
         betinv[1] = -t0 * t2;
      }
      else
      {
         t2 = t0 / t1;
         betinv[1] = t0 = -1.0 / (t1 + t0*t2);
         betinv[0] = -t2 * t0;
      }
   }
   mops = ( ((8.0*M)*N)*K ) / 1000000.0;
#else
   if (beta != 0.0) betinv = 1.0 / beta;
   else betinv = beta;
   mops = ( ((2.0*M)*N)*K ) / 1000000.0;
#endif
   nreps = MFLOP / mops;
   if (nreps < 1) nreps = 1;
   /* translate the transpose characters and size each operand's storage */
   if (TA == 'n' || TA == 'N')
   {
      TAc = AtlasNoTrans;
      incA = lda * K;
   }
   else
   {
      if (TA == 'c' || TA == 'C') TAc = AtlasConjTrans;
      else TAc = AtlasTrans;
      incA = lda * M;
   }
   if (TB == 'n' || TB == 'N')
   {
      incB = ldb * N;
      TBc = AtlasNoTrans;
   }
   else
   {
      incB = ldb * K;
      if (TB == 'c' || TB == 'C') TBc = AtlasConjTrans;
      else TBc = AtlasTrans;
   }
   incC = ldc*N;
   inc = incA + incB + incC;   /* elements in one C,A,B operand set */
   i = M*K + K*N + M*N; /* amount of inc actually referenced */
   /* This is a hack; change to use of flushcache instead. */
   nmat = ((CACHESIZE/ATL_sizeof) + i)/i;
   vp = malloc(ATL_MulBySize(nmat*inc)+ATL_Cachelen);
   ATL_assert(vp);
   /* each operand set is laid out as C, then A, then B */
   /* NOTE(review): these pointer offsets are not scaled by SHIFT; confirm
      the stride is what is intended when TYPE pairs form a complex element */
   C = c = ATL_AlignPtr(vp);
   a = A = C + incC;
   b = B = A + incA;
   st = C + nmat*inc;          /* one past the last operand set */
   matgen(inc, nmat, C, inc, M*N);

#ifdef DEBUG
   printmat("A0", M, K, A, lda);
   printmat("B0", K, N, B, ldb);
   printmat("C0", M, N, C, ldc);
#endif

   /* ---- time the trusted kernel ---- */
   t0 = time00();
   for (k=nreps; k; k--)
   {
      trusted_gemm(TAc, TBc, M, N, K, alpha, a, lda, b, ldb, bet, c, ldc);
      c += inc; a += inc; b += inc;
      if (c == st)   /* wrapped: restart at first set and swap beta */
      {
         c = C; a = A; b = B;
         if (bet == beta) bet = betinv;
         else bet = beta;
      }
   }
   t1 = time00() - t0;
   t1 /= nreps;
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else   /* flop rates actually 8MNK+12MN & 2MNK + 2MN, resp */
      mflop = mops / t1;
   printf(form, itst, TA, TB, M, N, K, MALPH, MBETA, t1, mflop, 1.0, "---");

#ifdef DEBUG
   printmat("C", M, N, C, ldc);
#endif

   /* ---- time the kernel under test ---- */
   /* NOTE(review): a, b, c and bet are not reset here, so this loop resumes
      from wherever the previous loop stopped. */
   matgen(inc, nmat, C, inc, M*N);
   t0 = time00();
   for (k=nreps; k; k--)
   {
      test_gemm(TAc, TBc, M, N, K, alpha, a, lda, b, ldb, bet, c, ldc);
      c += inc; a += inc; b += inc;
      if (c == st)
      {
         c = C; a = A; b = B;
         if (bet == beta) bet = betinv;
         else bet = beta;
      }
   }

   t2 = time00() - t0;
   t2 /= nreps;
   if (t2 <= 0.0) t2 = mflop = 0.0;
   else mflop = mops / t2;

   pc = "---";
   /* t3 is the trusted/test time ratio printed in the second-to-last column */
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, itst++, TA, TB, M, N, K, MALPH, MBETA, t2, mflop, t3, pc);
   free(vp);
   return(1);
}
/*
 * Runs one GEMM case on caller-supplied buffers: times trusted_gemm into C,
 * then (unless TIMEONLY is defined) times test_gemm into D and, when TEST is
 * nonzero, compares D against C element by element.
 *
 * TEST      : nonzero to compare results; zero to time only (D aliases C)
 * CACHESIZE : forwarded to ATL_flushcache
 * TA, TB    : transpose characters ('n', 't' or 'c', either case)
 * A, B, C, D and their leading dimensions : operand storage
 *
 * RETURNS: 1 if the comparison passed or was not requested, 0 if any element
 *          differed by more than the error bound.
 *
 * PASSED is initialised to 1 so the return value is defined when TEST is zero
 * (the comparison block is the only other place that assigns it).
 */
int mmcase(int TEST, int CACHESIZE, char TA, char TB, int M, int N, int K,
           SCALAR alpha, TYPE *A, int lda, TYPE *B, int ldb, SCALAR beta,
           TYPE *C, int ldc, TYPE *D, int ldd)
{
   char *pc;
#ifdef TREAL
   char *form="%4d %c %c %4d %4d %4d %5.1f %5.1f %6.2f %5.1f %5.2f %3s\n";
   #define MALPH alpha
   #define MBETA beta
#else
   /* complex scalars are printed as (real, imag) pairs */
   #define MALPH *alpha, alpha[1]
   #define MBETA *beta, beta[1]
   char *form="%4d %c %c %4d %4d %4d %5.1f %5.1f %5.1f %5.1f %6.2f %6.1f %4.2f %3s\n";
#endif
   int ii, jj, i, j=0, nerrs;
   int PASSED = 1;
   double t0, t1, t2, t3, mflop;
   TYPE maxval, f1, ferr;
   static TYPE feps=0.0;   /* cached epsilon, set on first comparison */
   static int itst=1;      /* running test number printed in first column */
   enum ATLAS_TRANS TAc, TBc;

   if (!TEST)
      D = C;
   ATL_flushcache( CACHESIZE );

   if (TA == 'n' || TA == 'N')
   {
      matgen(M, K, A, lda, K*1112);
      TAc = AtlasNoTrans;
   }
   else
   {
      matgen(K, M, A, lda, K*1112);
      if (TA == 'c' || TA == 'C') TAc = AtlasConjTrans;
      else TAc = AtlasTrans;
   }
   if (TB == 'n' || TB == 'N')
   {
      matgen(K, N, B, ldb, N*2238);
      TBc = AtlasNoTrans;
   }
   else
   {
      matgen(N, K, B, ldb, N*2238);
      if (TB == 'c' || TB == 'C') TBc = AtlasConjTrans;
      else TBc = AtlasTrans;
   }
   matgen(M, N, C, ldc, M*N);

#ifdef DEBUG
   printmat("A0", M, K, A, lda);
   printmat("B0", K, N, B, ldb);
   printmat("C0", M, N, C, ldc);
#endif

   /* invalidate L2 cache */
   ATL_flushcache( -1 );
   t0 = time00();
   trusted_gemm(TAc, TBc, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   t1 = time00() - t0;
   if (t1 <= 0.0) mflop = t1 = 0.0;
   else   /* flop rates actually 8MNK+12MN & 2MNK + 2MN, resp */
   #ifdef TCPLX
      mflop = ( ((8.0*M)*N)*K ) / (t1*1000000.0);
   #else
      mflop = ( ((2.0*M)*N)*K ) / (t1*1000000.0);
   #endif
   printf(form, itst, TA, TB, M, N, K, MALPH, MBETA, t1, mflop, 1.0, "---");

#ifdef DEBUG
   printmat("C", M, N, C, ldc);
#endif

#ifndef TIMEONLY
   /* D gets the same seed as C so both kernels start from identical data */
   matgen(M, N, D, ldd, M*N);

   /* invalidate L2 cache */
   ATL_flushcache( -1 );
   t0 = time00();
   test_gemm(TAc, TBc, M, N, K, alpha, A, lda, B, ldb, beta, D, ldd);
   t2 = time00() - t0;
   if (t2 <= 0.0) t2 = mflop = 0.0;
   else
   #ifdef TCPLX
      mflop = ( ((8.0*M)*N)*K ) / (t2*1000000.0);
   #else
      mflop = ( ((2.0*M)*N)*K ) / (t2*1000000.0);
   #endif

#ifdef DEBUG
   printmat("D", M, N, D, ldd);
#endif
   if (TEST)
   {
      if (feps == 0.0)
      {
         feps = EPS;
#ifdef DEBUG
         printf("feps=%e\n",feps);
#endif
      }
/*
 *    Allowed per-element error, scaled by K and the scalar magnitudes
 */
#ifdef TREAL
      ferr = 2.0 * (Mabs(alpha) * 2.0*K*feps + Mabs(beta) * feps) + feps;
#else
      f1 = Mabs(*alpha) + Mabs(alpha[1]);
      maxval = Mabs(*beta) + Mabs(beta[1]);
      ferr = 2.0 * (f1*8.0*K*feps + maxval*feps) + feps;
#endif
      PASSED = 1;
      maxval = 0.0;
      pc = "YES";
      nerrs = ii = jj = 0;
      for (j=0; j != N; j++)
      {
         for (i=0; i != M SHIFT; i++)
         {
            f1 = D[i] - C[i];
            if (f1 < 0.0) f1 = -f1;
            if (f1 > ferr)
            {
               nerrs++;
               PASSED = 0;
               pc = "NO!";
               if (f1 > maxval)   /* remember the worst offender (1-based) */
               {
                  maxval=f1;
                  ii = i+1;
                  jj = j+1;
               }
            }
         }
         D += ldd SHIFT;
         C += ldc SHIFT;
      }
      if (maxval != 0.0)
         fprintf(stderr, "ERROR: nerr=%d, i=%d, j=%d, maxval=%e\n",
                 nerrs, ii,jj, maxval);
   }
   else pc = "---";
   /* t3 is the trusted/test time ratio */
   if (t1 == t2) t3 = 1.0;
   else if (t2 != 0.0) t3 = t1/t2;
   else t3 = 0.0;
   printf(form, itst++, TA, TB, M, N, K, MALPH, MBETA, t2, mflop, t3, pc);
#else
   itst++;
   PASSED = 1;
#endif
   ATL_flushcache( 0 );
   return(PASSED);
}
/*
 * Runs one LU test case: optionally checks the residual (lutestC/lutestR),
 * times test_getrf, and prints one result line to stdout.
 *
 * CacheSize : forwarded to the residual test and used to size the workspace
 * thresh    : residual threshold; if <= ATL_rzero the residual check is
 *             skipped and only timing is done
 * MFLOP     : minimum amount of work (MFLOP) to time; forces repetitions
 * Order, M, N, lda : problem description
 *
 * RETURNS: nonzero iff resid <= thresh.
 */
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order,
            int M, int N, int lda)
{
   char *cord = (Order == AtlasColMajor ? "Col" : "Row");
   const double maxMN = Mmax(M,N), minMN = Mmin(M,N);
   unsigned long nreps=0;
   int npiv=(-1), *ipiv;
   const int incA = (Order == AtlasColMajor ? N*lda : M*lda);
   double mflops, mflop, resid, tim=(-1.0), t0;
   TYPE *A, *a;
   int i, nmat;

#ifdef TREAL
   mflops = maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) -
            (minMN*minMN) / 2.0;
#else
   mflops = (maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) +
             (maxMN*minMN) / 2.0)*4.0 - 3.0 * minMN*minMN;
#endif
   mflops /= 1000000.0;

   if (thresh > ATL_rzero)
   {
      if (Order == AtlasColMajor)
         resid = lutestC(CacheSize, M, N, lda, &npiv, &tim);
      else
         resid = lutestR(CacheSize, M, N, lda, &npiv, &tim);
   }
   else
      resid = -1.0;

   if (MFLOP > mflops || thresh <= ATL_rzero)  /* need to time repetitively */
   {
      nreps = (mflops*1000000);
/*
 *    For very small problems the flop count truncates to 0, which would make
 *    the ceiling division below divide by zero; treat it as one flop.
 */
      if (nreps < 1)
         nreps = 1;
      nreps = (MFLOP*1000000 + nreps-1) / nreps;
      if (nreps < 1)
         nreps = 1;
      nmat = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL;
      nmat = (nmat + M*N) / (M*N);
      if (nmat < nreps)
         nmat = nreps;     /* don't reuse mem or no pivoting */
      a = A = malloc(nmat * ATL_MulBySize(incA));
      if (A != NULL)
      {
         ipiv = malloc(Mmin(M,N)*sizeof(int)); /* what the hell - reuse ipiv */
         if (ipiv)
         {
            /* fill the whole workspace as one long column */
            Mjoin(PATL,gegen)(nmat*incA, 1, A, nmat*incA, incA+M+3012);
            t0 = time00();
            for (i=nreps; i; i--, a += incA)
               test_getrf(Order, M, N, a, lda, ipiv);
            tim = time00() - t0;
            tim /= nreps;
            if (npiv == 0)
               npiv = findnpvt(Mmin(M,N), ipiv);
            free(ipiv);
         }
         else
            fprintf(stderr, " WARNING: not enough mem to run timings!\n");
         free(A);
      }
      else
         fprintf(stderr, " WARNING: not enough mem to run timings!\n");
   }

   if (tim > 0.0)
      mflop = mflops / tim;
   else
      mflop = 0.0;
   /* nreps is unsigned long, so it must be printed with %lu, not %d */
   fprintf(stdout, "%5lu %3s %6d %6d %6d %6d %9.3f %9.3f %9.3e\n",
           nreps, cord, M, N, lda, npiv, tim, mflop, resid);
   return(resid <= thresh);
}
double gemvcase(const int MFLOP, const enum ATLAS_TRANS TA, const int l2size, const int M, const int N, const SCALAR alpha, const int lda, const SCALAR beta) { unsigned long reps; int i, lx, ly, la; #ifdef TREAL const double flops = 2.0 * M * N; #else const double flops = 8.0 * M * N; #endif double ttest, mftest, t0; const int aincY=1, aincX=1, incY=1, incX=1; const int inca = (TA == AtlasNoTrans) ? lda * (N SHIFT) : lda * (M SHIFT); const int incx = N*incX SHIFT, incy = M*incY SHIFT; TYPE *a, *A, *stA, *A0, *x, *X, *X0, *stX, *y, *Y, *Y0, *stY; #ifdef TREAL const TYPE nbeta = -beta; TYPE bet = beta; #else const TYPE *bet = beta; TYPE nbeta[2]; nbeta[0] = -beta[0]; nbeta[1] = -beta[1]; #endif i = (ATL_DivBySize(l2size)+N-1)/N; if (i < 1) i = 1; lx = i * N * aincX; X0 = X = x = malloc(ATL_MulBySize(lx)); if (x == NULL) return(-1); i = (ATL_DivBySize(l2size)+M-1)/M; if (i < 1) i = 1; ly = i * M * aincY; Y0 = Y = y = malloc(ATL_MulBySize(ly)); if (y == NULL) { free(x); return(-1); } i = (ATL_DivBySize(l2size)+M*N)/(M*N); la = i * inca; A0 = A = a = malloc(ATL_MulBySize(la)); if (a == NULL) { free(x); free(y); return(-1); } if (incX < 1) { stX = x; x = X = x + (lx SHIFT); } else stX = x + (lx SHIFT); if (incY < 1) { stY = y; y = Y = y + (ly SHIFT); } else stY = y + (ly SHIFT); stA = a + (la SHIFT); reps = (MFLOP * 1000000.0) / flops; if (reps < 1) reps = 1; Mjoin(PATL,gegen)(ly, 1, Y0, ly, M*incY); Mjoin(PATL,gegen)(lx, 1, X0, lx, N*incY+127*50+77); Mjoin(PATL,gegen)(la, 1, A0, la, N*M+513*7+90); t0 = time00(); for (i=reps; i; i--) { #ifdef SYMM_ Mjoin(PATL,symv)(AtlasLower, N, alpha, a, lda, x, incX, beta, y, incY); #else Mjoin(PATL,gemv)(TA, M, N, alpha, a, lda, x, incX, beta, y, incY); #endif x += incx; y += incy; a += inca; if (x == stX) x = X; if (y == stY) { y = Y; if (bet == beta) bet = nbeta; else bet = beta; } if (a == stA) a = A; } ttest = time00() - t0; if (ttest > 0.0) mftest = (reps * flops) / (1000000.0 * ttest); else mftest = 0.0; free(A0); free(X0); 
free(Y0); return(mftest); }
double GetTimeWithReps_QL (int mflopF, int lda, int M, int N, int nb, int Uplo, int Side, int flsizeKB) { double mflop, t0, t1, drep; TYPE dtmp, dtmp1; char *wrksets; /* working sets for kernel calls */ #ifdef TCPLX const int lda2 = lda+lda; #else const int lda2 = lda; #endif size_t setsz, setszT; /* work set size in memory, and amnt of it touched */ size_t nrep; /* # of reps required to force mflopF flops */ size_t nset; /* # of working sets allocated */ int wlen; /* length of QR's workspace */ int i; /* * Figure out how much workspace is required, and allocate it */ test_geqlf(CblasColMajor, M, N, &dtmp1, lda, &dtmp1, &dtmp, -1); wlen = dtmp; /* * QR accesses matrix, Min(M,N)-length tau & workspace, but for flush purposes * be conservative and say it only accesses A */ setsz = (lda*N + wlen + Mmin(M,N)) * ATL_sizeof; setszT = M*N*ATL_sizeof; mflop = GetFlopCount(LAgeqrf, LARight+LALower, M, N, 0, 0, CAN_NB); /* * Cannot reuse matrices (bogus to factor an already factored matrix), so we * must take as our total memspace MAX(nrep,nset)*setsz */ ATL_assert(mflop > 0.0); drep = (mflopF*1.0e6) / mflop; nrep = (int)(drep+0.999999); /* * If cacheline flush doesn't work, then we must use this method */ #if ATL_LINEFLUSH if (nrep < 2) return(-1.0); /* do wt normal timer */ #else nrep = (nrep >= 1) ? nrep : 1; #endif nset = (flsizeKB*1024+setszT-1)/setszT; if (nset < nrep) nset = nrep; wrksets = malloc(nset * setsz); ATL_assert(wrksets); for (i=0; i < nset; i++) Mjoin(PATL,gegen)(M, N, (TYPE*)(wrksets+i*setsz), lda, M*N+lda); t0 = time00(); for (i=0; i < nrep; i++) { test_geqlf(CblasColMajor, M, N, (TYPE*)(wrksets+i*setsz), lda, (TYPE*)(wrksets+i*setsz+(N*lda+wlen)*ATL_sizeof), (TYPE*)(wrksets+i*setsz+N*lda*ATL_sizeof), wlen); } t1 = time00(); free(wrksets); return((t1-t0)/((double)nrep)); }
double GetTime(int rout, int mflopF, int lda, int M, int N, int nb, int Uplo, int Side, int flsizeKB) { #if ATL_LINEFLUSH FLSTRUCT *flp; #endif TYPE *A, *wrk=NULL, dtmp, dtmp1, *tau=NULL; int *ipiv=NULL, itmp, wlen; double t0, t1; /* * Call routs that force particular flop count if requested; they return -1.0 * if one invocation will suffice to force mflopF, in which case do the timing * in this routine, which is simpler & doesn't require LRU & as much workspace * If we don't have the ability to do cacheline flushing, must use LRU rout! */ #if ATL_LINEFLUSH if (mflopF > 0) { #endif if (rout == LApotrf) t1 = GetTimeWithReps_LLT(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); else if (rout == LAgeqrf) { if (Side == LARight) { if (Uplo == LAUpper) t1 = GetTimeWithReps_QR(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); else t1 = GetTimeWithReps_QL(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); } else if (Uplo == LAUpper) t1 = GetTimeWithReps_RQ(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); else t1 = GetTimeWithReps_LQ(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); } else t1 = GetTimeWithReps_LU(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); #if ATL_LINEFLUSH == 0 return(t1); #else if (t1 >= 0.0) return(t1); } #endif #if ATL_LINEFLUSH != 0 /* * Generate operands */ A = GetGE(M, N, lda); ATL_assert(A); flp = ATL_GetFlushStruct(A, N*((size_t)lda)*ATL_sizeof, NULL); if (rout == LApotrf) PosDefGen(CblasColMajor, Uplo_LA2ATL(Uplo), N, A, lda); else if (rout & LAgeqrf) { /* QR must allocate workspace */ if (Side == LARight) { if (Uplo == LAUpper) { test_geqrf(CblasColMajor, M, N, A, lda, &dtmp1, &dtmp, -1); } else { test_geqlf(CblasColMajor, M, N, A, lda, &dtmp1, &dtmp, -1); } } else if (Uplo == LAUpper) { test_gerqf(CblasColMajor, M, N, A, lda, &dtmp1, &dtmp, -1); } else { test_gelqf(CblasColMajor, M, N, A, lda, &dtmp1, &dtmp, -1); } wlen = dtmp; wrk = calloc(wlen, ATL_sizeof); ATL_assert(wrk); flp = ATL_GetFlushStruct(wrk, wlen*ATL_sizeof, flp); itmp = (M >= N) ? 
M : N; tau = calloc(itmp, ATL_sizeof); flp = ATL_GetFlushStruct(tau, itmp*ATL_sizeof, flp); } else { ipiv = calloc(M, sizeof(int)); ATL_assert(ipiv); flp = ATL_GetFlushStruct(ipiv, M*sizeof(int), flp); } /* * Flush cache, and do timing */ ATL_FlushAreasByCL(flp); if (rout == LApotrf) { t0 = time00(); test_potrf(Uplo, N, A, lda); t1 = time00(); } else if (rout == LAgeqrf) { if (Side == LARight) { if (Uplo == LAUpper) { t0 = time00(); test_geqrf(CblasColMajor, M, N, A, lda, tau, wrk, wlen); t1 = time00(); } else { t0 = time00(); test_geqlf(CblasColMajor, M, N, A, lda, tau, wrk, wlen); t1 = time00(); } } else if (Uplo == LAUpper) { t0 = time00(); test_gerqf(CblasColMajor, M, N, A, lda, tau, wrk, wlen); t1 = time00(); } else { t0 = time00(); test_gelqf(CblasColMajor, M, N, A, lda, tau, wrk, wlen); t1 = time00(); } } else { t0 = time00(); test_getrf(CblasColMajor, M, N, A, lda, ipiv); t1 = time00(); } if (tau) free(tau); if (wrk) free(wrk); if (ipiv) free(ipiv); free(A); ATL_KillAllFlushStructs(flp); return(t1 - t0); #endif }
static TYPE trtritest(enum ATLAS_ORDER Order, enum ATLAS_UPLO Uplo, enum ATLAS_DIAG Diag, int CacheSize, int N, int lda, double *tim) { TYPE *A, *Acompare; int i; double t0, t1; TYPE normA, eps, resid; /*int ierr;*/ #ifdef TCPLX const TYPE one[2]={ATL_rone, ATL_rzero}; #else const TYPE one = ATL_rone; #endif eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*N); Acompare = malloc(ATL_MulBySize(lda)*N); if (A == NULL) return(-1); if (Acompare == NULL) return(-1); t0 = ATL_flushcache(CacheSize); /* create random, diagonally dominant matrix with magic value at unused places. Last number is just the random seed. */ trigen(Order, Uplo, Diag, N, A, lda, PADVAL, N*1029+lda); /* Create backup to calculate residual. This one has to be used as a full matrix, so it has zero fills and correct diagonal. */ trigen(Order, Uplo, Diag, N, Acompare, lda, ATL_rzero, N*1029+lda); if (Diag==AtlasUnit) for (i=0; i < N; i++) Acompare[(i*(lda+1)) SHIFT] = ATL_rone; normA = trinrm1(Order,Uplo, Diag, N, A, lda); #ifdef DEBUG Mjoin(PATL,geprint)("A0", N, N, A, lda); #endif t0 = ATL_flushcache(-1); /* Calculate and time a solution */ t0 = time00(); test_trtri(Order, Uplo, Diag, N, A, lda); t1 = time00() - t0; *tim = t1; /* if (ierr != 0) { fprintf(stderr, "Return values != 0 : %d \n",ierr); return(9999.9999); }*/ t0 = ATL_flushcache(0); /* Instroduce a padding error. 
*/ /* A[(5+5*lda)SHIFT]=114.0; */ #ifdef DEBUG Mjoin(PATL,geprint)("L", N, N, A, lda); #endif ATL_checkpad(Order, Uplo, Diag, N, A, lda); /* Calculate A^{-1}*A */ cblas_trmm(Order,CblasLeft,Uplo,AtlasNoTrans,Diag, N,N,one,A,lda,Acompare,lda); #ifdef DEBUG Mjoin(PATL,geprint)("A^{-1}*A", N, N, Acompare, N); #endif /* Subtract diagonal */ for (i=0; i < N; i++) Acompare[i*((lda+1) SHIFT)] -= ATL_rone; /* resid = trinrm1(Order, Uplo,AtlasNonUnit,N,Acompare,lda); fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid); */ resid = Mjoin(PATL,genrm1)(N, N, Acompare, lda); #ifdef DEBUG if (resid/(normA*eps*N) > 10.0) fprintf(stderr, "normA=%e, eps=%e, num=%e\n", normA, eps, resid); #endif resid /= (normA * eps * N); free(Acompare); free(A); return(resid); }