double GetTimeWithReps_LU (int mflopF, int lda, int M, int N, int nb, int Uplo, int Side, int flsizeKB) { double mflop, t0, t1, drep; char *wrksets; /* working sets for kernel calls */ #ifdef TCPLX const int lda2 = lda+lda; #else const int lda2 = lda; #endif size_t setsz, setszT; /* work set size in memory, and amnt of it touched */ size_t nrep; /* # of reps required to force mflopF flops */ size_t nset; /* # of working sets allocated */ int i; /* * Keep setsz a multiple of TYPE size for alignment reasons. LU only accesses * M*N of matrix and all of IPIV. */ setsz = lda*N*ATL_sizeof + ((M*sizeof(int)+ATL_sizeof-1)/ATL_sizeof)*ATL_sizeof; setszT = M*N*ATL_sizeof + M*sizeof(int); mflop = GetFlopCount(LAgetrf, 0, M, N, 0, 0, CAN_NB); /* * Cannot reuse matrices (bogus to factor an already factored matrix), so we * must take as our total memspace MAX(nrep,nset)*setsz */ ATL_assert(mflop > 0.0); drep = (mflopF*1.0e6) / mflop; nrep = (int)(drep+0.999999); /* * If cacheline flush doesn't work, then we must use this method */ #if ATL_LINEFLUSH if (nrep < 2) return(-1.0); /* do wt normal timer */ #else nrep = (nrep >= 1) ? nrep : 1; #endif nset = (flsizeKB*1024+setszT-1)/setszT; if (nset < nrep) nset = nrep; wrksets = malloc(nset * setsz); ATL_assert(wrksets); for (i=0; i < nset; i++) Mjoin(PATL,gegen)(M, N, (TYPE*)(wrksets+i*setsz), lda, M*N+lda); t0 = time00(); for (i=0; i < nrep; i++) { test_getrf(CblasColMajor, M, N, (TYPE*)(wrksets+i*setsz), lda, (int*)(wrksets+i*setsz+lda*N*ATL_sizeof)); } t1 = time00(); free(wrksets); return((t1-t0)/((double)nrep)); }
static TYPE lutestR(int CacheSize, int M, int N, int lda, int *npiv, double *tim) { TYPE *A, *LmU; int *ipiv; const int MN = Mmin(M,N); int i; double t0, t1; TYPE normA, eps, resid; eps = Mjoin(PATL,epsilon)(); A = malloc(ATL_MulBySize(lda)*M); if (A == NULL) return(-1); ipiv = malloc( MN * sizeof(int) ); if (ipiv == NULL) { free(A); return(-1); } t0 = ATL_flushcache(CacheSize); Mjoin(PATL,gegen)(N, M, A, lda, M*N+lda); #ifdef DEBUG Mjoin(PATL,geprint)("A0", N, M, A, lda); #endif normA = Mjoin(PATL,genrm1)(N, M, A, lda); /* actually infnrm, but OK */ t0 = ATL_flushcache(-1); t0 = time00(); test_getrf(CblasRowMajor, M, N, A, lda, ipiv); t1 = time00() - t0; *tim = t1; t0 = ATL_flushcache(0); #ifdef DEBUG Mjoin(PATL,geprint)("LU", N, M, A, lda); #endif LmU = ATL_LmulUR(M, N, A, lda); /* LmU contains L * U */ #ifdef DEBUG Mjoin(PATL,geprint)("L*U", N, M, LmU, N); #endif Mjoin(PATL,gegen)(N, M, A, lda, M*N+lda); /* regenerate A, overwriting LU */ ATL_laswp(M, A, lda, 0, MN, ipiv, 1); /* apply swaps to A */ resid = Mjoin(PATL,gediffnrm1)(N, M, A, lda, LmU, N); resid /= (normA * eps * Mmin(M,N)); *npiv = findnpvt(MN, ipiv); free(LmU); free(A); free(ipiv); return(resid); }
/*
 * Runs one LU test/timing case and prints a one-line report to stdout.
 *
 * CacheSize : cache size handed to the residual testers and used to size the
 *             timing workspace
 * thresh    : residual threshold; if <= ATL_rzero the residual test is
 *             skipped and only the repeated timing is done
 * MFLOP     : requested amount of timed work, in millions of flops; when it
 *             exceeds the flop estimate of one call, the call is repeated
 * Order     : AtlasColMajor selects lutestC, anything else lutestR
 * M, N, lda : problem dimensions and leading dimension
 *
 * RETURNS: nonzero if resid <= thresh (resid is -1.0 when the test is
 *          skipped), zero otherwise.
 */
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order,
            int M, int N, int lda)
{
   const char *cord = (Order == AtlasColMajor ? "Col" : "Row");
   const double maxMN = Mmax(M,N), minMN = Mmin(M,N);
   unsigned long nreps=0;
   int npiv=(-1), *ipiv;
   const int incA = (Order == AtlasColMajor ? N*lda : M*lda);
   double mflops, mflop, resid, tim=(-1.0), t0;
   TYPE *A, *a;
   int i;

/*
 * Flop estimate for one factorization, converted to MFLOP below
 */
   #ifdef TREAL
      mflops = maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) -
               (minMN*minMN) / 2.0;
   #else
      mflops = (maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) +
                (maxMN*minMN) / 2.0)*4.0 - 3.0 * minMN*minMN;
   #endif
   mflops /= 1000000.0;

   if (thresh > ATL_rzero)
   {
      if (Order == AtlasColMajor)
         resid = lutestC(CacheSize, M, N, lda, &npiv, &tim);
      else
         resid = lutestR(CacheSize, M, N, lda, &npiv, &tim);
   }
   else
      resid = -1.0;

   if (MFLOP > mflops || thresh <= ATL_rzero) /* need to time repetitively */
   {
/*
 *    Flops per call, clamped to at least 1 so that tiny problems (whose
 *    estimate truncates to 0) cannot cause a division by zero.  The requested
 *    flop total is formed in unsigned long so MFLOP*1000000 does not overflow
 *    int; a non-positive MFLOP simply requests a single repetition.
 */
      const double dflops = mflops*1000000;
      const unsigned long flopsPerCall =
         (dflops >= 1.0) ? (unsigned long)dflops : 1UL;
      const unsigned long wantFlops =
         (MFLOP > 0) ? ((unsigned long)MFLOP)*1000000UL : 0UL;

      nreps = (wantFlops + flopsPerCall-1) / flopsPerCall;
      if (nreps < 1)
         nreps = 1;
/*
 *    Number of matrices to allocate: enough to cover twice the cache
 *    (scaled by ATL_PTCACHEMUL), and at least one per repetition
 */
      i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL;
      i = (i + M*N) / (M*N);
      if ((unsigned long)i < nreps)
         i = (int)nreps;     /* don't reuse mem or no pivoting */
      a = A = malloc(i * ATL_MulBySize(incA));
      if (A != NULL)
      {
         ipiv = malloc(Mmin(M,N)*sizeof(int)); /* what the hell - reuse ipiv */
         if (ipiv)
         {
            Mjoin(PATL,gegen)(i*incA, 1, A, i*incA, incA+M+3012);
            t0 = time00();
            for (i=nreps; i; i--, a += incA)
               test_getrf(Order, M, N, a, lda, ipiv);
            tim = time00() - t0;
            tim /= nreps;
            if (npiv == 0)
               npiv = findnpvt(Mmin(M,N), ipiv);
            free(ipiv);
         }
         else
            fprintf(stderr, " WARNING: not enough mem to run timings!\n");
         free(A);
      }
      else
         fprintf(stderr, " WARNING: not enough mem to run timings!\n");
   }

   if (tim > 0.0)
      mflop = mflops / tim;
   else
      mflop = 0.0;
/*
 * nreps is unsigned long, so it must be printed with %lu
 */
   fprintf(stdout, "%5lu %3s %6d %6d %6d %6d %9.3f %9.3f %9.3e\n",
           nreps, cord, M, N, lda, npiv, tim, mflop, resid);
   return(resid <= thresh);
}
double GetTime(int rout, int mflopF, int lda, int M, int N, int nb, int Uplo, int Side, int flsizeKB) { #if ATL_LINEFLUSH FLSTRUCT *flp; #endif TYPE *A, *wrk=NULL, dtmp, dtmp1, *tau=NULL; int *ipiv=NULL, itmp, wlen; double t0, t1; /* * Call routs that force particular flop count if requested; they return -1.0 * if one invocation will suffice to force mflopF, in which case do the timing * in this routine, which is simpler & doesn't require LRU & as much workspace * If we don't have the ability to do cacheline flushing, must use LRU rout! */ #if ATL_LINEFLUSH if (mflopF > 0) { #endif if (rout == LApotrf) t1 = GetTimeWithReps_LLT(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); else if (rout == LAgeqrf) { if (Side == LARight) { if (Uplo == LAUpper) t1 = GetTimeWithReps_QR(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); else t1 = GetTimeWithReps_QL(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); } else if (Uplo == LAUpper) t1 = GetTimeWithReps_RQ(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); else t1 = GetTimeWithReps_LQ(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); } else t1 = GetTimeWithReps_LU(mflopF, lda, M, N, nb, Uplo, Side, flsizeKB); #if ATL_LINEFLUSH == 0 return(t1); #else if (t1 >= 0.0) return(t1); } #endif #if ATL_LINEFLUSH != 0 /* * Generate operands */ A = GetGE(M, N, lda); ATL_assert(A); flp = ATL_GetFlushStruct(A, N*((size_t)lda)*ATL_sizeof, NULL); if (rout == LApotrf) PosDefGen(CblasColMajor, Uplo_LA2ATL(Uplo), N, A, lda); else if (rout & LAgeqrf) { /* QR must allocate workspace */ if (Side == LARight) { if (Uplo == LAUpper) { test_geqrf(CblasColMajor, M, N, A, lda, &dtmp1, &dtmp, -1); } else { test_geqlf(CblasColMajor, M, N, A, lda, &dtmp1, &dtmp, -1); } } else if (Uplo == LAUpper) { test_gerqf(CblasColMajor, M, N, A, lda, &dtmp1, &dtmp, -1); } else { test_gelqf(CblasColMajor, M, N, A, lda, &dtmp1, &dtmp, -1); } wlen = dtmp; wrk = calloc(wlen, ATL_sizeof); ATL_assert(wrk); flp = ATL_GetFlushStruct(wrk, wlen*ATL_sizeof, flp); itmp = (M >= N) ? 
M : N; tau = calloc(itmp, ATL_sizeof); flp = ATL_GetFlushStruct(tau, itmp*ATL_sizeof, flp); } else { ipiv = calloc(M, sizeof(int)); ATL_assert(ipiv); flp = ATL_GetFlushStruct(ipiv, M*sizeof(int), flp); } /* * Flush cache, and do timing */ ATL_FlushAreasByCL(flp); if (rout == LApotrf) { t0 = time00(); test_potrf(Uplo, N, A, lda); t1 = time00(); } else if (rout == LAgeqrf) { if (Side == LARight) { if (Uplo == LAUpper) { t0 = time00(); test_geqrf(CblasColMajor, M, N, A, lda, tau, wrk, wlen); t1 = time00(); } else { t0 = time00(); test_geqlf(CblasColMajor, M, N, A, lda, tau, wrk, wlen); t1 = time00(); } } else if (Uplo == LAUpper) { t0 = time00(); test_gerqf(CblasColMajor, M, N, A, lda, tau, wrk, wlen); t1 = time00(); } else { t0 = time00(); test_gelqf(CblasColMajor, M, N, A, lda, tau, wrk, wlen); t1 = time00(); } } else { t0 = time00(); test_getrf(CblasColMajor, M, N, A, lda, ipiv); t1 = time00(); } if (tau) free(tau); if (wrk) free(wrk); if (ipiv) free(ipiv); free(A); ATL_KillAllFlushStructs(flp); return(t1 - t0); #endif }