void Mjoin(PATL,gpmm)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum PACK_UPLO UB, const enum PACK_TRANS TB,
    const enum PACK_UPLO UC, const int M, const int N, const int K,
    const SCALAR alpha, const TYPE *A, const int IA, const int JA,
    const int lda, const TYPE *B, const int IB, const int JB, const int ldb,
    const SCALAR beta, TYPE *C, const int IC, const int JC, const int ldc)
{
   int j;
#ifdef CacheEdge
   static const int CE_K = ((ATL_DivBySize(CacheEdge)-(NBNB SHIFT)) /
                            (NB*(NB+NB)))*NB;
#else
   #define CE_K K
#endif
   if (!M || !N)
      return;
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      for (j=0; j != N; j++)
         Mjoin(PATL,scal)(M, beta, C+MindexP(UC,IC,JC+j,ldc), 1);
      return;
   }
/*
 * Packed gpmm not yet implemented for complex,
 * so die if not really a dense gemm
 */
#ifdef TCPLX
   ATL_assert(UA == PackGen && UB == PackGen && UC == PackGen);
   Mjoin(PATL,gemm)(TA, TB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#else
   Mjoin(PATL,prankK)(UA, TA, UB, TB, M, N, K, CE_K, alpha,
                      A+MindexP(UA,IA,JA,lda), Mpld(UA,JA,lda),
                      B+MindexP(UB,IB,JB,ldb), Mpld(UB,JB,ldb), beta,
                      UC, C+MindexP(UC,IC,JC,ldc), Mpld(UC,JC,ldc));
#endif
}
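/*
 * Illustrative sketch (not part of ATLAS): the CE_K expression above picks
 * the K-panel length so that one NB x NB block of C plus an NB x CE_K panel
 * of A and a CE_K x NB panel of B together fit in CacheEdge bytes, rounded
 * down to a multiple of NB.  cache_elts and nb are hypothetical stand-ins
 * for ATL_DivBySize(CacheEdge) and NB; sizes are in elements, real case.
 */
static int sketch_ce_k(long cache_elts, int nb)
{
   long kpan = (cache_elts - (long)nb*nb) / (nb*(nb+nb)); /* A & B panels */
   return((kpan > 0) ? (int)(kpan*nb) : nb); /* multiple of nb, at least nb */
}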
void Mjoin(PATL,sprk)
   (const enum PACK_UPLO UA, const enum PACK_TRANS TA,
    const enum ATLAS_UPLO UC, const int CP, const int N, const int K,
    const SCALAR alpha, const TYPE *A, const int IA, const int JA,
    const int lda, const SCALAR beta, TYPE *C, const int IC, const int JC,
    const int ldc)
{
   const enum PACK_UPLO UC2 = ((CP) ? UC : PackGen);
   int j;
#ifdef CacheEdge
   static const int CE_K = ((ATL_DivBySize(CacheEdge SHIFT)-(NBNB SHIFT)) /
                            (NB*(NB+NB)))*NB;
#else
   #define CE_K K
#endif
   if ((!N) || ((SCALAR_IS_ZERO(alpha) || (!K)) && (SCALAR_IS_ONE(beta))))
      return;
   if (!K || SCALAR_IS_ZERO(alpha))
   {
      if (UC == CblasLower)
      {
         for (j=0; j != N; j++)
            Mjoin(PATL,scal)(N-j, beta, C+MindexP(UC2,IC+j,JC+j,ldc), 1);
      }
      else /* UC == CblasUpper */
      {
         for (j=0; j != N; j++)
            Mjoin(PATL,scal)(j+1, beta, C+MindexP(UC2,IC,JC+j,ldc), 1);
      }
      return;
   }
   Mjoin(PATL,sprk_rK)(UA, TA, UC, CP, N, K, CE_K, alpha, A, lda,
                       beta, C, ldc);
}
static double RunTiming(enum CBLAS_ORDER Order, enum TEST_UPLO Uplo, int N,
                        int lda, int CacheSize, int nreps)
{
   TYPE *A, *a;
   const int incA = N*lda;
   int i, k;
   double t0, t1=0.0;

   if (nreps < 1)
      nreps = 1;
   i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL;
   k = i = (i + N*N-1) / (N*N);
   if (nreps > i)
      k = i = nreps;
   a = A = malloc(i * ATL_MulBySize(incA));
   if (A)
   {
      if (Uplo == TestGE)
         for (i=0; i < k; i++)
            Mjoin(PATL,gegen)(N, N, A+i*incA, lda, N+lda);
      else
         for (i=0; i < k; i++)
            hegen(Order, Uplo, N, A+i*incA, lda);
      t0 = time00();
      for (i=nreps; i; i--, a += incA)
         test_inv(Order, Uplo, N, a, lda);
      t1 = time00() - t0;
      free(A);
   }
   else
      fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
   return(t1/nreps);
}
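/*
 * Illustrative sketch (not part of ATLAS): the cache-flushing idea used by
 * RunTiming above.  Enough copies of the operand are allocated that walking
 * through them exceeds the flush target derived from 2*CacheSize, and at
 * least one fresh copy is used per repetition, so no timed call begins with
 * its matrix already resident in cache.  opsize_elts and flush_elts are
 * hypothetical stand-ins for N*N and the flush size in elements.
 */
static int sketch_ncopies(long opsize_elts, long flush_elts, int nreps)
{
   long ncopy = (flush_elts + opsize_elts - 1) / opsize_elts;
   if (ncopy < nreps)      /* need a distinct copy for every repetition */
      ncopy = nreps;
   return((int)ncopy);
}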
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order,
            enum ATLAS_UPLO Uplo, int N, int lda)
{
   char *Ups, *Ord;
   TYPE resid = 0.0;
   double mflop, mflops, t0, tim=0.0;
   int nreps=1, passed, i, imem;
   const int incA = lda*N;
   TYPE *a, *A;

   mflops = N;
   mflops = (mflops*mflops*mflops) / 4.0;
#ifdef TCPLX
   mflops *= 4.0;
#endif
   mflops /= 1000000.0;

   if (thresh > ATL_rzero)
      resid = uumtest(Order, Uplo, CacheSize, N, lda, &tim);
   else
      resid = -1.0;
   if (MFLOP > mflops || thresh <= ATL_rzero) /* need to time repetitively */
   {
      nreps = (mflops * 1000000);
      nreps = (MFLOP*1000000 + nreps-1) / nreps;
      if (nreps < 1)
         nreps = 1;
      imem = ATL_DivBySize(CacheSize) ATL_PTCACHEMUL;
      imem = (imem + 2*N*N-1) / (N*N);
      if (imem < nreps)
         imem = nreps;
      a = A = malloc(imem * ATL_MulBySize(incA));
      if (A != NULL)
      {
         for (i=0; i < imem; i++)
            lltgen(Uplo, N, A+i*incA, lda, N*1029+lda);
         t0 = time00();
         for (i=nreps; i; i--, a += incA)
            test_lauum(Order, Uplo, N, a, lda);
         tim = time00() - t0;
         tim /= nreps;
         free(A);
      }
      else
         fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
   }

   if (tim > 0.0)
      mflop = mflops / tim;
   else
      mflop = 0.0;
   if (Uplo == AtlasUpper)
      Ups = "Upper";
   else
      Ups = "Lower";
   if (Order == CblasColMajor)
      Ord = "Col";
   else
      Ord = "Row";
   fprintf(stdout, "%5d %3s %5s %6d %6d %12.5f %12.3f %12e\n",
           nreps, Ord, Ups, N, lda, tim, mflop, resid);
   if (resid > thresh || resid != resid)
      passed = 0;
   else if (resid < 0.0)
      passed = -1;
   else
      passed = 1;
   return(passed);
}
void Mjoin(Mjoin(PATL,t),MY_GER)
   (ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *X, ATL_CINT incX,
    const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda)
{
   ATL_INT mb, nb, mu, nu, nblks, nrblks, ncblks, ldaP;
   ATL_TGER_t pd;
   int P;
   static TYPE *A0=NULL, *A0e=NULL;

   if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha)) /* quick return if no-op */
      return;

   pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda;
   pd.alpha = alpha;
   pd.X = X; pd.Y = Y; pd.A = A;
   pd.flg = (A0 == A || A0e == A+(M SHIFT)) ? 1 : 2;
   A0 = A;
   A0e = A+(M SHIFT);
   P = ATL_DivBySize(CacheEdge);
   P = ((size_t)M*N+P-1) / P;    /* add more procs only when cache is full */
   P = (P&1 && P > 1) ? P+1 : P; /* don't use odd P, since it hurts alignment */
// printf("TGER, P=%d\n", P);
   P = Mmin(ATL_NTHREADS, P);
/*
 * Make sure we don't overflow 32-bit integer lda
 */
   ldaP = P * lda;
   while ((size_t)ldaP != ((size_t)lda)*P)
   {
      P--;
      ldaP = P * lda;
   }
   if (P > 1)
      ATL_goparallel(P, MY_DOWORK_cols, &pd, NULL);
   else
      MY_GER1(M, N, alpha, X, incX, Y, incY, A, lda);
}
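/*
 * Illustrative sketch (not part of ATLAS): the thread-count heuristic used
 * by the threaded GER above.  One worker is added for every CacheEdge worth
 * of the matrix, odd counts are rounded up (odd P hurts alignment of the
 * column split), the count is capped at the configured thread limit, and P
 * is then reduced until P*lda still fits in a 32-bit int.  cache_elts and
 * maxthr are hypothetical stand-ins for ATL_DivBySize(CacheEdge) and
 * ATL_NTHREADS.
 */
#include <stddef.h>
#include <limits.h>
static int sketch_ger_nthreads(size_t M, size_t N, int lda,
                               size_t cache_elts, int maxthr)
{
   size_t p = (M*N + cache_elts - 1) / cache_elts;
   int P;
   if ((p & 1) && p > 1)
      p++;                      /* avoid odd worker counts */
   P = (p > (size_t)maxthr) ? maxthr : (int)p;
   while (P > 1 && (size_t)lda * (size_t)P > (size_t)INT_MAX)
      P--;                      /* keep P*lda representable as a 32-bit int */
   return(P);
}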
void Mjoin(PATL,tgemv)
   (const enum ATLAS_TRANS TA, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
    const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
    const SCALAR beta, TYPE *Y, ATL_CINT incY)
{
   static size_t ALb=0, ALe=0;
   size_t at = (size_t) A;
   ATL_INT n, P, ldaP;
   ATL_TGEMV_t pd;
/*
 * quick return if possible.
 */
   if (M < 1 || N < 1)
      return;
   if (SCALAR_IS_ZERO(alpha))   /* No contrib from alpha*A*x */
   {
      ATL_CINT NY = (TA == AtlasTrans || TA == AtlasConjTrans) ? N : M;
      if (!SCALAR_IS_ONE(beta))
      {
         if (SCALAR_IS_ZERO(beta))
            Mjoin(PATL,zero)(NY, Y, incY);
         else
            Mjoin(PATL,scal)(NY, beta, Y, incY);
      }
      return;
   }
   pd.flg = (at >= ALb && at <= ALe) ? 1 : 0;
   ALb = (size_t)A;
   ALe = (size_t)(A+(M SHIFT));
#ifdef TREAL
   pd.flg |= (TA == AtlasTrans || TA == AtlasConjTrans) ? 2 : 0;
#else
   if (TA != AtlasNoTrans)
   {
      if (TA == AtlasConj)
         pd.flg |= 4;
      else if (TA == AtlasTrans)
         pd.flg |= 2;
      else /* if (TA == AtlasConjTrans) */
         pd.flg |= (2|4);
   }
#endif
   P = ATL_DivBySize(CacheEdge);
   P = ((size_t)M*N+P-1) / P;    /* add more procs only when cache is full */
   P = (P&1 && P > 1) ? P+1 : P; /* don't use odd P; it hurts alignment */
   P = Mmin(ATL_NTHREADS, P);
   if (TA == AtlasNoTrans || TA == AtlasConj)
      P = 1;
//fprintf(stderr, "P=%d, TA=%d, M=%d, N=%d\n", P, (TA==AtlasTrans), M, N);
/*
 * Make sure we don't overflow 32-bit integer lda
 */
   ldaP = P * lda;
   while ((size_t)ldaP != ((size_t)lda)*P)
   {
      P--;
      ldaP = P * lda;
   }
   if (P > 1)
   {
      pd.M = M; pd.N = N; pd.incX = incX; pd.incY = incY; pd.lda = lda;
      pd.alpha = alpha; pd.beta = beta;
      pd.X = X; pd.Y = Y; pd.A = A;
      pd.P = P;
      n = N / P;
      pd.n = n;
      pd.nr = N - n*P;
      if (pd.flg & 2)   /* Transpose case */
      {
         ATL_goparallel(P, Mjoin(PATL,DOMVTWORK_cols), &pd, NULL);
         return;
      }
/*
 *    For gemvN, everyone needs a private M-length y.  Don't do this unless
 *    we are sure the combine cost is likely dominated by the parallelism.
 */
      else if (n > Mmax(P,8))
      {
         int vrank;
         const TYPE *a;
         TYPE *y, *y0;
         #ifdef TCPLX
            TYPE one[2] = {ATL_rone, ATL_rzero};
            TYPE zero[2] = {ATL_rzero, ATL_rzero};
         #endif
         y0 = y = malloc(P*(ATL_Cachelen+ATL_MulBySize(M)));
         ATL_assert(y);
         pd.Y = y;
         pd.incY = 1;
         #ifdef TREAL
            pd.alpha = ATL_rone;
            pd.beta = ATL_rzero;
         #else
            pd.alpha = one;
            pd.beta = zero;
         #endif
         ATL_goparallel(P, Mjoin(PATL,DOMVNWORK_cols), &pd,
                        Mjoin(PATL,CombineMVN));
/*
 *       goparallel reduces all nodes' Ys to node 0's.  Extract it from the
 *       work array, and combine it with the input array, applying both alpha
 *       and beta in the process.
 */
         vrank = (!pd.nr || (pd.flg & 1)) ? 0 : pd.nr-1;
         a = A + (lda SHIFT)*vrank;
         y = ATL_Align2Ptr(y, a);
         Mjoin(PATL,axpby)(M, alpha, y, 1, beta, Y, incY);
         free(y0);
         return;
      }
   }
/*
 * If we haven't parallelized this thing, just do it serial
 */
   Mjoin(PATL,gemv)(TA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}
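/*
 * Illustrative sketch (hypothetical; the actual split lives in the
 * DOMVTWORK/DOMVNWORK workers): one natural way the pd.n = N/P columns per
 * worker and the pd.nr = N - n*P leftover columns set above can be mapped to
 * a contiguous [j0, j0+nj) column range per rank, with the first nr ranks
 * taking one extra column each.
 */
static void sketch_col_range(int rank, int P, int N, int *j0, int *nj)
{
   int n = N / P, nr = N - n*P;
   *nj = n + (rank < nr ? 1 : 0);
   *j0 = rank*n + (rank < nr ? rank : nr);
}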
int RunCase(int CacheSize, TYPE thresh, int MFLOP, enum ATLAS_ORDER Order,
            int M, int N, int lda)
{
   char *cord = (Order == AtlasColMajor ? "Col" : "Row");
   const double maxMN = Mmax(M,N), minMN = Mmin(M,N);
   unsigned long nreps=0;
   int npiv=(-1), *ipiv;
   const int incA = (Order == AtlasColMajor ? N*lda : M*lda);
   double mflops, mflop, resid, tim=(-1.0), t0;
   TYPE *A, *a;
   int i;

#ifdef TREAL
   mflops = maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) -
            (minMN*minMN) / 2.0;
#else
   mflops = (maxMN * minMN * minMN - ((minMN*minMN*minMN) / 3.0) +
             (maxMN*minMN) / 2.0)*4.0 - 3.0 * minMN*minMN;
#endif
   mflops /= 1000000.0;

   if (thresh > ATL_rzero)
   {
      if (Order == AtlasColMajor)
         resid = lutestC(CacheSize, M, N, lda, &npiv, &tim);
      else
         resid = lutestR(CacheSize, M, N, lda, &npiv, &tim);
   }
   else
      resid = -1.0;
   if (MFLOP > mflops || thresh <= ATL_rzero) /* need to time repetitively */
   {
      nreps = (mflops*1000000);
      nreps = (MFLOP*1000000 + nreps-1) / nreps;
      if (nreps < 1)
         nreps = 1;
      i = ATL_DivBySize(2*CacheSize) ATL_PTCACHEMUL;
      i = (i + M*N) / (M*N);
      if (i < nreps)
         i = nreps;   /* don't reuse mem or no pivoting */
      a = A = malloc(i * ATL_MulBySize(incA));
      if (A != NULL)
      {
         ipiv = malloc(Mmin(M,N)*sizeof(int)); /* what the hell - reuse ipiv */
         if (ipiv)
         {
            Mjoin(PATL,gegen)(i*incA, 1, A, i*incA, incA+M+3012);
            t0 = time00();
            for (i=nreps; i; i--, a += incA)
               test_getrf(Order, M, N, a, lda, ipiv);
            tim = time00() - t0;
            tim /= nreps;
            if (npiv == 0)
               npiv = findnpvt(Mmin(M,N), ipiv);
            free(ipiv);
         }
         else
            fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
         free(A);
      }
      else
         fprintf(stderr, "   WARNING: not enough mem to run timings!\n");
   }

   if (tim > 0.0)
      mflop = mflops / tim;
   else
      mflop = 0.0;
   fprintf(stdout, "%5lu %3s %6d %6d %6d %6d %9.3f %9.3f %9.3e\n",
           nreps, cord, M, N, lda, npiv, tim, mflop, resid);
   return(resid <= thresh);
}
double gemvcase(const int MFLOP, const enum ATLAS_TRANS TA, const int l2size,
                const int M, const int N, const SCALAR alpha, const int lda,
                const SCALAR beta)
{
   unsigned long reps;
   int i, lx, ly, la;
#ifdef TREAL
   const double flops = 2.0 * M * N;
#else
   const double flops = 8.0 * M * N;
#endif
   double ttest, mftest, t0;
   const int aincY=1, aincX=1, incY=1, incX=1;
   const int inca = (TA == AtlasNoTrans) ? lda * (N SHIFT) : lda * (M SHIFT);
   const int incx = N*incX SHIFT, incy = M*incY SHIFT;
   TYPE *a, *A, *stA, *A0, *x, *X, *X0, *stX, *y, *Y, *Y0, *stY;
#ifdef TREAL
   const TYPE nbeta = -beta;
   TYPE bet = beta;
#else
   const TYPE *bet = beta;
   TYPE nbeta[2];
   nbeta[0] = -beta[0];
   nbeta[1] = -beta[1];
#endif

   i = (ATL_DivBySize(l2size)+N-1)/N;
   if (i < 1)
      i = 1;
   lx = i * N * aincX;
   X0 = X = x = malloc(ATL_MulBySize(lx));
   if (x == NULL)
      return(-1);
   i = (ATL_DivBySize(l2size)+M-1)/M;
   if (i < 1)
      i = 1;
   ly = i * M * aincY;
   Y0 = Y = y = malloc(ATL_MulBySize(ly));
   if (y == NULL)
   {
      free(x);
      return(-1);
   }
   i = (ATL_DivBySize(l2size)+M*N)/(M*N);
   la = i * inca;
   A0 = A = a = malloc(ATL_MulBySize(la));
   if (a == NULL)
   {
      free(x);
      free(y);
      return(-1);
   }

   if (incX < 1)
   {
      stX = x;
      x = X = x + (lx SHIFT);
   }
   else
      stX = x + (lx SHIFT);
   if (incY < 1)
   {
      stY = y;
      y = Y = y + (ly SHIFT);
   }
   else
      stY = y + (ly SHIFT);
   stA = a + (la SHIFT);

   reps = (MFLOP * 1000000.0) / flops;
   if (reps < 1)
      reps = 1;
   Mjoin(PATL,gegen)(ly, 1, Y0, ly, M*incY);
   Mjoin(PATL,gegen)(lx, 1, X0, lx, N*incY+127*50+77);
   Mjoin(PATL,gegen)(la, 1, A0, la, N*M+513*7+90);

   t0 = time00();
   for (i=reps; i; i--)
   {
#ifdef SYMM_
      Mjoin(PATL,symv)(AtlasLower, N, alpha, a, lda, x, incX, beta, y, incY);
#else
      Mjoin(PATL,gemv)(TA, M, N, alpha, a, lda, x, incX, beta, y, incY);
#endif
      x += incx;
      y += incy;
      a += inca;
      if (x == stX)
         x = X;
      if (y == stY)
      {
         y = Y;
         if (bet == beta)
            bet = nbeta;
         else
            bet = beta;
      }
      if (a == stA)
         a = A;
   }
   ttest = time00() - t0;

   if (ttest > 0.0)
      mftest = (reps * flops) / (1000000.0 * ttest);
   else
      mftest = 0.0;
   free(A0);
   free(X0);
   free(Y0);
   return(mftest);
}
/*
 * This routine handles the case where N <= maxNB && K <= maxKB, so B is
 * only one block.  It is particularly important for the panel factorizations
 * of both LU and QR.
 */
int Mjoin(PATL,tammm_tNK)
(
   enum ATLAS_TRANS TA,
   enum ATLAS_TRANS TB,
   ATL_CINT M,
   ATL_CINT N,
   ATL_CINT K,
   const SCALAR alpha,
   const TYPE *A,
   ATL_CINT lda,
   const TYPE *B,
   ATL_CINT ldb,
   const SCALAR beta,
   TYPE *C,
   ATL_CINT ldc
)
{
   ATL_SZT nmblks;
   amminfo_t mminfo;
   unsigned int i, mb, nb, kb, mu, nu, ku, P, mr;
   ATL_tamm_tNK_t pd;   /* problem definition structure */
   void *vp;
/*
 * Special case for tiny N & K, and large M
 */
   if (N >= ATL_AMM_MAXNB || K >= ATL_AMM_MAXKB || M < ATL_AMM_MAXMB ||
       M < Mmin(8,ATL_NTHREADS)*ATL_AMM_MAXMB)
      return(1);

   Mjoin(PATL,GetRankKInfo)(&mminfo, TA, TB, M, N, K, alpha, beta);
   pd.a2blk = mminfo.a2blk;
   pd.b2blk = mminfo.b2blk;
   pd.blk2c = mminfo.Cblk2cm;
   pd.amm_b0 = mminfo.amm_b0;
   pd.TA = (TA == AtlasTrans);
   pd.TB = (TB == AtlasTrans);
   pd.N = N;
   pd.K = K;
   pd.A = A;
   pd.B = B;
   pd.C = C;
   pd.lda = lda;
   pd.ldb = ldb;
   pd.ldc = ldc;
   pd.alpha = &alpha;
   pd.beta = &beta;
   mu = mminfo.mu;
   nu = mminfo.nu;
   ku = mminfo.ku;
   pd.mb = mb = mminfo.mb;
   pd.nmu = mb / mu;
   pd.nnu = (N+nu-1)/nu;
   nb = pd.nnu * nu;
   kb = mminfo.kb;
   nmblks = M / mb;
   mr = M - nmblks*mb;
   if (!mr)
   {
      pd.mbL = mr = mb;
      pd.nmuL = pd.nmu;
   }
   else
   {
      nmblks++;
      pd.nmuL = (mr+mu-1)/mu;
      pd.mbL = pd.nmuL * mu;
   }
   pd.mr = mr;
   pd.nmblks = nmblks;
   pd.KB0 = K;
#if ATL_MAXKMAJ_RKK > 1
   if (ATL_AMMFLG_KMAJOR(mminfo.flag))
      pd.KB0 = ((K+ku-1)/ku)*ku;
#endif
/*
 * Maximum scale is limited by NTHREADS or max number of M-blocks
 */
   P = (ATL_NTHREADS <= nmblks) ? ATL_NTHREADS : nmblks;
/*
 * We have a common B wrk of size KB0*nb; then for each node we need
 * workspace: sz(A,C) = mb*K, K*nb, mb*N, laid out in memory as A,C.  We add
 * a safety margin of mu*nu*ku so advance loads don't seg fault, and we add
 * space for aligning the ptrs.
 */
   pd.bsz = pd.KB0*nb;
   pd.wsz = mb*(pd.nnu*nu + pd.bsz) + 2*ATL_DivBySize(ATL_Cachelen);
   vp = malloc(ATL_MulBySize(pd.wsz*P + pd.bsz+mu*nu*ku) + ATL_Cachelen);
   if (!vp)
      return(2);
   pd.w = ATL_AlignPtr(vp);
   pd.MbCtr = ATL_SetGlobalAtomicCount(ATL_EstNctr(nmblks, P), nmblks, 0);
   pd.BassgCtr = ATL_SetAtomicCount(1);
   pd.BdoneCtr = ATL_SetAtomicCount(1);
#ifdef DEBUG1
   {
      ATL_LAUNCHSTRUCT_t ls;
      ATL_thread_t ts;
      ts.rank = 0;
      ts.P = 1;
      ls.opstruct = &pd;
      Mjoin(PATL,DoWork_tamm_tNK)(&ls, &ts);
   }
#else
   ATL_goparallel(P, Mjoin(PATL,DoWork_tamm_tNK), &pd, NULL);
#endif
   ATL_FreeAtomicCount(pd.BdoneCtr);
   ATL_FreeAtomicCount(pd.BassgCtr);
   ATL_FreeGlobalAtomicCount(pd.MbCtr);
   free(vp);
   return(0);
}
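/*
 * Illustrative sketch (not part of ATLAS): how the M dimension is split into
 * nmblks row-blocks of mb above.  Any leftover rows form a final block whose
 * size is rounded up to a multiple of the register blocking mu, so the last
 * block still maps onto whole mu-row micro-tiles.
 */
static void sketch_mblocks(int M, int mb, int mu,
                           int *nmblks, int *mbLast, int *nmuLast)
{
   int nblk = M / mb, mr = M - nblk*mb;
   if (!mr)                       /* M is a multiple of mb */
   {
      *nmblks = nblk;
      *nmuLast = mb / mu;
      *mbLast = mb;
   }
   else                           /* partial last block, rounded up to mu */
   {
      *nmblks = nblk + 1;
      *nmuLast = (mr + mu - 1) / mu;
      *mbLast = *nmuLast * mu;
   }
}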
static void ATL_symvL
(
   ATL_symvK_t symvK,
   const int NB,
   ATL_CINT N,
   const TYPE *A,
   ATL_CINT lda,
   const TYPE *x,
   TYPE *y,
   const TYPE *xt,
   TYPE *yt
)
{
   const TYPE one[2] = {ATL_rone, ATL_rzero};
   ATL_INT Mmb2;
   ATL_INT Mmb, mr, MB, j;
   const size_t incA = (NB SHIFT)*lda;
   const size_t opsize = ((size_t)(N+8)*(N+4))*(sizeof(TYPE)>>1) SHIFT;
   void (*gemvT)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                 const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
   void (*gemvN)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
                 const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);

   if (opsize > MY_CE)
   {
      gemvT = Mjoin(PATL,gemvT);
      gemvN = Mjoin(PATL,gemvN_L2);
   }
   else if (opsize <= ATL_MulBySize(ATL_L1elts))
   {
      gemvT = Mjoin(PATL,gemvT_L1);
      gemvN = Mjoin(PATL,gemvN_L1);
   }
   else
   {
      gemvT = Mjoin(PATL,gemvT_L2);
      gemvN = Mjoin(PATL,gemvN_L2);
   }
/*
 * Choose MB such that A is retained in L2 cache for second GEMV call.
 * If the partial block is tiny, absorb it into the last block, since the
 * cache is not precise anyway.
 */
   MB = ATL_DivBySize(MY_CE) / NB;
   MB = (MB > N || MB < 240) ? N : MB;
   for (j=0; j < N; j += NB, A += incA)
   {
      const register size_t j2 = j+j;
      register int i, nb=N-j;
      nb = (nb >= NB) ? NB : nb;
      symvK(AtlasLower, nb, one, A+j2, lda, x+j2, 1, one, y+j2, 1);
      for (i=j+nb; i < N; i += MB)
      {
         const register size_t i2 = i+i;
         register int mb = N-i;
         mb = (mb >= MB) ? MB : mb;
         gemvT(mb, nb, one, A+i2, lda, xt+i2, 1, one, yt+j2, 1);
         gemvN(mb, nb, one, A+i2, lda, x+j2, 1, one, y+i2, 1);
      }
   }
}
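/*
 * Reference sketch (not part of ATLAS): the operation the blocked loop above
 * computes, y += A*x with A symmetric and only its lower triangle stored
 * column-major, written as a naive real-valued loop.  The blocked code gets
 * the same answer by applying the symv kernel to NB-wide diagonal blocks and
 * pairing a gemvT (updating the j-range of the result from the transposed
 * panel) with a gemvN (updating the i-range from the panel itself); the
 * separate xt/yt arguments presumably let callers supply distinct vectors
 * for the transposed half (e.g. conjugated copies in a Hermitian variant).
 */
#include <stddef.h>
static void sketch_symv_lower_ref(int N, const double *A, int lda,
                                  const double *x, double *y)
{
   int i, j;
   for (j=0; j < N; j++)
   {
      y[j] += A[j + j*(size_t)lda] * x[j];         /* diagonal term */
      for (i=j+1; i < N; i++)
      {
         const double aij = A[i + j*(size_t)lda];  /* A(i,j) = A(j,i) */
         y[i] += aij * x[j];
         y[j] += aij * x[i];
      }
   }
}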