/*
 * Builds a freshly malloc'd test vector holding N logical elements spaced
 * |incX| apart, with npad extra elements in front of and behind them.
 *
 * npad   : number of guard elements on each side of the data
 * padval : value handed to vecset() together with the whole buffer
 * N      : number of logical elements; N <= 0 yields NULL
 * incX   : element stride; only its magnitude is used
 *
 * RETURNS: pointer to the start of the buffer (the guard region, not the
 *          first data element), or NULL when N <= 0.  The buffer comes from
 *          malloc(), so the caller is the one who must free() it.
 */
static TYPE *getvec(int npad, TYPE padval, int N, int incX)
{
   TYPE *vec, *elt;
   int len, stride, lead, k;

   if (N <= 0)
      return(NULL);

   stride = Mabs(incX);
   len = 2*npad + 1+(N-1)*stride;
   vec = malloc( ATL_sizeof*len );
   assert(vec);
/*
 * The entire buffer (guards included) goes through vecset() first; the
 * data slots are overwritten with dumb_rand() values afterwards.
 */
   vecset(len, padval, vec);
/*
 * In the complex build every logical element occupies two TYPE slots, so
 * both the leading offset and the stride are doubled.
 */
   #ifdef TCPLX
      lead = 2*npad;
      stride *= 2;
   #else
      lead = npad;
   #endif
   elt = vec + lead;
   for (k=0; k < N; k++)
   {
      elt[0] = dumb_rand();
      #ifndef TREAL
         elt[1] = dumb_rand();
      #endif
      elt += stride;
   }
   return(vec);
}
/*
 * Seeds dumb_rand() with seed, lets gefillgap() process the array, and then
 * overwrites the leading M0 rows of each of the N columns of A with
 * successive dumb_rand() values, walking the columns in storage order.
 *
 * M0, N : row and column counts of the matrix
 * A     : matrix storage, written in place
 * lda0  : leading dimension of A (same units as M0)
 * seed  : value passed to dumb_seed()
 *
 * M0 and lda0 are scaled by SHIFT before being used as TYPE-element counts.
 */
void Mjoin(PATL,gegen)
   (const int M0, const int N, TYPE *A, const int lda0, const int seed)
{
   const int nrow = M0 SHIFT, colstride = lda0 SHIFT;
   TYPE *col = A;
   int r, colsleft;

   dumb_seed(seed);
   Mjoin(PATL,gefillgap)(M0, N, A, lda0);
   for (colsleft=N; colsleft != 0; colsleft--, col += colstride)
   {
      for (r=0; r != nrow; r++)
         col[r] = dumb_rand();
   }
}
/*
 * Seeds dumb_rand() with seed and fills the first M rows of each of the N
 * columns of A (leading dimension lda) with successive dumb_rand() values.
 * In the TCPLX build both M and lda are doubled before use, so each logical
 * element covers two TYPE slots.
 */
void matgen(int M, int N, TYPE *A, int lda, int seed)
{
   #ifdef TCPLX
      const int nrow = 2*M, colstride = 2*lda;
   #else
      const int nrow = M, colstride = lda;
   #endif
   TYPE *col = A;
   int r, colsleft;

   dumb_seed(seed);
   for (colsleft=N; colsleft != 0; colsleft--)
   {
      for (r=0; r != nrow; r++)
         col[r] = dumb_rand();
      col += colstride;
   }
}
/*
 * Generates an N x N triangular test matrix in A.
 *
 * Uplo  : AtlasUpper puts dumb_rand() values strictly above the diagonal;
 *         any other value puts them strictly below it
 * Diag  : AtlasNonUnit gives the diagonal dumb_rand() values; any other
 *         value gives the diagonal FILLCONST
 * N     : order of the matrix
 * A     : matrix storage, written in place
 * lda0  : leading dimension of A (same units as N)
 * seed  : value passed to dumb_seed()
 *
 * The triangle opposite Uplo is always set to FILLCONST.  gefillgap() is
 * invoked on the array before any of these writes.
 *
 * The lower-triangular branch used to leave a non-AtlasNonUnit diagonal to
 * the trailing random-fill loop, so it received dumb_rand() values while the
 * upper-triangular branch wrote FILLCONST there.  Both branches now put
 * FILLCONST on such a diagonal.
 */
void Mjoin(PATL,trgen)(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag,
                       const int N, TYPE *A, const int lda0, const int seed)
{
   const int M = N SHIFT, lda = lda0 SHIFT;
   int i, j;

   dumb_seed(seed);
   Mjoin(PATL,gefillgap)(N, N, A, lda0);
   if (Uplo == AtlasUpper)
   {
      for (j=0; j != N; j++)
      {
/*
 *       Rows above the diagonal of column j
 */
         for (i=0; i != (j SHIFT); i++)
            A[i] = dumb_rand();
         if (Diag == AtlasNonUnit)
         {
            A[i++] = dumb_rand();
            #ifdef TCPLX
               A[i++] = dumb_rand();
            #endif
         }
/*
 *       Everything left in the column (including the diagonal when it was
 *       not written above) becomes FILLCONST
 */
         for (; i < M; i++)
            A[i] = FILLCONST;
         A += lda;
      }
   }
   else
   {
      for (j=0; j != N; j++)
      {
/*
 *       Rows above the diagonal of column j are outside the triangle
 */
         for (i=0; i != (j SHIFT); i++)
            A[i] = FILLCONST;
         if (Diag == AtlasNonUnit)
         {
            A[i++] = dumb_rand();
            #ifdef TCPLX
               A[i++] = dumb_rand();
            #endif
         }
         else
         {
/*
 *          Step over the diagonal explicitly so the loop below does not
 *          overwrite it with random data
 */
            A[i++] = FILLCONST;
            #ifdef TCPLX
               A[i++] = FILLCONST;
            #endif
         }
         for (; i != M; i++)
            A[i] = dumb_rand();
         A += lda;
      }
   }
}
int mmtst(void) { char fnam[80]; #if defined(LDA) && LDA != 0 const int lda=LDA; #else const int lda=2*LDA2; #endif #if defined(LDB) && LDB != 0 const int ldb=LDB; #else const int ldb=2*LDB2; #endif #if defined(LDC) && LDC != 0 const int ldc=LDC; #else const int ldc=2*LDC2; #endif int nA, nB; #ifdef TCPLX int inca, incb, incc; const TYPE one=1.0, none=(-1.0); #if (ALPHA == 1) TYPE alpha[2] = {1.0, 0.0}; #elif (ALPHA == -1) TYPE alpha[2] = {-1.0, 0.0}; #else TYPE alpha[2] = {2.3, 0.0}; #endif #if (BETA == 1) TYPE beta[2] = {1.0, 0.0}; #elif (BETA == -1) TYPE beta[2] = {-1.0, 0.0}; #elif (BETA == 0) TYPE beta[2] = {0.0, 0.0}; #else TYPE beta[2] = {1.3, 0.0}; #endif #else #ifdef ALPHA TYPE alpha=ALPHA; #else TYPE alpha=1.0; #endif #ifdef BETA TYPE beta=BETA; #else TYPE beta=1.0; #endif #endif const TYPE rone=1.0, rnone=(-1.0); void *va=NULL, *vb=NULL, *vc=NULL; TYPE *C0, *C1, *A, *B; TYPE diff, tmp; int i, j, k, n, nerr; int M=MB, N=NB, K=KB; TYPE ErrBound; if (!M) M = MB0; if (!N) N = NB0; if (!K) K = KB0; #ifdef TREAL ErrBound = 2.0 * (Mabs(alpha) * 2.0*K*EPS + Mabs(beta) * EPS) + EPS; #else diff = Mabs(*alpha) + Mabs(alpha[1]); tmp = Mabs(*beta) + Mabs(beta[1]); ErrBound = 2.0 * (diff*8.0*K*EPS + tmp*EPS) + EPS; #endif #ifdef NoTransA nA = K; #else nA = M; #endif #ifdef NoTransB nB = N; #else nB = K; #endif #ifdef TCPLX inca = lda*nA; incb = ldb*nB; #endif #ifdef ATL_MinMMAlign va = malloc(ATL_MinMMAlign + lda*nA*ATL_sizeof); vb = malloc(ATL_MinMMAlign + ldb*nB*ATL_sizeof); vc = C0 = malloc(2*ldc*N*ATL_sizeof); assert(va && vb && C0); A = (TYPE *) ( ( ((size_t) va)/ATL_MinMMAlign ) * ATL_MinMMAlign + ATL_MinMMAlign ); B = (TYPE *) ( ( ((size_t) vb)/ATL_MinMMAlign ) * ATL_MinMMAlign + ATL_MinMMAlign ); #else C0 = vc = malloc( (2*ldc*N + lda*nA + ldb*nB) * ATL_sizeof); assert(vc); A = C1 + (ldc * N SHIFT); B = A + (lda * nA SHIFT); #endif C1 = C0 + (ldc * N SHIFT); for (n=lda*nA SHIFT, i=0; i < n; i++) A[i] = dumb_rand(); for (n=ldb*nB SHIFT, i=0; i < n; i++) B[i] = 
dumb_rand(); for (n=ldc*N SHIFT, i=0; i < n; i++) C0[i] = C1[i] = dumb_rand(); tst_mm(M, N, K, alpha, A, lda, B, ldb, beta, C0, ldc); NBmm(M, N, K, alpha, A, lda, B, ldb, beta, C1, ldc); nerr = 0; for (j=0; j < N; j++) { for (i=0; i < M SHIFT; i++) { k = i + j*(ldc SHIFT); diff = C0[k] - C1[k]; if (diff < 0.0) diff = -diff; if (diff > ErrBound) { fprintf(stderr, "C(%d,%d) : expected=%f, got=%f\n", i, j, C0[k], C1[k]); nerr++; } } } free(vc); if (va) free(va); if (vb) free(vb); return(nerr); }
double GetKmmMflop
(
   CINT mb, CINT nb, CINT kb,           /* C: mbxnb, At: kbxmb, B: kbXnb */
   #ifdef ATL_NEWTIME
      CINT mu, CINT nu, CINT ku,
   #endif
   CINT movA, CINT movB, CINT movC,     /* which mat move in flush array? */
   int FLSIZE,                          /* min area to move in in bytes */
   CINT reps,                           /* # calls to kmm in one timing */
   CINT LDC                             /* what should ldc be set to? */
)
/*
 * Returns MFLOP rate of matmul kernel KMM, computed as
 * 2*reps*mb*nb*kb / (elapsed wall seconds * 1e6) over reps calls.
 *
 * LDC: if (LDC == 0), then set ldc=MB for timings.
 *      if (LDC != 0 && movC != 0), then ldc= col length in move space
 *      else ldc = LDC;
 *
 * The operands live in one malloc'd workspace filled with dumb_rand()
 * values.  For every matrix whose mov flag is set, nset copies are laid out
 * back to back and the pointer handed to KMM advances by one copy per call,
 * wrapping to the first copy after nset calls; nset is chosen so that the
 * moving copies span at least FLSIZE bytes.  Matrices whose mov flag is
 * clear have a single copy.  Supported flag combinations are: none set,
 * all set, A and B only, A and C only; anything else prints a message to
 * stderr and calls exit(-1).
 *
 * Two defects of the earlier version are fixed here:
 *   - nset was never assigned when no matrix moves, yet the timing loop
 *     compared against it; it is now 1 in that case.
 *   - when all three matrices move, only one set was allocated although
 *     nset sets were initialized and used; the workspace now holds nset.
 */
{
   #ifdef ATL_NEWTIME
      CINT mblks = mb/mu, nblks = nb/nu;
   #endif
   const int NOMOVE = !(movA|movB|movC);
   int ldc, setsz, nset, i, j, incA, incB, incC, n, extra;
   TYPE *C, *A, *B, *a, *b, *c;
   double t0, t1, mf;
   const TYPE alpha=1.0;
   TYPE beta=1.0;
   void *vp=NULL;

   if (NOMOVE)
   {
      ldc = (LDC) ? LDC : mb;
      setsz = (ldc * nb + kb*(mb+nb));
      nset = 1;   /* single operand set: the loop below wraps on every call */
      vp = malloc(ATL_Cachelen + ATL_MulBySize(setsz));
      ATL_assert(vp);
      A = ATL_AlignPtr(vp);
      B = A + mb*kb;
      C = B + kb*nb;
      for (i=0; i < setsz; i++)
         A[i] = dumb_rand();
      incA = incB = incC = 0;
   }
   else
   {
      if (movA && movB && movC)         /* no reuse at all */
      {
         setsz = ATL_MulBySize(mb*nb+kb*(mb+nb));   /* bytes in one set */
         nset = (FLSIZE+setsz-1)/setsz;
         FLSIZE = nset*setsz;
         setsz = mb*nb+kb*(mb+nb);                  /* elements in one set */
/*
 *       Room for all nset sets, matching the pointer layout and the
 *       initialization loop below
 */
         vp = malloc(ATL_Cachelen + ATL_MulBySize(setsz*nset));
         ATL_assert(vp);
         A = ATL_AlignPtr(vp);
         B = A + kb*mb*nset;
         C = B + kb*nb*nset;
/*
 *       NOTE(review): with LDC != 0 this gives ldc = mb*nset while incC
 *       stays mb*nb; for nset > 1 and nb > 1 that pairing looks like it can
 *       index past the end of the C region -- confirm the intended layout.
 */
         ldc = (LDC) ? mb*nset : mb;
         for (n=setsz*nset,i=0; i < n; i++)
            A[i] = dumb_rand();
         incA = mb*kb;
         incB = kb*nb;
         incC = mb*nb;
      }
      else if (movA && movB && !movC)   /* square-case ATLAS behavior */
      {
         setsz = kb*(mb+nb);
         ldc = (LDC) ? LDC : mb;
         ATL_assert(ldc >= mb);
         extra = ldc*nb;                /* the one stationary C */
         incA = mb*kb;
         incB = kb*nb;
         incC = 0;
      }
      else if (!movB && movA && movC)   /* rank-K behavior */
      {
         setsz = mb*(kb+nb);
         extra = kb*nb;                 /* the one stationary B */
         incA = mb*kb;
         incB = 0;
         incC = mb*nb;
      }
      else
      {
         fprintf(stderr, "%s,%d: What case are you wanting?\n",
                 __FILE__, __LINE__);
         exit(-1);
      }
/*
 *    Shared allocation and layout for the two partial-move cases
 */
      if (!vp)
      {
         i = ATL_MulBySize(setsz);
         nset = (FLSIZE+i-1)/i;
         FLSIZE = nset * i;
/*
 *       NOTE(review): FLSIZE has already been through ATL_MulBySize, so
 *       this request appears larger than the nset*setsz+extra elements that
 *       are initialized below.  Left untouched on purpose: the slack may be
 *       what keeps the ldc = mb*nset case in bounds -- confirm before
 *       tightening.
 */
         vp = malloc(ATL_Cachelen + ATL_MulBySize(FLSIZE+extra));
         ATL_assert(vp);
         A = ATL_AlignPtr(vp);
         if (movC)
         {
            C = A + mb*kb*nset;
            ldc = (LDC) ? mb*nset : mb;
            B = C + mb*nb*nset;
         }
         else
         {
            B = A + mb*kb*nset;
            C = B + kb*nb*nset;
         }
         for (n=setsz*nset+extra,i=0; i < n; i++)
            A[i] = dumb_rand();
      }
   }
   a = A; b = B; c = C;
   t0 = ATL_walltime();
   for (j=0,i=reps; i; i--)
   {
      #ifdef ATL_NEWTIME
         KMM(mblks, nblks, kb, a, b, c, movA ? a+incA : a,
             movB ? b+incB : b, movC ? c+incC : c);
      #else
         KMM(mb, nb, kb, alpha, a, kb, b, kb, beta, c, ldc);
      #endif
      if (++j != nset)
      {
         a += incA;
         b += incB;
         c += incC;
      }
      else  /* used every set once: start over from the first one */
      {
         #ifndef ATL_NEWTIME
            beta = (beta != 0.0) ? -beta : 0.0;   /* flip sign each pass */
         #endif
         j = 0;
         a = A; b = B; c = C;
      }
   }
   t1 = ATL_walltime() - t0;
   mf = (2.0*reps*mb*nb*kb) / (t1*1000000.0);
   free(vp);
   return(mf);
}