/*
 * Waits for file fnam to become openable for reading, giving up once more
 * than nsec (compared against differences of ATL_walltime() readings) has
 * elapsed since the first failed attempt.
 * Only probes for the file once every 1/5 of a second to avoid beating the
 * crap out of the NFS server.
 * RETURNS: 0 if the file could be opened (it is closed again before return),
 *          1 if the wait limit expired before the file could be opened.
 */
int WaitForIt(int nsec, char *fnam)
{
   FILE *fp;
   double t0, t1, dsec=(double)nsec;
   double ATL_walltime(void);

   fp = fopen(fnam, "r");
   if (!fp)
   {
      t0 = ATL_walltime();
      do
      {
/*
 *       Spin on the clock until 0.2 has elapsed since the last probe
 */
         t1 = ATL_walltime();
         while (ATL_walltime()-t1 < 0.2);
/*
 *       The limit is checked before re-probing, so no fopen is attempted
 *       once the allotted time has run out
 */
         if (ATL_walltime()-t0 > dsec)
            return(1);
         fp = fopen(fnam, "r");
      }
      while (!fp);
   }
   fclose(fp);
   return(0);
}
double GetKmmMflop
(
   CINT mb, CINT nb, CINT kb,           /* C: mbxnb, At: kbxmb, B: kbXnb */
#ifdef ATL_NEWTIME
   CINT mu, CINT nu, CINT ku,
#endif
   CINT movA, CINT movB, CINT movC,     /* which mat move in flush array? */
   int FLSIZE,                          /* min area to move in in bytes */
   CINT reps,                           /* # calls to kmm in one timing */
   CINT LDC                             /* what should ldc be set to? */
)
/*
 * Returns MFLOP rate of matmul kernel KMM, computed as
 *    2*reps*mb*nb*kb / (elapsed ATL_walltime() * 1e6)
 * LDC: if (LDC == 0), then set ldc=MB for timings.
 *      if (LDC != 0 && movC != 0), then ldc= col length in move space
 *      else ldc = LDC;
 *
 * The operands live in one malloc'd, ATL_AlignPtr-aligned buffer filled with
 * dumb_rand() values.  A matrix whose mov flag is set gets nset copies that
 * are stepped through on successive kernel calls; a matrix whose flag is
 * clear is reused by every call.  Supported flag combinations are: none,
 * all three, A&B only, and A&C only; anything else prints a message to
 * stderr and calls exit(-1).
 */
{
#ifdef ATL_NEWTIME
   CINT mblks = mb/mu, nblks = nb/nu;
#endif
   const int NOMOVE = !(movA|movB|movC);
   int ldc, setsz, nset, i, j, incA, incB, incC, n, extra;
   TYPE *C, *A, *B, *a, *b, *c;
   double t0, t1, mf;
   const TYPE alpha=1.0;
   TYPE beta=1.0;
   void *vp=NULL;

   if (NOMOVE)
   {
      ldc = (LDC) ? LDC : mb;
      setsz = (ldc * nb + kb*(mb+nb));
      vp = malloc(ATL_Cachelen + ATL_MulBySize(setsz));
      ATL_assert(vp);
      A = ATL_AlignPtr(vp);
      B = A + mb*kb;
      C = B + kb*nb;
      for (i=0; i < setsz; i++)
         A[i] = dumb_rand();
      incA = incB = incC = 0;
/*
 *    Exactly one working set exists.  nset must be defined because the
 *    timing loop compares against it; leaving it unset made that comparison
 *    read an indeterminate value.
 */
      nset = 1;
   }
   else
   {
      if (movA && movB && movC)         /* no reuse at all */
      {
         setsz = ATL_MulBySize(mb*nb+kb*(mb+nb));  /* bytes in one set */
         nset = (FLSIZE+setsz-1)/setsz;            /* round up to whole sets */
         FLSIZE = nset*setsz;
         setsz = mb*nb+kb*(mb+nb);                 /* elements in one set */
/*
 *       All nset sets are laid out and initialized below, so space for all
 *       of them must be allocated (not just for a single set).
 */
         vp = malloc(ATL_Cachelen + ATL_MulBySize(setsz*nset));
         ATL_assert(vp);
         A = ATL_AlignPtr(vp);
         B = A + kb*mb*nset;
         C = B + kb*nb*nset;
/*
 *       NOTE(review): with LDC != 0 this uses ldc = mb*nset while incC stays
 *       mb*nb; it looks like the later C blocks could then reach past the
 *       mb*nb*nset elements reserved for C -- confirm the intended layout.
 */
         ldc = (LDC) ? mb*nset : mb;
         for (n=setsz*nset,i=0; i < n; i++)
            A[i] = dumb_rand();
         incA = mb*kb;
         incB = kb*nb;
         incC = mb*nb;
      }
      else if (movA && movB && !movC)   /* square-case ATLAS behavior */
      {
         setsz = kb*(mb+nb);
         ldc = (LDC) ? LDC : mb;
         ATL_assert(ldc >= mb);
         extra = ldc*nb;                /* the single, reused C */
         incA = mb*kb;
         incB = kb*nb;
         incC = 0;
      }
      else if (!movB && movA && movC)   /* rank-K behavior */
      {
         setsz = mb*(kb+nb);
         extra = kb*nb;                 /* the single, reused B */
         incA = mb*kb;
         incB = 0;
         incC = mb*nb;
      }
      else
      {
         fprintf(stderr, "%s,%d: What case are you wanting?\n",
                 __FILE__, __LINE__);
         exit(-1);
      }
/*
 *    Shared allocation & layout for the two partial-move cases (the all-move
 *    case has already allocated vp above)
 */
      if (!vp)
      {
         i = ATL_MulBySize(setsz);
         nset = (FLSIZE+i-1)/i;
         FLSIZE = nset * i;
/*
 *       NOTE(review): FLSIZE was just computed from ATL_MulBySize(setsz), so
 *       passing it through ATL_MulBySize again appears to request more than
 *       the setsz*nset+extra elements initialized below.  Left as-is since
 *       it does not under-allocate -- confirm before tightening.
 */
         vp = malloc(ATL_Cachelen + ATL_MulBySize(FLSIZE+extra));
         ATL_assert(vp);
         A = ATL_AlignPtr(vp);
         if (movC)
         {
            C = A + mb*kb*nset;
            ldc = (LDC) ? mb*nset : mb;
            B = C + mb*nb*nset;
         }
         else
         {
            B = A + mb*kb*nset;
            C = B + kb*nb*nset;
         }
         for (n=setsz*nset+extra,i=0; i < n; i++)
            A[i] = dumb_rand();
      }
   }
   a = A; b = B; c = C;
   t0 = ATL_walltime();
   for (j=0,i=reps; i; i--)
   {
      #ifdef ATL_NEWTIME
         KMM(mblks, nblks, kb, a, b, c, movA ? a+incA : a,
             movB ? b+incB : b, movC ? c+incC : c);
      #else
         KMM(mb, nb, kb, alpha, a, kb, b, kb, beta, c, ldc);
      #endif
      if (++j != nset)   /* more sets left: advance to the next one */
      {
         a += incA;
         b += incB;
         c += incC;
      }
      else               /* used the last set: wrap back to the first */
      {
         #ifndef ATL_NEWTIME
            beta = (beta != 0.0) ? -beta : 0.0;  /* flip sign on each wrap */
         #endif
         j = 0;
         a = A; b = B; c = C;
      }
   }
   t1 = ATL_walltime() - t0;
   mf = (2.0*reps*mb*nb*kb) / (t1*1000000.0);
   free(vp);
   return(mf);
}
int main(int nargs, char **args) { int i, k, nreps = 200, opstride, which; double t0, tlin, tlg2, tdyn, trnk; ATL_TUNE_T ta[ATL_NTHREADS]; volatile int done[ATL_NTHREADS]; tlg2 = tdyn = tlin = 0.0; nreps = GetFlags(nargs, args, &which); for (i=0; i < ATL_NTHREADS; i++) { ta[i].rank = i; ta[i].nthr = ATL_NTHREADS; ta[i].donearr = done; } opstride = (int) ( ((char*)(ta+1)) - (char*)(ta) ); printf("FINDING SPEED OF CREATE/BARRIER/JOIN USING %d REPITITIONS:\n", nreps); if (which & 1) { t0 = ATL_walltime(); for (k=0; k < nreps; k++) { for (i=0; i < ATL_NTHREADS; i++) done[i] = 0; ATL_goparallel_dyn(ATL_NTHREADS, TuneDoWork, ta, NULL); } tdyn = ATL_walltime() - t0; printf(" dyn time = %e\n", (float)tdyn); } if (which & 2) { t0 = ATL_walltime(); for (k=0; k < nreps; k++) { for (i=0; i < ATL_NTHREADS; i++) done[i] = 0; ATL_goparallel_log2(ATL_NTHREADS, TuneDoWork, ta, NULL); } tlg2 = ATL_walltime() - t0; printf(" lg2 time = %e\n", (float)tlg2); } if (which & 4) { t0 = ATL_walltime(); for (k=0; k < nreps; k++) { for (i=0; i < ATL_NTHREADS; i++) done[i] = 0; ATL_goparallel_lin(ATL_NTHREADS, TuneDoWork, ta, NULL); } tlin = ATL_walltime() - t0; printf(" lin time = %e\n", (float)tlin); } if (which & 8) { t0 = ATL_walltime(); for (k=0; k < nreps; k++) { for (i=0; i < ATL_NTHREADS; i++) done[i] = 0; ATL_goparallel_prank(ATL_NTHREADS, TuneDoWork_gp, ta, NULL); } trnk = ATL_walltime() - t0; printf(" rnk time = %e\n", (float)trnk); } if ((which | 7) == which) printf("DYNAMIC is %.2f%% of LINEAR and %.2f%% of LOG2 SPEED.\n", (tdyn/tlin)*100.0, (tdyn/tlg2)*100.0); if ((which & 1) && (which & 8)) printf("rank dynamic is %.2f%% of affinity dynamic\n", (trnk/tdyn)*100.0); return(0); }