int main(int argc, char *argv[]) { if(argc != 2) { usage(argv); return 1; } int L = atoi(argv[1]); st_coords arr; arr.x = alloc(sizeof(float)*L); arr.y = alloc(sizeof(float)*L); arr.z = alloc(sizeof(float)*L); arr.t = alloc(sizeof(float)*L); arr.s = alloc(sizeof(float)*L); for(int i=0; i<L; i++) { arr.x[i] = drand48(); arr.y[i] = drand48(); arr.z[i] = drand48(); arr.t[i] = drand48()*C; } { /* Warm up */ comp_s(arr, L); double t0acc = 0; double t1acc = 0; int n = 1; /* Loop accumulating run-time. Stop when the average time has less than a 10% error */ while(1) { double t0 = stop_watch(0); for(int i=0; i<NREP; i++) comp_s(arr, L); t0 = stop_watch(t0)/(double)NREP; t0acc += t0; t1acc += t0*t0; if(n > 2) { double ave = t0acc/n; double err = sqrt(t1acc/n - ave*ave)/sqrt(n); if(err/ave < 0.1) { t0acc = ave; t1acc = err; break; } } n++; } printf(" Done L = %d, in %3.1e +/- %3.1e secs, %g Mflop/s\n", L, t0acc, t1acc, (double)9*L/1e6/t0acc); } free(arr.x); free(arr.y); free(arr.z); free(arr.t); free(arr.s); return 0; }
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 *
 *  lp: lattice parameters; only lp.L is read here directly
 *  x:  on entry the initial guess, updated in place
 *  b:  right-hand side
 *  g:  links, passed through unchanged to lapl()
 *
 * Runs at most 100 iterations, printing the relative residual rr/bb on
 * every iteration, and finally prints the recomputed residual and the
 * average time, Gflop/s and GB/s per lapl() call.
 */
void
cg(latparams lp, field **x, field **b, link **g)
{
  size_t L = lp.L;
  int max_iter = 100;
  /* NOTE(review): the residuals are accumulated in single precision, so
     a 1e-9 tolerance may not be reachable -- confirm this is intended */
  float tol = 1e-9;

  /* Temporary fields needed for CG */
  field **r = new_field(lp);
  field **p = new_field(lp);
  field **Ap = new_field(lp);

  /* Initial residual and p-vector (presumably r = b - lapl(x), p = r) */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  xeqy(lp, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(lp, r);
  float bb = xdotx(lp, b);

  double t_lapl = 0; /* accumulated time spent in lapl() inside the loop */
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;

    double t = stop_watch(0);
    lapl(lp, Ap, p, g);
    t_lapl += stop_watch(t);

    float pAp = xdoty(lp, p, Ap);
    float alpha = rr/pAp;
    axpy(lp, alpha, p, x);
    axpy(lp, -alpha, Ap, r);

    float r1r1 = xdotx(lp, r);
    float beta = r1r1/rr;
    xpay(lp, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  rr = xdotx(lp, r);

  /* If the initial guess already satisfies the tolerance, the loop exits
     with iter == 0 and no lapl() call was timed; report zeros instead of
     dividing by zero */
  double t_call = (iter > 0) ? t_lapl/(double)iter : 0.0;
  double beta_fp = 0.0;
  double beta_io = 0.0;
  if(t_call > 0) {
    beta_fp = 50*((double)L*L*L)/t_call*1e-9;
    beta_io = 40*((double)L*L*L)/t_call*1e-9;
  }

  /* NOTE(review): this line is printed even when max_iter was reached
     without meeting the tolerance */
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_call, beta_fp, beta_io);

  del_field(r);
  del_field(p);
  del_field(Ap);
  return;
}
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 *
 *  L: lattice extent, passed to every field operation
 *  x: on entry the initial guess, updated in place
 *  b: right-hand side
 *  u: passed through unchanged to lapl()
 *
 * Runs at most 100 iterations, printing the relative residual rr/bb on
 * every iteration, and finally prints the recomputed residual and the
 * average time, Gflop/s and GB/s per lapl() call.
 */
void
cg(size_t L, _Complex float *x, _Complex float *b, _Complex float *u)
{
  int max_iter = 100;
  float tol = 1e-6;

  /* Temporary fields needed for CG */
  _Complex float *r = new_field(L);
  _Complex float *p = new_field(L);
  _Complex float *Ap = new_field(L);

  /* Initial residual and p-vector (presumably r = b - lapl(x), p = r) */
  lapl(L, r, x, u);
  xmy(L, b, r);
  xeqy(L, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(L, r);
  float bb = xdotx(L, b);

  double t_lapl = 0; /* accumulated time spent in lapl() inside the loop */
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;

    double t = stop_watch(0);
    lapl(L, Ap, p, u);
    t_lapl += stop_watch(t);

    float pAp = xdoty(L, p, Ap);
    float alpha = rr/pAp;
    axpy(L, alpha, p, x);
    axpy(L, -alpha, Ap, r);

    float r1r1 = xdotx(L, r);
    float beta = r1r1/rr;
    xpay(L, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(L, r, x, u);
  xmy(L, b, r);
  rr = xdotx(L, r);

  /* If the initial guess already satisfies the tolerance, the loop exits
     with iter == 0 and no lapl() call was timed; report zeros instead of
     dividing by zero */
  double t_call = (iter > 0) ? t_lapl/(double)iter : 0.0;
  double beta_fp = 0.0;
  double beta_io = 0.0;
  if(t_call > 0) {
    beta_fp = 34*L*L/t_call*1e-9;
    beta_io = 32*L*L/t_call*1e-9;
  }

  /* NOTE(review): this line is printed even when max_iter was reached
     without meeting the tolerance */
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_call, beta_fp, beta_io);

  free(r);
  free(p);
  free(Ap);
  return;
}
int main() { char *mem = malloc((N_PAGES+1) * 4096); intptr_t *p; int i; unsigned int j; /* Align to page start */ mem = (char *) ((intptr_t) (mem + 4096) & ~0xfff); for (j = 0; j < sizeof(numPagesList)/sizeof(int); j++) { int numPages = numPagesList[j]; int pageIdx = 0; int entryOffset = 0; /* * page 0 page 1 page 2 .... page N * ------ ------ ------ ------ * word 0 -> word 0 -> word 0 -> .... -> word 0 -> (page 0/word 0) * : : : : : * word 1023 word 1023 word 1023 : word 1023 */ for (i = 0; i < numPages; i++) { int nextPageIdx = (pageIdx + 1) % numPages; /* Looks like spread the pointer across cache lines introduce noise * to get to the asymptote * int nextEntryOffset = (entryOffset + 32) % 1024; */ int nextEntryOffset = entryOffset; if (i != numPages -1) { *(intptr_t *) (mem + 4096 * pageIdx + entryOffset) = (intptr_t) (mem + 4096 * nextPageIdx + nextEntryOffset); } else { /* Last page - form the cycle */ *(intptr_t *) (mem + 4096 * pageIdx + entryOffset) = (intptr_t) &mem[0]; } pageIdx = nextPageIdx; entryOffset = nextEntryOffset; } /* Starting point of the pointer chase */ p = (intptr_t *) &mem[0]; /* Warmup (ie pre-thrash the memory system */ for (i = 0; i < WARMUP; i++) { p = (intptr_t *) *p; } /* Real work */ unsigned long long t0 = stop_watch(); for (i = 0; i < WORKLOAD; i++) { p = (intptr_t *) *p; } unsigned long long t1 = stop_watch(); /* To keep p from being optimized by gcc */ if (p) printf("%d, %f\n", numPages, (float) (t1 - t0) / WORKLOAD); } return 0; }
int main(int argc, char *argv[]) { /* Check the number of command line arguments */ if(argc != 6) { usage(argv); exit(1); } /* The length of the array in x and y is read from the command line */ Lx = atoi(argv[1]); Ly = atoi(argv[2]); /* The number of iterations */ int niter = atoi(argv[3]); /* Fixed "sigma" */ float sigma = 0.01; printf(" Ly,Lx = %d,%d\n", Ly, Lx); printf(" niter = %d\n", niter); printf(" input file = %s\n", argv[4]); printf(" output file = %s\n", argv[5]); /* Allocate the buffer for the data */ float *arr = malloc(sizeof(float)*Lx*Ly); /* read file to buffer */ read_from_file(arr, argv[4]); /* allocate super-site buffers */ supersite *ssarr[2]; posix_memalign((void**)&ssarr[0], 16, sizeof(supersite)*Lx*Ly/4); posix_memalign((void**)&ssarr[1], 16, sizeof(supersite)*Lx*Ly/4); /* convert input array to super-site packed */ to_supersite(ssarr[0], arr); /* do iterations, record time */ double t0 = stop_watch(0); for(int i=0; i<niter; i++) { lapl_iter_supersite(ssarr[(i+1)%2], sigma, ssarr[i%2]); } t0 = stop_watch(t0)/(double)niter; /* write the result after niter iteraions */ char fname[256]; /* construct filename */ sprintf(fname, "%s.ss%08d", argv[5], niter); /* convert from super-site packed */ from_supersite(arr, ssarr[niter%2]); /* write to file */ write_to_file(fname, arr); /* write timing info */ printf(" iters = %8d, (Lx,Ly) = %6d, %6d, t = %8.1f usec/iter, BW = %6.3f GB/s, P = %6.3f Gflop/s\n", niter, Lx, Ly, t0*1e6, Lx*Ly*sizeof(float)*2.0/(t0*1.0e9), (Lx*Ly*6.0)/(t0*1.0e9)); /* free super-site buffers */ for(int i=0; i<2; i++) { free(ssarr[i]); } /* * GPU part */ /* read file again for GPU run */ read_from_file(arr, argv[4]); /* Fixed number of threads per block (in x- and y-direction), number of blocks per direction determined by dimensions Lx, Ly */ int threads[] = {1, NTY, NTX}; int blocks[] = {1, Ly/NTY, Lx/NTX}; /* Initialize: allocate GPU arrays and load array to GPU */ init_lapl_cuda(arr, sigma); /* Do iterations on GPU, record time 
*/ t0 = stop_watch(0); for(int i=0; i<niter; i++) { lapl_iter_cuda(blocks, threads); } t0 = stop_watch(t0)/(double)niter; /* construct filename for writing */ sprintf(fname, "%s.cu%08d", argv[5], niter); /* copy GPU array to main memory and free GPU arrays */ fini_lapl_cuda(arr); /* write to file */ write_to_file(fname, arr); /* write timing info */ printf(" iters = %8d, (Lx,Ly) = %6d, %6d, t = %8.1f usec/iter, BW = %6.3f GB/s, P = %6.3f Gflop/s\n", niter, Lx, Ly, t0*1e6, Lx*Ly*sizeof(float)*2.0/(t0*1.0e9), (Lx*Ly*6.0)/(t0*1.0e9)); /* free main memory array */ free(arr); return 0; }
int main(int argc, char** argv){ if(argc<2){ printf("usage: %s array_size\n",argv[0]); exit(0);} int N=atoi(argv[1]); int i,j,k; __m128d register c1,c2,c3; _Complex double *A = (_Complex double *) amalloc(N*sizeof(_Complex double),16); _Complex double *B = (_Complex double *) amalloc(N*sizeof(_Complex double),16); double ts,tf, tsum; //some intialization for(i=0; i<N; i++){ A[i]=0.1*i + I*10*i;} tsum =0.0; for(j=0; j<100; j++) { ts=stop_watch(0.0); for(i=0; i<N; i++) { c1=_mm_load_pd((double *) &A[i]); _mm_store_pd((double *) &B[i],c1); } tf=stop_watch(ts); tsum += tf; } printf("SIMD copy time %f \n",tsum/100.00); tsum=0.0; for(j=0; j<100; j++) { ts=stop_watch(0.0); for(i=0; i<N; i++){ B[i]=A[i];} tf=stop_watch(ts); tsum += tf; } printf("direct copy time %f\n",tsum/100.00); tsum =0.0; for(j=0; j<100; j++) { ts=stop_watch(0.0); for(i=0; i<N; i++) { c1=_mm_load_pd((double *) &A[i]); _mm_store_pd((double *) &B[i],c1); } tf=stop_watch(ts); tsum += tf; } printf("SIMD copy time %f \n",tsum/100.00); tsum=0.0; for(j=0; j<100; j++) { ts=stop_watch(0.0); for(i=0; i<N; i++){ B[i]=A[i];} tf=stop_watch(ts); tsum += tf; } printf("direct copy time %f\n",tsum/100.00); afree(A); afree(B); return 0; }
int main(int argc, char **argv) { //initialize plqcd int init_status; if(argc < 3) { fprintf(stderr,"Error. Must pass the name of the input file and the number of multiplications to be performed \n"); fprintf(stderr,"Usage: %s input_file_name Nmul\n",argv[0]); exit(1); } init_status = init_plqcd(argc,argv); if(init_status != 0) printf("Error initializing plqcd\n"); int proc_id; int i,j,k,Nmul; proc_id = ipr(plqcd_g.cpr); Nmul=atoi(argv[2]); #if 0 //Intialize the ranlux random number generator start_ranlux(0,1); #endif int NPROCS=plqcd_g.nprocs[0]*plqcd_g.nprocs[1]*plqcd_g.nprocs[2]*plqcd_g.nprocs[3]; char ofname[128]; char buff[128]; strcpy(ofname,"test_hopping_output.procgrid."); sprintf(buff,"%d-%d-%d-%d.nthreads.%d.proc.%d",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3],plqcd_g.nthread,proc_id); strcat(ofname,buff); FILE *ofp; //FILE *ofp_source; //if(proc_id==0) //{ // ofp_source = fopen("test_rand_vals.out","w"); //} if(proc_id==0) { ofp=fopen(ofname,"w"); fprintf(ofp,"INPUT GLOBALS:\n"); fprintf(ofp,"----------------\n"); fprintf(ofp,"NPROC0 %d, NPROC1 %d, NPROC2 %d, NPROC3 %d, NTHREAD %d\n",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3], plqcd_g.nthread); fprintf(ofp,"L0 %d, L1 %d, L2 %d, L3 %d\n\n",plqcd_g.latdims[0],plqcd_g.latdims[1],plqcd_g.latdims[2],plqcd_g.latdims[3]); //printf("sizeof(spinor) %ld, sizeof(halfspinor) %ld, sizeof(su3) %ld \n",sizeof(spinor),sizeof(halfspinor),sizeof(su3)); } int nthr; #ifdef _OPENMP #pragma omp parallel { nthr=omp_get_num_threads(); if(omp_get_thread_num() == 0) if(proc_id==0) fprintf(ofp,"Number of threads as returned by openmp %d\n",nthr); } #endif /***************************************************** *Testing the Dirac operator interface ****************************************************/ spinor *pin= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN); if(pin==NULL) { fprintf(stderr,"ERROR: insufficient memory for spinor pin.\n"); exit(2); } spinor 
*pout= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN); if(pout==NULL) { fprintf(stderr,"ERROR: insufficient memory for spinor pout.\n"); exit(2); } su3 *ufield= (su3 *) amalloc(4*plqcd_g.VOLUME*sizeof(su3), plqcd_g.ALIGN); if(ufield==NULL) { fprintf(stderr,"ERROR: insufficient memory for gauge field ufield.\n"); exit(2); } //256 arrays #ifdef AVX spinor_256 *pin_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN); if(pin_256==NULL) { fprintf(stderr,"ERROR: insufficient memory for spinor pin_256.\n"); exit(2); } spinor_256 *pout_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN); if(pout_256==NULL) { fprintf(stderr,"ERROR: insufficient memory for spinor pout_256.\n"); exit(2); } su3_256 *ufield_256= (su3_256 *) amalloc(4*plqcd_g.VOLUME/2*sizeof(su3_256), plqcd_g.ALIGN); if(ufield_256==NULL) { fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_256.\n"); exit(2); } #endif //512 arrays #ifdef MIC spinor_512 *pin_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN); if(pin_512==NULL) { fprintf(stderr,"ERROR: insufficient memory for spinor pin_512.\n"); exit(2); } spinor_512 *pout_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN); if(pout_512==NULL) { fprintf(stderr,"ERROR: insufficient memory for spinor pout_512.\n"); exit(2); } su3_512 *ufield_512= (su3_512 *) amalloc(4*plqcd_g.VOLUME/4*sizeof(su3_512), plqcd_g.ALIGN); if(ufield_512==NULL) { fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_512.\n"); exit(2); } #endif //intialize the random number generator by a seed equals to the process rank srand((unsigned int) proc_id); //Initialize the input spinor and gauge links to random numbers //intialize the random number generator by a seed equals to the process rank srand((unsigned int) proc_id); //Initialize the input spinor and gauge links to random numbers double ru[18]; double rs[24]; for(i=0; 
i<plqcd_g.VOLUME; i++) { //ranlxd(rs,24); for(j=0; j<24; j++) { rs[j]= rand() / (double)RAND_MAX; //fprintf(stderr,"rs[%d]=%lf\n",j,rs[j]); } pin[i].s0.c0=rs[0]+I*rs[1]; pin[i].s0.c1=rs[2]+I*rs[3]; pin[i].s0.c2=rs[4]+I*rs[5]; pin[i].s1.c0=rs[6]+I*rs[7]; pin[i].s1.c1=rs[8]+I*rs[9]; pin[i].s1.c2=rs[10]+I*rs[11]; pin[i].s2.c0=rs[12]+I*rs[13]; pin[i].s2.c1=rs[14]+I*rs[15]; pin[i].s2.c2=rs[16]+I*rs[17]; pin[i].s3.c0=rs[18]+I*rs[19]; pin[i].s3.c1=rs[20]+I*rs[21]; pin[i].s3.c2=rs[22]+I*rs[23]; //ranlxd(rs,24); for(j=0; j<24; j++) rs[j]= rand() / (double)RAND_MAX; pout[i].s0.c0=rs[0]+I*rs[1]; pout[i].s0.c1=rs[2]+I*rs[3]; pout[i].s0.c2=rs[4]+I*rs[5]; pout[i].s1.c0=rs[6]+I*rs[7]; pout[i].s1.c1=rs[8]+I*rs[9]; pout[i].s1.c2=rs[10]+I*rs[11]; pout[i].s2.c0=rs[12]+I*rs[13]; pout[i].s2.c1=rs[14]+I*rs[15]; pout[i].s2.c2=rs[16]+I*rs[17]; pout[i].s3.c0=rs[18]+I*rs[19]; pout[i].s3.c1=rs[20]+I*rs[21]; pout[i].s3.c2=rs[22]+I*rs[23]; for(j=0; j<4; j++) { //ranlxd(ru,18); for(k=0; k<18; k++) { ru[k]= rand() / (double)RAND_MAX; //fprintf(stderr,"ru[%d]=%lf\n",k,ru[k]); } ufield[4*i+j].c00=ru[0]+I*ru[1]; ufield[4*i+j].c01=ru[2]+I*ru[3]; ufield[4*i+j].c02=ru[4]+I*ru[5]; ufield[4*i+j].c10=ru[6]+I*ru[7]; ufield[4*i+j].c11=ru[8]+I*ru[9]; ufield[4*i+j].c12=ru[10]+I*ru[11]; ufield[4*i+j].c20=ru[12]+I*ru[13]; ufield[4*i+j].c21=ru[14]+I*ru[15]; ufield[4*i+j].c22=ru[16]+I*ru[17]; } } #ifdef AVX for(i=0; i<plqcd_g.VOLUME; i +=2) { for(j=0; j<4; j++) copy_su3_to_su3_256(ufield_256+4*i/2+j, ufield+4*i+j, ufield+4*(i+1)+j); copy_spinor_to_spinor_256(pin_256+i/2, pin+i, pin+i+1); copy_spinor_to_spinor_256(pout_256+i/2, pout+i, pout+i+1); } #endif #ifdef MIC for(i=0; i<plqcd_g.VOLUME; i +=4) { for(j=0; j<4; j++) copy_su3_to_su3_512(ufield_512+4*i/4+j, ufield+4*i+j, ufield+4*(i+1)+j, ufield+4*(i+2)+j, ufield+4*(i+3)+j); copy_spinor_to_spinor_512(pin_512+i/4, pin+i, pin+i+1, pin+i+2, pin+i+3); copy_spinor_to_spinor_512(pout_512+i/4, pout+i, pout+i+1, pout+i+2, pout+i+3); } #endif double 
total,t1=0.0,t2=0.0,mytotal; int matvecs; #ifdef ASSYMBLY //--------------------------------------------- //1: non-blocking assymbly/c version //--------------------------------------------- matvecs=0; total=0.0; mytotal =0.0; while(mytotal < 30) { MPI_Barrier(MPI_COMM_WORLD); for(i=0; i<Nmul; i++) { t1=plqcd_hopping_matrix_eo_sse3_assymbly(pin,pout,ufield); t2=plqcd_hopping_matrix_oe_sse3_assymbly(pin,pout,ufield); mytotal += t1+t2; } matvecs += Nmul; } MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD); MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD); if (proc_id==0) { total /= (double)(NPROCS); } if(proc_id==0) { fprintf(ofp,"non-blocking assymbly/c version:\n"); fprintf(ofp,"------------------------------------------\n"); fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6); } #endif #ifdef SSE3_INTRIN //--------------------------------------------- //1: non-blocking sse3 with intrinsics version //--------------------------------------------- matvecs=0; total=0.0; mytotal =0.0; while(mytotal < 30) { MPI_Barrier(MPI_COMM_WORLD); for(i=0; i<Nmul; i++) { t1=plqcd_hopping_matrix_eo_sse3_intrin(pin,pout,ufield); t2=plqcd_hopping_matrix_oe_sse3_intrin(pin,pout,ufield); mytotal += t1+t2; } matvecs += Nmul; } MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD); MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD); if (proc_id==0) { total /= (double)(NPROCS); } if(proc_id==0) { fprintf(ofp,"non-blocking sse3 with intrinsics version:\n"); fprintf(ofp,"------------------------------------------\n"); fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6); } //--------------------------------------------- //2: blocking sse3 with intrinsics version //--------------------------------------------- matvecs=0; total=0.0; mytotal =0.0; while(mytotal < 30) { MPI_Barrier(MPI_COMM_WORLD); 
for(i=0; i<Nmul; i++) { t1=plqcd_hopping_matrix_eo_sse3_intrin_blocking(pin,pout,ufield); t2=plqcd_hopping_matrix_oe_sse3_intrin_blocking(pin,pout,ufield); mytotal += t1+t2; } matvecs += Nmul; } MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD); MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD); if (proc_id==0) { total /= (double)(NPROCS); } if(proc_id==0) { fprintf(ofp,"blocking sse3 with intrinsics version:\n"); fprintf(ofp,"------------------------------------------\n"); fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6); } #endif #ifdef AVX //--------------------------------------------- //2: avx version //--------------------------------------------- matvecs=0; total=0.0; mytotal =0.0; t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256); while(mytotal < 30) { MPI_Barrier(MPI_COMM_WORLD); for(i=0; i<Nmul; i++) { t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256); t2=plqcd_hopping_matrix_oe_intrin_256(pin_256,pout_256,ufield_256); mytotal += t1+t2; } matvecs += Nmul; } MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD); MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD); if (proc_id==0) { total /= (double)(NPROCS); } if(proc_id==0) { fprintf(ofp,"avxversion:\n"); fprintf(ofp,"------------------------------------------\n"); fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6); } #endif #ifdef MIC #ifdef TEST_HOPPING_MIC //--------------------------------------------- //3: MIC version full su3 matrix //--------------------------------------------- matvecs=0; total=0.0; mytotal =0.0; t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512); while(mytotal < 30) { MPI_Barrier(MPI_COMM_WORLD); for(i=0; i<Nmul; i++) { //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512); 
//t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512); t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512); t2=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512); mytotal += t1+t2; } matvecs += 2*Nmul; } MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD); MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD); if (proc_id==0) { total /= (double)(NPROCS); } if(proc_id==0) { fprintf(ofp,"mic version, 3x3 links:\n"); fprintf(ofp,"------------------------------------------\n"); fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6); } //--------------------------------------------- //3: MIC version full reduced su3 storage //--------------------------------------------- matvecs=0; total=0.0; mytotal =0.0; t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512); while(mytotal < 30) { MPI_Barrier(MPI_COMM_WORLD); for(i=0; i<Nmul; i++) { //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512); //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512); t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512); t2=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512); mytotal += t1+t2; } matvecs += 2*Nmul; } MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD); MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD); if (proc_id==0) { total /= (double)(NPROCS); } if(proc_id==0) { fprintf(ofp,"mic version, 2x3 links:\n"); fprintf(ofp,"------------------------------------------\n"); fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6); } #endif #ifdef TEST_SU3MUL_MIC matvecs=0; total=0.0; mytotal =0.0; //while(mytotal < 10) //{ MPI_Barrier(MPI_COMM_WORLD); for(i=0; i<Nmul; i++) { t1=stop_watch(0.0); #ifdef _OPENMP #pragma omp parallel { #endif __m512d U[3][3], 
gin[3],gout[3]; su3_512 *u0; su3_vector_512 *hin,*hout; #ifdef _OPENMP #pragma omp for #endif for(j=0; j< plqcd_g.VOLUME/4; j++) { u0 = &ufield_512[4*j]; hin = &pin_512[j].s0; hout= &pout_512[j].s0; intrin_su3_load_512(U,u0); intrin_vector_load_512(gin,hin); intrin_su3_multiply_512(gout,U,gin); intrin_vector_store_512(hout,gout); u0++; hin++; hout++; intrin_su3_load_512(U,u0); intrin_vector_load_512(gin,hin); intrin_su3_multiply_512(gout,U,gin); intrin_vector_store_512(hout,gout); u0++; hin++; hout++; intrin_su3_load_512(U,u0); intrin_vector_load_512(gin,hin); intrin_su3_multiply_512(gout,U,gin); intrin_vector_store_512(hout,gout); u0++; hin++; hout++; intrin_su3_load_512(U,u0); intrin_vector_load_512(gin,hin); intrin_su3_multiply_512(gout,U,gin); intrin_vector_store_512(hout,gout); } #ifdef _OPENMP } #endif t2 = stop_watch(t1); mytotal += t2; } matvecs += 4*Nmul*plqcd_g.VOLUME; //} MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD); MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD); if (proc_id==0) { total /= (double)(NPROCS); } if(proc_id==0) { fprintf(ofp,"su3mul mic version:\n"); fprintf(ofp,"------------------------------------------\n"); fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*66.0/total/1e+6); } #endif #endif //MIC finalize_plqcd(); return 0; }
/*
 * Benchmark driver for mulNxN().
 *
 * Usage: <prog> L nreps
 *
 * Allocates two arrays of L NxN matrices (x and y) and one NxN matrix a,
 * fills them with random numbers and times mulNxN(L, y, a, x). The
 * number of kernel calls per timing sample is doubled until the spread
 * over nreps samples is below 1/15 of the mean.
 *
 * Exits with status 1 if the argument count is wrong, 2 if an argument
 * is not a positive integer.
 */
int
main(int argc, char *argv[])
{
  if(argc != 3) {
    usage(argv);
    exit(1);
  }

  char *e;
  int L = (int)strtoul(argv[1], &e, 10);
  if(*e != '\0' || L <= 0) {
    usage(argv);
    exit(2);
  }

  /* nreps is used as a divisor below, so it must be positive */
  int nreps = (int)strtoul(argv[2], &e, 10);
  if(*e != '\0' || nreps <= 0) {
    usage(argv);
    exit(2);
  }

  double *x = alloc(sizeof(double)*L*N*N);
  double *y = alloc(sizeof(double)*L*N*N);
  double *a = alloc(sizeof(double)*N*N);

  randNxN(a);
  /* The i-th matrix starts N*N doubles after the previous one; stepping
     by a single element would leave most of the array uninitialised */
  for(int i=0; i<L; i++)
    randNxN(&y[i*N*N]);
  for(int i=0; i<L; i++)
    randNxN(&x[i*N*N]);

  int nreps_inner = 2; /* kernel calls per timing sample */
  double tave = 0;     /* mean time per kernel call */
  double tvar = 0;     /* spread (standard deviation) of the samples */
  for(;;) {
    tave = 0;
    tvar = 0;

    /* one untimed call before each measurement round */
    mulNxN(L, y, a, x);

    for(int i=0; i<nreps; i++) {
      double t0 = stop_watch(0);
      for(int j=0; j<nreps_inner; j++)
        mulNxN(L, y, a, x);
      t0 = stop_watch(t0)/nreps_inner;
      tave += t0;
      tvar += t0*t0;
    }
    tave /= (double)nreps;
    tvar /= (double)nreps;

    /* Rounding can make the variance slightly negative; clamp so that
       sqrt() does not return NaN and defeat the termination test */
    double var = tvar - tave*tave;
    if(var < 0)
      var = 0;
    tvar = sqrt(var);

    if(tvar < tave/15)
      break;
    nreps_inner = nreps_inner*2;
  }

  /*
    ___TODO_1___

    Print:
    1) Time per kernel call
    2) Sustained floating-point rate (GFlop/sec)
    3) Sustained bandwidth (GBytes/sec)

    Note: keep as function of N
  */

  free(x);
  free(y);
  free(a);
  return 0;
}
bool SceneLoader::LoadFromFile(const char* file_name) { KTimer stop_watch(true); FILE* pFile = NULL; bool ret = false; int ext = _FileExtension(file_name); std::string file_dir; GetPathDir(file_name, file_dir); Texture::TextureManager::GetInstance()->AddSearchPath(file_dir.c_str()); if (!mpScene) mpScene = new KSceneSet; else mpScene->Reset(); mIsFromOBJ = false; mIsSceneLoaded = false; KTimer fileReadingTime(true); // Perform the file reading if (ext == FILE_EXT_OBJ) { KRT_ObjFileLoader OBJLoader; OBJLoader.mUseTexMap = USE_TEX_MAP ? true : false; // Create scene UINT32 kd_idx = 0; KScene* pKDScene = mpScene->AddKDScene(kd_idx); mpScene->SceneNode_Create(kd_idx); if (OBJLoader.LoadObjFile(file_name, *pKDScene)) { ret = true; mIsFromOBJ = true; } else { mpScene->Reset(); ret = false; } } else if (ext == FILE_EXT_ABC) { if (mAbcLoader.Load(file_name, *mpScene)) ret = true; else { mpScene->Reset(); ret = false; } } if (ret) { BuildNodeIdMap(); mIsSceneLoaded = true; } mFileLoadingTime = UINT32(fileReadingTime.Stop() * 1000); // End of file reading, now build the acceleration structure mpAccelData = new KAccelStruct_BVH(mpScene); mpAccelData->SceneNode_BuildAccelData(NULL); KBBox scene_box = mpAccelData->GetSceneBBox(); KVec3 center = scene_box.Center(); float radius = nvmath::length(scene_box.mMax - scene_box.mMin) * 1.0f; CameraManager* pCameraMan = CameraManager::GetInstance(); if (pCameraMan->GetCameraCnt() == 0 && mpScene) { // If there's no camera, create a default light regarding the bounding box of scene KCamera* pPinHoleCamera = pCameraMan->OpenCamera("__default", true); KCamera::MotionState ms; ms.pos = center + KVec3(0.5,0.5,0.5)*radius; ms.lookat = center; ms.up = KVec3(0, 1.0f, 0); ms.xfov = 45.0f; ms.focal = radius * 0.5f; pPinHoleCamera->SetupStillCamera(ms); } LightScheme* pLightScheme = LightScheme::GetInstance(); if (pLightScheme->GetLightCount() == 0 && mpScene) { // If there's no light source, create a default one, otherwize the scene will be 
entirely dark. PointLightBase* pLight0 = dynamic_cast<PointLightBase*>(pLightScheme->CreateLightSource(POINT_LIGHT_TYPE)); PointLightBase* pLight1 = dynamic_cast<PointLightBase*>(pLightScheme->CreateLightSource(POINT_LIGHT_TYPE)); pLight0->SetIntensity(KColor(0.55f, 0.55f, 0.55f)); pLight1->SetIntensity(KColor(0.55f, 0.55f, 0.55f)); pLight0->SetPos(center + KVec3(0,1,1)*(radius*3.0f)); pLight1->SetPos(center + KVec3(1,1,0)*(radius*3.0f)); } if (pFile) fclose(pFile); mLoadingTime = stop_watch.Stop(); return ret; }
int main(int argc, char **argv) { Arguments args; Settings::readSettings(); args.readArguments(argc, argv); if(args.print_mini_help) args.printMiniHelp(); if(args.print_help) args.printHelp(); args.printSetup(); //load gtfs databases to mysql load_gtfs(args.reload_gtfs); //collect stop data if(args.read_stop_data) { if(args.route_all) { //get all the routes string call_data = make_curl_call("http://www3.septa.org/hackathon/TransitViewAll/"); vector<string> route_list; parse_bus_data(call_data, &route_list); //get and store stop data for all routes for(int i=0;i<route_list.size();i++) get_and_store_stop_data(route_list[i]); } else { //store stop data for just the specified route get_and_store_stop_data(args.route_id); } } //collect bus data if(args.read_bus_data) { MysqlDB *myobj = MysqlDB::getInstance(); GoogDir *googdir = GoogDir::getInstance(); map<int, BusObject> old_buses_cache; //setup google directions api googdir->to_front_end = false; googdir->setRouteID(args.route_id, args.route_all); for(int s=0;1;s++) { //let googdir manage itself googdir->doProcess(); //read new weather if past 5 minutes or every 15 seconds check if there is a new update in the front end if((abs(time(0) - Weather::getInstance()->last_timestamp) > WEATHER_TIME_REGRAB + 30) || (s%15 && Weather::getInstance()->newRecentFrontEndWeatherAvailable())) { if(!Weather::getInstance()->getRecentWeather()) printf("Weather::getRecentWeather() failed!\n"); } //read bus data every 5 seconds if(!(s%5)) { stop_watch(); //get all the routes string call_data = make_curl_call("http://www3.septa.org/hackathon/TransitViewAll/"); vector<BusObject> bus_list = parse_bus_data(call_data); //needed for google directions googdir->makeRouteList(bus_list); //get and store stop data for all routes for(int i=0;i<bus_list.size();i++) { BusObject &bus = bus_list[i]; if(!bus_is_new(bus, old_buses_cache)) continue; //don't bother with buses 1 second + old if(bus.offset) continue; //this a route we care about? 
if(!args.route_all && bus.route_id != args.route_id) continue; //make sure weather and googdir timestamp are reasonable if(!bus_weather_and_googdir_timestamp_good(bus)) continue; myobj->insertBusData(bus_list[i]); } stop_watch("read_bus_data"); } //pause and grab data again sleep(1); } } //create coefficients if(args.create_coeff_data) { MysqlDB *myobj = MysqlDB::getInstance(); if(args.route_all) { //get all the routes string call_data = make_curl_call("http://www3.septa.org/hackathon/TransitViewAll/"); vector<string> route_list = myobj->getBusDataRoutes(); parse_bus_data(call_data); //create_coefficient_data for all routes for(int i=0;i<route_list.size();i++) create_coefficient_data(route_list[i], args.stop_all, args.stop_id, args.iterations, args.retrain_time); } else { //create_coefficient_data for just the specified route create_coefficient_data(args.route_id, args.stop_all, args.stop_id, args.iterations, args.retrain_time); } } //keep front end php scripts up to date with bus / weather data if(args.optimize_front_end) { MysqlDB *myobj = MysqlDB::getInstance(); GoogDir *googdir = GoogDir::getInstance(); PredictionCache *pcache = PredictionCache::getInstance(); pcache->setRouteID(args.route_id, args.route_all); pcache->setStopID(args.stop_id, args.stop_all); //setup google directions api googdir->to_front_end = true; googdir->setRouteID(args.route_id, args.route_all); //clear out old bus predictions - myobj->clearPredictions(); for(int s=0;1;s++) { //let googdir manage itself googdir->doProcess(); //read weather data every 5 minutes //if(!(s%(WEATHER_TIME_REGRAB))) if(abs(time(0) - Weather::getInstance()->last_timestamp) > WEATHER_TIME_REGRAB) { WeatherObject w_obj; if(Weather::getInstance()->getRecentWeather(w_obj, false)) { pcache->setWeather(w_obj); if(!myobj->insertWeatherFrontEnd(w_obj)) printf("Weather::getRecentWeather() failed!\n"); } else printf("Weather::getRecentWeather() failed!\n"); } //read bus data every 5 seconds if(!(s%5)) { stop_watch(); //get 
all the routes string call_data = make_curl_call("http://www3.septa.org/hackathon/TransitViewAll/"); vector<BusObject> bus_list = parse_bus_data(call_data); //needed for google directions googdir->makeRouteList(bus_list); //clear old stored bus data myobj->clearBusFrontEnd(); //get and store stop data for all routes for(int i=0;i<bus_list.size();i++) { BusObject &bus = bus_list[i]; if(args.route_all || bus.route_id == args.route_id) myobj->insertBusData(bus, true); } stop_watch("read_bus_data"); stop_watch(); pcache->processBusList(bus_list); stop_watch("cache_bus_predictions"); } //pause and grab data again sleep(1); } } return 0; }
/*
 * Main
 *
 * Benchmark driver for axpy() on single-precision complex vectors.
 *
 * Usage: <prog> L nreps
 *
 * Times axpy(L, a, x, y), doubling the number of kernel calls per timing
 * sample until the spread over nreps samples is below 1/25 of the mean,
 * then prints the time per call with its spread, the sustained
 * floating-point rate (8 flops per element) and the sustained bandwidth
 * (3 x 8 bytes per element).
 *
 * Exits with status 1 if the argument count is wrong, 2 if an argument
 * is not a positive integer.
 */
int
main(int argc, char *argv[])
{
  if(argc != 3) {
    usage(argv);
    exit(1);
  }

  char *e;
  int L = (int)strtoul(argv[1], &e, 10);
  if(*e != '\0' || L <= 0) {
    usage(argv);
    exit(2);
  }

  /* nreps is used as a divisor below, so it must be positive */
  int nreps = (int)strtoul(argv[2], &e, 10);
  if(*e != '\0' || nreps <= 0) {
    usage(argv);
    exit(2);
  }

  _Complex float *x = alloc(sizeof(_Complex float)*L);
  _Complex float *y = alloc(sizeof(_Complex float)*L);
  _Complex float a;

  random_vec(L, x);
  random_vec(L, y);
  random_vec(1, &a);

  /* one untimed call before the measurement */
  axpy(L, a, x, y);

  int nreps_inner = 2; /* kernel calls per timing sample */
  double tave = 0;     /* mean time per kernel call */
  double tvar = 0;     /* spread (standard deviation) of the samples */
  for(;;) {
    tave = 0;
    tvar = 0;
    for(int i=0; i<nreps; i++) {
      double t0 = stop_watch(0);
      for(int j=0; j<nreps_inner; j++)
        axpy(L, a, x, y);
      t0 = stop_watch(t0)/nreps_inner;
      tave += t0;
      tvar += t0*t0;
    }
    tave /= (double)nreps;
    tvar /= (double)nreps;

    /* Rounding can make the variance slightly negative; clamp so that
       sqrt() does not return NaN and defeat the termination test */
    double var = tvar - tave*tave;
    if(var < 0)
      var = 0;
    tvar = sqrt(var);

    if(tvar < tave/25)
      break;
    nreps_inner = nreps_inner*2;
  }

  /*
    ___TODO_1___

    Print:
    1) Time per kernel call with error (usec)
    2) Sustained floating-point rate (GFlop/sec)
    3) Sustained bandwidth (GBytes/sec)
  */

  /* The counts are formed in double: 8*L and 24*L exceed the range of
     int for the vector lengths a bandwidth test typically uses */
  double beta_fp = (8.0*L/tave)*1e-9;
  double beta_io = (8.0*3*L/tave)*1e-9;
  printf(" L = %12d, %4.2e ± %4.2e usec/call, perf. = %6.4e GFlop/sec, bw = %6.4e GBytes/sec\n",
         L, tave*1e6, tvar*1e6, beta_fp, beta_io);

  free(x);
  free(y);
  return 0;
}
/*
 * Times mat_mul() -- and, when compiled with BLCK, mat_mul_blocked() --
 * for C (M x M) computed from A (M x N) and B (N x M), both filled with
 * random numbers.
 *
 * Usage: <prog> M N
 *
 * Each kernel is called once untimed and once timed. With BLCK defined
 * the two results are compared element by element; if the mean relative
 * difference exceeds 1e-12 the per-element differences are written to
 * "diffs.out".
 *
 * Returns 0 on success, 1 on a usage error.
 */
int
main(int argc, char *argv[])
{
  int nargs = 3;
  if(argc != nargs) {
    usage(argv);
    return 1;
  }

  int M = atoi(argv[1]);
  int N = atoi(argv[2]);
  /* atoi() returns 0 for non-numeric input; reject unusable sizes */
  if(M <= 0 || N <= 0) {
    usage(argv);
    return 1;
  }

  double *A = alloc(sizeof(double)*M*N);
  double *B = alloc(sizeof(double)*M*N);
  rand_mat(A, M, N);
  rand_mat(B, N, M);

  /* Each of the M*M result elements takes N multiplications and N
     additions (assuming the usual inner-product formulation -- TODO
     confirm against mat_mul()) */
  double nflop = 2.0*M*M*N;

  double *C = alloc(sizeof(double)*M*M);
  zero_mat(C, M, M);

  /* untimed call before the measurement */
  mat_mul(C, M, N, A, B);
  {
    double t0 = stop_watch(0);
    mat_mul(C, M, N, A, B);
    t0 = stop_watch(t0);

    double beta_fp = nflop/t0*1e-6;
    printf(" ORIG: M = %d, N = %d,", M, N);
    printf(" took: %4.2e sec,", t0);
    printf(" P = %4.2e Mflop/s\n", beta_fp);
  }

#ifdef BLCK
  double *Cb = alloc(sizeof(double)*M*M);
  zero_mat(Cb, M, M);

  /* untimed call before the measurement */
  mat_mul_blocked(Cb, M, N, A, B);
  {
    double t0 = stop_watch(0);
    mat_mul_blocked(Cb, M, N, A, B);
    t0 = stop_watch(t0);

    double beta_fp = nflop/t0*1e-6;
    printf(" BLCK: M = %d, N = %d,", M, N);
    printf(" took: %4.2e sec,", t0);
    printf(" P = %4.2e Mflop/s, BM = %d, BN = %d\n", beta_fp, BM, BN);
  }

  double eps = 1e-12;
  double diff = 0;
  for(int i=0; i<M*M; i++) {
    diff += fabs((C[i] - Cb[i])/C[i]);
  }

  /*
   * If the difference between the flat and blocked result is larger
   * than eps, complain to stdout and write the two matrices to file
   * "diffs.out".
   */
  diff /= (double)M*M;
  if(diff > eps) {
    printf(" Non zero diff: %e\n", diff);
    FILE *fp = fopen("diffs.out", "w");
    if(fp == NULL) {
      fprintf(stderr, " cannot open diffs.out for writing\n");
    } else {
      for(int i=0; i<M*M; i++)
        fprintf(fp, "%e\n", fabs((C[i]-Cb[i])/C[i]));
      fclose(fp);
    }
  }
#endif

  free(A);
  free(B);
  free(C);
#ifdef BLCK
  free(Cb);
#endif
  return 0;
}
/// Destructor: calls stop_watch() first and cleanup() afterwards.
/// NOTE(review): presumably stop_watch() ends a running watch before
/// cleanup() releases the state it uses -- confirm; keep this order.
k8s::~k8s()
{
	stop_watch();
	cleanup();
}