/*\ Quick fix for an inaccurate timer: every call returns a value that is at
 *  least DELTA larger than the one returned by the previous call, even when
 *  MP_TIMER() itself has not advanced.
\*/
double Timer()
{
#define DELTA 0.000001
    /* Sample the raw clock first, then derive the smallest value we may
     * report from the previously returned time stamp. */
    const double raw_now = MP_TIMER();
    const double earliest = _tt0 + DELTA;

    /* Clamp: if the raw clock did not move past `earliest`, report
     * `earliest`; otherwise report the raw reading. */
    _tt0 = (raw_now <= earliest) ? earliest : raw_now;
    return _tt0;
}
/*
 * Time LOOP consecutive armci_msg_reduce("+") calls on `len` doubles and
 * print the mean time per call on process 0.
 *
 *   test - buffer handed to every reduction
 *   len  - number of doubles in `test`
 */
void time_reduce(double *test, int len)
{
    int iter;
    double started;
    double per_call;

    started = MP_TIMER();
    for (iter = 0; iter < LOOP; ++iter) {
        armci_msg_reduce(test, len, "+", ARMCI_DOUBLE);
    }
    per_call = MP_TIMER() - started;
    per_call /= LOOP;

    /* Only process 0 reports. */
    if (me != 0) {
        return;
    }
    printf("Time per reduce %lf len=%d doubles\n", per_call, len);
    fflush(stdout);
}
/*
 * Time LOOP consecutive armci_msg_dgop("+") calls on `len` doubles and
 * print the mean time per call on process 0.
 *
 *   test - buffer handed to every global operation
 *   len  - number of doubles in `test`
 */
void time_gop(double *test, int len)
{
    int iter;
    double started;
    double per_call;

    started = MP_TIMER();
    for (iter = 0; iter < LOOP; ++iter) {
        armci_msg_dgop(test, len, "+");
    }
    per_call = MP_TIMER() - started;
    per_call /= LOOP;

    /* Only process 0 reports. */
    if (me != 0) {
        return;
    }
    printf("Time per gop %lf len=%d doubles\n", per_call, len);
    fflush(stdout);
}
static int armci_profile_close_event(int event_type, int range, double *time, char *name) { int curr_event = gCURRENT_EVNT.event_type; #ifdef DEBUG if(SERVER_CONTEXT) printf("\n%d(s):call profile close for %s isset is %d",armci_me, gEventName[event_type],gCURRENT_EVNT.is_set); else printf("\n%d:call profile close for %s isset is %d",armci_me, gEventName[event_type],gCURRENT_EVNT.is_set); fflush(stdout); #endif if(gCURRENT_EVNT.is_set==1) { /* Yep, there is an event set. So close it.*/ /*Check if "profile stop" is called for corresponding "profile start"*/ if(event_type != curr_event) { printf( "%d: %s: ERROR:Profile started for %s, but stopped for %s\n", armci_me,name,gEventName[curr_event],gEventName[event_type]); fflush(stdout); armci_die("Profile_stop is called a different event", armci_me); } *time = MP_TIMER() - gCURRENT_EVNT.start_time; ARMCI_PROF[curr_event][range].time += *time; gCURRENT_EVNT.is_set = 0; /* close the event */ return ARMCI_EVENT_CLOSED; } else { /* event overlapping */ gCURRENT_EVNT.is_set--; if(gCURRENT_EVNT.is_set<=0) { char *msg="Profile_stop is called before profile_start"; printf("%d: %s: ERROR: %s. Event Name = %s\n", armci_me, name, msg, gEventName[curr_event]); fflush(stdout); armci_die(" profile_stop is called before profile_start", armci_me); } } return ARMCI_EVENT_NOTCLOSED; }
static int armci_profile_set_event(int event_type, int range) { #ifdef DEBUG if(SERVER_CONTEXT) printf("\n%d(s):call profile set for %s isset is %d",armci_me, gEventName[event_type],gCURRENT_EVNT.is_set); else printf("\n%d:call profile set for %s isset is %d",armci_me, gEventName[event_type],gCURRENT_EVNT.is_set); fflush(stdout); #endif if(gCURRENT_EVNT.is_set == 0) { /* set an event */ gCURRENT_EVNT.is_set = 1; gCURRENT_EVNT.event_type = event_type; gCURRENT_EVNT.range = range; gCURRENT_EVNT.start_time = MP_TIMER(); return ARMCI_EVENT_SET; } else gCURRENT_EVNT.is_set++; /* event overlap */ return ARMCI_EVENT_NOTSET; }
int main(int argc, char **argv) { Integer heap=9000000, stack=9000000; int me, nproc; DoublePrecision time; MP_INIT(argc,argv); GA_INIT(argc,argv); /* initialize GA */ nproc = GA_Nnodes(); me = GA_Nodeid(); if(me==0) printf("Using %d processes\n\n",nproc); if (me==0) printf ("Matrix size is %d X %d\n",N,N); #ifdef USE_REGULAR if (me == 0) printf("\nUsing regular data distribution\n\n"); #endif #ifdef USE_SIMPLE_CYCLIC if (me == 0) printf("\nUsing simple block-cyclic data distribution\n\n"); #endif #ifdef USE_SCALAPACK if (me == 0) printf("\nUsing ScaLAPACK data distribution\n\n"); #endif #ifdef USE_TILED if (me == 0) printf("\nUsing tiled data distribution\n\n"); #endif if(!MA_init((Integer)MT_F_DBL, stack/nproc, heap/nproc)) GA_Error("MA_init failed bytes= %d",stack+heap); #ifdef PERMUTE { int i, *list = (int*)malloc(nproc*sizeof(int)); if(!list)GA_Error("malloc failed",nproc); for(i=0; i<nproc;i++)list[i]=nproc-1-i; GA_Register_proclist(list, nproc); free(list); } #endif if(GA_Uses_fapi())GA_Error("Program runs with C API only",1); time = MP_TIMER(); do_work(); /* printf("%d: Total Time = %lf\n", me, MP_TIMER()-time); printf("%d: GEMM Total Time = %lf\n", me, gTime); */ if(me==0)printf("\nSuccess\n\n"); GA_Terminate(); MP_FINALIZE(); return 0; }
void test_aggregate(int dryrun) { int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; double *ddst_put[MAXPROC]; double *ddst_get[MAXPROC]; double *dsrc[MAXPROC]; armci_hdl_t aggr_hdl_put[MAXPROC]; armci_hdl_t aggr_hdl_get[MAXPROC]; armci_hdl_t hdl_put[MAXELEMS]; armci_hdl_t hdl_get[MAXELEMS]; armci_giov_t darr; void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS]; int start = 0, end = 0; double start_time; create_array((void**)ddst_put, sizeof(double),2, elems); create_array((void**)ddst_get, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) { ddst_put[me][i]=0.0; ddst_get[me][i]=0.0; } MP_BARRIER(); /* only proc 0 does the work */ if(me == 0) { if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS); /* initializing non-blocking handles */ for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]); for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]); /* aggregate handles */ for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]); bytes = sizeof(double); /* **************** PUT **************** */ /* register put */ start_time=MP_TIMER(); start = 0; end = elems[1]; for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, &hdl_put[j]); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Value Put time = %.2es\n", me, MP_TIMER()-start_time); /* vector put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[me][j]; dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); 
darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i]))) ARMCI_Error("armci_nbputv failed\n",rc); } for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]); if(!dryrun)printf("%d: Vector Put time = %.2es\n", me, MP_TIMER()-start_time); /* regular put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &hdl_put[j]))) ARMCI_Error("armci_nbput failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Regular Put time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &aggr_hdl_put[i]))) ARMCI_Error("armci_nbput failed\n",rc); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]); if(!dryrun)printf("%d: Aggregate Put time = %.2es\n\n", me, MP_TIMER()-start_time); /* **************** GET **************** */ /* vector get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[i][j]; dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i]))) ARMCI_Error("armci_nbgetv failed\n",rc); ARMCI_Wait(&hdl_get[i]); } if(!dryrun)printf("%d: Vector Get time = %.2es\n", me, MP_TIMER()-start_time); /* regular get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes, i, &hdl_get[j]))) ARMCI_Error("armci_nbget failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]); } if(!dryrun)printf("%d: Regular Get time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbGet(&dsrc[i][j], 
&ddst_get[me][i*elems[1]+j], bytes, i, &aggr_hdl_get[i]); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]); if(!dryrun)printf("%d: Aggregate Get time = %.2es\n", me, MP_TIMER()-start_time); } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); /* Verify */ if(!(me==0)) for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) { ARMCI_Error("aggregate put failed...1", 0); } } MP_BARRIER(); if(!dryrun)if(me==0) printf("\n aggregate put ..O.K.\n"); fflush(stdout); if(me==0) { for(i=1; i<nproc; i++) { for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) { ARMCI_Error("aggregate get failed...1", 0); } } } } MP_BARRIER(); if(!dryrun)if(me==0) printf(" aggregate get ..O.K.\n"); fflush(stdout); ARMCI_AllFence(); MP_BARRIER(); if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst_put); destroy_array((void **)ddst_get); destroy_array((void **)dsrc); }
int main(int argc, char **argv) { int i; double **myptrs; double t0, t1, tnbget=0, tnbwait=0, t2=0; MP_INIT(argc,argv); ARMCI_Init(); MP_PROCS(&nprocs); MP_MYID(&me); if (nprocs < 2) ARMCI_Error("This program requires at least to processes", 1); myptrs = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); MP_BARRIER(); if(me == 0) { for(i = 0; i < 10; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1); } t0 = MP_TIMER(); for(i = 0; i < LOOP; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+1, myptrs[me]+i, sizeof(double), me+1); } t1 = MP_TIMER(); printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP); fflush(stdout); t1 = t0 = 0; for(i = 0; i < LOOP; i++) { armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MP_TIMER(); //ARMCI_NbGet(myptrs[me]+i, myptrs[me+1]+i, sizeof(double), me+1, &nbh); ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh); t1 = MP_TIMER(); ARMCI_Wait(&nbh); t2 = MP_TIMER(); tnbget += (t1-t0); tnbwait += (t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); } else sleep(1); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
/*
 * Write a global array of doubles to a disk resident array (DRA) section by
 * section, read every section back, and check that what was read equals what
 * was written.  The summed-then-averaged I/O time and the resulting MB/s
 * rate are printed by process 0 for both the write and the read phase.
 *
 * The global arrays have SIZE elements per dimension; the disk array has
 * NFACTOR*SIZE elements per dimension, so it is covered by NFACTOR^ndim
 * sections, each the size of one global array.
 *
 * Any failing GA/DRA call, or a non-zero difference after read-back, ends
 * the run through GA_Error().
 */
void test_io_dbl()
{
    int n, ndim = NDIM;
    double err, tt0, tt1, mbytes;
    int g_a, g_b, d_a;
    int i, itmp, j, req, loop;
    int glo[MAXDIM],ghi[MAXDIM];
    dra_size_t dlo[MAXDIM],dhi[MAXDIM];
    dra_size_t ddims[MAXDIM],reqdims[MAXDIM];
    dra_size_t m;
    int index[MAXDIM], dims[MAXDIM];
    int me, nproc, isize;
    double *ptr;
    double plus, minus;
    int ld[MAXDIM], chunk[MAXDIM];
    char filename[80];
    FILE *fd;

    /* n: extent of each global-array dimension; m: extent of each disk-array
       dimension. */
    n = SIZE;
    m = ((dra_size_t)NFACTOR)*((dra_size_t)SIZE);

    /* loop = NFACTOR^ndim = number of sections needed to cover the disk array */
    loop = 1;
    for (i=0; i<ndim; i++) loop *= NFACTOR;
    req = -1;
    nproc = GA_Nnodes();
    me = GA_Nodeid();

    if (me == 0) {
        printf("Creating temporary global arrays %d",n);
        for (i=1; i<ndim; i++) {
            printf(" x %d",n);
        }
        printf("\n");
    }
    if (me == 0) fflush(stdout);
    GA_Sync();
    for (i=0; i<ndim; i++) {
        dims[i] = n;
        chunk[i] = 1;
    }

    /* g_a holds the reference data, g_b receives what is read back. */
    g_a = NGA_Create(MT_DBL, ndim, dims, "a", chunk);
    if (!g_a) GA_Error("NGA_Create failed: a", 0);
    g_b = NGA_Create(MT_DBL, ndim, dims, "b", chunk);
    if (!g_b) GA_Error("NGA_Create failed: b", 0);
    if (me == 0) printf("done\n");
    if (me == 0) fflush(stdout);

    /* Fill the locally owned patch of g_a with random values in place
       (direct access avoids allocating a local buffer for a put); g_b is
       zeroed. */
    GA_Sync();
    NGA_Distribution(g_a, me, glo, ghi);
    NGA_Access(g_a, glo, ghi, &ptr, ld);
    /* isize = number of elements in this process's patch */
    isize = 1;
    for (i=0; i<ndim; i++) isize *= (ghi[i]-glo[i]+1);
    fill_random(ptr, isize);
    GA_Sync();
    GA_Zero(g_b);

    /*.......................................................................*/
    /* NOTE(review): m is a dra_size_t but is printed with %ld -- confirm that
       dra_size_t is a long on every supported platform. */
    if (me == 0) {
        printf("Creating Disk array %ld",m);
        for (i=1; i<ndim; i++) {
            printf(" x %ld",m);
        }
        printf("\n");
    }
    if (me == 0) fflush(stdout);
    for (i=0; i<ndim; i++) {
        ddims[i] = m;
        reqdims[i] = (dra_size_t)n;
    }
    GA_Sync();

    /* Pick the file location: FNAME if it can be opened for writing, else
       FNAME_ALT.  The probe file is closed again immediately. */
    strcpy(filename,FNAME);
    if (! (fd = fopen(filename, "w"))) {
        strcpy(filename,FNAME_ALT);
        if (! (fd = fopen(filename, "w"))) {
            GA_Error("open failed",0);
        }
    }
    fclose(fd);
    if (NDRA_Create(MT_DBL, ndim, ddims, "A", filename, DRA_RW, reqdims, &d_a) != 0) {
        GA_Error("NDRA_Create failed(d_a): ",0);
    }

    /* ---- write phase: copy all of g_a into each disk-array section ---- */
    if (me == 0) printf("testing write\n");
    fflush(stdout);
    tt1 = 0.0;
    for (i=0; i<loop; i++) {
        /* index[] = digits of i in base NFACTOR: the section's position
           along each dimension */
        itmp=i;
        for (j=0; j<ndim; j++) {
            index[j] = itmp%NFACTOR;
            itmp = (itmp - index[j])/NFACTOR;
        }
        /* source is the whole global array; destination is the SIZE-wide
           section selected by index[] */
        for (j=0; j<ndim; j++) {
            glo[j] = 0;
            ghi[j] = SIZE - 1;
            dlo[j] = ((dra_size_t)index[j])*((dra_size_t)SIZE);
            dhi[j] = (((dra_size_t)index[j])+(dra_size_t)1) * ((dra_size_t)SIZE) - (dra_size_t)1;
        }
        /* only the write request and its wait are timed */
        tt0 = MP_TIMER();
        if (NDRA_Write_section(FALSE, g_a, glo, ghi, d_a, dlo, dhi, &req) != 0) {
            GA_Error("ndra_write_section failed:",0);
        }
        if (DRA_Wait(req) != 0) {
            GA_Error("DRA_Wait failed(d_a): ",req);
        }
        tt1 += (MP_TIMER() - tt0);
    }
    /* sum the per-process times and divide by nproc to get the mean */
    GA_Dgop(&tt1,1,"+");
    tt1 = tt1/((double)nproc);
    /* total size of the disk array in units of 1e6 bytes */
    mbytes = 1.e-6 * (double)(pow(m,ndim)*sizeof(double));
    if (me == 0) {
        printf("%11.2f MB time = %11.2f rate = %11.3f MB/s\n", mbytes,tt1,mbytes/tt1);
    }
    if (DRA_Close(d_a) != 0) {
        GA_Error("DRA_Close failed(d_a): ",d_a);
    }
    if (me == 0) printf("\n");
    if (me == 0) printf("disk array closed\n");
    if (me == 0) fflush(stdout);

    /*..........................................................*/
    /* ---- read phase: reopen read-only and compare each section to g_a ---- */
    if (me == 0) printf("\n");
    if (me == 0) printf("opening disk array\n");
    if (DRA_Open(filename, DRA_R, &d_a) != 0) {
        GA_Error("DRA_Open failed",0);
    }
    if (me == 0) printf("testing read\n");
    /* printf("testing read on proc %d\n",me); */
    if (me == 0) fflush(stdout);
    tt1 = 0.0;
    for (i=0; i<loop; i++) {
        /* same section addressing as in the write phase */
        itmp=i;
        for (j=0; j<ndim; j++) {
            index[j] = itmp%NFACTOR;
            itmp = (itmp - index[j])/NFACTOR;
        }
        for (j=0; j<ndim; j++) {
            glo[j] = 0;
            ghi[j] = SIZE - 1;
            dlo[j] = ((dra_size_t)index[j])*((dra_size_t)SIZE);
            dhi[j] = (((dra_size_t)index[j])+(dra_size_t)1) * ((dra_size_t)SIZE) - (dra_size_t)1;
        }
        /* only the read request and its wait are timed */
        tt0 = MP_TIMER();
        if (NDRA_Read_section(FALSE, g_b, glo, ghi, d_a, dlo, dhi, &req) != 0) {
            GA_Error("ndra_read_section failed:",0);
        }
        if (DRA_Wait(req) != 0) {
            GA_Error("DRA_Wait failed(d_a): ",req);
        }
        tt1 += (MP_TIMER() - tt0);
        /* g_b = g_a - g_b; its dot product with itself must be exactly zero
           if the section read back matches what was written */
        plus = 1.0;
        minus = -1.0;
        GA_Add(&plus, g_a, &minus, g_b, g_b);
        err = GA_Ddot(g_b, g_b);
        if (err != 0) {
            if (me == 0) {
                printf("BTW, we have error = %f on loop value %d\n", err,i);
            }
            GA_Error(" bye",0);
        }
    }
    GA_Dgop(&tt1,1,"+");
    tt1 = tt1/((double)nproc);
    if (me == 0) {
        printf("%11.2f MB time = %11.2f rate = %11.3f MB/s\n", mbytes,tt1,mbytes/tt1);
    }
    if (DRA_Delete(d_a) != 0) GA_Error("DRA_Delete failed",0);

    /*.......................................................................*/
    GA_Destroy(g_a);
    GA_Destroy(g_b);
}
/* * test ga_dgemm * Note: - change nummax for large arrays * - turn off "dgemm_verify" for large arrays due to memory * limitations, as dgemm_verify=1 for large arrays produces * segfault, dumps core,or any crap. */ int main(int argc, char **argv) { int num_m; int num_n; int num_k; int i; int ii; double *h0; int g_c; int g_b; int g_a; double a; double t1; double mf; double avg_t[ntrans]; double avg_mf[ntrans]; int itime; int ntimes; int nums_m[/*howmany*/] = {512,1024}; int nums_n[/*howmany*/] = {512,1024}; int nums_k[/*howmany*/] = {512,1024}; char transa[/*ntrans*/] = "ntnt"; char transb[/*ntrans*/] = "nntt"; char ta; char tb; double *tmpa; double *tmpb; double *tmpc; int ndim; int dims[2]; #ifdef BLOCK_CYCLIC int block_size[2]; #endif #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &me ); ElMPICommSize( MPI_COMM_WORLD, &nproc ); // instantiate el::global array ElGlobalArraysConstruct_d( &eldga ); // initialize global arrays ElGlobalArraysInitialize_d( eldga ); #else MP_INIT(argc,argv); if (!MA_init(MT_DBL,1,20000000)) { GA_Error("failed: ma_init(MT_DBL,1,20000000)",10); } GA_INIT(argc,argv); me = GA_Nodeid(); #endif h0 = (double*)malloc(sizeof(double) * nummax*nummax); tmpa = (double*)malloc(sizeof(double) * nummax*nummax); tmpb = (double*)malloc(sizeof(double) * nummax*nummax); tmpc = (double*)malloc(sizeof(double) * nummax*nummax); ii = 0; for (i=0; i<nummax*nummax; i++) { ii = ii + 1; if (ii > nummax) { ii = 0; } h0[i] = ii; } /* Compute times assuming 500 mflops and 5 second target time */ /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */ ntimes = 5; for (ii=0; ii<howmany; ii++) { num_m = nums_m[ii]; num_n = nums_n[ii]; num_k = nums_k[ii]; a = 0.5/(num_m*num_n); if (num_m > nummax || num_n > nummax || num_k > nummax) { GA_Error("Insufficient memory: check nummax", 1); } #ifndef BLOCK_CYCLIC ndim = 2; /* dims[0] = num_m; dims[1] = num_n; */ dims[1] = num_m; dims[0] = 
num_n; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c ); #else if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) { GA_Error("failed: create g_c",20); } #endif /* dims[0] = num_k; dims[1] = num_n; */ dims[1] = num_k; dims[0] = num_n; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b ); #else if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) { GA_Error("failed: create g_b",30); } #endif /* dims[0] = num_m; dims[1] = num_k; */ dims[1] = num_m; dims[0] = num_k; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a ); #else if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) { GA_Error("failed: create g_a",40); } #endif #else ndim = 2; block_size[0] = 128; block_size[1] = 128; dims[0] = num_m; dims[1] = num_n; g_c = GA_Create_handle(); GA_Set_data(g_c,ndim,dims,MT_DBL); GA_Set_array_name(g_c,"g_c"); GA_Set_block_cyclic(g_c,block_size); if (!GA_Allocate(g_c)) { GA_Error("failed: create g_c",40); } dims[0] = num_k; dims[1] = num_n; g_b = GA_Create_handle(); GA_Set_data(g_b,ndim,dims,MT_DBL); GA_Set_array_name(g_b,"g_b"); GA_Set_block_cyclic(g_b,block_size); if (!ga_allocate(g_b)) { GA_Error("failed: create g_b",40); } dims[0] = num_m; dims[1] = num_k; g_a = GA_Create_handle(); GA_Set_data(g_a,ndim,dims,MT_DBL); GA_Set_array_name(g_a,"g_a"); GA_Set_block_cyclic(g_a,block_size); if (!ga_allocate(g_a)) { GA_Error('failed: create g_a',40); } #endif /* Initialize matrices A and B */ if (me == 0) { load_ga(g_a, h0, num_m, num_k); load_ga(g_b, h0, num_k, num_n); } #if defined(USE_ELEMENTAL) double zero = 0.0; ElGlobalArraysFill_d( eldga, g_c, &zero ); ElGlobalArraysSync_d( eldga ); #else GA_Zero(g_c); GA_Sync(); #endif #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n", (long)num_m, (long)num_k, (long)num_k, (long)num_n); fflush(stdout); } for (i=0; i<ntrans; 
i++) { avg_t[i] = 0.0; avg_mf[i] = 0.0; } for (itime=0; itime<ntimes; itime++) { for (i=0; i<ntrans; i++) { #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); #else GA_Sync(); #endif ta = transa[i]; tb = transb[i]; t1 = MP_TIMER(); #if defined(USE_ELEMENTAL) ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c ); #else GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c); #endif t1 = MP_TIMER() - t1; #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif #if defined(USE_ELEMENTAL) mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc; #else mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes(); #endif avg_t[i] = avg_t[i]+t1; avg_mf[i] = avg_mf[i] + mf; printf("%15s%2d: %12.4f seconds %12.1f mflops/proc %c %c\n", "Run#", itime, t1, mf, ta, tb); fflush(stdout); if (dgemm_verify && itime == 0) { /* recall the C API swaps the matrix order */ /* we swap it here for the Fortran-based verify */ verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0, g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc); } } } } #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("\n"); for (i=0; i<ntrans; i++) { printf("%17s: %12.4f seconds %12.1f mflops/proc %c %c\n", "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes, transa[i], transb[i]); } if(dgemm_verify) { printf("All GA_Dgemms are verified...O.K.\n"); } fflush(stdout); } /* GA_Print(g_a); GA_Print(g_b); GA_Print(g_c); */ #if defined(USE_ELEMENTAL) ElGlobalArraysDestroy_d( eldga, g_a ); ElGlobalArraysDestroy_d( eldga, g_b ); ElGlobalArraysDestroy_d( eldga, g_c ); #else GA_Destroy(g_c); GA_Destroy(g_b); GA_Destroy(g_a); #endif } /* ??? format(a15, i2, ': ', e12.4, ' seconds ',f12.1, . 
' mflops/proc ', 3a2) */ #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("All tests successful\n"); } free(h0); free(tmpa); free(tmpb); free(tmpc); #if defined(USE_ELEMENTAL) // call el::global arrays destructor ElGlobalArraysTerminate_d( eldga ); ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Terminate(); MP_FINALIZE(); #endif return 0; } /* * Verify for correctness. Process 0 computes BLAS dgemm * locally. For larger arrays, disbale this test as memory * might not be sufficient */ void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k, double alpha, int g_a, int g_b, double beta, int g_c, double *tmpa, double *tmpb, double *tmpc) { int i,j,type,ndim,dims[2],lo[2],hi[2]; double abs_value; for (i=0; i<num_n; i++) { for (j=0; j<num_m; j++) { tmpc[j+i*num_m] = -1.0; tmpa[j+i*num_m] = -2.0; } } #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] ); #else NGA_Get(g_a, lo, hi, tmpa, &dims[1]); #endif #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] ); #else NGA_Get(g_b, lo, hi, tmpb, &dims[1]); #endif /* compute dgemm sequentially */ #if defined(USE_ELEMENTAL) cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), ( xt2 == 'n'? 
CblasNoTrans: CblasTrans ), num_m /* M */, num_n /* N */, num_k /* K */, alpha, tmpa, num_m, /* lda */ tmpb, num_k, /* ldb */ beta, tmpc, num_m /* ldc */); #else xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k, &alpha, tmpa, &num_m, tmpb, &num_k, &beta, tmpc, &num_m); #endif /* after computing c locally, verify it with the values in g_c */ #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] ); #else NGA_Get(g_c, lo, hi, tmpa, &dims[1]); #endif for (i=0; i<num_n; i++) { for (j=0; j<num_m; j++) { abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]); if(abs_value > 1.0 || abs_value < -1.0) { printf("Values are = %f %f\n", tmpc[j+i*num_m], tmpa[j+i*num_m]); printf("Values are = %f %f\n", fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value); fflush(stdout); GA_Error("verify ga_dgemm failed", 1); } } } } /** * called by process '0' (or your master process ) */ void load_ga(int handle, double *f, int dim1, int dim2) { int lo[2], hi[2]; if (dim1 < 0 || dim2 < 0) { return; } lo[0] = 0; lo[1] = 0; hi[0] = dim1-1; hi[1] = dim2-1; #if defined(USE_ELEMENTAL) ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 ); #else NGA_Put(handle, lo, hi, f, &dim1); #endif }
/*
 * Time blocking and non-blocking put, get and accumulate on 1-D sections
 * whose length doubles from 1 up to MAXELEMS doubles.
 *
 * For every section size the test runs, in this order: blocking put,
 * blocking get, non-blocking put (+wait), non-blocking get (+wait),
 * blocking accumulate and non-blocking accumulate (+wait).  Put/get are
 * issued by process 0 to every other process; accumulate is issued by every
 * process towards process 0.  After each phase all processes synchronise,
 * optionally verify (VERIFY) and zero their destination buffer.
 *
 *   dry_run - when non-zero, the table header, timing rows and the final
 *             "O.K." are not printed
 *
 * Timers: t1 put, t2 nbput issue, t3 nbput wait, t4 get, t5 nbget issue,
 * t6 nbget wait, t7 acc, t8 nbacc issue, t9 nbacc wait.
 */
void test_perf_nb(int dry_run)
{
    int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
    int stride, k=0, ntimes;
    double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    double *dsrc[MAXPROC], scale=1.0;
    armci_hdl_t hdl_get, hdl_put, hdl_acc;

    /* Buffers are created once at full size (elems[1] == MAXELEMS here);
       elems[1] is then reused below as the current section length. */
    create_array((void**)ddst, sizeof(double),2, elems);
    create_array((void**)dsrc, sizeof(double),1, &elems[1]);

    if(!dry_run)if(me == 0) {
        printf("\n\t\t\tRemote 1-D Array Section\n");
        printf("section get nbget wait put nbput ");
        printf(" wait acc nbacc wait\n");
        printf("------- -------- -------- -------- -------- --------");
        printf(" -------- -------- -------- --------\n");
        fflush(stdout);
    }

    for(loop=1; loop<=MAXELEMS; loop*=2, k++) {

        elems[1] = loop;
        /* fewer repetitions for larger sections, but always at least one */
        ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
        if(ntimes <1) ntimes=1;

        /* -------------------------- SETUP --------------------------- */
        /*initializing non-blocking handles,time,src & dst buffers*/
        ARMCI_INIT_HANDLE(&hdl_put);
        ARMCI_INIT_HANDLE(&hdl_get);
        ARMCI_INIT_HANDLE(&hdl_acc);
        t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
        for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        /* bytes transfered */
        bytes = sizeof(double)*elems[1];
        MP_BARRIER();

        /* -------------------------- PUT/GET -------------------------- */
        /* blocking put: process 0 -> slot `me` of every other process */
        /* NOTE(review): the failure text says "armci_nbput" although the
           call is the blocking ARMCI_Put. */
        if(me == 0) {
            for(i=1; i<nproc; i++) {
                stime=MP_TIMER();
                for(j=0; j<ntimes; j++)
                    if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i)))
                        ARMCI_Error("armci_nbput failed\n",rc);
                t1 += MP_TIMER()-stime;
            }
        }
        MP_BARRIER();
        ARMCI_AllFence();
        MP_BARRIER();
        if(VERIFY) verify_results(PUT, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        /* blocking get: every other process's source -> slot i on process 0 */
        /* NOTE(review): the failure text says "armci_nbget" although the
           call is the blocking ARMCI_Get. */
        if(me == 0) {
            for(i=1; i<nproc; i++) {
                stime=MP_TIMER();
                for(j=0; j<ntimes; j++)
                    if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i)))
                        ARMCI_Error("armci_nbget failed\n",rc);
                t4 += MP_TIMER()-stime;
            }
        }
        MP_BARRIER();
        ARMCI_AllFence();
        MP_BARRIER();
        if(VERIFY) verify_results(GET, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        /* ------------------------ nb PUT/GET ------------------------- */
        /* non-blocking put: issue time goes to t2, wait time to t3 */
        if(me == 0) {
            for(i=1; i<nproc; i++) {
                for(j=0; j<ntimes; j++) {
                    stime=MP_TIMER();
                    if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put)))
                        ARMCI_Error("armci_nbput failed\n",rc);
                    t2 += MP_TIMER()-stime;
                    stime=MP_TIMER();
                    ARMCI_Wait(&hdl_put);
                    t3 += MP_TIMER()-stime;
                }
            }
        }
        MP_BARRIER();
        ARMCI_AllFence();
        MP_BARRIER();
        if(VERIFY) verify_results(PUT, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        /* non-blocking get: issue time goes to t5, wait time to t6 */
        if(me == 0) {
            for(i=1; i<nproc; i++) {
                for(j=0; j<ntimes; j++) {
                    stime=MP_TIMER();
                    if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get)))
                        ARMCI_Error("armci_nbget failed\n",rc);
                    t5 += MP_TIMER()-stime;
                    stime=MP_TIMER();
                    ARMCI_Wait(&hdl_get);
                    t6 += MP_TIMER()-stime;
                }
            }
        }
        MP_BARRIER();
        ARMCI_AllFence();
        MP_BARRIER();
        if(VERIFY) verify_results(GET, elems);
        for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
        MP_BARRIER();

        /* ------------------------ Accumulate ------------------------- */
        /* every process adds a vector of ones into the start of process
           0's destination buffer; the result is verified and reset after
           each repetition */
        for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;
        MP_BARRIER();
        stride = elems[1]*sizeof(double);
        scale = 1.0;
        for(j=0; j<ntimes; j++) {
            stime=MP_TIMER();
            if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0)))
                ARMCI_Error("armci_acc failed\n",rc);
            t7 += MP_TIMER()-stime;
            MP_BARRIER();
            ARMCI_AllFence();
            MP_BARRIER();
            if(VERIFY) verify_results(ACC, elems);
            for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
            MP_BARRIER();
        }

        /* NOTE(review): the comment on the next line speaks of this part
           being disabled, but `#if 1` keeps it enabled; the note it refers
           to is not visible in this function. */
#if 1 /* See the note below why this part is disabled */
        /* ---------------------- nb-Accumulate ------------------------ */
        /* same as above, non-blocking: issue time -> t8, wait time -> t9 */
        for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;
        MP_BARRIER();
        stride = elems[1]*sizeof(double);
        scale = 1.0;
        for(j=0; j<ntimes; j++) {
            stime=MP_TIMER();
            if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
                ARMCI_Error("armci_nbacc failed\n",rc);
            t8 += MP_TIMER()-stime;
            stime=MP_TIMER();
            ARMCI_Wait(&hdl_acc);
            t9 += MP_TIMER()-stime;
            MP_BARRIER();
            ARMCI_AllFence();
            MP_BARRIER();
            if(VERIFY) verify_results(ACC, elems);
            for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
            MP_BARRIER();
        }
#endif

        /* print timings */
        /* columns follow the header: get, nbget, wait, put, nbput, wait,
           acc, nbacc, wait -- each accumulated time divided by ntimes */
        if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
    }

    ARMCI_AllFence();
    MP_BARRIER();

    if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);}
    destroy_array((void **)ddst);
    destroy_array((void **)dsrc);
}