int main(int argc, char **argv) { int k,i; double **myptrs[10]; double t0,t1,tget=0,tnbget=0,tput=0,tnbput=0,tnbwait=0,t2=0; #if PORTALS ARMCI_NetInit(); #endif MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD,&me); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); ARMCI_Init(); ARMCI_Init(); for(k=0;k<10;k++){ myptrs[k] = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs[k],400000*LOOP*sizeof(double)); for(i=0;i<LOOP;i++)myptrs[k][me][i]=me+0.414; MPI_Barrier(MPI_COMM_WORLD); for(i=0;i<LOOP;i++){ ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs); /*if(myptrs[k][me][i]!=0.414+(me+1)%nprocs)ARMCI_Error("errr",myptrs[k][me][i]);*/ } t0=t1=tget=tnbget=tput=tnbput=tnbwait=t2=0; t0 = MPI_Wtime(); for(i=0;i<LOOP;i++){ ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs); } t1 = MPI_Wtime(); printf("\nGet Latency=%lf\n",1e6*(t1-t0)/LOOP);fflush(stdout); t1=t0=0; for(i=0;i<LOOP;i++){ armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MPI_Wtime(); ARMCI_NbGet(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs,&nbh); t1 = MPI_Wtime(); ARMCI_Wait(&nbh); t2 = MPI_Wtime(); tnbget+=(t1-t0); tnbwait+=(t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); MPI_Barrier(MPI_COMM_WORLD); } for(k=0;k<10;k++)ARMCI_Free(myptrs[k][me]); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Finalize(); ARMCI_Finalize(); MPI_Finalize(); }
void test_aggregate(int dryrun) { int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; double *ddst_put[MAXPROC]; double *ddst_get[MAXPROC]; double *dsrc[MAXPROC]; armci_hdl_t aggr_hdl_put[MAXPROC]; armci_hdl_t aggr_hdl_get[MAXPROC]; armci_hdl_t hdl_put[MAXELEMS]; armci_hdl_t hdl_get[MAXELEMS]; armci_giov_t darr; void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS]; int start = 0, end = 0; double start_time; create_array((void**)ddst_put, sizeof(double),2, elems); create_array((void**)ddst_get, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) { ddst_put[me][i]=0.0; ddst_get[me][i]=0.0; } MP_BARRIER(); /* only proc 0 does the work */ if(me == 0) { if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS); /* initializing non-blocking handles */ for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]); for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]); /* aggregate handles */ for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]); bytes = sizeof(double); /* **************** PUT **************** */ /* register put */ start_time=MP_TIMER(); start = 0; end = elems[1]; for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, &hdl_put[j]); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Value Put time = %.2es\n", me, MP_TIMER()-start_time); /* vector put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[me][j]; dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); 
darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i]))) ARMCI_Error("armci_nbputv failed\n",rc); } for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]); if(!dryrun)printf("%d: Vector Put time = %.2es\n", me, MP_TIMER()-start_time); /* regular put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &hdl_put[j]))) ARMCI_Error("armci_nbput failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Regular Put time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &aggr_hdl_put[i]))) ARMCI_Error("armci_nbput failed\n",rc); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]); if(!dryrun)printf("%d: Aggregate Put time = %.2es\n\n", me, MP_TIMER()-start_time); /* **************** GET **************** */ /* vector get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[i][j]; dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i]))) ARMCI_Error("armci_nbgetv failed\n",rc); ARMCI_Wait(&hdl_get[i]); } if(!dryrun)printf("%d: Vector Get time = %.2es\n", me, MP_TIMER()-start_time); /* regular get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes, i, &hdl_get[j]))) ARMCI_Error("armci_nbget failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]); } if(!dryrun)printf("%d: Regular Get time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbGet(&dsrc[i][j], 
&ddst_get[me][i*elems[1]+j], bytes, i, &aggr_hdl_get[i]); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]); if(!dryrun)printf("%d: Aggregate Get time = %.2es\n", me, MP_TIMER()-start_time); } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); /* Verify */ if(!(me==0)) for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) { ARMCI_Error("aggregate put failed...1", 0); } } MP_BARRIER(); if(!dryrun)if(me==0) printf("\n aggregate put ..O.K.\n"); fflush(stdout); if(me==0) { for(i=1; i<nproc; i++) { for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) { ARMCI_Error("aggregate get failed...1", 0); } } } } MP_BARRIER(); if(!dryrun)if(me==0) printf(" aggregate get ..O.K.\n"); fflush(stdout); ARMCI_AllFence(); MP_BARRIER(); if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst_put); destroy_array((void **)ddst_get); destroy_array((void **)dsrc); }
int main(int argc, char **argv) { int i; double **myptrs; double t0, t1, tnbget=0, tnbwait=0, t2=0; MP_INIT(argc,argv); ARMCI_Init(); MP_PROCS(&nprocs); MP_MYID(&me); if (nprocs < 2) ARMCI_Error("This program requires at least to processes", 1); myptrs = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); MP_BARRIER(); if(me == 0) { for(i = 0; i < 10; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1); } t0 = MP_TIMER(); for(i = 0; i < LOOP; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+1, myptrs[me]+i, sizeof(double), me+1); } t1 = MP_TIMER(); printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP); fflush(stdout); t1 = t0 = 0; for(i = 0; i < LOOP; i++) { armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MP_TIMER(); //ARMCI_NbGet(myptrs[me]+i, myptrs[me+1]+i, sizeof(double), me+1, &nbh); ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh); t1 = MP_TIMER(); ARMCI_Wait(&nbh); t2 = MP_TIMER(); tnbget += (t1-t0); tnbwait += (t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); } else sleep(1); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
static void sparse_multiply(int n, int non_zero, int *row_ind, int **col_ind, double **values, double **vec, double **svec) { int i, j, k, num_elements, offset, *tmp_indices; double start_time, comm_time, comp_time, v, vec_local[COL]; int start, end, prev, nrows, idx; #if 0 /* ---- Sequential Case ---- */ for(i=0; i<n; i++) { svec[me][i] = 0; for(k=row_ind[i]; k<row_ind[i+1]; k++) { j = col_ind[me][k]; v = values[me][k]; svec[me][i] += v*vec[me][j]; printf("%.1f %.1f\n", v, vec[me][j]); } } for(i=0; i<n; i++) printf("%.1f ", svec[me][i]); printf("\n"); #else num_elements = proc_nz_list[me]; printf("num_elements = %d\n", num_elements); tmp_indices = (int *)malloc(num_elements*sizeof(int)); for(i=0; i<num_elements; i++) tmp_indices[i] = col_ind[me][i]; qsort(tmp_indices, num_elements, sizeof(int), compare); start_time = armci_timer(); /* get the required portion of vector you need to local array */ start = prev = tmp_indices[0]; for(i=1; i<num_elements; i++) { if(tmp_indices[i]>prev+1) { end = prev; get_data(n, start, end, vec_local, vec); start = prev = tmp_indices[i]; } else prev = tmp_indices[i]; } get_data(n, start, prev, vec_local, vec); #if 1 if(count>=0) for(i=0; i<=count; i++) ARMCI_Wait(&gHandle[i]); #endif comm_time = armci_timer() - start_time; start_time = armci_timer(); /* Perform Matrix-Vector multiply and store the result in solution vector - "svec[]" */ if(me==0) { nrows = proc_row_list[me]; offset = row_ind[0]; } else { nrows = proc_row_list[me]-proc_row_list[me-1]; offset = row_ind[proc_row_list[me-1]]; } /* printf("%d: My total Work = %d\n", me, nrows); */ for(i=0; i<nrows; i++) { /* loop over rows owned by me */ svec[me][i] = 0; if(me==0) idx = i; else idx = proc_row_list[me-1] + i; for(k=row_ind[idx]; k<row_ind[idx+1]; k++) { j = col_ind[me][k-offset]; v = values[me][k-offset]; svec[me][i] += v*vec_local[j]; } } comp_time = armci_timer()-start_time; printf("%d: %f + %f = %f (count = %d)\n", me, comm_time, comp_time, comm_time+comp_time, count+1); 
#endif }
int main(int argc, char *argv[]) { size_t i, rank, nranks, msgsize, dest; size_t iterations, max_msgsize; int bufsize; double **buffer; double t_start, t_stop, t_total, d_total; double expected, bandwidth; int provided; armci_hdl_t handle; max_msgsize = MAX_MSGSIZE; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); ARMCI_Init_args(&argc, &argv); bufsize = max_msgsize * ITERATIONS_LARGE; buffer = (double **) malloc(sizeof(double *) * nranks); ARMCI_Malloc((void **) buffer, bufsize); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_INIT_HANDLE(&handle); ARMCI_SET_AGGREGATE_HANDLE(&handle); ARMCI_Barrier(); if (rank == 0) { printf("ARMCI_Put Bandwidth in MBPS \n"); printf("%20s %22s \n", "Message Size", "Bandwidth"); fflush(stdout); dest = 1; expected = 1 + dest; for (msgsize = sizeof(double); msgsize <= max_msgsize; msgsize *= 2) { if (msgsize <= 16 * 1024) iterations = ITERATIONS_VERYSMALL; else if (msgsize <= 64 * 1024) iterations = ITERATIONS_SMALL; else if (msgsize <= 512 * 1024) iterations = ITERATIONS_MEDIUM; else iterations = ITERATIONS_LARGE; t_start = MPI_Wtime(); for (i = 0; i < iterations; i++) { ARMCI_NbPut((void *) ((size_t) buffer[dest] + (size_t)(i * msgsize)), (void *) ((size_t) buffer[rank] + (size_t)(i * msgsize)), msgsize, dest, &handle); } ARMCI_Wait(&handle); t_stop = MPI_Wtime(); d_total = (iterations * msgsize) / (1024 * 1024); t_total = t_stop - t_start; bandwidth = d_total / t_total; printf("%20d %20.4lf \n", msgsize, bandwidth); fflush(stdout); ARMCI_Fence(dest); } } ARMCI_Barrier(); ARMCI_UNSET_AGGREGATE_HANDLE(&handle); ARMCI_Free((void *) buffer[rank]); ARMCI_Finalize(); MPI_Finalize(); return 0; }
/*
 * Completes the requests aggregated under a non-blocking handle.
 *
 * The in-use list is scanned from its last entry backwards for the
 * aggregation buffer whose tag and proc match the handle.  If none matches,
 * the handle has no pending requests and the function returns.  Otherwise
 * the buffered descriptors are sent as a single vector put or get (issued
 * non-blocking and waited on immediately when built for LAPI), the buffer
 * bookkeeping is reset, and -- when condition == UNSET -- the handle is
 * detached and the buffer lists are updated.
 *
 * A failing vector operation is reported through ARMCI_Error.
 */
void armci_agg_complete(armci_ihdl_t nb_handle, int condition)
{
    int pos, slot = 0, rc;
#ifdef LAPI
    armci_hdl_t usr_hdl;
#endif

    /* get the buffer index for this handle */
    pos = ulist.size - 1;
    while (pos >= 0) {
        slot = ulist.index[pos];
        if (aggr[slot]->tag == nb_handle->tag &&
            aggr[slot]->proc == nb_handle->proc) {
            break;
        }
        pos--;
    }
    if (pos < 0) {
        return; /* implies this handle has no requests at all */
    }

#if 0
    printf("%d: Aggregation Complete to remote process %d (%d:%d requests)\n",
           armci_me, nb_handle->proc, slot, aggr[slot]->request_len);
#endif

    /* complete the data transfer. NOTE: in LAPI, Non-blocking calls
       (followed by wait) performs better than blocking put/get */
    if (aggr[slot]->request_len != 0) {
        switch (nb_handle->op) {
#ifdef LAPI
        case PUT:
            ARMCI_INIT_HANDLE(&usr_hdl);
            rc = ARMCI_NbPutV(aggr[slot]->darr, aggr[slot]->request_len,
                              nb_handle->proc, &usr_hdl);
            if (rc) {
                ARMCI_Error("armci_agg_complete: nbputv failed",rc);
            }
            ARMCI_Wait(&usr_hdl);
            break;
        case GET:
            ARMCI_INIT_HANDLE(&usr_hdl);
            rc = ARMCI_NbGetV(aggr[slot]->darr, aggr[slot]->request_len,
                              nb_handle->proc, &usr_hdl);
            if (rc) {
                ARMCI_Error("armci_agg_complete: nbgetv failed",rc);
            }
            ARMCI_Wait(&usr_hdl);
            break;
#else
        case PUT:
            rc = ARMCI_PutV(aggr[slot]->darr, aggr[slot]->request_len,
                            nb_handle->proc);
            if (rc) {
                ARMCI_Error("armci_agg_complete: putv failed",rc);
            }
            break;
        case GET:
            rc = ARMCI_GetV(aggr[slot]->darr, aggr[slot]->request_len,
                            nb_handle->proc);
            if (rc) {
                ARMCI_Error("armci_agg_complete: getv failed",rc);
            }
            break;
#endif
        }
    }

    /* the requests are completed: mark the buffer as empty again */
    aggr[slot]->request_len   = 0;
    aggr[slot]->ptr_array_len = 0;
    aggr[slot]->buf_pos_end   = _MAX_AGG_BUFSIZE;

    /* With condition == UNSET the handle is also detached from the buffer
       and the buffer lists are updated. */
    if (condition == UNSET) {
        nb_handle->proc = -1;
        _armci_agg_update_lists(slot);
    }
}
int main(int argc, char *argv[]) { int rank, nranks; size_t i, msgsize, dest; size_t iterations, max_msgsize; int bufsize; double **buffer; double t_start, t_stop, t_total, d_total; double expected, bandwidth; int provided; armci_hdl_t handle; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); max_msgsize = MAX_MSGSIZE; ARMCI_Init_args(&argc, &argv); bufsize = max_msgsize * ITERATIONS; buffer = (double **) malloc(sizeof(double *) * nranks); ARMCI_Malloc((void **) buffer, bufsize); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_INIT_HANDLE(&handle); ARMCI_SET_AGGREGATE_HANDLE(&handle); ARMCI_Barrier(); if (rank == 0) { printf("ARMCI_Get Bandwidth in MBPS \n"); printf("%20s %22s \n", "Message Size", "Bandwidth"); fflush(stdout); dest = 1; expected = 1 + dest; for (msgsize = sizeof(double); msgsize <= max_msgsize; msgsize *= 2) { iterations = bufsize/msgsize; t_start = MPI_Wtime(); for (i = 0; i < iterations; i++) { ARMCI_NbGet((void *) ((size_t) buffer[dest] + (size_t)(i * msgsize)), (void *) ((size_t) buffer[rank] + (size_t)(i * msgsize)), msgsize, dest, &handle); } ARMCI_Wait(&handle); t_stop = MPI_Wtime(); d_total = (iterations * msgsize) / (1024 * 1024); t_total = t_stop - t_start; bandwidth = d_total / t_total; printf("%20d %20.4lf \n", msgsize, bandwidth); fflush(stdout); #ifdef DATA_VALIDATION { for(j=0; j<((iterations*msgsize)/sizeof(double)); j++) { if(*(buffer[rank] + j) != expected) { printf("Data validation failed At displacement : %d Expected : %lf Actual : %lf \n", j, expected, *(buffer[rank] + j)); fflush(stdout); return -1; } } for(j=0; j<bufsize/sizeof(double); j++) { *(buffer[rank] + j) = 1.0 + rank; } } #endif } } ARMCI_Barrier(); ARMCI_UNSET_AGGREGATE_HANDLE(&handle); ARMCI_Free((void *) buffer[rank]); ARMCI_Finalize(); MPI_Finalize(); return 0; }
/*
 * Waits for the GPC request tracked by nbh.
 *
 * When SAMECLUSNODE reports true for the handle's target process there is
 * nothing to wait for and the call returns immediately; otherwise it waits
 * on the embedded ARMCI handle.
 */
void ARMCI_Gpc_wait(gpc_hdl_t *nbh)
{
    const int on_same_node = SAMECLUSNODE(nbh->proc);

    if (!on_same_node) {
        ARMCI_Wait(&nbh->ahdl);
    }
}
int main(int argc, char *argv[]) { int i, j, rank, nranks, msgsize, dest; int dim, iterations; long bufsize; double **buffer; double t_start, t_stop, t_total, d_total, bw; int count[2], src_stride, trg_stride, stride_level; int provided; armci_hdl_t handle; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); ARMCI_Init_args(&argc, &argv); bufsize = MAX_DIM * MAX_DIM * sizeof(double); buffer = (double **) malloc(sizeof(double *) * nranks); ARMCI_Malloc((void **) buffer, bufsize); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_INIT_HANDLE(&handle); ARMCI_SET_AGGREGATE_HANDLE(&handle); ARMCI_Barrier(); if (rank == 0) { printf("ARMCI_PutS Bandwidth in MBPS \n"); printf("%30s %22s \n", "Dimensions(array of doubles)", "Latency"); fflush(stdout); dest = 1; src_stride = MAX_DIM * sizeof(double); trg_stride = MAX_DIM * sizeof(double); stride_level = 1; for (dim = 1; dim <= MAX_DIM; dim *= 2) { count[0] = dim*sizeof(double); count[1] = dim; iterations = 10*(MAX_DIM * MAX_DIM)/(dim * dim); t_start = MPI_Wtime(); for (i = 0; i < iterations; i++) { ARMCI_NbPutS((void *) buffer[rank], &src_stride, (void *) buffer[dest], &trg_stride, count, stride_level, dest, &handle); } ARMCI_Wait(&handle); t_stop = MPI_Wtime(); ARMCI_Fence(1); char temp[10]; sprintf(temp, "%dX%d", dim, dim); t_total = t_stop - t_start; d_total = (dim*dim*sizeof(double)*iterations)/(1024*1024); bw = d_total/t_total; printf("%30s %20.2f \n", temp, bw); fflush(stdout); } } ARMCI_Barrier(); ARMCI_UNSET_AGGREGATE_HANDLE(&handle); ARMCI_Free((void *) buffer[rank]); ARMCI_Finalize(); MPI_Finalize(); return 0; }
void test_perf_nb(int dry_run) { int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; int stride, k=0, ntimes; double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9; double *dsrc[MAXPROC], scale=1.0; armci_hdl_t hdl_get, hdl_put, hdl_acc; create_array((void**)ddst, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); if(!dry_run)if(me == 0) { printf("\n\t\t\tRemote 1-D Array Section\n"); printf("section get nbget wait put nbput "); printf(" wait acc nbacc wait\n"); printf("------- -------- -------- -------- -------- --------"); printf(" -------- -------- -------- --------\n"); fflush(stdout); } for(loop=1; loop<=MAXELEMS; loop*=2, k++) { elems[1] = loop; ntimes = (int)sqrt((double)(MAXELEMS/elems[1])); if(ntimes <1) ntimes=1; /* -------------------------- SETUP --------------------------- */ /*initializing non-blocking handles,time,src & dst buffers*/ ARMCI_INIT_HANDLE(&hdl_put); ARMCI_INIT_HANDLE(&hdl_get); ARMCI_INIT_HANDLE(&hdl_acc); t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0; for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* bytes transfered */ bytes = sizeof(double)*elems[1]; MP_BARRIER(); /* -------------------------- PUT/GET -------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i))) ARMCI_Error("armci_nbput failed\n",rc); t1 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i))) ARMCI_Error("armci_nbget failed\n",rc); t4 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) 
ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ nb PUT/GET ------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put))) ARMCI_Error("armci_nbput failed\n",rc); t2 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_put); t3 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get))) ARMCI_Error("armci_nbget failed\n",rc); t5 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_get); t6 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ Accumulate ------------------------- */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0))) ARMCI_Error("armci_acc failed\n",rc); t7 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #if 1 /* See the note below why this part is disabled */ /* ---------------------- nb-Accumulate ------------------------ */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) ARMCI_Error("armci_nbacc failed\n",rc); t8 += MP_TIMER()-stime; stime=MP_TIMER(); 
ARMCI_Wait(&hdl_acc); t9 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #endif /* print timings */ if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes); } ARMCI_AllFence(); MP_BARRIER(); if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst); destroy_array((void **)dsrc); }