/*
 * Accumulate one patch of a local buffer into the part of a distributed
 * array that belongs to process `proc`, using an ARMCI strided accumulate.
 *
 * patch  Patch descriptor. Reads handle, ilo/ihi (row range), jlo/jhi
 *        (column range) and size.
 * scale  Pointer to the scale factor; forwarded unchanged to ARMCI together
 *        with the ARMCI_ACC_DBL operation code.
 * buf    Source data. It is addressed as nrows x ncols doubles with columns
 *        stored back to back (column stride = nrows doubles).
 * proc   Index used to look up the column range in pcmap and passed to
 *        DDI_ARMCI_Acquire/DDI_ARMCI_Release.
 *
 * Returns the number of operations issued, which is always 1.
 *
 * NOTE(review): the return code of the ARMCI accumulate call is not checked,
 * exactly as in the previous revision of this function.
 */
inline int DDI_ARMCI_Acc_proc(DDI_Patch *patch, void *scale, void *buf, int proc) {
    int handle = patch->handle;
    int nops = 1;
    DDA_ARMCI_Index *armci_index = gv(dda_armci_index)[handle];
    int trows, tcols, nrows, ncols;
    size_t offset;
    char *dst, *src = (char *)buf;
    int src_stride_arr[2], dst_stride_arr[2], count[2];
    int stride_levels;
    int armci_proc;

    // Shape of the target block: total rows of the array and the number of
    // columns mapped to `proc` according to pcmap.
    trows = gv(dda_index)[handle].nrows;
    tcols = gv(pcmap)[handle][proc+1] - gv(pcmap)[handle][proc];

    // Shape of the patch being accumulated.
    nrows = patch->ihi - patch->ilo + 1;
    ncols = patch->jhi - patch->jlo + 1;

    // Byte offset of element (ilo, jlo) inside the target block. The
    // arithmetic is done in size_t so that (columns * rows) cannot overflow
    // int for large blocks; for in-range values the result is unchanged.
    offset  = (size_t)(patch->jlo - gv(pcmap)[handle][proc]) * (size_t)trows
            + (size_t)patch->ilo;
    offset *= sizeof(double);

    DDI_ARMCI_Acquire(armci_index, handle, proc, DDI_WRITE_ACCESS,
                      (void **)&dst, &armci_proc);
    dst += offset;

    // i dimension: distance in bytes between consecutive columns.
    src_stride_arr[0] = sizeof(double)*nrows;
    dst_stride_arr[0] = sizeof(double)*trows;

    if (nrows == trows) {
        // The patch covers whole columns, so source and destination are each
        // one contiguous run. A plain ARMCI_Acc is apparently not always
        // present, so the strided call is used with zero stride levels.
        // NOTE(review): patch->size is used as the byte count here; this
        // assumes it holds the patch size in bytes - confirm where it is set.
        count[0] = patch->size;
        stride_levels = 0;
    } else {
        // j dimension
        src_stride_arr[1] = src_stride_arr[0]*ncols;
        dst_stride_arr[1] = dst_stride_arr[0]*tcols;

        // Block size count; the first dimension must be in bytes.
        count[0] = sizeof(double)*nrows;
        count[1] = ncols;
        stride_levels = 1;
    }

#if defined DDI_ARMCI_IMPLICIT_NBACC
    // Non-blocking accumulate with a NULL handle.
    ARMCI_NbAccS(ARMCI_ACC_DBL, scale, (void*)src, src_stride_arr,
                 (void*)dst, dst_stride_arr, count, stride_levels,
                 armci_proc, NULL);
#else
    ARMCI_AccS(ARMCI_ACC_DBL, scale, (void*)src, src_stride_arr,
               (void*)dst, dst_stride_arr, count, stride_levels,
               armci_proc);
#endif

    DDI_ARMCI_Release(armci_index, handle, proc, DDI_WRITE_ACCESS);

    return nops;
}
void test_perf_nb(int dry_run) { int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; int stride, k=0, ntimes; double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9; double *dsrc[MAXPROC], scale=1.0; armci_hdl_t hdl_get, hdl_put, hdl_acc; create_array((void**)ddst, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); if(!dry_run)if(me == 0) { printf("\n\t\t\tRemote 1-D Array Section\n"); printf("section get nbget wait put nbput "); printf(" wait acc nbacc wait\n"); printf("------- -------- -------- -------- -------- --------"); printf(" -------- -------- -------- --------\n"); fflush(stdout); } for(loop=1; loop<=MAXELEMS; loop*=2, k++) { elems[1] = loop; ntimes = (int)sqrt((double)(MAXELEMS/elems[1])); if(ntimes <1) ntimes=1; /* -------------------------- SETUP --------------------------- */ /*initializing non-blocking handles,time,src & dst buffers*/ ARMCI_INIT_HANDLE(&hdl_put); ARMCI_INIT_HANDLE(&hdl_get); ARMCI_INIT_HANDLE(&hdl_acc); t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0; for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* bytes transfered */ bytes = sizeof(double)*elems[1]; MP_BARRIER(); /* -------------------------- PUT/GET -------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i))) ARMCI_Error("armci_nbput failed\n",rc); t1 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i))) ARMCI_Error("armci_nbget failed\n",rc); t4 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) 
ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ nb PUT/GET ------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put))) ARMCI_Error("armci_nbput failed\n",rc); t2 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_put); t3 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get))) ARMCI_Error("armci_nbget failed\n",rc); t5 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_get); t6 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ Accumulate ------------------------- */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0))) ARMCI_Error("armci_acc failed\n",rc); t7 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #if 1 /* See the note below why this part is disabled */ /* ---------------------- nb-Accumulate ------------------------ */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) ARMCI_Error("armci_nbacc failed\n",rc); t8 += MP_TIMER()-stime; stime=MP_TIMER(); 
ARMCI_Wait(&hdl_acc); t9 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #endif /* print timings */ if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes); } ARMCI_AllFence(); MP_BARRIER(); if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst); destroy_array((void **)dsrc); }