double time_acc(double *src_buf, double *dst_buf, int chunk, int loop, int proc, int levels) { int i, bal = 0; int stride[2]; int count[2]; int stride_levels = levels; double *before_buf, *after_buf; double start_time, stop_time, total_time = 0; stride[0] = SIZE * sizeof(double); count[0] = chunk * sizeof(double); count[1] = chunk; if(CHECK_RESULT) { before_buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(before_buf != NULL); after_buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(after_buf != NULL); } start_time = TIMER(); for(i=0; i<loop; i++) { double scale = (double)i; if(CHECK_RESULT) { ARMCI_GetS(dst_buf, stride, before_buf, stride, count, stride_levels, proc); acc_array(scale, before_buf, src_buf, stride, count,stride_levels); } ARMCI_AccS(ARMCI_ACC_DBL, &scale, src_buf, stride, dst_buf, stride, count, stride_levels, proc); if(CHECK_RESULT) { ARMCI_GetS(dst_buf, stride, after_buf, stride, count, stride_levels, proc); sprintf(check_type, "ARMCI_AccS:"); check_result(after_buf, before_buf, stride, count, stride_levels); } /* prepare next src and dst ptrs: avoid cache locality */ if(bal == 0) { src_buf += 128; dst_buf += 128; bal = 1; } else { src_buf -= 128; dst_buf -= 128; bal = 0; } } stop_time = TIMER(); total_time = (stop_time - start_time); if(CHECK_RESULT) { free(before_buf); free(after_buf); } if(total_time == 0.0){ total_time=0.000001; /* workaround for inaccurate timers */ warn_accuracy++; } return(total_time/loop); }
void test_acc_type(const int datatype) { int i = 0; int datatype_size = 0; void * scale; void * a; void *b[MAXPROC]; int elems = ELEMS; int dim = 1; int count = 0; int strideA = 0; int strideB = 0; switch(datatype) { case ARMCI_ACC_INT: datatype_size = sizeof(int); scale = malloc(datatype_size); *((int *) scale) = 1; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((int *) a)[i] = i + me; ((int *) b[me])[i] = 0; } break; case ARMCI_ACC_LNG: datatype_size = sizeof(long); scale = malloc(datatype_size); *((long *) scale) = 1; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((long *) a)[i] = i + me; ((long *) b[me])[i] = 0; } break; case ARMCI_ACC_FLT: datatype_size = sizeof(float); scale = malloc(datatype_size); *((float *) scale) = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((float *) a)[i] = (float) i + me; ((float *) b[me])[i] = 0.0; } break; case ARMCI_ACC_DBL: datatype_size = sizeof(double); scale = malloc(datatype_size); *((double *) scale) = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((double *) a)[i] = (double) i + me; ((double *) b[me])[i] = 0.0; } break; case ARMCI_ACC_CPL: datatype_size = sizeof(cmpl_t); scale = malloc(datatype_size); ((cmpl_t *) scale)->real = 2.0; ((cmpl_t *) scale)->imag = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((cmpl_t *) a)[i].real = ((float) i + me); ((cmpl_t *) a)[i].imag = ((float) i + me); ((cmpl_t *) b[me])[i].real = 0.0; ((cmpl_t *) b[me])[i].imag = 0.0; } break; case ARMCI_ACC_DCP: datatype_size = sizeof(dcmpl_t); scale = malloc(datatype_size); ((dcmpl_t *) scale)->real = 2.0; ((dcmpl_t *) scale)->imag = 1.0; a = malloc(elems * datatype_size); 
create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((dcmpl_t *) a)[i].real = ((double) i + me); ((dcmpl_t *) a)[i].imag = ((double) i + me); ((dcmpl_t *) b[me])[i].real = 0.0; ((dcmpl_t *) b[me])[i].imag = 0.0; } break; default: return; break; } count = elems * datatype_size; strideA = elems * datatype_size; strideB = elems * datatype_size; ARMCI_AllFence(); MP_BARRIER(); for(i = 0; i < nproc; i++) ARMCI_AccS(datatype, scale, a, &strideA, b[(me + i) % nproc], &strideB, &count, 0, (me + i) % nproc); ARMCI_AllFence(); MP_BARRIER(); switch(datatype) { case ARMCI_ACC_INT: for(i = 0; i < elems; i++) { int compare = (i * nproc) + nproc / 2 * (nproc - 1); if(((int *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_INT [%d] = %d != %d\n", i, ((int *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_LNG: for(i = 0; i < elems; i++) { long compare = (i * nproc) + nproc / 2 * (nproc - 1); if(((long *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_LNG [%d] = %d != %ld\n", i, ((int *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_FLT: for(i = 0; i < elems; i++) { float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((float *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_FLT [%d] = %f != %f\n", i, ((float *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_DBL: for(i = 0; i < elems; i++) { double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((double *)b[me])[i] != (double) ((i * nproc) + nproc / 2 * (nproc - 1))) { printf("ERROR accumulate ARMCI_ACC_DBL [%d] = %f != %f \n", i, ((double *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_CPL: for(i = 0; i < elems; i++) { float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((cmpl_t *)b[me])[i].real != compare && ((cmpl_t *)b[me])[i].imag != 3 * compare) { 
printf("ERROR accumulate ARMCI_ACC_CPL [%d] = %f + %fj != %f + %fj\n", i, ((cmpl_t *)b[me])[i].real, ((cmpl_t *)b[me])[i].imag, compare, 3 * compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_DCP: for(i = 0; i < elems; i++) { double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((dcmpl_t *)b[me])[i].real != compare && ((dcmpl_t *)b[me])[i].imag != 3 * compare) { printf("ERROR accumulate ARMCI_ACC_DCP [%d] = %f + %fj != %f + %fj\n", i, ((dcmpl_t *)b[me])[i].real, ((dcmpl_t *)b[me])[i].imag, compare, 3 * compare); ARMCI_Error("test_acc_type failed\n",0); } } break; default: break; } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("O.K.\n\n"); fflush(stdout);} destroy_array((void**)b); free(a); free(scale); }
/* test Put/Get/Acc sequence regardless of communication pattern * tgt -- remote target for put/get/acc (none if -1) * rmt -- list of remote thread that put/acc to here (correctness is cheked here) * rmt_cnt -- # of threads in rmt */ void test_PutGetAcc(int th_idx, int tgt, int *rmt, int rmt_cnt) { /* a - local thread, b - remote thread */ int a, b, b_proc, stride[2], count[2]; int i, j; void *src, *dst; #ifdef DEBUG for (i = 0, cbufl = 0; i < rmt_cnt; i++) cbufl += sprintf(cbuf+cbufl, " %d", rmt[i]); prndbg(th_idx, "test_PutGetAcc: put/acc to %d, get from %d, check put/acc from %s\n", tgt, tgt, rmt_cnt ? cbuf : "none"); #endif a = TH_ME; stride[0] = ASIZE_BYTES; count[0] = ASIZE_BYTES; count[1] = 1; /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* put - put a.ptrs1[b] into b.ptrs2[a] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */ dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */ // assert(!ARMCI_Put(src, dst, ASIZE_BYTES, b_proc)); assert(!ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc)); } ARMCI_Fence(b_proc); } MT_BARRIER(); print_array(th_idx, "PUT:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "PUT:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk put(s) from b(s): a.ptrs2[b] */ for (j = 0; j < rmt_cnt; j++) { b = rmt[j]; b_proc = TH2PROC(b); check_PutGetAcc(th_idx, b, PUT, &AELEM(ptrs2[a], b, 0, 0)); } //return; // REMOVE WHEN DONE /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* get - get b.ptrs1[a] into a.ptrs2[b] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[b], a, i, 0); /* b.ptrs1[a] */ dst = &AELEM(ptrs2[a], b, i, 0); /* a.ptrs2[b] */ assert(!ARMCI_GetS(src, stride, dst, stride, count, 1, b_proc)); } } print_array(th_idx, "GET:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "GET:ptrs2[TH_ME]", 
ptrs2[TH_ME]); MT_BARRIER(); /* chk get from b: a.ptrs2[b] */ if (tgt != -1) { check_PutGetAcc(th_idx, b, GET, &AELEM(ptrs2[a], b, 0, 0)); } #if 1 /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* acc - acc a.ptrs1[b] * scale + b.ptrs2[a] into b.ptrs2[a] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */ dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */ assert(!ARMCI_AccS(ARMCI_ACC_DBL,&scale,src,stride,dst,stride,count,1,b_proc)); } ARMCI_Fence(b_proc); } MT_BARRIER(); print_array(th_idx, "ACC:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "ACC:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk acc(s) from b(s): a.ptrs2[b] */ for (j = 0; j < rmt_cnt; j++) { b = rmt[j]; b_proc = TH2PROC(b); check_PutGetAcc(th_idx, b, ACC, &AELEM(ptrs2[a], b, 0, 0)); } #endif MT_BARRIER(); }
int main(int argc, char **argv) { int i, j, rank, nranks, peer; size_t xdim, ydim; unsigned long bufsize; double **buffer, *src_buf; double t_start=0.0, t_stop; int count[2], src_stride, trg_stride, stride_level; double scaling; int provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); if (nranks < 2) { printf("%s: Must be run with at least 2 processes\n", argv[0]); MPI_Abort(MPI_COMM_WORLD, 1); } ARMCI_Init_args(&argc, &argv); buffer = (double **) malloc(sizeof(double *) * nranks); bufsize = MAX_XDIM * MAX_YDIM * sizeof(double); ARMCI_Malloc((void **) buffer, bufsize); src_buf = ARMCI_Malloc_local(bufsize); if (rank == 0) { printf("ARMCI_AccS Latency - local and remote completions - in usec \n"); printf("%30s %22s %22s\n", "Dimensions(array of double)", "Local Completion", "Remote completion"); fflush(stdout); } ARMCI_Access_begin(buffer[rank]); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; *(src_buf + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); scaling = 2.0; src_stride = MAX_YDIM * sizeof(double); trg_stride = MAX_YDIM * sizeof(double); stride_level = 1; ARMCI_Barrier(); for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) { count[1] = xdim; for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) { count[0] = ydim * sizeof(double); if (rank == 0) { peer = 1; for (i = 0; i < ITERATIONS + SKIP; i++) { if (i == SKIP) t_start = MPI_Wtime(); ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling, /* (void *) buffer[rank] */ src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, 1); } t_stop = MPI_Wtime(); ARMCI_Fence(1); char temp[10]; sprintf(temp, "%dX%d", (int) xdim, (int) ydim); printf("%30s %20.2f ", temp, ((t_stop - t_start) * 1000000) / ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); for (i = 0; i < ITERATIONS + SKIP; i++) { if (i == SKIP) t_start = MPI_Wtime(); ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling, /* (void 
*) buffer[rank] */ src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, 1); ARMCI_Fence(1); } t_stop = MPI_Wtime(); printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); } else { peer = 0; ARMCI_Barrier(); if (rank == 1) { ARMCI_Access_begin(buffer[rank]); for (i = 0; i < xdim; i++) { for (j = 0; j < ydim; j++) { if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP))) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, ((1.0 + rank) + scaling * (1.0 + peer)), *(buffer[rank] + i * MAX_YDIM + j)); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); } ARMCI_Barrier(); ARMCI_Barrier(); if (rank == 1) { ARMCI_Access_begin(buffer[rank]); for (i = 0; i < xdim; i++) { for (j = 0; j < ydim; j++) { if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP))) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, ((1.0 + rank) + scaling * (1.0 + peer)), *(buffer[rank] + i * MAX_YDIM + j)); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); } ARMCI_Barrier(); } } } ARMCI_Barrier(); ARMCI_Free((void *) buffer[rank]); ARMCI_Free_local(src_buf); free(buffer); ARMCI_Finalize(); MPI_Finalize(); return 0; }
// Accumulate (scale * buf) into the part of a distributed array, described by
// `patch`, that is held by `proc`.
//
//   patch - sub-patch to update; its handle selects the array, ilo/ihi and
//           jlo/jhi bound the rows/columns, size is used as the transfer
//           length when whole columns are moved (presumably bytes - confirm
//           against where patch->size is set)
//   scale - scale factor handed to ARMCI unchanged (ARMCI_ACC_DBL)
//   buf   - local source data
//   proc  - owner of the target segment
//
// Returns the number of ARMCI operations issued, which is always 1.
// ARMCI_Acc is apparently not always present, so the strided call is used
// for the contiguous case as well (with zero stride levels).
inline int DDI_ARMCI_Acc_proc(DDI_Patch *patch, void *scale, void *buf, int proc) {
  const int handle = patch->handle;
  const int nops = 1;
  DDA_ARMCI_Index *armci_index = gv(dda_armci_index)[handle];
  char *src = (char*)buf;
  char *dst;
  int src_stride_arr[2], dst_stride_arr[2], count[2];
  int stride_levels;
  int armci_proc;

  // Target segment shape (t*) and patch shape (n*).
  const int trows = gv(dda_index)[handle].nrows;
  const int tcols = gv(pcmap)[handle][proc+1] - gv(pcmap)[handle][proc];
  const int nrows = patch->ihi - patch->ilo + 1;
  const int ncols = patch->jhi - patch->jlo + 1;

  // Byte offset of the patch origin inside the target segment.
  size_t offset = (patch->jlo - gv(pcmap)[handle][proc])*trows + patch->ilo;
  offset *= sizeof(double);

  DDI_ARMCI_Acquire(armci_index,handle,proc,DDI_WRITE_ACCESS,(void**)&dst,&armci_proc);
  dst += offset;

  // i dimensions
  src_stride_arr[0] = sizeof(double)*nrows;
  dst_stride_arr[0] = sizeof(double)*trows;

  if (nrows != trows) {
    // j dimensions
    src_stride_arr[1] = src_stride_arr[0]*ncols;
    dst_stride_arr[1] = dst_stride_arr[0]*tcols;

    // block size count, first dimension must be in bytes
    count[0] = sizeof(double)*nrows;
    count[1] = ncols;
    stride_levels = 1;
  } else {
    // Whole columns: the patch is one contiguous run in both buffers.
    count[0] = patch->size;
    stride_levels = 0;
  }

#if defined DDI_ARMCI_IMPLICIT_NBACC
  ARMCI_NbAccS(ARMCI_ACC_DBL, scale,
               (void*)src, src_stride_arr,
               (void*)dst, dst_stride_arr,
               count, stride_levels, armci_proc, NULL);
#else
  ARMCI_AccS(ARMCI_ACC_DBL, scale,
             (void*)src, src_stride_arr,
             (void*)dst, dst_stride_arr,
             count, stride_levels, armci_proc);
#endif

  DDI_ARMCI_Release(armci_index,handle,proc,DDI_WRITE_ACCESS);

  return nops;
}
void test_perf_nb(int dry_run) { int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; int stride, k=0, ntimes; double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9; double *dsrc[MAXPROC], scale=1.0; armci_hdl_t hdl_get, hdl_put, hdl_acc; create_array((void**)ddst, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); if(!dry_run)if(me == 0) { printf("\n\t\t\tRemote 1-D Array Section\n"); printf("section get nbget wait put nbput "); printf(" wait acc nbacc wait\n"); printf("------- -------- -------- -------- -------- --------"); printf(" -------- -------- -------- --------\n"); fflush(stdout); } for(loop=1; loop<=MAXELEMS; loop*=2, k++) { elems[1] = loop; ntimes = (int)sqrt((double)(MAXELEMS/elems[1])); if(ntimes <1) ntimes=1; /* -------------------------- SETUP --------------------------- */ /*initializing non-blocking handles,time,src & dst buffers*/ ARMCI_INIT_HANDLE(&hdl_put); ARMCI_INIT_HANDLE(&hdl_get); ARMCI_INIT_HANDLE(&hdl_acc); t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0; for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* bytes transfered */ bytes = sizeof(double)*elems[1]; MP_BARRIER(); /* -------------------------- PUT/GET -------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i))) ARMCI_Error("armci_nbput failed\n",rc); t1 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i))) ARMCI_Error("armci_nbget failed\n",rc); t4 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) 
ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ nb PUT/GET ------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put))) ARMCI_Error("armci_nbput failed\n",rc); t2 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_put); t3 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get))) ARMCI_Error("armci_nbget failed\n",rc); t5 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_get); t6 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ Accumulate ------------------------- */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0))) ARMCI_Error("armci_acc failed\n",rc); t7 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #if 1 /* See the note below why this part is disabled */ /* ---------------------- nb-Accumulate ------------------------ */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) ARMCI_Error("armci_nbacc failed\n",rc); t8 += MP_TIMER()-stime; stime=MP_TIMER(); 
ARMCI_Wait(&hdl_acc); t9 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #endif /* print timings */ if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes); } ARMCI_AllFence(); MP_BARRIER(); if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst); destroy_array((void **)dsrc); }
int main(int argc, char **argv) { int i, j, rank, nranks, peer, bufsize, errors, total_errors; double **buf_bvec, **src_bvec, *src_buf; int count[2], src_stride, trg_stride, stride_level; double scaling, time; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); buf_bvec = (double **) malloc(sizeof(double *) * nranks); src_bvec = (double **) malloc(sizeof(double *) * nranks); bufsize = XDIM * YDIM * sizeof(double); ARMCI_Malloc((void **) buf_bvec, bufsize); ARMCI_Malloc((void **) src_bvec, bufsize); src_buf = src_bvec[rank]; if (rank == 0) printf("ARMCI Strided DLA Accumulate Test:\n"); ARMCI_Access_begin(buf_bvec[rank]); ARMCI_Access_begin(src_buf); for (i = 0; i < XDIM*YDIM; i++) { *(buf_bvec[rank] + i) = 1.0 + rank; *(src_buf + i) = 1.0 + rank; } ARMCI_Access_end(src_buf); ARMCI_Access_end(buf_bvec[rank]); scaling = 2.0; src_stride = XDIM * sizeof(double); trg_stride = XDIM * sizeof(double); stride_level = 1; count[1] = YDIM; count[0] = XDIM * sizeof(double); ARMCI_Barrier(); time = MPI_Wtime(); peer = (rank+1) % nranks; for (i = 0; i < ITERATIONS; i++) { ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling, src_buf, &src_stride, (void *) buf_bvec[peer], &trg_stride, count, stride_level, peer); } ARMCI_Barrier(); time = MPI_Wtime() - time; if (rank == 0) printf("Time: %f sec\n", time); ARMCI_Access_begin(buf_bvec[rank]); for (i = errors = 0; i < XDIM; i++) { for (j = 0; j < YDIM; j++) { const double actual = *(buf_bvec[rank] + i + j*XDIM); const double expected = (1.0 + rank) + scaling * (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS); if (actual - expected > 1e-10) { printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n", rank, j, i, expected, actual); errors++; fflush(stdout); } } } ARMCI_Access_end(buf_bvec[rank]); MPI_Allreduce(&errors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ARMCI_Free((void *) buf_bvec[rank]); ARMCI_Free((void *) src_bvec[rank]); free(buf_bvec); 
free(src_bvec); ARMCI_Finalize(); MPI_Finalize(); if (total_errors == 0) { if (rank == 0) printf("Success.\n"); return 0; } else { if (rank == 0) printf("Fail.\n"); return 1; } }