/** One-sided copy of data from the source to the destination. Set a flag on
 * the remote process when the transfer is complete.
 *
 * @param[in] src   Source buffer
 * @param[in] dst   Destination buffer on proc
 * @param[in] size  Number of bytes to transfer
 * @param[in] flag  Address of the flag buffer on proc
 * @param[in] value Value to set the flag to
 * @param[in] proc  Process id of the target
 * @return 0 on success, non-zero on failure
 */
int ARMCI_Put_flag(void *src, void *dst, int size, int *flag, int value, int proc)
{
  /* Deliver the payload, fence to force remote completion, then set the
   * flag. The fence between the two puts is what makes the flag a valid
   * "data has arrived" signal. */
  ARMCI_Put(src, dst, size, proc);
  ARMCI_Fence(proc);
  ARMCI_Put(&value, flag, sizeof(int), proc);
  return 0;
}
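/* Companion sketch (not part of the original source): how a consumer might
 * wait on the flag written by ARMCI_Put_flag above. The flag must live in
 * ARMCI-allocated, remotely accessible memory; the volatile busy-wait is an
 * illustrative assumption, not a documented ARMCI waiting API. */
void wait_for_flag(volatile int *flag, int value)
{
  /* Spin until the producer's second ARMCI_Put lands. Because the producer
   * fenced the data transfer before writing the flag, observing the flag
   * implies the payload is complete. */
  while (*flag != value)
    ; /* busy-wait; a real test would back off or yield */
}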
void test_one_group(ARMCI_Group *group, int *pid_list)
{
  int grp_me, grp_size;
  int i, j, src_proc, dst_proc;
  double *ddst_put[MAXPROC];
  double dsrc[ELEMS];
  int bytes, world_me;

  MP_MYID(&world_me);
  ARMCI_Group_rank(group, &grp_me);
  ARMCI_Group_size(group, &grp_size);
  if (grp_me == 0) printf("GROUP SIZE = %d\n", grp_size);
  printf("%d:group rank = %d\n", world_me, grp_me);

  src_proc = 0;
  dst_proc = grp_size - 1;
  bytes = ELEMS * sizeof(double);

  /* collective allocation over the group only */
  ARMCI_Malloc_group((void **)ddst_put, bytes, group);

  for (i = 0; i < ELEMS; i++) dsrc[i] = i * 1.001 * (grp_me + 1);
  for (i = 0; i < ELEMS; i++) ddst_put[grp_me][i] = -1.0;

  armci_msg_group_barrier(group);

  if (grp_me == src_proc) {
    /* NOTE: make sure to specify absolute ids in ARMCI calls */
    ARMCI_Put(dsrc, &ddst_put[dst_proc][0], bytes,
              ARMCI_Absolute_id(group, dst_proc));
  }
  armci_msg_group_barrier(group);
  /* NOTE: make sure to specify absolute ids in ARMCI calls */
  ARMCI_Fence(ARMCI_Absolute_id(group, dst_proc));
  sleep(1);

  /* Verify */
  if (grp_me == dst_proc) {
    for (j = 0; j < ELEMS; j++) {
      if (ARMCI_ABS(ddst_put[grp_me][j] - j * 1.001 * (src_proc + 1)) > 0.1) {
        printf("\t%d: ddst_put[%d][%d] = %lf and expected value is %lf\n",
               world_me, grp_me, j, ddst_put[grp_me][j],
               j * 1.001 * (src_proc + 1));
        ARMCI_Error("groups: armci put failed...1", 0);
      }
    }
    printf("\n%d(%d): Test O.K. Verified\n", dst_proc, world_me);
  }
  armci_msg_group_barrier(group);
  ARMCI_Free_group(ddst_put[grp_me], group);
}
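/* Sketch (assumption, not from the original file): how a caller might build
 * the group passed to test_one_group(). Group creation is collective; the
 * signatures ARMCI_Group_create(n, pid_list, &group) and
 * ARMCI_Group_free(&group) follow the ARMCI groups API as I understand it
 * and should be checked against armci.h. */
void run_subgroup_test(int me, int nproc)
{
  ARMCI_Group group;
  int pid_list[MAXPROC];
  int i, grp_nproc = (nproc + 1) / 2;

  for (i = 0; i < grp_nproc; i++) pid_list[i] = i; /* first half of the world */

  ARMCI_Group_create(grp_nproc, pid_list, &group); /* collective */
  if (me < grp_nproc)                              /* only members drive the test */
    test_one_group(&group, pid_list);
  ARMCI_Group_free(&group);
  armci_msg_barrier();
}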
int main(int argc, char *argv[])
{
  void *baseAddress[MAX_PROCESSORS];
  char *local;
  int thisImage;
  int iter = 100, size;
  double startTime, endTime;
  int i;

  // initialize
  ARMCI_Init();
  ARMCI_Myid(&thisImage);

  // allocate data (collective operation)
  ARMCI_Malloc(baseAddress, MAX_BUF_SIZE * sizeof(char));
  local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE * sizeof(char));

  ARMCI_Barrier();
  ARMCI_Migrate();

  if (thisImage == 0) {
    for (size = 1; size <= MAX_BUF_SIZE; size = size << 1) {
      startTime = CkWallTimer();
      for (i = 0; i < iter; i++) {
        ARMCI_Put(local, baseAddress[1], size, 1);
      }
      ARMCI_Fence(1);
      endTime = CkWallTimer();
      // CkWallTimer() returns seconds; report average microseconds per put
      printf("%d: %f us\n", size, (endTime - startTime) * 1e6 / iter);
    }
  }
  // all images synchronize here (the original barriered only images 0 and 1,
  // which hangs with more than two images)
  ARMCI_Barrier();

  ARMCI_Free(baseAddress[thisImage]);
  ARMCI_Free_local(local);

  // finalize
  ARMCI_Finalize();
  return 0;
}
/* test Put/Get/Acc sequence regardless of communication pattern
 * tgt     -- remote target for put/get/acc (none if -1)
 * rmt     -- list of remote threads that put/acc to here (correctness is checked here)
 * rmt_cnt -- # of threads in rmt
 */
void test_PutGetAcc(int th_idx, int tgt, int *rmt, int rmt_cnt)
{
  /* a - local thread, b - remote thread */
  int a, b, b_proc, stride[2], count[2];
  int i, j;
  void *src, *dst;

#ifdef DEBUG
  for (i = 0, cbufl = 0; i < rmt_cnt; i++)
    cbufl += sprintf(cbuf + cbufl, " %d", rmt[i]);
  prndbg(th_idx, "test_PutGetAcc: put/acc to %d, get from %d, check put/acc from %s\n",
         tgt, tgt, rmt_cnt ? cbuf : "none");
#endif

  a = TH_ME;
  stride[0] = ASIZE_BYTES;
  count[0] = ASIZE_BYTES;
  count[1] = 1;

  /* init arrays */
  init_array(th_idx, ptrs1[TH_ME]);
  init_array(th_idx, ptrs2[TH_ME]);
  MT_BARRIER();

  /* put - put a.ptrs1[b] into b.ptrs2[a] */
  if (tgt != -1) {
    b = tgt;
    b_proc = TH2PROC(b);
    for (i = 0; i < iters; i++) {
      src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */
      dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */
      /* contiguous equivalent: ARMCI_Put(src, dst, ASIZE_BYTES, b_proc) */
      assert(!ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc));
    }
    ARMCI_Fence(b_proc);
  }

  MT_BARRIER();
  print_array(th_idx, "PUT:ptrs1[TH_ME]", ptrs1[TH_ME]);
  print_array(th_idx, "PUT:ptrs2[TH_ME]", ptrs2[TH_ME]);
  MT_BARRIER();

  /* chk put(s) from b(s): a.ptrs2[b] */
  for (j = 0; j < rmt_cnt; j++) {
    b = rmt[j];
    b_proc = TH2PROC(b);
    check_PutGetAcc(th_idx, b, PUT, &AELEM(ptrs2[a], b, 0, 0));
  }

  /* init arrays */
  init_array(th_idx, ptrs1[TH_ME]);
  init_array(th_idx, ptrs2[TH_ME]);
  MT_BARRIER();

  /* get - get b.ptrs1[a] into a.ptrs2[b] */
  if (tgt != -1) {
    b = tgt;
    b_proc = TH2PROC(b);
    for (i = 0; i < iters; i++) {
      src = &AELEM(ptrs1[b], a, i, 0); /* b.ptrs1[a] */
      dst = &AELEM(ptrs2[a], b, i, 0); /* a.ptrs2[b] */
      assert(!ARMCI_GetS(src, stride, dst, stride, count, 1, b_proc));
    }
  }

  print_array(th_idx, "GET:ptrs1[TH_ME]", ptrs1[TH_ME]);
  print_array(th_idx, "GET:ptrs2[TH_ME]", ptrs2[TH_ME]);
  MT_BARRIER();

  /* chk get from b: a.ptrs2[b] */
  if (tgt != -1) {
    check_PutGetAcc(th_idx, b, GET, &AELEM(ptrs2[a], b, 0, 0));
  }

  /* init arrays */
  init_array(th_idx, ptrs1[TH_ME]);
  init_array(th_idx, ptrs2[TH_ME]);
  MT_BARRIER();

  /* acc - acc a.ptrs1[b] * scale + b.ptrs2[a] into b.ptrs2[a] */
  if (tgt != -1) {
    b = tgt;
    b_proc = TH2PROC(b);
    for (i = 0; i < iters; i++) {
      src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */
      dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */
      assert(!ARMCI_AccS(ARMCI_ACC_DBL, &scale, src, stride, dst, stride,
                         count, 1, b_proc));
    }
    ARMCI_Fence(b_proc);
  }

  MT_BARRIER();
  print_array(th_idx, "ACC:ptrs1[TH_ME]", ptrs1[TH_ME]);
  print_array(th_idx, "ACC:ptrs2[TH_ME]", ptrs2[TH_ME]);
  MT_BARRIER();

  /* chk acc(s) from b(s): a.ptrs2[b] */
  for (j = 0; j < rmt_cnt; j++) {
    b = rmt[j];
    b_proc = TH2PROC(b);
    check_PutGetAcc(th_idx, b, ACC, &AELEM(ptrs2[a], b, 0, 0));
  }

  MT_BARRIER();
}
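/* Aside (illustrative sketch, not part of the original test): with
 * stride_levels == 1, ARMCI_PutS copies count[0] bytes count[1] times,
 * advancing through src/dst by the stride arrays between blocks. The calls
 * in test_PutGetAcc use count[1] == 1, so each moves a single contiguous
 * ASIZE_BYTES block: */
void contiguous_putS(void *src, void *dst, int b_proc)
{
  int stride[1] = { ASIZE_BYTES };     /* block-to-block distance (one block here) */
  int count[2]  = { ASIZE_BYTES, 1 };  /* count[0] bytes x count[1] blocks */

  /* equivalent to the contiguous call ARMCI_Put(src, dst, ASIZE_BYTES, b_proc) */
  ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc);
}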
int main(int argc, char **argv)
{
  int i, j, rank, nranks, peer;
  size_t xdim, ydim;
  unsigned long bufsize;
  double **buffer, *src_buf;
  double t_start = 0.0, t_stop;
  int count[2], src_stride, trg_stride, stride_level;
  double scaling;
  int provided;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  if (nranks < 2) {
    printf("%s: Must be run with at least 2 processes\n", argv[0]);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  ARMCI_Init_args(&argc, &argv);

  buffer = (double **) malloc(sizeof(double *) * nranks);
  bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
  ARMCI_Malloc((void **) buffer, bufsize);
  src_buf = ARMCI_Malloc_local(bufsize);

  if (rank == 0) {
    printf("ARMCI_AccS Latency - local and remote completions - in usec \n");
    printf("%30s %22s %22s\n", "Dimensions(array of doubles)",
           "Local Completion", "Remote Completion");
    fflush(stdout);
  }

  ARMCI_Access_begin(buffer[rank]);
  for (i = 0; i < bufsize / sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
    *(src_buf + i) = 1.0 + rank;
  }
  ARMCI_Access_end(buffer[rank]);

  scaling = 2.0;
  src_stride = MAX_YDIM * sizeof(double);
  trg_stride = MAX_YDIM * sizeof(double);
  stride_level = 1;

  ARMCI_Barrier();

  for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) {
    count[1] = xdim;
    for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) {
      count[0] = ydim * sizeof(double);

      if (rank == 0) {
        peer = 1;

        /* local completion: time the issue rate, fence once afterwards */
        for (i = 0; i < ITERATIONS + SKIP; i++) {
          if (i == SKIP) t_start = MPI_Wtime();
          ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling,
                     src_buf, &src_stride,
                     (void *) buffer[peer], &trg_stride,
                     count, stride_level, peer);
        }
        t_stop = MPI_Wtime();
        ARMCI_Fence(peer);

        char temp[10];
        sprintf(temp, "%dX%d", (int) xdim, (int) ydim);
        printf("%30s %20.2f ", temp,
               ((t_stop - t_start) * 1000000) / ITERATIONS);
        fflush(stdout);

        ARMCI_Barrier();
        ARMCI_Barrier();

        /* remote completion: fence after every operation */
        for (i = 0; i < ITERATIONS + SKIP; i++) {
          if (i == SKIP) t_start = MPI_Wtime();
          ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling,
                     src_buf, &src_stride,
                     (void *) buffer[peer], &trg_stride,
                     count, stride_level, peer);
          ARMCI_Fence(peer);
        }
        t_stop = MPI_Wtime();
        printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS);
        fflush(stdout);

        ARMCI_Barrier();
        ARMCI_Barrier();
      }
      else {
        peer = 0;

        ARMCI_Barrier();
        if (rank == 1) {
          ARMCI_Access_begin(buffer[rank]);
          for (i = 0; i < xdim; i++) {
            for (j = 0; j < ydim; j++) {
              /* every iteration accumulated scaling * src into this block */
              double expected = (1.0 + rank)
                              + scaling * (1.0 + peer) * (ITERATIONS + SKIP);
              double actual = *(buffer[rank] + i * MAX_YDIM + j);
              if (actual != expected) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j, expected, actual);
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
              }
            }
          }
          /* reset for the next phase */
          for (i = 0; i < bufsize / sizeof(double); i++) {
            *(buffer[rank] + i) = 1.0 + rank;
          }
          ARMCI_Access_end(buffer[rank]);
        }
        ARMCI_Barrier();
        ARMCI_Barrier();

        if (rank == 1) {
          ARMCI_Access_begin(buffer[rank]);
          for (i = 0; i < xdim; i++) {
            for (j = 0; j < ydim; j++) {
              double expected = (1.0 + rank)
                              + scaling * (1.0 + peer) * (ITERATIONS + SKIP);
              double actual = *(buffer[rank] + i * MAX_YDIM + j);
              if (actual != expected) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j, expected, actual);
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
              }
            }
          }
          for (i = 0; i < bufsize / sizeof(double); i++) {
            *(buffer[rank] + i) = 1.0 + rank;
          }
          ARMCI_Access_end(buffer[rank]);
        }
        ARMCI_Barrier();
      }
    }
  }

  ARMCI_Barrier();

  ARMCI_Free((void *) buffer[rank]);
  ARMCI_Free_local(src_buf);
  free(buffer);

  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
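/* For reference (restating the accumulate semantics the test above relies
 * on): ARMCI_AccS with ARMCI_ACC_DBL atomically performs dst += scale * src
 * element-wise on the target. A local model of one accumulate over n
 * doubles, which is where the expected value
 * (1+rank) + scaling*(1+peer)*(ITERATIONS+SKIP) comes from: */
static void acc_dbl_model(double scale, const double *src, double *dst, int n)
{
  int i;
  for (i = 0; i < n; i++)
    dst[i] += scale * src[i];  /* what each ARMCI_AccS iteration applies remotely */
}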
int main(int argc, char *argv[])
{
  int i, j, rank, nranks;
  int xdim, ydim;
  long bufsize;
  double **buffer;
  double t_start = 0.0, t_stop = 0.0;
  int count[2], src_stride, trg_stride, stride_level, peer;
  double expected, actual;
  int provided;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  if (nranks < 2) {
    printf("%s: Must be run with at least 2 processes\n", argv[0]);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  ARMCI_Init_args(&argc, &argv);

  bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
  buffer = (double **) malloc(sizeof(double *) * nranks);
  ARMCI_Malloc((void **) buffer, bufsize);

  for (i = 0; i < bufsize / sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
  }

  if (rank == 0) {
    printf("ARMCI_PutS Latency - local and remote completions - in usec \n");
    printf("%30s %22s %22s\n", "Dimensions(array of doubles)",
           "Latency-LocalCompletion", "Latency-RemoteCompletion");
    fflush(stdout);
  }

  src_stride = MAX_YDIM * sizeof(double);
  trg_stride = MAX_YDIM * sizeof(double);
  stride_level = 1;

  ARMCI_Barrier();

  for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) {
    count[1] = xdim;
    for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) {
      count[0] = ydim * sizeof(double);

      if (rank == 0) {
        peer = 1;

        /* local completion: fence once outside the timed loop */
        for (i = 0; i < ITERATIONS + SKIP; i++) {
          if (i == SKIP) t_start = MPI_Wtime();
          ARMCI_PutS((void *) buffer[rank], &src_stride,
                     (void *) buffer[peer], &trg_stride,
                     count, stride_level, peer);
        }
        t_stop = MPI_Wtime();
        ARMCI_Fence(peer);

        char temp[10];
        sprintf(temp, "%dX%d", xdim, ydim);
        printf("%30s %20.2f", temp, ((t_stop - t_start) * 1000000) / ITERATIONS);
        fflush(stdout);

        ARMCI_Barrier();
        ARMCI_Barrier();

        /* remote completion: fence after every put */
        for (i = 0; i < ITERATIONS + SKIP; i++) {
          if (i == SKIP) t_start = MPI_Wtime();
          ARMCI_PutS((void *) buffer[rank], &src_stride,
                     (void *) buffer[peer], &trg_stride,
                     count, stride_level, peer);
          ARMCI_Fence(peer);
        }
        t_stop = MPI_Wtime();
        printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS);
        fflush(stdout);

        ARMCI_Barrier();
        ARMCI_Barrier();
      }
      else {
        peer = 0;
        expected = 1.0 + (double) peer;

        ARMCI_Barrier();
        if (rank == 1) {
          for (i = 0; i < xdim; i++) {
            for (j = 0; j < ydim; j++) {
              actual = *(buffer[rank] + i * MAX_YDIM + j);
              if (actual != expected) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j, expected, actual);
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
              }
            }
          }
        }
        /* reset destination before the next phase */
        for (i = 0; i < bufsize / sizeof(double); i++) {
          *(buffer[rank] + i) = 1.0 + rank;
        }
        ARMCI_Barrier();
        ARMCI_Barrier();

        if (rank == 1) {
          for (i = 0; i < xdim; i++) {
            for (j = 0; j < ydim; j++) {
              actual = *(buffer[rank] + i * MAX_YDIM + j);
              if (actual != expected) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j, expected, actual);
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
              }
            }
          }
          for (i = 0; i < bufsize / sizeof(double); i++) {
            *(buffer[rank] + i) = 1.0 + rank;
          }
        }
        ARMCI_Barrier();
      }
    }
  }

  ARMCI_Barrier();

  ARMCI_Free((void *) buffer[rank]);
  free(buffer);

  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
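/* The pattern both latency tests above time, in its smallest form (sketch;
 * the buffers, size, and proc here are placeholders). "Local completion"
 * means the source buffer is reusable once the call, or ARMCI_Wait on a
 * nonblocking handle, returns; only ARMCI_Fence(proc) guarantees the data
 * has actually landed at the target. */
void put_with_completion_levels(void *src, void *dst, int bytes, int proc)
{
  armci_hdl_t nb;
  ARMCI_INIT_HANDLE(&nb);

  ARMCI_NbPut(src, dst, bytes, proc, &nb); /* issue */
  ARMCI_Wait(&nb);                         /* local completion: src reusable */
  ARMCI_Fence(proc);                       /* remote completion: visible at proc */
}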
int main(int argc, char *argv[])
{
  size_t i, msgsize, max_msgsize, iterations, bufsize;
  int rank, nranks, dest;
  double **buffer;
  double t_start, t_stop, t_total, d_total;
  double bandwidth;
  int provided;
  armci_hdl_t handle;

  max_msgsize = MAX_MSGSIZE;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  ARMCI_Init_args(&argc, &argv);

  bufsize = max_msgsize * ITERATIONS_LARGE;
  buffer = (double **) malloc(sizeof(double *) * nranks);
  ARMCI_Malloc((void **) buffer, bufsize);

  for (i = 0; i < bufsize / sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
  }

  ARMCI_INIT_HANDLE(&handle);
  ARMCI_SET_AGGREGATE_HANDLE(&handle);

  ARMCI_Barrier();

  if (rank == 0) {
    printf("ARMCI_Put Bandwidth in MBPS \n");
    printf("%20s %22s \n", "Message Size", "Bandwidth");
    fflush(stdout);

    dest = 1;

    for (msgsize = sizeof(double); msgsize <= max_msgsize; msgsize *= 2) {
      /* more iterations for small messages, fewer for large ones */
      if (msgsize <= 16 * 1024)
        iterations = ITERATIONS_VERYSMALL;
      else if (msgsize <= 64 * 1024)
        iterations = ITERATIONS_SMALL;
      else if (msgsize <= 512 * 1024)
        iterations = ITERATIONS_MEDIUM;
      else
        iterations = ITERATIONS_LARGE;

      t_start = MPI_Wtime();
      for (i = 0; i < iterations; i++) {
        /* src is the local buffer, dst is the remote buffer on dest */
        ARMCI_NbPut((void *) ((size_t) buffer[rank] + i * msgsize),
                    (void *) ((size_t) buffer[dest] + i * msgsize),
                    msgsize, dest, &handle);
      }
      ARMCI_Wait(&handle);
      t_stop = MPI_Wtime();

      d_total = (double)(iterations * msgsize) / (1024 * 1024);
      t_total = t_stop - t_start;
      bandwidth = d_total / t_total;

      printf("%20zu %20.4lf \n", msgsize, bandwidth);
      fflush(stdout);

      ARMCI_Fence(dest);
    }
  }

  ARMCI_Barrier();

  ARMCI_UNSET_AGGREGATE_HANDLE(&handle);
  ARMCI_Free((void *) buffer[rank]);
  free(buffer);

  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
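/* Note on the aggregate handle above (sketch, assumptions flagged): marking
 * a handle with ARMCI_SET_AGGREGATE_HANDLE lets ARMCI batch many nonblocking
 * puts behind one handle, so a single ARMCI_Wait completes them all locally.
 * The non-aggregated alternative uses one handle per operation; the function
 * name and the cap of 8 outstanding operations here are illustrative: */
void per_op_handles(double *src, double *dst, size_t msgsize, int dest, int n)
{
  int i;
  armci_hdl_t h[8];                  /* assumes n <= 8 outstanding ops */

  for (i = 0; i < n; i++) {
    ARMCI_INIT_HANDLE(&h[i]);
    ARMCI_NbPut((char *) src + i * msgsize, (char *) dst + i * msgsize,
                (int) msgsize, dest, &h[i]);
  }
  for (i = 0; i < n; i++)
    ARMCI_Wait(&h[i]);               /* or ARMCI_WaitAll() to drain everything */
}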
/*\ Send request to execute callback function in a global address space
 * Arguments:
 *   h     - handle to the callback function
 *   p     - remote processor
 *   hdr   - header data - used to pack extra args for callback (local buffer)
 *   hlen  - size of header data < ARMCI_GPC_HLEN
 *   data  - bulk data passed to callback (local buffer)
 *   dlen  - length of bulk data
 *   rhdr  - ptr to reply header (return args from callback)
 *   rhlen - length of buffer to store reply header < ARMCI_GPC_HLEN
 *   rdata - ptr to where reply data from callback should be stored (local buf)
 *   rdlen - size of the buffer to store reply data
 *   nbh   - nonblocking handle
\*/
int ARMCI_Gpc_exec(int h, int p, void *hdr, int hlen, void *data, int dlen,
                   void *rhdr, int rhlen, void *rdata, int rdlen,
                   gpc_hdl_t *nbh)
{
  int hnd = -h + GPC_OFFSET;
  int err = 0;
  armci_hdl_t *ahdl = (nbh ? &(nbh->ahdl) : NULL);

  if (hnd < 0 || hnd >= GPC_SLOTS)
    err += fprintf(stderr, "ARMCI_Gpc_exec: bad callback handle %d (GPC_SLOTS=%d)\n",
                   hnd, GPC_SLOTS);
  if (!_table[hnd])
    err += fprintf(stderr, "ARMCI_Gpc_exec: NULL function %d\n", hnd);
  if (hlen < 0 || hlen >= ARMCI_Gpc_get_hlen())
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send header size %d %d\n",
                   hlen, ARMCI_Gpc_get_hlen());
  if (rhlen < 0 || rhlen >= ARMCI_Gpc_get_hlen())
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv header size %d %d\n",
                   rhlen, ARMCI_Gpc_get_hlen());
  if (dlen < 0 || dlen >= ARMCI_Gpc_get_dlen())
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send data size %d %d\n",
                   dlen, ARMCI_Gpc_get_dlen());
  if (rdlen < 0 || rdlen >= ARMCI_Gpc_get_dlen())
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv data size %d %d\n",
                   rdlen, ARMCI_Gpc_get_dlen());

  if (hlen > 0 && hdr == NULL)
    err += fprintf(stderr, "ARMCI_Gpc_exec: Null send header for non-zero header size %d\n", hlen);
  if (rhlen > 0 && rhdr == NULL)
    err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv header for non-zero header size %d\n", rhlen);
  if (dlen > 0 && data == NULL)
    err += fprintf(stderr, "ARMCI_Gpc_exec: Null send data for non-zero data size %d\n", dlen);
  if (rdlen > 0 && rdata == NULL)
    err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv data for non-zero data size %d\n", rdlen);

  if (p < 0 || p >= armci_nproc)
    err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid target processor id %d (nproc=%d)\n",
                   p, armci_nproc);

  if (err)
    return FAIL;

  if (rhlen + rdlen == 0)
    armci_die("Zero reply header + data length not yet supported", 0);

  if (nbh)
    nbh->proc = p;

  /* shortcut: execute the callback directly when the target is local */
  if (SAMECLUSNODE(p) && armci_nproc == 1) {
    int rhsize, rdsize;
    int (*func)();

    func = _table[hnd];
    if (func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize,
             rdata, rdlen, &rdsize, GPC_INIT) != GPC_DONE) {
      func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize,
           rdata, rdlen, &rdsize, GPC_WAIT);
    }
#ifndef VAPI
    ARMCI_Fence(p);
#endif
    return 0;
  }

  return armci_gpc(h, p, hdr, hlen, data, dlen, rhdr, rhlen, rdata, rdlen, ahdl);
}
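/* Usage sketch (assumptions flagged): the callback table above is filled by
 * a registration call; I believe the GPC API exposes ARMCI_Gpc_register(),
 * but that name and the exact callback signature below are assumptions
 * mirroring the local-execution path in ARMCI_Gpc_exec, and should be
 * checked against the GPC headers. Requires <string.h> for memcpy. */
static int echo_cb(int to, int from, void *hdr, int hlen,
                   void *data, int dlen,
                   void *rhdr, int rhlen, int *rhsize,
                   void *rdata, int rdlen, int *rdsize, int rtype)
{
  /* reply with a copy of the request header; send no bulk reply data */
  memcpy(rhdr, hdr, hlen);
  *rhsize = hlen;
  *rdsize = 0;
  return GPC_DONE;
}

void gpc_echo_example(int p, void *msg, int mlen)
{
  char reply[64];                       /* must stay below ARMCI_GPC_HLEN */
  int h = ARMCI_Gpc_register(echo_cb);  /* hypothetical registration call */

  /* blocking execution: nbh == NULL */
  ARMCI_Gpc_exec(h, p, msg, mlen, NULL, 0, reply, sizeof(reply), NULL, 0, NULL);
}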
int main(int argc, char *argv[])
{
  int i, rank, nranks, dest;
  int dim, iterations;
  long bufsize;
  double **buffer;
  double t_start, t_stop, t_total, d_total, bw;
  int count[2], src_stride, trg_stride, stride_level;
  int provided;
  armci_hdl_t handle;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  ARMCI_Init_args(&argc, &argv);

  bufsize = MAX_DIM * MAX_DIM * sizeof(double);
  buffer = (double **) malloc(sizeof(double *) * nranks);
  ARMCI_Malloc((void **) buffer, bufsize);

  for (i = 0; i < bufsize / sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
  }

  ARMCI_INIT_HANDLE(&handle);
  ARMCI_SET_AGGREGATE_HANDLE(&handle);

  ARMCI_Barrier();

  if (rank == 0) {
    printf("ARMCI_PutS Bandwidth in MBPS \n");
    printf("%30s %22s \n", "Dimensions(array of doubles)", "Bandwidth");
    fflush(stdout);

    dest = 1;
    src_stride = MAX_DIM * sizeof(double);
    trg_stride = MAX_DIM * sizeof(double);
    stride_level = 1;

    for (dim = 1; dim <= MAX_DIM; dim *= 2) {
      count[0] = dim * sizeof(double);
      count[1] = dim;
      /* keep the total transferred volume roughly constant per block size */
      iterations = 10 * (MAX_DIM * MAX_DIM) / (dim * dim);

      t_start = MPI_Wtime();
      for (i = 0; i < iterations; i++) {
        ARMCI_NbPutS((void *) buffer[rank], &src_stride,
                     (void *) buffer[dest], &trg_stride,
                     count, stride_level, dest, &handle);
      }
      ARMCI_Wait(&handle);
      t_stop = MPI_Wtime();
      ARMCI_Fence(dest);

      char temp[10];
      sprintf(temp, "%dX%d", dim, dim);
      t_total = t_stop - t_start;
      d_total = (double)(dim * dim * sizeof(double) * iterations) / (1024 * 1024);
      bw = d_total / t_total;
      printf("%30s %20.2f \n", temp, bw);
      fflush(stdout);
    }
  }

  ARMCI_Barrier();

  ARMCI_UNSET_AGGREGATE_HANDLE(&handle);
  ARMCI_Free((void *) buffer[rank]);
  free(buffer);

  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}