/** One-sided copy of data from the source to the destination.  Set a flag on
  * the remote process when the transfer is complete.
  *
  * @param[in] src   Source buffer
  * @param[in] dst   Destination buffer on proc
  * @param[in] size  Number of bytes to transfer
  * @param[in] flag  Address of the flag buffer on proc
  * @param[in] value Value to set the flag to
  * @param[in] proc  Process id of the target
  * @return          0 on success, non-zero on failure
  */
int ARMCI_Put_flag(void *src, void *dst, int size, int *flag, int value, int proc) {
  ARMCI_Put(src, dst, size, proc);
  ARMCI_Fence(proc);
  ARMCI_Put(&value, flag, sizeof(int), proc);

  return 0;
}
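A minimal sketch of the receiving side of this notification pattern, assuming the flag location was allocated with ARMCI_Malloc and pre-initialized to something other than `value`; the helper name wait_for_flag and the spin-wait strategy are illustrative and not part of the ARMCI API.

/* Hedged sketch: how the target process could wait on the flag written by
 * ARMCI_Put_flag above.  Assumes `flag` points into ARMCI-allocated memory
 * that was initialized to a value different from `value`; the helper name
 * and the busy-wait are illustrative only. */
static void wait_for_flag(volatile int *flag, int value) {
  /* Spin until the remote ARMCI_Put of the flag becomes visible locally. */
  while (*flag != value) {
    /* Optionally yield or call into the ARMCI progress engine here. */
  }
}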
int main(int argc, char **argv) {
  int    rank, nproc, val, i;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0)
    printf("Starting ARMCI mutex read-modify-write test with %d processes\n", nproc);

  base_ptrs = malloc(nproc*sizeof(void*));

  ARMCI_Create_mutexes(rank == 0 ? 1 : 0);
  ARMCI_Malloc(base_ptrs, (rank == 0) ? sizeof(int) : 0); // Proc 0 has a shared int

  if (rank == 0) {
    val = 0;
    ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0);
  }

  ARMCI_Barrier();

  for (i = 0; i < NITER; i++) {
    ARMCI_Lock(0, 0);
    ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0);
    val += ADDIN;
    ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0);
    ARMCI_Unlock(0, 0);
  }

  printf(" + %3d done\n", rank);
  fflush(NULL);

  ARMCI_Barrier();

  if (rank == 0) {
    ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0);
    if (val == ADDIN*nproc*NITER)
      printf("Test complete: PASS.\n");
    else
      printf("Test complete: FAIL. Got %d, expected %d.\n", val, ADDIN*nproc*NITER);
  }

  ARMCI_Free(base_ptrs[rank]);
  ARMCI_Destroy_mutexes();
  free(base_ptrs);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
void test_one_group(ARMCI_Group *group, int *pid_list) {
  int grp_me, grp_size;
  int i, j, src_proc, dst_proc;
  double *ddst_put[MAXPROC];
  double dsrc[ELEMS];
  int elems[2] = {MAXPROC, ELEMS};
  int value = -1, bytes, world_me;

  MP_MYID(&world_me);
  ARMCI_Group_rank(group, &grp_me);
  ARMCI_Group_size(group, &grp_size);
  if (grp_me == 0) printf("GROUP SIZE = %d\n", grp_size);
  printf("%d:group rank = %d\n", me, grp_me);

  src_proc = 0;
  dst_proc = grp_size - 1;

  bytes = ELEMS*sizeof(double);
  ARMCI_Malloc_group((void **)ddst_put, bytes, group);

  for (i = 0; i < ELEMS; i++) dsrc[i] = i*1.001*(grp_me+1);
  for (i = 0; i < ELEMS; i++) ddst_put[grp_me][i] = -1.0;

  armci_msg_group_barrier(group);

  if (grp_me == src_proc) {
    /* NOTE: make sure to specify absolute ids in ARMCI calls */
    ARMCI_Put(dsrc, &ddst_put[dst_proc][0], bytes, ARMCI_Absolute_id(group, dst_proc));
  }

  armci_msg_group_barrier(group);
  /* NOTE: make sure to specify absolute ids in ARMCI calls */
  ARMCI_Fence(ARMCI_Absolute_id(group, dst_proc));
  sleep(1);

  /* Verify */
  if (grp_me == dst_proc) {
    for (j = 0; j < ELEMS; j++) {
      if (ARMCI_ABS(ddst_put[grp_me][j] - j*1.001*(src_proc+1)) > 0.1) {
        printf("\t%d: ddst_put[%d][%d] = %lf and expected value is %lf\n",
               me, grp_me, j, ddst_put[grp_me][j], j*1.001*(src_proc+1));
        ARMCI_Error("groups: armci put failed...1", 0);
      }
    }
    printf("\n%d(%d): Test O.K. Verified\n", dst_proc, world_me);
  }

  armci_msg_group_barrier(group);
  ARMCI_Free_group(ddst_put[grp_me], group);
}
int main(int argc, char *argv[]) {
  void *baseAddress[MAX_PROCESSORS];
  char *local;
  int thisImage;
  int iter = 100, size;
  double startTime, endTime;
  int i;

  // initialize
  ARMCI_Init();
  ARMCI_Myid(&thisImage);

  // allocate data (collective operation)
  ARMCI_Malloc(baseAddress, MAX_BUF_SIZE*sizeof(char));
  local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char));

  ARMCI_Barrier();
  ARMCI_Migrate();

  if (thisImage == 0) {
    for (size = 1; size <= MAX_BUF_SIZE; size = size << 1) {
      startTime = CkWallTimer();
      for (i = 0; i < iter; i++) {
        ARMCI_Put(local, baseAddress[1], size, 1);
      }
      ARMCI_Fence(1);
      endTime = CkWallTimer();
      // CkWallTimer() reports seconds, so scaling by 1000 gives milliseconds
      printf("%d: %f ms\n", size, (endTime-startTime)*1000);
    }
    ARMCI_Barrier();
  } else if (thisImage == 1) {
    ARMCI_Barrier();
  }

  ARMCI_Free(baseAddress[thisImage]);
  ARMCI_Free_local(local);

  // finalize
  ARMCI_Finalize();
  return 0;
}
void TRANSPOSE1D() {
  int dims[1];
  int nelem, i, ierr, min, max, cmin, cmax, lmin, lmax, pmin, pmax;
  int src_offset, dst_offset, length;
  int *buf, *map;
  void *src_ptr, *dst_ptr;
  void **a_ptr, **b_ptr;
  int *a, *b;

  /* Find local processor ID and number of processors */
  int me, nprocs;
  me     = armci_msg_me();
  nprocs = armci_msg_nproc();

  /* Allocate pointers to data on all processors */
  a_ptr = (void**)malloc(nprocs*sizeof(void*));
  b_ptr = (void**)malloc(nprocs*sizeof(void*));
  map   = (int*)malloc(nprocs*sizeof(int));

  /* Configure array dimensions. Force an unequal data distribution */
  dims[0] = nprocs*TOTALELEMS + nprocs/2;
  if (me == 0) printf("Size of array: %d\n\n", dims[0]);

  /* Find first (zero-based) index of the chunk owned by each processor and
     store it in the map array */
  for (i = 0; i < nprocs; i++) {
    map[i] = (int)(((double)i)*(((double)dims[0])/((double)nprocs)));
  }

  /* Figure out what size my portion of the array is */
  if (me < nprocs-1) {
    nelem = map[me+1] - map[me];
  } else {
    nelem = dims[0] - map[me];
  }

  /* Allocate memory for array A */
  ierr = ARMCI_Malloc(a_ptr, nelem*sizeof(int));
  assert(ierr == 0);
  assert(a_ptr[me]);

  /* Allocate memory for array B */
  ierr = ARMCI_Malloc(b_ptr, nelem*sizeof(int));
  assert(ierr == 0);
  assert(b_ptr[me]);

  /* Initialize data in array A and zero data in array B */
  a = (int*)a_ptr[me];
  b = (int*)b_ptr[me];
  for (i = 0; i < nelem; i++) {
    a[i] = i + map[me] + 1;
    b[i] = 0;
  }

  /* Synchronize all processors to guarantee that everyone has data
     before proceeding to the next step. */
  armci_msg_barrier();

  /* Create local buffer for performing inversion */
  buf = (int*)malloc(nelem*sizeof(int));

  /* Copy inverted data into local buffer */
  a = (int*)a_ptr[me];
  for (i = 0; i < nelem; i++) {
    buf[i] = a[nelem-i-1];
  }

  /* Find out which blocks of array B the inverted block should be copied to.
     Start by finding min and max indices of data in array B */
  min = dims[0] - (map[me] + nelem);
  max = dims[0] - map[me] - 1;

  /* Locate processors containing the endpoints */
  pmin = 0;
  for (i = 0; i < nprocs; i++) {
    if (min >= map[i]) {
      pmin = i;
    } else {
      break;
    }
  }
  pmax = nprocs-1;
  for (i = nprocs-2; i >= 0; i--) {
    if (max < map[i+1]) {
      pmax = i;
    } else {
      break;
    }
  }

  /* Loop over processors that will receive data and copy inverted data to
     those processors */
  for (i = pmin; i <= pmax; i++) {
    /* Find min and max indices owned by processor i */
    lmin = map[i];
    if (i < nprocs-1) {
      lmax = map[i+1]-1;
    } else {
      lmax = dims[0]-1;
    }

    /* Find min and max indices that should be sent to processor i */
    if (lmin > min) {
      cmin = lmin;
    } else {
      cmin = min;
    }
    if (lmax < max) {
      cmax = lmax;
    } else {
      cmax = max;
    }

    /* Find offsets on source and destination processors */
    src_offset = cmin - min;
    src_ptr    = (void*)(buf + src_offset);
    dst_offset = cmin - lmin;
    dst_ptr    = ((char*)b_ptr[i]) + sizeof(int)*dst_offset;

    /* Find length of data (in bytes) to be sent to processor i */
    length = sizeof(int)*(cmax-cmin+1);

    /* Send data to processor */
    ARMCI_Put(src_ptr, dst_ptr, length, i);
  }
  ARMCI_AllFence();
  armci_msg_barrier();

  free(buf);
  VERIFY(b_ptr, dims, map);
  free(map);
  armci_msg_barrier();
  ARMCI_Free(a_ptr[me]);
  ARMCI_Free(b_ptr[me]);
  free(a_ptr);
  free(b_ptr);
}
double time_put(double *src_buf, double *dst_buf, int chunk, int loop,
                int proc, int levels) {
  int i, bal = 0;

  int stride[2];
  int count[2];
  int stride_levels = levels;
  double *tmp_buf;
  double start_time, stop_time, total_time = 0;

  stride[0] = SIZE * sizeof(double);
  count[0]  = chunk * sizeof(double);
  count[1]  = chunk;

  if (CHECK_RESULT) {
    tmp_buf = (double *)malloc(SIZE * SIZE * sizeof(double));
    assert(tmp_buf != NULL);
  }

  start_time = TIMER();

  for (i = 0; i < loop; i++) {
#ifdef FORCE_1D
    int j;
    if (levels > 0) {
      /* emulate the strided transfer with a series of contiguous 1-D puts */
      for (j = 0; j < count[1]; j++) {
        char *s = (char*)src_buf, *d = (char*)dst_buf;
        s += j*stride[0];
        d += j*stride[0];
        ARMCI_Put(s, d, count[0], proc);
      }
    } else
#endif
    if (levels)
      ARMCI_PutS(src_buf, stride, dst_buf, stride, count, stride_levels, proc);
    else
      ARMCI_Put(src_buf, dst_buf, count[0], proc);

    if (CHECK_RESULT) {
      ARMCI_GetS(dst_buf, stride, tmp_buf, stride, count, stride_levels, proc);
      sprintf(check_type, "ARMCI_PutS:");
      check_result(tmp_buf, src_buf, stride, count, stride_levels);
    }

    /* prepare next src and dst ptrs: avoid cache locality */
    if (bal == 0) {
      src_buf += 128;
      dst_buf += 128;
      bal = 1;
    } else {
      src_buf -= 128;
      dst_buf -= 128;
      bal = 0;
    }
  }

  stop_time  = TIMER();
  total_time = (stop_time - start_time);

  if (CHECK_RESULT) free(tmp_buf);

  if (total_time == 0.0) {
    total_time = 0.000001; /* workaround for inaccurate timers */
    warn_accuracy++;
  }
  return (total_time/loop);
}
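A brief sketch of the stride/count convention that time_put relies on, for a chunk x chunk block inside a SIZE x SIZE row-major array of doubles; the helper name put_block_2d is illustrative and assumes the SIZE macro from the surrounding benchmark.

/* Hedged sketch: describing a 2-D block to ARMCI_PutS.  count[0] is the
 * contiguous length of one row in bytes, count[1] the number of rows, and
 * stride[0] the distance in bytes between consecutive row starts; one stride
 * level covers a 2-D transfer.  Assumes SIZE is defined as in time_put. */
static void put_block_2d(double *src, double *dst, int chunk, int proc) {
  int stride[1];
  int count[2];

  stride[0] = SIZE * sizeof(double);   /* bytes between row starts */
  count[0]  = chunk * sizeof(double);  /* contiguous bytes per row */
  count[1]  = chunk;                   /* number of rows            */

  ARMCI_PutS(src, stride, dst, stride, count, 1, proc);
}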
static int sparse_initialize(int *n, int *non_zero, int **row_ind,
                             int **col_ind, double **values, double **vec,
                             double **svec) {
  int i, j, rc, max, *row_ind_tmp = NULL, *tmp_indices = NULL;
  double *tmp_values = NULL;
  unsigned long len;
  FILE *fp = NULL;

  /* Broadcast order of matrix */
  if (me == 0) {
    if ((fp = fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL)
      ARMCI_Error("Error: Input file not found", me);
    fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */
    fscanf(fp, "%d", n);
    if (*n % nproc)
      ARMCI_Error("# of rows is not divisible by # of processors", nproc);
    if (*n > ROW)
      ARMCI_Error("order is greater than defined variable ROW", ROW);
  }
  len = sizeof(int);
  armci_msg_brdcst(n, len, 0);

  /* Broadcast number of non-zeros */
  if (me == 0) fscanf(fp, "%d", non_zero);
  armci_msg_brdcst(non_zero, len, 0);

  /* Broadcast row indices */
  len = (*n+1)*sizeof(int);
  row_ind_tmp = (int *)malloc(len);
  if (me == 0) for (i = 0; i < *n+1; i++) {
    fscanf(fp, "%d", &row_ind_tmp[i]);
    if (fortran_indexing) --row_ind_tmp[i];
  }
  armci_msg_brdcst(row_ind_tmp, len, 0);

  load_balance(*n, *non_zero, row_ind_tmp);

  /* find how much temporary storage is needed at the maximum */
  if (me == 0) {
    for (max = -1, j = 0; j < nproc; j++)
      if (max < proc_nz_list[j]) max = proc_nz_list[j];
    if (max < 0) ARMCI_Error(" max cannot be negative", max);
  }

  /* Broadcast the maximum number of elements */
  len = sizeof(int);
  armci_msg_brdcst(&max, len, 0);

  /* create the column subscript array of the compressed sparse matrix */
  if (me == 0) printf(" Creating Column Subscript Array ... \n\n");
  create_array((void**)col_ind, sizeof(int), 1, &max);

  /* create the non-zero value array of the compressed sparse matrix */
  if (me == 0) printf(" Creating Value Array (CompressedSparseMatrix) ...\n\n");
  create_array((void**)values, sizeof(double), 1, &max);

  /* create the x-vector and the solution vector */
  if (me == 0) printf(" Creating Vectors ... \n\n");
  create_array((void**)vec, sizeof(double), 1, &max);
  create_array((void**)svec, sizeof(double), 1, &max);
  armci_msg_barrier();

  /* Process 0 distributes the column indices and non-zero values to the
     respective processors */
  if (me == 0) {
    tmp_indices = (int *)malloc(max*sizeof(int));
    tmp_values  = (double *)malloc(max*sizeof(double));

    for (j = 0; j < nproc; j++) {
      for (i = 0; i < proc_nz_list[j]; i++) {
        fscanf(fp, "%d", &tmp_indices[i]);
        if (fortran_indexing) --tmp_indices[i];
      }
      /* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */
      if ((rc = ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j)))
        ARMCI_Error("armci_put failed\n", rc);
    }

    for (j = 0; j < nproc; j++) {
      for (i = 0; i < proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]);
      if ((rc = ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j)))
        ARMCI_Error("armci_put failed\n", rc);
    }
  }
  ARMCI_AllFence();
  armci_msg_barrier();
  ARMCI_AllFence();

  /* initializing x-vector */
  if (me == 0)
    for (i = 0; i < proc_nz_list[me]; i++) vec[me][i] = (i+1);
  else
    for (i = 0; i < proc_nz_list[me]; i++) vec[me][i] = me*proc_nz_list[me-1] + (i+1);

#if 0
  if (me == 0) {
    printf("max = %d\n", max);
    for (i = 0; i < max; i++) printf("%.1f ", values[me][i]);
    printf("\n");
  }
#endif

  *row_ind = row_ind_tmp;

  if (me == 0) {
    free(tmp_indices);
    free(tmp_values);
    fclose(fp);
  }
  return 0;
}
/** Non-blocking put operation.  Note: the implementation is not non-blocking;
  * it forwards directly to the blocking ARMCI_Put.
  */
int ARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *handle) {
  return ARMCI_Put(src, dst, bytes, proc);
}
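For reference, a minimal sketch of the caller-side pattern this interface is meant to support, using only calls that appear elsewhere in these examples (ARMCI_INIT_HANDLE, ARMCI_NbPut, ARMCI_Wait, ARMCI_Fence); the function name example_nbput and its parameters are placeholders.

/* Hedged sketch: the intended usage of ARMCI_NbPut with an explicit handle.
 * The transfer is issued, other work may overlap it, and ARMCI_Wait blocks
 * until local completion; ARMCI_Fence forces remote completion if needed. */
void example_nbput(void *src, void *dst, int bytes, int proc) {
  armci_hdl_t handle;

  ARMCI_INIT_HANDLE(&handle);                   /* prepare the handle        */
  ARMCI_NbPut(src, dst, bytes, proc, &handle);  /* issue the transfer        */
  /* ... unrelated local work could overlap the transfer here ... */
  ARMCI_Wait(&handle);                          /* local completion          */
  ARMCI_Fence(proc);                            /* remote completion, if any */
}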
static void contig_test(size_t buffer_size, int op) {
  void **dst_ptr;
  void **put_buf;
  void **get_buf;
  double *times;

  dst_ptr = (void**)malloc(nproc * sizeof(void*));
  put_buf = (void**)malloc(nproc * sizeof(void*));
  get_buf = (void**)malloc(nproc * sizeof(void*));
  times   = (double*)malloc(nproc * sizeof(double));

  ARMCI_Malloc(dst_ptr, buffer_size);
  ARMCI_Malloc(put_buf, buffer_size);
  ARMCI_Malloc(get_buf, buffer_size);

  /* initialize what we're putting */
  fill_array((double*)put_buf[me], buffer_size/sizeof(double), me);

  size_t msg_size;
  int dst = 1;
  double scale = 1.0;

  for (msg_size = 16; msg_size <= buffer_size; msg_size *= 2) {
    int j;
    int iter = msg_size > MEDIUM_MESSAGE_SIZE ? ITER_LARGE : ITER_SMALL;

    double t_start, t_end;
    if (0 == me) {
      for (j = 0; j < iter + WARMUP; ++j) {
        if (WARMUP == j) {
          t_start = dclock();
        }

        switch (op) {
          case PUT:
            ARMCI_Put(put_buf[me], dst_ptr[dst], msg_size, dst);
            break;
          case GET:
            ARMCI_Get(dst_ptr[dst], get_buf[me], msg_size, dst);
            break;
          case ACC:
            ARMCI_Acc(ARMCI_ACC_DBL, &scale, put_buf[me], dst_ptr[dst],
                      msg_size, dst);
            break;
          default:
            ARMCI_Error("oops", 1);
        }
      }
      /* calculate total time and average time */
      t_end = dclock();
    }

    ARMCI_Barrier();

    if (0 == me) {
      printf("%8zu\t\t%6.2f\t\t%10.2f\n",
             msg_size,
             (t_end - t_start)/iter,
             msg_size*iter/(t_end - t_start));
    }
  }

  ARMCI_Free(dst_ptr[me]);
  ARMCI_Free(put_buf[me]);
  ARMCI_Free(get_buf[me]);

  free(dst_ptr);
  free(put_buf);
  free(get_buf);
  free(times);
}
void test_perf_nb(int dry_run) {
  int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
  int stride, k = 0, ntimes;
  double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
  double *dsrc[MAXPROC], scale = 1.0;
  armci_hdl_t hdl_get, hdl_put, hdl_acc;

  create_array((void**)ddst, sizeof(double), 2, elems);
  create_array((void**)dsrc, sizeof(double), 1, &elems[1]);

  if (!dry_run) if (me == 0) {
    printf("\n\t\t\tRemote 1-D Array Section\n");
    printf("section get nbget wait put nbput ");
    printf(" wait acc nbacc wait\n");
    printf("------- -------- -------- -------- -------- --------");
    printf(" -------- -------- -------- --------\n");
    fflush(stdout);
  }

  for (loop = 1; loop <= MAXELEMS; loop *= 2, k++) {
    elems[1] = loop;
    ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
    if (ntimes < 1) ntimes = 1;

    /* -------------------------- SETUP --------------------------- */
    /* initializing non-blocking handles, time, src & dst buffers */
    ARMCI_INIT_HANDLE(&hdl_put);
    ARMCI_INIT_HANDLE(&hdl_get);
    ARMCI_INIT_HANDLE(&hdl_acc);
    t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
    for (i = 0; i < elems[1]; i++) dsrc[me][i] = i*1.001*(me+1);
    for (i = 0; i < elems[0]*elems[1]; i++) ddst[me][i] = 0.0;
    MP_BARRIER();

    /* bytes transferred */
    bytes = sizeof(double)*elems[1];
    MP_BARRIER();

    /* -------------------------- PUT/GET -------------------------- */
    if (me == 0) {
      for (i = 1; i < nproc; i++) {
        stime = MP_TIMER();
        for (j = 0; j < ntimes; j++)
          if ((rc = ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i)))
            ARMCI_Error("armci_put failed\n", rc);
        t1 += MP_TIMER() - stime;
      }
    }
    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    if (VERIFY) verify_results(PUT, elems);
    for (i = 0; i < elems[0]*elems[1]; i++) ddst[me][i] = 0.0;
    MP_BARRIER();

    if (me == 0) {
      for (i = 1; i < nproc; i++) {
        stime = MP_TIMER();
        for (j = 0; j < ntimes; j++)
          if ((rc = ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i)))
            ARMCI_Error("armci_get failed\n", rc);
        t4 += MP_TIMER() - stime;
      }
    }
    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    if (VERIFY) verify_results(GET, elems);
    for (i = 0; i < elems[0]*elems[1]; i++) ddst[me][i] = 0.0;
    MP_BARRIER();

    /* ------------------------ nb PUT/GET ------------------------- */
    if (me == 0) {
      for (i = 1; i < nproc; i++) {
        for (j = 0; j < ntimes; j++) {
          stime = MP_TIMER();
          if ((rc = ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,
                                i, &hdl_put)))
            ARMCI_Error("armci_nbput failed\n", rc);
          t2 += MP_TIMER() - stime;

          stime = MP_TIMER();
          ARMCI_Wait(&hdl_put);
          t3 += MP_TIMER() - stime;
        }
      }
    }
    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    if (VERIFY) verify_results(PUT, elems);
    for (i = 0; i < elems[0]*elems[1]; i++) ddst[me][i] = 0.0;
    MP_BARRIER();

    if (me == 0) {
      for (i = 1; i < nproc; i++) {
        for (j = 0; j < ntimes; j++) {
          stime = MP_TIMER();
          if ((rc = ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,
                                i, &hdl_get)))
            ARMCI_Error("armci_nbget failed\n", rc);
          t5 += MP_TIMER() - stime;

          stime = MP_TIMER();
          ARMCI_Wait(&hdl_get);
          t6 += MP_TIMER() - stime;
        }
      }
    }
    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    if (VERIFY) verify_results(GET, elems);
    for (i = 0; i < elems[0]*elems[1]; i++) ddst[me][i] = 0.0;
    MP_BARRIER();

    /* ------------------------ Accumulate ------------------------- */
    for (i = 0; i < elems[1]; i++) dsrc[me][i] = 1.0;
    MP_BARRIER();
    stride = elems[1]*sizeof(double);
    scale  = 1.0;
    for (j = 0; j < ntimes; j++) {
      stime = MP_TIMER();
      if ((rc = ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride,
                           &ddst[0][0], &stride, &bytes, 0, 0)))
        ARMCI_Error("armci_acc failed\n", rc);
      t7 += MP_TIMER() - stime;

      MP_BARRIER();
      ARMCI_AllFence();
      MP_BARRIER();

      if (VERIFY) verify_results(ACC, elems);
      for (i = 0; i < elems[0]*elems[1]; i++) ddst[me][i] = 0.0;
      MP_BARRIER();
    }

#if 1 /* See the note below why this part is disabled */
    /* ---------------------- nb-Accumulate ------------------------ */
    for (i = 0; i < elems[1]; i++) dsrc[me][i] = 1.0;
    MP_BARRIER();
    stride = elems[1]*sizeof(double);
    scale  = 1.0;
    for (j = 0; j < ntimes; j++) {
      stime = MP_TIMER();
      if ((rc = ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride,
                             &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
        ARMCI_Error("armci_nbacc failed\n", rc);
      t8 += MP_TIMER() - stime;

      stime = MP_TIMER();
      ARMCI_Wait(&hdl_acc);
      t9 += MP_TIMER() - stime;

      MP_BARRIER();
      ARMCI_AllFence();
      MP_BARRIER();

      if (VERIFY) verify_results(ACC, elems);
      for (i = 0; i < elems[0]*elems[1]; i++) ddst[me][i] = 0.0;
      MP_BARRIER();
    }
#endif

    /* print timings */
    if (!dry_run) if (me == 0)
      printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n",
             bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes,
             t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
  }

  ARMCI_AllFence();
  MP_BARRIER();

  if (!dry_run) if (me == 0) { printf("O.K.\n"); fflush(stdout); }
  destroy_array((void **)ddst);
  destroy_array((void **)dsrc);
}
int main(int argc, char **argv) {
  int    rank, nproc, i, test_iter;
  int   *my_data, *buf;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);

  buf = malloc(DATA_SZ);
  base_ptrs = malloc(sizeof(void*)*nproc);

  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
    if (rank == 0) printf(" + iteration %d\n", test_iter);

    /*** Allocate the shared array ***/
    ARMCI_Malloc(base_ptrs, DATA_SZ);
    my_data = base_ptrs[rank];

    /*** Get from our right neighbor and verify correct data ***/
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Get(base_ptrs[(rank+1) % nproc], buf, DATA_SZ, (rank+1) % nproc);

    for (i = 0; i < DATA_NELTS; i++) {
      if (buf[i] != ((rank+1) % nproc)*test_iter) {
        printf("%d: GET expected %d, got %d\n", rank, ((rank+1) % nproc)*test_iter, buf[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }

    ARMCI_Barrier(); // Wait for all gets to complete

    /*** Put to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
    ARMCI_Put(buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != ((rank+1) % nproc)*test_iter) {
        printf("%d: PUT expected %d, got %d\n", rank, ((rank+1) % nproc)*test_iter, my_data[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all puts to complete

    /*** Accumulate to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
    ARMCI_Access_end(my_data);
    ARMCI_Barrier();

    int scale = test_iter;
    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != rank + ((rank+1) % nproc)*test_iter) {
        printf("%d: ACC expected %d, got %d\n", rank, rank + ((rank+1) % nproc)*test_iter, my_data[i]);
        //MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Free(my_data);
  }

  free(buf);
  free(base_ptrs);

  if (rank == 0) printf("Test complete: PASS.\n");

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}