int main(int argc, char **argv) { int i, j, rank, nranks, peer, bufsize, errors; double **buffer, *src_buf; int count[2], src_stride, trg_stride, stride_level; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); buffer = (double **) malloc(sizeof(double *) * nranks); bufsize = XDIM * YDIM * sizeof(double); ARMCI_Malloc((void **) buffer, bufsize); src_buf = ARMCI_Malloc_local(bufsize); if (rank == 0) printf("ARMCI Strided Put Test:\n"); src_stride = XDIM * sizeof(double); trg_stride = XDIM * sizeof(double); stride_level = 1; count[1] = YDIM; count[0] = XDIM * sizeof(double); ARMCI_Barrier(); peer = (rank+1) % nranks; for (i = 0; i < ITERATIONS; i++) { for (j = 0; j < XDIM*YDIM; j++) { *(src_buf + j) = rank + i; } ARMCI_PutS( src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); } ARMCI_Barrier(); ARMCI_Access_begin(buffer[rank]); for (i = errors = 0; i < XDIM; i++) { for (j = 0; j < YDIM; j++) { const double actual = *(buffer[rank] + i + j*XDIM); const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS); if (actual - expected > 1e-10) { printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n", rank, j, i, expected, actual); errors++; fflush(stdout); } } } ARMCI_Access_end(buffer[rank]); ARMCI_Free((void *) buffer[rank]); ARMCI_Free_local(src_buf); free(buffer); ARMCI_Finalize(); MPI_Finalize(); if (errors == 0) { printf("%d: Success\n", rank); return 0; } else { printf("%d: Fail\n", rank); return 1; } }
void test_notify(int ndim) { int lo[MAXDIMS], hi[MAXDIMS], count[MAXDIMS]; int stride[MAXDIMS]; int dim, elems; int i, Idx = 1, idx = 0; void *b[MAXPROC], *a[MAXPROC]; int left = (me + nproc - 1) % nproc; int right = (me + 1) % nproc; int loopcnt = 1, less = 2, strl; /* less>1 takes a partial plane */ /* create shared and local arrays */ create_array(b, sizeof(double), ndim, dimsB); create_array(a, sizeof(double), ndim, dimsB); elems = get_elems(ndim, stride, dimsB, sizeof(double)); init((double *)a[me], ndim, elems, dimsB); for (i = 0; i < ndim; i++) { lo[i] = 0; hi[i] = (less > dimsB[i]) ? dimsB[i] - 1 : dimsB[i] - less; count[i] = hi[i] - lo[i] + 1; } count[0] *= sizeof(double); for (i = 0; i < ndim - 1; i++) { Idx *= dimsB[i]; } ARMCI_Barrier(); if (me == 0) { printf("--------array[%d", dimsB[0]); for (dim = 1; dim < ndim; dim++) { printf(",%d", dimsB[dim]); } printf("]--------\n"); fflush(stdout); } ARMCI_Barrier(); loopcnt = (ndim > 1) ? dimsB[ndim-1] : 1; strl = (ndim > 1) ? ndim - 2 : 0; /* strides of the subpatch to transfer */ for (i = 0; i < loopcnt; i++) { int wc; if (me == 0) { ARMCI_PutS((double *)a[me] + idx, stride, (double *)b[left] + idx, stride, count, strl, left); #if DEBUG_ printf("%d-%d: ps=%p pd=%p i=%d idx=%d count=%d\n", me, left, (double *) a[me] + idx, (double *)b[left] + idx, i, idx, count[0]); fflush(stdout); #endif (void)armci_notify(left); (void)armci_notify_wait(right, &wc); } else { (void)armci_notify_wait(right, &wc); ARMCI_PutS((double *)b[me] + idx, stride, (double *)b[left] + idx, stride, count, strl, left); #if DEBUG_ printf("%d: ps=%p pd=%p i=%d idx=%d count=%d\n", me, (double *)b[me] + idx, (double *)b[left] + idx, i, idx, count[0]); fflush(stdout); #endif (void)armci_notify(left); } idx += Idx; /* advance to the next slab */ } ARMCI_Barrier(); if (me == 0) { compare_patches(0., ndim, (double *)a[0], lo, hi, dimsB, (double *)b[0], lo, hi, dimsB); printf("OK\n"); } ARMCI_Barrier(); destroy_array(b); destroy_array(a); }
/* test Put/Get/Acc sequence regardless of communication pattern * tgt -- remote target for put/get/acc (none if -1) * rmt -- list of remote thread that put/acc to here (correctness is cheked here) * rmt_cnt -- # of threads in rmt */ void test_PutGetAcc(int th_idx, int tgt, int *rmt, int rmt_cnt) { /* a - local thread, b - remote thread */ int a, b, b_proc, stride[2], count[2]; int i, j; void *src, *dst; #ifdef DEBUG for (i = 0, cbufl = 0; i < rmt_cnt; i++) cbufl += sprintf(cbuf+cbufl, " %d", rmt[i]); prndbg(th_idx, "test_PutGetAcc: put/acc to %d, get from %d, check put/acc from %s\n", tgt, tgt, rmt_cnt ? cbuf : "none"); #endif a = TH_ME; stride[0] = ASIZE_BYTES; count[0] = ASIZE_BYTES; count[1] = 1; /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* put - put a.ptrs1[b] into b.ptrs2[a] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */ dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */ // assert(!ARMCI_Put(src, dst, ASIZE_BYTES, b_proc)); assert(!ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc)); } ARMCI_Fence(b_proc); } MT_BARRIER(); print_array(th_idx, "PUT:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "PUT:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk put(s) from b(s): a.ptrs2[b] */ for (j = 0; j < rmt_cnt; j++) { b = rmt[j]; b_proc = TH2PROC(b); check_PutGetAcc(th_idx, b, PUT, &AELEM(ptrs2[a], b, 0, 0)); } //return; // REMOVE WHEN DONE /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* get - get b.ptrs1[a] into a.ptrs2[b] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[b], a, i, 0); /* b.ptrs1[a] */ dst = &AELEM(ptrs2[a], b, i, 0); /* a.ptrs2[b] */ assert(!ARMCI_GetS(src, stride, dst, stride, count, 1, b_proc)); } } print_array(th_idx, "GET:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "GET:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk get from b: a.ptrs2[b] */ if (tgt != -1) { check_PutGetAcc(th_idx, b, GET, &AELEM(ptrs2[a], b, 0, 0)); } #if 1 /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* acc - acc a.ptrs1[b] * scale + b.ptrs2[a] into b.ptrs2[a] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */ dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */ assert(!ARMCI_AccS(ARMCI_ACC_DBL,&scale,src,stride,dst,stride,count,1,b_proc)); } ARMCI_Fence(b_proc); } MT_BARRIER(); print_array(th_idx, "ACC:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "ACC:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk acc(s) from b(s): a.ptrs2[b] */ for (j = 0; j < rmt_cnt; j++) { b = rmt[j]; b_proc = TH2PROC(b); check_PutGetAcc(th_idx, b, ACC, &AELEM(ptrs2[a], b, 0, 0)); } #endif MT_BARRIER(); }
double time_put(double *src_buf, double *dst_buf, int chunk, int loop, int proc, int levels) { int i, bal = 0; int stride[2]; int count[2]; int stride_levels = levels; double *tmp_buf; double start_time, stop_time, total_time = 0; stride[0] = SIZE * sizeof(double); count[0] = chunk * sizeof(double); count[1] = chunk; if(CHECK_RESULT) { tmp_buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(tmp_buf != NULL); } start_time = TIMER(); for(i=0; i<loop; i++) { #ifdef FORCE_1D int j; if(levels>0)for(j=0; j< count[1]; j++){ char *s = (char*) src_buf, *d= (char*)dst_buf; s += j*stride[0]; d += j*stride[0]; ARMCI_Put(src_buf, dst_buf, count[0],proc); } else #endif if(levels) ARMCI_PutS(src_buf, stride, dst_buf, stride, count, stride_levels,proc); else ARMCI_Put(src_buf, dst_buf,count[0], proc); if(CHECK_RESULT) { ARMCI_GetS(dst_buf, stride, tmp_buf, stride, count, stride_levels, proc); sprintf(check_type, "ARMCI_PutS:"); check_result(tmp_buf, src_buf, stride, count, stride_levels); } /* prepare next src and dst ptrs: avoid cache locality */ if(bal == 0) { src_buf += 128; dst_buf += 128; bal = 1; } else { src_buf -= 128; dst_buf -= 128; bal = 0; } } stop_time = TIMER(); total_time = (stop_time - start_time); if(CHECK_RESULT) free(tmp_buf); if(total_time == 0.0){ total_time=0.000001; /* workaround for inaccurate timers */ warn_accuracy++; } return(total_time/loop); }
int main(int argc, char *argv[]) { int i, j, rank, nranks; int xdim, ydim; long bufsize; double **buffer; double t_start=0.0, t_stop=0.0; int count[2], src_stride, trg_stride, stride_level, peer; double expected, actual; int provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); if (nranks < 2) { printf("%s: Must be run with at least 2 processes\n", argv[0]); MPI_Abort(MPI_COMM_WORLD, 1); } ARMCI_Init_args(&argc, &argv); bufsize = MAX_XDIM * MAX_YDIM * sizeof(double); buffer = (double **) malloc(sizeof(double *) * nranks); ARMCI_Malloc((void **) buffer, bufsize); for(i=0; i< bufsize/sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } if(rank == 0) { printf("ARMCI_PutS Latency - local and remote completions - in usec \n"); printf("%30s %22s %22s\n", "Dimensions(array of doubles)", "Latency-LocalCompeltion", "Latency-RemoteCompletion"); fflush(stdout); } src_stride = MAX_YDIM*sizeof(double); trg_stride = MAX_YDIM*sizeof(double); stride_level = 1; ARMCI_Barrier(); for(xdim=1; xdim<=MAX_XDIM; xdim*=2) { count[1] = xdim; for(ydim=1; ydim<=MAX_YDIM; ydim*=2) { count[0] = ydim*sizeof(double); if(rank == 0) { peer = 1; for(i=0; i<ITERATIONS+SKIP; i++) { if(i == SKIP) t_start = MPI_Wtime(); ARMCI_PutS((void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); } t_stop = MPI_Wtime(); ARMCI_Fence(peer); char temp[10]; sprintf(temp,"%dX%d", xdim, ydim); printf("%30s %20.2f", temp, ((t_stop-t_start)*1000000)/ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); for(i=0; i<ITERATIONS+SKIP; i++) { if(i == SKIP) t_start = MPI_Wtime(); ARMCI_PutS((void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); ARMCI_Fence(peer); } t_stop = MPI_Wtime(); printf("%20.2f \n", ((t_stop-t_start)*1000000)/ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); } else { peer = 0; expected = (1.0 + (double) peer); ARMCI_Barrier(); if (rank == 1) { for(i=0; i<xdim; i++) { for(j=0; j<ydim; j++) { actual = *(buffer[rank] + i*MAX_YDIM + j); if(actual != expected) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, expected, actual); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } } for(i=0; i< bufsize/sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_Barrier(); ARMCI_Barrier(); if (rank == 1) { for(i=0; i<xdim; i++) { for(j=0; j<ydim; j++) { actual = *(buffer[rank] + i*MAX_YDIM + j); if(actual != expected) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, expected, actual); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } for(i=0; i< bufsize/sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } } ARMCI_Barrier(); } } } ARMCI_Barrier(); ARMCI_Free((void *) buffer[rank]); free(buffer); ARMCI_Finalize(); MPI_Finalize(); return 0; }