double time_get(double *src_buf, double *dst_buf, int chunk, int loop, int proc, int levels) { int i, bal = 0; int stride[2]; int count[2]; int stride_levels = levels; double *tmp_buf, *tmp_buf_ptr; double start_time, stop_time, total_time = 0; stride[0] = SIZE * sizeof(double); count[0] = chunk * sizeof(double); count[1] = chunk; if(CHECK_RESULT) { tmp_buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(tmp_buf != NULL); fill_array(tmp_buf, SIZE*SIZE, proc); tmp_buf_ptr = tmp_buf; } start_time = TIMER(); for(i=0; i<loop; i++) { #ifdef FORCE_1D int j; if(levels>0)for(j=0; j< count[1]; j++){ char *s = (char*) src_buf, *d= (char*)dst_buf; s += j*stride[0]; d += j*stride[0]; ARMCI_Get(src_buf, dst_buf, count[0],proc); } else #endif if(levels) ARMCI_GetS(src_buf, stride, dst_buf, stride, count, stride_levels,proc); else ARMCI_Get(src_buf, dst_buf,count[0], proc); if(CHECK_RESULT) { sprintf(check_type, "ARMCI_GetS:"); check_result(tmp_buf_ptr, dst_buf, stride, count, stride_levels); } /* prepare next src and dst ptrs: avoid cache locality */ if(bal == 0) { src_buf += 128; dst_buf += 128; if(CHECK_RESULT) tmp_buf_ptr += 128; bal = 1; } else { src_buf -= 128; dst_buf -= 128; if(CHECK_RESULT) tmp_buf_ptr -= 128; bal = 0; } } stop_time = TIMER(); total_time = (stop_time - start_time); if(CHECK_RESULT) free(tmp_buf); if(total_time == 0.0){ total_time=0.000001; /* workaround for inaccurate timers */ warn_accuracy++; } return(total_time/loop); }
/* test Put/Get/Acc sequence regardless of communication pattern * tgt -- remote target for put/get/acc (none if -1) * rmt -- list of remote thread that put/acc to here (correctness is cheked here) * rmt_cnt -- # of threads in rmt */ void test_PutGetAcc(int th_idx, int tgt, int *rmt, int rmt_cnt) { /* a - local thread, b - remote thread */ int a, b, b_proc, stride[2], count[2]; int i, j; void *src, *dst; #ifdef DEBUG for (i = 0, cbufl = 0; i < rmt_cnt; i++) cbufl += sprintf(cbuf+cbufl, " %d", rmt[i]); prndbg(th_idx, "test_PutGetAcc: put/acc to %d, get from %d, check put/acc from %s\n", tgt, tgt, rmt_cnt ? cbuf : "none"); #endif a = TH_ME; stride[0] = ASIZE_BYTES; count[0] = ASIZE_BYTES; count[1] = 1; /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* put - put a.ptrs1[b] into b.ptrs2[a] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */ dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */ // assert(!ARMCI_Put(src, dst, ASIZE_BYTES, b_proc)); assert(!ARMCI_PutS(src, stride, dst, stride, count, 1, b_proc)); } ARMCI_Fence(b_proc); } MT_BARRIER(); print_array(th_idx, "PUT:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "PUT:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk put(s) from b(s): a.ptrs2[b] */ for (j = 0; j < rmt_cnt; j++) { b = rmt[j]; b_proc = TH2PROC(b); check_PutGetAcc(th_idx, b, PUT, &AELEM(ptrs2[a], b, 0, 0)); } //return; // REMOVE WHEN DONE /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* get - get b.ptrs1[a] into a.ptrs2[b] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[b], a, i, 0); /* b.ptrs1[a] */ dst = &AELEM(ptrs2[a], b, i, 0); /* a.ptrs2[b] */ assert(!ARMCI_GetS(src, stride, dst, stride, count, 1, b_proc)); } } print_array(th_idx, "GET:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "GET:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk get from b: a.ptrs2[b] */ if (tgt != -1) { check_PutGetAcc(th_idx, b, GET, &AELEM(ptrs2[a], b, 0, 0)); } #if 1 /* init arrays */ init_array(th_idx, ptrs1[TH_ME]); init_array(th_idx, ptrs2[TH_ME]); MT_BARRIER(); /* acc - acc a.ptrs1[b] * scale + b.ptrs2[a] into b.ptrs2[a] */ if (tgt != -1) { b = tgt; b_proc = TH2PROC(b); for (i = 0; i < iters; i++) { src = &AELEM(ptrs1[a], b, i, 0); /* a.ptrs1[b] */ dst = &AELEM(ptrs2[b], a, i, 0); /* b.ptrs2[a] */ assert(!ARMCI_AccS(ARMCI_ACC_DBL,&scale,src,stride,dst,stride,count,1,b_proc)); } ARMCI_Fence(b_proc); } MT_BARRIER(); print_array(th_idx, "ACC:ptrs1[TH_ME]", ptrs1[TH_ME]); print_array(th_idx, "ACC:ptrs2[TH_ME]", ptrs2[TH_ME]); MT_BARRIER(); /* chk acc(s) from b(s): a.ptrs2[b] */ for (j = 0; j < rmt_cnt; j++) { b = rmt[j]; b_proc = TH2PROC(b); check_PutGetAcc(th_idx, b, ACC, &AELEM(ptrs2[a], b, 0, 0)); } #endif MT_BARRIER(); }
double time_acc(double *src_buf, double *dst_buf, int chunk, int loop, int proc, int levels) { int i, bal = 0; int stride[2]; int count[2]; int stride_levels = levels; double *before_buf, *after_buf; double start_time, stop_time, total_time = 0; stride[0] = SIZE * sizeof(double); count[0] = chunk * sizeof(double); count[1] = chunk; if(CHECK_RESULT) { before_buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(before_buf != NULL); after_buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(after_buf != NULL); } start_time = TIMER(); for(i=0; i<loop; i++) { double scale = (double)i; if(CHECK_RESULT) { ARMCI_GetS(dst_buf, stride, before_buf, stride, count, stride_levels, proc); acc_array(scale, before_buf, src_buf, stride, count,stride_levels); } ARMCI_AccS(ARMCI_ACC_DBL, &scale, src_buf, stride, dst_buf, stride, count, stride_levels, proc); if(CHECK_RESULT) { ARMCI_GetS(dst_buf, stride, after_buf, stride, count, stride_levels, proc); sprintf(check_type, "ARMCI_AccS:"); check_result(after_buf, before_buf, stride, count, stride_levels); } /* prepare next src and dst ptrs: avoid cache locality */ if(bal == 0) { src_buf += 128; dst_buf += 128; bal = 1; } else { src_buf -= 128; dst_buf -= 128; bal = 0; } } stop_time = TIMER(); total_time = (stop_time - start_time); if(CHECK_RESULT) { free(before_buf); free(after_buf); } if(total_time == 0.0){ total_time=0.000001; /* workaround for inaccurate timers */ warn_accuracy++; } return(total_time/loop); }
int main(int argc, char **argv) { int i, j, rank, nranks, peer, bufsize, errors; double **buffer, *src_buf, *dst_buf; int count[2], src_stride, trg_stride, stride_level; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); buffer = (double **) malloc(sizeof(double *) * nranks); bufsize = XDIM * YDIM * sizeof(double); ARMCI_Malloc((void **) buffer, bufsize); src_buf = ARMCI_Malloc_local(bufsize); dst_buf = ARMCI_Malloc_local(bufsize); if (rank == 0) printf("ARMCI Strided Put Test:\n"); src_stride = XDIM * sizeof(double); trg_stride = XDIM * sizeof(double); stride_level = 1; count[1] = YDIM; count[0] = XDIM * sizeof(double); ARMCI_Barrier(); peer = (rank+1) % nranks; for (i = 0; i < ITERATIONS; i++) { for (j = 0; j < XDIM*YDIM; j++) { *(src_buf + j) = rank + i; } ARMCI_PutS( src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); ARMCI_GetS( (void *) buffer[peer], &trg_stride, dst_buf, &src_stride, count, stride_level, peer); } ARMCI_Barrier(); ARMCI_Access_begin(buffer[rank]); for (i = errors = 0; i < XDIM; i++) { for (j = 0; j < YDIM; j++) { const double actual = *(buffer[rank] + i + j*XDIM); const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS); if (actual - expected > 1e-10) { printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n", rank, j, i, expected, actual); errors++; fflush(stdout); } } } ARMCI_Access_end(buffer[rank]); ARMCI_Free((void *) buffer[rank]); ARMCI_Free_local(src_buf); ARMCI_Free_local(dst_buf); free(buffer); ARMCI_Finalize(); MPI_Finalize(); if (errors == 0) { printf("%d: Success\n", rank); return 0; } else { printf("%d: Fail\n", rank); return 1; } }