void test_2D() { int i; int src, dst; int ierr; double *buf; void *ptr[MAXPROC], *get_ptr[MAXPROC]; /* find who I am and the dst process */ src = me; #ifdef MALLOC_LOC if(me == 0) { buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double)); assert(buf != NULL); } #else if(me == 0) { buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(buf != NULL); } #endif ierr = ARMCI_Malloc(ptr, (SIZE * SIZE * sizeof(double))); assert(ierr == 0); assert(ptr[me]); ierr = ARMCI_Malloc(get_ptr, (SIZE * SIZE * sizeof(double))); assert(ierr == 0); assert(get_ptr[me]); /* ARMCI - initialize the data window */ fill_array(ptr[me], SIZE*SIZE, me); fill_array(get_ptr[me], SIZE*SIZE, me); MP_BARRIER(); /* only the proc 0 doest the work */ /* print the title */ if(me == 0) { if(!CHECK_RESULT){ printf(" section get put"); printf(" acc\n"); printf("bytes loop sec MB/s sec MB/s"); printf(" sec MB/s\n"); printf("------- ------ -------- -------- -------- --------"); printf(" -------- --------\n"); fflush(stdout); } for(i=0; i<CHUNK_NUM; i++) { int loop; int bytes = chunk[i] * chunk[i] * sizeof(double); double t_get = 0, t_put = 0, t_acc = 0; double latency_get, latency_put, latency_acc; double bandwidth_get, bandwidth_put, bandwidth_acc; loop = SIZE / chunk[i]; if(loop<2)loop=2; for(dst=1; dst<nproc; dst++) { /* strided get */ fill_array(buf, SIZE*SIZE, me*10); t_get += time_get((double *)(get_ptr[dst]), (double *)buf, chunk[i], loop, dst, 1); /* strided put */ fill_array(buf, SIZE*SIZE, me*10); t_put += time_put((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1); /* strided acc */ fill_array(buf, SIZE*SIZE, me*10); t_acc += time_acc((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1); } latency_get = t_get/(nproc - 1); latency_put = t_put/(nproc - 1); latency_acc = t_acc/(nproc - 1); bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get; bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put; bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc; /* print */ 
if(!CHECK_RESULT)printf("%d\t%d\t%.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, loop, latency_get, bandwidth_get, latency_put, bandwidth_put, latency_acc, bandwidth_acc); } } else sleep(3); ARMCI_AllFence(); MP_BARRIER(); /* cleanup */ ARMCI_Free(get_ptr[me]); ARMCI_Free(ptr[me]); #ifdef MALLOC_LOC if(me == 0) ARMCI_Free_local(buf); #else if(me == 0) free(buf); #endif }
void test_2D() { int i; int src, dst; int ierr; double *buf; void *ptr[MAXPROC], *get_ptr[MAXPROC]; /* find who I am and the dst process */ src = me; if(me == 0) { buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(buf != NULL); } ierr = A1_Alloc_segment(&ptr[me], (SIZE * SIZE * sizeof(double))); assert(ierr == 0); ierr = A1_Exchange_segments(A1_GROUP_WORLD, ptr); assert(ierr == 0); ierr = A1_Alloc_segment(&get_ptr[me], (SIZE * SIZE * sizeof(double))); assert(ierr == 0); ierr = A1_Exchange_segments(A1_GROUP_WORLD, get_ptr); assert(ierr == 0); /* A1 - initialize the data window */ fill_array(ptr[me], SIZE*SIZE, me); fill_array(get_ptr[me], SIZE*SIZE, me); A1_Barrier_group(A1_GROUP_WORLD); /* only the proc 0 doest the work */ /* print the title */ if(me == 0) { if(!CHECK_RESULT){ printf(" section get put"); printf(" acc\n"); printf("bytes loop usec MB/s usec MB/s"); printf(" usec MB/s\n"); printf("------- ------ -------- -------- -------- --------"); printf(" -------- --------\n"); fflush(stdout); } for(i=0; i<CHUNK_NUM; i++) { int loop; int bytes = chunk[i] * chunk[i] * sizeof(double); double t_get = 0, t_put = 0, t_acc = 0; double latency_get, latency_put, latency_acc; double bandwidth_get, bandwidth_put, bandwidth_acc; loop = SIZE / chunk[i]; if(loop<2)loop=2; for(dst=1; dst<nproc; dst++) { /* strided get */ fill_array(buf, SIZE*SIZE, me*10); t_get += time_get((double *)(get_ptr[dst]), (double *)buf, chunk[i], loop, dst, 1); /* strided put */ fill_array(buf, SIZE*SIZE, me*10); t_put += time_put((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1); /* strided acc */ fill_array(buf, SIZE*SIZE, me*10); t_acc += time_acc((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1); } latency_get = t_get/(nproc - 1); latency_put = t_put/(nproc - 1); latency_acc = t_acc/(nproc - 1); bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get; bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put; bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc; /* print */ 
if(!CHECK_RESULT) printf("%d\t%d\t %7.2lf %9.2lf %9.2lf %9.2lf %9.2lf %9.2lf\n", bytes, loop, latency_get*1000000, bandwidth_get, latency_put*1000000, bandwidth_put, latency_acc*1000000, bandwidth_acc); } } else sleep(60); A1_Flush_group(A1_GROUP_WORLD); A1_Barrier_group(A1_GROUP_WORLD); /* cleanup */ A1_Release_segments(A1_GROUP_WORLD, get_ptr[me]); A1_Free_segment(get_ptr[me]); A1_Release_segments(A1_GROUP_WORLD, ptr[me]); A1_Free_segment(ptr[me]); if(me == 0) free(buf); }
/*
 * Forward L and time to time_put(), then report a fixed result of 1.
 *
 * NOTE(review): the constant 1 is presumably the number of values left on
 * the Lua stack by time_put() -- confirm against time_put()'s definition.
 */
int time_push(lua_State *L, apr_time_t time)
{
    const int result_count = 1;

    time_put(L, time);

    return result_count;
}