void print_array(int myid) { int i, j, k; double **buf; int ii, jj; int edge; int ibs, jbs, skip; buf = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *)); for(i=0; i<nblocks; i++) for(j=0; j<nblocks; j++) if(block_owner(i, j) == myid) buf[i+j*nblocks] = a[i+j*nblocks]; else { buf[i+j*nblocks] = (double *)ARMCI_Malloc_local(block_size*block_size* sizeof(double)); get_remote(buf[i+j*nblocks], i, j); } /* copied from lu.C */ edge = n%block_size; for (i=0; i<n; i++) { for (j=0; j<n; j++) { if ((n - i) <= edge) { ibs = edge; ibs = n-edge; skip = edge; } else { ibs = block_size; skip = block_size; } if ((n - j) <= edge) { jbs = edge; jbs = n-edge; } else { jbs = block_size; } ii = (i/block_size) + (j/block_size)*nblocks; jj = (i%ibs)+(j%jbs)*skip; printf("%8.1f ", buf[ii][jj]); } printf("\n"); } fflush(stdout); for(i=0; i<nblocks; i++) for(j=0; j<nblocks; j++) if(block_owner(i, j) != myid) ARMCI_Free_local(buf[i+j*nblocks]); ARMCI_Free_local(buf); }
int main(int argc, char * argv[]) { void *baseAddress[MAX_PROCESSORS]; char *local; int thisImage; int iter = 100, size; double startTime, endTime; int i; // initialize ARMCI_Init(); ARMCI_Myid(&thisImage); // allocate data (collective operation) ARMCI_Malloc(baseAddress, MAX_BUF_SIZE*sizeof(char)); local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char)); ARMCI_Barrier(); ARMCI_Migrate(); if (thisImage == 0) { for(size = 1; size <= MAX_BUF_SIZE; size = size<<1){ startTime = CkWallTimer(); for(i = 0; i < iter; i++){ ARMCI_Put(local, baseAddress[1], size, 1); } ARMCI_Fence(1); endTime = CkWallTimer(); printf("%d: %f us\n", size, (endTime-startTime)*1000); } ARMCI_Barrier(); } else if (thisImage == 1) { ARMCI_Barrier(); } ARMCI_Free(baseAddress[thisImage]); ARMCI_Free_local(local); // finalize ARMCI_Finalize(); return 0; }
int main(int argc, char **argv) { int i, j, rank, nranks, peer, bufsize, errors; double **buffer, *src_buf; int count[2], src_stride, trg_stride, stride_level; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); buffer = (double **) malloc(sizeof(double *) * nranks); bufsize = XDIM * YDIM * sizeof(double); ARMCI_Malloc((void **) buffer, bufsize); src_buf = ARMCI_Malloc_local(bufsize); if (rank == 0) printf("ARMCI Strided Put Test:\n"); src_stride = XDIM * sizeof(double); trg_stride = XDIM * sizeof(double); stride_level = 1; count[1] = YDIM; count[0] = XDIM * sizeof(double); ARMCI_Barrier(); peer = (rank+1) % nranks; for (i = 0; i < ITERATIONS; i++) { for (j = 0; j < XDIM*YDIM; j++) { *(src_buf + j) = rank + i; } ARMCI_PutS( src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); } ARMCI_Barrier(); ARMCI_Access_begin(buffer[rank]); for (i = errors = 0; i < XDIM; i++) { for (j = 0; j < YDIM; j++) { const double actual = *(buffer[rank] + i + j*XDIM); const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS); if (actual - expected > 1e-10) { printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n", rank, j, i, expected, actual); errors++; fflush(stdout); } } } ARMCI_Access_end(buffer[rank]); ARMCI_Free((void *) buffer[rank]); ARMCI_Free_local(src_buf); free(buffer); ARMCI_Finalize(); MPI_Finalize(); if (errors == 0) { printf("%d: Success\n", rank); return 0; } else { printf("%d: Fail\n", rank); return 1; } }
int main(int argc, char* argv[]) { int provided; int i, rank, nranks, msgsize, target; long bufsize; int **counter; int *complete; int increment; int counter_fetch; int counters_received; int t_start, t_stop, t_latency; int expected; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); ARMCI_Init_args(&argc, &argv); complete = (int *) malloc(sizeof(int) * COUNT); counter = (int**) ARMCI_Malloc_local( nranks * sizeof(int*) ); ARMCI_Malloc((void *) counter[rank], sizeof(int)); if (rank == 0) { printf("ARMCI_RMW Test - in usec \n"); fflush(stdout); } target = 0; for(i=0; i<COUNT; i++) { complete[i] = 0; } if(rank == target) { *(counter[rank]) = 0; } increment = 1; counter_fetch = 0; counters_received = 0; MPI_Barrier(MPI_COMM_WORLD); while(counter_fetch < COUNT) { ARMCI_Rmw(ARMCI_FETCH_AND_ADD, (void *) &counter_fetch, (void *) counter[target], increment, target); /* s/1/rank/ means we will know who got the counter */ if (counter_fetch < COUNT) complete[counter_fetch] = rank; counters_received++; } MPI_Allreduce(MPI_IN_PLACE,complete,COUNT,MPI_INT,MPI_SUM,MPI_COMM_WORLD); for(i=0; i<COUNT; i++) { if (complete[i] == 0) { printf("[%d] The RMW update failed at index: %d \n", rank, i); fflush(stdout); exit(-1); } } printf("[%d] The RMW update completed successfully \n", rank); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD); if (0==rank) { printf("Checking for fairness...\n", rank); fflush(stdout); for(i=0; i<COUNT; i++) { printf("counter value %d was received by process %d\n", i, complete[i]); } fflush(stdout); } MPI_Barrier(MPI_COMM_WORLD); printf("process %d received %d counters\n", rank, counters_received); fflush(stdout); ARMCI_Free(counter[rank]); ARMCI_Free_local(counter); ARMCI_Finalize(); MPI_Finalize(); return 0; }
void test_2D() { int i; int src, dst; int ierr; double *buf; void *ptr[MAXPROC], *get_ptr[MAXPROC]; /* find who I am and the dst process */ src = me; #ifdef MALLOC_LOC if(me == 0) { buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double)); assert(buf != NULL); } #else if(me == 0) { buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(buf != NULL); } #endif ierr = ARMCI_Malloc(ptr, (SIZE * SIZE * sizeof(double))); assert(ierr == 0); assert(ptr[me]); ierr = ARMCI_Malloc(get_ptr, (SIZE * SIZE * sizeof(double))); assert(ierr == 0); assert(get_ptr[me]); /* ARMCI - initialize the data window */ fill_array(ptr[me], SIZE*SIZE, me); fill_array(get_ptr[me], SIZE*SIZE, me); MP_BARRIER(); /* only the proc 0 doest the work */ /* print the title */ if(me == 0) { if(!CHECK_RESULT){ printf(" section get put"); printf(" acc\n"); printf("bytes loop sec MB/s sec MB/s"); printf(" sec MB/s\n"); printf("------- ------ -------- -------- -------- --------"); printf(" -------- --------\n"); fflush(stdout); } for(i=0; i<CHUNK_NUM; i++) { int loop; int bytes = chunk[i] * chunk[i] * sizeof(double); double t_get = 0, t_put = 0, t_acc = 0; double latency_get, latency_put, latency_acc; double bandwidth_get, bandwidth_put, bandwidth_acc; loop = SIZE / chunk[i]; if(loop<2)loop=2; for(dst=1; dst<nproc; dst++) { /* strided get */ fill_array(buf, SIZE*SIZE, me*10); t_get += time_get((double *)(get_ptr[dst]), (double *)buf, chunk[i], loop, dst, 1); /* strided put */ fill_array(buf, SIZE*SIZE, me*10); t_put += time_put((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1); /* strided acc */ fill_array(buf, SIZE*SIZE, me*10); t_acc += time_acc((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1); } latency_get = t_get/(nproc - 1); latency_put = t_put/(nproc - 1); latency_acc = t_acc/(nproc - 1); bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get; bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put; bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc; /* print */ 
if(!CHECK_RESULT)printf("%d\t%d\t%.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, loop, latency_get, bandwidth_get, latency_put, bandwidth_put, latency_acc, bandwidth_acc); } } else sleep(3); ARMCI_AllFence(); MP_BARRIER(); /* cleanup */ ARMCI_Free(get_ptr[me]); ARMCI_Free(ptr[me]); #ifdef MALLOC_LOC if(me == 0) ARMCI_Free_local(buf); #else if(me == 0) free(buf); #endif }
int main(int argc, char **argv) { int i, j, rank, nranks, peer; size_t xdim, ydim; unsigned long bufsize; double **buffer, *src_buf; double t_start=0.0, t_stop; int count[2], src_stride, trg_stride, stride_level; double scaling; int provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); if (nranks < 2) { printf("%s: Must be run with at least 2 processes\n", argv[0]); MPI_Abort(MPI_COMM_WORLD, 1); } ARMCI_Init_args(&argc, &argv); buffer = (double **) malloc(sizeof(double *) * nranks); bufsize = MAX_XDIM * MAX_YDIM * sizeof(double); ARMCI_Malloc((void **) buffer, bufsize); src_buf = ARMCI_Malloc_local(bufsize); if (rank == 0) { printf("ARMCI_AccS Latency - local and remote completions - in usec \n"); printf("%30s %22s %22s\n", "Dimensions(array of double)", "Local Completion", "Remote completion"); fflush(stdout); } ARMCI_Access_begin(buffer[rank]); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; *(src_buf + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); scaling = 2.0; src_stride = MAX_YDIM * sizeof(double); trg_stride = MAX_YDIM * sizeof(double); stride_level = 1; ARMCI_Barrier(); for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) { count[1] = xdim; for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) { count[0] = ydim * sizeof(double); if (rank == 0) { peer = 1; for (i = 0; i < ITERATIONS + SKIP; i++) { if (i == SKIP) t_start = MPI_Wtime(); ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling, /* (void *) buffer[rank] */ src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, 1); } t_stop = MPI_Wtime(); ARMCI_Fence(1); char temp[10]; sprintf(temp, "%dX%d", (int) xdim, (int) ydim); printf("%30s %20.2f ", temp, ((t_stop - t_start) * 1000000) / ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); for (i = 0; i < ITERATIONS + SKIP; i++) { if (i == SKIP) t_start = MPI_Wtime(); ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling, /* (void 
*) buffer[rank] */ src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, 1); ARMCI_Fence(1); } t_stop = MPI_Wtime(); printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); } else { peer = 0; ARMCI_Barrier(); if (rank == 1) { ARMCI_Access_begin(buffer[rank]); for (i = 0; i < xdim; i++) { for (j = 0; j < ydim; j++) { if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP))) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, ((1.0 + rank) + scaling * (1.0 + peer)), *(buffer[rank] + i * MAX_YDIM + j)); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); } ARMCI_Barrier(); ARMCI_Barrier(); if (rank == 1) { ARMCI_Access_begin(buffer[rank]); for (i = 0; i < xdim; i++) { for (j = 0; j < ydim; j++) { if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP))) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, ((1.0 + rank) + scaling * (1.0 + peer)), *(buffer[rank] + i * MAX_YDIM + j)); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); } ARMCI_Barrier(); } } } ARMCI_Barrier(); ARMCI_Free((void *) buffer[rank]); ARMCI_Free_local(src_buf); free(buffer); ARMCI_Finalize(); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int me,nproc; int status; int rank; /* initialization */ MPI_Init(&argc, &argv); ARMCI_Init(); #ifdef HPC_PROFILING HPM_Init(); #endif MPI_Comm_rank(MPI_COMM_WORLD,&me); MPI_Comm_size(MPI_COMM_WORLD,&nproc); #ifdef DEBUG if(me == 0){ printf("The result of MPI_Comm_size is %d\n",nproc); fflush(stdout); } #endif /* get the matrix parameters */ if (argc > 1){ rank = atoi(argv[1]); } else { rank = 8; } if (me == 0){ printf("Running matmul.x with rank = %d\n",rank); fflush(stdout); } /* register remote pointers */ double** addr_A = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_A == NULL) ARMCI_Error("malloc A failed at line",0); double** addr_B = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_B == NULL) ARMCI_Error("malloc B failed at line",0); double** addr_C = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_C == NULL) ARMCI_Error("malloc C failed at line",0); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc A requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_A, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc A failed",status); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc B requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_B, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc B failed",status); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc C requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_C, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc C failed",status); MPI_Barrier(MPI_COMM_WORLD); /* free ARMCI pointers */ ARMCI_Free_local(addr_C); ARMCI_Free_local(addr_B); ARMCI_Free_local(addr_A); #ifdef HPC_PROFILING HPM_Print(); #endif /* the end */ ARMCI_Finalize(); MPI_Finalize(); return(0); }
/*
 * Reverse a distributed 1-D integer array A into a second distributed array B.
 *
 * The array of dims[0] elements is split unevenly over all processes; map[p]
 * is the first global index held by process p.  Each process reverses its own
 * piece of A into a local buffer and then puts the pieces of that buffer into
 * the processes that hold the mirrored index range of B.  The put phase is
 * repeated `loop + warmup` times and only the last `loop` rounds are timed.
 * The result is checked with VERIFY() before everything is released.
 */
void TRANSPOSE1D()
{
  int dims[1];
  int my_count, p, idx, ierr;
  int first, last;            /* global index range of B written by this process */
  int first_owner, last_owner;
  int *reversed, *map;
  void **a_ptr, **b_ptr;
  int *a, *b;

  /* Find local processor ID and number of processors */
  int me = armci_msg_me();
  int nprocs = armci_msg_nproc();

  /* Allocate pointers to data on all processors */
  a_ptr = (void**)malloc(nprocs*sizeof(int*));
  b_ptr = (void**)malloc(nprocs*sizeof(int*));
  map = (int*)malloc(nprocs*sizeof(int));

  /* Configure array dimensions. Force an unequal data distribution */
  dims[0] = nprocs*TOTALELEMS + nprocs/2;
  if (me == 0) printf("Size of array: %d\n\n",dims[0]);

  /* First (zero-based) global index of the chunk owned by each processor */
  for (p = 0; p < nprocs; p++) {
    map[p] = (int)(((double)p)*(((double)dims[0])/((double)nprocs)));
  }

  /* Size of my portion of the array */
  my_count = (me < nprocs-1) ? (map[me+1] - map[me]) : (dims[0] - map[me]);

  /* Allocate memory for array A */
  ierr = ARMCI_Malloc(a_ptr, my_count*sizeof(int));
  assert(ierr == 0);
  assert(a_ptr[me]);

  /* Allocate memory for array B */
  ierr = ARMCI_Malloc(b_ptr, my_count*sizeof(int));
  assert(ierr == 0);
  assert(b_ptr[me]);

  /* A holds the 1-based global index of each element, B starts zeroed */
  a = (int*)a_ptr[me];
  b = (int*)b_ptr[me];
  for (idx = 0; idx < my_count; idx++) {
    a[idx] = idx + map[me] + 1;
    b[idx] = 0;
  }

  /* Everyone must have its data in place before the exchange starts */
  armci_msg_barrier();

  /* Local buffer holding my piece of A in reverse order */
  reversed = (int*)ARMCI_Malloc_local(my_count*sizeof(int));
  a = (int*)a_ptr[me];
  for (idx = 0; idx < my_count; idx++) {
    reversed[idx] = a[my_count-idx-1];
  }

  /* Global index range of B that receives the reversed piece */
  first = dims[0] - (map[me] + my_count);
  last = dims[0] - map[me] - 1;

  /* Locate the processors containing the two endpoints */
  first_owner = 0;
  for (p = 0; p < nprocs && first >= map[p]; p++) {
    first_owner = p;
  }
  last_owner = nprocs-1;
  for (p = nprocs-2; p >= 0 && last < map[p+1]; p--) {
    last_owner = p;
  }

  int loop = 4, round;
  int warmup = 2;
  double t_start, t_end;

  for (round = 0; round < loop + warmup; ++round) {
    if (round == warmup) {
      t_start = dclock();
    }

    /* Send each destination processor the part of the range it owns */
    for (p = first_owner; p <= last_owner; p++) {
      /* Global index range owned by processor p */
      int own_lo = map[p];
      int own_hi = (p < nprocs-1) ? (map[p+1] - 1) : (dims[0] - 1);

      /* Overlap between that range and the range I have to write */
      int send_lo = (own_lo > first) ? own_lo : first;
      int send_hi = (own_hi < last) ? own_hi : last;

      /* Offsets on source and destination processors */
      int src_offset = send_lo - first;
      int dst_offset = send_lo - own_lo;
      void *src_ptr = (void*)(reversed + src_offset);
      void *dst_ptr = ((char*)b_ptr[p]) + sizeof(int)*dst_offset;

      /* Length of data (in bytes) to be sent to processor p */
      int length = sizeof(int)*(send_hi-send_lo+1);

      ARMCI_Put(src_ptr, dst_ptr, length, p);
    }

    ARMCI_AllFence();
    armci_msg_barrier();
  }
  t_end = dclock();

  if (0 == me) {
    printf("Procs = [%d], Time[%6.2f]ms\n", nprocs,
           (t_end - t_start) / (loop * 1.0e3));
  }

  ARMCI_Free_local(reversed);

  VERIFY(b_ptr, dims, map);

  free(map);
  armci_msg_barrier();
  ARMCI_Free(a_ptr[me]);
  ARMCI_Free(b_ptr[me]);
  free(a_ptr);
  free(b_ptr);
}
/*
 * Blocked, distributed LU factorization step loop.
 *
 * Parameters:
 *   n  - upper bound of the element index loops (k, i and j run below n)
 *   bs - step of those loops, i.e. the nominal block edge length
 *   me - id of the calling process, compared against block_owner()
 *
 * The matrix is addressed through the file-level block table `a`
 * (block (I,J) is a[I + J*nblocks]).  For every diagonal block K the routine
 *   1. lets the owner of (K,K) process it with lu0(),
 *   2. obtains that diagonal block (locally or via get_remote() into dbuf),
 *   3. updates the owned blocks of block column K with bdiv() and of block
 *      row K with bmodd(), pushing each updated block with ARMCI_NbPut()
 *      into the bufc / bufr slots of the other processes that own blocks of
 *      the same block row / block column,
 *   4. after waiting for those puts and a barrier, updates every owned
 *      trailing block (I,J) with bmod() using either the local block or the
 *      copy received in bufc / bufr.
 *
 * NOTE(review): the numerical meaning of lu0/bdiv/bmodd/bmod is not visible
 * here; the names suggest the usual blocked-LU kernels -- confirm against
 * their definitions.
 */
void lu(int n, int bs, int me) {
  int i, il, j, jl, k, kl;
  int I, J, K;
  double *A, *B, *C, *D;
  int dimI, dimJ, dimK;
  int strI, strJ, strK;
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner, destp, hc, m;
  double *dbuf;
  armci_hdl_t handle[2*MAXPROC];
  int saved[MAXPROC];

  /* Scratch space for one block: receives a remote diagonal block. */
  dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));

  for (k=0, K=0; k<n; k+=bs, K++) {
    /* kl is one past the last index of block K; strK is its edge length,
       which is shorter than bs only when the block is clipped at n. */
    kl = k + bs;
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    }

    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me) {
      A = a[K+K*nblocks];
      lu0(A, strK, strK); /* impl algo on this diag block */
    }

    /* Everyone waits here before the diagonal block is read below. */
    MP_BARRIER();

    /* divide column k by diagonal block */
    if(block_owner(K, K) == me)
      D = a[K+K*nblocks];
    else {
      /* Not the owner: fetch a copy of the diagonal block into dbuf. */
      D = dbuf;
      get_remote(D, K, K);
    }

    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me) { /* parcel out blocks */
        il = i + bs;
        if (il > n) {
          il = n;
          strI = il - i;
        } else {
          strI = bs;
        }
        A = a[I+K*nblocks];
        bdiv(A, D, strI, strK, strI, strK);

        /* Pre-put this block to the block-owners of all blocks on the
           I-th row with a non-blocking put.  saved[] makes sure each
           destination process receives the block only once. */
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
          destp = block_owner (I, m);
          if (destp != me && !saved[destp]) {
            ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
            saved[destp] = 1;
          }
        }
      }
    } /* end of for (i=kl, I=K+1...) */

    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me) { /* parcel out blocks */
        jl = j+bs;
        if (jl > n) {
          jl = n;
          strJ = jl - j;
        } else {
          strJ = bs;
        }
        A = a[K+J*nblocks];
        bmodd(D, A, strK, strJ, strK, strK);

        /* Pre-put this block to the block-owners of all blocks on the
           J-th column with a non-blocking put, again once per process. */
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
          destp = block_owner (m, J);
          if (destp != me && !saved[destp]) {
            ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
            saved[destp] = 1;
          }
        }
      }
    }

    /* Wait for the puts issued above, fence, and synchronise before the
       bufc / bufr slots are read in the trailing update. */
    ARMCI_WaitAll();
    ARMCI_AllFence();
    MP_BARRIER();

    /* modify subsequent block columns */
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs;
      if (il > n) {
        il = n;
        strI = il - i;
      } else {
        strI = bs;
      }
      for (j=kl, J=K+1; j<n; j+=bs, J++) {
        jl = j + bs;
        if (jl > n) {
          jl = n;
          strJ= jl - j;
        } else {
          strJ = bs;
        }
        if (block_owner(I, J) == me) { /* parcel out blocks */
          /* Column-K block of row I: local if owned, otherwise the copy
             another process put into this process's bufc slot. */
          if(block_owner(I,K) == me)
            A = a[I+K*nblocks];
          else {
            A = bufc[me*nblocks+I];
          }
          /* Row-K block of column J: same scheme with bufr. */
          if(block_owner(K,J) == me)
            B = a[K+J*nblocks];
          else
            B = bufr[me*nblocks + J];
          C = a[I+J*nblocks];
          bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
        }
      }
    }
  }

  ARMCI_Free_local(dbuf);
}
void test_2D() { int i = 0; int dst = 0; int g_a = 0; int shape[2] = {SIZE*nproc, SIZE}; int dist[2] = {SIZE, SIZE}; int lo[2] = {0,0}; int hi[2] = {0,0}; double *buf = NULL; /* allocate the GA */ #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, 2, shape, "2d", dist, &g_a ); ElGlobalArraysDistribution_d( eldga, g_a, me, lo, hi ); #else g_a = NGA_Create(C_DBL, 2, shape, "2d", dist); NGA_Distribution(g_a, me, lo, hi); #endif assert(hi[0]-lo[0]+1 == SIZE); assert(hi[1]-lo[1]+1 == SIZE); /* memory allocation */ if (me == 0) { #if MALLOC_LOC buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double)); assert(buf != NULL); #else buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(buf != NULL); #endif } /* only the proc 0 does the work */ if (me == 0) { if (!CHECK_RESULT) { printf(" section get put acc\n"); printf("bytes loop usec MB/s usec MB/s usec MB/s\n"); printf("------- ------ -------- -------- -------- -------- -------- --------\n"); fflush(stdout); } for (i=0; i<CHUNK_NUM; ++i) { int loop; double intcal; int bytes = chunk[i] * chunk[i] * sizeof(double); double t_get = 0, t_put = 0, t_acc = 0; double latency_get, latency_put, latency_acc; double bandwidth_get, bandwidth_put, bandwidth_acc; intcal = (double)((double)(SIZE * SIZE) / (double)(chunk[i] * chunk[i])); loop = (int)(sqrt((double)intcal)); if (loop < 2) { loop = 2; } for (dst=1; dst<nproc; ++dst) { /* strided get */ fill_array(buf, SIZE * SIZE, me * 10); t_get += time_op(g_a, buf, chunk[i], loop, dst, 2, OP_GET); /* strided put */ fill_array(buf, SIZE * SIZE, me * 10); t_put += time_op(g_a, buf, chunk[i], loop, dst, 2, OP_PUT); /* strided acc */ fill_array(buf, SIZE * SIZE, me * 10); t_acc += time_op(g_a, buf, chunk[i], loop, dst, 2, OP_ACC); } latency_get = t_get / (nproc - 1); latency_put = t_put / (nproc - 1); latency_acc = t_acc / (nproc - 1); bandwidth_get = (bytes * (nproc - 1) * 1e-6) / t_get; bandwidth_put = (bytes * (nproc - 1) * 1e-6) / t_put; bandwidth_acc = (bytes * (nproc 
- 1) * 1e-6) / t_acc; /* print */ if (!CHECK_RESULT) { printf("%d\t%d\t%.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, loop, latency_get / 1e-6, bandwidth_get, latency_put / 1e-6, bandwidth_put, latency_acc / 1e-6, bandwidth_acc); } } } else { sleep(3); } #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); #else GA_Sync(); #endif #if ENABLE_CLEANUP /* cleanup */ #if defined(USE_ELEMENTAL) ElGlobalArraysDestroy_d( eldga, g_a ); #else GA_Destroy(g_a); #endif #endif if (me == 0) { #if MALLOC_LOC ARMCI_Free_local(buf); #else free(buf); #endif } }