void DDI_ARMCI_Finalize() {
  int code;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);

#if defined DDI_ARMCI_FREE
  code = ARMCI_Free((void*)(gv(armci_mem_addr)[comm->me]));
  if (code != 0) fprintf(stderr,"ARMCI_Free(%p) failed: %i\n",gv(armci_mem_addr)[comm->me],code);

  code = ARMCI_Free((void*)(gv(armci_cnt_addr)[comm->me]));
  if (code != 0) fprintf(stderr,"ARMCI_Free(%p) failed: %i\n",gv(armci_cnt_addr)[comm->me],code);
#endif

  code = ARMCI_Destroy_mutexes();
  if (code != 0) fprintf(stderr,"ARMCI_Destroy_mutexes failed: %i\n",code);

  ARMCI_Finalize();
}
int main(int argc, char ** argv) { int rank, nproc, val, i; void **base_ptrs; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); if (rank == 0) printf("Starting ARMCI mutex read-modify-write test with %d processes\n", nproc); base_ptrs = malloc(nproc*sizeof(void*)); ARMCI_Create_mutexes(rank == 0 ? 1 : 0); ARMCI_Malloc(base_ptrs, (rank == 0) ? sizeof(int) : 0); // Proc 0 has a shared int if (rank == 0) { val = 0; ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0); } ARMCI_Barrier(); for (i = 0; i < NITER; i++) { ARMCI_Lock(0, 0); ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0); val += ADDIN; ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0); ARMCI_Unlock(0, 0); } printf(" + %3d done\n", rank); fflush(NULL); ARMCI_Barrier(); if (rank == 0) { ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0); if (val == ADDIN*nproc*NITER) printf("Test complete: PASS.\n"); else printf("Test complete: FAIL. Got %d, expected %d.\n", val, ADDIN*nproc*NITER); } ARMCI_Free(base_ptrs[rank]); ARMCI_Destroy_mutexes(); free(base_ptrs); ARMCI_Finalize(); MPI_Finalize(); return 0; }
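/* For contrast with the mutex-based test above, the same shared counter can be
   updated with ARMCI's atomic fetch-and-add, which needs no lock at all. The
   program below is an illustrative variant, not part of the original suite: it
   reuses the layout above (process 0 hosts one int) and assumes direct stores
   to one's own ARMCI segment are permitted, as in the other tests here. */
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <armci.h>

#define NITER 1000
#define ADDIN 5

int main(int argc, char **argv) {
  int rank, nproc, i, fetched;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  base_ptrs = malloc(nproc * sizeof(void*));
  ARMCI_Malloc(base_ptrs, (rank == 0) ? sizeof(int) : 0); /* proc 0 hosts the counter */

  if (rank == 0) *(int*) base_ptrs[0] = 0;
  ARMCI_Barrier();

  /* each process atomically adds ADDIN to the remote counter NITER times */
  for (i = 0; i < NITER; i++)
    ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &fetched, base_ptrs[0], ADDIN, 0);

  ARMCI_Barrier();
  if (rank == 0)
    printf("final value %d (expected %d)\n",
           *(int*) base_ptrs[0], ADDIN*nproc*NITER);

  ARMCI_Free(base_ptrs[rank]);
  free(base_ptrs);
  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}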
int main(int argc, char **argv) {
  int k, i;
  double **myptrs[10];
  double t0, t1, tget=0, tnbget=0, tput=0, tnbput=0, tnbwait=0, t2=0;
#if PORTALS
  ARMCI_NetInit();
#endif
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  ARMCI_Init();
  for (k = 0; k < 10; k++) {
    myptrs[k] = (double **)malloc(sizeof(double *)*nprocs);
    ARMCI_Malloc((void **)myptrs[k], 400000*LOOP*sizeof(double));
    for (i = 0; i < LOOP; i++) myptrs[k][me][i] = me + 0.414;
    MPI_Barrier(MPI_COMM_WORLD);
    /* warm up the path before timing */
    for (i = 0; i < LOOP; i++) {
      ARMCI_Get(myptrs[k][(me+1)%nprocs]+i, myptrs[k][me]+i, sizeof(double), (me+1)%nprocs);
      /*if(myptrs[k][me][i]!=0.414+(me+1)%nprocs)ARMCI_Error("errr",myptrs[k][me][i]);*/
    }
    t0 = t1 = tget = tnbget = tput = tnbput = tnbwait = t2 = 0;
    t0 = MPI_Wtime();
    for (i = 0; i < LOOP; i++) {
      ARMCI_Get(myptrs[k][(me+1)%nprocs]+i, myptrs[k][me]+i, sizeof(double), (me+1)%nprocs);
    }
    t1 = MPI_Wtime();
    printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP);
    fflush(stdout);
    t1 = t0 = 0;
    for (i = 0; i < LOOP; i++) {
      armci_hdl_t nbh;
      ARMCI_INIT_HANDLE(&nbh);
      t0 = MPI_Wtime();
      ARMCI_NbGet(myptrs[k][(me+1)%nprocs]+i, myptrs[k][me]+i, sizeof(double), (me+1)%nprocs, &nbh);
      t1 = MPI_Wtime();
      ARMCI_Wait(&nbh);
      t2 = MPI_Wtime();
      tnbget += (t1-t0);
      tnbwait += (t2-t1);
    }
    printf("\nNb Get Latency=%lf Nb Wait=%lf\n", 1e6*tnbget/LOOP, 1e6*tnbwait/LOOP);
    fflush(stdout);
    MPI_Barrier(MPI_COMM_WORLD);
  }
  for (k = 0; k < 10; k++) ARMCI_Free(myptrs[k][me]);
  MPI_Barrier(MPI_COMM_WORLD);
  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
/** @see ddi_armci.h */ void DDI_ARMCI_Memory_finalize() { int code; code = ARMCI_Free(gv(dda_index)); if (code != 0) { fprintf(DDI_STDERR, "%s: ARMCI_Free(%p) returned %i\n", DDI_Id(), gv(dda_index), code); DDI_Error(DDI_ARMCI_MEMORY_FINALIZE_ERROR, DDI_ARMCI_MEMORY_FINALIZE_ERROR_MESSAGE); } code = ARMCI_Destroy_mutexes(); if (code != 0) { fprintf(DDI_STDERR, "%s: ARMCI_Destroy_mutexes() returned %i\n", DDI_Id(), code); DDI_Error(DDI_ARMCI_MEMORY_FINALIZE_ERROR, DDI_ARMCI_MEMORY_FINALIZE_ERROR_MESSAGE); } }
int main(int argc, char **argv) {
  int rank, nranks;
  int provided;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  ARMCI_Init_args(&argc, &argv);
  ARMCI_Barrier();

  int me = armci_msg_me();
  int node = armci_domain_my_id(ARMCI_DOMAIN_SMP);

  printf("MPI_Rank: %d, "
         "armci_msg_nproc: %d "
         "armci_msg_me: %d, "
         "armci_domain_id: %d, "
         "armci_domain_same_id: %d, "
         "armci_domain_my_id: %d, "
         "armci_domain_count: %d, "
         "armci_domain_nprocs: %d, "
         "armci_domain_glob_proc_id: %d \n",
         rank,
         armci_msg_nproc(),
         me,
         armci_domain_id(ARMCI_DOMAIN_SMP, me),
         armci_domain_same_id(ARMCI_DOMAIN_SMP, me),
         armci_domain_my_id(ARMCI_DOMAIN_SMP),
         armci_domain_count(ARMCI_DOMAIN_SMP),
         armci_domain_nprocs(ARMCI_DOMAIN_SMP, node),
         armci_domain_glob_proc_id(ARMCI_DOMAIN_SMP, node, 0));
  fflush(stdout);

  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
int main(int argc, char * argv[]) {
  void *baseAddress[MAX_PROCESSORS];
  char *local;
  int thisImage;
  int iter = 100, size;
  double startTime, endTime;
  int i;

  // initialize
  ARMCI_Init();
  ARMCI_Myid(&thisImage);

  // allocate data (collective operation)
  ARMCI_Malloc(baseAddress, MAX_BUF_SIZE*sizeof(char));
  local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char));

  ARMCI_Barrier();
  ARMCI_Migrate();

  if (thisImage == 0) {
    for (size = 1; size <= MAX_BUF_SIZE; size = size<<1) {
      startTime = CkWallTimer();
      for (i = 0; i < iter; i++) {
        ARMCI_Put(local, baseAddress[1], size, 1);
      }
      ARMCI_Fence(1);
      endTime = CkWallTimer();
      // report the average per-iteration time in microseconds
      printf("%d: %f us\n", size, (endTime-startTime)*1e6/iter);
    }
  }
  // every image takes part in the barrier
  ARMCI_Barrier();

  ARMCI_Free(baseAddress[thisImage]);
  ARMCI_Free_local(local);

  // finalize
  ARMCI_Finalize();
  return 0;
}
int main(int argc, char ** argv) { int rank, nproc, test_iter; void ***base_ptrs; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); if (rank == 0) printf("Starting ARMCI memory allocation test with %d processes\n", nproc); base_ptrs = malloc(sizeof(void**)*NUM_ITERATIONS); // Perform a pile of allocations for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) { if (rank == 0) printf(" + allocation %d\n", test_iter); base_ptrs[test_iter] = malloc(sizeof(void*)*nproc); ARMCI_Malloc((void**)base_ptrs[test_iter], (test_iter % 4 == 0) ? 0 : DATA_SZ); } ARMCI_Barrier(); // Free all allocations for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) { if (rank == 0) printf(" + free %d\n", test_iter); ARMCI_Free(((void**)base_ptrs[test_iter])[rank]); free(base_ptrs[test_iter]); } free(base_ptrs); if (rank == 0) printf("Test complete: PASS.\n"); ARMCI_Finalize(); MPI_Finalize(); return 0; }
void destroy_array(void *ptr[]) {
  int rc;
  ARMCI_Barrier();
  /* keep the free outside assert() so it still runs under NDEBUG */
  rc = ARMCI_Free(ptr[me]);
  assert(rc == 0);
}
void destroy_array(void *ptr[]) {
  int rc;
  armci_msg_barrier();
  /* keep the free outside assert() so it still runs under NDEBUG */
  rc = ARMCI_Free(ptr[me]);
  assert(rc == 0);
}
int main(int argc, char* argv[]) {
  int provided;
  int i, rank, nranks, target;
  int **counter;
  int *complete;
  int increment;
  int counter_fetch;
  int counters_received;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  ARMCI_Init_args(&argc, &argv);

  complete = (int *) malloc(sizeof(int) * COUNT);

  /* local array of pointers, then a collective allocation that fills it
     with the address of one shared int per rank */
  counter = (int**) ARMCI_Malloc_local(nranks * sizeof(int*));
  ARMCI_Malloc((void **) counter, sizeof(int));

  if (rank == 0) {
    printf("ARMCI_RMW Test - in usec \n");
    fflush(stdout);
  }

  target = 0;

  for (i = 0; i < COUNT; i++) {
    complete[i] = 0;
  }
  if (rank == target) {
    *(counter[rank]) = 0;
  }

  increment = 1;
  counter_fetch = 0;
  counters_received = 0;

  MPI_Barrier(MPI_COMM_WORLD);

  while (counter_fetch < COUNT) {
    ARMCI_Rmw(ARMCI_FETCH_AND_ADD, (void *) &counter_fetch,
              (void *) counter[target], increment, target);

    /* store rank+1 so that a counter claimed by rank 0 is distinguishable
       from an unclaimed slot after the sum-reduction below */
    if (counter_fetch < COUNT)
      complete[counter_fetch] = rank + 1;
    counters_received++;
  }

  MPI_Allreduce(MPI_IN_PLACE, complete, COUNT, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  for (i = 0; i < COUNT; i++) {
    if (complete[i] == 0) {
      printf("[%d] The RMW update failed at index: %d \n", rank, i);
      fflush(stdout);
      exit(-1);
    }
  }

  printf("[%d] The RMW update completed successfully \n", rank);
  fflush(stdout);

  MPI_Barrier(MPI_COMM_WORLD);

  if (0 == rank) {
    printf("Checking for fairness...\n");
    fflush(stdout);
    for (i = 0; i < COUNT; i++) {
      printf("counter value %d was received by process %d\n", i, complete[i] - 1);
    }
    fflush(stdout);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  printf("process %d received %d counters\n", rank, counters_received);
  fflush(stdout);

  ARMCI_Free(counter[rank]);
  ARMCI_Free_local(counter);

  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
void test_2D() {
  int i;
  int src, dst;
  int ierr;
  double *buf;
  void *ptr[MAXPROC], *get_ptr[MAXPROC];

  /* find who I am and the dst process */
  src = me;

#ifdef MALLOC_LOC
  if (me == 0) {
    buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double));
    assert(buf != NULL);
  }
#else
  if (me == 0) {
    buf = (double *)malloc(SIZE * SIZE * sizeof(double));
    assert(buf != NULL);
  }
#endif

  ierr = ARMCI_Malloc(ptr, (SIZE * SIZE * sizeof(double)));
  assert(ierr == 0);
  assert(ptr[me]);
  ierr = ARMCI_Malloc(get_ptr, (SIZE * SIZE * sizeof(double)));
  assert(ierr == 0);
  assert(get_ptr[me]);

  /* ARMCI - initialize the data window */
  fill_array(ptr[me], SIZE*SIZE, me);
  fill_array(get_ptr[me], SIZE*SIZE, me);
  MP_BARRIER();

  /* only proc 0 does the work */
  /* print the title */
  if (me == 0) {
    if (!CHECK_RESULT) {
      printf(" section get put");
      printf(" acc\n");
      printf("bytes loop sec MB/s sec MB/s");
      printf(" sec MB/s\n");
      printf("------- ------ -------- -------- -------- --------");
      printf(" -------- --------\n");
      fflush(stdout);
    }

    for (i = 0; i < CHUNK_NUM; i++) {
      int loop;
      int bytes = chunk[i] * chunk[i] * sizeof(double);
      double t_get = 0, t_put = 0, t_acc = 0;
      double latency_get, latency_put, latency_acc;
      double bandwidth_get, bandwidth_put, bandwidth_acc;

      loop = SIZE / chunk[i];
      if (loop < 2) loop = 2;

      for (dst = 1; dst < nproc; dst++) {
        /* strided get */
        fill_array(buf, SIZE*SIZE, me*10);
        t_get += time_get((double *)(get_ptr[dst]), (double *)buf,
                          chunk[i], loop, dst, 1);
        /* strided put */
        fill_array(buf, SIZE*SIZE, me*10);
        t_put += time_put((double *)buf, (double *)(ptr[dst]),
                          chunk[i], loop, dst, 1);
        /* strided acc */
        fill_array(buf, SIZE*SIZE, me*10);
        t_acc += time_acc((double *)buf, (double *)(ptr[dst]),
                          chunk[i], loop, dst, 1);
      }

      latency_get = t_get/(nproc - 1);
      latency_put = t_put/(nproc - 1);
      latency_acc = t_acc/(nproc - 1);

      bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get;
      bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put;
      bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc;

      /* print */
      if (!CHECK_RESULT)
        printf("%d\t%d\t%.2e %.2e %.2e %.2e %.2e %.2e\n",
               bytes, loop,
               latency_get, bandwidth_get,
               latency_put, bandwidth_put,
               latency_acc, bandwidth_acc);
    }
  }
  else
    sleep(3);

  ARMCI_AllFence();
  MP_BARRIER();

  /* cleanup */
  ARMCI_Free(get_ptr[me]);
  ARMCI_Free(ptr[me]);

#ifdef MALLOC_LOC
  if (me == 0) ARMCI_Free_local(buf);
#else
  if (me == 0) free(buf);
#endif
}
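/* fill_array() is a helper defined elsewhere in the suite; it seeds an array
   with a deterministic, caller-dependent pattern so transfers can be checked
   later. A minimal sketch consistent with how it is called above
   (array, element count, seed) -- the real fill pattern may differ: */
void fill_array(double *arr, int count, int which) {
  int i;
  for (i = 0; i < count; i++)
    arr[i] = i * 1.001 + which; /* distinct per seed and per element */
}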
int main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; MP_INIT(argc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } /* num_rows = (int) sqrt((double) nproc); */ /* for (;;) { */ /* num_cols = nproc/num_rows; */ /* if (num_rows*num_cols == nproc) */ /* break; */ /* num_rows--; */ /* } */ nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } nnodes = nproc / 4; if((nnodes * 4) != nproc) { num_cols = nproc - nnodes * 4; nnodes++; num_rows = 1; } else { num_cols = 2; num_rows = 2; } num = (nblocks * nblocks)/nnodes; if((num * nnodes) != (nblocks * nblocks)) num++; #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif edge = n%block_size; if (edge == 0) { edge = block_size; } for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
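/* block_owner() is defined elsewhere in the benchmark. These LU codes assume a
   2D block-cyclic mapping of blocks onto a num_rows x num_cols process grid; a
   sketch in the style of the SPLASH-2 distribution is shown below. This is an
   assumption for illustration -- the variant above, which groups processes in
   fours, may use a cluster-aware mapping instead. */
int block_owner(int I, int J) {
  /* wrap block coordinates onto the process grid */
  return (I % num_rows) * num_cols + (J % num_cols);
}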
int main(int argc, char ** argv) {
  int rank, nproc, i, test_iter;
  int *my_data, *buf;
  void **base_ptrs;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc);

  buf = malloc(DATA_SZ);
  base_ptrs = malloc(sizeof(void*)*nproc);

  for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) {
    if (rank == 0) printf(" + iteration %d\n", test_iter);

    /*** Allocate the shared array ***/
    ARMCI_Malloc(base_ptrs, DATA_SZ);
    my_data = base_ptrs[rank];

    /*** Get from our right neighbor and verify correct data ***/
    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter;
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Get(base_ptrs[(rank+1) % nproc], buf, DATA_SZ, (rank+1) % nproc);

    for (i = 0; i < DATA_NELTS; i++) {
      if (buf[i] != ((rank+1) % nproc)*test_iter) {
        printf("%d: GET expected %d, got %d\n", rank, ((rank+1) % nproc)*test_iter, buf[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }

    ARMCI_Barrier(); // Wait for all gets to complete

    /*** Put to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter;
    ARMCI_Put(buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != ((rank+1) % nproc)*test_iter) {
        printf("%d: PUT expected %d, got %d\n", rank, ((rank+1) % nproc)*test_iter, my_data[i]);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Barrier(); // Wait for all gets to complete

    /*** Accumulate to our left neighbor and verify correct data ***/
    for (i = 0; i < DATA_NELTS; i++) buf[i] = rank;

    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank;
    ARMCI_Access_end(my_data);
    ARMCI_Barrier();

    int scale = test_iter;
    ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc);

    ARMCI_Barrier(); // Wait for all updates to data to complete

    ARMCI_Access_begin(my_data);
    for (i = 0; i < DATA_NELTS; i++) {
      if (my_data[i] != rank + ((rank+1) % nproc)*test_iter) {
        printf("%d: ACC expected %d, got %d\n", rank, rank + ((rank+1) % nproc)*test_iter, my_data[i]);
        //MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    ARMCI_Access_end(my_data);

    ARMCI_Free(my_data);
  }

  free(buf);
  free(base_ptrs);

  if (rank == 0) printf("Test complete: PASS.\n");

  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv) {
  int i, j, rank, nranks, peer;
  size_t xdim, ydim;
  unsigned long bufsize;
  double **buffer, *src_buf;
  double t_start = 0.0, t_stop;
  int count[2], src_stride, trg_stride, stride_level;
  double scaling;
  int provided;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  if (nranks < 2) {
    printf("%s: Must be run with at least 2 processes\n", argv[0]);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  ARMCI_Init_args(&argc, &argv);

  buffer = (double **) malloc(sizeof(double *) * nranks);
  bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
  ARMCI_Malloc((void **) buffer, bufsize);
  src_buf = ARMCI_Malloc_local(bufsize);

  if (rank == 0) {
    printf("ARMCI_AccS Latency - local and remote completions - in usec \n");
    printf("%30s %22s %22s\n", "Dimensions(array of double)", "Local Completion", "Remote completion");
    fflush(stdout);
  }

  ARMCI_Access_begin(buffer[rank]);
  for (i = 0; i < bufsize / sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
    *(src_buf + i) = 1.0 + rank;
  }
  ARMCI_Access_end(buffer[rank]);

  scaling = 2.0;

  src_stride = MAX_YDIM * sizeof(double);
  trg_stride = MAX_YDIM * sizeof(double);
  stride_level = 1;

  ARMCI_Barrier();

  for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) {
    count[1] = xdim;
    for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) {
      count[0] = ydim * sizeof(double);

      if (rank == 0) {
        peer = 1;

        /* local completion */
        for (i = 0; i < ITERATIONS + SKIP; i++) {
          if (i == SKIP) t_start = MPI_Wtime();
          ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling,
                     /* (void *) buffer[rank] */ src_buf, &src_stride,
                     (void *) buffer[peer], &trg_stride,
                     count, stride_level, 1);
        }
        t_stop = MPI_Wtime();
        ARMCI_Fence(1);

        char temp[10];
        sprintf(temp, "%dX%d", (int) xdim, (int) ydim);
        printf("%30s %20.2f ", temp, ((t_stop - t_start) * 1000000) / ITERATIONS);
        fflush(stdout);

        ARMCI_Barrier();
        ARMCI_Barrier();

        /* remote completion */
        for (i = 0; i < ITERATIONS + SKIP; i++) {
          if (i == SKIP) t_start = MPI_Wtime();
          ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling,
                     /* (void *) buffer[rank] */ src_buf, &src_stride,
                     (void *) buffer[peer], &trg_stride,
                     count, stride_level, 1);
          ARMCI_Fence(1);
        }
        t_stop = MPI_Wtime();
        printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS);
        fflush(stdout);

        ARMCI_Barrier();
        ARMCI_Barrier();
      }
      else {
        peer = 0;
        ARMCI_Barrier();
        if (rank == 1) {
          ARMCI_Access_begin(buffer[rank]);
          /* row i of the strided target is MAX_YDIM doubles long */
          for (i = 0; i < xdim; i++) {
            for (j = 0; j < ydim; j++) {
              if (*(buffer[rank] + i * MAX_YDIM + j) !=
                  ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP))) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j,
                       ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP)),
                       *(buffer[rank] + i * MAX_YDIM + j));
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
              }
            }
          }
          for (i = 0; i < bufsize / sizeof(double); i++) {
            *(buffer[rank] + i) = 1.0 + rank;
          }
          ARMCI_Access_end(buffer[rank]);
        }
        ARMCI_Barrier();
        ARMCI_Barrier();

        if (rank == 1) {
          ARMCI_Access_begin(buffer[rank]);
          for (i = 0; i < xdim; i++) {
            for (j = 0; j < ydim; j++) {
              if (*(buffer[rank] + i * MAX_YDIM + j) !=
                  ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP))) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j,
                       ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP)),
                       *(buffer[rank] + i * MAX_YDIM + j));
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
              }
            }
          }
          for (i = 0; i < bufsize / sizeof(double); i++) {
            *(buffer[rank] + i) = 1.0 + rank;
          }
          ARMCI_Access_end(buffer[rank]);
        }
        ARMCI_Barrier();
      }
    }
  }

  ARMCI_Barrier();

  ARMCI_Free((void *) buffer[rank]);
  ARMCI_Free_local(src_buf);
  free(buffer);
  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
int main(int argc, char *argv[]) {
  int rank, nranks;
  size_t i, msgsize, dest;
  size_t iterations, max_msgsize;
  size_t bufsize;
  double **buffer;
  double t_start, t_stop, t_total, d_total;
  double expected, bandwidth;
  int provided;
  armci_hdl_t handle;

  max_msgsize = MAX_MSGSIZE;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  ARMCI_Init_args(&argc, &argv);

  bufsize = max_msgsize * ITERATIONS_LARGE;
  buffer = (double **) malloc(sizeof(double *) * nranks);
  ARMCI_Malloc((void **) buffer, bufsize);

  for (i = 0; i < bufsize / sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
  }

  ARMCI_INIT_HANDLE(&handle);
  ARMCI_SET_AGGREGATE_HANDLE(&handle);

  ARMCI_Barrier();

  if (rank == 0) {
    printf("ARMCI_Put Bandwidth in MBPS \n");
    printf("%20s %22s \n", "Message Size", "Bandwidth");
    fflush(stdout);

    dest = 1;
    expected = 1 + dest;

    for (msgsize = sizeof(double); msgsize <= max_msgsize; msgsize *= 2) {
      if (msgsize <= 16 * 1024) iterations = ITERATIONS_VERYSMALL;
      else if (msgsize <= 64 * 1024) iterations = ITERATIONS_SMALL;
      else if (msgsize <= 512 * 1024) iterations = ITERATIONS_MEDIUM;
      else iterations = ITERATIONS_LARGE;

      t_start = MPI_Wtime();
      for (i = 0; i < iterations; i++) {
        /* put the local buffer into the remote window on dest */
        ARMCI_NbPut((void *) ((size_t) buffer[rank] + (size_t)(i * msgsize)),
                    (void *) ((size_t) buffer[dest] + (size_t)(i * msgsize)),
                    msgsize, dest, &handle);
      }
      ARMCI_Wait(&handle);
      t_stop = MPI_Wtime();

      d_total = (double)(iterations * msgsize) / (1024 * 1024);
      t_total = t_stop - t_start;
      bandwidth = d_total / t_total;

      printf("%20zu %20.4lf \n", msgsize, bandwidth);
      fflush(stdout);

      ARMCI_Fence(dest);
    }
  }

  ARMCI_Barrier();
  ARMCI_UNSET_AGGREGATE_HANDLE(&handle);
  ARMCI_Free((void *) buffer[rank]);
  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv) {
  int i, j, rank, nranks, peer, bufsize, errors, total_errors;
  double **buf_bvec, **src_bvec, *src_buf;
  int count[2], src_stride, trg_stride, stride_level;
  double scaling, time;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  buf_bvec = (double **) malloc(sizeof(double *) * nranks);
  src_bvec = (double **) malloc(sizeof(double *) * nranks);
  bufsize = XDIM * YDIM * sizeof(double);

  ARMCI_Malloc((void **) buf_bvec, bufsize);
  ARMCI_Malloc((void **) src_bvec, bufsize);

  src_buf = src_bvec[rank];

  if (rank == 0)
    printf("ARMCI Strided DLA Accumulate Test:\n");

  ARMCI_Access_begin(buf_bvec[rank]);
  ARMCI_Access_begin(src_buf);
  for (i = 0; i < XDIM*YDIM; i++) {
    *(buf_bvec[rank] + i) = 1.0 + rank;
    *(src_buf + i) = 1.0 + rank;
  }
  ARMCI_Access_end(src_buf);
  ARMCI_Access_end(buf_bvec[rank]);

  scaling = 2.0;

  src_stride = XDIM * sizeof(double);
  trg_stride = XDIM * sizeof(double);
  stride_level = 1;

  count[1] = YDIM;
  count[0] = XDIM * sizeof(double);

  ARMCI_Barrier();
  time = MPI_Wtime();

  peer = (rank+1) % nranks;

  for (i = 0; i < ITERATIONS; i++) {
    ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling,
               src_buf, &src_stride,
               (void *) buf_bvec[peer], &trg_stride,
               count, stride_level, peer);
  }

  ARMCI_Barrier();
  time = MPI_Wtime() - time;

  if (rank == 0) printf("Time: %f sec\n", time);

  ARMCI_Access_begin(buf_bvec[rank]);
  for (i = errors = 0; i < XDIM; i++) {
    for (j = 0; j < YDIM; j++) {
      const double actual   = *(buf_bvec[rank] + i + j*XDIM);
      const double expected = (1.0 + rank) + scaling * (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
      /* two-sided tolerance: an accumulate that falls short must fail too */
      if (fabs(actual - expected) > 1e-10) {
        printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
               rank, j, i, expected, actual);
        errors++;
        fflush(stdout);
      }
    }
  }
  ARMCI_Access_end(buf_bvec[rank]);

  MPI_Allreduce(&errors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  ARMCI_Free((void *) buf_bvec[rank]);
  ARMCI_Free((void *) src_bvec[rank]);

  free(buf_bvec);
  free(src_bvec);

  ARMCI_Finalize();
  MPI_Finalize();

  if (total_errors == 0) {
    if (rank == 0) printf("Success.\n");
    return 0;
  }
  else {
    if (rank == 0) printf("Fail.\n");
    return 1;
  }
}
void TRANSPOSE1D() { int dims[1]; int nelem, i, ierr, min, max, cmin, cmax, lmin, lmax, pmin, pmax; int src_offset, dst_offset, length; int *buf, *map; void *src_ptr, *dst_ptr; void **a_ptr, **b_ptr; int *a, *b; /* Find local processor ID and number of processors */ int me, nprocs; me = armci_msg_me(); nprocs = armci_msg_nproc(); /* Allocate pointers to data on all processors */ a_ptr = (void**)malloc(nprocs*sizeof(int*)); b_ptr = (void**)malloc(nprocs*sizeof(int*)); map = (int*)malloc(nprocs*sizeof(int)); /* Configure array dimensions. Force an unequal data distribution */ dims[0] = nprocs*TOTALELEMS + nprocs/2; if (me == 0) printf("Size of array: %d\n\n",dims[0]); /* Find first (zero-based) index of chunk owned by each processor and store it in map array */ for (i=0; i<nprocs; i++) { map[i] = (int)(((double)i)*(((double)dims[0])/((double)nprocs))); } /* Figure out what size my portion of array is */ if (me<nprocs-1) { nelem = map[me+1]-map[me]; } else { nelem = dims[0]-map[me]; } /* Allocate memory for array A */ ierr = ARMCI_Malloc(a_ptr, nelem*sizeof(int)); assert(ierr == 0); assert(a_ptr[me]); /* Allocate memory for array B */ ierr = ARMCI_Malloc(b_ptr, nelem*sizeof(int)); assert(ierr == 0); assert(b_ptr[me]); /* initialize data in array A and zero data in array B */ a = (int*)a_ptr[me]; b = (int*)b_ptr[me]; for (i=0; i<nelem; i++) { a[i] = i + map[me] + 1; b[i] = 0; } /* Synchronize all processors to guarantee that everyone has data before proceeding to the next step. */ armci_msg_barrier(); /* Create local buffer for performing inversion */ buf = (int*)malloc(nelem*sizeof(int)); /* Copy inverted data into local buffer */ a = (int*)a_ptr[me]; for (i=0; i<nelem; i++) { buf[i] = a[nelem-i-1]; } /* Find out which blocks of array B inverted block should be copied to. Start by finding min and max indices of data in array B*/ min = dims[0] - (map[me] + nelem); max = dims[0] - map[me] - 1; /* Locate processors containing the endpoints */ pmin = 0; for (i=0; i<nprocs; i++) { if (min >= map[i]) { pmin = i; } else { break; } } pmax = nprocs-1; for (i=nprocs-2; i>=0; i--) { if (max < map[i+1]) { pmax = i; } else { break; } } /* Loop over processors that will receive data and copy inverted data to processors */ for (i=pmin; i<=pmax; i++) { /* Find min and max indices owned by processor i */ lmin = map[i]; if (i<nprocs-1) { lmax = map[i+1]-1; } else { lmax = dims[0]-1; } /* Find min and max indices that should be sent to processor i */ if (lmin > min) { cmin = lmin; } else { cmin = min; } if (lmax < max) { cmax = lmax; } else { cmax = max; } /* Find offsets on source and destination processors */ src_offset = cmin - min; src_ptr = (void*)(buf + src_offset); dst_offset = cmin - lmin; dst_ptr = ((char*)b_ptr[i]) + sizeof(int)*dst_offset; /* Find length of data (in bytes) to be sent to processor i */ length = sizeof(int)*(cmax-cmin+1); /* Send data to processor */ ARMCI_Put(src_ptr, dst_ptr, length, i); } ARMCI_AllFence(); armci_msg_barrier(); free(buf); VERIFY(b_ptr, dims, map); free(map); armci_msg_barrier(); ARMCI_Free(a_ptr[me]); ARMCI_Free(b_ptr[me]); free(a_ptr); free(b_ptr); }
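/* VERIFY() is supplied elsewhere in the example. Given the initialization
   a[i] = i + map[me] + 1 above, the inverted array must satisfy
   b[i] == dims[0] - (map[me] + i) on every processor. A sketch under that
   assumption: */
void VERIFY(void **b_ptr, int *dims, int *map) {
  int me = armci_msg_me();
  int nprocs = armci_msg_nproc();
  int nelem = (me < nprocs-1) ? map[me+1] - map[me] : dims[0] - map[me];
  int *b = (int*)b_ptr[me];
  int i, ok = 1;
  for (i = 0; i < nelem; i++) {
    if (b[i] != dims[0] - (map[me] + i)) {
      printf("p%d: mismatch at %d: got %d expected %d\n",
             me, i, b[i], dims[0] - (map[me] + i));
      ok = 0;
    }
  }
  if (ok) printf("p%d: transpose verified\n", me);
}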
int main(int argc, char *argv[]) {
  int i, j;
  int ch;
  extern char *optarg;
  int edge;
  int size;

  /* ARMCI */
  void **ptr;
  double **ptr_loc;
  void **bufr_g, **bufc_g;

  MP_INIT(argc, argv);
  MP_PROCS(&nproc);
  MP_MYID(&me);

  while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
    switch (ch) {
      case 'n': n = atoi(optarg); break;
      case 'b': block_size = atoi(optarg); break;
      case 'p': nproc = atoi(optarg); break;
      case 'h': {
        printf("Usage: LU, or \n");
        printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
      }
    }
  }

  if (me == 0) {
    printf("\nUsing pre-PUTing\n");
    printf("\n Blocked Dense LU Factorization\n");
    printf("     %d by %d Matrix\n", n, n);
    printf("     %d Processors\n", nproc);
    printf("     %d by %d Element Blocks\n", block_size, block_size);
    printf("\n");
  }

  num_rows = (int) sqrt((double) nproc);
  for (;;) {
    num_cols = nproc/num_rows;
    if (num_rows*num_cols == nproc)
      break;
    num_rows--;
  }

  nblocks = n/block_size;
  if (block_size * nblocks != n) {
    nblocks++;
  }

  edge = n%block_size;
  if (edge == 0) {
    edge = block_size;
  }

#ifdef DEBUG
  if (me == 0)
    for (i = 0; i < nblocks; i++) {
      for (j = 0; j < nblocks; j++)
        printf("%d ", block_owner(i, j));
      printf("\n");
    }
  MP_BARRIER();
  MP_FINALIZE();
  exit(0);
#endif

  for (i = 0; i < nblocks; i++) {
    for (j = 0; j < nblocks; j++) {
      if (block_owner(i,j) == me) {
        if ((i == nblocks-1) && (j == nblocks-1)) {
          size = edge*edge;
        }
        else if ((i == nblocks-1) || (j == nblocks-1)) {
          size = edge*block_size;
        }
        else {
          size = block_size*block_size;
        }
        proc_bytes += size*sizeof(double);
      }
    }
  }

  /* initialize ARMCI */
  ARMCI_Init();
  ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(ptr, proc_bytes);

  a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *));
  if (a == NULL) {
    fprintf(stderr, "Could not malloc memory for a\n");
    exit(-1);
  }

  ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *));
  for (i = 0; i < nproc; i++)
    ptr_loc[i] = (double *)ptr[i];
  for (i = 0; i < nblocks; i++) {
    for (j = 0; j < nblocks; j++) {
      a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
      if ((i == nblocks-1) && (j == nblocks-1)) {
        size = edge*edge;
      }
      else if ((i == nblocks-1) || (j == nblocks-1)) {
        size = edge*block_size;
      }
      else {
        size = block_size*block_size;
      }
      ptr_loc[block_owner(i, j)] += size;
    }
  }

  /* initialize the array */
  init_array();

  bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));
  bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *));
  if (bufr == NULL || bufc == NULL)
    printf("Could not ARMCI_Malloc_local() mem\n");

  /* bufr points to all k-th row blocks */
  /* save all block address in row-major order */
  proc_bytes = nblocks*block_size*block_size * sizeof(double);
  bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufr_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufr[i*nblocks] = (double *) bufr_g[i];
    for (j = 1; j < nblocks; j++) {
      bufr[i*nblocks + j] = bufr[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* bufc points to all k-th column blocks */
  bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
  ARMCI_Malloc(bufc_g, proc_bytes);

  for (i = 0; i < nproc; i++) {
    bufc[i*nblocks] = (double *) bufc_g[i];
    for (j = 1; j < nblocks; j++) {
      bufc[i*nblocks + j] = bufc[i*nblocks + j-1] + block_size * block_size;
    }
  }

  /* barrier to ensure all initialization is done */
  MP_BARRIER();

  /* to remove cold-start misses, all processors touch their own data */
  touch_array(block_size, me);
  MP_BARRIER();

  if (doprint) {
    if (me == 0) {
      printf("Matrix before LU decomposition\n");
      print_array(me);
    }
    MP_BARRIER();
  }

  /* Starting the timer */
  if (me == 0) start_timer();
  lu(n, block_size, me);
  MP_BARRIER();

  /* Timer Stops here */
  if (me == 0)
    printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time());

  if (doprint) {
    if (me == 0) {
      printf("after LU\n");
      print_array(me);
    }
    MP_BARRIER();
  }

  /* done */
  ARMCI_Free(ptr[me]);
  ARMCI_Free(bufc_g[me]);
  ARMCI_Free(bufr_g[me]);
  ARMCI_Finalize();
  MP_FINALIZE();
  return 0;
}
static void contig_test(size_t buffer_size, int op) { void **dst_ptr; void **put_buf; void **get_buf; double *times; dst_ptr = (void*)malloc(nproc * sizeof(void*)); put_buf = (void*)malloc(nproc * sizeof(void*)); get_buf = (void*)malloc(nproc * sizeof(void*)); times = (double*)malloc(nproc * sizeof(double)); ARMCI_Malloc(dst_ptr, buffer_size); ARMCI_Malloc(put_buf, buffer_size); ARMCI_Malloc(get_buf, buffer_size); /* initialize what we're putting */ fill_array((double*)put_buf[me], buffer_size/sizeof(double), me); size_t msg_size; int dst = 1; double scale = 1.0; for (msg_size = 16; msg_size <= buffer_size; msg_size *= 2) { int j; int iter = msg_size > MEDIUM_MESSAGE_SIZE ? ITER_LARGE : ITER_SMALL; double t_start, t_end; if (0 == me) { for (j= 0; j < iter + WARMUP; ++j) { if (WARMUP == j) { t_start = dclock(); } switch (op) { case PUT: ARMCI_Put(put_buf[me], dst_ptr[dst], msg_size, dst); break; case GET: ARMCI_Get(dst_ptr[dst], get_buf[me], msg_size, dst); break; case ACC: ARMCI_Acc(ARMCI_ACC_DBL, &scale, put_buf[me], dst_ptr[dst], msg_size, dst); break; default: ARMCI_Error("oops", 1); } } } /* calculate total time and average time */ t_end = dclock(); ARMCI_Barrier(); if (0 == me) { printf("%8zu\t\t%6.2f\t\t%10.2f\n", msg_size, ((t_end - t_start))/iter, msg_size*iter/((t_end - t_start))); } } ARMCI_Free(dst_ptr[me]); ARMCI_Free(put_buf[me]); ARMCI_Free(get_buf[me]); free(dst_ptr); free(put_buf); free(get_buf); free(times); }
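/* dclock() is the suite's wall-clock timer. Judging from its use above it
   returns seconds as a double; a portable sketch along those lines (an
   assumption, not the suite's actual implementation): */
#include <sys/time.h>
double dclock(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (double) tv.tv_sec + 1e-6 * (double) tv.tv_usec;
}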
int main(int argc, char **argv) { int i, rank, nranks, msgsize, dest; long bufsize; double **buffer; double t_start, t_stop, t_latency; int provided; ARMCI_Init_args(&argc, &argv); rank = A1_Process_id(A1_GROUP_WORLD); nranks = A1_Process_total(A1_GROUP_WORLD); bufsize = MAX_MSG_SIZE * (ITERATIONS + SKIP); buffer = (double **) malloc(sizeof(double *) * nranks); ARMCI_Malloc((void **) buffer, bufsize); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } A1_Barrier_group(A1_GROUP_WORLD); if (rank == 0) { printf("ARMCI_Get Latency in usec \n"); printf("%20s %22s \n", "Message Size", "Latency"); fflush(stdout); dest = 1; for (msgsize = sizeof(double); msgsize <= MAX_MSG_SIZE; msgsize *= 2) { for (i = 0; i < ITERATIONS + SKIP; i++) { if (i == SKIP) t_start = A1_Time_seconds(); ARMCI_Get((void *) ((size_t) buffer[dest] + (size_t)(i * msgsize)), (void *) ((size_t) buffer[rank] + (size_t)(i * msgsize)), msgsize, 1); } t_stop = A1_Time_seconds(); printf("%20d %20.2f \n", msgsize, ((t_stop - t_start) * 1000000) / ITERATIONS); fflush(stdout); for (i = 0; i < ((ITERATIONS + SKIP) * msgsize) / sizeof(double); i++) { if (*(buffer[rank] + i) != (1.0 + dest)) { printf("Data validation failed At displacement : %d Expected : %f Actual : %f \n", i, (1.0 + dest), *(buffer[rank] + i)); fflush(stdout); return -1; } } for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } } } A1_Barrier_group(A1_GROUP_WORLD); ARMCI_Free(buffer[rank]); ARMCI_Finalize(); return 0; }
int main(int argc, char *argv[]) {
  int rank, nranks;
  size_t i, msgsize, dest;
  size_t iterations, max_msgsize;
  size_t bufsize;
  double **buffer;
  double t_start, t_stop, t_total, d_total;
  double expected, bandwidth;
  int provided;
  armci_hdl_t handle;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  max_msgsize = MAX_MSGSIZE;

  ARMCI_Init_args(&argc, &argv);

  bufsize = max_msgsize * ITERATIONS;
  buffer = (double **) malloc(sizeof(double *) * nranks);
  ARMCI_Malloc((void **) buffer, bufsize);

  for (i = 0; i < bufsize / sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
  }

  ARMCI_INIT_HANDLE(&handle);
  ARMCI_SET_AGGREGATE_HANDLE(&handle);

  ARMCI_Barrier();

  if (rank == 0) {
    printf("ARMCI_Get Bandwidth in MBPS \n");
    printf("%20s %22s \n", "Message Size", "Bandwidth");
    fflush(stdout);

    dest = 1;
    expected = 1 + dest;

    for (msgsize = sizeof(double); msgsize <= max_msgsize; msgsize *= 2) {
      iterations = bufsize/msgsize;

      t_start = MPI_Wtime();
      for (i = 0; i < iterations; i++) {
        ARMCI_NbGet((void *) ((size_t) buffer[dest] + (size_t)(i * msgsize)),
                    (void *) ((size_t) buffer[rank] + (size_t)(i * msgsize)),
                    msgsize, dest, &handle);
      }
      ARMCI_Wait(&handle);
      t_stop = MPI_Wtime();

      d_total = (double)(iterations * msgsize) / (1024 * 1024);
      t_total = t_stop - t_start;
      bandwidth = d_total / t_total;

      printf("%20zu %20.4lf \n", msgsize, bandwidth);
      fflush(stdout);

#ifdef DATA_VALIDATION
      {
        size_t j;
        for (j = 0; j < ((iterations*msgsize)/sizeof(double)); j++) {
          if (*(buffer[rank] + j) != expected) {
            printf("Data validation failed At displacement : %zu Expected : %lf Actual : %lf \n",
                   j, expected, *(buffer[rank] + j));
            fflush(stdout);
            return -1;
          }
        }
        for (j = 0; j < bufsize/sizeof(double); j++) {
          *(buffer[rank] + j) = 1.0 + rank;
        }
      }
#endif
    }
  }

  ARMCI_Barrier();
  ARMCI_UNSET_AGGREGATE_HANDLE(&handle);
  ARMCI_Free((void *) buffer[rank]);
  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv) {
  int i, j, rank, nranks, peer, bufsize, errors;
  double **buffer, *src_buf;
  int count[2], src_stride, trg_stride, stride_level;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  buffer = (double **) malloc(sizeof(double *) * nranks);
  bufsize = XDIM * YDIM * sizeof(double);

  ARMCI_Malloc((void **) buffer, bufsize);
  src_buf = ARMCI_Malloc_local(bufsize);

  if (rank == 0)
    printf("ARMCI Strided Put Test:\n");

  src_stride = XDIM * sizeof(double);
  trg_stride = XDIM * sizeof(double);
  stride_level = 1;

  count[1] = YDIM;
  count[0] = XDIM * sizeof(double);

  ARMCI_Barrier();

  peer = (rank+1) % nranks;

  for (i = 0; i < ITERATIONS; i++) {
    for (j = 0; j < XDIM*YDIM; j++) {
      *(src_buf + j) = rank + i;
    }
    ARMCI_PutS(src_buf, &src_stride,
               (void *) buffer[peer], &trg_stride,
               count, stride_level, peer);
  }

  ARMCI_Barrier();

  ARMCI_Access_begin(buffer[rank]);
  for (i = errors = 0; i < XDIM; i++) {
    for (j = 0; j < YDIM; j++) {
      const double actual   = *(buffer[rank] + i + j*XDIM);
      /* each Put overwrites the previous one, so the final contents are the
         left neighbor's rank plus the last iteration index */
      const double expected = ((rank+nranks-1)%nranks) + (ITERATIONS - 1);
      if (fabs(actual - expected) > 1e-10) {
        printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
               rank, j, i, expected, actual);
        errors++;
        fflush(stdout);
      }
    }
  }
  ARMCI_Access_end(buffer[rank]);

  ARMCI_Free((void *) buffer[rank]);
  ARMCI_Free_local(src_buf);

  free(buffer);

  ARMCI_Finalize();
  MPI_Finalize();

  if (errors == 0) {
    printf("%d: Success\n", rank);
    return 0;
  }
  else {
    printf("%d: Fail\n", rank);
    return 1;
  }
}
int main(int argc, char *argv[]) {
  int i, j, rank, nranks;
  int xdim, ydim;
  long bufsize;
  double **buffer;
  double t_start = 0.0, t_stop = 0.0;
  int count[2], src_stride, trg_stride, stride_level, peer;
  double expected, actual;
  int provided;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  if (nranks < 2) {
    printf("%s: Must be run with at least 2 processes\n", argv[0]);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  ARMCI_Init_args(&argc, &argv);

  bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
  buffer = (double **) malloc(sizeof(double *) * nranks);
  ARMCI_Malloc((void **) buffer, bufsize);

  for (i = 0; i < bufsize/sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
  }

  if (rank == 0) {
    printf("ARMCI_PutS Latency - local and remote completions - in usec \n");
    printf("%30s %22s %22s\n", "Dimensions(array of doubles)", "Latency-LocalCompletion", "Latency-RemoteCompletion");
    fflush(stdout);
  }

  src_stride = MAX_YDIM*sizeof(double);
  trg_stride = MAX_YDIM*sizeof(double);
  stride_level = 1;

  ARMCI_Barrier();

  for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) {
    count[1] = xdim;
    for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) {
      count[0] = ydim*sizeof(double);

      if (rank == 0) {
        peer = 1;

        /* local completion */
        for (i = 0; i < ITERATIONS+SKIP; i++) {
          if (i == SKIP) t_start = MPI_Wtime();
          ARMCI_PutS((void *) buffer[rank], &src_stride,
                     (void *) buffer[peer], &trg_stride,
                     count, stride_level, peer);
        }
        t_stop = MPI_Wtime();
        ARMCI_Fence(peer);

        char temp[10];
        sprintf(temp, "%dX%d", xdim, ydim);
        printf("%30s %20.2f", temp, ((t_stop-t_start)*1000000)/ITERATIONS);
        fflush(stdout);

        ARMCI_Barrier();
        ARMCI_Barrier();

        /* remote completion */
        for (i = 0; i < ITERATIONS+SKIP; i++) {
          if (i == SKIP) t_start = MPI_Wtime();
          ARMCI_PutS((void *) buffer[rank], &src_stride,
                     (void *) buffer[peer], &trg_stride,
                     count, stride_level, peer);
          ARMCI_Fence(peer);
        }
        t_stop = MPI_Wtime();
        printf("%20.2f \n", ((t_stop-t_start)*1000000)/ITERATIONS);
        fflush(stdout);

        ARMCI_Barrier();
        ARMCI_Barrier();
      }
      else {
        peer = 0;
        expected = (1.0 + (double) peer);

        ARMCI_Barrier();
        if (rank == 1) {
          for (i = 0; i < xdim; i++) {
            for (j = 0; j < ydim; j++) {
              actual = *(buffer[rank] + i*MAX_YDIM + j);
              if (actual != expected) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j, expected, actual);
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
              }
            }
          }
        }
        for (i = 0; i < bufsize/sizeof(double); i++) {
          *(buffer[rank] + i) = 1.0 + rank;
        }
        ARMCI_Barrier();
        ARMCI_Barrier();

        if (rank == 1) {
          for (i = 0; i < xdim; i++) {
            for (j = 0; j < ydim; j++) {
              actual = *(buffer[rank] + i*MAX_YDIM + j);
              if (actual != expected) {
                printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                       i, j, expected, actual);
                fflush(stdout);
                ARMCI_Error("Bailing out", 1);
              }
            }
          }
          for (i = 0; i < bufsize/sizeof(double); i++) {
            *(buffer[rank] + i) = 1.0 + rank;
          }
        }
        ARMCI_Barrier();
      }
    }
  }

  ARMCI_Barrier();

  ARMCI_Free((void *) buffer[rank]);
  free(buffer);

  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
void destroy_array(void *ptr[]) {
  int rc;
  MP_BARRIER();
  /* keep the free outside assert() so it still runs under NDEBUG */
  rc = ARMCI_Free(ptr[me]);
  assert(rc == 0);
}
int main(int argc, char *argv[]) {
  int i, rank, nranks, dest;
  int dim, iterations;
  long bufsize;
  double **buffer;
  double t_start, t_stop, t_total, d_total, bw;
  int count[2], src_stride, trg_stride, stride_level;
  int provided;
  armci_hdl_t handle;

  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  ARMCI_Init_args(&argc, &argv);

  bufsize = MAX_DIM * MAX_DIM * sizeof(double);
  buffer = (double **) malloc(sizeof(double *) * nranks);
  ARMCI_Malloc((void **) buffer, bufsize);

  for (i = 0; i < bufsize / sizeof(double); i++) {
    *(buffer[rank] + i) = 1.0 + rank;
  }

  ARMCI_INIT_HANDLE(&handle);
  ARMCI_SET_AGGREGATE_HANDLE(&handle);

  ARMCI_Barrier();

  if (rank == 0) {
    printf("ARMCI_PutS Bandwidth in MBPS \n");
    printf("%30s %22s \n", "Dimensions(array of doubles)", "Bandwidth");
    fflush(stdout);

    dest = 1;
    src_stride = MAX_DIM * sizeof(double);
    trg_stride = MAX_DIM * sizeof(double);
    stride_level = 1;

    for (dim = 1; dim <= MAX_DIM; dim *= 2) {
      count[0] = dim*sizeof(double);
      count[1] = dim;

      iterations = 10*(MAX_DIM * MAX_DIM)/(dim * dim);

      t_start = MPI_Wtime();
      for (i = 0; i < iterations; i++) {
        ARMCI_NbPutS((void *) buffer[rank], &src_stride,
                     (void *) buffer[dest], &trg_stride,
                     count, stride_level, dest, &handle);
      }
      ARMCI_Wait(&handle);
      t_stop = MPI_Wtime();
      ARMCI_Fence(dest);

      char temp[10];
      sprintf(temp, "%dX%d", dim, dim);
      t_total = t_stop - t_start;
      d_total = (double)(dim*dim*sizeof(double)*iterations)/(1024*1024);
      bw = d_total/t_total;
      printf("%30s %20.2f \n", temp, bw);
      fflush(stdout);
    }
  }

  ARMCI_Barrier();
  ARMCI_UNSET_AGGREGATE_HANDLE(&handle);
  ARMCI_Free((void *) buffer[rank]);
  ARMCI_Finalize();
  MPI_Finalize();
  return 0;
}
int main(int argc, char *argv[]) {
  int i, j;
  int ch;
  extern char *optarg;
  int edge;
  int size;
  int nloop = 5;

  /* ARMCI */
  void **ptr;
  double **ptr_loc;

  MP_INIT(argc, argv);
  MP_PROCS(&nproc);
  MP_MYID(&me);

  while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
    switch (ch) {
      case 'n': n = atoi(optarg); break;
      case 'b': block_size = atoi(optarg); break;
      case 'p': nproc = atoi(optarg); break;
      case 'h': {
        printf("Usage: LU, or \n");
        printf("       LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
      }
    }
  }

  if (me == 0) {
    printf("\n Blocked Dense LU Factorization\n");
    printf("     %d by %d Matrix\n", n, n);
    printf("     %d Processors\n", nproc);
    printf("     %d by %d Element Blocks\n", block_size, block_size);
    printf("\n");
  }

  num_rows = (int) sqrt((double) nproc);
  for (;;) {
    num_cols = nproc/num_rows;
    if (num_rows*num_cols == nproc)
      break;
    num_rows--;
  }

  nblocks = n/block_size;
  if (block_size * nblocks != n) {
    nblocks++;
  }

  edge = n%block_size;
  if (edge == 0) {
    edge = block_size;
  }

#ifdef DEBUG
  if (me == 0)
    for (i = 0; i < nblocks; i++) {
      for (j = 0; j < nblocks; j++)
        printf("%d ", block_owner(i, j));
      printf("\n");
    }
  MP_BARRIER();
  MP_FINALIZE();
  exit(0);
#endif

  for (i = 0; i < nblocks; i++) {
    for (j = 0; j < nblocks; j++) {
      if (block_owner(i,j) == me) {
        if ((i == nblocks-1) && (j == nblocks-1)) {
          size = edge*edge;
        }
        else if ((i == nblocks-1) || (j == nblocks-1)) {
          size = edge*block_size;
        }
        else {
          size = block_size*block_size;
        }
        proc_bytes += size*sizeof(double);
      }
    }
  }

  ptr = (void **)malloc(nproc * sizeof(void *));
#ifdef MPI2_ONESIDED
  MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
  MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                 MPI_COMM_WORLD, &win);
  for (i = 0; i < nproc; i++)
    ptr[i] = (double *)ptr[me];
  MPI_Barrier(MPI_COMM_WORLD);
#else
  /* initialize ARMCI */
  ARMCI_Init();
  ARMCI_Malloc(ptr, proc_bytes);
#endif

  a = (double **)malloc(nblocks*nblocks*sizeof(double *));
  if (a == NULL) {
    fprintf(stderr, "Could not malloc memory for a\n");
    exit(-1);
  }
  ptr_loc = (double **)malloc(nproc*sizeof(double *));
  for (i = 0; i < nproc; i++)
    ptr_loc[i] = (double *)ptr[i];
  for (i = 0; i < nblocks; i++) {
    for (j = 0; j < nblocks; j++) {
      a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
      if ((i == nblocks-1) && (j == nblocks-1)) {
        size = edge*edge;
      }
      else if ((i == nblocks-1) || (j == nblocks-1)) {
        size = edge*block_size;
      }
      else {
        size = block_size*block_size;
      }
      ptr_loc[block_owner(i, j)] += size;
    }
  }

  /* initialize the array */
  init_array();

  /* barrier to ensure all initialization is done */
  MP_BARRIER();

  /* to remove cold-start misses, all processors touch their own data */
  touch_array(block_size, me);
  MP_BARRIER();

  if (doprint) {
    if (me == 0) {
      printf("Matrix before LU decomposition\n");
      print_array(me);
    }
    MP_BARRIER();
  }

  lu(n, block_size, me); /* cold start */

  /* Starting the timer */
  MP_BARRIER();
  if (me == 0) start_timer();

  for (i = 0; i < nloop; i++)
    lu(n, block_size, me);

  MP_BARRIER();

  /* Timer Stops here */
  if (me == 0)
    printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()/nloop);
  printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n",
         me, get_cntr, comm_time*1000/nloop);

  if (doprint) {
    if (me == 0) {
      printf("after LU\n");
      print_array(me);
    }
    MP_BARRIER();
  }

  /* done */
#ifdef MPI2_ONESIDED
  MPI_Win_free(&win);
  MPI_Free_mem(ptr[me]);
#else
  ARMCI_Free(ptr[me]);
  ARMCI_Finalize();
#endif
  MP_FINALIZE();
  return 0;
}
main(int argc, char *argv[]) { int i, j, l; int ch; extern char *optarg; int edge; int size; int lu_arg[MAX_THREADS][3]; /* ARMCI */ void **ptr; double **ptr_loc; THREAD_LOCK_INIT(mutex); armci_msg_init(&argc,&argv); nproc = armci_msg_nproc(); me = armci_msg_me(); while ((ch = getopt(argc, argv, "n:b:p:t:d:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 't': th_per_p = atoi(optarg); break; case 'd': d = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC -tTH_PER_P\n"); armci_msg_barrier(); armci_msg_finalize(); exit(0); } } } if(th_per_p>MAX_THREADS) { th_per_p=MAX_THREADS; if(me==0)printf("Warning: cannot run more than %d threads, adjust MAX_THREADS",MAX_THREADS); } if (d) { fprintf(stderr, "%d: %d\n", me, getpid()); sleep(d); } nthreads = th_per_p * nproc; if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d thread(s) per processor, %d threads total\n", th_per_p, nthreads); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nthreads); for (;;) { num_cols = nthreads/num_rows; if (num_rows*num_cols == nthreads) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } num = (nblocks * nblocks)/nthreads; if((num * nthreads) != (nblocks * nblocks)) num++; edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } armci_msg_barrier(); /* armci_msg_finalize(); */ /* exit(0); */ #endif for (l = 0; l < th_per_p; l++) { me_th[l] = me * th_per_p + l; for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me_th[l]) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } thread_doubles[l] += size; } } } proc_bytes += thread_doubles[l] * sizeof(double); } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nthreads*sizeof(double *)); for (i = 0; i < nproc; i++) { ptr_loc[i * th_per_p] = (double *)ptr[i]; for (j = 1; j < th_per_p; j++) ptr_loc[i * th_per_p + j] = ptr_loc[i * th_per_p + j - 1] + thread_doubles[j - 1]; } for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } #if 0 for(i=0; i<nblocks*nblocks;i ++) printf("%d: a[%d]=%p\n", me, i, a[i]); fflush(stdout); #endif /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ armci_msg_barrier(); /* to remove cold-start misses, all processors touch their own data */ /* for (l = 0; l < th_per_p; l++) touch_array(block_size, me_th[l]); */ armci_msg_barrier(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } armci_msg_barrier(); } #if 1 for (i = 0; i < nblocks; i++) for (j = 0; j < nblocks; j++) 
print_block_dbg(a[i + j * nblocks], "proc %d, a[%d, %d]:\n", me, i, j); #endif TH_INIT(nproc,th_per_p); /* Starting the timer */ if(me == 0) start_timer(); for (l = 0; l < th_per_p; l++) { lu_arg[l][0] = n; lu_arg[l][1] = block_size; lu_arg[l][2] = l; THREAD_CREATE(threads + l, lu, lu_arg[l]); } for (l = 0; l < th_per_p; l++) THREAD_JOIN(threads[l], NULL); armci_msg_barrier(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } armci_msg_barrier(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); armci_msg_finalize(); THREAD_LOCK_DESTROY(mutex); }
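/* THREAD_CREATE/THREAD_JOIN and THREAD_LOCK_INIT/THREAD_LOCK_DESTROY above are
   portability macros supplied elsewhere in the suite. A plausible POSIX
   mapping is sketched below purely for reference -- the suite's real
   definitions (and the thread handle/mutex types) may differ. */
#include <pthread.h>
typedef pthread_t thread_t;
#define THREAD_CREATE(th, func, arg)  pthread_create((th), NULL, (void *(*)(void *))(func), (arg))
#define THREAD_JOIN(th, ret)          pthread_join((th), (ret))
#define THREAD_LOCK_INIT(m)           pthread_mutex_init(&(m), NULL)
#define THREAD_LOCK_DESTROY(m)        pthread_mutex_destroy(&(m))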