int main(int argc, char ** argv) { int rank, nproc, val, i; void **base_ptrs; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); if (rank == 0) printf("Starting ARMCI mutex read-modify-write test with %d processes\n", nproc); base_ptrs = malloc(nproc*sizeof(void*)); ARMCI_Create_mutexes(rank == 0 ? 1 : 0); ARMCI_Malloc(base_ptrs, (rank == 0) ? sizeof(int) : 0); // Proc 0 has a shared int if (rank == 0) { val = 0; ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0); } ARMCI_Barrier(); for (i = 0; i < NITER; i++) { ARMCI_Lock(0, 0); ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0); val += ADDIN; ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0); ARMCI_Unlock(0, 0); } printf(" + %3d done\n", rank); fflush(NULL); ARMCI_Barrier(); if (rank == 0) { ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0); if (val == ADDIN*nproc*NITER) printf("Test complete: PASS.\n"); else printf("Test complete: FAIL. Got %d, expected %d.\n", val, ADDIN*nproc*NITER); } ARMCI_Free(base_ptrs[rank]); ARMCI_Destroy_mutexes(); free(base_ptrs); ARMCI_Finalize(); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int k,i; double **myptrs[10]; double t0,t1,tget=0,tnbget=0,tput=0,tnbput=0,tnbwait=0,t2=0; #if PORTALS ARMCI_NetInit(); #endif MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD,&me); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); ARMCI_Init(); ARMCI_Init(); for(k=0;k<10;k++){ myptrs[k] = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs[k],400000*LOOP*sizeof(double)); for(i=0;i<LOOP;i++)myptrs[k][me][i]=me+0.414; MPI_Barrier(MPI_COMM_WORLD); for(i=0;i<LOOP;i++){ ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs); /*if(myptrs[k][me][i]!=0.414+(me+1)%nprocs)ARMCI_Error("errr",myptrs[k][me][i]);*/ } t0=t1=tget=tnbget=tput=tnbput=tnbwait=t2=0; t0 = MPI_Wtime(); for(i=0;i<LOOP;i++){ ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs); } t1 = MPI_Wtime(); printf("\nGet Latency=%lf\n",1e6*(t1-t0)/LOOP);fflush(stdout); t1=t0=0; for(i=0;i<LOOP;i++){ armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MPI_Wtime(); ARMCI_NbGet(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs,&nbh); t1 = MPI_Wtime(); ARMCI_Wait(&nbh); t2 = MPI_Wtime(); tnbget+=(t1-t0); tnbwait+=(t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); MPI_Barrier(MPI_COMM_WORLD); } for(k=0;k<10;k++)ARMCI_Free(myptrs[k][me]); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Finalize(); ARMCI_Finalize(); MPI_Finalize(); }
void test_pairs(int th_idx) { int rem_th, rem_proc; int i, j; void *src, *dst; rem_th = pairs[TH_ME]; rem_proc = TH2PROC(rem_th); prndbg(th_idx, "test_pair: %d<->%d(%d)\n", TH_ME, rem_th, rem_proc); MT_BARRIER(); #if 0 print_array(th_idx, "before", &AELEM(ptrs2[TH_ME],rem_th,0,0), ASIZExITERS); #endif for (i = 0; i < iters; i++) { /* src - addr of my thread block on remote proc/thread */ src = &AELEM(ptrs1[rem_th],TH_ME,i,0); /* src - addr of remote thread block on my proc/thread */ dst = &AELEM(ptrs2[TH_ME],rem_th,i,0); /* get from my pair */ assert(!ARMCI_Get(src, dst, ASIZE_BYTES, rem_proc)); } MT_BARRIER(); #if 0 print_array(th_idx, "rcvd", &AELEM(ptrs2[TH_ME],rem_th,0,0), ASIZExITERS); #endif /* check results */ check_result(&AELEM(ptrs2[TH_ME],rem_th,0,0), rem_th); }
void get_remote(double *buf, int I, int J) { int proc_owner; int edge, size; proc_owner = block_owner(I, J); edge = n%block_size; if (edge == 0) { edge = block_size; } if ((I == nblocks-1) && (J == nblocks-1)) { size = edge*edge; } else if ((I == nblocks-1) || (J == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } size = size * sizeof(double); ARMCI_Get(a[I+J*nblocks], buf, size, proc_owner); }
void get_remote(double *buf, int I, int J) { int proc_owner; int edge, size; #ifdef USE_MUTEX THREAD_LOCK(mutex); #endif proc_owner = block_owner(I, J) / th_per_p; edge = n%block_size; if (edge == 0) { edge = block_size; } if ((I == nblocks-1) && (J == nblocks-1)) { size = edge*edge; } else if ((I == nblocks-1) || (J == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } size = size * sizeof(double); if (proc_owner == me) memcpy(buf, a[I+J*nblocks], size); else ARMCI_Get(a[I+J*nblocks], buf, size, proc_owner); #ifdef USE_MUTEX THREAD_UNLOCK(mutex); #endif }
static void gather_solution_vector(double **svec) { #if 0 double y[COL]; if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start], bytes, i))) ARMCI_Error("armci_nbget failed\n",rc); #endif }
int main(int argc, char ** argv) { MPI_Init(&argc, &argv); ARMCI_Init(); ARMCI_Get(NULL, NULL, 1, 0); ARMCI_Finalize(); MPI_Finalize(); return 0; }
void VERIFY(void **b_ptr, int *dims, int *map) { int i, j, length, icnt, ichk, lmin, lmax; int *buf, *b; void *src_ptr, *dst_ptr; int me, nprocs; /* Find local processor ID and number of processors */ me = armci_msg_me(); nprocs = armci_msg_nproc(); /* Process 0 verifies that inversion is correct. Start by allocating buffer and guarantee that it is big enough */ length = (int)(((double)dims[0])/((double)nprocs)) + 1; buf = (int*)malloc(length*sizeof(int)); if (me == 0) { icnt = 0; ichk = 0; for (i=0; i<nprocs; i++) { /* Find min and max indices owned by processor i */ lmin = map[i]; if (i<nprocs-1) { lmax = map[i+1]-1; } else { lmax = dims[0]-1; } /* evaluate parameters for get call */ length = sizeof(int)*(lmax-lmin+1); src_ptr = b_ptr[i]; dst_ptr = (void*)buf; ARMCI_Get(src_ptr, dst_ptr, length, i); /* check values in buffer */ length = lmax-lmin+1; b = (int*)dst_ptr; for (j=0; j<length; j++) { /* printf("p[%d] b[%d]: %d\n",me,icnt,b[j]); */ if (b[j] != dims[0] - icnt) { printf("Error found for element %d b: %d != a: %d\n", icnt,b[j],dims[0]-icnt); ichk = 1; } icnt++; } } if (ichk == 0) { printf("1D transpose successful. No errors found\n"); } else { printf("1D transpose failed\n"); } } free(buf); }
static void get_data(int n, int start, int end, double *vec_local, double **vec) { int i, j, rc, bytes, offset; int proc_start, proc_end, idx_start, idx_end; proc_start = proc_end = -1; for(i=0; i<nproc; i++) { if(proc_start<0 && proc_row_list[i]>start) proc_start = i; if(proc_end<0 && proc_row_list[i]>end) proc_end = i; } if(proc_start<0 || proc_end<0) ARMCI_Error("Invalid Process Ids", -1); for(i=proc_start; i<=proc_end; i++) { if(i==proc_start) idx_start = start; else { if(i==0) idx_start=0; else idx_start = proc_row_list[i-1]; } if(i==proc_end) idx_end = end; else idx_end = proc_row_list[i]-1; if(i!=prev_proc) { ++count; prev_proc = i; ARMCI_INIT_HANDLE(&gHandle[count]); ARMCI_SET_AGGREGATE_HANDLE(&gHandle[count]); } if(i==0) offset=0; else offset = proc_row_list[i-1]; if(i==me) { /* local */ for(j=idx_start; j<=idx_end; j++) vec_local[j] = vec[me][j-offset]; } else { /* remote */ bytes = (idx_end-idx_start+1)*sizeof(double); vec_local[idx_start] = -1; #if 0 if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start], bytes, i))) #else if((rc=ARMCI_NbGet(&vec[i][idx_start-offset], &vec_local[idx_start], bytes, i, &gHandle[count]))) #endif ARMCI_Error("armci_nbget failed\n",rc); } } }
void get_remote(double *buf, int I, int J) { int proc_owner; int edge, size; double t1; proc_owner = block_owner(I, J); edge = n%block_size; if (edge == 0) { edge = block_size; } if ((I == nblocks-1) && (J == nblocks-1)) { size = edge*edge; } else if ((I == nblocks-1) || (J == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } size = size * sizeof(double); t1 = MPI_Wtime(); #ifdef MPI2_ONESIDED { int target_disp = ( ((char*)(a[I+J*nblocks])) - ((char*)(ptr[proc_owner])) ); if(target_disp<0) { printf("ERROR!: target disp is < 0, target_disp= %d\n", target_disp); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc_owner, 0, win); MPI_Get(buf, size, MPI_CHAR, proc_owner, target_disp, size, MPI_CHAR, win); MPI_Win_unlock(proc_owner, win); } #else ARMCI_Get(a[I+J*nblocks], buf, size, proc_owner); #endif comm_time += MPI_Wtime() - t1; get_cntr++; }
int main(int argc, char ** argv) { int rank, nproc, i, test_iter; int *my_data, *buf; void **base_ptrs; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc); buf = malloc(DATA_SZ); base_ptrs = malloc(sizeof(void*)*nproc); for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) { if (rank == 0) printf(" + iteration %d\n", test_iter); /*** Allocate the shared array ***/ ARMCI_Malloc(base_ptrs, DATA_SZ); my_data = base_ptrs[rank]; /*** Get from our right neighbor and verify correct data ***/ ARMCI_Access_begin(my_data); for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter; ARMCI_Access_end(my_data); ARMCI_Barrier(); // Wait for all updates to data to complete ARMCI_Get(base_ptrs[(rank+1) % nproc], buf, DATA_SZ, (rank+1) % nproc); for (i = 0; i < DATA_NELTS; i++) { if (buf[i] != ((rank+1) % nproc)*test_iter) { printf("%d: GET expected %d, got %d\n", rank, (rank+1) % nproc, buf[i]); MPI_Abort(MPI_COMM_WORLD, 1); } } ARMCI_Barrier(); // Wait for all gets to complete /*** Put to our left neighbor and verify correct data ***/ for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter; ARMCI_Put(buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc); ARMCI_Barrier(); // Wait for all updates to data to complete ARMCI_Access_begin(my_data); for (i = 0; i < DATA_NELTS; i++) { if (my_data[i] != ((rank+1) % nproc)*test_iter) { printf("%d: PUT expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]); MPI_Abort(MPI_COMM_WORLD, 1); } } ARMCI_Access_end(my_data); ARMCI_Barrier(); // Wait for all gets to complete /*** Accumulate to our left neighbor and verify correct data ***/ for (i = 0; i < DATA_NELTS; i++) buf[i] = rank; ARMCI_Access_begin(my_data); for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank; ARMCI_Access_end(my_data); ARMCI_Barrier(); int scale = test_iter; ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc); ARMCI_Barrier(); // Wait for all updates to data to complete ARMCI_Access_begin(my_data); for (i = 0; i < DATA_NELTS; i++) { if (my_data[i] != rank + ((rank+1) % nproc)*test_iter) { printf("%d: ACC expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]); //MPI_Abort(MPI_COMM_WORLD, 1); } } ARMCI_Access_end(my_data); ARMCI_Free(my_data); } free(buf); free(base_ptrs); if (rank == 0) printf("Test complete: PASS.\n"); ARMCI_Finalize(); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int i; double **myptrs; double t0, t1, tnbget=0, tnbwait=0, t2=0; MP_INIT(argc,argv); ARMCI_Init(); MP_PROCS(&nprocs); MP_MYID(&me); if (nprocs < 2) ARMCI_Error("This program requires at least to processes", 1); myptrs = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); MP_BARRIER(); if(me == 0) { for(i = 0; i < 10; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1); } t0 = MP_TIMER(); for(i = 0; i < LOOP; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+1, myptrs[me]+i, sizeof(double), me+1); } t1 = MP_TIMER(); printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP); fflush(stdout); t1 = t0 = 0; for(i = 0; i < LOOP; i++) { armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MP_TIMER(); //ARMCI_NbGet(myptrs[me]+i, myptrs[me+1]+i, sizeof(double), me+1, &nbh); ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh); t1 = MP_TIMER(); ARMCI_Wait(&nbh); t2 = MP_TIMER(); tnbget += (t1-t0); tnbwait += (t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); } else sleep(1); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
double time_get(double *src_buf, double *dst_buf, int chunk, int loop, int proc, int levels) { int i, bal = 0; int stride[2]; int count[2]; int stride_levels = levels; double *tmp_buf, *tmp_buf_ptr; double start_time, stop_time, total_time = 0; stride[0] = SIZE * sizeof(double); count[0] = chunk * sizeof(double); count[1] = chunk; if(CHECK_RESULT) { tmp_buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(tmp_buf != NULL); fill_array(tmp_buf, SIZE*SIZE, proc); tmp_buf_ptr = tmp_buf; } start_time = TIMER(); for(i=0; i<loop; i++) { #ifdef FORCE_1D int j; if(levels>0)for(j=0; j< count[1]; j++){ char *s = (char*) src_buf, *d= (char*)dst_buf; s += j*stride[0]; d += j*stride[0]; ARMCI_Get(src_buf, dst_buf, count[0],proc); } else #endif if(levels) ARMCI_GetS(src_buf, stride, dst_buf, stride, count, stride_levels,proc); else ARMCI_Get(src_buf, dst_buf,count[0], proc); if(CHECK_RESULT) { sprintf(check_type, "ARMCI_GetS:"); check_result(tmp_buf_ptr, dst_buf, stride, count, stride_levels); } /* prepare next src and dst ptrs: avoid cache locality */ if(bal == 0) { src_buf += 128; dst_buf += 128; if(CHECK_RESULT) tmp_buf_ptr += 128; bal = 1; } else { src_buf -= 128; dst_buf -= 128; if(CHECK_RESULT) tmp_buf_ptr -= 128; bal = 0; } } stop_time = TIMER(); total_time = (stop_time - start_time); if(CHECK_RESULT) free(tmp_buf); if(total_time == 0.0){ total_time=0.000001; /* workaround for inaccurate timers */ warn_accuracy++; } return(total_time/loop); }
void test_perf_nb(int dry_run) { int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; int stride, k=0, ntimes; double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9; double *dsrc[MAXPROC], scale=1.0; armci_hdl_t hdl_get, hdl_put, hdl_acc; create_array((void**)ddst, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); if(!dry_run)if(me == 0) { printf("\n\t\t\tRemote 1-D Array Section\n"); printf("section get nbget wait put nbput "); printf(" wait acc nbacc wait\n"); printf("------- -------- -------- -------- -------- --------"); printf(" -------- -------- -------- --------\n"); fflush(stdout); } for(loop=1; loop<=MAXELEMS; loop*=2, k++) { elems[1] = loop; ntimes = (int)sqrt((double)(MAXELEMS/elems[1])); if(ntimes <1) ntimes=1; /* -------------------------- SETUP --------------------------- */ /*initializing non-blocking handles,time,src & dst buffers*/ ARMCI_INIT_HANDLE(&hdl_put); ARMCI_INIT_HANDLE(&hdl_get); ARMCI_INIT_HANDLE(&hdl_acc); t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0; for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* bytes transfered */ bytes = sizeof(double)*elems[1]; MP_BARRIER(); /* -------------------------- PUT/GET -------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i))) ARMCI_Error("armci_nbput failed\n",rc); t1 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i))) ARMCI_Error("armci_nbget failed\n",rc); t4 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ nb PUT/GET ------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put))) ARMCI_Error("armci_nbput failed\n",rc); t2 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_put); t3 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get))) ARMCI_Error("armci_nbget failed\n",rc); t5 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_get); t6 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ Accumulate ------------------------- */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0))) ARMCI_Error("armci_acc failed\n",rc); t7 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #if 1 /* See the note below why this part is disabled */ /* ---------------------- nb-Accumulate ------------------------ */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) ARMCI_Error("armci_nbacc failed\n",rc); t8 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_acc); t9 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #endif /* print timings */ if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes); } ARMCI_AllFence(); MP_BARRIER(); if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst); destroy_array((void **)dsrc); }
int main(int argc, char **argv) { int i, rank, nranks, msgsize, dest; long bufsize; double **buffer; double t_start, t_stop, t_latency; int provided; ARMCI_Init_args(&argc, &argv); rank = A1_Process_id(A1_GROUP_WORLD); nranks = A1_Process_total(A1_GROUP_WORLD); bufsize = MAX_MSG_SIZE * (ITERATIONS + SKIP); buffer = (double **) malloc(sizeof(double *) * nranks); ARMCI_Malloc((void **) buffer, bufsize); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } A1_Barrier_group(A1_GROUP_WORLD); if (rank == 0) { printf("ARMCI_Get Latency in usec \n"); printf("%20s %22s \n", "Message Size", "Latency"); fflush(stdout); dest = 1; for (msgsize = sizeof(double); msgsize <= MAX_MSG_SIZE; msgsize *= 2) { for (i = 0; i < ITERATIONS + SKIP; i++) { if (i == SKIP) t_start = A1_Time_seconds(); ARMCI_Get((void *) ((size_t) buffer[dest] + (size_t)(i * msgsize)), (void *) ((size_t) buffer[rank] + (size_t)(i * msgsize)), msgsize, 1); } t_stop = A1_Time_seconds(); printf("%20d %20.2f \n", msgsize, ((t_stop - t_start) * 1000000) / ITERATIONS); fflush(stdout); for (i = 0; i < ((ITERATIONS + SKIP) * msgsize) / sizeof(double); i++) { if (*(buffer[rank] + i) != (1.0 + dest)) { printf("Data validation failed At displacement : %d Expected : %f Actual : %f \n", i, (1.0 + dest), *(buffer[rank] + i)); fflush(stdout); return -1; } } for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } } } A1_Barrier_group(A1_GROUP_WORLD); ARMCI_Free(buffer[rank]); ARMCI_Finalize(); return 0; }
static void contig_test(size_t buffer_size, int op) { void **dst_ptr; void **put_buf; void **get_buf; double *times; dst_ptr = (void*)malloc(nproc * sizeof(void*)); put_buf = (void*)malloc(nproc * sizeof(void*)); get_buf = (void*)malloc(nproc * sizeof(void*)); times = (double*)malloc(nproc * sizeof(double)); ARMCI_Malloc(dst_ptr, buffer_size); ARMCI_Malloc(put_buf, buffer_size); ARMCI_Malloc(get_buf, buffer_size); /* initialize what we're putting */ fill_array((double*)put_buf[me], buffer_size/sizeof(double), me); size_t msg_size; int dst = 1; double scale = 1.0; for (msg_size = 16; msg_size <= buffer_size; msg_size *= 2) { int j; int iter = msg_size > MEDIUM_MESSAGE_SIZE ? ITER_LARGE : ITER_SMALL; double t_start, t_end; if (0 == me) { for (j= 0; j < iter + WARMUP; ++j) { if (WARMUP == j) { t_start = dclock(); } switch (op) { case PUT: ARMCI_Put(put_buf[me], dst_ptr[dst], msg_size, dst); break; case GET: ARMCI_Get(dst_ptr[dst], get_buf[me], msg_size, dst); break; case ACC: ARMCI_Acc(ARMCI_ACC_DBL, &scale, put_buf[me], dst_ptr[dst], msg_size, dst); break; default: ARMCI_Error("oops", 1); } } } /* calculate total time and average time */ t_end = dclock(); ARMCI_Barrier(); if (0 == me) { printf("%8zu\t\t%6.2f\t\t%10.2f\n", msg_size, ((t_end - t_start))/iter, msg_size*iter/((t_end - t_start))); } } ARMCI_Free(dst_ptr[me]); ARMCI_Free(put_buf[me]); ARMCI_Free(get_buf[me]); free(dst_ptr); free(put_buf); free(get_buf); free(times); }
/** Non-blocking get operation. Note: the implementation is not non-blocking */ int ARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *handle) { return ARMCI_Get(src, dst, bytes, proc); }