void test_aggregate(int dryrun) { int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; double *ddst_put[MAXPROC]; double *ddst_get[MAXPROC]; double *dsrc[MAXPROC]; armci_hdl_t aggr_hdl_put[MAXPROC]; armci_hdl_t aggr_hdl_get[MAXPROC]; armci_hdl_t hdl_put[MAXELEMS]; armci_hdl_t hdl_get[MAXELEMS]; armci_giov_t darr; void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS]; int start = 0, end = 0; double start_time; create_array((void**)ddst_put, sizeof(double),2, elems); create_array((void**)ddst_get, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) { ddst_put[me][i]=0.0; ddst_get[me][i]=0.0; } MP_BARRIER(); /* only proc 0 does the work */ if(me == 0) { if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS); /* initializing non-blocking handles */ for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]); for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]); /* aggregate handles */ for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]); bytes = sizeof(double); /* **************** PUT **************** */ /* register put */ start_time=MP_TIMER(); start = 0; end = elems[1]; for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, &hdl_put[j]); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Value Put time = %.2es\n", me, MP_TIMER()-start_time); /* vector put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[me][j]; dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i]))) ARMCI_Error("armci_nbputv failed\n",rc); } for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]); if(!dryrun)printf("%d: Vector Put time = %.2es\n", me, MP_TIMER()-start_time); /* regular put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &hdl_put[j]))) ARMCI_Error("armci_nbput failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Regular Put time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &aggr_hdl_put[i]))) ARMCI_Error("armci_nbput failed\n",rc); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]); if(!dryrun)printf("%d: Aggregate Put time = %.2es\n\n", me, MP_TIMER()-start_time); /* **************** GET **************** */ /* vector get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[i][j]; dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i]))) ARMCI_Error("armci_nbgetv failed\n",rc); ARMCI_Wait(&hdl_get[i]); } if(!dryrun)printf("%d: Vector Get time = %.2es\n", me, MP_TIMER()-start_time); /* regular get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes, i, &hdl_get[j]))) ARMCI_Error("armci_nbget failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]); } if(!dryrun)printf("%d: Regular Get time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes, i, &aggr_hdl_get[i]); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]); if(!dryrun)printf("%d: Aggregate Get time = %.2es\n", me, MP_TIMER()-start_time); } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); /* Verify */ if(!(me==0)) for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) { ARMCI_Error("aggregate put failed...1", 0); } } MP_BARRIER(); if(!dryrun)if(me==0) printf("\n aggregate put ..O.K.\n"); fflush(stdout); if(me==0) { for(i=1; i<nproc; i++) { for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) { ARMCI_Error("aggregate get failed...1", 0); } } } } MP_BARRIER(); if(!dryrun)if(me==0) printf(" aggregate get ..O.K.\n"); fflush(stdout); ARMCI_AllFence(); MP_BARRIER(); if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst_put); destroy_array((void **)ddst_get); destroy_array((void **)dsrc); }
void test_perf_nb(int dry_run) { int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; int stride, k=0, ntimes; double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9; double *dsrc[MAXPROC], scale=1.0; armci_hdl_t hdl_get, hdl_put, hdl_acc; create_array((void**)ddst, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); if(!dry_run)if(me == 0) { printf("\n\t\t\tRemote 1-D Array Section\n"); printf("section get nbget wait put nbput "); printf(" wait acc nbacc wait\n"); printf("------- -------- -------- -------- -------- --------"); printf(" -------- -------- -------- --------\n"); fflush(stdout); } for(loop=1; loop<=MAXELEMS; loop*=2, k++) { elems[1] = loop; ntimes = (int)sqrt((double)(MAXELEMS/elems[1])); if(ntimes <1) ntimes=1; /* -------------------------- SETUP --------------------------- */ /*initializing non-blocking handles,time,src & dst buffers*/ ARMCI_INIT_HANDLE(&hdl_put); ARMCI_INIT_HANDLE(&hdl_get); ARMCI_INIT_HANDLE(&hdl_acc); t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0; for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* bytes transfered */ bytes = sizeof(double)*elems[1]; MP_BARRIER(); /* -------------------------- PUT/GET -------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i))) ARMCI_Error("armci_nbput failed\n",rc); t1 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i))) ARMCI_Error("armci_nbget failed\n",rc); t4 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ nb PUT/GET ------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put))) ARMCI_Error("armci_nbput failed\n",rc); t2 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_put); t3 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get))) ARMCI_Error("armci_nbget failed\n",rc); t5 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_get); t6 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ Accumulate ------------------------- */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0))) ARMCI_Error("armci_acc failed\n",rc); t7 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #if 1 /* See the note below why this part is disabled */ /* ---------------------- nb-Accumulate ------------------------ */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) ARMCI_Error("armci_nbacc failed\n",rc); t8 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_acc); t9 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #endif /* print timings */ if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes); } ARMCI_AllFence(); MP_BARRIER(); if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst); destroy_array((void **)dsrc); }
int main(int argc, char *argv[]) { size_t i, rank, nranks, msgsize, dest; size_t iterations, max_msgsize; int bufsize; double **buffer; double t_start, t_stop, t_total, d_total; double expected, bandwidth; int provided; armci_hdl_t handle; max_msgsize = MAX_MSGSIZE; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); ARMCI_Init_args(&argc, &argv); bufsize = max_msgsize * ITERATIONS_LARGE; buffer = (double **) malloc(sizeof(double *) * nranks); ARMCI_Malloc((void **) buffer, bufsize); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_INIT_HANDLE(&handle); ARMCI_SET_AGGREGATE_HANDLE(&handle); ARMCI_Barrier(); if (rank == 0) { printf("ARMCI_Put Bandwidth in MBPS \n"); printf("%20s %22s \n", "Message Size", "Bandwidth"); fflush(stdout); dest = 1; expected = 1 + dest; for (msgsize = sizeof(double); msgsize <= max_msgsize; msgsize *= 2) { if (msgsize <= 16 * 1024) iterations = ITERATIONS_VERYSMALL; else if (msgsize <= 64 * 1024) iterations = ITERATIONS_SMALL; else if (msgsize <= 512 * 1024) iterations = ITERATIONS_MEDIUM; else iterations = ITERATIONS_LARGE; t_start = MPI_Wtime(); for (i = 0; i < iterations; i++) { ARMCI_NbPut((void *) ((size_t) buffer[dest] + (size_t)(i * msgsize)), (void *) ((size_t) buffer[rank] + (size_t)(i * msgsize)), msgsize, dest, &handle); } ARMCI_Wait(&handle); t_stop = MPI_Wtime(); d_total = (iterations * msgsize) / (1024 * 1024); t_total = t_stop - t_start; bandwidth = d_total / t_total; printf("%20d %20.4lf \n", msgsize, bandwidth); fflush(stdout); ARMCI_Fence(dest); } } ARMCI_Barrier(); ARMCI_UNSET_AGGREGATE_HANDLE(&handle); ARMCI_Free((void *) buffer[rank]); ARMCI_Finalize(); MPI_Finalize(); return 0; }
void lu(int n, int bs, int me) { int i, il, j, jl, k, kl; int I, J, K; double *A, *B, *C, *D; int dimI, dimJ, dimK; int strI, strJ, strK; unsigned int t1, t2, t3, t4, t11, t22; int diagowner, destp, hc, m; double *dbuf; armci_hdl_t handle[2*MAXPROC]; int saved[MAXPROC]; dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double)); for (k=0, K=0; k<n; k+=bs, K++) { kl = k + bs; if (kl > n) { kl = n; strK = kl - k; } else { strK = bs; } /* factor diagonal block */ diagowner = block_owner(K, K); if (diagowner == me) { A = a[K+K*nblocks]; lu0(A, strK, strK); /* impl algo on this diag block */ } MP_BARRIER(); /* divide column k by diagonal block */ if(block_owner(K, K) == me) D = a[K+K*nblocks]; else { D = dbuf; get_remote(D, K, K); } for (i=kl, I=K+1; i<n; i+=bs, I++) { if (block_owner(I, K) == me) { /* parcel out blocks */ il = i + bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } A = a[I+K*nblocks]; bdiv(A, D, strI, strK, strI, strK); /* Pre-put this block to the block-owners of all blocks on the I-th row with a non-blocking put*/ memset (saved, 0, sizeof(saved)); for (m = K+1; m < nblocks; m++) { destp = block_owner (I, m); if (destp != me && !saved[destp]) { ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL); saved[destp] = 1; } } } } /* end of for (i=k1, I=K+1...) */ /* modify row k by diagonal block */ for (j=kl, J=K+1; j<n; j+=bs, J++) { if (block_owner(K, J) == me) { /* parcel out blocks */ jl = j+bs; if (jl > n) { jl = n; strJ = jl - j; } else { strJ = bs; } A = a[K+J*nblocks]; bmodd(D, A, strK, strJ, strK, strK); /* Pre-put this block to the block-owners of all blocks on the J-th column with a non-blocking put*/ memset (saved, 0, sizeof(saved)); for (m = K+1; m < nblocks; m++) { destp = block_owner (m, J); if (destp != me && !saved[destp]) { ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL); saved[destp] = 1; } } } } ARMCI_WaitAll(); ARMCI_AllFence(); MP_BARRIER(); /* modify subsequent block columns */ for (i=kl, I=K+1; i<n; i+=bs, I++) { il = i+bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } for (j=kl, J=K+1; j<n; j+=bs, J++) { jl = j + bs; if (jl > n) { jl = n; strJ= jl - j; } else { strJ = bs; } if (block_owner(I, J) == me) { /* parcel out blocks */ if(block_owner(I,K) == me) A = a[I+K*nblocks]; else { A = bufc[me*nblocks+I]; } if(block_owner(K,J) == me) B = a[K+J*nblocks]; else B = bufr[me*nblocks + J]; C = a[I+J*nblocks]; bmod(A, B, C, strI, strJ, strK, strI, strK, strI); } } } } ARMCI_Free_local(dbuf); }