void DDI_ARMCI_Memory_init(size_t size) { int code; const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD); // malloc ARMCI memory code = ARMCI_Malloc((void*)gv(armci_mem_addr),size); if (code > 0) { ARMCI_Error("ARMCI_Malloc failed",code); Fatal_error(911); } gv(dda_index) = (DDA_Index*)gv(armci_mem_addr)[comm->me]; // malloc ARMCI counter block and set addresses code = ARMCI_Malloc((void*)gv(armci_cnt_addr),sizeof(armci_counter_t)*2); if (code > 0) { ARMCI_Error("ARMCI_Malloc failed",code); Fatal_error(911); } ARMCI_PutValueLong(0, (void*)(gv(armci_cnt_addr)[comm->me]+0), comm->me); ARMCI_PutValueLong(0, (void*)(gv(armci_cnt_addr)[comm->me]+1), comm->me); DDI_ARMCI_DLB_addr(); DDI_ARMCI_GDLB_addr(); // create mutexes code = ARMCI_Create_mutexes(MAX_DD_ARRAYS+1); if (code > 0) { ARMCI_Error("ARMCI_Create_mutexes failed",code); Fatal_error(911); } gv(dlb_access) = MAX_DD_ARRAYS; }
/*
 * ARMCI GOP test driver: exercises the "absmin" and "absmax" integer
 * reductions of armci_msg_igop.
 *
 * Every rank fills a DATA_SZ-element buffer with +/-(rank+1) (sign
 * alternating by index).  After "absmin" every element must be 1; after
 * "absmax" every element must be nproc.  A mismatch is printed and reported
 * through ARMCI_Error.
 *
 * Returns 0 when all checks pass.
 */
int main(int argc, char ** argv) {
    int rank, nproc, i;
    int *buf;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    if (rank == 0)
        printf("Starting ARMCI GOP test with %d processes\n", nproc);

    buf = malloc(DATA_SZ * sizeof *buf);
    if (buf == NULL)
        ARMCI_Error("malloc failed", 1);  /* never dereference a failed allocation */

    if (rank == 0)
        printf(" - Testing ABSMIN\n");

    for (i = 0; i < DATA_SZ; i++)
        buf[i] = (rank+1) * ((i % 2) ? -1 : 1);

    armci_msg_igop(buf, DATA_SZ, "absmin");

    for (i = 0; i < DATA_SZ; i++) {
        if (buf[i] != 1) {
            printf("Err: buf[%d] = %d expected 1\n", i, buf[i]);
            ARMCI_Error("Fail", 1);
        }
    }

    if (rank == 0)
        printf(" - Testing ABSMAX\n");

    for (i = 0; i < DATA_SZ; i++)
        buf[i] = (rank+1) * ((i % 2) ? -1 : 1);

    armci_msg_igop(buf, DATA_SZ, "absmax");

    for (i = 0; i < DATA_SZ; i++) {
        if (buf[i] != nproc) {
            printf("Err: buf[%d] = %d expected %d\n", i, buf[i], nproc);
            ARMCI_Error("Fail", 1);
        }
    }

    free(buf);

    if (rank == 0)
        printf("Pass.\n");

    ARMCI_Finalize();
    MPI_Finalize();

    return 0;
}
static void get_data(int n, int start, int end, double *vec_local, double **vec) { int i, j, rc, bytes, offset; int proc_start, proc_end, idx_start, idx_end; proc_start = proc_end = -1; for(i=0; i<nproc; i++) { if(proc_start<0 && proc_row_list[i]>start) proc_start = i; if(proc_end<0 && proc_row_list[i]>end) proc_end = i; } if(proc_start<0 || proc_end<0) ARMCI_Error("Invalid Process Ids", -1); for(i=proc_start; i<=proc_end; i++) { if(i==proc_start) idx_start = start; else { if(i==0) idx_start=0; else idx_start = proc_row_list[i-1]; } if(i==proc_end) idx_end = end; else idx_end = proc_row_list[i]-1; if(i!=prev_proc) { ++count; prev_proc = i; ARMCI_INIT_HANDLE(&gHandle[count]); ARMCI_SET_AGGREGATE_HANDLE(&gHandle[count]); } if(i==0) offset=0; else offset = proc_row_list[i-1]; if(i==me) { /* local */ for(j=idx_start; j<=idx_end; j++) vec_local[j] = vec[me][j-offset]; } else { /* remote */ bytes = (idx_end-idx_start+1)*sizeof(double); vec_local[idx_start] = -1; #if 0 if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start], bytes, i))) #else if((rc=ARMCI_NbGet(&vec[i][idx_start-offset], &vec_local[idx_start], bytes, i, &gHandle[count]))) #endif ARMCI_Error("armci_nbget failed\n",rc); } } }
void verify_results(int op, int *elems) { int i, j; switch(op) { case PUT: if(!(me==0)) for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst[me][j]-j*1.001) > 0.1) { ARMCI_Error("put failed...Invalid Value Obtained..1", 0); } } MP_BARRIER(); if(DEBUG) if(me==0) printf(" verifying put ..O.K.\n"); break; case GET: if(me==0) { for(i=1; i<nproc; i++) { for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) ARMCI_Error("get failed...Invalid Value Obtained..1", 0); } } } MP_BARRIER(); if(DEBUG) if(me==0) printf(" verifying get ..O.K.\n\n"); break; case ACC: if(me==0) for(j=0; j<elems[1]; j++) { /*printf("ddst[%d][%d] = %lf\n", me, j, ddst[me][j]); fflush(stdout); */ if( ARMCI_ABS(ddst[me][j]-(double)nproc) > 0.1) { ARMCI_Error("accumulate failed...Invalid Value Obtained..1", 0); } } MP_BARRIER(); if(DEBUG)if(me==0) printf(" verifying accumulate ..O.K.\n"); break; default: ARMCI_Error("Invalid Operation", 0); } fflush(stdout); }
/*
 * Placeholder for gathering the distributed solution vector.
 *
 * svec - per-process base pointers of the solution vector; not read or
 *        written.
 *
 * This function is a no-op.  Its only former content was a compiled-out
 * fragment that referred to variables this function never declared, so it
 * was removed rather than kept as dead code.
 */
static void gather_solution_vector(double **svec) {
    (void)svec;  /* intentionally unused */
}
/*
 * Put test inside one ARMCI group.
 *
 * group    - group to test; ranks and size are queried from it
 * pid_list - not used by this routine
 *
 * Every member allocates ELEMS doubles with ARMCI_Malloc_group and fills its
 * own buffer with -1.0.  Group rank 0 then puts i*1.001*(rank+1) values into
 * the buffer of the last group rank, which checks them with a tolerance of
 * 0.1.  Failures of the allocation, of the put, or of the comparison are
 * reported through ARMCI_Error.
 */
void test_one_group(ARMCI_Group *group, int *pid_list) {
    int grp_me, grp_size;
    int i, j, rc, src_proc, dst_proc;
    double *ddst_put[MAXPROC];
    double dsrc[ELEMS];
    int bytes, world_me;

    (void)pid_list;

    MP_MYID(&world_me);
    ARMCI_Group_rank(group, &grp_me);
    ARMCI_Group_size(group, &grp_size);
    if (grp_me == 0)
        printf("GROUP SIZE = %d\n", grp_size);
    printf("%d:group rank = %d\n", me, grp_me);

    src_proc = 0;
    dst_proc = grp_size-1;

    bytes = ELEMS*sizeof(double);
    rc = ARMCI_Malloc_group((void **)ddst_put, bytes, group);
    if (rc)
        ARMCI_Error("groups: ARMCI_Malloc_group failed", rc);

    for (i = 0; i < ELEMS; i++)
        dsrc[i] = i*1.001*(grp_me+1);
    for (i = 0; i < ELEMS; i++)
        ddst_put[grp_me][i] = -1.0;

    armci_msg_group_barrier(group);

    if (grp_me == src_proc) {
        /* NOTE: make sure to specify absolute ids in ARMCI calls */
        rc = ARMCI_Put(dsrc, &ddst_put[dst_proc][0], bytes,
                       ARMCI_Absolute_id(group, dst_proc));
        if (rc)
            ARMCI_Error("groups: ARMCI_Put failed", rc);
    }

    armci_msg_group_barrier(group);
    /* NOTE: make sure to specify absolute ids in ARMCI calls */
    /* NOTE(review): the fence runs after the barrier, so the sleep below
     * appears to be what gives the put time to land before verification --
     * confirm before reordering or removing either. */
    ARMCI_Fence(ARMCI_Absolute_id(group, dst_proc));
    sleep(1);

    /* Verify */
    if (grp_me == dst_proc) {
        for (j = 0; j < ELEMS; j++) {
            if (ARMCI_ABS(ddst_put[grp_me][j] - j*1.001*(src_proc+1)) > 0.1) {
                printf("\t%d: ddst_put[%d][%d] = %lf and expected value is %lf\n",
                       me, grp_me, j, ddst_put[grp_me][j], j*1.001*(src_proc+1));
                ARMCI_Error("groups: armci put failed...1", 0);
            }
        }
        printf("\n%d(%d): Test O.K. Verified\n", dst_proc, world_me);
    }

    armci_msg_group_barrier(group);
    ARMCI_Free_group(ddst_put[grp_me], group);
}
/*
 * Driver for the ARMCI group tests.
 *
 * Initialises the message layer and ARMCI, runs test_groups() and, when
 * ARMCI_GROUP is defined, test_groups_noncollective(), then shuts both
 * layers down.  Rank 0 aborts through ARMCI_Error when the process count is
 * outside [MINPROC, MAXPROC].
 *
 * Returns 0 on success.
 */
int main(int argc, char* argv[]) {
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if ((nproc < MINPROC || nproc > MAXPROC) && me == 0) {
        /* ARMCI_Error takes a plain message plus an integer code, so the
         * limits have to be formatted into the message beforehand. */
        char msg[80];
        snprintf(msg, sizeof msg, "Test works for %d to %d processors",
                 MINPROC, MAXPROC);
        ARMCI_Error(msg, MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Testing ARMCI Groups!\n\n");
        fflush(stdout);
    }

    test_groups();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);

#ifdef ARMCI_GROUP
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Non-collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);
#endif

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
void compare_patches(double eps, int ndim, double *patch1, int lo1[], int hi1[], int dims1[],double *patch2, int lo2[], int hi2[], int dims2[]) { int i,j, elems=1; int subscr1[MAXDIMS], subscr2[MAXDIMS]; double diff,max; for(i=0;i<ndim;i++){ int diff = hi1[i]-lo1[i]; assert(diff == (hi2[i]-lo2[i])); assert(diff < dims1[i]); assert(diff < dims2[i]); elems *= diff+1; subscr1[i]= lo1[i]; subscr2[i]=lo2[i]; } for(j=0; j< elems; j++){ int idx1=0, idx2=0, offset1=0, offset2=0; idx1 = Index(ndim, subscr1, dims1); idx2 = Index(ndim, subscr2, dims2); if(j==0){ offset1 =idx1; offset2 =idx2; } idx1 -= offset1; idx2 -= offset2; diff = patch1[idx1] - patch2[idx2]; max = ARMCI_MAX(ARMCI_ABS(patch1[idx1]),ARMCI_ABS(patch2[idx2])); if(max == 0. || max <eps) max = 1.; if(eps < ARMCI_ABS(diff)/max){ char msg[48]; sprintf(msg,"(proc=%d):%f",me,patch1[idx1]); print_subscript("ERROR: a",ndim,subscr1,msg); sprintf(msg,"%f\n",patch2[idx2]); print_subscript(" b",ndim,subscr2,msg); fflush(stdout); sleep(1); ARMCI_Error("Bailing out",0); } update_subscript(ndim, subscr1, lo1,hi1, dims1); update_subscript(ndim, subscr2, lo2,hi2, dims2); } }
/*
 * Driver for the sparse matrix-vector multiplication test.
 *
 * Initialises the armci_msg layer and ARMCI, runs test_sparse(), then
 * fences, synchronises and finalises both layers.  Rank 0 aborts through
 * ARMCI_Error when more than MAXPROC processes are used.
 *
 * Returns 0 on success.
 */
int main(int argc, char* argv[]) {
    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    if (nproc > MAXPROC && me == 0) {
        /* ARMCI_Error takes a plain message plus an integer code, so the
         * limit has to be formatted into the message beforehand. */
        char msg[64];
        snprintf(msg, sizeof msg, "Test works for up to %d processors", MAXPROC);
        ARMCI_Error(msg, MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }

    test_sparse();

    ARMCI_AllFence();
    armci_msg_barrier();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return(0);
}
/*
 * Compare a strided source region against a strided destination region.
 *
 * src_buf, dst_buf - base addresses of the two regions
 * stride           - byte strides; stride[l-1] is applied for level l
 * count            - count[0] is the byte length of one contiguous block,
 *                    count[l] the number of blocks at level l
 * stride_levels    - number of stride levels
 *
 * The same byte offset is applied to both buffers for every contiguous
 * block, and the doubles in the block are compared with a tolerance of
 * 0.000001.  The first mismatch is printed (tagged with check_type) and
 * reported through ARMCI_Error.
 */
void check_result(double *src_buf, double *dst_buf, int *stride, int *count, int stride_levels) {
    int blk, lvl, k;
    int nblocks = 1;            /* number of contiguous (1-D) blocks */
    int bvalue[ARMCI_MAX_STRIDE_LEVEL], bunit[ARMCI_MAX_STRIDE_LEVEL];

    for (lvl = 1; lvl <= stride_levels; lvl++)
        nblocks *= count[lvl];

    /* per-level block counters and the period at which each one advances */
    bvalue[0] = 0;
    bvalue[1] = 0;
    bunit[0] = 1;
    bunit[1] = 1;
    for (lvl = 2; lvl <= stride_levels; lvl++) {
        bvalue[lvl] = 0;
        bunit[lvl] = bunit[lvl-1] * count[lvl-1];
    }

    for (blk = 0; blk < nblocks; blk++) {
        long idx = 0;           /* byte offset of this block */
        int ndoubles;
        double *src_row;
        double *dst_row;

        for (lvl = 1; lvl <= stride_levels; lvl++) {
            idx += bvalue[lvl] * stride[lvl-1];
            if ((blk+1) % bunit[lvl] == 0)
                bvalue[lvl]++;
            if (bvalue[lvl] > (count[lvl]-1))
                bvalue[lvl] = 0;
        }

        ndoubles = count[0] / sizeof(double);
        src_row = (double *)((char *)src_buf + idx);
        dst_row = (double *)((char *)dst_buf + idx);

        for (k = 0; k < ndoubles; k++) {
            if (ARMCI_ABS(src_row[k] - dst_row[k]) > 0.000001) {
                fprintf(stdout, "Error:%s comparison failed: (%d) (%f :%f) %d\n",
                        check_type, k, src_row[k], dst_row[k], count[0]);
                ARMCI_Error("failed", 0);
            }
        }
    }
}
/*
 * Driver for the aggregate put/get test (ARMCI_Init_args variant).
 *
 * Initialises ARMCI, runs test_aggregate() once as a cold start (dry run)
 * and once as a warm start, then fences, synchronises and finalises.
 * Rank 0 aborts through ARMCI_Error when more than MAXPROC processes are
 * used.
 *
 * Returns 0 on success.
 */
int main(int argc, char *argv[]) {
    ARMCI_Init_args(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    if (nproc > MAXPROC && me == 0) {
        /* ARMCI_Error takes a plain message plus an integer code, so the
         * limit has to be formatted into the message beforehand. */
        char msg[64];
        snprintf(msg, sizeof msg, "Test works for up to %d processors", MAXPROC);
        ARMCI_Error(msg, MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    if (me == 0) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }

    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */

    ARMCI_AllFence();
    ARMCI_Barrier();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    ARMCI_Barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return(0);
}
/*
 * Driver for the aggregate put/get test (MP_INIT variant).
 *
 * Initialises the message layer and ARMCI, runs test_aggregate() once as a
 * cold start (dry run) and once as a warm start, then fences, synchronises
 * and finalises both layers.  Rank 0 aborts through ARMCI_Error when more
 * than MAXPROC processes are used.
 *
 * Returns 0 on success.
 */
int main(int argc, char* argv[]) {
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (nproc > MAXPROC && me == 0) {
        /* ARMCI_Error takes a plain message plus an integer code, so the
         * limit has to be formatted into the message beforehand. */
        char msg[64];
        snprintf(msg, sizeof msg, "Test works for up to %d processors", MAXPROC);
        ARMCI_Error(msg, MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }

    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
static void load_balance(int n, int non_zero, int *row_ind_tmp) { int proc_id, i, local_nz, local_nz_acc, A, B; local_nz = local_nz_acc = non_zero/nproc; /* number of rows owned by each process is stored in proc_row_list. This is supposed to be well load balanced, so that each process has almost same number of non-zero elements */ proc_id = 0; if(me==0) printf("local_nz = %d\n", local_nz); for(i=0; i<n; i++) { /* as # of entries in row_ind_tmp = n+1 */ if(row_ind_tmp[i] < local_nz_acc && row_ind_tmp[i+1] >= local_nz_acc) { proc_row_list[proc_id++] = i+1; local_nz_acc = local_nz*(proc_id+1); if(proc_id == nproc-1) local_nz_acc = non_zero; if(me==0 && proc_id<nproc) printf("local_nz = %d\n", local_nz_acc); } } proc_row_list[nproc-1] = n; for(i=0; i<nproc; i++) { A = (i==0) ? 0: proc_row_list[i-1];/* # of entries in row_ind_tmp is n+1*/ B = proc_row_list[i]; proc_nz_list[i] = row_ind_tmp[B]-row_ind_tmp[A]; } if(proc_id != nproc) ARMCI_Error("Error while preparing Process Row list", proc_id-1); #if 1 if(me==0) verify_list(proc_row_list); #endif }
void test_aggregate(int dryrun) { int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; double *ddst_put[MAXPROC]; double *ddst_get[MAXPROC]; double *dsrc[MAXPROC]; armci_hdl_t aggr_hdl_put[MAXPROC]; armci_hdl_t aggr_hdl_get[MAXPROC]; armci_hdl_t hdl_put[MAXELEMS]; armci_hdl_t hdl_get[MAXELEMS]; armci_giov_t darr; void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS]; int start = 0, end = 0; double start_time; create_array((void**)ddst_put, sizeof(double),2, elems); create_array((void**)ddst_get, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) { ddst_put[me][i]=0.0; ddst_get[me][i]=0.0; } MP_BARRIER(); /* only proc 0 does the work */ if(me == 0) { if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS); /* initializing non-blocking handles */ for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]); for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]); /* aggregate handles */ for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]); bytes = sizeof(double); /* **************** PUT **************** */ /* register put */ start_time=MP_TIMER(); start = 0; end = elems[1]; for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, &hdl_put[j]); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Value Put time = %.2es\n", me, MP_TIMER()-start_time); /* vector put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[me][j]; dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); 
darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i]))) ARMCI_Error("armci_nbputv failed\n",rc); } for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]); if(!dryrun)printf("%d: Vector Put time = %.2es\n", me, MP_TIMER()-start_time); /* regular put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &hdl_put[j]))) ARMCI_Error("armci_nbput failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Regular Put time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &aggr_hdl_put[i]))) ARMCI_Error("armci_nbput failed\n",rc); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]); if(!dryrun)printf("%d: Aggregate Put time = %.2es\n\n", me, MP_TIMER()-start_time); /* **************** GET **************** */ /* vector get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[i][j]; dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i]))) ARMCI_Error("armci_nbgetv failed\n",rc); ARMCI_Wait(&hdl_get[i]); } if(!dryrun)printf("%d: Vector Get time = %.2es\n", me, MP_TIMER()-start_time); /* regular get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes, i, &hdl_get[j]))) ARMCI_Error("armci_nbget failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]); } if(!dryrun)printf("%d: Regular Get time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbGet(&dsrc[i][j], 
&ddst_get[me][i*elems[1]+j], bytes, i, &aggr_hdl_get[i]); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]); if(!dryrun)printf("%d: Aggregate Get time = %.2es\n", me, MP_TIMER()-start_time); } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); /* Verify */ if(!(me==0)) for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) { ARMCI_Error("aggregate put failed...1", 0); } } MP_BARRIER(); if(!dryrun)if(me==0) printf("\n aggregate put ..O.K.\n"); fflush(stdout); if(me==0) { for(i=1; i<nproc; i++) { for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) { ARMCI_Error("aggregate get failed...1", 0); } } } } MP_BARRIER(); if(!dryrun)if(me==0) printf(" aggregate get ..O.K.\n"); fflush(stdout); ARMCI_AllFence(); MP_BARRIER(); if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst_put); destroy_array((void **)ddst_get); destroy_array((void **)dsrc); }
void test_perf_nb(int dry_run) { int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; int stride, k=0, ntimes; double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9; double *dsrc[MAXPROC], scale=1.0; armci_hdl_t hdl_get, hdl_put, hdl_acc; create_array((void**)ddst, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); if(!dry_run)if(me == 0) { printf("\n\t\t\tRemote 1-D Array Section\n"); printf("section get nbget wait put nbput "); printf(" wait acc nbacc wait\n"); printf("------- -------- -------- -------- -------- --------"); printf(" -------- -------- -------- --------\n"); fflush(stdout); } for(loop=1; loop<=MAXELEMS; loop*=2, k++) { elems[1] = loop; ntimes = (int)sqrt((double)(MAXELEMS/elems[1])); if(ntimes <1) ntimes=1; /* -------------------------- SETUP --------------------------- */ /*initializing non-blocking handles,time,src & dst buffers*/ ARMCI_INIT_HANDLE(&hdl_put); ARMCI_INIT_HANDLE(&hdl_get); ARMCI_INIT_HANDLE(&hdl_acc); t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0; for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* bytes transfered */ bytes = sizeof(double)*elems[1]; MP_BARRIER(); /* -------------------------- PUT/GET -------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i))) ARMCI_Error("armci_nbput failed\n",rc); t1 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i))) ARMCI_Error("armci_nbget failed\n",rc); t4 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) 
ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ nb PUT/GET ------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put))) ARMCI_Error("armci_nbput failed\n",rc); t2 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_put); t3 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get))) ARMCI_Error("armci_nbget failed\n",rc); t5 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_get); t6 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ Accumulate ------------------------- */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0))) ARMCI_Error("armci_acc failed\n",rc); t7 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #if 1 /* See the note below why this part is disabled */ /* ---------------------- nb-Accumulate ------------------------ */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) ARMCI_Error("armci_nbacc failed\n",rc); t8 += MP_TIMER()-stime; stime=MP_TIMER(); 
ARMCI_Wait(&hdl_acc); t9 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #endif /* print timings */ if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes); } ARMCI_AllFence(); MP_BARRIER(); if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst); destroy_array((void **)dsrc); }
void compare_patches(double eps, int ndim, double *patch1, int lo1[], int hi1[], int dims1[], double *patch2, int lo2[], int hi2[], int dims2[]) { int i, j, elems = 1; int subscr1[MAXDIMS], subscr2[MAXDIMS]; double diff, max; int idx1, idx2, offset1, offset2; for (i = 0; i < ndim; i++) { /* count # of elements & verify consistency of both patches */ int diff = hi1[i] - lo1[i]; assert(diff == (hi2[i] - lo2[i])); assert(diff < dims1[i]); assert(diff < dims2[i]); elems *= diff + 1; subscr1[i] = lo1[i]; subscr2[i] = lo2[i]; } /* compare element values in both patches */ for (j = 0; j < elems; j++) { idx1 = Index(ndim, subscr1, dims1); /* calculate element Index from a subscript */ idx2 = Index(ndim, subscr2, dims2); if (j == 0) { offset1 = idx1; offset2 = idx2; } idx1 -= offset1; idx2 -= offset2; diff = patch1[idx1] - patch2[idx2]; max = ARMCI_MAX(ARMCI_ABS(patch1[idx1]), ARMCI_ABS(patch2[idx2])); if (max == 0. || max < eps) { max = 1.; } if (eps < ARMCI_ABS(diff) / max) { char msg[48]; sprintf(msg, "(proc=%d):%f", me, patch1[idx1]); print_subscript("ERROR: a", ndim, subscr1, msg); sprintf(msg, "%f\n", patch2[idx2]); print_subscript(" b", ndim, subscr2, msg); fflush(stdout); sleep(1); ARMCI_Error("Bailing out", 0); } { /* update subscript for the patches */ update_subscript(ndim, subscr1, lo1, hi1, dims1); update_subscript(ndim, subscr2, lo2, hi2, dims2); } } /* make sure we reached upper limit */ /*for(i=0;i<ndim;i++){ assert(subscr1[i]==hi1[i]); assert(subscr2[i]==hi2[i]); }*/ }
int main(int argc, char **argv) { int i; double **myptrs; double t0, t1, tnbget=0, tnbwait=0, t2=0; MP_INIT(argc,argv); ARMCI_Init(); MP_PROCS(&nprocs); MP_MYID(&me); if (nprocs < 2) ARMCI_Error("This program requires at least to processes", 1); myptrs = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); MP_BARRIER(); if(me == 0) { for(i = 0; i < 10; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1); } t0 = MP_TIMER(); for(i = 0; i < LOOP; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+1, myptrs[me]+i, sizeof(double), me+1); } t1 = MP_TIMER(); printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP); fflush(stdout); t1 = t0 = 0; for(i = 0; i < LOOP; i++) { armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MP_TIMER(); //ARMCI_NbGet(myptrs[me]+i, myptrs[me+1]+i, sizeof(double), me+1, &nbh); ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh); t1 = MP_TIMER(); ARMCI_Wait(&nbh); t2 = MP_TIMER(); tnbget += (t1-t0); tnbwait += (t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); } else sleep(1); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
/*
 * Flush the requests aggregated under nb_handle and reset its buffer.
 *
 * nb_handle - aggregate handle; its tag and proc select the buffer, its op
 *             (PUT or GET) selects the vector call used
 * condition - when equal to UNSET, the handle's proc is set to -1 and
 *             _armci_agg_update_lists() is called for the buffer
 *
 * ulist is searched from its last entry backwards for a buffer whose tag
 * and proc match the handle; if none matches the function returns without
 * doing anything.  Pending requests are issued as one vector operation:
 * with LAPI defined a non-blocking call followed by ARMCI_Wait, otherwise
 * the blocking ARMCI_PutV / ARMCI_GetV.  A non-zero status is reported
 * through ARMCI_Error.  The buffer's request_len and ptr_array_len are then
 * zeroed and buf_pos_end is reset to _MAX_AGG_BUFSIZE.
 */
void armci_agg_complete(armci_ihdl_t nb_handle, int condition) {
    int pos;
    int slot = 0;
    int rc;
#ifdef LAPI
    armci_hdl_t usr_hdl;
#endif

    /* get the buffer index for this handle */
    pos = ulist.size - 1;
    while (pos >= 0) {
        slot = ulist.index[pos];
        if (aggr[slot]->tag == nb_handle->tag &&
            aggr[slot]->proc == nb_handle->proc)
            break;
        pos--;
    }
    if (pos < 0)
        return; /* implies this handle has no requests at all */

    /* complete the data transfer. NOTE: in LAPI, Non-blocking calls
       (followed by wait) performs better than blocking put/get */
    if (aggr[slot]->request_len) {
        switch (nb_handle->op) {
#ifdef LAPI
        case PUT:
            ARMCI_INIT_HANDLE(&usr_hdl);
            rc = ARMCI_NbPutV(aggr[slot]->darr, aggr[slot]->request_len,
                              nb_handle->proc, (armci_hdl_t*)&usr_hdl);
            if (rc)
                ARMCI_Error("armci_agg_complete: nbputv failed", rc);
            ARMCI_Wait((armci_hdl_t*)&usr_hdl);
            break;
        case GET:
            ARMCI_INIT_HANDLE(&usr_hdl);
            rc = ARMCI_NbGetV(aggr[slot]->darr, aggr[slot]->request_len,
                              nb_handle->proc, (armci_hdl_t*)&usr_hdl);
            if (rc)
                ARMCI_Error("armci_agg_complete: nbgetv failed", rc);
            ARMCI_Wait((armci_hdl_t*)&usr_hdl);
            break;
#else
        case PUT:
            rc = ARMCI_PutV(aggr[slot]->darr, aggr[slot]->request_len,
                            nb_handle->proc);
            if (rc)
                ARMCI_Error("armci_agg_complete: putv failed", rc);
            break;
        case GET:
            rc = ARMCI_GetV(aggr[slot]->darr, aggr[slot]->request_len,
                            nb_handle->proc);
            if (rc)
                ARMCI_Error("armci_agg_complete: getv failed", rc);
            break;
#endif
        }
    }

    /* setting request length to zero, as the requests are completed */
    aggr[slot]->request_len = 0;
    aggr[slot]->ptr_array_len = 0;
    aggr[slot]->buf_pos_end = _MAX_AGG_BUFSIZE;

    /* If armci_agg_complete() is called ARMCI_Wait(), then unset nb_handle */
    if (condition == UNSET) {
        nb_handle->proc = -1;
        _armci_agg_update_lists(slot);
    }
}
void TestGlobals() { #define MAXLENG 256*1024 double *dtest; int *itest; long *ltest; int len; int ifrom=nproc-1,lfrom=1,dfrom=1; if (me == 0) { printf("Global test ... broadcast and reduction for int, long, double\n----------\n"); fflush(stdout); } if (!(dtest = (double *) malloc((unsigned) (MAXLENG*sizeof(double))))) ARMCI_Error("TestGlobals: failed to allocated dtest", MAXLENG); if (!(ltest = (long *) malloc((unsigned) (MAXLENG*sizeof(long))))) ARMCI_Error("TestGlobals: failed to allocated ltest", MAXLENG); if (!(itest = (int *) malloc((unsigned) (MAXLENG*sizeof(int))))) ARMCI_Error("TestGlobals: failed to allocated itest", MAXLENG); for (len=1; len<MAXLENG; len*=2) { int ilen = len*sizeof(int); int dlen = len*sizeof(double); int llen = len*sizeof(long); int i; ifrom = (ifrom+1)%nproc; lfrom = (lfrom+1)%nproc; dfrom = (lfrom+1)%nproc; #if 0 printf("%d:ifrom=%d lfrom=%d dfrom=%d\n",me,ifrom,lfrom,dfrom);fflush(stdout); #endif if (me == 0) { printf("Test length = %d ... ", len); fflush(stdout); } if(me == ifrom)for (i=0; i<len; i++)itest[i]=i; else for (i=0; i<len; i++)itest[i]=0; if(me == lfrom)for (i=0; i<len; i++)ltest[i]=(long)i; else for (i=0; i<len; i++)ltest[i]=0L; if(me == dfrom)for (i=0; i<len; i++)dtest[i]=(double)i; else for (i=0; i<len; i++)dtest[i]=0.0; /* Test broadcast */ armci_msg_brdcst(itest, ilen, ifrom); armci_msg_brdcst(ltest, llen, lfrom); armci_msg_brdcst(dtest, dlen, dfrom); for (i=0; i<len; i++){ if (itest[i] != i) armci_die2("int broadcast failed", i,itest[i]); if (ltest[i] != (long)i) armci_die2("long broadcast failed", i,(int)ltest[i]); if (dtest[i] != (double)i) armci_die2("double broadcast failed", i,(int)dtest[i]); } if (me == 0) { printf("broadcast OK ..."); fflush(stdout); } /* Test global sum */ for (i=0; i<len; i++) { itest[i] = i*me; ltest[i] = (long) itest[i]; dtest[i] = (double) itest[i]; } armci_msg_igop(itest, len, "+"); armci_msg_lgop(ltest, len, "+"); armci_msg_dgop(dtest, len, "+"); for (i=0; i<len; i++) { int iresult = 
i*nproc*(nproc-1)/2; if (itest[i] != iresult || ltest[i] != (long)iresult || dtest[i] != (double) iresult) ARMCI_Error("TestGlobals: global sum failed", (int) i); } if (me == 0) { printf("global sums OK\n"); fflush(stdout); } } /* now we get timing data */ time_gop(dtest,MAXLENG); time_reduce(dtest,MAXLENG); free((char *) itest); free((char *) ltest); free((char *) dtest); }
void test_acc_type(const int datatype) { int i = 0; int datatype_size = 0; void * scale; void * a; void *b[MAXPROC]; int elems = ELEMS; int dim = 1; int count = 0; int strideA = 0; int strideB = 0; switch(datatype) { case ARMCI_ACC_INT: datatype_size = sizeof(int); scale = malloc(datatype_size); *((int *) scale) = 1; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((int *) a)[i] = i + me; ((int *) b[me])[i] = 0; } break; case ARMCI_ACC_LNG: datatype_size = sizeof(long); scale = malloc(datatype_size); *((long *) scale) = 1; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((long *) a)[i] = i + me; ((long *) b[me])[i] = 0; } break; case ARMCI_ACC_FLT: datatype_size = sizeof(float); scale = malloc(datatype_size); *((float *) scale) = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((float *) a)[i] = (float) i + me; ((float *) b[me])[i] = 0.0; } break; case ARMCI_ACC_DBL: datatype_size = sizeof(double); scale = malloc(datatype_size); *((double *) scale) = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((double *) a)[i] = (double) i + me; ((double *) b[me])[i] = 0.0; } break; case ARMCI_ACC_CPL: datatype_size = sizeof(cmpl_t); scale = malloc(datatype_size); ((cmpl_t *) scale)->real = 2.0; ((cmpl_t *) scale)->imag = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((cmpl_t *) a)[i].real = ((float) i + me); ((cmpl_t *) a)[i].imag = ((float) i + me); ((cmpl_t *) b[me])[i].real = 0.0; ((cmpl_t *) b[me])[i].imag = 0.0; } break; case ARMCI_ACC_DCP: datatype_size = sizeof(dcmpl_t); scale = malloc(datatype_size); ((dcmpl_t *) scale)->real = 2.0; ((dcmpl_t *) scale)->imag = 1.0; a = malloc(elems * datatype_size); 
create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((dcmpl_t *) a)[i].real = ((double) i + me); ((dcmpl_t *) a)[i].imag = ((double) i + me); ((dcmpl_t *) b[me])[i].real = 0.0; ((dcmpl_t *) b[me])[i].imag = 0.0; } break; default: return; break; } count = elems * datatype_size; strideA = elems * datatype_size; strideB = elems * datatype_size; ARMCI_AllFence(); MP_BARRIER(); for(i = 0; i < nproc; i++) ARMCI_AccS(datatype, scale, a, &strideA, b[(me + i) % nproc], &strideB, &count, 0, (me + i) % nproc); ARMCI_AllFence(); MP_BARRIER(); switch(datatype) { case ARMCI_ACC_INT: for(i = 0; i < elems; i++) { int compare = (i * nproc) + nproc / 2 * (nproc - 1); if(((int *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_INT [%d] = %d != %d\n", i, ((int *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_LNG: for(i = 0; i < elems; i++) { long compare = (i * nproc) + nproc / 2 * (nproc - 1); if(((long *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_LNG [%d] = %d != %ld\n", i, ((int *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_FLT: for(i = 0; i < elems; i++) { float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((float *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_FLT [%d] = %f != %f\n", i, ((float *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_DBL: for(i = 0; i < elems; i++) { double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((double *)b[me])[i] != (double) ((i * nproc) + nproc / 2 * (nproc - 1))) { printf("ERROR accumulate ARMCI_ACC_DBL [%d] = %f != %f \n", i, ((double *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_CPL: for(i = 0; i < elems; i++) { float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((cmpl_t *)b[me])[i].real != compare && ((cmpl_t *)b[me])[i].imag != 3 * compare) { 
printf("ERROR accumulate ARMCI_ACC_CPL [%d] = %f + %fj != %f + %fj\n", i, ((cmpl_t *)b[me])[i].real, ((cmpl_t *)b[me])[i].imag, compare, 3 * compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_DCP: for(i = 0; i < elems; i++) { double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((dcmpl_t *)b[me])[i].real != compare && ((dcmpl_t *)b[me])[i].imag != 3 * compare) { printf("ERROR accumulate ARMCI_ACC_DCP [%d] = %f + %fj != %f + %fj\n", i, ((dcmpl_t *)b[me])[i].real, ((dcmpl_t *)b[me])[i].imag, compare, 3 * compare); ARMCI_Error("test_acc_type failed\n",0); } } break; default: break; } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("O.K.\n\n"); fflush(stdout);} destroy_array((void**)b); free(a); free(scale); }
void test_brdcst(int datatype) { void *a[6]; int len[6] = {1, 10, 100, 1000, 10000, 100000}; int datatype_size = 0; int i, j; switch(datatype) { case ARMCI_INT: datatype_size = sizeof(int); for(i = 0; i < 6; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < 6; i++) if(me == 0) for(j = 0; j < len[i]; j++) ((int *) a[i])[j] = (int) j; else memset(a[i], 0x0, len[i] * datatype_size); break; case ARMCI_LONG: datatype_size = sizeof(long); for(i = 0; i < 6; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < 6; i++) if(me == 0) for(j = 0; j < len[i]; j++) ((long *) a[i])[j] = (long) j; else memset(a[i], 0x0, len[i] * datatype_size); break; case ARMCI_FLOAT: datatype_size = sizeof(float); for(i = 0; i < 6; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < 6; i++) if(me == 0) for(j = 0; j < len[i]; j++) ((float *) a[i])[j] = (float) j; else memset(a[i], 0x0, len[i] * datatype_size); break; case ARMCI_DOUBLE: datatype_size = sizeof(double); for(i = 0; i < 6; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < 6; i++) if(me == 0) for(j = 0; j < len[i]; j++) ((double *) a[i])[j] = (double) j; else memset(a[i], 0x0, len[i] * datatype_size); break; default: break; } for(i = 0; i < 6; i++) armci_msg_brdcst(a[i], len[i] * datatype_size, 0); switch(datatype) { case ARMCI_INT: for(i = 0; i < 6; i++) for(j = 0; j < len[i]; j++) if(((int *) a[i])[j] != (int) j) { printf("ERROR a[%d][%d] = %d != %d\n", i, j, ((int *) a[i])[j], (int) j); ARMCI_Error("armci_brdcst failed (int)\n",0); } break; case ARMCI_LONG: for(i = 0; i < 6; i++) for(j = 0; j < len[i]; j++) if(((long *) a[i])[j] != (long) j) { printf("ERROR a[%d][%d] = %ld != %ld\n", i, j, ((long *) a[i])[j], (long) j); ARMCI_Error("armci_brdcst failed (long)\n",0); } break; case ARMCI_FLOAT: for(i = 0; i < 6; i++) for(j = 0; j < len[i]; j++) if(((float *) a[i])[j] != (float) j) { printf("ERROR a[%d][%d] = %f != %f\n", i, j, ((float *) a[i])[j], (float) j); ARMCI_Error("armci_brdcst failed (float)\n",0); 
} break; case ARMCI_DOUBLE: for(i = 0; i < 6; i++) for(j = 0; j < len[i]; j++) if(((double *) a[i])[j] != (double) j) { printf("ERROR a[%d][%d] = %f != %f\n", i, j, ((double *) a[i])[j], (double) j); ARMCI_Error("armci_brdcst failed (double)\n",0); } break; default: break; } for(i = 0; i < 6; i++) free(a[i]); }
void read_and_create(int argc, char **argv) { int ri,i,nread; int tmp1,idealelementsperproc; void **amatptrs,**xvecptrs; na = atoi(argv[1]); nz = atoi(argv[2]); if(strncmp("random",argv[3],6)){ if(me==0){ fd = fopen(argv[3], "r"); if(fd==NULL)ARMCI_Error("unable to open given file",0); } } else{ if(na==0 || nz==0){ printf("\nERROR:exiting-no input file given and na or nz is 0"); fflush(stdout); ARMCI_Finalize(); MP_FINALIZE(); return; } if(me==0){ generate_random_file(na,nz); fd = fopen("randominput.dat", "r"); } } if(me==0){ if(na==0) nread = fread(&na, sizeof(na), 1, fd); if(nz==0) nread = fread(&nz, sizeof(nz), 1, fd); printf("\nReading CG input\n"); printf("Number of rows: %d\n", na); printf("Number of non-zeros: %d\n", nz); } armci_msg_bcast(&nz,sizeof(int),0); armci_msg_bcast(&na,sizeof(int),0); MP_BARRIER(); amatptrs = (void **)malloc(sizeof(void *)*nproc); xvecptrs = (void **)malloc(sizeof(void *)*nproc); if(xvecptrs==NULL || amatptrs==NULL) ARMCI_Error("xvecptrs amatptrs malloc failed",sizeof(void *)*nproc); if(ARMCI_Malloc(amatptrs,((me==0)?(sizeof(double)*nz):0))) ARMCI_Error("amat malloc failed",sizeof(double)*nz); amat = (double *)amatptrs[0]; if(ARMCI_Malloc(amatptrs,((me==0)?(sizeof(int)*(nz+1)):0))) ARMCI_Error("icol malloc failed",sizeof(int)*(nz+1)); cidx = (int *)amatptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(int)*(na+1)):0)); /*+1 for end of last row*/ ridx = (int *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*(na+1)):0)); xvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*(na+1)):0)); bvec = (double *)xvecptrs[0]; if(me==0){ for (i = 0; i < na + 1; i++) xvec[i] = 0.0; nread = fread(amat, sizeof(double), nz, fd); nread = fread(ridx, sizeof(int), (na+1), fd); ridx[na]=nz; nread = fread(cidx, sizeof(int), (nz+1), fd); nread = fread(bvec, sizeof(double), (na+1), fd); /* the c adjustment */ for (i = 0; i < na; i++) ridx[i] -= 1; for (i = 0; i < nz; i++) cidx[i] -= 1; } MP_BARRIER(); 
/*acg_matvecmul(amat,xvec,bvec,ridx,cidx);*/ if(0){ for(i=0;i<nz+1;i++) printf("\n%d:amat[%d]=%f icol[%d]=%d",me,i,amat[i],i,cidx[i]); for(i=0;i<na+1;i++) printf("\n%d:irow[%d]=%d bvec[%d]=%f",me,i,ridx[i],i,bvec[i]); } allfirstrow = (int *)malloc(sizeof(int)*nproc); alllastrow = (int *)malloc(sizeof(int)*nproc); columnmap = (int *)malloc(sizeof(int)*nproc); if(!allfirstrow || !alllastrow || !columnmap) ARMCI_Error("malloc failed allfirstrow ",0); MP_BARRIER(); /* * next decide who works on which rows, this will decide the * distribution of a,d,r,q,x,and ax */ /*create the mapping for all vectors, row matrix and column matrix*/ if(me==0){ idealelementsperproc = nz/nproc; tmp1=0; for(i=0;i<nproc;i++){ int elementsperproc=0; allfirstrow[i]=tmp1; for(ri=tmp1;ri<na;ri++,tmp1++){ elementsperproc+=(ridx[ri+1]-ridx[ri]); if(elementsperproc>=idealelementsperproc){ if((elementsperproc-idealelementsperproc) > idealelementsperproc-(elementsperproc-(ridx[ri+1]-ridx[ri]))){ alllastrow[i] = ri-1; if((ri-1)<0)ARMCI_Error("run on a smaller processor count",0); /*tmp1--;*/ } else{ alllastrow[i] = ri; if(ri<0)ARMCI_Error("run on a smaller processor count",0); tmp1++; } elementsperproc=0; break; } } } alllastrow[nproc-1]=na-1; for(i=0;i<nproc;i++)columnmap[i]=ridx[allfirstrow[i]]; } armci_msg_bcast(columnmap,nproc*sizeof(int),0); armci_msg_bcast(allfirstrow,nproc*sizeof(int),0); armci_msg_bcast(alllastrow,nproc*sizeof(int),0); myfirstrow = allfirstrow[me]; mylastrow = alllastrow[me]; if(me==0)for(i=0;i<nproc;i++){ printf("\nDISTRIBUTION:first row of process\t%d is %d last row of process\t%d is %d",i,allfirstrow[i],i,alllastrow[i]); } /* for(i=myfirstrow;i<mylastrow;i++){ xvec[i]=0.0; } */ ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); rvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); dvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); svec = (double *)xvecptrs[0]; 
ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); dmvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); qvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); axvec = (double *)xvecptrs[0]; if(me==0)fclose(fd); /*dont forget to free mallocs*/ free(allfirstrow); free(alllastrow); free(columnmap); }
/*
 * DDI-level error entry point: hands the message and the numeric code
 * unchanged to ARMCI_Error().
 */
void DDI_ARMCI_Error(char *message, int code)
{
    ARMCI_Error(message, code);
}
static int sparse_initialize(int *n, int *non_zero, int **row_ind, int **col_ind, double **values, double **vec, double **svec) { int i, j, rc, max, *row_ind_tmp=NULL, *tmp_indices=NULL; double *tmp_values=NULL; unsigned long len; FILE *fp=NULL; /* Broadcast order of matrix */ if(me==0) { if((fp=fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL) ARMCI_Error("Error: Input file not found", me); fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */ fscanf(fp, "%d", n); if(*n%nproc) ARMCI_Error("# of rows is not divisible by # of processors", nproc); if(*n > ROW) ARMCI_Error("order is greater than defined variable ROW", ROW); } len = sizeof(int); armci_msg_brdcst(n, len, 0); /* Broad cast number of non_zeros */ if(me==0) fscanf(fp, "%d", non_zero); armci_msg_brdcst(non_zero, len, 0); /* Broadcast row indices */ len = (*n+1)*sizeof(int); row_ind_tmp = (int *)malloc(len); if(me==0)for(i=0; i<*n+1; i++) { fscanf(fp, "%d", &row_ind_tmp[i]); if(fortran_indexing) --row_ind_tmp[i]; } armci_msg_brdcst(row_ind_tmp, len, 0); load_balance(*n, *non_zero, row_ind_tmp); /* find how much temporary storage is needed at the maximum */ if(me==0) { for(max=-1,j=0; j<nproc; j++) if(max<proc_nz_list[j]) max=proc_nz_list[j]; if(max<0) ARMCI_Error(" max cannot be negative", max); } /* Broadcast the maximum number of elements */ len = sizeof(int); armci_msg_brdcst(&max, len, 0); /* create the Sparse MAtrix Array */ if(me==0) printf(" Creating ValueArray (CompressedSparseMatrix) ...\n\n"); create_array((void**)col_ind, sizeof(int), 1, &max); /* create the column subscript array */ if(me==0) printf(" Creating Column Subscript Array ... \n\n"); create_array((void**)values, sizeof(double), 1, &max); /* create the x-vector and the solution vector */ if(me==0) printf(" Creating Vectors ... 
\n\n"); create_array((void**)vec, sizeof(double),1, &max); create_array((void**)svec, sizeof(double),1, &max); armci_msg_barrier(); /* Process 0 distributes the column indices and non_zero values to respective processors*/ if(me == 0) { tmp_indices = (int *)malloc(max*sizeof(int)); tmp_values = (double *)malloc(max*sizeof(double)); for(j=0; j<nproc; j++) { for(i=0; i<proc_nz_list[j]; i++) { fscanf(fp, "%d", &tmp_indices[i]); if(fortran_indexing) --tmp_indices[i]; } /* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */ if((rc=ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j))) ARMCI_Error("armci_nbput failed\n",rc); } for(j=0; j<nproc; j++) { for(i=0; i<proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]); if((rc=ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j))) ARMCI_Error("armci_nbput failed\n",rc); } } ARMCI_AllFence(); armci_msg_barrier(); ARMCI_AllFence(); /* initializing x-vector */ if(me==0) for(i=0; i<proc_nz_list[me]; i++) vec[me][i] = (i+1); else for(i=0; i<proc_nz_list[me]; i++) vec[me][i]=me*proc_nz_list[me-1]+(i+1); #if 0 if(me==0) { printf("max = %d\n", max); for(i=0; i<max; i++) printf("%.1f ", values[me][i]); printf("\n"); } #endif *row_ind = row_ind_tmp; if(me==0) { free(tmp_indices); free(tmp_values); fclose(fp); } return 0; }
int main(int argc, char **argv) { int me,nproc; int status; int rank; /* initialization */ MPI_Init(&argc, &argv); ARMCI_Init(); #ifdef HPC_PROFILING HPM_Init(); #endif MPI_Comm_rank(MPI_COMM_WORLD,&me); MPI_Comm_size(MPI_COMM_WORLD,&nproc); #ifdef DEBUG if(me == 0){ printf("The result of MPI_Comm_size is %d\n",nproc); fflush(stdout); } #endif /* get the matrix parameters */ if (argc > 1){ rank = atoi(argv[1]); } else { rank = 8; } if (me == 0){ printf("Running matmul.x with rank = %d\n",rank); fflush(stdout); } /* register remote pointers */ double** addr_A = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_A == NULL) ARMCI_Error("malloc A failed at line",0); double** addr_B = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_B == NULL) ARMCI_Error("malloc B failed at line",0); double** addr_C = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_C == NULL) ARMCI_Error("malloc C failed at line",0); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc A requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_A, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc A failed",status); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc B requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_B, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc B failed",status); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc C requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_C, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc C failed",status); MPI_Barrier(MPI_COMM_WORLD); /* free ARMCI pointers */ ARMCI_Free_local(addr_C); ARMCI_Free_local(addr_B); ARMCI_Free_local(addr_A); #ifdef HPC_PROFILING HPM_Print(); #endif /* the end */ ARMCI_Finalize(); MPI_Finalize(); return(0); }
int main(int argc, char *argv[]) { int i, j, rank, nranks; int xdim, ydim; long bufsize; double **buffer; double t_start=0.0, t_stop=0.0; int count[2], src_stride, trg_stride, stride_level, peer; double expected, actual; int provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); if (nranks < 2) { printf("%s: Must be run with at least 2 processes\n", argv[0]); MPI_Abort(MPI_COMM_WORLD, 1); } ARMCI_Init_args(&argc, &argv); bufsize = MAX_XDIM * MAX_YDIM * sizeof(double); buffer = (double **) malloc(sizeof(double *) * nranks); ARMCI_Malloc((void **) buffer, bufsize); for(i=0; i< bufsize/sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } if(rank == 0) { printf("ARMCI_PutS Latency - local and remote completions - in usec \n"); printf("%30s %22s %22s\n", "Dimensions(array of doubles)", "Latency-LocalCompeltion", "Latency-RemoteCompletion"); fflush(stdout); } src_stride = MAX_YDIM*sizeof(double); trg_stride = MAX_YDIM*sizeof(double); stride_level = 1; ARMCI_Barrier(); for(xdim=1; xdim<=MAX_XDIM; xdim*=2) { count[1] = xdim; for(ydim=1; ydim<=MAX_YDIM; ydim*=2) { count[0] = ydim*sizeof(double); if(rank == 0) { peer = 1; for(i=0; i<ITERATIONS+SKIP; i++) { if(i == SKIP) t_start = MPI_Wtime(); ARMCI_PutS((void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); } t_stop = MPI_Wtime(); ARMCI_Fence(peer); char temp[10]; sprintf(temp,"%dX%d", xdim, ydim); printf("%30s %20.2f", temp, ((t_stop-t_start)*1000000)/ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); for(i=0; i<ITERATIONS+SKIP; i++) { if(i == SKIP) t_start = MPI_Wtime(); ARMCI_PutS((void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); ARMCI_Fence(peer); } t_stop = MPI_Wtime(); printf("%20.2f \n", ((t_stop-t_start)*1000000)/ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); } else { peer = 0; expected = (1.0 + 
(double) peer); ARMCI_Barrier(); if (rank == 1) { for(i=0; i<xdim; i++) { for(j=0; j<ydim; j++) { actual = *(buffer[rank] + i*MAX_YDIM + j); if(actual != expected) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, expected, actual); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } } for(i=0; i< bufsize/sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_Barrier(); ARMCI_Barrier(); if (rank == 1) { for(i=0; i<xdim; i++) { for(j=0; j<ydim; j++) { actual = *(buffer[rank] + i*MAX_YDIM + j); if(actual != expected) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, expected, actual); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } for(i=0; i< bufsize/sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } } ARMCI_Barrier(); } } } ARMCI_Barrier(); ARMCI_Free((void *) buffer[rank]); free(buffer); ARMCI_Finalize(); MPI_Finalize(); return 0; }
void test_gop2_or_reduce(const int datatype, char * op, const int reduce_test) { void *a[6]; int len[6] = {1, 10, 100, 1000, 10000, 100000}; int len_length = 3; int datatype_size = 0; int i, j; char * test_type; int verbose = 0; if(reduce_test == 0) test_type = "gop2"; else test_type = "reduce"; switch(datatype) { case ARMCI_INT: datatype_size = sizeof(int); for(i = 0; i < len_length; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < len_length; i++) for(j = 0; j < len[i]; j++) ((int *) a[i])[j] = (int) (me + j) * (((me + j) % 2 == 0) ? 1 : -1); for(i = 0; i < len_length; i++) { if(me == 0 && verbose != 0) printf("testing %s %s message size = %d op = %s\n", test_type, "ARMCI_INT", len[i], op); if(reduce_test == 0) armci_msg_igop(a[i], len[i], op); else armci_msg_reduce(a[i], len[i], op, datatype); } if(me == 0 || reduce_test == 0) for(i = 0; i < len_length; i++) { if(me == 0 && verbose != 0) printf("checking %s %s message size = %d op = %s\n", test_type, "ARMCI_INT", len[i], op); for(j = 0; j < len[i]; j++) if(strncmp(op, "+", 1) == 0) { int compare = 0; if(nproc % 2 == 0) { if(j % 2 == 0) compare = -nproc / 2; else compare = nproc / 2; } else { if(j % 2 == 0) compare = j + nproc / 2; else compare = -(j + nproc / 2); } if(((int *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %d != %d\n", test_type, "ARMCI_INT", op, i, j, ((int *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "*", 1) == 0) { int compare = 1; int k = 0; for(k = 0; k < nproc; k++) compare *= (k + j) * (((k + j) % 2 == 0) ? 
1 : -1); if(((int *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %d != %d\n", test_type, "ARMCI_INT", op, i, j, ((int *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "min", 3) == 0) { int compare = -(j + nproc - 1); if(compare % 2 == 0 && nproc > 1) compare = -(j + nproc - 2); if(((int *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %d != %d\n", test_type, "ARMCI_INT", op, i, j, ((int *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "max", 3) == 0) { int compare = j + nproc - 1; if(compare % 2 != 0 && nproc > 1) compare = j + nproc - 2; if(((int *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %d != %d\n", test_type, "ARMCI_INT", op, i, j, ((int *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "absmax", 6) == 0) { int compare = j + nproc - 1; if(((int *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %d != %d\n", test_type, "ARMCI_INT", op, i, j, ((int *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "absmin", 6) == 0) { int compare = j; if(((int *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %d != %d\n", test_type, "ARMCI_INT", op, i, j, ((int *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "or", 2) == 0) { } } break; case ARMCI_LONG: datatype_size = sizeof(long); for(i = 0; i < len_length; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < len_length; i++) for(j = 0; j < len[i]; j++) ((long *) a[i])[j] = (long) (me + j) * (((me + j) % 2 == 0) ? 
1 : -1); for(i = 0; i < len_length; i++) { if(me == 0 && verbose != 0) printf("testing %s %s message size = %d op = %s\n", test_type, "ARMCI_LONG", len[i], op); if(reduce_test == 0) armci_msg_lgop(a[i], len[i], op); else armci_msg_reduce(a[i], len[i], op, datatype); } if(me == 0 || reduce_test == 0) for(i = 0; i < len_length; i++) { if(me == 0 && verbose != 0) printf("checking %s %s message size = %d op = %s\n", test_type, "ARMCI_LONG", len[i], op); for(j = 0; j < len[i]; j++) if(strncmp(op, "+", 1) == 0) { int compare = 0; if(nproc % 2 == 0) { if(j % 2 == 0) compare = -nproc / 2; else compare = nproc / 2; } else { if(j % 2 == 0) compare = j + nproc / 2; else compare = -(j + nproc / 2); } if(((long *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %ld != %d\n", test_type, "ARMCI_LONG", op, i, j, ((long *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "*", 1) == 0) { int compare = 1; int k = 0; for(k = 0; k < nproc; k++) compare *= (k + j) * (((k + j) % 2 == 0) ? 
1 : -1); if(((long *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %ld != %d\n", test_type, "ARMCI_LONG", op, i, j, ((long *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "min", 3) == 0) { int compare = -(j + nproc - 1); if(compare % 2 == 0 && nproc > 1) compare = -(j + nproc - 2); if(((long *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %ld != %d\n", test_type, "ARMCI_LONG", op, i, j, ((long *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "max", 3) == 0) { int compare = j + nproc - 1; if(compare % 2 != 0 && nproc > 1) compare = j + nproc - 2; if(((long *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %ld != %d\n", test_type, "ARMCI_LONG", op, i, j, ((long *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "absmax", 6) == 0) { int compare = j + nproc - 1; if(((long *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %ld != %d\n", test_type, "ARMCI_LONG", op, i, j, ((long *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "absmin", 6) == 0) { int compare = j; if(((long *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %ld != %d\n", test_type, "ARMCI_LONG", op, i, j, ((long *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "or", 2) == 0) { } } break; case ARMCI_FLOAT: datatype_size = sizeof(float); for(i = 0; i < len_length; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < len_length; i++) for(j = 0; j < len[i]; j++) ((float *) a[i])[j] = (float) (me + j) * (((me + j) % 2 == 0) ? 
1.0 / nproc : -1.0 / nproc); for(i = 0; i < len_length; i++) { if(me == 0 && verbose != 0) printf("testing %s ARMCI_FLOAT message size = %d op = %s\n", test_type, len[i], op); if(reduce_test == 0) armci_msg_fgop(a[i], len[i], op); else armci_msg_reduce(a[i], len[i], op, datatype); } if(me == 0 || reduce_test == 0) for(i = 0; i < len_length; i++) { if(me == 0 && verbose != 0) printf("checking %s ARMCI_FLOAT message size = %d op = %s\n", test_type, len[i], op); for(j = 0; j < len[i]; j++) if(strncmp(op, "+", 1) == 0) { float compare = 0.0; if(nproc % 2 == 0) { if(j % 2 == 0) compare = -(((int)nproc / 2) / (float) nproc); else compare = ((int)nproc / 2) / (float) nproc; } else { if(j % 2 == 0) compare = ((int) j + nproc / 2) / (float) nproc; else compare = -(((int) j + nproc / 2) / (float) nproc); } if(ARMCI_ABS(((float *) a[i])[j] - compare) > ARMCI_ABS(compare) * FLOAT_EPS) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_FLOAT", op, i, j, ((float *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "*", 1) == 0) { float compare = 1.0; int k = 0; for(k = 0; k < nproc; k++) compare *= ((float) k + j) / (float) nproc; if((nproc / 2) % 2 != 0) if(nproc % 2 != 0) if(j % 2 == 0) compare *= -1.0; if(ARMCI_ABS(((float *) a[i])[j] - compare) > ARMCI_ABS(compare) * FLOAT_EPS) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_FLOAT", op, i, j, ((float *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "min", 3) == 0) { float compare = -((float) j + nproc - 1) / nproc; if((j + nproc - 1)% 2 == 0 && nproc > 1) compare = -((float) j + nproc - 2) / nproc; if(((float *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_FLOAT", op, i, j, ((float *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "max", 3) == 0) { float compare = ((float) j + nproc - 1) / nproc; if((j + nproc - 1) % 
2 != 0 && nproc > 1) compare = ((float) j + nproc - 2) / nproc; if(((float *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_FLOAT", op, i, j, ((float *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "absmax", 6) == 0) { float compare = ((float) j + nproc - 1) / nproc; if(((float *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_FLOAT", op, i, j, ((float *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "absmin", 6) == 0) { float compare = (float) j / nproc; if(((float *) a[i])[j] != compare) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_FLOAT", op, i, j, ((float *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } } break; case ARMCI_DOUBLE: datatype_size = sizeof(double); for(i = 0; i < len_length; i++) a[i] = malloc(len[i] * datatype_size); for(i = 0; i < len_length; i++) for(j = 0; j < len[i]; j++) ((double *) a[i])[j] = (double) (me + j) * (((me + j) % 2 == 0) ? 
1.0 / nproc : -1.0 / nproc); for(i = 0; i < len_length; i++) { if(me == 0 && verbose != 0) printf("testing %s ARMCI_DOUBLE message size = %d op = %s\n", test_type, len[i], op); if(reduce_test == 0) armci_msg_dgop(a[i], len[i], op); else armci_msg_reduce(a[i], len[i], op, datatype); } if(me == 0 || reduce_test == 0) for(i = 0; i < len_length; i++) { if(me == 0 && verbose != 0) printf("checking %s ARMCI_DOUBLE message size = %d op = %s\n", test_type, len[i], op); for(j = 0; j < len[i]; j++) if(strncmp(op, "+", 1) == 0) { double compare = 0.0; if(nproc % 2 == 0) { if(j % 2 == 0) compare = -(((int)nproc / 2) / (double) nproc); else compare = ((int)nproc / 2) / (double) nproc; } else { if(j % 2 == 0) compare = ((int) j + nproc / 2) / (double) nproc; else compare = -(((int) j + nproc / 2) / (double) nproc); } if(ARMCI_ABS(((double *) a[i])[j] - compare) > ARMCI_ABS(compare) * DOUBLE_EPS) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_DOUBLE", op, i, j, ((double *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "*", 1) == 0) { double compare = 1.0; int k = 0; for(k = 0; k < nproc; k++) compare *= ((float) k + j) / (float) nproc; if((nproc / 2) % 2 != 0) if(nproc % 2 != 0) if(j % 2 == 0) compare *= -1.0; if(ARMCI_ABS(((double *) a[i])[j] - compare) > ARMCI_ABS(compare) * DOUBLE_EPS) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_DOUBLE", op, i, j, ((double *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "min", 3) == 0) { double compare = -((double) j + nproc - 1) / nproc; if((j + nproc - 1)% 2 == 0 && nproc > 1) compare = -((double) j + nproc - 2) / nproc; if(ARMCI_ABS(((double *) a[i])[j] - compare) > ARMCI_ABS(compare) * DOUBLE_EPS) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_DOUBLE", op, i, j, ((double *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "max", 3) == 0) { 
double compare = ((double) j + nproc - 1) / nproc; if((j + nproc - 1) % 2 != 0 && nproc > 1) compare = ((double) j + nproc - 2) / nproc; if(ARMCI_ABS(((double *) a[i])[j] - compare) > ARMCI_ABS(compare) * DOUBLE_EPS) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_DOUBLE", op, i, j, ((double *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "absmax", 6) == 0) { double compare = ((double) j + nproc - 1) / nproc; if(ARMCI_ABS(((double *) a[i])[j] - compare) > ARMCI_ABS(compare) * DOUBLE_EPS) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_DOUBLE", op, i, j, ((double *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } else if(strncmp(op, "absmin", 6) == 0) { double compare = (double) j / nproc; if(ARMCI_ABS(((double *) a[i])[j] - compare) > ARMCI_ABS(compare) * DOUBLE_EPS) { printf("ERROR %s %s %s a[%d][%d] = %f != %f\n", test_type, "ARMCI_DOUBLE", op, i, j, ((double *) a[i])[j], compare); ARMCI_Error("test_gop2_or_reduce failed\n",0); } } } break; default: break; } for(i = 0; i < len_length; i++) free(a[i]); }
int main(int argc, char **argv) { int i, j, rank, nranks, peer; size_t xdim, ydim; unsigned long bufsize; double **buffer, *src_buf; double t_start=0.0, t_stop; int count[2], src_stride, trg_stride, stride_level; double scaling; int provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); if (nranks < 2) { printf("%s: Must be run with at least 2 processes\n", argv[0]); MPI_Abort(MPI_COMM_WORLD, 1); } ARMCI_Init_args(&argc, &argv); buffer = (double **) malloc(sizeof(double *) * nranks); bufsize = MAX_XDIM * MAX_YDIM * sizeof(double); ARMCI_Malloc((void **) buffer, bufsize); src_buf = ARMCI_Malloc_local(bufsize); if (rank == 0) { printf("ARMCI_AccS Latency - local and remote completions - in usec \n"); printf("%30s %22s %22s\n", "Dimensions(array of double)", "Local Completion", "Remote completion"); fflush(stdout); } ARMCI_Access_begin(buffer[rank]); for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; *(src_buf + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); scaling = 2.0; src_stride = MAX_YDIM * sizeof(double); trg_stride = MAX_YDIM * sizeof(double); stride_level = 1; ARMCI_Barrier(); for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) { count[1] = xdim; for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) { count[0] = ydim * sizeof(double); if (rank == 0) { peer = 1; for (i = 0; i < ITERATIONS + SKIP; i++) { if (i == SKIP) t_start = MPI_Wtime(); ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling, /* (void *) buffer[rank] */ src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, 1); } t_stop = MPI_Wtime(); ARMCI_Fence(1); char temp[10]; sprintf(temp, "%dX%d", (int) xdim, (int) ydim); printf("%30s %20.2f ", temp, ((t_stop - t_start) * 1000000) / ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); for (i = 0; i < ITERATIONS + SKIP; i++) { if (i == SKIP) t_start = MPI_Wtime(); ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling, /* (void 
*) buffer[rank] */ src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, 1); ARMCI_Fence(1); } t_stop = MPI_Wtime(); printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS); fflush(stdout); ARMCI_Barrier(); ARMCI_Barrier(); } else { peer = 0; ARMCI_Barrier(); if (rank == 1) { ARMCI_Access_begin(buffer[rank]); for (i = 0; i < xdim; i++) { for (j = 0; j < ydim; j++) { if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP))) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, ((1.0 + rank) + scaling * (1.0 + peer)), *(buffer[rank] + i * MAX_YDIM + j)); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); } ARMCI_Barrier(); ARMCI_Barrier(); if (rank == 1) { ARMCI_Access_begin(buffer[rank]); for (i = 0; i < xdim; i++) { for (j = 0; j < ydim; j++) { if (*(buffer[rank] + i * MAX_XDIM + j) != ((1.0 + rank) + scaling * (1.0 + peer) * (ITERATIONS + SKIP))) { printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n", i, j, ((1.0 + rank) + scaling * (1.0 + peer)), *(buffer[rank] + i * MAX_YDIM + j)); fflush(stdout); ARMCI_Error("Bailing out", 1); } } } for (i = 0; i < bufsize / sizeof(double); i++) { *(buffer[rank] + i) = 1.0 + rank; } ARMCI_Access_end(buffer[rank]); } ARMCI_Barrier(); } } } ARMCI_Barrier(); ARMCI_Free((void *) buffer[rank]); ARMCI_Free_local(src_buf); free(buffer); ARMCI_Finalize(); MPI_Finalize(); return 0; }
int main(int argc, char* argv[]) { int i; struct timeval start_time[14]; struct timeval stop_time[14]; /* char * test_name[14] = { "dim", "nbdim", "vec_small", "acc", "vector", "vector_acc", "fetch_add", "swap", "rput", "aggregate", "implicit", "memlock", "acc_type", "collective" }; int test_flags[14] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }; */ char * test_name[2] = { "acc_type", "collective" }; int test_flags[2] = { 1, 1 }; #define TEST_ACC_TYPE 0 #define TEST_COLLECTIVE 1 MP_INIT(argc, argv); ARMCI_Init(); MP_PROCS(&nproc); MP_MYID(&me); if(nproc > MAXPROC && me == 0) ARMCI_Error("Test works for up to %d processors\n",MAXPROC); if(me == 0) { printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } gettimeofday(&start_time[TEST_ACC_TYPE],NULL); if(test_flags[TEST_ACC_TYPE] == 1) { if(me == 0) { printf("\nTesting Accumulate Types\n"); fflush(stdout); } MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_INT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_INT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_LNG\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_LNG); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_FLT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_FLT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DBL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DBL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_CPL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_CPL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DCP\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DCP); ARMCI_AllFence(); MP_BARRIER(); } gettimeofday(&stop_time[TEST_ACC_TYPE],NULL); gettimeofday(&start_time[TEST_COLLECTIVE],NULL); if(test_flags[TEST_COLLECTIVE] == 1) { if(me == 0) { printf("\nTesting Collective Types\n"); fflush(stdout); } if(me == 0) { printf("Test Collective ARMCI_INT\n"); 
fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_INT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_LONG\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_LONG); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_FLOAT\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_FLOAT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_DOUBLE\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_DOUBLE); MP_BARRIER(); } gettimeofday(&stop_time[TEST_COLLECTIVE],NULL); if(me == 0) { printf("Accumulate and Collective tests passed\n"); fflush(stdout); } if(me == 0) { printf("Testcase runtime\n"); printf("Name,Time(seconds)\n"); for(i = 0; i < 2; i++) if(test_flags[i] == 1) { double time_spent = (stop_time[i].tv_sec - start_time[i].tv_sec) + ((double) stop_time[i].tv_usec - start_time[i].tv_usec) / 1E6; printf("%s,%.6f\n", test_name[i], time_spent); } } MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
static void contig_test(size_t buffer_size, int op) { void **dst_ptr; void **put_buf; void **get_buf; double *times; dst_ptr = (void*)malloc(nproc * sizeof(void*)); put_buf = (void*)malloc(nproc * sizeof(void*)); get_buf = (void*)malloc(nproc * sizeof(void*)); times = (double*)malloc(nproc * sizeof(double)); ARMCI_Malloc(dst_ptr, buffer_size); ARMCI_Malloc(put_buf, buffer_size); ARMCI_Malloc(get_buf, buffer_size); /* initialize what we're putting */ fill_array((double*)put_buf[me], buffer_size/sizeof(double), me); size_t msg_size; int dst = 1; double scale = 1.0; for (msg_size = 16; msg_size <= buffer_size; msg_size *= 2) { int j; int iter = msg_size > MEDIUM_MESSAGE_SIZE ? ITER_LARGE : ITER_SMALL; double t_start, t_end; if (0 == me) { for (j= 0; j < iter + WARMUP; ++j) { if (WARMUP == j) { t_start = dclock(); } switch (op) { case PUT: ARMCI_Put(put_buf[me], dst_ptr[dst], msg_size, dst); break; case GET: ARMCI_Get(dst_ptr[dst], get_buf[me], msg_size, dst); break; case ACC: ARMCI_Acc(ARMCI_ACC_DBL, &scale, put_buf[me], dst_ptr[dst], msg_size, dst); break; default: ARMCI_Error("oops", 1); } } } /* calculate total time and average time */ t_end = dclock(); ARMCI_Barrier(); if (0 == me) { printf("%8zu\t\t%6.2f\t\t%10.2f\n", msg_size, ((t_end - t_start))/iter, msg_size*iter/((t_end - t_start))); } } ARMCI_Free(dst_ptr[me]); ARMCI_Free(put_buf[me]); ARMCI_Free(get_buf[me]); free(dst_ptr); free(put_buf); free(get_buf); free(times); }