void test_collective(const int datatype) { char * op[7] = {"+", "*", "min", "max", "absmax", "absmin", "or"}; int i = 0; int num_tests = 7; if(datatype == ARMCI_DOUBLE || datatype == ARMCI_FLOAT) num_tests = 6; /* test armci_msg_brdcst */ test_brdcst(datatype); /* test armci_msg_gop2 */ for(i = 0; i < num_tests; i++) test_gop2_or_reduce(datatype, op[i], 0); /* test armci_msg_reduce */ for(i = 0; i < num_tests; i++) test_gop2_or_reduce(datatype, op[i], 1); MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("O.K.\n\n"); fflush(stdout);} }
/* Collective group test: creates two process groups (A = {0,1,2},
 * B = {1,3}) and runs the single-group test on each group from its
 * member processes only. */
void test_groups() {
    int members_a[MAXPROC] = {0, 1, 2};
    int members_b[MAXPROC] = {1, 3};
    ARMCI_Group group_a;
    ARMCI_Group group_b;

    MP_BARRIER();

    /* create group 1 and group 2 */
    ARMCI_Group_create(GNUM_A, members_a, &group_a);
    ARMCI_Group_create(GNUM_B, members_b, &group_b);

    /* ------------------------ GROUP A ------------------------- */
    if (chk_grp_membership(me, &group_a, members_a)) {
        test_one_group(&group_a, members_a);
    }

    MP_BARRIER();

    /* ------------------------ GROUP B ------------------------- */
    if (chk_grp_membership(me, &group_b, members_b)) {
        test_one_group(&group_b, members_b);
    }

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("O.K.\n");
        fflush(stdout);
    }
}
/* Driver for the ARMCI group tests: initialize the message-passing layer
 * and ARMCI, run the collective (and optionally non-collective) group
 * tests, then shut everything down. */
int main(int argc, char *argv[]) {
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* printf("nproc = %d, me = %d\n", nproc, me);*/

    /* Rank 0 aborts the run when the process count is out of range. */
    if ((nproc < MINPROC || nproc > MAXPROC) && me == 0) {
        ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Testing ARMCI Groups!\n\n");
        fflush(stdout);
    }

    test_groups();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);

#ifdef ARMCI_GROUP
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Non-collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);
#endif

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/* Barrier over a DDI communicator.  The world communicator maps directly
 * to the native ARMCI barrier; any other communicator is fenced first and
 * then synchronized via its own MPI sub-communicator. */
void DDI_ARMCI_Barrier(const DDI_Comm *comm) {
    const DDI_Comm *world = (const DDI_Comm *)Comm_find(DDI_COMM_WORLD);

    if (comm == world) {
        ARMCI_Barrier();
        return;
    }

    /* complete outstanding one-sided operations before the MPI barrier */
    ARMCI_AllFence();
    MPI_Barrier(comm->compute_comm);
}
/* Non-collective group test: randomly permutes the world ranks, partitions
 * the permutation into `ngrps` disjoint chunks of GROUP_SIZE (the last chunk
 * absorbs the remainder), and each process creates/tests only its own group.
 * NOTE(review): if nprocs < GROUP_SIZE then ngrps is 0 and grp_id becomes
 * -1, indexing pid_lists out of bounds — presumably the test is only run
 * with nprocs >= GROUP_SIZE; confirm against the driver. */
void test_groups_noncollective() {
  int *pid_lists[MAX_GROUPS];
  int pids[MAXPROC];
  int i, nprocs, world_me;
  ARMCI_Group group;
  int *my_pid_list=NULL, my_grp_size=0;
  int ngrps;
  MP_BARRIER();
  MP_PROCS(&nprocs);
  MP_MYID(&world_me);
  /* NOTE(review): permutes using the global `nproc`, while the rest of the
   * function uses the local `nprocs` — presumably always equal; verify. */
  random_permute(pids, nproc);
  ngrps = nprocs/GROUP_SIZE;
  /* Each group's member list is a slice of the permuted rank array. */
  for(i=0; i<nprocs/GROUP_SIZE; i++) {
    pid_lists[i] = pids + (i*GROUP_SIZE);
  }
  /* Find my own position in the permutation to determine my group id and
   * size; the last group also takes the nprocs % GROUP_SIZE leftovers. */
  for(i=0; i<nprocs; i++) {
    if(pids[i] == world_me) {
      int grp_id = ARMCI_MIN(i/GROUP_SIZE, ngrps-1);
      my_pid_list = pid_lists[grp_id];
      if(grp_id == ngrps-1) my_grp_size = GROUP_SIZE + (nprocs%GROUP_SIZE);
      else my_grp_size = GROUP_SIZE;
    }
  }
  /* ARMCI group member lists must be sorted. */
  qsort(my_pid_list, my_grp_size, sizeof(int), int_compare);
  MP_BARRIER();
  /*now create all these disjoint groups and test them in parallel*/
  ARMCI_Group_create(my_grp_size, my_pid_list, &group);
  test_one_group(&group, my_pid_list);
  ARMCI_Group_free(&group);
  ARMCI_AllFence();
  MP_BARRIER();
  if(world_me==0){printf("O.K.\n"); fflush(stdout);}
}
/* Driver for the non-blocking put/get/accumulate performance test.
 * Requires at least 2 processes; runs one dry (warm-up) pass followed by
 * the measured pass. */
int main(int argc, char *argv[]) {
    ARMCI_NetInit();
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* Reject unusable process counts up front. */
    if (nproc < 2 || nproc > MAXPROC) {
        if (me == 0) {
            fprintf(stderr,
                    "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
        }
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n put/get/acc requests (Time in secs)\n\n");
        fflush(stdout);
    }

    test_perf_nb(1);   /* dry run to warm caches/connections */
    test_perf_nb(0);   /* measured run */

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/* Driver for the sparse matrix-vector multiplication test, using the
 * armci_msg_* message-passing interface directly. */
int main(int argc, char *argv[]) {
    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    /* printf("nproc = %d, me = %d\n", nproc, me);*/

    /* Rank 0 aborts the run when too many processes are used. */
    if (nproc > MAXPROC && me == 0) {
        ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }

    test_sparse();

    ARMCI_AllFence();
    armci_msg_barrier();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
/* Driver for the aggregate put/get test (ARMCI_Init_args variant): one
 * cold-start pass followed by a measured warm-start pass. */
int main(int argc, char *argv[]) {
    ARMCI_Init_args(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    /* printf("nproc = %d, me = %d\n", nproc, me);*/

    /* Rank 0 aborts the run when too many processes are used. */
    if (nproc > MAXPROC && me == 0) {
        ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    if (me == 0) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }

    test_aggregate(1);   /* cold start */
    test_aggregate(0);   /* warm start */

    ARMCI_AllFence();
    ARMCI_Barrier();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    ARMCI_Barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
/* Driver for the aggregate put/get test (MP_* macro variant): one
 * cold-start pass followed by a measured warm-start pass. */
int main(int argc, char *argv[]) {
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* printf("nproc = %d, me = %d\n", nproc, me);*/

    /* Rank 0 aborts the run when too many processes are used. */
    if (nproc > MAXPROC && me == 0) {
        ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }

    test_aggregate(1);   /* cold start */
    test_aggregate(0);   /* warm start */

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/* Distributed 1-D array inversion (a "transpose" of a vector): array A is
 * distributed in contiguous chunks over all processes (deliberately unequal
 * distribution), each process locally reverses its chunk, then uses
 * ARMCI_Put to scatter the reversed data into the mirrored positions of
 * distributed array B.  VERIFY() checks the result. */
void TRANSPOSE1D() {
  int dims[1];
  int nelem, i, ierr, min, max, cmin, cmax, lmin, lmax, pmin, pmax;
  int src_offset, dst_offset, length;
  int *buf, *map;
  void *src_ptr, *dst_ptr;
  void **a_ptr, **b_ptr;
  int *a, *b;
  /* Find local processor ID and number of processors */
  int me, nprocs;
  me = armci_msg_me();
  nprocs = armci_msg_nproc();
  /* Allocate pointers to data on all processors */
  a_ptr = (void**)malloc(nprocs*sizeof(int*));
  b_ptr = (void**)malloc(nprocs*sizeof(int*));
  map = (int*)malloc(nprocs*sizeof(int));
  /* Configure array dimensions.  Force an unequal data distribution by
   * making the total size not a multiple of nprocs. */
  dims[0] = nprocs*TOTALELEMS + nprocs/2;
  if (me == 0) printf("Size of array: %d\n\n",dims[0]);
  /* map[i] = first (zero-based) global index of the chunk owned by proc i */
  for (i=0; i<nprocs; i++) {
    map[i] = (int)(((double)i)*(((double)dims[0])/((double)nprocs)));
  }
  /* Size of my own chunk: distance to the next chunk start (or array end) */
  if (me<nprocs-1) {
    nelem = map[me+1]-map[me];
  }
  else {
    nelem = dims[0]-map[me];
  }
  /* Allocate memory for array A */
  ierr = ARMCI_Malloc(a_ptr, nelem*sizeof(int));
  assert(ierr == 0);
  assert(a_ptr[me]);
  /* Allocate memory for array B */
  ierr = ARMCI_Malloc(b_ptr, nelem*sizeof(int));
  assert(ierr == 0);
  assert(b_ptr[me]);
  /* Initialize data in array A (global value index+1) and zero array B */
  a = (int*)a_ptr[me];
  b = (int*)b_ptr[me];
  for (i=0; i<nelem; i++) {
    a[i] = i + map[me] + 1;
    b[i] = 0;
  }
  /* Synchronize all processors to guarantee that everyone has data
     before proceeding to the next step. */
  armci_msg_barrier();
  /* Create local buffer for performing inversion */
  buf = (int*)malloc(nelem*sizeof(int));
  /* Copy inverted data into local buffer */
  a = (int*)a_ptr[me];
  for (i=0; i<nelem; i++) {
    buf[i] = a[nelem-i-1];
  }
  /* Find out which blocks of array B the inverted block should be copied
     to.  Start by finding min and max global indices of my data in B. */
  min = dims[0] - (map[me] + nelem);
  max = dims[0] - map[me] - 1;
  /* Locate processors containing the endpoints */
  pmin = 0;
  for (i=0; i<nprocs; i++) {
    if (min >= map[i]) {
      pmin = i;
    }
    else {
      break;
    }
  }
  pmax = nprocs-1;
  for (i=nprocs-2; i>=0; i--) {
    if (max < map[i+1]) {
      pmax = i;
    }
    else {
      break;
    }
  }
  /* Loop over processors that will receive data and copy inverted data to
     processors */
  for (i=pmin; i<=pmax; i++) {
    /* Find min and max indices owned by processor i */
    lmin = map[i];
    if (i<nprocs-1) {
      lmax = map[i+1]-1;
    }
    else {
      lmax = dims[0]-1;
    }
    /* Intersect [min,max] with [lmin,lmax]: the range going to proc i */
    if (lmin > min) {
      cmin = lmin;
    }
    else {
      cmin = min;
    }
    if (lmax < max) {
      cmax = lmax;
    }
    else {
      cmax = max;
    }
    /* Find offsets on source and destination processors */
    src_offset = cmin - min;
    src_ptr = (void*)(buf + src_offset);
    dst_offset = cmin - lmin;
    dst_ptr = ((char*)b_ptr[i]) + sizeof(int)*dst_offset;
    /* Find length of data (in bytes) to be sent to processor i */
    length = sizeof(int)*(cmax-cmin+1);
    /* Send data to processor (one-sided; completion enforced below) */
    ARMCI_Put(src_ptr, dst_ptr, length, i);
  }
  /* Make all puts globally visible before verification */
  ARMCI_AllFence();
  armci_msg_barrier();
  free(buf);
  VERIFY(b_ptr, dims, map);
  free(map);
  armci_msg_barrier();
  ARMCI_Free(a_ptr[me]);
  ARMCI_Free(b_ptr[me]);
  free(a_ptr);
  free(b_ptr);
}
int main(int argc, char* argv[]) { int i; struct timeval start_time[14]; struct timeval stop_time[14]; /* char * test_name[14] = { "dim", "nbdim", "vec_small", "acc", "vector", "vector_acc", "fetch_add", "swap", "rput", "aggregate", "implicit", "memlock", "acc_type", "collective" }; int test_flags[14] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }; */ char * test_name[2] = { "acc_type", "collective" }; int test_flags[2] = { 1, 1 }; #define TEST_ACC_TYPE 0 #define TEST_COLLECTIVE 1 MP_INIT(argc, argv); ARMCI_Init(); MP_PROCS(&nproc); MP_MYID(&me); if(nproc > MAXPROC && me == 0) ARMCI_Error("Test works for up to %d processors\n",MAXPROC); if(me == 0) { printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } gettimeofday(&start_time[TEST_ACC_TYPE],NULL); if(test_flags[TEST_ACC_TYPE] == 1) { if(me == 0) { printf("\nTesting Accumulate Types\n"); fflush(stdout); } MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_INT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_INT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_LNG\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_LNG); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_FLT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_FLT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DBL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DBL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_CPL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_CPL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DCP\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DCP); ARMCI_AllFence(); MP_BARRIER(); } gettimeofday(&stop_time[TEST_ACC_TYPE],NULL); gettimeofday(&start_time[TEST_COLLECTIVE],NULL); if(test_flags[TEST_COLLECTIVE] == 1) { if(me == 0) { printf("\nTesting Collective Types\n"); fflush(stdout); } if(me == 0) { printf("Test Collective ARMCI_INT\n"); 
fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_INT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_LONG\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_LONG); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_FLOAT\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_FLOAT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_DOUBLE\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_DOUBLE); MP_BARRIER(); } gettimeofday(&stop_time[TEST_COLLECTIVE],NULL); if(me == 0) { printf("Accumulate and Collective tests passed\n"); fflush(stdout); } if(me == 0) { printf("Testcase runtime\n"); printf("Name,Time(seconds)\n"); for(i = 0; i < 2; i++) if(test_flags[i] == 1) { double time_spent = (stop_time[i].tv_sec - start_time[i].tv_sec) + ((double) stop_time[i].tv_usec - start_time[i].tv_usec) / 1E6; printf("%s,%.6f\n", test_name[i], time_spent); } } MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
void test_acc_type(const int datatype) { int i = 0; int datatype_size = 0; void * scale; void * a; void *b[MAXPROC]; int elems = ELEMS; int dim = 1; int count = 0; int strideA = 0; int strideB = 0; switch(datatype) { case ARMCI_ACC_INT: datatype_size = sizeof(int); scale = malloc(datatype_size); *((int *) scale) = 1; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((int *) a)[i] = i + me; ((int *) b[me])[i] = 0; } break; case ARMCI_ACC_LNG: datatype_size = sizeof(long); scale = malloc(datatype_size); *((long *) scale) = 1; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((long *) a)[i] = i + me; ((long *) b[me])[i] = 0; } break; case ARMCI_ACC_FLT: datatype_size = sizeof(float); scale = malloc(datatype_size); *((float *) scale) = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((float *) a)[i] = (float) i + me; ((float *) b[me])[i] = 0.0; } break; case ARMCI_ACC_DBL: datatype_size = sizeof(double); scale = malloc(datatype_size); *((double *) scale) = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((double *) a)[i] = (double) i + me; ((double *) b[me])[i] = 0.0; } break; case ARMCI_ACC_CPL: datatype_size = sizeof(cmpl_t); scale = malloc(datatype_size); ((cmpl_t *) scale)->real = 2.0; ((cmpl_t *) scale)->imag = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((cmpl_t *) a)[i].real = ((float) i + me); ((cmpl_t *) a)[i].imag = ((float) i + me); ((cmpl_t *) b[me])[i].real = 0.0; ((cmpl_t *) b[me])[i].imag = 0.0; } break; case ARMCI_ACC_DCP: datatype_size = sizeof(dcmpl_t); scale = malloc(datatype_size); ((dcmpl_t *) scale)->real = 2.0; ((dcmpl_t *) scale)->imag = 1.0; a = malloc(elems * datatype_size); 
create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((dcmpl_t *) a)[i].real = ((double) i + me); ((dcmpl_t *) a)[i].imag = ((double) i + me); ((dcmpl_t *) b[me])[i].real = 0.0; ((dcmpl_t *) b[me])[i].imag = 0.0; } break; default: return; break; } count = elems * datatype_size; strideA = elems * datatype_size; strideB = elems * datatype_size; ARMCI_AllFence(); MP_BARRIER(); for(i = 0; i < nproc; i++) ARMCI_AccS(datatype, scale, a, &strideA, b[(me + i) % nproc], &strideB, &count, 0, (me + i) % nproc); ARMCI_AllFence(); MP_BARRIER(); switch(datatype) { case ARMCI_ACC_INT: for(i = 0; i < elems; i++) { int compare = (i * nproc) + nproc / 2 * (nproc - 1); if(((int *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_INT [%d] = %d != %d\n", i, ((int *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_LNG: for(i = 0; i < elems; i++) { long compare = (i * nproc) + nproc / 2 * (nproc - 1); if(((long *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_LNG [%d] = %d != %ld\n", i, ((int *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_FLT: for(i = 0; i < elems; i++) { float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((float *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_FLT [%d] = %f != %f\n", i, ((float *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_DBL: for(i = 0; i < elems; i++) { double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((double *)b[me])[i] != (double) ((i * nproc) + nproc / 2 * (nproc - 1))) { printf("ERROR accumulate ARMCI_ACC_DBL [%d] = %f != %f \n", i, ((double *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_CPL: for(i = 0; i < elems; i++) { float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((cmpl_t *)b[me])[i].real != compare && ((cmpl_t *)b[me])[i].imag != 3 * compare) { 
printf("ERROR accumulate ARMCI_ACC_CPL [%d] = %f + %fj != %f + %fj\n", i, ((cmpl_t *)b[me])[i].real, ((cmpl_t *)b[me])[i].imag, compare, 3 * compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_DCP: for(i = 0; i < elems; i++) { double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((dcmpl_t *)b[me])[i].real != compare && ((dcmpl_t *)b[me])[i].imag != 3 * compare) { printf("ERROR accumulate ARMCI_ACC_DCP [%d] = %f + %fj != %f + %fj\n", i, ((dcmpl_t *)b[me])[i].real, ((dcmpl_t *)b[me])[i].imag, compare, 3 * compare); ARMCI_Error("test_acc_type failed\n",0); } } break; default: break; } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("O.K.\n\n"); fflush(stdout);} destroy_array((void**)b); free(a); free(scale); }
/* Time blocking vs. non-blocking put/get/accumulate for message sizes
 * growing by powers of two up to MAXELEMS doubles.  Rank 0 drives all
 * transfers; columns printed are (in order): get, nbget, nbget-wait,
 * put, nbput, nbput-wait, acc, nbacc, nbacc-wait.  When dry_run is
 * non-zero the pass runs silently as a warm-up. */
void test_perf_nb(int dry_run) {
  int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
  int stride, k=0, ntimes;
  /* t1..t3: put/nbput/wait, t4..t6: get/nbget/wait, t7..t9: acc/nbacc/wait */
  double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9;
  double *dsrc[MAXPROC], scale=1.0;
  armci_hdl_t hdl_get, hdl_put, hdl_acc;
  create_array((void**)ddst, sizeof(double),2, elems);
  create_array((void**)dsrc, sizeof(double),1, &elems[1]);
  if(!dry_run)if(me == 0) {
    printf("\n\t\t\tRemote 1-D Array Section\n");
    printf("section get nbget wait put nbput ");
    printf(" wait acc nbacc wait\n");
    printf("------- -------- -------- -------- -------- --------");
    printf(" -------- -------- -------- --------\n");
    fflush(stdout);
  }
  /* One iteration per message size: loop doubles each time. */
  for(loop=1; loop<=MAXELEMS; loop*=2, k++) {
    elems[1] = loop;
    /* fewer repetitions for larger messages */
    ntimes = (int)sqrt((double)(MAXELEMS/elems[1]));
    if(ntimes <1) ntimes=1;
    /* -------------------------- SETUP --------------------------- */
    /*initializing non-blocking handles,time,src & dst buffers*/
    ARMCI_INIT_HANDLE(&hdl_put);
    ARMCI_INIT_HANDLE(&hdl_get);
    ARMCI_INIT_HANDLE(&hdl_acc);
    t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0;
    for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
    for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
    MP_BARRIER();
    /* bytes transfered */
    bytes = sizeof(double)*elems[1];
    MP_BARRIER();
    /* -------------------------- PUT/GET -------------------------- */
    /* blocking put: rank 0 sends its section to every other rank */
    if(me == 0) {
      for(i=1; i<nproc; i++) {
        stime=MP_TIMER();
        for(j=0; j<ntimes; j++)
          if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i)))
            ARMCI_Error("armci_nbput failed\n",rc);
        t1 += MP_TIMER()-stime;
      }
    }
    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();
    if(VERIFY) verify_results(PUT, elems);
    for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
    MP_BARRIER();
    /* blocking get: rank 0 fetches a section from every other rank */
    if(me == 0) {
      for(i=1; i<nproc; i++) {
        stime=MP_TIMER();
        for(j=0; j<ntimes; j++)
          if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i)))
            ARMCI_Error("armci_nbget failed\n",rc);
        t4 += MP_TIMER()-stime;
      }
    }
    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();
    if(VERIFY) verify_results(GET, elems);
    for(i=0; i<elems[0]*elems[1]; i++)
      ddst[me][i]=0.0;
    MP_BARRIER();
    /* ------------------------ nb PUT/GET ------------------------- */
    /* non-blocking put: issue time (t2) and wait time (t3) measured apart */
    if(me == 0) {
      for(i=1; i<nproc; i++) {
        for(j=0; j<ntimes; j++) {
          stime=MP_TIMER();
          if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put)))
            ARMCI_Error("armci_nbput failed\n",rc);
          t2 += MP_TIMER()-stime;
          stime=MP_TIMER();
          ARMCI_Wait(&hdl_put);
          t3 += MP_TIMER()-stime;
        }
      }
    }
    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();
    if(VERIFY) verify_results(PUT, elems);
    for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
    MP_BARRIER();
    /* non-blocking get: issue time (t5) and wait time (t6) measured apart */
    if(me == 0) {
      for(i=1; i<nproc; i++) {
        for(j=0; j<ntimes; j++) {
          stime=MP_TIMER();
          if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get)))
            ARMCI_Error("armci_nbget failed\n",rc);
          t5 += MP_TIMER()-stime;
          stime=MP_TIMER();
          ARMCI_Wait(&hdl_get);
          t6 += MP_TIMER()-stime;
        }
      }
    }
    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();
    if(VERIFY) verify_results(GET, elems);
    for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
    MP_BARRIER();
    /* ------------------------ Accumulate ------------------------- */
    /* every rank accumulates 1.0s into rank 0's buffer */
    for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;
    MP_BARRIER();
    stride = elems[1]*sizeof(double);
    scale = 1.0;
    for(j=0; j<ntimes; j++) {
      stime=MP_TIMER();
      if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0)))
        ARMCI_Error("armci_acc failed\n",rc);
      t7 += MP_TIMER()-stime;
      MP_BARRIER();
      ARMCI_AllFence();
      MP_BARRIER();
      if(VERIFY) verify_results(ACC, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();
    }
#if 1 /* nb-accumulate section; currently enabled (#if 1) */
    /* ---------------------- nb-Accumulate ------------------------ */
    for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0;
    MP_BARRIER();
    stride = elems[1]*sizeof(double);
    scale = 1.0;
    for(j=0; j<ntimes; j++) {
      stime=MP_TIMER();
      if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc)))
        ARMCI_Error("armci_nbacc failed\n",rc);
      t8 += MP_TIMER()-stime;
      stime=MP_TIMER();
      ARMCI_Wait(&hdl_acc);
      t9 += MP_TIMER()-stime;
      MP_BARRIER();
      ARMCI_AllFence();
      MP_BARRIER();
      if(VERIFY) verify_results(ACC, elems);
      for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0;
      MP_BARRIER();
    }
#endif
    /* print timings (column order: get nbget wait put nbput wait acc nbacc wait) */
    if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n",
                                  bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes,
                                  t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes);
  }
  ARMCI_AllFence();
  MP_BARRIER();
  if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);}
  destroy_array((void **)ddst);
  destroy_array((void **)dsrc);
}
/* Compare value-put, vector-put, regular non-blocking put and aggregate-put
 * (and the get equivalents) for MAXELEMS scattered doubles.  Only rank 0
 * issues the transfers; every rank verifies the region of its own shared
 * buffers that rank 0 wrote (puts) or rank 0 verifies what it fetched
 * (gets).  When dryrun is non-zero the timings are not printed. */
void test_aggregate(int dryrun) {
  int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
  double *ddst_put[MAXPROC];
  double *ddst_get[MAXPROC];
  double *dsrc[MAXPROC];
  armci_hdl_t aggr_hdl_put[MAXPROC];
  armci_hdl_t aggr_hdl_get[MAXPROC];
  armci_hdl_t hdl_put[MAXELEMS];
  armci_hdl_t hdl_get[MAXELEMS];
  armci_giov_t darr;
  void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
  int start = 0, end = 0;
  double start_time;
  create_array((void**)ddst_put, sizeof(double),2, elems);
  create_array((void**)ddst_get, sizeof(double),2, elems);
  create_array((void**)dsrc, sizeof(double),1, &elems[1]);
  /* source pattern is rank-dependent so the verification below can tell
   * which rank's data arrived */
  for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
  for(i=0; i<elems[0]*elems[1]; i++) {
    ddst_put[me][i]=0.0;
    ddst_get[me][i]=0.0;
  }
  MP_BARRIER();
  /* only proc 0 does the work */
  if(me == 0) {
    if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS);
    /* initializing non-blocking handles */
    for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]);
    for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]);
    /* aggregate handles */
    for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]);
    for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]);
    for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]);
    for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]);
    bytes = sizeof(double);
    /* **************** PUT **************** */
    /* register put: one non-blocking value-put per element */
    start_time=MP_TIMER();
    start = 0;
    end = elems[1];
    for(i=1; i<nproc; i++) {
      for(j=start; j<end; j++) {
        ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, &hdl_put[j]);
      }
      for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
    }
    if(!dryrun)printf("%d: Value Put time = %.2es\n", me, MP_TIMER()-start_time);
    /* vector put: all elements for one target in a single descriptor */
    start_time=MP_TIMER();
    for(i=1; i<nproc; i++) {
      for(j=start; j<end; j++) {
        src_ptr[j] = (void *)&dsrc[me][j];
        dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j];
      }
      darr.src_ptr_array = src_ptr;
      darr.dst_ptr_array = dst_ptr;
      darr.bytes = sizeof(double);
      darr.ptr_array_len = elems[1];
      if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i]))) ARMCI_Error("armci_nbputv failed\n",rc);
    }
    for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]);
    if(!dryrun)printf("%d: Vector Put time = %.2es\n", me, MP_TIMER()-start_time);
    /* regular put: one non-blocking contiguous put per element */
    start_time=MP_TIMER();
    for(i=1; i<nproc; i++) {
      for(j=start; j<end; j++) {
        if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &hdl_put[j]))) ARMCI_Error("armci_nbput failed\n",rc);
      }
      for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
    }
    if(!dryrun)printf("%d: Regular Put time = %.2es\n", me, MP_TIMER()-start_time);
    /* aggregate put: many puts batched under one aggregate handle per target */
    start_time=MP_TIMER();
    for(i=1; i<nproc; i++) {
      for(j=start; j<end; j++) {
        if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &aggr_hdl_put[i]))) ARMCI_Error("armci_nbput failed\n",rc);
      }
    }
    for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]);
    if(!dryrun)printf("%d: Aggregate Put time = %.2es\n\n", me, MP_TIMER()-start_time);
    /* **************** GET **************** */
    /* vector get */
    start_time=MP_TIMER();
    for(i=1; i<nproc; i++) {
      for(j=start; j<end; j++) {
        src_ptr[j] = (void *)&dsrc[i][j];
        dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j];
      }
      darr.src_ptr_array = src_ptr;
      darr.dst_ptr_array = dst_ptr;
      darr.bytes = sizeof(double);
      darr.ptr_array_len = elems[1];
      if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i]))) ARMCI_Error("armci_nbgetv failed\n",rc);
      ARMCI_Wait(&hdl_get[i]);
    }
    if(!dryrun)printf("%d: Vector Get time = %.2es\n", me, MP_TIMER()-start_time);
    /* regular get */
    start_time=MP_TIMER();
    for(i=1; i<nproc; i++) {
      for(j=start; j<end; j++) {
        if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes, i, &hdl_get[j]))) ARMCI_Error("armci_nbget failed\n",rc);
      }
      for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]);
    }
    if(!dryrun)printf("%d: Regular Get time = %.2es\n", me, MP_TIMER()-start_time);
    /* aggregate get
     * NOTE(review): unlike the other transfers, this ARMCI_NbGet return
     * value is not checked — presumably an oversight; confirm. */
    start_time=MP_TIMER();
    for(i=1; i<nproc; i++) {
      for(j=start; j<end; j++) {
        ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes, i, &aggr_hdl_get[i]);
      }
    }
    for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]);
    if(!dryrun)printf("%d: Aggregate Get time = %.2es\n", me, MP_TIMER()-start_time);
  }
  MP_BARRIER();
  ARMCI_AllFence();
  MP_BARRIER();
  /* Verify: non-zero ranks check the section rank 0 put into them */
  if(!(me==0)) for(j=0; j<elems[1]; j++) {
    if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) {
      ARMCI_Error("aggregate put failed...1", 0);
    }
  }
  MP_BARRIER();
  /* NOTE(review): the fflush below is OUTSIDE both if-bodies and therefore
   * executes on every rank regardless of dryrun — harmless but probably
   * not what the indentation suggests. */
  if(!dryrun)if(me==0) printf("\n aggregate put ..O.K.\n");
  fflush(stdout);
  /* rank 0 checks what it fetched from each rank */
  if(me==0) {
    for(i=1; i<nproc; i++) {
      for(j=0; j<elems[1]; j++) {
        if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) {
          ARMCI_Error("aggregate get failed...1", 0);
        }
      }
    }
  }
  MP_BARRIER();
  /* NOTE(review): same stray fflush pattern as above. */
  if(!dryrun)if(me==0) printf(" aggregate get ..O.K.\n");
  fflush(stdout);
  ARMCI_AllFence();
  MP_BARRIER();
  if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);}
  destroy_array((void **)ddst_put);
  destroy_array((void **)ddst_get);
  destroy_array((void **)dsrc);
}
/* Strided (2-D section) get/put/accumulate latency/bandwidth benchmark.
 * Rank 0 drives square chunk transfers of increasing size against every
 * other rank and prints per-chunk latency and MB/s; other ranks just host
 * the shared buffers. */
void test_2D() {
  int i;
  int src, dst;
  int ierr;
  /* NOTE(review): buf is only allocated on rank 0 and only used inside the
   * me==0 branch below — it stays uninitialized on other ranks. */
  double *buf;
  void *ptr[MAXPROC], *get_ptr[MAXPROC];
  /* find who I am and the dst process */
  src = me;
#ifdef MALLOC_LOC
  if(me == 0) {
    buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double));
    assert(buf != NULL);
  }
#else
  if(me == 0) {
    buf = (double *)malloc(SIZE * SIZE * sizeof(double));
    assert(buf != NULL);
  }
#endif
  /* shared SIZE x SIZE windows on every rank: put/acc target and get source */
  ierr = ARMCI_Malloc(ptr, (SIZE * SIZE * sizeof(double)));
  assert(ierr == 0);
  assert(ptr[me]);
  ierr = ARMCI_Malloc(get_ptr, (SIZE * SIZE * sizeof(double)));
  assert(ierr == 0);
  assert(get_ptr[me]);
  /* ARMCI - initialize the data window */
  fill_array(ptr[me], SIZE*SIZE, me);
  fill_array(get_ptr[me], SIZE*SIZE, me);
  MP_BARRIER();
  /* only the proc 0 doest the work */
  if(me == 0) {
    /* print the title (suppressed in result-checking mode) */
    if(!CHECK_RESULT){
      printf(" section get put");
      printf(" acc\n");
      printf("bytes loop sec MB/s sec MB/s");
      printf(" sec MB/s\n");
      printf("------- ------ -------- -------- -------- --------");
      printf(" -------- --------\n");
      fflush(stdout);
    }
    /* one measurement per chunk edge length */
    for(i=0; i<CHUNK_NUM; i++) {
      int loop;
      int bytes = chunk[i] * chunk[i] * sizeof(double);
      double t_get = 0, t_put = 0, t_acc = 0;
      double latency_get, latency_put, latency_acc;
      double bandwidth_get, bandwidth_put, bandwidth_acc;
      loop = SIZE / chunk[i];
      if(loop<2)loop=2;
      /* accumulate times against every remote rank */
      for(dst=1; dst<nproc; dst++) {
        /* strided get */
        fill_array(buf, SIZE*SIZE, me*10);
        t_get += time_get((double *)(get_ptr[dst]), (double *)buf, chunk[i], loop, dst, 1);
        /* strided put */
        fill_array(buf, SIZE*SIZE, me*10);
        t_put += time_put((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1);
        /* strided acc */
        fill_array(buf, SIZE*SIZE, me*10);
        t_acc += time_acc((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1);
      }
      /* averages over the nproc-1 targets */
      latency_get = t_get/(nproc - 1);
      latency_put = t_put/(nproc - 1);
      latency_acc = t_acc/(nproc - 1);
      bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get;
      bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put;
      bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc;
      /* print */
      if(!CHECK_RESULT)printf("%d\t%d\t%.2e %.2e %.2e %.2e %.2e %.2e\n",
                              bytes, loop, latency_get, bandwidth_get,
                              latency_put, bandwidth_put, latency_acc, bandwidth_acc);
    }
  }
  else sleep(3);
  ARMCI_AllFence();
  MP_BARRIER();
  /* cleanup */
  ARMCI_Free(get_ptr[me]);
  ARMCI_Free(ptr[me]);
#ifdef MALLOC_LOC
  if(me == 0) ARMCI_Free_local(buf);
#else
  if(me == 0) free(buf);
#endif
}
/* Blocked right-looking LU factorization of the distributed n x n matrix
 * in the global block array a[], block size bs.  For each diagonal step K:
 * factor the diagonal block, update the K-th block column (bdiv) and block
 * row (bmodd), pre-put the updated blocks to the owners that will need
 * them (bufc/bufr staging buffers), then apply the trailing update (bmod).
 * Barriers/fences separate the phases so remote data is complete before use. */
void lu(int n, int bs, int me) {
  int i, il, j, jl, k, kl;
  int I, J, K;                      /* block indices (upper case) */
  double *A, *B, *C, *D;
  /* NOTE(review): dimI/dimJ/dimK, t1..t22 and hc are declared but never
   * used in this function. */
  int dimI, dimJ, dimK;
  int strI, strJ, strK;             /* actual (possibly truncated) block extents */
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner, destp, hc, m;
  double *dbuf;                     /* scratch copy of the remote diagonal block */
  armci_hdl_t handle[2*MAXPROC];
  int saved[MAXPROC];               /* dedup flags: one pre-put per destination */
  dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));
  for (k=0, K=0; k<n; k+=bs, K++) {
    /* extent of the K-th block (last block may be short) */
    kl = k + bs;
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    }
    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me) {
      A = a[K+K*nblocks];
      lu0(A, strK, strK); /* impl algo on this diag block */
    }
    MP_BARRIER();
    /* divide column k by diagonal block; non-owners fetch a copy first */
    if(block_owner(K, K) == me) D = a[K+K*nblocks];
    else {
      D = dbuf;
      get_remote(D, K, K);
    }
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me) {  /* parcel out blocks */
        il = i + bs;
        if (il > n) {
          il = n;
          strI = il - i;
        } else {
          strI = bs;
        }
        A = a[I+K*nblocks];
        bdiv(A, D, strI, strK, strI, strK);
        /* Pre-put this block to the block-owners of all blocks on the
           I-th row with a non-blocking put (at most once per owner) */
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
          destp = block_owner (I, m);
          if (destp != me && !saved[destp]) {
            ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
            saved[destp] = 1;
          }
        }
      }
    } /* end of for (i=k1, I=K+1...) */
    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me) {  /* parcel out blocks */
        jl = j+bs;
        if (jl > n) {
          jl = n;
          strJ = jl - j;
        } else {
          strJ = bs;
        }
        A = a[K+J*nblocks];
        bmodd(D, A, strK, strJ, strK, strK);
        /* Pre-put this block to the block-owners of all blocks on the
           J-th column with a non-blocking put (at most once per owner) */
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
          destp = block_owner (m, J);
          if (destp != me && !saved[destp]) {
            ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
            saved[destp] = 1;
          }
        }
      }
    }
    /* complete all pre-puts everywhere before the trailing update reads them */
    ARMCI_WaitAll();
    ARMCI_AllFence();
    MP_BARRIER();
    /* modify subsequent block columns (trailing submatrix update) */
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs;
      if (il > n) {
        il = n;
        strI = il - i;
      } else {
        strI = bs;
      }
      for (j=kl, J=K+1; j<n; j+=bs, J++) {
        jl = j + bs;
        if (jl > n) {
          jl = n;
          strJ= jl - j;
        } else {
          strJ = bs;
        }
        if (block_owner(I, J) == me) {  /* parcel out blocks */
          /* column factor: local block or the pre-put staged copy */
          if(block_owner(I,K) == me) A = a[I+K*nblocks];
          else {
            A = bufc[me*nblocks+I];
          }
          /* row factor: local block or the pre-put staged copy */
          if(block_owner(K,J) == me) B = a[K+J*nblocks];
          else B = bufr[me*nblocks + J];
          C = a[I+J*nblocks];
          bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
        }
      }
    }
  }
  ARMCI_Free_local(dbuf);
}
static int sparse_initialize(int *n, int *non_zero, int **row_ind, int **col_ind, double **values, double **vec, double **svec) { int i, j, rc, max, *row_ind_tmp=NULL, *tmp_indices=NULL; double *tmp_values=NULL; unsigned long len; FILE *fp=NULL; /* Broadcast order of matrix */ if(me==0) { if((fp=fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL) ARMCI_Error("Error: Input file not found", me); fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */ fscanf(fp, "%d", n); if(*n%nproc) ARMCI_Error("# of rows is not divisible by # of processors", nproc); if(*n > ROW) ARMCI_Error("order is greater than defined variable ROW", ROW); } len = sizeof(int); armci_msg_brdcst(n, len, 0); /* Broad cast number of non_zeros */ if(me==0) fscanf(fp, "%d", non_zero); armci_msg_brdcst(non_zero, len, 0); /* Broadcast row indices */ len = (*n+1)*sizeof(int); row_ind_tmp = (int *)malloc(len); if(me==0)for(i=0; i<*n+1; i++) { fscanf(fp, "%d", &row_ind_tmp[i]); if(fortran_indexing) --row_ind_tmp[i]; } armci_msg_brdcst(row_ind_tmp, len, 0); load_balance(*n, *non_zero, row_ind_tmp); /* find how much temporary storage is needed at the maximum */ if(me==0) { for(max=-1,j=0; j<nproc; j++) if(max<proc_nz_list[j]) max=proc_nz_list[j]; if(max<0) ARMCI_Error(" max cannot be negative", max); } /* Broadcast the maximum number of elements */ len = sizeof(int); armci_msg_brdcst(&max, len, 0); /* create the Sparse MAtrix Array */ if(me==0) printf(" Creating ValueArray (CompressedSparseMatrix) ...\n\n"); create_array((void**)col_ind, sizeof(int), 1, &max); /* create the column subscript array */ if(me==0) printf(" Creating Column Subscript Array ... \n\n"); create_array((void**)values, sizeof(double), 1, &max); /* create the x-vector and the solution vector */ if(me==0) printf(" Creating Vectors ... 
\n\n"); create_array((void**)vec, sizeof(double),1, &max); create_array((void**)svec, sizeof(double),1, &max); armci_msg_barrier(); /* Process 0 distributes the column indices and non_zero values to respective processors*/ if(me == 0) { tmp_indices = (int *)malloc(max*sizeof(int)); tmp_values = (double *)malloc(max*sizeof(double)); for(j=0; j<nproc; j++) { for(i=0; i<proc_nz_list[j]; i++) { fscanf(fp, "%d", &tmp_indices[i]); if(fortran_indexing) --tmp_indices[i]; } /* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */ if((rc=ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j))) ARMCI_Error("armci_nbput failed\n",rc); } for(j=0; j<nproc; j++) { for(i=0; i<proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]); if((rc=ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j))) ARMCI_Error("armci_nbput failed\n",rc); } } ARMCI_AllFence(); armci_msg_barrier(); ARMCI_AllFence(); /* initializing x-vector */ if(me==0) for(i=0; i<proc_nz_list[me]; i++) vec[me][i] = (i+1); else for(i=0; i<proc_nz_list[me]; i++) vec[me][i]=me*proc_nz_list[me-1]+(i+1); #if 0 if(me==0) { printf("max = %d\n", max); for(i=0; i<max; i++) printf("%.1f ", values[me][i]); printf("\n"); } #endif *row_ind = row_ind_tmp; if(me==0) { free(tmp_indices); free(tmp_values); fclose(fp); } return 0; }
int main(int argc, char **argv) { int i,peer,j; cpu_set_t mycpuid,new_mask; char str[CPU_SETSIZE]; int rrr; char cid[8]; extern char * cpuset_to_cstr(cpu_set_t *mask, char *str); extern int cstr_to_cpuset(cpu_set_t *mask, const char* str); gpc_hdl_t nbh; char rheader[100]; int hlen, rhlen, rhsize; int rdsize; int rem; void *header=&rem; int locval=0; void *loc=&locval; int right; MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD,&me); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); if(nprocs<2){ printf("\ncan run only on >=2 procs\n"); MPI_Finalize(); exit(1); } right = (me+1)%nprocs; hlen=sizeof(header); bzero(rheader,100); rhlen = hlen; ARMCI_Init(); accloop=atoi(argv[1]); rem=accloop; myptrs = (char **)malloc(sizeof(char *)*nprocs); ARMCI_Malloc((void **)myptrs,size); MPI_Barrier(MPI_COMM_WORLD); gpcwork_memcpy = ARMCI_Gpc_register(gpc_work_handler_memcpy); gpcwork_ddot =ARMCI_Gpc_register(gpc_work_handler_ddot); gpcwork_daxpy = ARMCI_Gpc_register(gpc_work_handler_daxpy); gpcwork_dgemm = ARMCI_Gpc_register(gpc_work_handler_dgemm); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Gpc_init_handle(&nbh); if(ARMCI_Gpc_exec(gpcwork_memcpy, right, &header, hlen, loc, sizeof(int), rheader, rhlen,loc, sizeof(int), &nbh)) fprintf(stderr,"ARMCI_Gpc_exec failed\n"); { int m,n,k; char notr='n'; DoubleComplex ZERO; usleep(100); ZERO.real=0.;ZERO.imag=0.; m=n=k=DGS; t0=MPI_Wtime(); #ifdef DGEMM_WORK for(j=0;j<4*15;j++){ c_alpha=c_alpha+j*rand(); dgemm_(¬r,¬r,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1); } #elif IUNIT_WORK for(j=0;j<2*LOOP*100;j++){ for(i=0;i<LOOP*100;i++){ tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2]; } } #elif DAXPY_WORK for(j=0;j<tmp_loop*80;j++){ alpha=alpha+j*rand(); daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE); } #endif t1=MPI_Wtime()-t0; printf("\n%d:Compute_During_memcpy %d %f\n",me,accloop,t1); } ARMCI_Gpc_wait(&nbh); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Gpc_init_handle(&nbh); if(ARMCI_Gpc_exec(gpcwork_ddot, 
right, &header, hlen, loc, sizeof(int), rheader, rhlen,loc, sizeof(int), &nbh)) fprintf(stderr,"ARMCI_Gpc_exec failed\n"); { int m,n,k; char notr='n'; DoubleComplex ZERO; usleep(100); ZERO.real=0.;ZERO.imag=0.; m=n=k=DGS; t0=MPI_Wtime(); #ifdef DGEMM_WORK for(j=0;j<4*15;j++){ c_alpha=c_alpha+j*rand(); dgemm_(¬r,¬r,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1); } #elif IUNIT_WORK for(j=0;j<2*LOOP*100;j++){ for(i=0;i<LOOP*100;i++){ tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2]; } } #elif DAXPY_WORK for(j=0;j<tmp_loop*80;j++){ alpha=alpha+j*rand(); daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE); } #endif t1=MPI_Wtime()-t0; printf("\n%d:Compute_During_Ddot %d %f\n",me,accloop,t1); } ARMCI_Gpc_wait(&nbh); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Gpc_init_handle(&nbh); if(ARMCI_Gpc_exec(gpcwork_daxpy, right, &header, hlen, loc, sizeof(int), rheader, rhlen,loc, sizeof(int), &nbh)) fprintf(stderr,"ARMCI_Gpc_exec failed\n"); { int m,n,k; char notr='n'; DoubleComplex ZERO; usleep(100); ZERO.real=0.;ZERO.imag=0.; m=n=k=DGS; t0=MPI_Wtime(); #ifdef DGEMM_WORK for(j=0;j<4*15;j++){ c_alpha=c_alpha+j*rand(); dgemm_(¬r,¬r,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1); } #elif IUNIT_WORK for(j=0;j<2*LOOP*100;j++){ for(i=0;i<LOOP*100;i++){ tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2]; } } #elif DAXPY_WORK for(j=0;j<tmp_loop*80;j++){ alpha=alpha+j*rand(); daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE); } #endif t1=MPI_Wtime()-t0; printf("\n%d:Compute_During_Daxpy %d %f\n",me,accloop,t1); } ARMCI_Gpc_wait(&nbh); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Gpc_init_handle(&nbh); if(ARMCI_Gpc_exec(gpcwork_dgemm, right, &header, hlen, loc, sizeof(int), rheader, rhlen,loc, sizeof(int), &nbh)) fprintf(stderr,"ARMCI_Gpc_exec failed\n"); { int m,n,k; char notr='n'; DoubleComplex ZERO; usleep(100); ZERO.real=0.;ZERO.imag=0.; m=n=k=DGS; t0=MPI_Wtime(); #ifdef DGEMM_WORK for(j=0;j<4*15;j++){ c_alpha=c_alpha+j*rand(); 
dgemm_(¬r,¬r,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1); } #elif IUNIT_WORK for(j=0;j<2*LOOP*100;j++){ for(i=0;i<LOOP*100;i++){ tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2]; } } #elif DAXPY_WORK for(j=0;j<tmp_loop*80;j++){ alpha=alpha+j*rand(); daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE); } #endif t1=MPI_Wtime()-t0; printf("\n%d:Compute_During_Dgemm %d %f\n",me,accloop,t1); } ARMCI_Gpc_wait(&nbh); MPI_Barrier(MPI_COMM_WORLD); ARMCI_AllFence(); ARMCI_Finalize(); MPI_Finalize(); }