/*
 * Collective-group test.
 *
 * Every process creates two groups from fixed rank lists.  Processes that
 * chk_grp_membership() reports as members then run test_one_group() on the
 * group: group A first, group B after a barrier.  Rank 0 prints "O.K." once
 * the final fence and barrier have completed.
 */
void test_groups() {
    int ranks_a[MAXPROC] = {0,1,2};
    int ranks_b[MAXPROC] = {1,3};
    ARMCI_Group grp_a;
    ARMCI_Group grp_b;

    MP_BARRIER();

    /* both creation calls are issued by every process */
    ARMCI_Group_create(GNUM_A, ranks_a, &grp_a);
    ARMCI_Group_create(GNUM_B, ranks_b, &grp_b);

    /* ------------------------ GROUP A ------------------------- */
    if (chk_grp_membership(me, &grp_a, ranks_a)) {
        test_one_group(&grp_a, ranks_a);
    }

    MP_BARRIER();

    /* ------------------------ GROUP B ------------------------- */
    if (chk_grp_membership(me, &grp_b, ranks_b)) {
        test_one_group(&grp_b, ranks_b);
    }

    ARMCI_AllFence();
    MP_BARRIER();

    if (me == 0) {
        printf("O.K.\n");
        fflush(stdout);
    }
}
/*
 * Entry point of the lock test: initialise message passing and ARMCI, run
 * test_lock(), report success from rank 0, then shut everything down.
 */
int main(int argc, char* argv[]) {
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (me == 0) {
        printf("ARMCI test program for lock(%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    test_lock();

    MP_BARRIER();
    if (me == 0) {
        printf("test passed\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
/*
 * Exercise the collective wrappers for one datatype: a broadcast test, then
 * every reduction operator through test_gop2_or_reduce() with flag 0
 * (armci_msg_gop2) and again with flag 1 (armci_msg_reduce).
 *
 * datatype: ARMCI datatype constant; for ARMCI_DOUBLE and ARMCI_FLOAT the
 *           last operator in the table ("or") is skipped.
 */
void test_collective(const int datatype) {
    char *reduce_ops[7] = {"+", "*", "min", "max", "absmax", "absmin", "or"};
    const int is_floating = (datatype == ARMCI_DOUBLE || datatype == ARMCI_FLOAT);
    const int op_count = is_floating ? 6 : 7;
    int use_reduce;
    int t;

    /* test armci_msg_brdcst */
    test_brdcst(datatype);

    /* pass 0: armci_msg_gop2, pass 1: armci_msg_reduce */
    for (use_reduce = 0; use_reduce <= 1; use_reduce++) {
        for (t = 0; t < op_count; t++) {
            test_gop2_or_reduce(datatype, reduce_ops[t], use_reduce);
        }
    }

    MP_BARRIER();
    ARMCI_AllFence();
    MP_BARRIER();

    if (me == 0) {
        printf("O.K.\n\n");
        fflush(stdout);
    }
}
/*
 * Entry point of the message-passing wrapper test.  Needs at least two
 * processes; otherwise rank 0 prints a usage line and every process shuts
 * down cleanly.
 */
int main( int argc, char **argv) {
    MP_INIT(argc,argv);
    MP_MYID(&me);
    MP_PROCS(&nproc);

    if(nproc < 2) {
        /* Report the actual process count; the old text printed nproc as if
         * it were an upper bound ("2 <= processes < nproc"). */
        if(me == 0)
            fprintf(stderr, "USAGE: 2 <= processes - got %d\n", nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if(me == 0){
        printf("Test of ARMCI Wrappers to Basic Message Passing Operations\n");
        fflush(stdout);
    }

    /* initialize ARMCI */
    ARMCI_Init();

    MP_BARRIER();

    TestGlobals();

    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
/*
 * Tear down the file-scope window `win`: close the lock-all epoch and free
 * the window, with a barrier before and after.
 */
void destroy_safe_array() {
    MP_BARRIER();
    MPI_Win_unlock_all(win);
    MPI_Win_free(&win);
    MP_BARRIER();
}
/*
 * For each of 16 mutex numbers and each process index, take and release the
 * lock, then synchronise; rank 0 prints one '.' per (mutex, process) pair.
 */
void test_lock() {
    int mutex_id;
    int owner;

    if (me == 0) {
        printf("\n");
    }

    mutex_id = 0;
    while (mutex_id < 16) {
        owner = 0;
        while (owner < nproc) {
            armcill_lock(mutex_id, owner);
            armcill_unlock(mutex_id, owner);

            MP_BARRIER();
            if (me == 0) {
                printf(".");
                fflush(stdout);
            }
            MP_BARRIER();

            owner++;
        }
        mutex_id++;
    }
}
/*
 * Entry point of the MPI strided get/put test: initialise MPI, print the
 * banners on rank 0, and run test_dim() for 1..MAXDIMS dimensions.
 */
int main(int argc, char* argv[]) {
    int dims_to_test;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    if (me == 0) {
        printf("MPI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    if (me == 0) {
        printf("\nTesting strided gets and puts\n");
        printf("(Only std output for process 0 is printed)\n\n");
        fflush(stdout);
        sleep(1);
    }

    dims_to_test = 1;
    while (dims_to_test <= MAXDIMS) {
        test_dim(dims_to_test);
        dims_to_test++;
    }

    MP_BARRIER();
    MPI_Finalize();
    return(0);
}
void test_groups_noncollective() { int *pid_lists[MAX_GROUPS]; int pids[MAXPROC]; int i, nprocs, world_me; ARMCI_Group group; int *my_pid_list=NULL, my_grp_size=0; int ngrps; MP_BARRIER(); MP_PROCS(&nprocs); MP_MYID(&world_me); random_permute(pids, nproc); ngrps = nprocs/GROUP_SIZE; for(i=0; i<nprocs/GROUP_SIZE; i++) { pid_lists[i] = pids + (i*GROUP_SIZE); } for(i=0; i<nprocs; i++) { if(pids[i] == world_me) { int grp_id = ARMCI_MIN(i/GROUP_SIZE, ngrps-1); my_pid_list = pid_lists[grp_id]; if(grp_id == ngrps-1) my_grp_size = GROUP_SIZE + (nprocs%GROUP_SIZE); else my_grp_size = GROUP_SIZE; } } qsort(my_pid_list, my_grp_size, sizeof(int), int_compare); MP_BARRIER(); /*now create all these disjoint groups and test them in parallel*/ ARMCI_Group_create(my_grp_size, my_pid_list, &group); test_one_group(&group, my_pid_list); ARMCI_Group_free(&group); ARMCI_AllFence(); MP_BARRIER(); if(world_me==0){printf("O.K.\n"); fflush(stdout);} }
void verify_results(int op, int *elems) { int i, j; switch(op) { case PUT: if(!(me==0)) for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst[me][j]-j*1.001) > 0.1) { ARMCI_Error("put failed...Invalid Value Obtained..1", 0); } } MP_BARRIER(); if(DEBUG) if(me==0) printf(" verifying put ..O.K.\n"); break; case GET: if(me==0) { for(i=1; i<nproc; i++) { for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) ARMCI_Error("get failed...Invalid Value Obtained..1", 0); } } } MP_BARRIER(); if(DEBUG) if(me==0) printf(" verifying get ..O.K.\n\n"); break; case ACC: if(me==0) for(j=0; j<elems[1]; j++) { /*printf("ddst[%d][%d] = %lf\n", me, j, ddst[me][j]); fflush(stdout); */ if( ARMCI_ABS(ddst[me][j]-(double)nproc) > 0.1) { ARMCI_Error("accumulate failed...Invalid Value Obtained..1", 0); } } MP_BARRIER(); if(DEBUG)if(me==0) printf(" verifying accumulate ..O.K.\n"); break; default: ARMCI_Error("Invalid Operation", 0); } fflush(stdout); }
/*
 * Entry point of the ARMCI group test: runs the collective group test and,
 * when ARMCI_GROUP is defined, the non-collective one as well.
 */
int main(int argc, char* argv[]) {
    int bad_proc_count;

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* only rank 0 reports an unsupported process count */
    bad_proc_count = (nproc<MINPROC || nproc>MAXPROC);
    if (bad_proc_count && me==0) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Testing ARMCI Groups!\n\n");
        fflush(stdout);
    }

    test_groups();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);

#ifdef ARMCI_GROUP
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Non-collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);
#endif

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
/*
 * Print the command-line synopsis from the process whose `rank` is 0, then
 * synchronise, finalise message passing and terminate with exit status 0.
 * Does not return.
 */
void usage() {
    if (rank == 0) {
        printf("Usage: test_mt, or \n");
        printf(" test_mt -tTHREADS_PER_PROC -sARRAY_SIZE -iITERATIONS_COUNT\n");
    }

    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
}
/*
 * Entry point of the non-blocking put/get/acc performance test.  Accepts
 * between 2 and MAXPROC processes (inclusive); test_perf_nb() runs once
 * with argument 1 and once with argument 0.
 */
int main(int argc, char* argv[]) {
    ARMCI_NetInit();
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if(nproc < 2 || nproc > MAXPROC) {
        /* MAXPROC itself is accepted by the check above, so say "<=" */
        if(me == 0)
            fprintf(stderr,
                    "USAGE: 2 <= processes <= %d - got %d\n", MAXPROC, nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if(me==0){
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if(me==0){
        printf("\n put/get/acc requests (Time in secs)\n\n");
        fflush(stdout);
    }

    test_perf_nb(1);
    test_perf_nb(0);

    ARMCI_AllFence();
    MP_BARRIER();
    if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
int main(int argc, char **argv) { ARMCI_NetInit(); MP_INIT(argc,argv); MP_MYID(&me); MP_PROCS(&nproc); if(nproc < 2 || nproc> MAXPROC) { if(me == 0) fprintf(stderr, "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc); MP_BARRIER(); MP_FINALIZE(); exit(0); } /* initialize ARMCI */ ARMCI_Init(); if(!me)printf("\n Performance of Basic Blocking Communication Operations\n"); MP_BARRIER(); CHECK_RESULT=1; if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); CHECK_RESULT=0; /* test 1 dimension array */ if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); /* test 2 dimension array */ if(!me)printf("\n\t\t\tStrided Data Transfer\n"); test_2D(); MP_BARRIER(); if(me == 0){ if(warn_accuracy) printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n",warn_accuracy); printf("\n\n------------ Testing the same data transfer for correctness ----------\n"); fflush(stdout); } MP_BARRIER(); CHECK_RESULT=1; if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); if(me == 0) printf("OK\n"); MP_BARRIER(); if(!me)printf("\n\t\t\tStrided Data Transfer\n"); test_2D(); if(me == 0) printf("OK\n\n\nTests Completed.\n"); MP_BARRIER(); /* done */ ARMCI_Finalize(); MP_FINALIZE(); return(0); }
void create_safe_array(void **a, int elem_size, int ndim, int dims[]) { int bytes=elem_size, i; void * base; assert(ndim<=MAXDIMS); for(i=0;i<ndim;i++)bytes*=dims[i]; // a[me] = malloc(bytes); MPI_Win_allocate(bytes, 1, MPI_INFO_NULL, MPI_COMM_WORLD, (void *)&base, &win); MPI_Win_lock_all(MPI_MODE_NOCHECK, win); assert(base); *a = base; MP_BARRIER(); }
/*
 * Entry point of the aggregate put/get test: runs test_aggregate() twice,
 * first with argument 1 (cold start) and then with 0 (warm start).
 */
int main(int argc, char* argv[]) {
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* only rank 0 reports an unsupported process count */
    if (me == 0 && nproc > MAXPROC) {
        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n",nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }

    test_aggregate(1);   /* cold start */
    test_aggregate(0);   /* warm start */

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return(0);
}
int main(int argc, char* argv[]) { int i; struct timeval start_time[14]; struct timeval stop_time[14]; /* char * test_name[14] = { "dim", "nbdim", "vec_small", "acc", "vector", "vector_acc", "fetch_add", "swap", "rput", "aggregate", "implicit", "memlock", "acc_type", "collective" }; int test_flags[14] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }; */ char * test_name[2] = { "acc_type", "collective" }; int test_flags[2] = { 1, 1 }; #define TEST_ACC_TYPE 0 #define TEST_COLLECTIVE 1 MP_INIT(argc, argv); ARMCI_Init(); MP_PROCS(&nproc); MP_MYID(&me); if(nproc > MAXPROC && me == 0) ARMCI_Error("Test works for up to %d processors\n",MAXPROC); if(me == 0) { printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } gettimeofday(&start_time[TEST_ACC_TYPE],NULL); if(test_flags[TEST_ACC_TYPE] == 1) { if(me == 0) { printf("\nTesting Accumulate Types\n"); fflush(stdout); } MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_INT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_INT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_LNG\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_LNG); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_FLT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_FLT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DBL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DBL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_CPL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_CPL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DCP\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DCP); ARMCI_AllFence(); MP_BARRIER(); } gettimeofday(&stop_time[TEST_ACC_TYPE],NULL); gettimeofday(&start_time[TEST_COLLECTIVE],NULL); if(test_flags[TEST_COLLECTIVE] == 1) { if(me == 0) { printf("\nTesting Collective Types\n"); fflush(stdout); } if(me == 0) { printf("Test Collective ARMCI_INT\n"); 
fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_INT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_LONG\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_LONG); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_FLOAT\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_FLOAT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_DOUBLE\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_DOUBLE); MP_BARRIER(); } gettimeofday(&stop_time[TEST_COLLECTIVE],NULL); if(me == 0) { printf("Accumulate and Collective tests passed\n"); fflush(stdout); } if(me == 0) { printf("Testcase runtime\n"); printf("Name,Time(seconds)\n"); for(i = 0; i < 2; i++) if(test_flags[i] == 1) { double time_spent = (stop_time[i].tv_sec - start_time[i].tv_sec) + ((double) stop_time[i].tv_usec - start_time[i].tv_usec) / 1E6; printf("%s,%.6f\n", test_name[i], time_spent); } } MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
void test_acc_type(const int datatype) { int i = 0; int datatype_size = 0; void * scale; void * a; void *b[MAXPROC]; int elems = ELEMS; int dim = 1; int count = 0; int strideA = 0; int strideB = 0; switch(datatype) { case ARMCI_ACC_INT: datatype_size = sizeof(int); scale = malloc(datatype_size); *((int *) scale) = 1; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((int *) a)[i] = i + me; ((int *) b[me])[i] = 0; } break; case ARMCI_ACC_LNG: datatype_size = sizeof(long); scale = malloc(datatype_size); *((long *) scale) = 1; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((long *) a)[i] = i + me; ((long *) b[me])[i] = 0; } break; case ARMCI_ACC_FLT: datatype_size = sizeof(float); scale = malloc(datatype_size); *((float *) scale) = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((float *) a)[i] = (float) i + me; ((float *) b[me])[i] = 0.0; } break; case ARMCI_ACC_DBL: datatype_size = sizeof(double); scale = malloc(datatype_size); *((double *) scale) = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((double *) a)[i] = (double) i + me; ((double *) b[me])[i] = 0.0; } break; case ARMCI_ACC_CPL: datatype_size = sizeof(cmpl_t); scale = malloc(datatype_size); ((cmpl_t *) scale)->real = 2.0; ((cmpl_t *) scale)->imag = 1.0; a = malloc(elems * datatype_size); create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((cmpl_t *) a)[i].real = ((float) i + me); ((cmpl_t *) a)[i].imag = ((float) i + me); ((cmpl_t *) b[me])[i].real = 0.0; ((cmpl_t *) b[me])[i].imag = 0.0; } break; case ARMCI_ACC_DCP: datatype_size = sizeof(dcmpl_t); scale = malloc(datatype_size); ((dcmpl_t *) scale)->real = 2.0; ((dcmpl_t *) scale)->imag = 1.0; a = malloc(elems * datatype_size); 
create_array((void**)b, datatype_size, dim, &elems); for(i = 0; i < elems; i++) { ((dcmpl_t *) a)[i].real = ((double) i + me); ((dcmpl_t *) a)[i].imag = ((double) i + me); ((dcmpl_t *) b[me])[i].real = 0.0; ((dcmpl_t *) b[me])[i].imag = 0.0; } break; default: return; break; } count = elems * datatype_size; strideA = elems * datatype_size; strideB = elems * datatype_size; ARMCI_AllFence(); MP_BARRIER(); for(i = 0; i < nproc; i++) ARMCI_AccS(datatype, scale, a, &strideA, b[(me + i) % nproc], &strideB, &count, 0, (me + i) % nproc); ARMCI_AllFence(); MP_BARRIER(); switch(datatype) { case ARMCI_ACC_INT: for(i = 0; i < elems; i++) { int compare = (i * nproc) + nproc / 2 * (nproc - 1); if(((int *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_INT [%d] = %d != %d\n", i, ((int *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_LNG: for(i = 0; i < elems; i++) { long compare = (i * nproc) + nproc / 2 * (nproc - 1); if(((long *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_LNG [%d] = %d != %ld\n", i, ((int *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_FLT: for(i = 0; i < elems; i++) { float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((float *)b[me])[i] != compare) { printf("ERROR accumulate ARMCI_ACC_FLT [%d] = %f != %f\n", i, ((float *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_DBL: for(i = 0; i < elems; i++) { double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((double *)b[me])[i] != (double) ((i * nproc) + nproc / 2 * (nproc - 1))) { printf("ERROR accumulate ARMCI_ACC_DBL [%d] = %f != %f \n", i, ((double *)b[me])[i], compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_CPL: for(i = 0; i < elems; i++) { float compare = (float) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((cmpl_t *)b[me])[i].real != compare && ((cmpl_t *)b[me])[i].imag != 3 * compare) { 
printf("ERROR accumulate ARMCI_ACC_CPL [%d] = %f + %fj != %f + %fj\n", i, ((cmpl_t *)b[me])[i].real, ((cmpl_t *)b[me])[i].imag, compare, 3 * compare); ARMCI_Error("test_acc_type failed\n",0); } } break; case ARMCI_ACC_DCP: for(i = 0; i < elems; i++) { double compare = (double) ((i * nproc) + nproc / 2 * (nproc - 1)); if(((dcmpl_t *)b[me])[i].real != compare && ((dcmpl_t *)b[me])[i].imag != 3 * compare) { printf("ERROR accumulate ARMCI_ACC_DCP [%d] = %f + %fj != %f + %fj\n", i, ((dcmpl_t *)b[me])[i].real, ((dcmpl_t *)b[me])[i].imag, compare, 3 * compare); ARMCI_Error("test_acc_type failed\n",0); } } break; default: break; } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("O.K.\n\n"); fflush(stdout);} destroy_array((void**)b); free(a); free(scale); }
main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; void **bufr_g, **bufc_g; MP_INIT(arc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\nUsing pre-PUTing\n"); printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i 
== nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *)); bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *)); if (bufr == NULL || bufc == NULL) printf("Could not ARMCI_Malloc_local() mem\n"); /* bufr points to all k-th row blocks */ /* save all block address in row-major order */ proc_bytes = nblocks*block_size*block_size * sizeof(double); bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(bufr_g, proc_bytes); for (i = 0; i < nproc; i++) { bufr[i*nblocks] = (double *) bufr_g[i]; for (j = 1; j < nblocks; j++) { bufr[i*nblocks + j] = bufr[i*nblocks + j-1] + block_size * block_size; } } /* bufc points to all k-th column blocks */ bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(bufc_g, proc_bytes); for (i = 0; i < nproc; i++) { bufc[i*nblocks] = (double *) bufc_g[i]; for (j = 1; j < nblocks; j++) { bufc[i*nblocks + j] = bufc[i*nblocks + j-1] + block_size * block_size; } } /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Free(bufc_g[me]); ARMCI_Free(bufr_g[me]); ARMCI_Finalize(); MP_FINALIZE(); }
/*
 * Release this process's segment of an array of per-process pointers.
 *
 * ptr: table of per-process base addresses; entry `me` is passed to
 *      ARMCI_Free after a barrier.
 *
 * The free is performed outside assert(): with NDEBUG defined an assert
 * argument is not evaluated, which would have skipped the free entirely.
 */
void destroy_array(void *ptr[]) {
    int rc;

    MP_BARRIER();

    rc = ARMCI_Free(ptr[me]);
    assert(rc == 0);
    (void)rc;   /* rc is otherwise unused when assertions are compiled out */
}
void test_aggregate(int dryrun) { int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; double *ddst_put[MAXPROC]; double *ddst_get[MAXPROC]; double *dsrc[MAXPROC]; armci_hdl_t aggr_hdl_put[MAXPROC]; armci_hdl_t aggr_hdl_get[MAXPROC]; armci_hdl_t hdl_put[MAXELEMS]; armci_hdl_t hdl_get[MAXELEMS]; armci_giov_t darr; void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS]; int start = 0, end = 0; double start_time; create_array((void**)ddst_put, sizeof(double),2, elems); create_array((void**)ddst_get, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) { ddst_put[me][i]=0.0; ddst_get[me][i]=0.0; } MP_BARRIER(); /* only proc 0 does the work */ if(me == 0) { if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS); /* initializing non-blocking handles */ for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]); for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]); /* aggregate handles */ for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]); for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]); bytes = sizeof(double); /* **************** PUT **************** */ /* register put */ start_time=MP_TIMER(); start = 0; end = elems[1]; for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i, &hdl_put[j]); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Value Put time = %.2es\n", me, MP_TIMER()-start_time); /* vector put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[me][j]; dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); 
darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i]))) ARMCI_Error("armci_nbputv failed\n",rc); } for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]); if(!dryrun)printf("%d: Vector Put time = %.2es\n", me, MP_TIMER()-start_time); /* regular put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &hdl_put[j]))) ARMCI_Error("armci_nbput failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]); } if(!dryrun)printf("%d: Regular Put time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate put */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes, i, &aggr_hdl_put[i]))) ARMCI_Error("armci_nbput failed\n",rc); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]); if(!dryrun)printf("%d: Aggregate Put time = %.2es\n\n", me, MP_TIMER()-start_time); /* **************** GET **************** */ /* vector get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { src_ptr[j] = (void *)&dsrc[i][j]; dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j]; } darr.src_ptr_array = src_ptr; darr.dst_ptr_array = dst_ptr; darr.bytes = sizeof(double); darr.ptr_array_len = elems[1]; if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i]))) ARMCI_Error("armci_nbgetv failed\n",rc); ARMCI_Wait(&hdl_get[i]); } if(!dryrun)printf("%d: Vector Get time = %.2es\n", me, MP_TIMER()-start_time); /* regular get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes, i, &hdl_get[j]))) ARMCI_Error("armci_nbget failed\n",rc); } for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]); } if(!dryrun)printf("%d: Regular Get time = %.2es\n", me, MP_TIMER()-start_time); /* aggregate get */ start_time=MP_TIMER(); for(i=1; i<nproc; i++) { for(j=start; j<end; j++) { ARMCI_NbGet(&dsrc[i][j], 
&ddst_get[me][i*elems[1]+j], bytes, i, &aggr_hdl_get[i]); } } for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]); if(!dryrun)printf("%d: Aggregate Get time = %.2es\n", me, MP_TIMER()-start_time); } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); /* Verify */ if(!(me==0)) for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_put[me][j]-j*1.001) > 0.1) { ARMCI_Error("aggregate put failed...1", 0); } } MP_BARRIER(); if(!dryrun)if(me==0) printf("\n aggregate put ..O.K.\n"); fflush(stdout); if(me==0) { for(i=1; i<nproc; i++) { for(j=0; j<elems[1]; j++) { if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) { ARMCI_Error("aggregate get failed...1", 0); } } } } MP_BARRIER(); if(!dryrun)if(me==0) printf(" aggregate get ..O.K.\n"); fflush(stdout); ARMCI_AllFence(); MP_BARRIER(); if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst_put); destroy_array((void **)ddst_get); destroy_array((void **)dsrc); }
int main(int argc, char **argv) { int i; double **myptrs; double t0, t1, tnbget=0, tnbwait=0, t2=0; MP_INIT(argc,argv); ARMCI_Init(); MP_PROCS(&nprocs); MP_MYID(&me); if (nprocs < 2) ARMCI_Error("This program requires at least to processes", 1); myptrs = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); MP_BARRIER(); if(me == 0) { for(i = 0; i < 10; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1); } t0 = MP_TIMER(); for(i = 0; i < LOOP; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+1, myptrs[me]+i, sizeof(double), me+1); } t1 = MP_TIMER(); printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP); fflush(stdout); t1 = t0 = 0; for(i = 0; i < LOOP; i++) { armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MP_TIMER(); //ARMCI_NbGet(myptrs[me]+i, myptrs[me+1]+i, sizeof(double), me+1, &nbh); ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh); t1 = MP_TIMER(); ARMCI_Wait(&nbh); t2 = MP_TIMER(); tnbget += (t1-t0); tnbwait += (t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); } else sleep(1); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
void test_perf_nb(int dry_run) { int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; int stride, k=0, ntimes; double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9; double *dsrc[MAXPROC], scale=1.0; armci_hdl_t hdl_get, hdl_put, hdl_acc; create_array((void**)ddst, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); if(!dry_run)if(me == 0) { printf("\n\t\t\tRemote 1-D Array Section\n"); printf("section get nbget wait put nbput "); printf(" wait acc nbacc wait\n"); printf("------- -------- -------- -------- -------- --------"); printf(" -------- -------- -------- --------\n"); fflush(stdout); } for(loop=1; loop<=MAXELEMS; loop*=2, k++) { elems[1] = loop; ntimes = (int)sqrt((double)(MAXELEMS/elems[1])); if(ntimes <1) ntimes=1; /* -------------------------- SETUP --------------------------- */ /*initializing non-blocking handles,time,src & dst buffers*/ ARMCI_INIT_HANDLE(&hdl_put); ARMCI_INIT_HANDLE(&hdl_get); ARMCI_INIT_HANDLE(&hdl_acc); t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0; for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* bytes transfered */ bytes = sizeof(double)*elems[1]; MP_BARRIER(); /* -------------------------- PUT/GET -------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i))) ARMCI_Error("armci_nbput failed\n",rc); t1 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i))) ARMCI_Error("armci_nbget failed\n",rc); t4 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) 
ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ nb PUT/GET ------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put))) ARMCI_Error("armci_nbput failed\n",rc); t2 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_put); t3 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get))) ARMCI_Error("armci_nbget failed\n",rc); t5 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_get); t6 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ Accumulate ------------------------- */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0))) ARMCI_Error("armci_acc failed\n",rc); t7 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #if 1 /* See the note below why this part is disabled */ /* ---------------------- nb-Accumulate ------------------------ */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) ARMCI_Error("armci_nbacc failed\n",rc); t8 += MP_TIMER()-stime; stime=MP_TIMER(); 
ARMCI_Wait(&hdl_acc); t9 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #endif /* print timings */ if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes); } ARMCI_AllFence(); MP_BARRIER(); if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst); destroy_array((void **)dsrc); }
void test_dim(int ndim) { int dim,elems; int i,j, proc; /* double a[DIM4][DIM3][DIM2][DIM1], b[EDIM4][EDIM3][EDIM2][EDIM1];*/ double *b; double *a, *a1, *a2, *c; int ridx; MPI_Datatype typeA, typeB; int rstrideB[MAXDIMS]; int rcount[MAXDIMS]; int pidx1 = -1, pidx2 = -1, pidx3 = -1; elems = 1; strideA[0]=sizeof(double); strideB[0]=sizeof(double); for(i=0;i<ndim;i++){ strideA[i] *= dimsA[i]; strideB[i] *= dimsB[i]; if(i<ndim-1){ strideA[i+1] = strideA[i]; strideB[i+1] = strideB[i]; } elems *= dimsA[i]; } /* create shared and local arrays */ create_safe_array((void**)&b, sizeof(double),ndim,dimsB); a1 = (double *)malloc(sizeof(double)*elems); assert(a1); a2 = (double *)malloc(sizeof(double)*elems); assert(a2); c = (double *)malloc(sizeof(double)*elems); assert(c); init(a1, ndim, elems, dimsA, me!=0, 0); init(a2, ndim, elems, dimsA, me!=0, 1); if(me==0){ printf("--------array[%d",dimsA[0]); for(dim=1;dim<ndim;dim++)printf(",%d",dimsA[dim]); printf("]--------\n"); } sleep(1); MP_BARRIER(); for(i=0;i<LOOP;i++){ int idx1, idx2, idx3, ridx; MPI_Request request; if (i%2) { a = a2; } else { a = a1; } get_range(ndim, dimsA, loA, hiA); new_range(ndim, dimsB, loA, hiA, loB, hiB); new_range(ndim, dimsA, loA, hiA, loC, hiC); proc=nproc-1-me; if(me==0){ print_range("local",ndim,loA, hiA,"-> "); print_range("remote",ndim,loB, hiB,"-> "); print_range("local",ndim,loC, hiC,"\n"); } idx1 = Index(ndim, loA, dimsA); idx2 = Index(ndim, loB, dimsB); idx3 = Index(ndim, loC, dimsA); MPI_Sendrecv(&idx2, 1, MPI_INT, proc, 666, &ridx, 1, MPI_INT, proc, 666, MPI_COMM_WORLD, MPI_STATUS_IGNORE); for(j=0;j<ndim;j++)count[j]=hiA[j]-loA[j]+1; count[0] *= sizeof(double); /* convert range to bytes at stride level zero */ Strided_to_dtype(strideA, count, ndim-1, MPI_BYTE, &typeA); MPI_Type_commit(&typeA); Strided_to_dtype(strideB, count, ndim-1, MPI_BYTE, &typeB); MPI_Type_commit(&typeB); MPI_Accumulate(a + idx1, 1, typeA, proc, (MPI_Aint)(idx2*sizeof(double)), 1, typeB, MPI_REPLACE, win); 
MP_FLUSH(proc); /* note that we do not need Fence here since * consectutive operations targeting the same process are ordered */ MPI_Get_accumulate(NULL, 0, MPI_BYTE, c + idx3, 1, typeA, proc, (MPI_Aint)(idx2*sizeof(double)), 1, typeB, MPI_NO_OP, win); MP_FLUSH(proc); compare_patches(0., ndim, a+idx1, loA, hiA, dimsA, c+idx3, loC, hiC, dimsA); pidx1 = idx1; pidx2 = idx2; pidx3 = idx3; MPI_Type_free(&typeA); MPI_Type_free(&typeB); } free(c); destroy_safe_array(); free(a); }
void read_and_create(int argc, char **argv) { int ri,i,nread; int tmp1,idealelementsperproc; void **amatptrs,**xvecptrs; na = atoi(argv[1]); nz = atoi(argv[2]); if(strncmp("random",argv[3],6)){ if(me==0){ fd = fopen(argv[3], "r"); if(fd==NULL)ARMCI_Error("unable to open given file",0); } } else{ if(na==0 || nz==0){ printf("\nERROR:exiting-no input file given and na or nz is 0"); fflush(stdout); ARMCI_Finalize(); MP_FINALIZE(); return; } if(me==0){ generate_random_file(na,nz); fd = fopen("randominput.dat", "r"); } } if(me==0){ if(na==0) nread = fread(&na, sizeof(na), 1, fd); if(nz==0) nread = fread(&nz, sizeof(nz), 1, fd); printf("\nReading CG input\n"); printf("Number of rows: %d\n", na); printf("Number of non-zeros: %d\n", nz); } armci_msg_bcast(&nz,sizeof(int),0); armci_msg_bcast(&na,sizeof(int),0); MP_BARRIER(); amatptrs = (void **)malloc(sizeof(void *)*nproc); xvecptrs = (void **)malloc(sizeof(void *)*nproc); if(xvecptrs==NULL || amatptrs==NULL) ARMCI_Error("xvecptrs amatptrs malloc failed",sizeof(void *)*nproc); if(ARMCI_Malloc(amatptrs,((me==0)?(sizeof(double)*nz):0))) ARMCI_Error("amat malloc failed",sizeof(double)*nz); amat = (double *)amatptrs[0]; if(ARMCI_Malloc(amatptrs,((me==0)?(sizeof(int)*(nz+1)):0))) ARMCI_Error("icol malloc failed",sizeof(int)*(nz+1)); cidx = (int *)amatptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(int)*(na+1)):0)); /*+1 for end of last row*/ ridx = (int *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*(na+1)):0)); xvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*(na+1)):0)); bvec = (double *)xvecptrs[0]; if(me==0){ for (i = 0; i < na + 1; i++) xvec[i] = 0.0; nread = fread(amat, sizeof(double), nz, fd); nread = fread(ridx, sizeof(int), (na+1), fd); ridx[na]=nz; nread = fread(cidx, sizeof(int), (nz+1), fd); nread = fread(bvec, sizeof(double), (na+1), fd); /* the c adjustment */ for (i = 0; i < na; i++) ridx[i] -= 1; for (i = 0; i < nz; i++) cidx[i] -= 1; } MP_BARRIER(); 
/*acg_matvecmul(amat,xvec,bvec,ridx,cidx);*/ if(0){ for(i=0;i<nz+1;i++) printf("\n%d:amat[%d]=%f icol[%d]=%d",me,i,amat[i],i,cidx[i]); for(i=0;i<na+1;i++) printf("\n%d:irow[%d]=%d bvec[%d]=%f",me,i,ridx[i],i,bvec[i]); } allfirstrow = (int *)malloc(sizeof(int)*nproc); alllastrow = (int *)malloc(sizeof(int)*nproc); columnmap = (int *)malloc(sizeof(int)*nproc); if(!allfirstrow || !alllastrow || !columnmap) ARMCI_Error("malloc failed allfirstrow ",0); MP_BARRIER(); /* * next decide who works on which rows, this will decide the * distribution of a,d,r,q,x,and ax */ /*create the mapping for all vectors, row matrix and column matrix*/ if(me==0){ idealelementsperproc = nz/nproc; tmp1=0; for(i=0;i<nproc;i++){ int elementsperproc=0; allfirstrow[i]=tmp1; for(ri=tmp1;ri<na;ri++,tmp1++){ elementsperproc+=(ridx[ri+1]-ridx[ri]); if(elementsperproc>=idealelementsperproc){ if((elementsperproc-idealelementsperproc) > idealelementsperproc-(elementsperproc-(ridx[ri+1]-ridx[ri]))){ alllastrow[i] = ri-1; if((ri-1)<0)ARMCI_Error("run on a smaller processor count",0); /*tmp1--;*/ } else{ alllastrow[i] = ri; if(ri<0)ARMCI_Error("run on a smaller processor count",0); tmp1++; } elementsperproc=0; break; } } } alllastrow[nproc-1]=na-1; for(i=0;i<nproc;i++)columnmap[i]=ridx[allfirstrow[i]]; } armci_msg_bcast(columnmap,nproc*sizeof(int),0); armci_msg_bcast(allfirstrow,nproc*sizeof(int),0); armci_msg_bcast(alllastrow,nproc*sizeof(int),0); myfirstrow = allfirstrow[me]; mylastrow = alllastrow[me]; if(me==0)for(i=0;i<nproc;i++){ printf("\nDISTRIBUTION:first row of process\t%d is %d last row of process\t%d is %d",i,allfirstrow[i],i,alllastrow[i]); } /* for(i=myfirstrow;i<mylastrow;i++){ xvec[i]=0.0; } */ ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); rvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); dvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); svec = (double *)xvecptrs[0]; 
ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); dmvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); qvec = (double *)xvecptrs[0]; ARMCI_Malloc(xvecptrs,((me==0)?(sizeof(double)*na):0)); axvec = (double *)xvecptrs[0]; if(me==0)fclose(fd); /*dont forget to free mallocs*/ free(allfirstrow); free(alllastrow); free(columnmap); }
int main(int argc, char *argv[]) { int ch; extern char *optarg; int i, j, r; thread_t threads[MAX_TPP]; /* init MP */ MP_INIT(argc,argv); MP_PROCS(&size); MP_MYID(&rank); while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) { switch(ch) { case 't': /* # of threads */ tpp = atoi(optarg); if (tpp < 1 || tpp > MAX_TPP) { PRINTF0("\"%s\" is improper value for -t, should be a " "number between 1 and %d(MAX_TPP)\n", optarg, MAX_TPP); usage(); } break; case 'i': /* # of iterations */ iters = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -t, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 's': /* # of elements in the array */ asize = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -s, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 'd': delay = atoi(optarg); break; /* delay before start */ case 'h': usage(); break; /* print usage info */ } } #ifdef NOTHREADS tpp = 1; PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n"); #endif th_size = size * tpp; PRINTF0("\nTest of multi-threaded capabilities:\n" "%d threads per process (%d threads total),\n" "%d array elements of size %d,\n" "%d iteration(s)\n\n", tpp, th_size, asize, sizeof(atype_t), iters); if (delay) { printf("%d: %d\n", rank, getpid()); fflush(stdout); sleep(delay); MP_BARRIER(); } TH_INIT(size,tpp); for (i = 0; i < tpp; i++) th_rank[i] = rank * tpp + i; #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) { fname[10] = '0' + th_rank[i] / 100; fname[11] = '0' + th_rank[i] % 100 / 10; fname[12] = '0' + th_rank[i] % 10; dbg[i] = fopen(fname, "w"); } #endif for (i = 0; i < tpp; i++) prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]); /* init ARMCI */ ARMCI_Init(); /* set global seed (to ensure same random sequence across procs) */ time_seed = (unsigned)time(NULL); armci_msg_brdcst(&time_seed, sizeof(time_seed), 0); srand(time_seed); rand(); prndbg(0, "seed = %u\n", 
time_seed); /* random pairs */ pairs = calloc(th_size, sizeof(int)); for (i = 0; i < th_size; i++) pairs[i] = -1; for (i = 0; i < th_size; i++) { if (pairs[i] != -1) continue; r = RND(0, th_size); while (i == r || pairs[r] != -1 ) r = RND(0, th_size); pairs[i] = r; pairs[r] = i; } for (i = 0, cbufl = 0; i < th_size; i++) cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d", i, pairs[i], pairs[i], pairs[pairs[i]]); prndbg(0, "random pairs:%s\n", cbuf); /* random targets */ rnd_tgts = calloc(th_size, sizeof(int)); for (i = 0, cbufl = 0; i < th_size; i++) { rnd_tgts[i] = RND(0, th_size); if (rnd_tgts[i] == i) { i--; continue; } cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]); } prndbg(0, "random targets:%s\n", cbuf); /* random one */ rnd_one = RND(0, th_size); prndbg(0, "random one = %d\n", rnd_one); assert(ptrs1 = calloc(th_size, sizeof(void *))); assert(ptrs2 = calloc(th_size, sizeof(void *))); #ifdef NOTHREADS thread_main((void *)(long)0); #else for (i = 0; i < tpp; i++) THREAD_CREATE(threads + i, thread_main, (void *)(long)i); for (i = 0; i < tpp; i++) THREAD_JOIN(threads[i], NULL); #endif MP_BARRIER(); PRINTF0("Tests Completed\n"); /* clean up */ #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) fclose(dbg[i]); #endif ARMCI_Finalize(); TH_FINALIZE(); MP_FINALIZE(); return 0; }
void lu(int n, int bs, int me) { int i, il, j, jl, k, kl; int I, J, K; double *A, *B, *C, *D; int dimI, dimJ, dimK; int strI, strJ, strK; unsigned int t1, t2, t3, t4, t11, t22; int diagowner, destp, hc, m; double *dbuf; armci_hdl_t handle[2*MAXPROC]; int saved[MAXPROC]; dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double)); for (k=0, K=0; k<n; k+=bs, K++) { kl = k + bs; if (kl > n) { kl = n; strK = kl - k; } else { strK = bs; } /* factor diagonal block */ diagowner = block_owner(K, K); if (diagowner == me) { A = a[K+K*nblocks]; lu0(A, strK, strK); /* impl algo on this diag block */ } MP_BARRIER(); /* divide column k by diagonal block */ if(block_owner(K, K) == me) D = a[K+K*nblocks]; else { D = dbuf; get_remote(D, K, K); } for (i=kl, I=K+1; i<n; i+=bs, I++) { if (block_owner(I, K) == me) { /* parcel out blocks */ il = i + bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } A = a[I+K*nblocks]; bdiv(A, D, strI, strK, strI, strK); /* Pre-put this block to the block-owners of all blocks on the I-th row with a non-blocking put*/ memset (saved, 0, sizeof(saved)); for (m = K+1; m < nblocks; m++) { destp = block_owner (I, m); if (destp != me && !saved[destp]) { ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL); saved[destp] = 1; } } } } /* end of for (i=k1, I=K+1...) 
*/ /* modify row k by diagonal block */ for (j=kl, J=K+1; j<n; j+=bs, J++) { if (block_owner(K, J) == me) { /* parcel out blocks */ jl = j+bs; if (jl > n) { jl = n; strJ = jl - j; } else { strJ = bs; } A = a[K+J*nblocks]; bmodd(D, A, strK, strJ, strK, strK); /* Pre-put this block to the block-owners of all blocks on the J-th column with a non-blocking put*/ memset (saved, 0, sizeof(saved)); for (m = K+1; m < nblocks; m++) { destp = block_owner (m, J); if (destp != me && !saved[destp]) { ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL); saved[destp] = 1; } } } } ARMCI_WaitAll(); ARMCI_AllFence(); MP_BARRIER(); /* modify subsequent block columns */ for (i=kl, I=K+1; i<n; i+=bs, I++) { il = i+bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } for (j=kl, J=K+1; j<n; j+=bs, J++) { jl = j + bs; if (jl > n) { jl = n; strJ= jl - j; } else { strJ = bs; } if (block_owner(I, J) == me) { /* parcel out blocks */ if(block_owner(I,K) == me) A = a[I+K*nblocks]; else { A = bufc[me*nblocks+I]; } if(block_owner(K,J) == me) B = a[K+J*nblocks]; else B = bufr[me*nblocks + J]; C = a[I+J*nblocks]; bmod(A, B, C, strI, strJ, strK, strI, strK, strI); } } } } ARMCI_Free_local(dbuf); }
/*
 * Blocked LU factorisation of the n x n matrix stored as bs x bs blocks in
 * a[], with blocks distributed according to block_owner(). Blocks owned by
 * other processes are fetched on demand with get_remote().
 *
 * For every diagonal step K: the owner factors the diagonal block (lu0);
 * owners of column-K blocks divide them by it (bdiv); owners of row-K blocks
 * update them (bmodd); after a barrier every owner updates its trailing
 * blocks (bmod).
 *
 * n:  matrix dimension.
 * bs: block dimension; edge blocks are smaller when bs does not divide n.
 * me: rank of the calling process.
 */
void lu(int n, int bs, int me)
{
    int i, j, k, kl;
    int I, J, K;
    double *A, *B, *C, *D;
    int strI, strJ, strK;
    int diagowner;
    double *buf1, *buf2;

    /* temporary memories for blocks fetched from other processes */
    buf1 = (double *)malloc(block_size*block_size*sizeof(double));
    buf2 = (double *)malloc(block_size*block_size*sizeof(double));
    if (buf1 == NULL || buf2 == NULL) {
        fprintf(stderr, "Could not malloc memory for the lu block buffers\n");
        exit(EXIT_FAILURE);
    }

    for (k = 0, K = 0; k < n; k += bs, K++) {
        kl = k + bs;
        if (kl > n)
            kl = n;
        strK = kl - k; /* extent of block K: bs, or the remainder at the edge */

        /* factor diagonal block */
        diagowner = block_owner(K, K);
        if (diagowner == me) {
            A = a[K+K*nblocks];
            lu0(A, strK, strK);
        }
        MP_BARRIER();

        /* divide column k by diagonal block */
        if (block_owner(K, K) == me)
            D = a[K+K*nblocks];
        else {
            D = buf1;
            get_remote(D, K, K);
        }

        for (i = kl, I = K+1; i < n; i += bs, I++) {
            if (block_owner(I, K) == me) { /* parcel out blocks */
                strI = (i + bs > n) ? n - i : bs;
                A = a[I+K*nblocks];
                bdiv(A, D, strI, strK, strI, strK);
            }
        }

        /* modify row k by diagonal block */
        for (j = kl, J = K+1; j < n; j += bs, J++) {
            if (block_owner(K, J) == me) { /* parcel out blocks */
                strJ = (j + bs > n) ? n - j : bs;
                A = a[K+J*nblocks];
                bmodd(D, A, strK, strJ, strK, strK);
            }
        }
        MP_BARRIER();

        /* modify subsequent block columns */
        for (i = kl, I = K+1; i < n; i += bs, I++) {
            strI = (i + bs > n) ? n - i : bs;

            /* D is no longer needed at this point, so buf1 can be reused */
            if (block_owner(I, K) == me)
                A = a[I+K*nblocks];
            else {
                A = buf1;
                get_remote(A, I, K);
            }

            for (j = kl, J = K+1; j < n; j += bs, J++) {
                strJ = (j + bs > n) ? n - j : bs;
                if (block_owner(I, J) == me) { /* parcel out blocks */
                    if (block_owner(K, J) == me)
                        B = a[K+J*nblocks];
                    else {
                        B = buf2;
                        get_remote(B, K, J);
                    }
                    C = a[I+J*nblocks];
                    bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
                }
            }
        }
    }

    free(buf1);
    free(buf2);
}
void test_2D() { int i; int src, dst; int ierr; double *buf; void *ptr[MAXPROC], *get_ptr[MAXPROC]; /* find who I am and the dst process */ src = me; #ifdef MALLOC_LOC if(me == 0) { buf = (double *)ARMCI_Malloc_local(SIZE * SIZE * sizeof(double)); assert(buf != NULL); } #else if(me == 0) { buf = (double *)malloc(SIZE * SIZE * sizeof(double)); assert(buf != NULL); } #endif ierr = ARMCI_Malloc(ptr, (SIZE * SIZE * sizeof(double))); assert(ierr == 0); assert(ptr[me]); ierr = ARMCI_Malloc(get_ptr, (SIZE * SIZE * sizeof(double))); assert(ierr == 0); assert(get_ptr[me]); /* ARMCI - initialize the data window */ fill_array(ptr[me], SIZE*SIZE, me); fill_array(get_ptr[me], SIZE*SIZE, me); MP_BARRIER(); /* only the proc 0 doest the work */ /* print the title */ if(me == 0) { if(!CHECK_RESULT){ printf(" section get put"); printf(" acc\n"); printf("bytes loop sec MB/s sec MB/s"); printf(" sec MB/s\n"); printf("------- ------ -------- -------- -------- --------"); printf(" -------- --------\n"); fflush(stdout); } for(i=0; i<CHUNK_NUM; i++) { int loop; int bytes = chunk[i] * chunk[i] * sizeof(double); double t_get = 0, t_put = 0, t_acc = 0; double latency_get, latency_put, latency_acc; double bandwidth_get, bandwidth_put, bandwidth_acc; loop = SIZE / chunk[i]; if(loop<2)loop=2; for(dst=1; dst<nproc; dst++) { /* strided get */ fill_array(buf, SIZE*SIZE, me*10); t_get += time_get((double *)(get_ptr[dst]), (double *)buf, chunk[i], loop, dst, 1); /* strided put */ fill_array(buf, SIZE*SIZE, me*10); t_put += time_put((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1); /* strided acc */ fill_array(buf, SIZE*SIZE, me*10); t_acc += time_acc((double *)buf, (double *)(ptr[dst]), chunk[i], loop, dst, 1); } latency_get = t_get/(nproc - 1); latency_put = t_put/(nproc - 1); latency_acc = t_acc/(nproc - 1); bandwidth_get = (bytes * (nproc - 1) * 1e-6)/t_get; bandwidth_put = (bytes * (nproc - 1) * 1e-6)/t_put; bandwidth_acc = (bytes * (nproc - 1) * 1e-6)/t_acc; /* print */ 
if(!CHECK_RESULT)printf("%d\t%d\t%.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, loop, latency_get, bandwidth_get, latency_put, bandwidth_put, latency_acc, bandwidth_acc); } } else sleep(3); ARMCI_AllFence(); MP_BARRIER(); /* cleanup */ ARMCI_Free(get_ptr[me]); ARMCI_Free(ptr[me]); #ifdef MALLOC_LOC if(me == 0) ARMCI_Free_local(buf); #else if(me == 0) free(buf); #endif }
main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; int nloop=5; double **ptr_loc; MP_INIT(arc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } ptr = (void **)malloc(nproc * sizeof(void *)); #ifdef MPI2_ONESIDED MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]); MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me]; MPI_Barrier(MPI_COMM_WORLD); #else /* initialize ARMCI */ ARMCI_Init(); ARMCI_Malloc(ptr, proc_bytes); #endif a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nproc*sizeof(double *)); 
for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks; i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } lu(n, block_size, me); /* cold start */ /* Starting the timer */ MP_BARRIER(); if(me == 0) start_timer(); for(i=0; i<nloop; i++) lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()/nloop); printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ #ifdef MPI2_ONESIDED MPI_Win_free(&win); MPI_Free_mem(ptr[me]); #else ARMCI_Free(ptr[me]); ARMCI_Finalize(); #endif MP_FINALIZE(); }
int main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; MP_INIT(argc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } /* num_rows = (int) sqrt((double) nproc); */ /* for (;;) { */ /* num_cols = nproc/num_rows; */ /* if (num_rows*num_cols == nproc) */ /* break; */ /* num_rows--; */ /* } */ nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } nnodes = nproc / 4; if((nnodes * 4) != nproc) { num_cols = nproc - nnodes * 4; nnodes++; num_rows = 1; } else { num_cols = 2; num_rows = 2; } num = (nblocks * nblocks)/nnodes; if((num * nnodes) != (nblocks * nblocks)) num++; #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif edge = n%block_size; if (edge == 0) { edge = block_size; } for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double 
**)malloc(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); MP_FINALIZE(); return 0; }