/*
 * Driver for the test of the ARMCI wrappers to the basic message-passing
 * operations.
 *
 * Starts the message-passing layer, refuses to run with fewer than two
 * processes, initializes ARMCI, runs TestGlobals() and shuts both layers
 * down again.  Returns 0; exits with status 0 when too few processes
 * were started.
 */
int main(int argc, char **argv)
{
    MP_INIT(argc, argv);
    MP_MYID(&me);
    MP_PROCS(&nproc);

    if (nproc < 2) {
        /* Report the process count actually received; the test has no
         * upper limit to print here. */
        if (me == 0)
            fprintf(stderr, "USAGE: 2 <= processes - got %d\n", nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if (me == 0) {
        printf("Test of ARMCI Wrappers to Basic Message Passing Operations\n");
        fflush(stdout);
    }

    /* initialize ARMCI */
    ARMCI_Init();

    MP_BARRIER();

    TestGlobals();

    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
int main(int argc, char **argv) { ARMCI_NetInit(); MP_INIT(argc,argv); MP_MYID(&me); MP_PROCS(&nproc); if(nproc < 2 || nproc> MAXPROC) { if(me == 0) fprintf(stderr, "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc); MP_BARRIER(); MP_FINALIZE(); exit(0); } /* initialize ARMCI */ ARMCI_Init(); if(!me)printf("\n Performance of Basic Blocking Communication Operations\n"); MP_BARRIER(); CHECK_RESULT=1; if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); CHECK_RESULT=0; /* test 1 dimension array */ if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); /* test 2 dimension array */ if(!me)printf("\n\t\t\tStrided Data Transfer\n"); test_2D(); MP_BARRIER(); if(me == 0){ if(warn_accuracy) printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n",warn_accuracy); printf("\n\n------------ Testing the same data transfer for correctness ----------\n"); fflush(stdout); } MP_BARRIER(); CHECK_RESULT=1; if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); if(me == 0) printf("OK\n"); MP_BARRIER(); if(!me)printf("\n\t\t\tStrided Data Transfer\n"); test_2D(); if(me == 0) printf("OK\n\n\nTests Completed.\n"); MP_BARRIER(); /* done */ ARMCI_Finalize(); MP_FINALIZE(); return(0); }
/*
 * Driver for the ARMCI lock test.
 *
 * Starts the message-passing layer and ARMCI, runs test_lock() and
 * reports success from rank 0 before shutting everything down.
 * Returns 0.
 */
int main(int argc, char* argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (me == 0) {
        printf("ARMCI test program for lock(%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    test_lock();

    MP_BARRIER();
    if (me == 0) {
        printf("test passed\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/*
 * Exercise a blocking put inside one ARMCI group.
 *
 * Every member allocates ELEMS doubles with ARMCI_Malloc_group() and
 * fills them with -1.0.  Group rank 0 then puts a known pattern into the
 * buffer of the last group rank, which verifies it and aborts through
 * ARMCI_Error() on a mismatch.
 *
 * group    - group to test; every member must call this function.
 * pid_list - not used by this function; kept for the callers' signature.
 */
void test_one_group(ARMCI_Group *group, int *pid_list)
{
    int grp_me, grp_size;
    int i, j, src_proc, dst_proc;
    double *ddst_put[MAXPROC];
    double dsrc[ELEMS];
    int bytes, world_me;

    (void)pid_list;

    MP_MYID(&world_me);
    ARMCI_Group_rank(group, &grp_me);
    ARMCI_Group_size(group, &grp_size);
    if (grp_me == 0)
        printf("GROUP SIZE = %d\n", grp_size);
    printf("%d:group rank = %d\n", me, grp_me);

    src_proc = 0;
    dst_proc = grp_size - 1;

    bytes = ELEMS * sizeof(double);
    ARMCI_Malloc_group((void **)ddst_put, bytes, group);

    for (i = 0; i < ELEMS; i++)
        dsrc[i] = i * 1.001 * (grp_me + 1);
    for (i = 0; i < ELEMS; i++)
        ddst_put[grp_me][i] = -1.0;

    armci_msg_group_barrier(group);

    if (grp_me == src_proc) {
        /* NOTE: make sure to specify absolute ids in ARMCI calls */
        ARMCI_Put(dsrc, &ddst_put[dst_proc][0], bytes,
                  ARMCI_Absolute_id(group, dst_proc));
        /* Fence on the origin BEFORE the barrier, so the data has reached
         * the destination by the time it leaves the barrier and starts
         * verifying.  A fence only covers operations issued by the
         * calling process, so the other ranks have nothing to fence. */
        ARMCI_Fence(ARMCI_Absolute_id(group, dst_proc));
    }

    armci_msg_group_barrier(group);
    sleep(1); /* delay kept from the original test sequence */

    /* Verify */
    if (grp_me == dst_proc) {
        for (j = 0; j < ELEMS; j++) {
            if (ARMCI_ABS(ddst_put[grp_me][j] - j * 1.001 * (src_proc + 1)) > 0.1) {
                printf("\t%d: ddst_put[%d][%d] = %lf and expected value is %lf\n",
                       me, grp_me, j, ddst_put[grp_me][j],
                       j * 1.001 * (src_proc + 1));
                ARMCI_Error("groups: armci put failed...1", 0);
            }
        }
        printf("\n%d(%d): Test O.K. Verified\n", dst_proc, world_me);
    }

    armci_msg_group_barrier(group);
    ARMCI_Free_group(ddst_put[grp_me], group);
}
void test_groups_noncollective() { int *pid_lists[MAX_GROUPS]; int pids[MAXPROC]; int i, nprocs, world_me; ARMCI_Group group; int *my_pid_list=NULL, my_grp_size=0; int ngrps; MP_BARRIER(); MP_PROCS(&nprocs); MP_MYID(&world_me); random_permute(pids, nproc); ngrps = nprocs/GROUP_SIZE; for(i=0; i<nprocs/GROUP_SIZE; i++) { pid_lists[i] = pids + (i*GROUP_SIZE); } for(i=0; i<nprocs; i++) { if(pids[i] == world_me) { int grp_id = ARMCI_MIN(i/GROUP_SIZE, ngrps-1); my_pid_list = pid_lists[grp_id]; if(grp_id == ngrps-1) my_grp_size = GROUP_SIZE + (nprocs%GROUP_SIZE); else my_grp_size = GROUP_SIZE; } } qsort(my_pid_list, my_grp_size, sizeof(int), int_compare); MP_BARRIER(); /*now create all these disjoint groups and test them in parallel*/ ARMCI_Group_create(my_grp_size, my_pid_list, &group); test_one_group(&group, my_pid_list); ARMCI_Group_free(&group); ARMCI_AllFence(); MP_BARRIER(); if(world_me==0){printf("O.K.\n"); fflush(stdout);} }
/*
 * Driver for the ARMCI group tests.
 *
 * Runs the collective group test, and the non-collective one as well
 * when ARMCI_GROUP is defined.  Rank 0 reports progress.  Returns 0.
 */
int main(int argc, char* argv[])
{
    int root;

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);
    root = (me == 0);

    if (root) {
        /* Process count must lie in [MINPROC, MAXPROC]. */
        if (nproc < MINPROC || nproc > MAXPROC)
            ARMCI_Error("Test works for up to %d processors\n", MAXPROC);

        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (root) {
        printf("\n Testing ARMCI Groups!\n\n");
        fflush(stdout);
    }

    test_groups();

    ARMCI_AllFence();
    MP_BARRIER();
    if (root) {
        printf("\n Collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);

#ifdef ARMCI_GROUP
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if (root) {
        printf("\n Non-collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);
#endif

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/*
 * Driver for the non-blocking put/get/acc performance test.
 *
 * Needs between 2 and MAXPROC processes.  Calls test_perf_nb() twice,
 * first with argument 1 and then with 0.  Returns 0.
 */
int main(int argc, char* argv[])
{
    int root;

    ARMCI_NetInit();
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);
    root = (me == 0);

    if (!(nproc >= 2 && nproc <= MAXPROC)) {
        if (root)
            fprintf(stderr, "USAGE: 2 <= processes < %d - got %d\n",
                    MAXPROC, nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if (root) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (root) {
        printf("\n put/get/acc requests (Time in secs)\n\n");
        fflush(stdout);
    }

    test_perf_nb(1);
    test_perf_nb(0);

    ARMCI_AllFence();
    MP_BARRIER();
    if (root) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/*
 * Driver for the aggregate put/get test.
 *
 * Runs test_aggregate() once as a cold start and once as a warm start,
 * then reports success from rank 0.  Returns 0.
 */
int main(int argc, char* argv[])
{
    int root;

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);
    root = (me == 0);

    if (root) {
        if (nproc > MAXPROC)
            ARMCI_Error("Test works for up to %d processors\n", MAXPROC);

        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (root) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }

    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */

    ARMCI_AllFence();
    MP_BARRIER();
    if (root) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; int nloop=5; double **ptr_loc; MP_INIT(arc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } ptr = (void **)malloc(nproc * sizeof(void *)); #ifdef MPI2_ONESIDED MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]); MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me]; MPI_Barrier(MPI_COMM_WORLD); #else /* initialize ARMCI */ ARMCI_Init(); ARMCI_Malloc(ptr, proc_bytes); #endif a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nproc*sizeof(double *)); 
for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks; i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } lu(n, block_size, me); /* cold start */ /* Starting the timer */ MP_BARRIER(); if(me == 0) start_timer(); for(i=0; i<nloop; i++) lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()/nloop); printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ #ifdef MPI2_ONESIDED MPI_Win_free(&win); MPI_Free_mem(ptr[me]); #else ARMCI_Free(ptr[me]); ARMCI_Finalize(); #endif MP_FINALIZE(); }
int main(int argc, char* argv[]) { int i; struct timeval start_time[14]; struct timeval stop_time[14]; /* char * test_name[14] = { "dim", "nbdim", "vec_small", "acc", "vector", "vector_acc", "fetch_add", "swap", "rput", "aggregate", "implicit", "memlock", "acc_type", "collective" }; int test_flags[14] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }; */ char * test_name[2] = { "acc_type", "collective" }; int test_flags[2] = { 1, 1 }; #define TEST_ACC_TYPE 0 #define TEST_COLLECTIVE 1 MP_INIT(argc, argv); ARMCI_Init(); MP_PROCS(&nproc); MP_MYID(&me); if(nproc > MAXPROC && me == 0) ARMCI_Error("Test works for up to %d processors\n",MAXPROC); if(me == 0) { printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } gettimeofday(&start_time[TEST_ACC_TYPE],NULL); if(test_flags[TEST_ACC_TYPE] == 1) { if(me == 0) { printf("\nTesting Accumulate Types\n"); fflush(stdout); } MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_INT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_INT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_LNG\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_LNG); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_FLT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_FLT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DBL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DBL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_CPL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_CPL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DCP\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DCP); ARMCI_AllFence(); MP_BARRIER(); } gettimeofday(&stop_time[TEST_ACC_TYPE],NULL); gettimeofday(&start_time[TEST_COLLECTIVE],NULL); if(test_flags[TEST_COLLECTIVE] == 1) { if(me == 0) { printf("\nTesting Collective Types\n"); fflush(stdout); } if(me == 0) { printf("Test Collective ARMCI_INT\n"); 
fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_INT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_LONG\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_LONG); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_FLOAT\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_FLOAT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_DOUBLE\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_DOUBLE); MP_BARRIER(); } gettimeofday(&stop_time[TEST_COLLECTIVE],NULL); if(me == 0) { printf("Accumulate and Collective tests passed\n"); fflush(stdout); } if(me == 0) { printf("Testcase runtime\n"); printf("Name,Time(seconds)\n"); for(i = 0; i < 2; i++) if(test_flags[i] == 1) { double time_spent = (stop_time[i].tv_sec - start_time[i].tv_sec) + ((double) stop_time[i].tv_usec - start_time[i].tv_usec) / 1E6; printf("%s,%.6f\n", test_name[i], time_spent); } } MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
int main(int argc, char **argv) { int i; double **myptrs; double t0, t1, tnbget=0, tnbwait=0, t2=0; MP_INIT(argc,argv); ARMCI_Init(); MP_PROCS(&nprocs); MP_MYID(&me); if (nprocs < 2) ARMCI_Error("This program requires at least to processes", 1); myptrs = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); MP_BARRIER(); if(me == 0) { for(i = 0; i < 10; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1); } t0 = MP_TIMER(); for(i = 0; i < LOOP; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+1, myptrs[me]+i, sizeof(double), me+1); } t1 = MP_TIMER(); printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP); fflush(stdout); t1 = t0 = 0; for(i = 0; i < LOOP; i++) { armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MP_TIMER(); //ARMCI_NbGet(myptrs[me]+i, myptrs[me+1]+i, sizeof(double), me+1, &nbh); ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh); t1 = MP_TIMER(); ARMCI_Wait(&nbh); t2 = MP_TIMER(); tnbget += (t1-t0); tnbwait += (t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); } else sleep(1); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
int main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; MP_INIT(argc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } /* num_rows = (int) sqrt((double) nproc); */ /* for (;;) { */ /* num_cols = nproc/num_rows; */ /* if (num_rows*num_cols == nproc) */ /* break; */ /* num_rows--; */ /* } */ nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } nnodes = nproc / 4; if((nnodes * 4) != nproc) { num_cols = nproc - nnodes * 4; nnodes++; num_rows = 1; } else { num_cols = 2; num_rows = 2; } num = (nblocks * nblocks)/nnodes; if((num * nnodes) != (nblocks * nblocks)) num++; #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif edge = n%block_size; if (edge == 0) { edge = block_size; } for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double 
**)malloc(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
int main(int argc, char *argv[]) { int ch; extern char *optarg; int i, j, r; thread_t threads[MAX_TPP]; /* init MP */ MP_INIT(argc,argv); MP_PROCS(&size); MP_MYID(&rank); while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) { switch(ch) { case 't': /* # of threads */ tpp = atoi(optarg); if (tpp < 1 || tpp > MAX_TPP) { PRINTF0("\"%s\" is improper value for -t, should be a " "number between 1 and %d(MAX_TPP)\n", optarg, MAX_TPP); usage(); } break; case 'i': /* # of iterations */ iters = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -t, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 's': /* # of elements in the array */ asize = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -s, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 'd': delay = atoi(optarg); break; /* delay before start */ case 'h': usage(); break; /* print usage info */ } } #ifdef NOTHREADS tpp = 1; PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n"); #endif th_size = size * tpp; PRINTF0("\nTest of multi-threaded capabilities:\n" "%d threads per process (%d threads total),\n" "%d array elements of size %d,\n" "%d iteration(s)\n\n", tpp, th_size, asize, sizeof(atype_t), iters); if (delay) { printf("%d: %d\n", rank, getpid()); fflush(stdout); sleep(delay); MP_BARRIER(); } TH_INIT(size,tpp); for (i = 0; i < tpp; i++) th_rank[i] = rank * tpp + i; #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) { fname[10] = '0' + th_rank[i] / 100; fname[11] = '0' + th_rank[i] % 100 / 10; fname[12] = '0' + th_rank[i] % 10; dbg[i] = fopen(fname, "w"); } #endif for (i = 0; i < tpp; i++) prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]); /* init ARMCI */ ARMCI_Init(); /* set global seed (to ensure same random sequence across procs) */ time_seed = (unsigned)time(NULL); armci_msg_brdcst(&time_seed, sizeof(time_seed), 0); srand(time_seed); rand(); prndbg(0, "seed = %u\n", 
time_seed); /* random pairs */ pairs = calloc(th_size, sizeof(int)); for (i = 0; i < th_size; i++) pairs[i] = -1; for (i = 0; i < th_size; i++) { if (pairs[i] != -1) continue; r = RND(0, th_size); while (i == r || pairs[r] != -1 ) r = RND(0, th_size); pairs[i] = r; pairs[r] = i; } for (i = 0, cbufl = 0; i < th_size; i++) cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d", i, pairs[i], pairs[i], pairs[pairs[i]]); prndbg(0, "random pairs:%s\n", cbuf); /* random targets */ rnd_tgts = calloc(th_size, sizeof(int)); for (i = 0, cbufl = 0; i < th_size; i++) { rnd_tgts[i] = RND(0, th_size); if (rnd_tgts[i] == i) { i--; continue; } cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]); } prndbg(0, "random targets:%s\n", cbuf); /* random one */ rnd_one = RND(0, th_size); prndbg(0, "random one = %d\n", rnd_one); assert(ptrs1 = calloc(th_size, sizeof(void *))); assert(ptrs2 = calloc(th_size, sizeof(void *))); #ifdef NOTHREADS thread_main((void *)(long)0); #else for (i = 0; i < tpp; i++) THREAD_CREATE(threads + i, thread_main, (void *)(long)i); for (i = 0; i < tpp; i++) THREAD_JOIN(threads[i], NULL); #endif MP_BARRIER(); PRINTF0("Tests Completed\n"); /* clean up */ #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) fclose(dbg[i]); #endif ARMCI_Finalize(); TH_FINALIZE(); MP_FINALIZE(); return 0; }
main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; void **bufr_g, **bufc_g; MP_INIT(arc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\nUsing pre-PUTing\n"); printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i 
== nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *)); bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *)); if (bufr == NULL || bufc == NULL) printf("Could not ARMCI_Malloc_local() mem\n"); /* bufr points to all k-th row blocks */ /* save all block address in row-major order */ proc_bytes = nblocks*block_size*block_size * sizeof(double); bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(bufr_g, proc_bytes); for (i = 0; i < nproc; i++) { bufr[i*nblocks] = (double *) bufr_g[i]; for (j = 1; j < nblocks; j++) { bufr[i*nblocks + j] = bufr[i*nblocks + j-1] + block_size * block_size; } } /* bufc points to all k-th column blocks */ bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(bufc_g, proc_bytes); for (i = 0; i < nproc; i++) { bufc[i*nblocks] = (double *) bufc_g[i]; for (j = 1; j < nblocks; j++) { bufc[i*nblocks + j] = bufc[i*nblocks + j-1] + block_size * block_size; } } /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Free(bufc_g[me]); ARMCI_Free(bufr_g[me]); ARMCI_Finalize(); MP_FINALIZE(); }