/* ARMCI benchmark: basic blocking communication operations, timed first and
 * then re-run with result checking enabled. */
int main(int argc, char **argv)
{
    ARMCI_NetInit();
    MP_INIT(argc, argv);
    MP_MYID(&me);
    MP_PROCS(&nproc);

    if (nproc < 2 || nproc > MAXPROC) {
        if (me == 0)
            fprintf(stderr, "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    /* initialize ARMCI */
    ARMCI_Init();

    if (!me) printf("\n Performance of Basic Blocking Communication Operations\n");
    MP_BARRIER();

    /* first pass with result checking enabled, then disable it for the timed runs */
    CHECK_RESULT = 1;
    if (!me) printf("\n\t\t\tContiguous Data Transfer\n");
    test_1D();
    CHECK_RESULT = 0;

    /* test 1-dimensional arrays */
    if (!me) printf("\n\t\t\tContiguous Data Transfer\n");
    test_1D();

    /* test 2-dimensional arrays */
    if (!me) printf("\n\t\t\tStrided Data Transfer\n");
    test_2D();

    MP_BARRIER();
    if (me == 0) {
        if (warn_accuracy)
            printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n",
                   warn_accuracy);
        printf("\n\n------------ Testing the same data transfer for correctness ----------\n");
        fflush(stdout);
    }
    MP_BARRIER();

    CHECK_RESULT = 1;
    if (!me) printf("\n\t\t\tContiguous Data Transfer\n");
    test_1D();
    if (me == 0) printf("OK\n");
    MP_BARRIER();

    if (!me) printf("\n\t\t\tStrided Data Transfer\n");
    test_2D();
    if (me == 0) printf("OK\n\n\nTests Completed.\n");
    MP_BARRIER();

    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
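/* ARMCI test: lock correctness across processes (test_lock). */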
int main(int argc, char *argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (me == 0) {
        printf("ARMCI test program for lock (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();
    test_lock();
    MP_BARRIER();

    if (me == 0) { printf("test passed\n"); fflush(stdout); }
    sleep(2);
    MP_BARRIER();

    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
int main(int argc, char **argv)
{
    MP_INIT(argc, argv);
    MP_MYID(&me);
    MP_PROCS(&nproc);

    if (nproc < 2) {
        if (me == 0)
            fprintf(stderr, "USAGE: requires at least 2 processes - got %d\n", nproc);
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if (me == 0) {
        printf("Test of ARMCI Wrappers to Basic Message Passing Operations\n");
        fflush(stdout);
    }

    /* initialize ARMCI */
    ARMCI_Init();
    MP_BARRIER();

    TestGlobals();

    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
int main(int argc, char **argv) { int status, me; int max_arrays = 10; double max_sz = 1e8, max_disk = 1e10, max_mem = 1e6; int stack = 120000, heap = 3200000; int numfiles, numioprocs; int total, nproc; MP_INIT(argc,argv); GA_Initialize(); me = GA_Nodeid(); nproc = GA_Nnodes(); total = pow(SIZE,NDIM)*sizeof(double); if (!GA_Uses_ma()) { if (GA_Nodeid() == 0) { printf("GA is not using MA\n"); } stack = 100000; heap = (int)(2.2*(float)(total)); } if (MA_init(MT_F_DBL, stack, heap) ) { if (DRA_Init(max_arrays, max_sz, max_disk, max_mem) != 0) GA_Error("DRA_Init failed: ",0); if (USER_CONFIG == 0) { numfiles = -1; numioprocs = -1; } else if (USER_CONFIG == 1) { numfiles = 1; numioprocs = GA_Cluster_nnodes(); } else if (USER_CONFIG == 2) { numfiles = GA_Cluster_nnodes(); numioprocs = GA_Cluster_nnodes(); } else { numfiles = 1; numioprocs = 1; } if (me==0) { printf("Disk resident arrays configured as:\n"); printf(" Number of files: %d\n",numfiles); printf(" Number of I/O processors: %d\n",numioprocs); } DRA_Set_default_config(numfiles,numioprocs); if (me == 0) printf("\n"); if (me == 0) printf("TESTING PERFORMANCE OF DISK ARRAYS\n"); if (me == 0) printf("\n"); test_io_dbl(); status = DRA_Terminate(); GA_Terminate(); } else { printf("MA_init failed\n"); } if(me == 0) printf("all done ...\n"); MP_FINALIZE(); return 0; }
int main(int argc, char* argv[]) { MP_INIT(argc, argv); MP_PROCS(&nproc); MP_MYID(&me); /* printf("nproc = %d, me = %d\n", nproc, me);*/ if( (nproc<MINPROC || nproc>MAXPROC) && me==0) ARMCI_Error("Test works for up to %d processors\n",MAXPROC); if(me==0){ printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } ARMCI_Init(); if(me==0){ printf("\n Testing ARMCI Groups!\n\n"); fflush(stdout); } test_groups(); ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("\n Collective groups: Success!!\n"); fflush(stdout);} sleep(2); #ifdef ARMCI_GROUP test_groups_noncollective(); ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("\n Non-collective groups: Success!!\n"); fflush(stdout);} sleep(2); #endif MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
int main(int argc, char* argv[]) { ARMCI_NetInit(); MP_INIT(argc, argv); MP_PROCS(&nproc); MP_MYID(&me); if(nproc < 2 || nproc> MAXPROC) { if(me == 0) fprintf(stderr, "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc); MP_BARRIER(); MP_FINALIZE(); exit(0); } if(me==0){ printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } ARMCI_Init(); if(me==0){ printf("\n put/get/acc requests (Time in secs)\n\n"); fflush(stdout); } test_perf_nb(1); test_perf_nb(0); ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("\nSuccess!!\n"); fflush(stdout);} sleep(2); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
int main(int argc, char **argv) { int size_dst = 15; int g_a = 0; int I_NEG_ONE = -1; long L_NEG_ONE = -1; long long LL_NEG_ONE = -1; int FIVE = 5; int TEN = 10; int lo; int hi; int *ptr; int i; MP_INIT(argc,argv); GA_INIT(argc,argv); for (i=0; i<3; ++i) { if (0 == i) { g_a = NGA_Create(C_INT, 1, &size_dst, "dst", NULL); GA_Fill(g_a, &I_NEG_ONE); } else if (1 == i) { g_a = NGA_Create(C_LONG, 1, &size_dst, "dst", NULL); GA_Fill(g_a, &L_NEG_ONE); } else if (2 == i) { g_a = NGA_Create(C_LONGLONG, 1, &size_dst, "dst", NULL); GA_Fill(g_a, &LL_NEG_ONE); } GA_Sync(); GA_Print(g_a); NGA_Print_patch(g_a, &FIVE, &TEN, 0); NGA_Print_patch(g_a, &FIVE, &TEN, 1); NGA_Distribution(g_a, GA_Nodeid(), &lo, &hi); NGA_Access(g_a, &lo, &hi, &ptr, NULL); printf("[%d] (%d)=%d\n", GA_Nodeid(), lo, *ptr); NGA_Release(g_a, &lo, &hi); } GA_Terminate(); MP_FINALIZE(); exit(EXIT_SUCCESS); }
int main(int argc, char* argv[]) { MP_INIT(argc, argv); MP_PROCS(&nproc); MP_MYID(&me); /* printf("nproc = %d, me = %d\n", nproc, me);*/ if(nproc>MAXPROC && me==0) ARMCI_Error("Test works for up to %d processors\n",MAXPROC); if(me==0){ printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } ARMCI_Init(); if(me==0){ printf("\nAggregate put/get requests\n\n"); fflush(stdout); } test_aggregate(1); /* cold start */ test_aggregate(0); /* warm start */ ARMCI_AllFence(); MP_BARRIER(); if(me==0){printf("\nSuccess!!\n"); fflush(stdout);} sleep(2); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
int main(int argc, char* argv[]) { int i; struct timeval start_time[14]; struct timeval stop_time[14]; /* char * test_name[14] = { "dim", "nbdim", "vec_small", "acc", "vector", "vector_acc", "fetch_add", "swap", "rput", "aggregate", "implicit", "memlock", "acc_type", "collective" }; int test_flags[14] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }; */ char * test_name[2] = { "acc_type", "collective" }; int test_flags[2] = { 1, 1 }; #define TEST_ACC_TYPE 0 #define TEST_COLLECTIVE 1 MP_INIT(argc, argv); ARMCI_Init(); MP_PROCS(&nproc); MP_MYID(&me); if(nproc > MAXPROC && me == 0) ARMCI_Error("Test works for up to %d processors\n",MAXPROC); if(me == 0) { printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } gettimeofday(&start_time[TEST_ACC_TYPE],NULL); if(test_flags[TEST_ACC_TYPE] == 1) { if(me == 0) { printf("\nTesting Accumulate Types\n"); fflush(stdout); } MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_INT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_INT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_LNG\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_LNG); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_FLT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_FLT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DBL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DBL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_CPL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_CPL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DCP\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DCP); ARMCI_AllFence(); MP_BARRIER(); } gettimeofday(&stop_time[TEST_ACC_TYPE],NULL); gettimeofday(&start_time[TEST_COLLECTIVE],NULL); if(test_flags[TEST_COLLECTIVE] == 1) { if(me == 0) { printf("\nTesting Collective Types\n"); fflush(stdout); } if(me == 0) { printf("Test Collective ARMCI_INT\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_INT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_LONG\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_LONG); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_FLOAT\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_FLOAT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_DOUBLE\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_DOUBLE); MP_BARRIER(); } gettimeofday(&stop_time[TEST_COLLECTIVE],NULL); if(me == 0) { printf("Accumulate and Collective tests passed\n"); fflush(stdout); } if(me == 0) { printf("Testcase runtime\n"); printf("Name,Time(seconds)\n"); for(i = 0; i < 2; i++) if(test_flags[i] == 1) { double time_spent = (stop_time[i].tv_sec - start_time[i].tv_sec) + ((double) stop_time[i].tv_usec - start_time[i].tv_usec) / 1E6; printf("%s,%.6f\n", test_name[i], time_spent); } } MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
int main(int argc, char **argv)
{
    int i;
    double **myptrs;
    double t0, t1, tnbget = 0, tnbwait = 0, t2 = 0;

    MP_INIT(argc, argv);
    ARMCI_Init();
    MP_PROCS(&nprocs);
    MP_MYID(&me);

    if (nprocs < 2)
        ARMCI_Error("This program requires at least two processes", 1);

    myptrs = (double **)malloc(sizeof(double *) * nprocs);
    ARMCI_Malloc((void **)myptrs, LOOP * sizeof(double));
    MP_BARRIER();

    if (me == 0) {
        /* warm up before timing */
        for (i = 0; i < 10; i++) {
            ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
        }

        t0 = MP_TIMER();
        for (i = 0; i < LOOP; i++) {
            ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1);
        }
        t1 = MP_TIMER();
        printf("\nGet Latency=%lf\n", 1e6 * (t1 - t0) / LOOP);
        fflush(stdout);

        t1 = t0 = 0;
        for (i = 0; i < LOOP; i++) {
            armci_hdl_t nbh;
            ARMCI_INIT_HANDLE(&nbh);
            t0 = MP_TIMER();
            ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh);
            t1 = MP_TIMER();
            ARMCI_Wait(&nbh);
            t2 = MP_TIMER();
            tnbget += (t1 - t0);
            tnbwait += (t2 - t1);
        }
        printf("\nNb Get Latency=%lf Nb Wait=%lf\n",
               1e6 * tnbget / LOOP, 1e6 * tnbwait / LOOP);
        fflush(stdout);
    }
    else {
        sleep(1);
    }

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;

    /* ARMCI */
    void **ptr;
    double **ptr_loc;
    void **bufr_g, **bufc_g;

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch (ch) {
        case 'n': n = atoi(optarg); break;
        case 'b': block_size = atoi(optarg); break;
        case 'p': nproc = atoi(optarg); break;
        case 'h':
            printf("Usage: LU, or \n");
            printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
            MP_BARRIER();
            MP_FINALIZE();
            exit(0);
        }
    }

    if (me == 0) {
        printf("\nUsing pre-PUTing\n");
        printf("\n Blocked Dense LU Factorization\n");
        printf(" %d by %d Matrix\n", n, n);
        printf(" %d Processors\n", nproc);
        printf(" %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc / num_rows;
        if (num_rows * num_cols == nproc)
            break;
        num_rows--;
    }

    nblocks = n / block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    edge = n % block_size;
    if (edge == 0) {
        edge = block_size;
    }

#ifdef DEBUG
    if (me == 0)
        for (i = 0; i < nblocks; i++) {
            for (j = 0; j < nblocks; j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif

    for (i = 0; i < nblocks; i++) {
        for (j = 0; j < nblocks; j++) {
            if (block_owner(i, j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge * edge;
                } else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge * block_size;
                } else {
                    size = block_size * block_size;
                }
                proc_bytes += size * sizeof(double);
            }
        }
    }

    /* initialize ARMCI */
    ARMCI_Init();
    ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
    ARMCI_Malloc(ptr, proc_bytes);

    a = (double **)ARMCI_Malloc_local(nblocks * nblocks * sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)ARMCI_Malloc_local(nproc * sizeof(double *));
    for (i = 0; i < nproc; i++)
        ptr_loc[i] = (double *)ptr[i];
    for (i = 0; i < nblocks; i++) {
        for (j = 0; j < nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge * edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge * block_size;
            } else {
                size = block_size * block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }

    /* initialize the array */
    init_array();

    bufr = (double **)ARMCI_Malloc_local(nproc * nblocks * sizeof(double *));
    bufc = (double **)ARMCI_Malloc_local(nproc * nblocks * sizeof(double *));
    if (bufr == NULL || bufc == NULL)
        printf("Could not ARMCI_Malloc_local() mem\n");

    /* bufr points to all k-th row blocks;
     * save all block addresses in row-major order */
    proc_bytes = nblocks * block_size * block_size * sizeof(double);
    bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
    ARMCI_Malloc(bufr_g, proc_bytes);
    for (i = 0; i < nproc; i++) {
        bufr[i*nblocks] = (double *) bufr_g[i];
        for (j = 1; j < nblocks; j++) {
            bufr[i*nblocks + j] = bufr[i*nblocks + j-1] + block_size * block_size;
        }
    }

    /* bufc points to all k-th column blocks */
    bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *));
    ARMCI_Malloc(bufc_g, proc_bytes);
    for (i = 0; i < nproc; i++) {
        bufc[i*nblocks] = (double *) bufc_g[i];
        for (j = 1; j < nblocks; j++) {
            bufc[i*nblocks + j] = bufc[i*nblocks + j-1] + block_size * block_size;
        }
    }

    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if (doprint) {
        if (me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* Starting the timer */
    if (me == 0)
        start_timer();

    lu(n, block_size, me);

    MP_BARRIER();

    /* Timer stops here */
    if (me == 0)
        printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time());

    if (doprint) {
        if (me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* done */
    ARMCI_Free(ptr[me]);
    ARMCI_Free(bufc_g[me]);
    ARMCI_Free(bufr_g[me]);
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
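/* ARMCI benchmark: blocked dense LU factorization with a node-based (4 procs per node) block-owner layout. */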
int main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; MP_INIT(argc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } /* num_rows = (int) sqrt((double) nproc); */ /* for (;;) { */ /* num_cols = nproc/num_rows; */ /* if (num_rows*num_cols == nproc) */ /* break; */ /* num_rows--; */ /* } */ nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } nnodes = nproc / 4; if((nnodes * 4) != nproc) { num_cols = nproc - nnodes * 4; nnodes++; num_rows = 1; } else { num_cols = 2; num_rows = 2; } num = (nblocks * nblocks)/nnodes; if((num * nnodes) != (nblocks * nblocks)) num++; #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif edge = n%block_size; if (edge == 0) { edge = block_size; } for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
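/* ARMCI test: multi-threaded operations; options -t threads, -s elements, -i iterations, -d delay. */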
int main(int argc, char *argv[])
{
    int ch;
    extern char *optarg;
    int i, r;
    thread_t threads[MAX_TPP];

    /* init MP */
    MP_INIT(argc, argv);
    MP_PROCS(&size);
    MP_MYID(&rank);

    while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) {
        switch (ch) {
        case 't': /* # of threads */
            tpp = atoi(optarg);
            if (tpp < 1 || tpp > MAX_TPP) {
                PRINTF0("\"%s\" is improper value for -t, should be a "
                        "number between 1 and %d(MAX_TPP)\n", optarg, MAX_TPP);
                usage();
            }
            break;
        case 'i': /* # of iterations */
            iters = atoi(optarg);
            if (iters < 1) {
                PRINTF0("\"%s\" is improper value for -i, should be a "
                        "number equal or larger than 1\n", optarg);
                usage();
            }
            break;
        case 's': /* # of elements in the array */
            asize = atoi(optarg);
            if (asize < 1) {
                PRINTF0("\"%s\" is improper value for -s, should be a "
                        "number equal or larger than 1\n", optarg);
                usage();
            }
            break;
        case 'd': /* delay before start */
            delay = atoi(optarg);
            break;
        case 'h': /* print usage info */
            usage();
            break;
        }
    }

#ifdef NOTHREADS
    tpp = 1;
    PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n");
#endif
    th_size = size * tpp;

    PRINTF0("\nTest of multi-threaded capabilities:\n"
            "%d threads per process (%d threads total),\n"
            "%d array elements of size %d,\n"
            "%d iteration(s)\n\n",
            tpp, th_size, asize, (int)sizeof(atype_t), iters);

    if (delay) {
        printf("%d: %d\n", rank, getpid());
        fflush(stdout);
        sleep(delay);
        MP_BARRIER();
    }

    TH_INIT(size, tpp);
    for (i = 0; i < tpp; i++)
        th_rank[i] = rank * tpp + i;

#if defined(DEBUG) && defined(LOG2FILE)
    for (i = 0; i < tpp; i++) {
        fname[10] = '0' + th_rank[i] / 100;
        fname[11] = '0' + th_rank[i] % 100 / 10;
        fname[12] = '0' + th_rank[i] % 10;
        dbg[i] = fopen(fname, "w");
    }
#endif
    for (i = 0; i < tpp; i++)
        prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]);

    /* init ARMCI */
    ARMCI_Init();

    /* set global seed (to ensure same random sequence across procs) */
    time_seed = (unsigned)time(NULL);
    armci_msg_brdcst(&time_seed, sizeof(time_seed), 0);
    srand(time_seed);
    rand();
    prndbg(0, "seed = %u\n", time_seed);

    /* random pairs */
    pairs = calloc(th_size, sizeof(int));
    for (i = 0; i < th_size; i++)
        pairs[i] = -1;
    for (i = 0; i < th_size; i++) {
        int r2;
        if (pairs[i] != -1)
            continue;
        r = RND(0, th_size);
        while (i == r || pairs[r] != -1)
            r = RND(0, th_size);
        pairs[i] = r;
        pairs[r] = i;
        (void)r2;
    }
    for (i = 0, cbufl = 0; i < th_size; i++)
        cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d",
                         i, pairs[i], pairs[i], pairs[pairs[i]]);
    prndbg(0, "random pairs:%s\n", cbuf);

    /* random targets */
    rnd_tgts = calloc(th_size, sizeof(int));
    for (i = 0, cbufl = 0; i < th_size; i++) {
        rnd_tgts[i] = RND(0, th_size);
        if (rnd_tgts[i] == i) {
            i--;
            continue;
        }
        cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]);
    }
    prndbg(0, "random targets:%s\n", cbuf);

    /* random one */
    rnd_one = RND(0, th_size);
    prndbg(0, "random one = %d\n", rnd_one);

    assert(ptrs1 = calloc(th_size, sizeof(void *)));
    assert(ptrs2 = calloc(th_size, sizeof(void *)));

#ifdef NOTHREADS
    thread_main((void *)(long)0);
#else
    for (i = 0; i < tpp; i++)
        THREAD_CREATE(threads + i, thread_main, (void *)(long)i);
    for (i = 0; i < tpp; i++)
        THREAD_JOIN(threads[i], NULL);
#endif
    MP_BARRIER();
    PRINTF0("Tests Completed\n");

    /* clean up */
#if defined(DEBUG) && defined(LOG2FILE)
    for (i = 0; i < tpp; i++)
        fclose(dbg[i]);
#endif
    ARMCI_Finalize();
    TH_FINALIZE();
    MP_FINALIZE();

    return 0;
}
int main( int argc, char **argv ) { int g_a, g_b, i, j, size, size_me; int icnt, idx, jdx, ld; int n=N, type=MT_C_INT, one; int *values, *ptr; int **indices; int dims[2]={N,N}; int lo[2], hi[2]; int heap=3000000, stack=2000000; int me, nproc; int datatype, elements; double *prealloc_mem; MP_INIT(argc,argv); #if 1 GA_INIT(argc,argv); /* initialize GA */ me=GA_Nodeid(); nproc=GA_Nnodes(); if(me==0) { if(GA_Uses_fapi())GA_Error("Program runs with C array API only",1); printf("\nUsing %ld processes\n",(long)nproc); fflush(stdout); } heap /= nproc; stack /= nproc; if(! MA_init(MT_F_DBL, stack, heap)) GA_Error("MA_init failed",stack+heap); /* initialize memory allocator*/ /* Create a regular matrix. */ if(me==0)printf("\nCreating matrix A of size %d x %d\n",N,N); g_a = NGA_Create(type, 2, dims, "A", NULL); if(!g_a) GA_Error("create failed: A",n); /* Fill matrix using scatter routines */ size = N*N; if (size%nproc == 0) { size_me = size/nproc; } else { i = size - size%nproc; size_me = i/nproc; if (me < size%nproc) size_me++; } /* Check that sizes are all okay */ i = size_me; GA_Igop(&i,1,"+"); if (i != size) { GA_Error("Sizes don't add up correctly: ",i); } else if (me==0) { printf("\nSizes add up correctly\n"); } /* Allocate index and value arrays */ indices = (int**)malloc(size_me*sizeof(int*)); values = (int*)malloc(size_me*sizeof(int)); icnt = me; for (i=0; i<size_me; i++) { values[i] = icnt; idx = icnt%N; jdx = (icnt-idx)/N; if (idx >= N || idx < 0) { printf("p[%d] Bogus index i: %d\n",me,idx); } if (jdx >= N || jdx < 0) { printf("p[%d] Bogus index j: %d\n",me,jdx); } indices[i] = (int*)malloc(2*sizeof(int)); (indices[i])[0] = idx; (indices[i])[1] = jdx; icnt += nproc; } /* Scatter values into g_a */ NGA_Scatter(g_a, values, indices, size_me); GA_Sync(); /* Check to see if contents of g_a are correct */ NGA_Distribution( g_a, me, lo, hi ); NGA_Access(g_a, lo, hi, &ptr, &ld); for (i=lo[0]; i<hi[0]; i++) { idx = i-lo[0]; for (j=lo[1]; j<hi[1]; j++) { jdx = j-lo[1]; if (ptr[idx*ld+jdx] != j*N+i) { printf("p[%d] (Scatter) expected: %d actual: %d\n",me,j*N+i,ptr[idx*ld+jdx]); } } } if (me==0) printf("\nCompleted test of NGA_Scatter\n"); for (i=0; i<size_me; i++) { values[i] = 0; } GA_Sync(); NGA_Gather(g_a, values, indices, size_me); icnt = me; for (i=0; i<size_me; i++) { if (icnt != values[i]) { printf("p[%d] (Gather) expected: %d actual: %d\n",me,icnt,values[i]); } icnt += nproc; } if (me==0) printf("\nCompleted test of NGA_Gather\n"); GA_Sync(); /* Scatter-accumulate values back into GA*/ one = 1; NGA_Scatter_acc(g_a, values, indices, size_me, &one); GA_Sync(); /* Check to see if contents of g_a are correct */ for (i=lo[0]; i<hi[0]; i++) { idx = i-lo[0]; for (j=lo[1]; j<hi[1]; j++) { jdx = j-lo[1]; if (ptr[idx*ld+jdx] != 2*(j*N+i)) { printf("p[%d] (Scatter_acc) expected: %d actual: %d\n",me,2*(j*N+i),ptr[idx*ld+jdx]); } } } if (me==0) printf("\nCompleted test of NGA_Scatter_acc\n"); NGA_Release(g_a, lo, hi); /* Test fixed buffer size */ NGA_Alloc_gatscat_buf(size_me); /* Scatter-accumulate values back into GA*/ GA_Sync(); NGA_Scatter_acc(g_a, values, indices, size_me, &one); GA_Sync(); /* Check to see if contents of g_a are correct */ for (i=lo[0]; i<hi[0]; i++) { idx = i-lo[0]; for (j=lo[1]; j<hi[1]; j++) { jdx = j-lo[1]; if (ptr[idx*ld+jdx] != 3*(j*N+i)) { printf("p[%d] (Scatter_acc) expected: %d actual: %d\n",me,3*(j*N+i),ptr[idx*ld+jdx]); } } } if (me==0) printf("\nCompleted test of NGA_Scatter_acc using fixed buffers\n"); NGA_Release(g_a, lo, hi); NGA_Free_gatscat_buf(); GA_Destroy(g_a); 
if(me==0)printf("\nSuccess\n"); GA_Terminate(); #endif MP_FINALIZE(); return 0; }
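/* GA/Elemental benchmark: strided (2-D) transfer performance; the 1-D contiguous test is present but disabled. */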
int main(int argc, char **argv) { /* initialize GA */ #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &me ); ElMPICommSize( MPI_COMM_WORLD, &nproc ); // instantiate el::global array ElGlobalArraysConstruct_d( &eldga ); // initialize global arrays ElGlobalArraysInitialize_d( eldga ); #else MP_INIT(argc,argv); GA_Initialize_args(&argc, &argv); me = GA_Nodeid(); nproc = GA_Nnodes(); #endif if (nproc < 2) { if (me == 0) { fprintf(stderr, "USAGE: 2 <= processes - got %d\n", nproc); } #if defined(USE_ELEMENTAL) ElGlobalArraysTerminate_d( eldga ); // call el::global arrays destructor ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Terminate(); MP_FINALIZE(); #endif exit(0); } if (!me) { printf("\n Performance of Basic Blocking Communication Operations\n"); } #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); #else GA_Sync(); #endif /* test 1 dimension array */ /* if (!me) { printf("\n\t\t\tContiguous Data Transfer\n"); } test_1D(); */ /* test 2 dimension array */ if (!me) { printf("\n\t\t\tStrided Data Transfer\n"); } test_2D(); #if 0 if (me == 0) { if (warn_accuracy) { printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n", warn_accuracy); } printf("\n\n------------ Now we test the same data transfer for correctness ----------\n"); fflush(stdout); } #endif #if defined(USE_ELEMENTAL) ElGlobalArraysTerminate_d( eldga ); // call el::global arrays destructor ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Terminate(); MP_FINALIZE(); #endif return(0); }
int main(int argc, char **argv) { Integer heap=9000000, stack=9000000; int me, nproc; DoublePrecision time; MP_INIT(argc,argv); GA_INIT(argc,argv); /* initialize GA */ nproc = GA_Nnodes(); me = GA_Nodeid(); if(me==0) printf("Using %d processes\n\n",nproc); if (me==0) printf ("Matrix size is %d X %d\n",N,N); #ifdef USE_REGULAR if (me == 0) printf("\nUsing regular data distribution\n\n"); #endif #ifdef USE_SIMPLE_CYCLIC if (me == 0) printf("\nUsing simple block-cyclic data distribution\n\n"); #endif #ifdef USE_SCALAPACK if (me == 0) printf("\nUsing ScaLAPACK data distribution\n\n"); #endif #ifdef USE_TILED if (me == 0) printf("\nUsing tiled data distribution\n\n"); #endif if(!MA_init((Integer)MT_F_DBL, stack/nproc, heap/nproc)) GA_Error("MA_init failed bytes= %d",stack+heap); #ifdef PERMUTE { int i, *list = (int*)malloc(nproc*sizeof(int)); if(!list)GA_Error("malloc failed",nproc); for(i=0; i<nproc;i++)list[i]=nproc-1-i; GA_Register_proclist(list, nproc); free(list); } #endif if(GA_Uses_fapi())GA_Error("Program runs with C API only",1); time = MP_TIMER(); do_work(); /* printf("%d: Total Time = %lf\n", me, MP_TIMER()-time); printf("%d: GEMM Total Time = %lf\n", me, gTime); */ if(me==0)printf("\nSuccess\n\n"); GA_Terminate(); MP_FINALIZE(); return 0; }
int main(int argc, char **argv) { int me; int g_a; int status; int i,j; int dims[] = {n,n}; int proc_group[PROC_LIST_SIZE],proclist[PROC_LIST_SIZE],inode; int sbuf[1],rbuf[1]; MPI_Comm comm; MP_INIT(argc,argv); GA_Initialize(); me = GA_Nodeid(); status = MA_init(MT_DBL, 100000, 100000); if (!status) GA_Error("ma_init failed",-1); status = MA_set_auto_verify(1); status = MA_set_hard_fail(1); status = MA_set_error_print(1); inode = GA_Cluster_nodeid(); if (me == 0) { printf("there are %d nodes, node 0 has %d procs\n", GA_Cluster_nnodes(), GA_Cluster_nprocs(0)); fflush(stdout); } GA_Sync(); for (i=0; i<GA_Cluster_nnodes(); ++i) { for (j=0; j<GA_Cluster_nprocs(i); ++j) { proclist[j]=GA_Cluster_procid(i,j); } proc_group[i]=GA_Pgroup_create(proclist,GA_Cluster_nprocs(i)); } GA_Sync(); for (i=0; i<GA_Cluster_nnodes(); ++i) { if (i == inode) { printf("%d joining group %d\n", me, proc_group[inode]); GA_Pgroup_set_default(proc_group[inode]); g_a = NGA_Create(C_DBL, 2, dims, "a", NULL); if (!g_a) GA_Error("NGA_Create failed",-1); printf("%d Created array of group %d as proc no. %d\n", me, proc_group[inode], GA_Nodeid()); GA_Print_distribution(g_a); comm = GA_MPI_Comm_pgroup_default(); if (comm != MPI_COMM_NULL) { sbuf[0] = GA_Nodeid(); status = MPI_Allreduce(sbuf, rbuf, 1, MPI_INT, MPI_MAX, comm); printf("%d max nodeid is %d\n", me, rbuf[0]); if ((rbuf[0]+1) != GA_Cluster_nprocs(i)) { GA_Error("MPI_Allreduce failed",1); } } else { printf("MPI_Comm was null!\n"); } GA_Pgroup_set_default(GA_Pgroup_get_world()); } GA_Sync(); } GA_Terminate(); MP_FINALIZE(); return 0; }
// note: Sayan: brings down memory requirement to about 268 MB int main(int argc, char **argv) { int me, nproc, g_a = -1, i, j; #if defined(USE_ELEMENTAL) int ndim=2, dims[2]= {N1,N2}; #else int ndim=2, type=MT_F_DBL, dims[2]= {N1,N2}; #endif double *buf; int lo[2], hi[2], ld[1]; double alpha = 1.0; #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &me ); ElMPICommSize( MPI_COMM_WORLD, &nproc ); ElGlobalArrays_d eldga; // instantiate el::global array ElGlobalArraysConstruct_d( &eldga ); // initialize global arrays ElGlobalArraysInitialize_d( eldga ); printf ("INITIALIZED elemental global array...\n"); #else MP_INIT(argc,argv); GA_Initialize_ltd(-1); me=GA_Nodeid(); nproc=GA_Nnodes(); #endif if(me==0) printf("Using %ld processes\n",(long)nproc); if(me==0) printf("memory = %ld bytes\n",((long)N1)*((long)N2)*8); #if defined(USE_ELEMENTAL) // create and allocate a global array printf ("ndim = %d\n", ndim); printf ("dim[0] = %d and dim[1] = %d\n", dims[0], dims[1]); ElGlobalArraysCreate_d( eldga, ndim, dims, "A", &g_a); printf ("CREATED elemental global array...\n"); // print distribution ElGlobalArraysPrint_d( eldga, g_a ); #else g_a = NGA_Create(type, ndim, dims, "A", NULL); GA_Zero(g_a); /* zero the matrix */ GA_Print_distribution(g_a); #endif if(me == 0) { // buf = (double*)(malloc(N1*1024*sizeof(double))); buf = (double*)(malloc(N1*128*sizeof(double))); // for(j = 0; j < N1*1024; ++j) buf[j] = 1.0; // for(i = 0; i < N2/1024; ++i) { for(j = 0; j < N1*128; ++j) buf[j] = 1.0; for(i = 0; i < N2/128; ++i) { lo[0] = 0; hi[0] = lo[0] + N1 -1; /* lo[1] = i*1024; hi[1] = lo[1] + 1024 -1; ld[0] = 1024; */ lo[1] = i*128; hi[1] = lo[1] + 128 -1; ld[0] = 128; printf("NGA_Acc.%d: %d:%d %d:%d\n",i,lo[0],hi[0],lo[1],hi[1]); #if defined(USE_ELEMENTAL) ElGlobalArraysAccumulate_d( eldga, g_a, lo, hi, buf, ld, &alpha ); // there is an explicit flush in NGA_Acc/Put, so when it returns, the buffer // can be reused and data has reached the destination #else NGA_Init_fence(); NGA_Acc(g_a, lo, hi, buf, ld, &alpha); NGA_Fence(); #endif } } #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); ElGlobalArraysDestroy_d( eldga, g_a ); ElGlobalArraysTerminate_d( eldga ); // call el::global arrays destructor ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Sync(); GA_Destroy(g_a); GA_Terminate(); MP_FINALIZE(); #endif return 0; }
/*
 * test ga_dgemm
 * Note: - change nummax for large arrays
 *       - turn off "dgemm_verify" for large arrays due to memory
 *         limitations, as dgemm_verify=1 for large arrays produces
 *         a segfault or dumps core.
 */
int main(int argc, char **argv)
{
    int num_m;
    int num_n;
    int num_k;
    int i;
    int ii;
    double *h0;
    int g_c;
    int g_b;
    int g_a;
    double a;
    double t1;
    double mf;
    double avg_t[ntrans];
    double avg_mf[ntrans];
    int itime;
    int ntimes;
    int nums_m[/*howmany*/] = {512,1024};
    int nums_n[/*howmany*/] = {512,1024};
    int nums_k[/*howmany*/] = {512,1024};
    char transa[/*ntrans*/] = "ntnt";
    char transb[/*ntrans*/] = "nntt";
    char ta;
    char tb;
    double *tmpa;
    double *tmpb;
    double *tmpc;
    int ndim;
    int dims[2];
#ifdef BLOCK_CYCLIC
    int block_size[2];
#endif

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );
    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
#else
    MP_INIT(argc,argv);
    if (!MA_init(MT_DBL,1,20000000)) {
        GA_Error("failed: ma_init(MT_DBL,1,20000000)",10);
    }
    GA_INIT(argc,argv);
    me = GA_Nodeid();
#endif

    h0 = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpa = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpb = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpc = (double*)malloc(sizeof(double) * nummax*nummax);

    ii = 0;
    for (i=0; i<nummax*nummax; i++) {
        ii = ii + 1;
        if (ii > nummax) {
            ii = 0;
        }
        h0[i] = ii;
    }

    /* Compute times assuming 500 mflops and 5 second target time */
    /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */
    ntimes = 5;

    for (ii=0; ii<howmany; ii++) {
        num_m = nums_m[ii];
        num_n = nums_n[ii];
        num_k = nums_k[ii];
        a = 0.5/(num_m*num_n);
        if (num_m > nummax || num_n > nummax || num_k > nummax) {
            GA_Error("Insufficient memory: check nummax", 1);
        }

#ifndef BLOCK_CYCLIC
        ndim = 2;
        /* dims[0] = num_m; dims[1] = num_n; */
        dims[1] = num_m;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c );
#else
        if (!(g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL))) {
            GA_Error("failed: create g_c",20);
        }
#endif
        /* dims[0] = num_k; dims[1] = num_n; */
        dims[1] = num_k;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b );
#else
        if (!(g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL))) {
            GA_Error("failed: create g_b",30);
        }
#endif
        /* dims[0] = num_m; dims[1] = num_k; */
        dims[1] = num_m;
        dims[0] = num_k;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a );
#else
        if (!(g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL))) {
            GA_Error("failed: create g_a",40);
        }
#endif
#else
        ndim = 2;
        block_size[0] = 128;
        block_size[1] = 128;

        dims[0] = num_m;
        dims[1] = num_n;
        g_c = GA_Create_handle();
        GA_Set_data(g_c,ndim,dims,MT_DBL);
        GA_Set_array_name(g_c,"g_c");
        GA_Set_block_cyclic(g_c,block_size);
        if (!GA_Allocate(g_c)) {
            GA_Error("failed: create g_c",40);
        }

        dims[0] = num_k;
        dims[1] = num_n;
        g_b = GA_Create_handle();
        GA_Set_data(g_b,ndim,dims,MT_DBL);
        GA_Set_array_name(g_b,"g_b");
        GA_Set_block_cyclic(g_b,block_size);
        if (!GA_Allocate(g_b)) {
            GA_Error("failed: create g_b",40);
        }

        dims[0] = num_m;
        dims[1] = num_k;
        g_a = GA_Create_handle();
        GA_Set_data(g_a,ndim,dims,MT_DBL);
        GA_Set_array_name(g_a,"g_a");
        GA_Set_block_cyclic(g_a,block_size);
        if (!GA_Allocate(g_a)) {
            GA_Error("failed: create g_a",40);
        }
#endif

        /* Initialize matrices A and B */
        if (me == 0) {
            load_ga(g_a, h0, num_m, num_k);
            load_ga(g_b, h0, num_k, num_n);
        }
#if defined(USE_ELEMENTAL)
        double zero = 0.0;
        ElGlobalArraysFill_d( eldga, g_c, &zero );
        ElGlobalArraysSync_d( eldga );
#else
        GA_Zero(g_c);
        GA_Sync();
#endif

#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n",
                   (long)num_m, (long)num_k, (long)num_k, (long)num_n);
            fflush(stdout);
        }

        for (i=0; i<ntrans; i++) {
            avg_t[i] = 0.0;
            avg_mf[i] = 0.0;
        }

        for (itime=0; itime<ntimes; itime++) {
            for (i=0; i<ntrans; i++) {
#if defined(USE_ELEMENTAL)
                ElGlobalArraysSync_d( eldga );
#else
                GA_Sync();
#endif
                ta = transa[i];
                tb = transb[i];
                t1 = MP_TIMER();
#if defined(USE_ELEMENTAL)
                ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k,
                                       1.0, g_a, g_b, 0.0, g_c );
#else
                GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c);
#endif
                t1 = MP_TIMER() - t1;
#if defined(USE_ELEMENTAL)
                if (me == 0) {
#else
                if (GA_Nodeid() == 0) {
#endif
#if defined(USE_ELEMENTAL)
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc;
#else
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes();
#endif
                    avg_t[i] = avg_t[i] + t1;
                    avg_mf[i] = avg_mf[i] + mf;
                    printf("%15s%2d: %12.4f seconds %12.1f mflops/proc %c %c\n",
                           "Run#", itime, t1, mf, ta, tb);
                    fflush(stdout);
                    if (dgemm_verify && itime == 0) {
                        /* recall the C API swaps the matrix order */
                        /* we swap it here for the Fortran-based verify */
                        verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0,
                                        g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc);
                    }
                }
            }
        }

#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\n");
            for (i=0; i<ntrans; i++) {
                printf("%17s: %12.4f seconds %12.1f mflops/proc %c %c\n",
                       "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes,
                       transa[i], transb[i]);
            }
            if (dgemm_verify) {
                printf("All GA_Dgemms are verified...O.K.\n");
            }
            fflush(stdout);
        }

        /*
        GA_Print(g_a);
        GA_Print(g_b);
        GA_Print(g_c);
        */

#if defined(USE_ELEMENTAL)
        ElGlobalArraysDestroy_d( eldga, g_a );
        ElGlobalArraysDestroy_d( eldga, g_b );
        ElGlobalArraysDestroy_d( eldga, g_c );
#else
        GA_Destroy(g_c);
        GA_Destroy(g_b);
        GA_Destroy(g_a);
#endif
    }

#if defined(USE_ELEMENTAL)
    if (me == 0) {
#else
    if (GA_Nodeid() == 0) {
#endif
        printf("All tests successful\n");
    }

    free(h0);
    free(tmpa);
    free(tmpb);
    free(tmpc);

#if defined(USE_ELEMENTAL)
    // call el::global arrays destructor
    ElGlobalArraysTerminate_d( eldga );
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    return 0;
}

/*
 * Verify for correctness. Process 0 computes BLAS dgemm
 * locally. For larger arrays, disable this test as memory
 * might not be sufficient.
 */
void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k,
                     double alpha, int g_a, int g_b, double beta, int g_c,
                     double *tmpa, double *tmpb, double *tmpc)
{
    int i,j,type,ndim,dims[2],lo[2],hi[2];
    double abs_value;

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            tmpc[j+i*num_m] = -1.0;
            tmpa[j+i*num_m] = -2.0;
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_a, lo, hi, tmpa, &dims[1]);
#endif

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] );
#else
    NGA_Get(g_b, lo, hi, tmpb, &dims[1]);
#endif

    /* compute dgemm sequentially */
#if defined(USE_ELEMENTAL)
    cblas_dgemm ( CblasRowMajor,
                  ( xt1 == 'n'? CblasNoTrans: CblasTrans ),
                  ( xt2 == 'n'? CblasNoTrans: CblasTrans ),
                  num_m /* M */, num_n /* N */, num_k /* K */,
                  alpha, tmpa, num_m, /* lda */
                  tmpb, num_k, /* ldb */
                  beta, tmpc, num_m /* ldc */);
#else
    xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k,
             &alpha, tmpa, &num_m,
             tmpb, &num_k,
             &beta, tmpc, &num_m);
#endif

    /* after computing c locally, verify it with the values in g_c */
#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_c, lo, hi, tmpa, &dims[1]);
#endif

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]);
            if (abs_value > 1.0) {
                printf("Values are = %f %f\n",
                       tmpc[j+i*num_m], tmpa[j+i*num_m]);
                printf("Difference is = %f\n", abs_value);
                fflush(stdout);
                GA_Error("verify ga_dgemm failed", 1);
            }
        }
    }
}

/**
 * called by process '0' (or your master process)
 */
void load_ga(int handle, double *f, int dim1, int dim2)
{
    int lo[2], hi[2];

    if (dim1 < 0 || dim2 < 0) {
        return;
    }

    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dim1-1;
    hi[1] = dim2-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 );
#else
    NGA_Put(handle, lo, hi, f, &dim1);
#endif
}
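/* LU benchmark variant: ARMCI or, with MPI2_ONESIDED, MPI-2 window-based communication, timed over nloop runs. */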
int main(int argc, char *argv[])
{
    int i, j;
    int ch;
    extern char *optarg;
    int edge;
    int size;
    int nloop = 5;
    double **ptr_loc;

    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) {
        switch (ch) {
        case 'n': n = atoi(optarg); break;
        case 'b': block_size = atoi(optarg); break;
        case 'p': nproc = atoi(optarg); break;
        case 'h':
            printf("Usage: LU, or \n");
            printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n");
            MP_BARRIER();
            MP_FINALIZE();
            exit(0);
        }
    }

    if (me == 0) {
        printf("\n Blocked Dense LU Factorization\n");
        printf(" %d by %d Matrix\n", n, n);
        printf(" %d Processors\n", nproc);
        printf(" %d by %d Element Blocks\n", block_size, block_size);
        printf("\n");
    }

    num_rows = (int) sqrt((double) nproc);
    for (;;) {
        num_cols = nproc / num_rows;
        if (num_rows * num_cols == nproc)
            break;
        num_rows--;
    }

    nblocks = n / block_size;
    if (block_size * nblocks != n) {
        nblocks++;
    }

    edge = n % block_size;
    if (edge == 0) {
        edge = block_size;
    }

#ifdef DEBUG
    if (me == 0)
        for (i = 0; i < nblocks; i++) {
            for (j = 0; j < nblocks; j++)
                printf("%d ", block_owner(i, j));
            printf("\n");
        }
    MP_BARRIER();
    MP_FINALIZE();
    exit(0);
#endif

    for (i = 0; i < nblocks; i++) {
        for (j = 0; j < nblocks; j++) {
            if (block_owner(i, j) == me) {
                if ((i == nblocks-1) && (j == nblocks-1)) {
                    size = edge * edge;
                } else if ((i == nblocks-1) || (j == nblocks-1)) {
                    size = edge * block_size;
                } else {
                    size = block_size * block_size;
                }
                proc_bytes += size * sizeof(double);
            }
        }
    }

    ptr = (void **)malloc(nproc * sizeof(void *));
#ifdef MPI2_ONESIDED
    MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]);
    MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL,
                   MPI_COMM_WORLD, &win);
    for (i = 0; i < nproc; i++)
        ptr[i] = (double *)ptr[me];
    MPI_Barrier(MPI_COMM_WORLD);
#else
    /* initialize ARMCI */
    ARMCI_Init();
    ARMCI_Malloc(ptr, proc_bytes);
#endif

    a = (double **)malloc(nblocks * nblocks * sizeof(double *));
    if (a == NULL) {
        fprintf(stderr, "Could not malloc memory for a\n");
        exit(-1);
    }
    ptr_loc = (double **)malloc(nproc * sizeof(double *));
    for (i = 0; i < nproc; i++)
        ptr_loc[i] = (double *)ptr[i];
    for (i = 0; i < nblocks; i++) {
        for (j = 0; j < nblocks; j++) {
            a[i+j*nblocks] = ptr_loc[block_owner(i, j)];
            if ((i == nblocks-1) && (j == nblocks-1)) {
                size = edge * edge;
            } else if ((i == nblocks-1) || (j == nblocks-1)) {
                size = edge * block_size;
            } else {
                size = block_size * block_size;
            }
            ptr_loc[block_owner(i, j)] += size;
        }
    }

    /* initialize the array */
    init_array();

    /* barrier to ensure all initialization is done */
    MP_BARRIER();

    /* to remove cold-start misses, all processors touch their own data */
    touch_array(block_size, me);
    MP_BARRIER();

    if (doprint) {
        if (me == 0) {
            printf("Matrix before LU decomposition\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    lu(n, block_size, me); /* cold start */

    /* Starting the timer */
    MP_BARRIER();
    if (me == 0)
        start_timer();

    for (i = 0; i < nloop; i++)
        lu(n, block_size, me);

    MP_BARRIER();

    /* Timer stops here */
    if (me == 0)
        printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()/nloop);
    printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n",
           me, get_cntr, comm_time*1000/nloop);

    if (doprint) {
        if (me == 0) {
            printf("after LU\n");
            print_array(me);
        }
        MP_BARRIER();
    }

    /* done */
#ifdef MPI2_ONESIDED
    MPI_Win_free(&win);
    MPI_Free_mem(ptr[me]);
#else
    ARMCI_Free(ptr[me]);
    ARMCI_Finalize();
#endif
    MP_FINALIZE();
    return 0;
}
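/* GA test: mutexes with GA_Lock/GA_Unlock serializing a put/get exchange between processes 0 and 1. */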
int main(int argc, char **argv) { int me; int nproc; int status; int g_a; int dims[NDIM]; int chunk[NDIM]; int pg_world; size_t num = 10; double *p1 = NULL; double *p2 = NULL; size_t i; int num_mutex; int lo[1]; int hi[1]; int ld[1]={1}; MPI_Comm comm; MP_INIT(argc,argv); GA_INIT(argc,argv); me = GA_Nodeid(); nproc = GA_Nnodes(); comm = GA_MPI_Comm_pgroup_default(); printf("%d: Hello world!\n",me); if (me==0) printf("%d: GA_Initialize\n",me); /*if (me==0) printf("%d: ARMCI_Init\n",me);*/ /*ARMCI_Init();*/ /*if (me==0) printf("%d: MA_Init\n",me);*/ /*MA_init(MT_DBL, 8*1024*1024, 2*1024*1024);*/ if (me==0) printf("%d: GA_Create_handle\n",me); g_a = GA_Create_handle(); if (me==0) printf("%d: GA_Set_array_name\n",me); GA_Set_array_name(g_a,"test array A"); dims[0] = 30; if (me==0) printf("%d: GA_Set_data\n",me); GA_Set_data(g_a,NDIM,dims,MT_DBL); chunk[0] = -1; if (me==0) printf("%d: GA_Set_chunk\n",me); GA_Set_chunk(g_a,chunk); if (me==0) printf("%d: GA_Pgroup_get_world\n",me); pg_world = GA_Pgroup_get_world(); if (me==0) printf("%d: GA_Set_pgroup\n",me); GA_Set_pgroup(g_a,pg_world); if (me==0) printf("%d: GA_Allocate\n",me); status = GA_Allocate(g_a); if(0 == status) MPI_Abort(comm,100); if (me==0) printf("%d: GA_Zero\n",me); GA_Zero(g_a); if (me==0) printf("%d: GA_Sync\n",me); GA_Sync(); num = 10; p1 = malloc(num*sizeof(double)); /*double* p1 = ARMCI_Malloc_local(num*sizeof(double));*/ if (p1==NULL) MPI_Abort(comm,1000); p2 = malloc(num*sizeof(double)); /*double* p2 = ARMCI_Malloc_local(num*sizeof(double));*/ if (p2==NULL) MPI_Abort(comm,2000); for ( i=0 ; i<num ; i++ ) p1[i] = 7.0; for ( i=0 ; i<num ; i++ ) p2[i] = 3.0; num_mutex = 17; status = GA_Create_mutexes(num_mutex); if (me==0) printf("%d: GA_Create_mutexes = %d\n",me,status); /***************************************************************/ if (me==0) { printf("%d: before GA_Lock\n",me); GA_Lock(0); lo[0] = 0; hi[0] = num-1; GA_Init_fence(); NGA_Put(g_a,lo,hi,p1,ld); GA_Fence(); GA_Unlock(0); printf("%d: after GA_Unlock\n",me); } GA_Print(g_a); if (me==1) { printf("%d: before GA_Lock\n",me); GA_Lock(0); lo[0] = 0; hi[0] = num-1; GA_Init_fence(); NGA_Get(g_a,lo,hi,p2,ld); GA_Fence(); GA_Unlock(0); printf("%d: after GA_Unlock\n",me); for ( i=0 ; i<num ; i++ ) printf("p2[%2lu] = %20.10f\n", (long unsigned)i,p2[i]); } /***************************************************************/ status = GA_Destroy_mutexes(); if (me==0) printf("%d: GA_Destroy_mutexes = %d\n",me,status); /*ARMCI_Free(p2);*/ /*ARMCI_Free(p1);*/ free(p2); free(p1); if (me==0) printf("%d: GA_Destroy\n",me); GA_Destroy(g_a); /*if (me==0) printf("%d: ARMCI_Finalize\n",me);*/ /*ARMCI_Finalize();*/ if (me==0) printf("%d: GA_Terminate\n",me); GA_Terminate(); if (me==0) printf("%d: MPI_Finalize\n",me); MPI_Finalize(); return(0); }