int main(int argc, char **argv) { int k,i; double **myptrs[10]; double t0,t1,tget=0,tnbget=0,tput=0,tnbput=0,tnbwait=0,t2=0; #if PORTALS ARMCI_NetInit(); #endif MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD,&me); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); ARMCI_Init(); ARMCI_Init(); for(k=0;k<10;k++){ myptrs[k] = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs[k],400000*LOOP*sizeof(double)); for(i=0;i<LOOP;i++)myptrs[k][me][i]=me+0.414; MPI_Barrier(MPI_COMM_WORLD); for(i=0;i<LOOP;i++){ ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs); /*if(myptrs[k][me][i]!=0.414+(me+1)%nprocs)ARMCI_Error("errr",myptrs[k][me][i]);*/ } t0=t1=tget=tnbget=tput=tnbput=tnbwait=t2=0; t0 = MPI_Wtime(); for(i=0;i<LOOP;i++){ ARMCI_Get(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs); } t1 = MPI_Wtime(); printf("\nGet Latency=%lf\n",1e6*(t1-t0)/LOOP);fflush(stdout); t1=t0=0; for(i=0;i<LOOP;i++){ armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MPI_Wtime(); ARMCI_NbGet(myptrs[k][(me+1)%nprocs]+i,myptrs[k][me]+i,sizeof(double),(me+1)%nprocs,&nbh); t1 = MPI_Wtime(); ARMCI_Wait(&nbh); t2 = MPI_Wtime(); tnbget+=(t1-t0); tnbwait+=(t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); MPI_Barrier(MPI_COMM_WORLD); } for(k=0;k<10;k++)ARMCI_Free(myptrs[k][me]); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Finalize(); ARMCI_Finalize(); MPI_Finalize(); }
int main(int argc, char **argv) { ARMCI_NetInit(); MP_INIT(argc,argv); MP_MYID(&me); MP_PROCS(&nproc); if(nproc < 2 || nproc> MAXPROC) { if(me == 0) fprintf(stderr, "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc); MP_BARRIER(); MP_FINALIZE(); exit(0); } /* initialize ARMCI */ ARMCI_Init(); if(!me)printf("\n Performance of Basic Blocking Communication Operations\n"); MP_BARRIER(); CHECK_RESULT=1; if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); CHECK_RESULT=0; /* test 1 dimension array */ if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); /* test 2 dimension array */ if(!me)printf("\n\t\t\tStrided Data Transfer\n"); test_2D(); MP_BARRIER(); if(me == 0){ if(warn_accuracy) printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n",warn_accuracy); printf("\n\n------------ Testing the same data transfer for correctness ----------\n"); fflush(stdout); } MP_BARRIER(); CHECK_RESULT=1; if(!me)printf("\n\t\t\tContiguous Data Transfer\n"); test_1D(); if(me == 0) printf("OK\n"); MP_BARRIER(); if(!me)printf("\n\t\t\tStrided Data Transfer\n"); test_2D(); if(me == 0) printf("OK\n\n\nTests Completed.\n"); MP_BARRIER(); /* done */ ARMCI_Finalize(); MP_FINALIZE(); return(0); }
/*
 * Driver for the test of ARMCI wrappers to basic message-passing operations.
 *
 * Requires at least two processes; otherwise prints a usage message on rank 0
 * and exits with status 0.  Otherwise initialises ARMCI, runs TestGlobals()
 * and shuts down.
 */
int main(int argc, char **argv)
{
    MP_INIT(argc, argv);
    MP_MYID(&me);
    MP_PROCS(&nproc);

    if (nproc < 2) {
        if (me == 0) {
            /* The old message printed nproc as if it were an upper bound
             * ("processes < 1"); report the actual count instead. */
            fprintf(stderr, "USAGE: 2 <= processes - got %d\n", nproc);
        }
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if (me == 0) {
        printf("Test of ARMCI Wrappers to Basic Message Passing Operations\n");
        fflush(stdout);
    }

    /* initialize ARMCI */
    ARMCI_Init();
    MP_BARRIER();

    TestGlobals();

    /* done */
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/*
 * Driver for the armci_notify test: runs test_notify() for every
 * dimensionality from 1 up to MAXDIMS, with barriers around the sweep.
 */
int main(int argc, char *argv[])
{
    int dims;

    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    ARMCI_Init();
    armci_msg_barrier();

    if (me == 0) {
        printf("\nTesting armci_notify\n");
        fflush(stdout);
        sleep(1);
    }
    armci_msg_barrier();

    dims = 1;
    while (dims <= MAXDIMS) {
        test_notify(dims);
        dims++;
    }
    armci_msg_barrier();

    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
/*
 * Driver for the ARMCI lock test: announces the run on rank 0, initialises
 * ARMCI, calls test_lock() and reports success after a barrier.
 */
int main(int argc, char *argv[])
{
    /* The unused local `ndim` from the original has been removed. */
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (me == 0) {
        printf("ARMCI test program for lock(%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    test_lock();

    MP_BARRIER();
    if (me == 0) {
        printf("test passed\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/*
 * Driver for the 1-D transpose example: brings up the message-passing layer
 * and ARMCI, reports the process count on rank 0, runs TRANSPOSE1D() and
 * shuts everything down again.
 */
int main(int argc, char **argv)
{
    /* int heap=300000, stack=300000; */
    int me, nprocs;

    /* Step1: Initialize Message Passing library */
    armci_msg_init(&argc, &argv);

    /* Step2: Initialize ARMCI */
    ARMCI_Init();

    /* Step3: Initialize Memory Allocator (MA) */
    /*bjp if(! MA_init(C_DBL, stack, heap) ) ARMCI_Error("MA_init failed",stack+heap); */

    me = armci_msg_me();
    nprocs = armci_msg_nproc();

    if (me == 0) {
        printf("\nUsing %d processes\n\n", nprocs);
        fflush(stdout);
    }

    TRANSPOSE1D();

    if (me == 0) {
        printf("\nTerminating ..\n");
    }

    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
int main(int argc, char ** argv) { int rank, nproc, val, i; void **base_ptrs; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); if (rank == 0) printf("Starting ARMCI mutex read-modify-write test with %d processes\n", nproc); base_ptrs = malloc(nproc*sizeof(void*)); ARMCI_Create_mutexes(rank == 0 ? 1 : 0); ARMCI_Malloc(base_ptrs, (rank == 0) ? sizeof(int) : 0); // Proc 0 has a shared int if (rank == 0) { val = 0; ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0); } ARMCI_Barrier(); for (i = 0; i < NITER; i++) { ARMCI_Lock(0, 0); ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0); val += ADDIN; ARMCI_Put(&val, base_ptrs[0], sizeof(int), 0); ARMCI_Unlock(0, 0); } printf(" + %3d done\n", rank); fflush(NULL); ARMCI_Barrier(); if (rank == 0) { ARMCI_Get(base_ptrs[0], &val, sizeof(int), 0); if (val == ADDIN*nproc*NITER) printf("Test complete: PASS.\n"); else printf("Test complete: FAIL. Got %d, expected %d.\n", val, ADDIN*nproc*NITER); } ARMCI_Free(base_ptrs[rank]); ARMCI_Destroy_mutexes(); free(base_ptrs); ARMCI_Finalize(); MPI_Finalize(); return 0; }
/*
 * Minimal program that issues a one-byte ARMCI_Get with NULL source and
 * destination pointers against process 0.
 * NOTE(review): presumably this exists to exercise the library's error
 * handling for invalid pointers - confirm against the test harness.
 */
int main(int argc, char **argv)
{
    void *remote_src = NULL;
    void *local_dst = NULL;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    ARMCI_Get(remote_src, local_dst, 1, 0);

    ARMCI_Finalize();
    MPI_Finalize();

    return 0;
}
/*
 * Driver for the IPC test: records the MPI size and rank in the file-level
 * nproc/me, prints a banner on rank 0 and runs test() between ARMCI_Init()
 * and ARMCI_Finalize().
 */
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    if (me == 0) {
        printf("Testing IPCs (%d MPI processes)\n\n", nproc);
    }

    ARMCI_Init();
    test();
    ARMCI_Finalize();

    MPI_Finalize();
    return 0;
}
/*
 * ARMCI global-operation (GOP) test for the integer "absmin" and "absmax"
 * reductions.
 *
 * Each rank fills a DATA_SZ-element buffer with +/-(rank+1), alternating the
 * sign by index.  After "absmin" every element must be 1; after "absmax"
 * every element must be nproc.  Any mismatch is reported and passed to
 * ARMCI_Error().
 */
int main(int argc, char **argv)
{
    int rank, nproc, idx;
    int *buf;

    MPI_Init(&argc, &argv);
    ARMCI_Init();

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    if (rank == 0) {
        printf("Starting ARMCI GOP test with %d processes\n", nproc);
    }

    buf = malloc(DATA_SZ*sizeof(int));

    /* --- absmin --- */
    if (rank == 0) {
        printf(" - Testing ABSMIN\n");
    }

    for (idx = 0; idx < DATA_SZ; idx++) {
        const int sign = (idx % 2) ? -1 : 1;
        buf[idx] = (rank + 1) * sign;
    }

    armci_msg_igop(buf, DATA_SZ, "absmin");

    for (idx = 0; idx < DATA_SZ; idx++) {
        if (buf[idx] == 1) {
            continue;
        }
        printf("Err: buf[%d] = %d expected 1\n", idx, buf[idx]);
        ARMCI_Error("Fail", 1);
    }

    /* --- absmax --- */
    if (rank == 0) {
        printf(" - Testing ABSMAX\n");
    }

    for (idx = 0; idx < DATA_SZ; idx++) {
        const int sign = (idx % 2) ? -1 : 1;
        buf[idx] = (rank + 1) * sign;
    }

    armci_msg_igop(buf, DATA_SZ, "absmax");

    for (idx = 0; idx < DATA_SZ; idx++) {
        if (buf[idx] == nproc) {
            continue;
        }
        printf("Err: buf[%d] = %d expected %d\n", idx, buf[idx], nproc);
        ARMCI_Error("Fail", 1);
    }

    free(buf);

    if (rank == 0) {
        printf("Pass.\n");
    }

    ARMCI_Finalize();
    MPI_Finalize();

    return 0;
}
/*
 * Driver for the ARMCI group tests.
 *
 * Rank 0 calls ARMCI_Error() when the process count lies outside
 * [MINPROC, MAXPROC].  Otherwise the collective group test runs, followed
 * (only when ARMCI_GROUP is defined) by the non-collective variant.
 */
int main(int argc, char *argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* printf("nproc = %d, me = %d\n", nproc, me);*/

    if (me == 0 && (nproc < MINPROC || nproc > MAXPROC)) {
        ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Testing ARMCI Groups!\n\n");
        fflush(stdout);
    }

    test_groups();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);

#ifdef ARMCI_GROUP
    test_groups_noncollective();

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\n Non-collective groups: Success!!\n");
        fflush(stdout);
    }
    sleep(2);
#endif

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
/*
 * Driver for the non-blocking put/get/acc performance test.
 *
 * Exits with status 0 after a usage message when the process count is
 * outside [2, MAXPROC].  Otherwise runs test_perf_nb() twice, first with
 * argument 1 and then with argument 0.
 */
int main(int argc, char *argv[])
{
    ARMCI_NetInit();
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    if (nproc < 2 || nproc > MAXPROC) {
        if (me == 0) {
            fprintf(stderr,
                    "USAGE: 2 <= processes < %d - got %d\n", MAXPROC, nproc);
        }
        MP_BARRIER();
        MP_FINALIZE();
        exit(0);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n put/get/acc requests (Time in secs)\n\n");
        fflush(stdout);
    }

    test_perf_nb(1);
    test_perf_nb(0);

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
int main(int argc, char * argv[]) { void *baseAddress[MAX_PROCESSORS]; char *local; int thisImage; int iter = 100, size; double startTime, endTime; int i; // initialize ARMCI_Init(); ARMCI_Myid(&thisImage); // allocate data (collective operation) ARMCI_Malloc(baseAddress, MAX_BUF_SIZE*sizeof(char)); local = (char *)ARMCI_Malloc_local(MAX_BUF_SIZE*sizeof(char)); ARMCI_Barrier(); ARMCI_Migrate(); if (thisImage == 0) { for(size = 1; size <= MAX_BUF_SIZE; size = size<<1){ startTime = CkWallTimer(); for(i = 0; i < iter; i++){ ARMCI_Put(local, baseAddress[1], size, 1); } ARMCI_Fence(1); endTime = CkWallTimer(); printf("%d: %f us\n", size, (endTime-startTime)*1000); } ARMCI_Barrier(); } else if (thisImage == 1) { ARMCI_Barrier(); } ARMCI_Free(baseAddress[thisImage]); ARMCI_Free_local(local); // finalize ARMCI_Finalize(); return 0; }
int main(int argc, char ** argv) { int rank, nproc, test_iter; void ***base_ptrs; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); if (rank == 0) printf("Starting ARMCI memory allocation test with %d processes\n", nproc); base_ptrs = malloc(sizeof(void**)*NUM_ITERATIONS); // Perform a pile of allocations for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) { if (rank == 0) printf(" + allocation %d\n", test_iter); base_ptrs[test_iter] = malloc(sizeof(void*)*nproc); ARMCI_Malloc((void**)base_ptrs[test_iter], (test_iter % 4 == 0) ? 0 : DATA_SZ); } ARMCI_Barrier(); // Free all allocations for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) { if (rank == 0) printf(" + free %d\n", test_iter); ARMCI_Free(((void**)base_ptrs[test_iter])[rank]); free(base_ptrs[test_iter]); } free(base_ptrs); if (rank == 0) printf("Test complete: PASS.\n"); ARMCI_Finalize(); MPI_Finalize(); return 0; }
/*
 * Driver for the sparse matrix-vector multiplication test.  Rank 0 calls
 * ARMCI_Error() when more than MAXPROC processes are used; otherwise
 * test_sparse() is run between ARMCI_Init() and ARMCI_Finalize().
 */
int main(int argc, char *argv[])
{
    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    /* printf("nproc = %d, me = %d\n", nproc, me);*/

    if (me == 0 && nproc > MAXPROC) {
        ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }

    test_sparse();

    ARMCI_AllFence();
    armci_msg_barrier();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();
    return 0;
}
/*
 * Driver for the aggregate put/get test.  Rank 0 calls ARMCI_Error() when
 * more than MAXPROC processes are used; otherwise test_aggregate() runs
 * twice, first as a cold start and then as a warm start.
 */
int main(int argc, char *argv[])
{
    MP_INIT(argc, argv);
    MP_PROCS(&nproc);
    MP_MYID(&me);

    /* printf("nproc = %d, me = %d\n", nproc, me);*/

    if (me == 0 && nproc > MAXPROC) {
        ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
    }

    if (me == 0) {
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }

    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */

    ARMCI_AllFence();
    MP_BARRIER();
    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    MP_BARRIER();
    ARMCI_Finalize();
    MP_FINALIZE();
    return 0;
}
main(int argc, char *argv[]) { int i, j, l; int ch; extern char *optarg; int edge; int size; int lu_arg[MAX_THREADS][3]; /* ARMCI */ void **ptr; double **ptr_loc; THREAD_LOCK_INIT(mutex); armci_msg_init(&argc,&argv); nproc = armci_msg_nproc(); me = armci_msg_me(); while ((ch = getopt(argc, argv, "n:b:p:t:d:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 't': th_per_p = atoi(optarg); break; case 'd': d = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC -tTH_PER_P\n"); armci_msg_barrier(); armci_msg_finalize(); exit(0); } } } if(th_per_p>MAX_THREADS) { th_per_p=MAX_THREADS; if(me==0)printf("Warning: cannot run more than %d threads, adjust MAX_THREADS",MAX_THREADS); } if (d) { fprintf(stderr, "%d: %d\n", me, getpid()); sleep(d); } nthreads = th_per_p * nproc; if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d thread(s) per processor, %d threads total\n", th_per_p, nthreads); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nthreads); for (;;) { num_cols = nthreads/num_rows; if (num_rows*num_cols == nthreads) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } num = (nblocks * nblocks)/nthreads; if((num * nthreads) != (nblocks * nblocks)) num++; edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } armci_msg_barrier(); /* armci_msg_finalize(); */ /* exit(0); */ #endif for (l = 0; l < th_per_p; l++) { me_th[l] = me * th_per_p + l; for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me_th[l]) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == 
nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } thread_doubles[l] += size; } } } proc_bytes += thread_doubles[l] * sizeof(double); } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nthreads*sizeof(double *)); for (i = 0; i < nproc; i++) { ptr_loc[i * th_per_p] = (double *)ptr[i]; for (j = 1; j < th_per_p; j++) ptr_loc[i * th_per_p + j] = ptr_loc[i * th_per_p + j - 1] + thread_doubles[j - 1]; } for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } #if 0 for(i=0; i<nblocks*nblocks;i ++) printf("%d: a[%d]=%p\n", me, i, a[i]); fflush(stdout); #endif /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ armci_msg_barrier(); /* to remove cold-start misses, all processors touch their own data */ /* for (l = 0; l < th_per_p; l++) touch_array(block_size, me_th[l]); */ armci_msg_barrier(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } armci_msg_barrier(); } #if 1 for (i = 0; i < nblocks; i++) for (j = 0; j < nblocks; j++) print_block_dbg(a[i + j * nblocks], "proc %d, a[%d, %d]:\n", me, i, j); #endif TH_INIT(nproc,th_per_p); /* Starting the timer */ if(me == 0) start_timer(); for (l = 0; l < th_per_p; l++) { lu_arg[l][0] = n; lu_arg[l][1] = block_size; lu_arg[l][2] = l; THREAD_CREATE(threads + l, lu, lu_arg[l]); } for (l = 0; l < th_per_p; l++) THREAD_JOIN(threads[l], NULL); armci_msg_barrier(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf 
milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } armci_msg_barrier(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); armci_msg_finalize(); THREAD_LOCK_DESTROY(mutex); }
main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; int nloop=5; double **ptr_loc; MP_INIT(arc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } ptr = (void **)malloc(nproc * sizeof(void *)); #ifdef MPI2_ONESIDED MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]); MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me]; MPI_Barrier(MPI_COMM_WORLD); #else /* initialize ARMCI */ ARMCI_Init(); ARMCI_Malloc(ptr, proc_bytes); #endif a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nproc*sizeof(double *)); 
for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks; i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } lu(n, block_size, me); /* cold start */ /* Starting the timer */ MP_BARRIER(); if(me == 0) start_timer(); for(i=0; i<nloop; i++) lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()/nloop); printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ #ifdef MPI2_ONESIDED MPI_Win_free(&win); MPI_Free_mem(ptr[me]); #else ARMCI_Free(ptr[me]); ARMCI_Finalize(); #endif MP_FINALIZE(); }
int main(int argc, char **argv) { int i, j, rank, nranks, peer, bufsize, errors, total_errors; double **buf_bvec, **src_bvec, *src_buf; int count[2], src_stride, trg_stride, stride_level; double scaling, time; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); buf_bvec = (double **) malloc(sizeof(double *) * nranks); src_bvec = (double **) malloc(sizeof(double *) * nranks); bufsize = XDIM * YDIM * sizeof(double); ARMCI_Malloc((void **) buf_bvec, bufsize); ARMCI_Malloc((void **) src_bvec, bufsize); src_buf = src_bvec[rank]; if (rank == 0) printf("ARMCI Strided DLA Accumulate Test:\n"); ARMCI_Access_begin(buf_bvec[rank]); ARMCI_Access_begin(src_buf); for (i = 0; i < XDIM*YDIM; i++) { *(buf_bvec[rank] + i) = 1.0 + rank; *(src_buf + i) = 1.0 + rank; } ARMCI_Access_end(src_buf); ARMCI_Access_end(buf_bvec[rank]); scaling = 2.0; src_stride = XDIM * sizeof(double); trg_stride = XDIM * sizeof(double); stride_level = 1; count[1] = YDIM; count[0] = XDIM * sizeof(double); ARMCI_Barrier(); time = MPI_Wtime(); peer = (rank+1) % nranks; for (i = 0; i < ITERATIONS; i++) { ARMCI_AccS(ARMCI_ACC_DBL, (void *) &scaling, src_buf, &src_stride, (void *) buf_bvec[peer], &trg_stride, count, stride_level, peer); } ARMCI_Barrier(); time = MPI_Wtime() - time; if (rank == 0) printf("Time: %f sec\n", time); ARMCI_Access_begin(buf_bvec[rank]); for (i = errors = 0; i < XDIM; i++) { for (j = 0; j < YDIM; j++) { const double actual = *(buf_bvec[rank] + i + j*XDIM); const double expected = (1.0 + rank) + scaling * (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS); if (actual - expected > 1e-10) { printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n", rank, j, i, expected, actual); errors++; fflush(stdout); } } } ARMCI_Access_end(buf_bvec[rank]); MPI_Allreduce(&errors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ARMCI_Free((void *) buf_bvec[rank]); ARMCI_Free((void *) src_bvec[rank]); free(buf_bvec); 
free(src_bvec); ARMCI_Finalize(); MPI_Finalize(); if (total_errors == 0) { if (rank == 0) printf("Success.\n"); return 0; } else { if (rank == 0) printf("Fail.\n"); return 1; } }
int main(int argc, char* argv[]) { int i; struct timeval start_time[14]; struct timeval stop_time[14]; /* char * test_name[14] = { "dim", "nbdim", "vec_small", "acc", "vector", "vector_acc", "fetch_add", "swap", "rput", "aggregate", "implicit", "memlock", "acc_type", "collective" }; int test_flags[14] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }; */ char * test_name[2] = { "acc_type", "collective" }; int test_flags[2] = { 1, 1 }; #define TEST_ACC_TYPE 0 #define TEST_COLLECTIVE 1 MP_INIT(argc, argv); ARMCI_Init(); MP_PROCS(&nproc); MP_MYID(&me); if(nproc > MAXPROC && me == 0) ARMCI_Error("Test works for up to %d processors\n",MAXPROC); if(me == 0) { printf("ARMCI test program (%d processes)\n",nproc); fflush(stdout); sleep(1); } gettimeofday(&start_time[TEST_ACC_TYPE],NULL); if(test_flags[TEST_ACC_TYPE] == 1) { if(me == 0) { printf("\nTesting Accumulate Types\n"); fflush(stdout); } MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_INT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_INT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_LNG\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_LNG); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_FLT\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_FLT); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DBL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DBL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_CPL\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_CPL); ARMCI_AllFence(); MP_BARRIER(); if(me == 0) { printf("Test Accumulate ARMCI_ACC_DCP\n"); fflush(stdout); } test_acc_type(ARMCI_ACC_DCP); ARMCI_AllFence(); MP_BARRIER(); } gettimeofday(&stop_time[TEST_ACC_TYPE],NULL); gettimeofday(&start_time[TEST_COLLECTIVE],NULL); if(test_flags[TEST_COLLECTIVE] == 1) { if(me == 0) { printf("\nTesting Collective Types\n"); fflush(stdout); } if(me == 0) { printf("Test Collective ARMCI_INT\n"); 
fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_INT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_LONG\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_LONG); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_FLOAT\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_FLOAT); MP_BARRIER(); if(me == 0) { printf("Test Collective ARMCI_DOUBLE\n"); fflush(stdout); } MP_BARRIER(); test_collective(ARMCI_DOUBLE); MP_BARRIER(); } gettimeofday(&stop_time[TEST_COLLECTIVE],NULL); if(me == 0) { printf("Accumulate and Collective tests passed\n"); fflush(stdout); } if(me == 0) { printf("Testcase runtime\n"); printf("Name,Time(seconds)\n"); for(i = 0; i < 2; i++) if(test_flags[i] == 1) { double time_spent = (stop_time[i].tv_sec - start_time[i].tv_sec) + ((double) stop_time[i].tv_usec - start_time[i].tv_usec) / 1E6; printf("%s,%.6f\n", test_name[i], time_spent); } } MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return(0); }
main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; void **bufr_g, **bufc_g; MP_INIT(arc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\nUsing pre-PUTing\n"); printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i 
== nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *)); bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *)); if (bufr == NULL || bufc == NULL) printf("Could not ARMCI_Malloc_local() mem\n"); /* bufr points to all k-th row blocks */ /* save all block address in row-major order */ proc_bytes = nblocks*block_size*block_size * sizeof(double); bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(bufr_g, proc_bytes); for (i = 0; i < nproc; i++) { bufr[i*nblocks] = (double *) bufr_g[i]; for (j = 1; j < nblocks; j++) { bufr[i*nblocks + j] = bufr[i*nblocks + j-1] + block_size * block_size; } } /* bufc points to all k-th column blocks */ bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(bufc_g, proc_bytes); for (i = 0; i < nproc; i++) { bufc[i*nblocks] = (double *) bufc_g[i]; for (j = 1; j < nblocks; j++) { bufc[i*nblocks + j] = bufc[i*nblocks + j-1] + block_size * block_size; } } /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Free(bufc_g[me]); ARMCI_Free(bufr_g[me]); ARMCI_Finalize(); MP_FINALIZE(); }
int main(int argc, char **argv) { int i, j, rank, nranks, peer, bufsize, errors; double **buffer, *src_buf; int count[2], src_stride, trg_stride, stride_level; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); buffer = (double **) malloc(sizeof(double *) * nranks); bufsize = XDIM * YDIM * sizeof(double); ARMCI_Malloc((void **) buffer, bufsize); src_buf = ARMCI_Malloc_local(bufsize); if (rank == 0) printf("ARMCI Strided Put Test:\n"); src_stride = XDIM * sizeof(double); trg_stride = XDIM * sizeof(double); stride_level = 1; count[1] = YDIM; count[0] = XDIM * sizeof(double); ARMCI_Barrier(); peer = (rank+1) % nranks; for (i = 0; i < ITERATIONS; i++) { for (j = 0; j < XDIM*YDIM; j++) { *(src_buf + j) = rank + i; } ARMCI_PutS( src_buf, &src_stride, (void *) buffer[peer], &trg_stride, count, stride_level, peer); } ARMCI_Barrier(); ARMCI_Access_begin(buffer[rank]); for (i = errors = 0; i < XDIM; i++) { for (j = 0; j < YDIM; j++) { const double actual = *(buffer[rank] + i + j*XDIM); const double expected = (1.0 + rank) + (1.0 + ((rank+nranks-1)%nranks)) + (ITERATIONS); if (actual - expected > 1e-10) { printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n", rank, j, i, expected, actual); errors++; fflush(stdout); } } } ARMCI_Access_end(buffer[rank]); ARMCI_Free((void *) buffer[rank]); ARMCI_Free_local(src_buf); free(buffer); ARMCI_Finalize(); MPI_Finalize(); if (errors == 0) { printf("%d: Success\n", rank); return 0; } else { printf("%d: Fail\n", rank); return 1; } }
int main(int argc, char ** argv) { int rank, nproc, i, test_iter; int *my_data, *buf; void **base_ptrs; MPI_Init(&argc, &argv); ARMCI_Init(); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); if (rank == 0) printf("Starting ARMCI test with %d processes\n", nproc); buf = malloc(DATA_SZ); base_ptrs = malloc(sizeof(void*)*nproc); for (test_iter = 0; test_iter < NUM_ITERATIONS; test_iter++) { if (rank == 0) printf(" + iteration %d\n", test_iter); /*** Allocate the shared array ***/ ARMCI_Malloc(base_ptrs, DATA_SZ); my_data = base_ptrs[rank]; /*** Get from our right neighbor and verify correct data ***/ ARMCI_Access_begin(my_data); for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank*test_iter; ARMCI_Access_end(my_data); ARMCI_Barrier(); // Wait for all updates to data to complete ARMCI_Get(base_ptrs[(rank+1) % nproc], buf, DATA_SZ, (rank+1) % nproc); for (i = 0; i < DATA_NELTS; i++) { if (buf[i] != ((rank+1) % nproc)*test_iter) { printf("%d: GET expected %d, got %d\n", rank, (rank+1) % nproc, buf[i]); MPI_Abort(MPI_COMM_WORLD, 1); } } ARMCI_Barrier(); // Wait for all gets to complete /*** Put to our left neighbor and verify correct data ***/ for (i = 0; i < DATA_NELTS; i++) buf[i] = rank*test_iter; ARMCI_Put(buf, base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc); ARMCI_Barrier(); // Wait for all updates to data to complete ARMCI_Access_begin(my_data); for (i = 0; i < DATA_NELTS; i++) { if (my_data[i] != ((rank+1) % nproc)*test_iter) { printf("%d: PUT expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]); MPI_Abort(MPI_COMM_WORLD, 1); } } ARMCI_Access_end(my_data); ARMCI_Barrier(); // Wait for all gets to complete /*** Accumulate to our left neighbor and verify correct data ***/ for (i = 0; i < DATA_NELTS; i++) buf[i] = rank; ARMCI_Access_begin(my_data); for (i = 0; i < DATA_NELTS; i++) my_data[i] = rank; ARMCI_Access_end(my_data); ARMCI_Barrier(); int scale = test_iter; ARMCI_Acc(ARMCI_ACC_INT, &scale, buf, 
base_ptrs[(rank+nproc-1) % nproc], DATA_SZ, (rank+nproc-1) % nproc); ARMCI_Barrier(); // Wait for all updates to data to complete ARMCI_Access_begin(my_data); for (i = 0; i < DATA_NELTS; i++) { if (my_data[i] != rank + ((rank+1) % nproc)*test_iter) { printf("%d: ACC expected %d, got %d\n", rank, (rank+1) % nproc, my_data[i]); //MPI_Abort(MPI_COMM_WORLD, 1); } } ARMCI_Access_end(my_data); ARMCI_Free(my_data); } free(buf); free(base_ptrs); if (rank == 0) printf("Test complete: PASS.\n"); ARMCI_Finalize(); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int i; double **myptrs; double t0, t1, tnbget=0, tnbwait=0, t2=0; MP_INIT(argc,argv); ARMCI_Init(); MP_PROCS(&nprocs); MP_MYID(&me); if (nprocs < 2) ARMCI_Error("This program requires at least to processes", 1); myptrs = (double **)malloc(sizeof(double *)*nprocs); ARMCI_Malloc((void **)myptrs, LOOP*sizeof(double)); MP_BARRIER(); if(me == 0) { for(i = 0; i < 10; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1); } t0 = MP_TIMER(); for(i = 0; i < LOOP; i++) { // This is a bug: // ARMCI_Get(myptrs[me]+i,myptrs[me+1]+i,sizeof(double),me+1); ARMCI_Get(myptrs[me+1]+1, myptrs[me]+i, sizeof(double), me+1); } t1 = MP_TIMER(); printf("\nGet Latency=%lf\n", 1e6*(t1-t0)/LOOP); fflush(stdout); t1 = t0 = 0; for(i = 0; i < LOOP; i++) { armci_hdl_t nbh; ARMCI_INIT_HANDLE(&nbh); t0 = MP_TIMER(); //ARMCI_NbGet(myptrs[me]+i, myptrs[me+1]+i, sizeof(double), me+1, &nbh); ARMCI_NbGet(myptrs[me+1]+i, myptrs[me]+i, sizeof(double), me+1, &nbh); t1 = MP_TIMER(); ARMCI_Wait(&nbh); t2 = MP_TIMER(); tnbget += (t1-t0); tnbwait += (t2-t1); } printf("\nNb Get Latency=%lf Nb Wait=%lf\n",1e6*tnbget/LOOP,1e6*tnbwait/LOOP);fflush(stdout); } else sleep(1); MP_BARRIER(); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
int main(int argc, char **argv) { int i,peer,j; cpu_set_t mycpuid,new_mask; char str[CPU_SETSIZE]; int rrr; char cid[8]; extern char * cpuset_to_cstr(cpu_set_t *mask, char *str); extern int cstr_to_cpuset(cpu_set_t *mask, const char* str); gpc_hdl_t nbh; char rheader[100]; int hlen, rhlen, rhsize; int rdsize; int rem; void *header=&rem; int locval=0; void *loc=&locval; int right; MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD,&me); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); if(nprocs<2){ printf("\ncan run only on >=2 procs\n"); MPI_Finalize(); exit(1); } right = (me+1)%nprocs; hlen=sizeof(header); bzero(rheader,100); rhlen = hlen; ARMCI_Init(); accloop=atoi(argv[1]); rem=accloop; myptrs = (char **)malloc(sizeof(char *)*nprocs); ARMCI_Malloc((void **)myptrs,size); MPI_Barrier(MPI_COMM_WORLD); gpcwork_memcpy = ARMCI_Gpc_register(gpc_work_handler_memcpy); gpcwork_ddot =ARMCI_Gpc_register(gpc_work_handler_ddot); gpcwork_daxpy = ARMCI_Gpc_register(gpc_work_handler_daxpy); gpcwork_dgemm = ARMCI_Gpc_register(gpc_work_handler_dgemm); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Gpc_init_handle(&nbh); if(ARMCI_Gpc_exec(gpcwork_memcpy, right, &header, hlen, loc, sizeof(int), rheader, rhlen,loc, sizeof(int), &nbh)) fprintf(stderr,"ARMCI_Gpc_exec failed\n"); { int m,n,k; char notr='n'; DoubleComplex ZERO; usleep(100); ZERO.real=0.;ZERO.imag=0.; m=n=k=DGS; t0=MPI_Wtime(); #ifdef DGEMM_WORK for(j=0;j<4*15;j++){ c_alpha=c_alpha+j*rand(); dgemm_(¬r,¬r,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1); } #elif IUNIT_WORK for(j=0;j<2*LOOP*100;j++){ for(i=0;i<LOOP*100;i++){ tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2]; } } #elif DAXPY_WORK for(j=0;j<tmp_loop*80;j++){ alpha=alpha+j*rand(); daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE); } #endif t1=MPI_Wtime()-t0; printf("\n%d:Compute_During_memcpy %d %f\n",me,accloop,t1); } ARMCI_Gpc_wait(&nbh); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Gpc_init_handle(&nbh); if(ARMCI_Gpc_exec(gpcwork_ddot, 
right, &header, hlen, loc, sizeof(int), rheader, rhlen,loc, sizeof(int), &nbh)) fprintf(stderr,"ARMCI_Gpc_exec failed\n"); { int m,n,k; char notr='n'; DoubleComplex ZERO; usleep(100); ZERO.real=0.;ZERO.imag=0.; m=n=k=DGS; t0=MPI_Wtime(); #ifdef DGEMM_WORK for(j=0;j<4*15;j++){ c_alpha=c_alpha+j*rand(); dgemm_(¬r,¬r,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1); } #elif IUNIT_WORK for(j=0;j<2*LOOP*100;j++){ for(i=0;i<LOOP*100;i++){ tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2]; } } #elif DAXPY_WORK for(j=0;j<tmp_loop*80;j++){ alpha=alpha+j*rand(); daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE); } #endif t1=MPI_Wtime()-t0; printf("\n%d:Compute_During_Ddot %d %f\n",me,accloop,t1); } ARMCI_Gpc_wait(&nbh); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Gpc_init_handle(&nbh); if(ARMCI_Gpc_exec(gpcwork_daxpy, right, &header, hlen, loc, sizeof(int), rheader, rhlen,loc, sizeof(int), &nbh)) fprintf(stderr,"ARMCI_Gpc_exec failed\n"); { int m,n,k; char notr='n'; DoubleComplex ZERO; usleep(100); ZERO.real=0.;ZERO.imag=0.; m=n=k=DGS; t0=MPI_Wtime(); #ifdef DGEMM_WORK for(j=0;j<4*15;j++){ c_alpha=c_alpha+j*rand(); dgemm_(¬r,¬r,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1); } #elif IUNIT_WORK for(j=0;j<2*LOOP*100;j++){ for(i=0;i<LOOP*100;i++){ tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2]; } } #elif DAXPY_WORK for(j=0;j<tmp_loop*80;j++){ alpha=alpha+j*rand(); daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE); } #endif t1=MPI_Wtime()-t0; printf("\n%d:Compute_During_Daxpy %d %f\n",me,accloop,t1); } ARMCI_Gpc_wait(&nbh); MPI_Barrier(MPI_COMM_WORLD); ARMCI_Gpc_init_handle(&nbh); if(ARMCI_Gpc_exec(gpcwork_dgemm, right, &header, hlen, loc, sizeof(int), rheader, rhlen,loc, sizeof(int), &nbh)) fprintf(stderr,"ARMCI_Gpc_exec failed\n"); { int m,n,k; char notr='n'; DoubleComplex ZERO; usleep(100); ZERO.real=0.;ZERO.imag=0.; m=n=k=DGS; t0=MPI_Wtime(); #ifdef DGEMM_WORK for(j=0;j<4*15;j++){ c_alpha=c_alpha+j*rand(); 
dgemm_(¬r,¬r,&m,&n,&k,&c_alpha,c_dga,&m,c_dgb,&n,&ZERO,c_dgc,&k,1,1); } #elif IUNIT_WORK for(j=0;j<2*LOOP*100;j++){ for(i=0;i<LOOP*100;i++){ tmpbuf1[i]=tmpbuf1[i]*1.1214+i/tmpbuf1[j/2]; } } #elif DAXPY_WORK for(j=0;j<tmp_loop*80;j++){ alpha=alpha+j*rand(); daxpy_(&N,&alpha,tmpbuf1,&ONE,tmpbuf2,&ONE); } #endif t1=MPI_Wtime()-t0; printf("\n%d:Compute_During_Dgemm %d %f\n",me,accloop,t1); } ARMCI_Gpc_wait(&nbh); MPI_Barrier(MPI_COMM_WORLD); ARMCI_AllFence(); ARMCI_Finalize(); MPI_Finalize(); }
/**
 * ARMCI group test and translation micro-benchmark.
 *
 * Builds two child groups of the world group, one holding the odd ranks and
 * one holding the even ranks.  Rank 0 then times one million calls each of
 * ARMCII_Translate_absolute_to_group and ARMCI_Absolute_id on the even group
 * and prints the average cost per call in microseconds.  Finally every
 * process frees the group matching its own parity.
 *
 * @return 0 on completion.
 */
int main(int argc, char **argv) {
  int          my_rank, world_size;
  int          p, count;
  int         *members;
  ARMCI_Group  g_world, g_odd, g_even;

  MPI_Init(&argc, &argv);
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  /* Room for the larger (even) group: ceil(world_size / 2) entries. */
  members = malloc(sizeof(int) * ((world_size + 1) / 2));

  if (my_rank == 0)
    printf("ARMCI Group test starting on %d procs\n", world_size);

  ARMCI_Group_get_world(&g_world);

  if (my_rank == 0)
    printf(" + Creating odd group\n");

  count = 0;
  for (p = 1; p < world_size; p += 2)
    members[count++] = p;

  ARMCI_Group_create_child(count, members, &g_odd, &g_world);

  if (my_rank == 0)
    printf(" + Creating even group\n");

  count = 0;
  for (p = 0; p < world_size; p += 2)
    members[count++] = p;

  ARMCI_Group_create_child(count, members, &g_even, &g_world);

  /***********************************************************************/
  {
    const int iter = 1000000;

    if (my_rank == 0) {
      int    even_rank, even_size, rep;
      double start, t_abs_to_grp, t_grp_to_abs;

      ARMCI_Group_rank(&g_even, &even_rank);
      ARMCI_Group_size(&g_even, &even_size);

      /* Absolute id -> group rank. */
      start = MPI_Wtime();
      for (rep = 0; rep < iter; rep++)
        ARMCII_Translate_absolute_to_group(&g_even, (even_rank+1) % even_size);
      t_abs_to_grp = MPI_Wtime() - start;

      /* Group rank -> absolute id. */
      start = MPI_Wtime();
      for (rep = 0; rep < iter; rep++)
        ARMCI_Absolute_id(&g_even, (even_rank+1) % even_size);
      t_grp_to_abs = MPI_Wtime() - start;

      printf("t_abs_to_grp = %f us, t_grp_to_abs = %f us\n", t_abs_to_grp/iter * 1.0e6, t_grp_to_abs/iter * 1.0e6);
    }

    ARMCI_Barrier();
  }
  /***********************************************************************/

  if (my_rank == 0)
    printf(" + Freeing groups\n");

  {
    const int is_odd = (my_rank % 2) > 0;

    if (!is_odd)
      ARMCI_Group_free(&g_even);
    else
      ARMCI_Group_free(&g_odd);
  }

  free(members);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}
int main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; MP_INIT(argc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } /* num_rows = (int) sqrt((double) nproc); */ /* for (;;) { */ /* num_cols = nproc/num_rows; */ /* if (num_rows*num_cols == nproc) */ /* break; */ /* num_rows--; */ /* } */ nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } nnodes = nproc / 4; if((nnodes * 4) != nproc) { num_cols = nproc - nnodes * 4; nnodes++; num_rows = 1; } else { num_cols = 2; num_rows = 2; } num = (nblocks * nblocks)/nnodes; if((num * nnodes) != (nblocks * nblocks)) num++; #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif edge = n%block_size; if (edge == 0) { edge = block_size; } for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double 
**)malloc(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
int main(int argc, char **argv) { int me,nproc; int status; int rank; /* initialization */ MPI_Init(&argc, &argv); ARMCI_Init(); #ifdef HPC_PROFILING HPM_Init(); #endif MPI_Comm_rank(MPI_COMM_WORLD,&me); MPI_Comm_size(MPI_COMM_WORLD,&nproc); #ifdef DEBUG if(me == 0){ printf("The result of MPI_Comm_size is %d\n",nproc); fflush(stdout); } #endif /* get the matrix parameters */ if (argc > 1){ rank = atoi(argv[1]); } else { rank = 8; } if (me == 0){ printf("Running matmul.x with rank = %d\n",rank); fflush(stdout); } /* register remote pointers */ double** addr_A = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_A == NULL) ARMCI_Error("malloc A failed at line",0); double** addr_B = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_B == NULL) ARMCI_Error("malloc B failed at line",0); double** addr_C = (double **) ARMCI_Malloc_local(sizeof(double *) * nproc); if (addr_C == NULL) ARMCI_Error("malloc C failed at line",0); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc A requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_A, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc A failed",status); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc B requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_B, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc B failed",status); #ifdef DEBUG if(me == 0) printf("ARMCI_Malloc C requests %lu bytes\n",rank*rank*sizeof(double)); fflush(stdout); #endif status = ARMCI_Malloc((void **) addr_C, rank*rank*sizeof(double)); if (status != 0) ARMCI_Error("ARMCI_Malloc C failed",status); MPI_Barrier(MPI_COMM_WORLD); /* free ARMCI pointers */ ARMCI_Free_local(addr_C); ARMCI_Free_local(addr_B); ARMCI_Free_local(addr_A); #ifdef HPC_PROFILING HPM_Print(); #endif /* the end */ ARMCI_Finalize(); MPI_Finalize(); return(0); }
int main(int argc, char *argv[]) { int ch; extern char *optarg; int i, j, r; thread_t threads[MAX_TPP]; /* init MP */ MP_INIT(argc,argv); MP_PROCS(&size); MP_MYID(&rank); while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) { switch(ch) { case 't': /* # of threads */ tpp = atoi(optarg); if (tpp < 1 || tpp > MAX_TPP) { PRINTF0("\"%s\" is improper value for -t, should be a " "number between 1 and %d(MAX_TPP)\n", optarg, MAX_TPP); usage(); } break; case 'i': /* # of iterations */ iters = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -t, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 's': /* # of elements in the array */ asize = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -s, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 'd': delay = atoi(optarg); break; /* delay before start */ case 'h': usage(); break; /* print usage info */ } } #ifdef NOTHREADS tpp = 1; PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n"); #endif th_size = size * tpp; PRINTF0("\nTest of multi-threaded capabilities:\n" "%d threads per process (%d threads total),\n" "%d array elements of size %d,\n" "%d iteration(s)\n\n", tpp, th_size, asize, sizeof(atype_t), iters); if (delay) { printf("%d: %d\n", rank, getpid()); fflush(stdout); sleep(delay); MP_BARRIER(); } TH_INIT(size,tpp); for (i = 0; i < tpp; i++) th_rank[i] = rank * tpp + i; #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) { fname[10] = '0' + th_rank[i] / 100; fname[11] = '0' + th_rank[i] % 100 / 10; fname[12] = '0' + th_rank[i] % 10; dbg[i] = fopen(fname, "w"); } #endif for (i = 0; i < tpp; i++) prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]); /* init ARMCI */ ARMCI_Init(); /* set global seed (to ensure same random sequence across procs) */ time_seed = (unsigned)time(NULL); armci_msg_brdcst(&time_seed, sizeof(time_seed), 0); srand(time_seed); rand(); prndbg(0, "seed = %u\n", 
time_seed); /* random pairs */ pairs = calloc(th_size, sizeof(int)); for (i = 0; i < th_size; i++) pairs[i] = -1; for (i = 0; i < th_size; i++) { if (pairs[i] != -1) continue; r = RND(0, th_size); while (i == r || pairs[r] != -1 ) r = RND(0, th_size); pairs[i] = r; pairs[r] = i; } for (i = 0, cbufl = 0; i < th_size; i++) cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d", i, pairs[i], pairs[i], pairs[pairs[i]]); prndbg(0, "random pairs:%s\n", cbuf); /* random targets */ rnd_tgts = calloc(th_size, sizeof(int)); for (i = 0, cbufl = 0; i < th_size; i++) { rnd_tgts[i] = RND(0, th_size); if (rnd_tgts[i] == i) { i--; continue; } cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]); } prndbg(0, "random targets:%s\n", cbuf); /* random one */ rnd_one = RND(0, th_size); prndbg(0, "random one = %d\n", rnd_one); assert(ptrs1 = calloc(th_size, sizeof(void *))); assert(ptrs2 = calloc(th_size, sizeof(void *))); #ifdef NOTHREADS thread_main((void *)(long)0); #else for (i = 0; i < tpp; i++) THREAD_CREATE(threads + i, thread_main, (void *)(long)i); for (i = 0; i < tpp; i++) THREAD_JOIN(threads[i], NULL); #endif MP_BARRIER(); PRINTF0("Tests Completed\n"); /* clean up */ #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) fclose(dbg[i]); #endif ARMCI_Finalize(); TH_FINALIZE(); MP_FINALIZE(); return 0; }