/**
 * Driver for the one-dimensional transpose test.
 *
 * Starts the message-passing layer and ARMCI, runs TRANSPOSE1D() on every
 * process, and shuts both layers down again. Process 0 prints the process
 * count up front and a termination banner at the end.
 *
 * @param argc  argument count, forwarded to armci_msg_init()
 * @param argv  argument vector, forwarded to armci_msg_init()
 * @return 0
 */
int main(int argc, char **argv)
{
    int rank;
    int nranks;

    /* The message-passing library is started first, then ARMCI. */
    armci_msg_init(&argc, &argv);
    ARMCI_Init();

    rank = armci_msg_me();
    nranks = armci_msg_nproc();

    if (0 == rank) {
        printf("\nUsing %d processes\n\n", nranks);
        fflush(stdout);
    }

    TRANSPOSE1D();

    if (0 == rank) {
        printf("\nTerminating ..\n");
    }

    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
int main(int argc, char **argv) { armci_msg_init(&argc,&argv); ARMCI_Init_args(&argc, &argv); me = armci_msg_me(); nproc = armci_msg_nproc(); /* This test only works for two processes */ assert(nproc == 2); if (0 == me) { printf("msg size (bytes) avg time (us) avg b/w (MB/sec)\n"); } if (0 == me) { printf("#PNNL comex Put Test\n"); } contig_test(MAX_MESSAGE_SIZE, PUT); if (0 == me) { printf("#PNNL comex Get Test\n"); } contig_test(MAX_MESSAGE_SIZE, GET); if (0 == me) { printf("#PNNL comex Accumulate Test\n"); } contig_test(MAX_MESSAGE_SIZE, ACC); ARMCI_Finalize(); armci_msg_finalize(); return 0; }
/**
 * Driver for the armci_notify test.
 *
 * After initialization, process 0 prints a banner (bracketed by barriers),
 * then every process runs test_notify() for each dimensionality from 1 up
 * to and including MAXDIMS, followed by a final barrier and shutdown.
 *
 * @param argc  argument count, forwarded to the init routines
 * @param argv  argument vector, forwarded to the init routines
 * @return 0
 */
int main(int argc, char *argv[])
{
    int dim = 1;

    armci_msg_init(&argc, &argv);
    ARMCI_Init_args(&argc, &argv);

    nproc = armci_msg_nproc();
    me = armci_msg_me();

    ARMCI_Barrier();
    if (me == 0) {
        printf("\nTesting armci_notify\n");
        fflush(stdout);
        sleep(1);
    }
    ARMCI_Barrier();

    while (dim <= MAXDIMS) {
        test_notify(dim);
        dim++;
    }

    ARMCI_Barrier();
    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
/**
 * Verify the result of the 1-D transpose.
 *
 * Only process 0 does any work: it fetches each process's portion of
 * array B with ARMCI_Get() and checks that the element with global index k
 * holds the value dims[0] - k. Every mismatch is reported, followed by a
 * one-line overall verdict. All other processes return immediately.
 *
 * @param b_ptr  per-process base addresses of array B, indexed by process id
 * @param dims   dims[0] is the total number of elements in the array
 * @param map    map[i] is the first (zero-based) global index owned by
 *               process i
 */
void VERIFY(void **b_ptr, int *dims, int *map)
{
    int i, j, count, maxcount, icnt, ichk, lmin, lmax, rc;
    int *buf;
    int me, nprocs;

    /* Find local processor ID and number of processors */
    me = armci_msg_me();
    nprocs = armci_msg_nproc();

    /* Only process 0 verifies; nobody else needs a scratch buffer. */
    if (me != 0) {
        return;
    }

    /* Size the scratch buffer from the largest chunk actually described by
       map, so it is big enough however the distribution was computed. */
    maxcount = 1;
    for (i = 0; i < nprocs; i++) {
        lmin = map[i];
        lmax = (i < nprocs - 1) ? map[i + 1] - 1 : dims[0] - 1;
        if (lmax - lmin + 1 > maxcount) {
            maxcount = lmax - lmin + 1;
        }
    }

    buf = (int *)malloc((size_t)maxcount * sizeof(int));
    if (buf == NULL) {
        fprintf(stderr, "VERIFY: could not allocate buffer for %d elements\n",
                maxcount);
        return;
    }

    icnt = 0;
    ichk = 0;
    for (i = 0; i < nprocs; i++) {
        /* Find min and max indices owned by processor i */
        lmin = map[i];
        if (i < nprocs - 1) {
            lmax = map[i + 1] - 1;
        } else {
            lmax = dims[0] - 1;
        }
        count = lmax - lmin + 1;

        /* Fetch processor i's chunk of B; the length is given in bytes. */
        rc = ARMCI_Get(b_ptr[i], (void *)buf, (int)sizeof(int) * count, i);
        if (rc != 0) {
            printf("ARMCI_Get from process %d failed (rc=%d)\n", i, rc);
            ichk = 1;
            icnt += count; /* keep the global index in step */
            continue;
        }

        /* check values in buffer */
        for (j = 0; j < count; j++) {
            if (buf[j] != dims[0] - icnt) {
                printf("Error found for element %d b: %d != a: %d\n",
                       icnt, buf[j], dims[0] - icnt);
                ichk = 1;
            }
            icnt++;
        }
    }

    if (ichk == 0) {
        printf("1D transpose successful. No errors found\n");
    } else {
        printf("1D transpose failed\n");
    }

    free(buf);
}
void armci_group_init() { int grp_me; #ifdef ARMCI_GROUP int i; #endif ARMCI_iGroup *igroup = (ARMCI_iGroup *)&ARMCI_World_Proc_Group; #ifdef ARMCI_GROUP /*setup the world proc group*/ /* MPI_Comm_size(MPI_COMM_WORLD, &igroup->grp_attr.nproc); MPI_Comm_rank(MPI_COMM_WORLD, &igroup->grp_attr.grp_me); */ igroup->grp_attr.nproc = armci_msg_nproc(); igroup->grp_attr.grp_me = armci_msg_me(); igroup->grp_attr.proc_list = (int *)malloc(igroup->grp_attr.nproc*sizeof(int)); assert(igroup->grp_attr.proc_list != NULL); for(i=0; i<igroup->grp_attr.nproc; i++) { igroup->grp_attr.proc_list[i] = i; } igroup->grp_attr.grp_clus_info = NULL; armci_cache_attr((ARMCI_Group*)&ARMCI_World_Proc_Group); #else /* save MPI world group and communicatior in ARMCI_World_Proc_Group */ igroup->icomm = MPI_COMM_WORLD; MPI_Comm_group(MPI_COMM_WORLD, &(igroup->igroup)); /* processes belong to this group should cache attributes */ MPI_Group_rank((MPI_Group)(igroup->igroup), &grp_me); if(grp_me != MPI_UNDEFINED) { armci_cache_attr((ARMCI_Group*)&ARMCI_World_Proc_Group); } #endif /* Initially, World group is the default group */ ARMCI_Default_Proc_Group = ARMCI_World_Proc_Group; }
/**
 * Driver for the sparse matrix-vector multiplication test.
 *
 * Process 0 rejects runs with more than MAXPROC processes via
 * ARMCI_Error() and prints the banners; every process then runs
 * test_sparse(), fences and synchronizes, and shuts down.
 *
 * @param argc  argument count, forwarded to armci_msg_init()
 * @param argv  argument vector, forwarded to armci_msg_init()
 * @return 0
 */
int main(int argc, char *argv[])
{
    armci_msg_init(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    if (me == 0) {
        if (nproc > MAXPROC) {
            ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
        }
        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);
    }

    ARMCI_Init();

    if (me == 0) {
        printf("\n Performing Sparse Matrix-Vector Multiplication ...\n\n");
        fflush(stdout);
    }

    test_sparse();

    /* Complete all outstanding operations before declaring success. */
    ARMCI_AllFence();
    armci_msg_barrier();

    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    armci_msg_barrier();
    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
/**
 * Driver for the aggregate put/get test.
 *
 * Process 0 rejects runs with more than MAXPROC processes via
 * ARMCI_Error() and prints the banners. Every process runs
 * test_aggregate() twice, first with argument 1 (cold start) and then
 * with argument 0 (warm start), then fences, synchronizes and shuts down.
 *
 * @param argc  argument count, forwarded to ARMCI_Init_args()
 * @param argv  argument vector, forwarded to ARMCI_Init_args()
 * @return 0
 */
int main(int argc, char *argv[])
{
    ARMCI_Init_args(&argc, &argv);
    nproc = armci_msg_nproc();
    me = armci_msg_me();

    if (me == 0) {
        if (nproc > MAXPROC) {
            ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
        }

        printf("ARMCI test program (%d processes)\n", nproc);
        fflush(stdout);
        sleep(1);

        printf("\nAggregate put/get requests\n\n");
        fflush(stdout);
    }

    test_aggregate(1); /* cold start */
    test_aggregate(0); /* warm start */

    ARMCI_AllFence();
    ARMCI_Barrier();

    if (me == 0) {
        printf("\nSuccess!!\n");
        fflush(stdout);
    }
    sleep(2);

    ARMCI_Barrier();
    ARMCI_Finalize();
    armci_msg_finalize();

    return 0;
}
/** Map process IDs onto a binary tree.
 *
 * For SCOPE_NODE the tree consists of the root alone: the root is 0 and the
 * parent and both children are reported as -1. For any other scope the
 * processes are laid out as an implicit binary heap rooted at process 0:
 * the parent of process p is (p - 1) / 2 and its children are 2p + 1 and
 * 2p + 2. A relative that does not exist is reported as -1.
 *
 * @param[in]  scope Scope of processes involved
 * @param[out] root  Process id of the root
 * @param[out] up    Process id of my parent, or -1 if there is none
 * @param[out] left  Process id of my left child, or -1 if there is none
 * @param[out] right Process id of my right child, or -1 if there is none
 */
void armci_msg_bintree(int scope, int *root, int *up, int *left, int *right)
{
    int me, nproc;

    if (scope == SCOPE_NODE) {
        *root = 0;
        *up = -1; /* every output is written so callers never read garbage */
        *left = -1;
        *right = -1;
        return;
    }

    me = armci_msg_me();
    nproc = armci_msg_nproc();

    *root = 0;
    *up = (me == 0) ? -1 : (me - 1) / 2;

    *left = 2 * me + 1;
    if (*left >= nproc) {
        *left = -1;
    }

    *right = 2 * me + 2;
    if (*right >= nproc) {
        *right = -1;
    }
}
/**
 * Reverse ("transpose") a distributed one-dimensional integer array.
 *
 * Array A holds the values 1..dims[0] spread unevenly over all processes.
 * Each process reverses its local chunk of A into a private buffer and
 * writes the pieces with ARMCI_Put() into the matching positions of the
 * distributed array B, so that B ends up being A in reverse order. The
 * result is checked by VERIFY() before the arrays are released.
 *
 * Allocation failures and failing ARMCI calls are caught with assert(),
 * matching the existing checks on ARMCI_Malloc().
 */
void TRANSPOSE1D()
{
    int dims[1];
    int nelem, i, ierr, min, max, cmin, cmax, lmin, lmax, pmin, pmax;
    int src_offset, dst_offset, length;
    int *buf, *map;
    void *src_ptr, *dst_ptr;
    void **a_ptr, **b_ptr;
    int *a, *b;
    int me, nprocs;

    /* Find local processor ID and number of processors */
    me = armci_msg_me();
    nprocs = armci_msg_nproc();

    /* Allocate pointers to data on all processors. The tables hold void*
       entries, so they are sized by their own element type. */
    a_ptr = (void **)malloc(nprocs * sizeof(*a_ptr));
    b_ptr = (void **)malloc(nprocs * sizeof(*b_ptr));
    map = (int *)malloc(nprocs * sizeof(*map));
    assert(a_ptr != NULL);
    assert(b_ptr != NULL);
    assert(map != NULL);

    /* Configure array dimensions. Force an unequal data distribution */
    dims[0] = nprocs * TOTALELEMS + nprocs / 2;
    if (me == 0) {
        printf("Size of array: %d\n\n", dims[0]);
    }

    /* Find first (zero-based) index of chunk owned by each processor and
       store it in map array */
    for (i = 0; i < nprocs; i++) {
        map[i] = (int)(((double)i) * (((double)dims[0]) / ((double)nprocs)));
    }

    /* Figure out what size my portion of array is */
    if (me < nprocs - 1) {
        nelem = map[me + 1] - map[me];
    } else {
        nelem = dims[0] - map[me];
    }

    /* Allocate memory for array A */
    ierr = ARMCI_Malloc(a_ptr, nelem * sizeof(int));
    assert(ierr == 0);
    assert(a_ptr[me]);

    /* Allocate memory for array B */
    ierr = ARMCI_Malloc(b_ptr, nelem * sizeof(int));
    assert(ierr == 0);
    assert(b_ptr[me]);

    /* initialize data in array A and zero data in array B */
    a = (int *)a_ptr[me];
    b = (int *)b_ptr[me];
    for (i = 0; i < nelem; i++) {
        a[i] = i + map[me] + 1;
        b[i] = 0;
    }

    /* Synchronize all processors to guarantee that everyone has data
       before proceeding to the next step. */
    armci_msg_barrier();

    /* Create local buffer for performing inversion */
    buf = (int *)malloc(nelem * sizeof(*buf));
    assert(buf != NULL);

    /* Copy inverted data into local buffer */
    a = (int *)a_ptr[me];
    for (i = 0; i < nelem; i++) {
        buf[i] = a[nelem - i - 1];
    }

    /* Find out which blocks of array B inverted block should be copied to.
       Start by finding min and max indices of data in array B */
    min = dims[0] - (map[me] + nelem);
    max = dims[0] - map[me] - 1;

    /* Locate processors containing the endpoints */
    pmin = 0;
    for (i = 0; i < nprocs; i++) {
        if (min >= map[i]) {
            pmin = i;
        } else {
            break;
        }
    }
    pmax = nprocs - 1;
    for (i = nprocs - 2; i >= 0; i--) {
        if (max < map[i + 1]) {
            pmax = i;
        } else {
            break;
        }
    }

    /* Loop over processors that will receive data and copy inverted data
       to processors */
    for (i = pmin; i <= pmax; i++) {
        /* Find min and max indices owned by processor i */
        lmin = map[i];
        if (i < nprocs - 1) {
            lmax = map[i + 1] - 1;
        } else {
            lmax = dims[0] - 1;
        }

        /* Find min and max indices that should be sent to processor i */
        if (lmin > min) {
            cmin = lmin;
        } else {
            cmin = min;
        }
        if (lmax < max) {
            cmax = lmax;
        } else {
            cmax = max;
        }

        /* Find offsets on source and destination processors */
        src_offset = cmin - min;
        src_ptr = (void *)(buf + src_offset);
        dst_offset = cmin - lmin;
        dst_ptr = ((char *)b_ptr[i]) + sizeof(int) * dst_offset;

        /* Find length of data (in bytes) to be sent to processor i */
        length = sizeof(int) * (cmax - cmin + 1);

        /* Send data to processor */
        ierr = ARMCI_Put(src_ptr, dst_ptr, length, i);
        assert(ierr == 0);
    }

    /* Make sure every put has completed everywhere before verifying. */
    ARMCI_AllFence();
    armci_msg_barrier();
    free(buf);

    VERIFY(b_ptr, dims, map);

    free(map);
    armci_msg_barrier();
    ARMCI_Free(a_ptr[me]);
    ARMCI_Free(b_ptr[me]);
    free(a_ptr);
    free(b_ptr);
}
int main(int argc, char *argv[]) { int ch; extern char *optarg; int i, j, r; thread_t threads[MAX_TPP]; /* init ARMCI */ ARMCI_Init_args(&argc, &argv); size = armci_msg_nproc(); rank = armci_msg_me(); while ((ch = getopt(argc, argv, "t:s:i:d:h")) != -1) { switch (ch) { case 't': /* # of threads */ tpp = atoi(optarg); if (tpp < 1 || tpp > MAX_TPP) { PRINTF0("\"%s\" is improper value for -t, should be a " "number between 1 and %d(MAX_TPP)\n", optarg, MAX_TPP); usage(); } break; case 'i': /* # of iterations */ iters = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -t, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 's': /* # of elements in the array */ asize = atoi(optarg); if (iters < 1) { PRINTF0("\"%s\" is improper value for -s, should be a " "number equal or larger than 1\n", optarg); usage(); } break; case 'd': delay = atoi(optarg); break; /* delay before start */ case 'h': usage(); break; /* print usage info */ } } #ifdef NOTHREADS tpp = 1; PRINTF0("Warning: NOTHREADS debug symbol is set -- running w/o threads\n"); #endif th_size = size * tpp; PRINTF0("\nTest of multi-threaded capabilities:\n" "%d threads per process (%d threads total),\n" "%d array elements of size %d,\n" "%d iteration(s)\n\n", tpp, th_size, asize, sizeof(atype_t), iters); if (delay) { printf("%d: %d\n", rank, getpid()); fflush(stdout); sleep(delay); ARMCI_Barrier(); } TH_INIT(size, tpp); for (i = 0; i < tpp; i++) { th_rank[i] = rank * tpp + i; } #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) { fname[10] = '0' + th_rank[i] / 100; fname[11] = '0' + th_rank[i] % 100 / 10; fname[12] = '0' + th_rank[i] % 10; dbg[i] = fopen(fname, "w"); } #endif for (i = 0; i < tpp; i++) { prndbg(i, "proc %d, thread %d(%d):\n", rank, i, th_rank[i]); } /* set global seed (to ensure same random sequence across procs) */ time_seed = (unsigned)time(NULL); armci_msg_brdcst(&time_seed, sizeof(time_seed), 0); srand(time_seed); rand(); prndbg(0, "seed 
= %u\n", time_seed); /* random pairs */ pairs = calloc(th_size, sizeof(int)); for (i = 0; i < th_size; i++) { pairs[i] = -1; } for (i = 0; i < th_size; i++) { if (pairs[i] != -1) { continue; } r = RND(0, th_size); while (i == r || pairs[r] != -1) { r = RND(0, th_size); } pairs[i] = r; pairs[r] = i; } for (i = 0, cbufl = 0; i < th_size; i++) cbufl += sprintf(cbuf + cbufl, " %d->%d|%d->%d", i, pairs[i], pairs[i], pairs[pairs[i]]); prndbg(0, "random pairs:%s\n", cbuf); /* random targets */ rnd_tgts = calloc(th_size, sizeof(int)); for (i = 0, cbufl = 0; i < th_size; i++) { rnd_tgts[i] = RND(0, th_size); if (rnd_tgts[i] == i) { i--; continue; } cbufl += sprintf(cbuf + cbufl, " %d", rnd_tgts[i]); } prndbg(0, "random targets:%s\n", cbuf); /* random one */ rnd_one = RND(0, th_size); prndbg(0, "random one = %d\n", rnd_one); assert(ptrs1 = calloc(th_size, sizeof(void *))); assert(ptrs2 = calloc(th_size, sizeof(void *))); #ifdef NOTHREADS thread_main((void *)(long)0); #else for (i = 0; i < tpp; i++) { THREAD_CREATE(threads + i, thread_main, (void *)(long)i); } for (i = 0; i < tpp; i++) { THREAD_JOIN(threads[i], NULL); } #endif ARMCI_Barrier(); PRINTF0("Tests Completed\n"); /* clean up */ #if defined(DEBUG) && defined(LOG2FILE) for (i = 0; i < tpp; i++) { fclose(dbg[i]); } #endif ARMCI_Finalize(); TH_FINALIZE(); armci_msg_finalize(); return 0; }
int main(int argc, char *argv[]) { int i, j; int ch; int edge; int size; int nloop=5; double **ptr_loc; armci_msg_init(&argc,&argv); nproc = armci_msg_nproc(); me = armci_msg_me(); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); armci_msg_barrier(); armci_msg_finalize(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } armci_msg_barrier(); armci_msg_finalize(); exit(0); #endif for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } ptr = (void **)malloc(nproc * sizeof(void *)); #ifdef MPI2_ONESIDED MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]); MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me]; MPI_Barrier(MPI_COMM_WORLD); #else /* initialize ARMCI */ ARMCI_Init(); ARMCI_Malloc(ptr, proc_bytes); #endif a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double 
**)malloc(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ armci_msg_barrier(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); armci_msg_barrier(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } armci_msg_barrier(); } lu(n, block_size, me); /* cold start */ /* Starting the timer */ armci_msg_barrier(); if(me == 0) start_timer(); for(i=0; i<nloop; i++) lu(n, block_size, me); armci_msg_barrier(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %f milliseconds.\n\n", elapsed_time()/nloop); printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } armci_msg_barrier(); } /* done */ #ifdef MPI2_ONESIDED MPI_Win_free(&win); MPI_Free_mem(ptr[me]); #else ARMCI_Free(ptr[me]); ARMCI_Finalize(); #endif armci_msg_finalize(); return 0; }
int main(int argc, char *argv[]) { int rc, i, j = 0, rid, ret; armci_ckpt_ds_t ckptds; ARMCI_Group grp; ARMCI_Init_args(&argc, &argv); nproc = armci_msg_nproc(); me = armci_msg_me(); if (me == 0) { if (nproc > MAXPROCS) { ARMCI_Error("nproc > MAXPROCS", nproc); } else { printf("ARMCI test program (%d processes)\n", nproc); fflush(stdout); sleep(1); } } armci_init_checkpoint2(); ARMCI_Group_get_world(&grp); size = SIZE_; rc = ARMCI_Malloc((void **)ptr_arr, size * 8); printf("ARMCI test program (%d processes)\n", nproc); fflush(stdout); for (size = 1; size <= SIZE_; size *= 2) { t1 = MPI_Wtime(); for (i = 0; i < 5; i++) { for (rc = 0; rc < 15; rc++) { do_work(size); } } time_array[j++] = MPI_Wtime() - t1; ARMCI_Barrier(); printf("%d:done for size %ld\n", me, size); fflush(stdout); } (void)ARMCI_Ckpt_create_ds(&ckptds, 1); ckptds.ptr_arr[0] = ptr_arr[me]; ckptds.sz[0] = SIZE_ * 8; rid = ARMCI_Ckpt_init(NULL, &grp, 1, 0, &ckptds); printf("%d: After ARMCI_Ckpt_init(): \n", me); j = 0; for (size = 128; size <= SIZE_; size *= 2) { int rc; int simulate_restart = 1; t1 = MPI_Wtime(); ret = ARMCI_Ckpt(rid); if (ret == ARMCI_CKPT) { printf("%d: Performed CHECKPOINT @ size=%ld\n", me, size); } else if (ret == ARMCI_RESTART) { simulate_restart = 0; printf("%d: Performed RESTART @ size=%ld\n", me, size); } for (i = 0; i < 5; i++) { for (rc = 0; rc < 15; rc++) if (i == 3 && rc == 10) { } do_work(size); } time_array1[j++] = MPI_Wtime() - t1; sleep(1); if (simulate_restart && size == FAILURE_SIZE_) { printf("%d: Simulating FAILURE @ size = %d\n", me, size); ARMCI_Restart_simulate(rid, 1); } printf("%d: DONE for size=%ld regular=%f withckpt=%f\n\n", me, size, time_array[j-1], time_array1[j-1]); fflush(stdout); } ARMCI_Ckpt_finalize(rid); printf("Before Finalize()\n"); ARMCI_Barrier(); ARMCI_Finalize(); armci_msg_finalize(); return(0); }
int main(int argc, char *argv[]) { int i, j; int ch; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; armci_msg_init(&argc,&argv); nproc = armci_msg_nproc(); me = armci_msg_me(); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); armci_msg_barrier(); armci_msg_finalize(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } /* num_rows = (int) sqrt((double) nproc); */ /* for (;;) { */ /* num_cols = nproc/num_rows; */ /* if (num_rows*num_cols == nproc) */ /* break; */ /* num_rows--; */ /* } */ nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } nnodes = nproc / 4; if((nnodes * 4) != nproc) { num_cols = nproc - nnodes * 4; nnodes++; num_rows = 1; } else { num_cols = 2; num_rows = 2; } num = (nblocks * nblocks)/nnodes; if((num * nnodes) != (nblocks * nblocks)) num++; #ifdef DEBUG if(me == 0) for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) printf("%d ", block_owner(i, j)); printf("\n"); } armci_msg_barrier(); armci_msg_finalize(); exit(0); #endif edge = n%block_size; if (edge == 0) { edge = block_size; } for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init_args(&argc, &argv); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory 
for a\n"); exit(-1); } ptr_loc = (double **)malloc(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks; i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ armci_msg_barrier(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); armci_msg_barrier(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } armci_msg_barrier(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); armci_msg_barrier(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %f milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } armci_msg_barrier(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); armci_msg_finalize(); return 0; }
main(int argc, char *argv[]) { int i, j, l; int ch; extern char *optarg; int edge; int size; int lu_arg[MAX_THREADS][3]; /* ARMCI */ void **ptr; double **ptr_loc; THREAD_LOCK_INIT(mutex); armci_msg_init(&argc,&argv); nproc = armci_msg_nproc(); me = armci_msg_me(); while ((ch = getopt(argc, argv, "n:b:p:t:d:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 't': th_per_p = atoi(optarg); break; case 'd': d = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC -tTH_PER_P\n"); armci_msg_barrier(); armci_msg_finalize(); exit(0); } } } if(th_per_p>MAX_THREADS) { th_per_p=MAX_THREADS; if(me==0)printf("Warning: cannot run more than %d threads, adjust MAX_THREADS",MAX_THREADS); } if (d) { fprintf(stderr, "%d: %d\n", me, getpid()); sleep(d); } nthreads = th_per_p * nproc; if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d thread(s) per processor, %d threads total\n", th_per_p, nthreads); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nthreads); for (;;) { num_cols = nthreads/num_rows; if (num_rows*num_cols == nthreads) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } num = (nblocks * nblocks)/nthreads; if((num * nthreads) != (nblocks * nblocks)) num++; edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } armci_msg_barrier(); /* armci_msg_finalize(); */ /* exit(0); */ #endif for (l = 0; l < th_per_p; l++) { me_th[l] = me * th_per_p + l; for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me_th[l]) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == 
nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } thread_doubles[l] += size; } } } proc_bytes += thread_doubles[l] * sizeof(double); } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nthreads*sizeof(double *)); for (i = 0; i < nproc; i++) { ptr_loc[i * th_per_p] = (double *)ptr[i]; for (j = 1; j < th_per_p; j++) ptr_loc[i * th_per_p + j] = ptr_loc[i * th_per_p + j - 1] + thread_doubles[j - 1]; } for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } #if 0 for(i=0; i<nblocks*nblocks;i ++) printf("%d: a[%d]=%p\n", me, i, a[i]); fflush(stdout); #endif /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ armci_msg_barrier(); /* to remove cold-start misses, all processors touch their own data */ /* for (l = 0; l < th_per_p; l++) touch_array(block_size, me_th[l]); */ armci_msg_barrier(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } armci_msg_barrier(); } #if 1 for (i = 0; i < nblocks; i++) for (j = 0; j < nblocks; j++) print_block_dbg(a[i + j * nblocks], "proc %d, a[%d, %d]:\n", me, i, j); #endif TH_INIT(nproc,th_per_p); /* Starting the timer */ if(me == 0) start_timer(); for (l = 0; l < th_per_p; l++) { lu_arg[l][0] = n; lu_arg[l][1] = block_size; lu_arg[l][2] = l; THREAD_CREATE(threads + l, lu, lu_arg[l]); } for (l = 0; l < th_per_p; l++) THREAD_JOIN(threads[l], NULL); armci_msg_barrier(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf 
milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } armci_msg_barrier(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); armci_msg_finalize(); THREAD_LOCK_DESTROY(mutex); }