void print_array(int myid) { int i, j, k; double **buf; int ii, jj; int edge; int ibs, jbs, skip; buf = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *)); for(i=0; i<nblocks; i++) for(j=0; j<nblocks; j++) if(block_owner(i, j) == myid) buf[i+j*nblocks] = a[i+j*nblocks]; else { buf[i+j*nblocks] = (double *)ARMCI_Malloc_local(block_size*block_size* sizeof(double)); get_remote(buf[i+j*nblocks], i, j); } /* copied from lu.C */ edge = n%block_size; for (i=0; i<n; i++) { for (j=0; j<n; j++) { if ((n - i) <= edge) { ibs = edge; ibs = n-edge; skip = edge; } else { ibs = block_size; skip = block_size; } if ((n - j) <= edge) { jbs = edge; jbs = n-edge; } else { jbs = block_size; } ii = (i/block_size) + (j/block_size)*nblocks; jj = (i%ibs)+(j%jbs)*skip; printf("%8.1f ", buf[ii][jj]); } printf("\n"); } fflush(stdout); for(i=0; i<nblocks; i++) for(j=0; j<nblocks; j++) if(block_owner(i, j) != myid) ARMCI_Free_local(buf[i+j*nblocks]); ARMCI_Free_local(buf); }
/*
 * Copy block (I, J) from the process reported by block_owner() into buf
 * using ARMCI_Get.
 *
 * The number of bytes transferred depends on the block position: blocks in
 * the last block row and/or last block column use the remainder n % block_size
 * (or block_size when the remainder is zero) for that dimension.
 *
 * buf:  destination buffer for the block contents
 * I, J: block row and block column indices
 */
void get_remote(double *buf, int I, int J)
{
    int owner;
    int tail;
    int rows, cols;
    int nbytes;

    owner = block_owner(I, J);

    tail = n % block_size;
    if (tail == 0) {
        tail = block_size;
    }

    rows = (I == nblocks-1) ? tail : block_size;
    cols = (J == nblocks-1) ? tail : block_size;

    nbytes = rows * cols;
    nbytes = nbytes * sizeof(double);

    ARMCI_Get(a[I+J*nblocks], buf, nbytes, owner);
}
void get_remote(double *buf, int I, int J) { int proc_owner; int edge, size; #ifdef USE_MUTEX THREAD_LOCK(mutex); #endif proc_owner = block_owner(I, J) / th_per_p; edge = n%block_size; if (edge == 0) { edge = block_size; } if ((I == nblocks-1) && (J == nblocks-1)) { size = edge*edge; } else if ((I == nblocks-1) || (J == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } size = size * sizeof(double); if (proc_owner == me) memcpy(buf, a[I+J*nblocks], size); else ARMCI_Get(a[I+J*nblocks], buf, size, proc_owner); #ifdef USE_MUTEX THREAD_UNLOCK(mutex); #endif }
void print_block() { int i, j, k; for(i=0; i<nblocks; i++) for(j=0; j<nblocks; j++) if(block_owner(i,j) == me) { printf("Block %d (%d,%d)\t", i+j*nblocks, i, j); for(k=0; k<block_size*block_size; k++) printf("%8.1f ", a[i+j*nblocks][k]); printf("\t me = %d\n", me); } }
void get_remote(double *buf, int I, int J) { int proc_owner; int edge, size; double t1; proc_owner = block_owner(I, J); edge = n%block_size; if (edge == 0) { edge = block_size; } if ((I == nblocks-1) && (J == nblocks-1)) { size = edge*edge; } else if ((I == nblocks-1) || (J == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } size = size * sizeof(double); t1 = MPI_Wtime(); #ifdef MPI2_ONESIDED { int target_disp = ( ((char*)(a[I+J*nblocks])) - ((char*)(ptr[proc_owner])) ); if(target_disp<0) { printf("ERROR!: target disp is < 0, target_disp= %d\n", target_disp); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc_owner, 0, win); MPI_Get(buf, size, MPI_CHAR, proc_owner, target_disp, size, MPI_CHAR, win); MPI_Win_unlock(proc_owner, win); } #else ARMCI_Get(a[I+J*nblocks], buf, size, proc_owner); #endif comm_time += MPI_Wtime() - t1; get_cntr++; }
/*
 * Read every element of every block whose block_owner() matches one of the
 * ids in me_th[0 .. th_per_p-1] and return the sum of the values read.
 *
 * bs: block size used to derive the block dimensions
 * me: not read by this function; ownership is decided through me_th[]
 *
 * Returns the accumulated sum.
 */
double touch_array(int bs, int me)
{
    double sum = 0.0;
    int th;
    int I, J;
    int r, c;

    for (th = 0; th < th_per_p; th++) {
        for (J = 0; J < nblocks; J++) {
            for (I = 0; I < nblocks; I++) {
                int rows, cols;
                double *blk;

                if (block_owner(I, J) != me_th[th]) {
                    continue;
                }

                /* Last block column/row may be partial. */
                cols = bs;
                if (J == nblocks-1 && n%bs != 0) {
                    cols = n%bs;
                }
                rows = bs;
                if (I == nblocks-1 && n%bs != 0) {
                    rows = n%bs;
                }

                blk = a[I+J*nblocks];
                for (c = 0; c < cols; c++) {
                    for (r = 0; r < rows; r++) {
                        sum += blk[r+c*rows];
                    }
                }
            }
        }
    }

    return sum;
}
/*
 * Fill the locally stored part of the matrix with deterministic values.
 *
 * For every global element (i, j) whose block is owned by one of the ids in
 * me_th[0 .. th_per_p-1], the value i + j*6 + 1 is stored; elements with
 * i == j are additionally multiplied by 10.
 *
 * Block (I, J) lives at a[I + J*nblocks]; the element at local (row, col)
 * is stored at row + col*rows, where rows is the number of rows the block
 * actually holds.
 */
void init_array()
{
    int i, j, l;
    int edge;

    /* Seeds the drand48 family; no random numbers are drawn here. */
    srand48((long) 1);

    /* Rows of the last (partial) block row; 0 when block_size divides n. */
    edge = n % block_size;

    for (l = 0; l < th_per_p; l++) {
        for (j = 0; j < n; j++) {
            for (i = 0; i < n; i++) {
                int ii, jj, skip;

                if (block_owner((i/block_size), (j/block_size)) != me_th[l]) {
                    continue;
                }

                /* Leading dimension of the block holding row i. */
                skip = ((n - i) <= edge) ? edge : block_size;

                ii = (i/block_size) + (j/block_size)*nblocks;
                /* Blocks start on multiples of block_size, so the local
                 * coordinates are i % block_size and j % block_size.  (The
                 * previous form divided by n-edge, which is zero when
                 * n < block_size.) */
                jj = (i % block_size) + (j % block_size)*skip;

                a[ii][jj] = i + j*6 + 1;
                if (i == j) {
                    a[ii][jj] *= 10;
                }
            }
        }
    }
}
/*
 * Fill the locally stored part of the matrix with deterministic values.
 *
 * For every global element (i, j) whose block is owned by `me`, the value
 * i + j*6 + 1 is stored; elements with i == j are additionally multiplied
 * by 10.
 *
 * Block (I, J) lives at a[I + J*nblocks]; the element at local (row, col)
 * is stored at row + col*rows, where rows is the number of rows the block
 * actually holds.
 */
void init_array()
{
    int i, j;
    int edge;

    /* Seeds the drand48 family; no random numbers are drawn here. */
    srand48((long) 1);

    /* Rows of the last (partial) block row; 0 when block_size divides n. */
    edge = n % block_size;

    for (j = 0; j < n; j++) {
        for (i = 0; i < n; i++) {
            int ii, jj, skip;

            if (block_owner((i/block_size), (j/block_size)) != me) {
                continue;
            }

            /* Leading dimension of the block holding row i. */
            skip = ((n - i) <= edge) ? edge : block_size;

            ii = (i/block_size) + (j/block_size)*nblocks;
            /* Blocks start on multiples of block_size, so the local
             * coordinates are i % block_size and j % block_size.  (The
             * previous form divided by n-edge, which is zero when
             * n < block_size.) */
            jj = (i % block_size) + (j % block_size)*skip;

            a[ii][jj] = i + j*6 + 1;
            if (i == j) {
                a[ii][jj] *= 10;
            }
        }
    }
}
main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; int nloop=5; double **ptr_loc; MP_INIT(arc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif for (i=0; i<nblocks; i++) { for (j=0; j<nblocks; j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } ptr = (void **)malloc(nproc * sizeof(void *)); #ifdef MPI2_ONESIDED MPI_Alloc_mem(proc_bytes, MPI_INFO_NULL, &ptr[me]); MPI_Win_create((void*)ptr[me], proc_bytes, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win); for(i=0; i<nproc; i++) ptr[i] = (double *)ptr[me]; MPI_Barrier(MPI_COMM_WORLD); #else /* initialize ARMCI */ ARMCI_Init(); ARMCI_Malloc(ptr, proc_bytes); #endif a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nproc*sizeof(double *)); 
for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks; i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } lu(n, block_size, me); /* cold start */ /* Starting the timer */ MP_BARRIER(); if(me == 0) start_timer(); for(i=0; i<nloop; i++) lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()/nloop); printf("%d: (ngets=%d) Communication (get) time = %e milliseconds\n", me, get_cntr, comm_time*1000/nloop); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ #ifdef MPI2_ONESIDED MPI_Win_free(&win); MPI_Free_mem(ptr[me]); #else ARMCI_Free(ptr[me]); ARMCI_Finalize(); #endif MP_FINALIZE(); }
/*
 * Blocked LU factorisation of the distributed n x n matrix.
 *
 * For every diagonal step K the owner of block (K, K) applies lu0() to it;
 * after a barrier each process applies bdiv() to the blocks it owns in
 * block column K and bmodd() to the blocks it owns in block row K, using
 * the diagonal block (local or fetched with get_remote()).  After a second
 * barrier each process applies bmod() to every block (I, J) it owns with
 * I, J > K, fetching blocks (I, K) and (K, J) remotely when it does not own
 * them.
 *
 * n:  matrix dimension
 * bs: block size
 * me: id compared against block_owner() to select this process's blocks
 *
 * Exits with -1 if the two scratch buffers cannot be allocated.
 */
void lu(int n, int bs, int me)
{
    int i, j, k, kl;
    int I, J, K;
    double *A, *B, *C, *D;
    int strI, strJ, strK;
    double *buf1, *buf2;

    /* temporary memories for remote blocks */
    buf1 = (double *)malloc(block_size*block_size*sizeof(double));
    buf2 = (double *)malloc(block_size*block_size*sizeof(double));
    if (buf1 == NULL || buf2 == NULL) {
        fprintf(stderr, "Could not malloc memory for lu buffers\n");
        exit(-1);
    }

    for (k=0, K=0; k<n; k+=bs, K++) {
        /* kl: first index past diagonal block K; strK: its size. */
        kl = k + bs;
        if (kl > n) {
            kl = n;
        }
        strK = kl - k;

        /* factor diagonal block */
        if (block_owner(K, K) == me) {
            A = a[K+K*nblocks];
            lu0(A, strK, strK);
        }

        MP_BARRIER();

        /* divide column k by diagonal block */
        if (block_owner(K, K) == me) {
            D = a[K+K*nblocks];
        } else {
            D = buf1;
            get_remote(D, K, K);
        }

        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            if (block_owner(I, K) == me) { /* parcel out blocks */
                strI = (i + bs > n) ? (n - i) : bs;
                A = a[I+K*nblocks];
                bdiv(A, D, strI, strK, strI, strK);
            }
        }

        /* modify row k by diagonal block */
        for (j=kl, J=K+1; j<n; j+=bs, J++) {
            if (block_owner(K, J) == me) { /* parcel out blocks */
                strJ = (j + bs > n) ? (n - j) : bs;
                A = a[K+J*nblocks];
                bmodd(D, A, strK, strJ, strK, strK);
            }
        }

        MP_BARRIER();

        /* modify subsequent block columns */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            strI = (i + bs > n) ? (n - i) : bs;

            /* D is no longer needed at this point, so buf1 is reused. */
            if (block_owner(I,K) == me) {
                A = a[I+K*nblocks];
            } else {
                A = buf1;
                get_remote(A, I, K);
            }

            for (j=kl, J=K+1; j<n; j+=bs, J++) {
                strJ = (j + bs > n) ? (n - j) : bs;

                if (block_owner(I, J) == me) { /* parcel out blocks */
                    if (block_owner(K,J) == me) {
                        B = a[K+J*nblocks];
                    } else {
                        B = buf2;
                        get_remote(B, K, J);
                    }
                    C = a[I+J*nblocks];
                    bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
                }
            }
        }
    }

    free(buf1);
    free(buf2);
}
int main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; MP_INIT(argc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } /* num_rows = (int) sqrt((double) nproc); */ /* for (;;) { */ /* num_cols = nproc/num_rows; */ /* if (num_rows*num_cols == nproc) */ /* break; */ /* num_rows--; */ /* } */ nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } nnodes = nproc / 4; if((nnodes * 4) != nproc) { num_cols = nproc - nnodes * 4; nnodes++; num_rows = 1; } else { num_cols = 2; num_rows = 2; } num = (nblocks * nblocks)/nnodes; if((num * nnodes) != (nblocks * nblocks)) num++; #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif edge = n%block_size; if (edge == 0) { edge = block_size; } for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double 
**)malloc(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); MP_FINALIZE(); return 0; }
main(int argc, char *argv[]) { int i, j; int ch; extern char *optarg; int edge; int size; /* ARMCI */ void **ptr; double **ptr_loc; void **bufr_g, **bufc_g; MP_INIT(arc,argv); MP_PROCS(&nproc); MP_MYID(&me); while ((ch = getopt(argc, argv, "n:b:p:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC\n"); MP_BARRIER(); MP_FINALIZE(); exit(0); } } } if(me == 0) { printf("\nUsing pre-PUTing\n"); printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nproc); for (;;) { num_cols = nproc/num_rows; if (num_rows*num_cols == nproc) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } MP_BARRIER(); MP_FINALIZE(); exit(0); #endif for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } proc_bytes += size*sizeof(double); } } } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)ARMCI_Malloc_local(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)ARMCI_Malloc_local(nproc*sizeof(double *)); for(i=0; i<nproc; i++) ptr_loc[i] = (double *)ptr[i]; for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i 
== nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } /* initialize the array */ init_array(); bufr = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *)); bufc = (double **)ARMCI_Malloc_local(nproc*nblocks * sizeof(double *)); if (bufr == NULL || bufc == NULL) printf("Could not ARMCI_Malloc_local() mem\n"); /* bufr points to all k-th row blocks */ /* save all block address in row-major order */ proc_bytes = nblocks*block_size*block_size * sizeof(double); bufr_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(bufr_g, proc_bytes); for (i = 0; i < nproc; i++) { bufr[i*nblocks] = (double *) bufr_g[i]; for (j = 1; j < nblocks; j++) { bufr[i*nblocks + j] = bufr[i*nblocks + j-1] + block_size * block_size; } } /* bufc points to all k-th column blocks */ bufc_g = (void **)ARMCI_Malloc_local(nproc * sizeof(void *)); ARMCI_Malloc(bufc_g, proc_bytes); for (i = 0; i < nproc; i++) { bufc[i*nblocks] = (double *) bufc_g[i]; for (j = 1; j < nblocks; j++) { bufc[i*nblocks + j] = bufc[i*nblocks + j-1] + block_size * block_size; } } /* barrier to ensure all initialization is done */ MP_BARRIER(); /* to remove cold-start misses, all processors touch their own data */ touch_array(block_size, me); MP_BARRIER(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } MP_BARRIER(); } /* Starting the timer */ if(me == 0) start_timer(); lu(n, block_size, me); MP_BARRIER(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } MP_BARRIER(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Free(bufc_g[me]); ARMCI_Free(bufr_g[me]); ARMCI_Finalize(); MP_FINALIZE(); }
/*
 * Blocked LU factorisation, "pre-put" variant.
 *
 * For each diagonal step K:
 *   1. the owner of block (K, K) calls lu0() on it, then all processes
 *      synchronise with MP_BARRIER();
 *   2. each process obtains the diagonal block (in place or via
 *      get_remote() into dbuf), calls bdiv() on the blocks it owns in block
 *      column K and bmodd() on the blocks it owns in block row K, and after
 *      each such call issues non-blocking puts of the updated block into
 *      the bufc[] / bufr[] slot of every other process that owns a block in
 *      the same block row / block column beyond K (one put per destination,
 *      tracked in saved[]);
 *   3. ARMCI_WaitAll(), ARMCI_AllFence() and MP_BARRIER() are called before
 *      any process reads the pre-put data;
 *   4. each process calls bmod() on every block (I, J) it owns with
 *      I, J > K, taking block (I, K) from a[] or bufc[me*nblocks+I] and
 *      block (K, J) from a[] or bufr[me*nblocks+J].
 *
 * n:  matrix dimension
 * bs: block size
 * me: id compared against block_owner() to select this process's blocks
 *
 * NOTE(review): saved[] has MAXPROC entries and is indexed by the value
 * block_owner() returns; presumably that value is always < MAXPROC -- confirm.
 * NOTE(review): the result of ARMCI_Malloc_local() is not checked before
 * dbuf is used.
 */
void lu(int n, int bs, int me)
{
    int i, il, j, jl, k, kl;
    int I, J, K;
    double *A, *B, *C, *D;
    int dimI, dimJ, dimK;
    int strI, strJ, strK;
    unsigned int t1, t2, t3, t4, t11, t22;
    int diagowner, destp, hc, m;
    double *dbuf;              /* scratch space for a remotely owned diagonal block */
    armci_hdl_t handle[2*MAXPROC];
    int saved[MAXPROC];        /* saved[p] != 0: block already put to process p */

    dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));

    for (k=0, K=0; k<n; k+=bs, K++) {
        /* kl: first index past diagonal block K; strK: size of that block */
        kl = k + bs;
        if (kl > n) {
            kl = n;
            strK = kl - k;
        } else {
            strK = bs;
        }

        /* factor diagonal block */
        diagowner = block_owner(K, K);
        if (diagowner == me) {
            A = a[K+K*nblocks];
            lu0(A, strK, strK); /* impl algo on this diag block */
        }

        MP_BARRIER();

        /* divide column k by diagonal block */
        if(block_owner(K, K) == me)
            D = a[K+K*nblocks];
        else {
            D = dbuf;
            get_remote(D, K, K);
        }

        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            if (block_owner(I, K) == me) { /* parcel out blocks */
                il = i + bs;
                if (il > n) {
                    il = n;
                    strI = il - i;
                } else {
                    strI = bs;
                }
                A = a[I+K*nblocks];
                bdiv(A, D, strI, strK, strI, strK);

                /* Pre-put this block to the block-owners of all blocks on
                   the I-th row with a non-blocking put */
                memset (saved, 0, sizeof(saved));
                for (m = K+1; m < nblocks; m++) {
                    destp = block_owner (I, m);
                    if (destp != me && !saved[destp]) {
                        /* destination: slot I of process destp's column buffer */
                        ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
                        saved[destp] = 1;
                    }
                }
            }
        } /* end of for (i=kl, I=K+1...) */

        /* modify row k by diagonal block */
        for (j=kl, J=K+1; j<n; j+=bs, J++) {
            if (block_owner(K, J) == me) { /* parcel out blocks */
                jl = j+bs;
                if (jl > n) {
                    jl = n;
                    strJ = jl - j;
                } else {
                    strJ = bs;
                }
                A = a[K+J*nblocks];
                bmodd(D, A, strK, strJ, strK, strK);

                /* Pre-put this block to the block-owners of all blocks on
                   the J-th column with a non-blocking put */
                memset (saved, 0, sizeof(saved));
                for (m = K+1; m < nblocks; m++) {
                    destp = block_owner (m, J);
                    if (destp != me && !saved[destp]) {
                        /* destination: slot J of process destp's row buffer */
                        ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
                        saved[destp] = 1;
                    }
                }
            }
        }

        /* The bufc/bufr slots are read below, so the puts issued above are
           waited on and fenced, and all processes synchronise, first. */
        ARMCI_WaitAll();
        ARMCI_AllFence();
        MP_BARRIER();

        /* modify subsequent block columns */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            il = i+bs;
            if (il > n) {
                il = n;
                strI = il - i;
            } else {
                strI = bs;
            }

            for (j=kl, J=K+1; j<n; j+=bs, J++) {
                jl = j + bs;
                if (jl > n) {
                    jl = n;
                    strJ= jl - j;
                } else {
                    strJ = bs;
                }
                if (block_owner(I, J) == me) { /* parcel out blocks */
                    /* block (I, K): owned copy, or the slot in this
                       process's column buffer */
                    if(block_owner(I,K) == me)
                        A = a[I+K*nblocks];
                    else {
                        A = bufc[me*nblocks+I];
                    }

                    /* block (K, J): owned copy, or the slot in this
                       process's row buffer */
                    if(block_owner(K,J) == me)
                        B = a[K+J*nblocks];
                    else
                        B = bufr[me*nblocks + J];

                    C = a[I+J*nblocks];
                    bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
                }
            }
        }
    }

    ARMCI_Free_local(dbuf);
}
main(int argc, char *argv[]) { int i, j, l; int ch; extern char *optarg; int edge; int size; int lu_arg[MAX_THREADS][3]; /* ARMCI */ void **ptr; double **ptr_loc; THREAD_LOCK_INIT(mutex); armci_msg_init(&argc,&argv); nproc = armci_msg_nproc(); me = armci_msg_me(); while ((ch = getopt(argc, argv, "n:b:p:t:d:h")) != -1) { switch(ch) { case 'n': n = atoi(optarg); break; case 'b': block_size = atoi(optarg); break; case 'p': nproc = atoi(optarg); break; case 't': th_per_p = atoi(optarg); break; case 'd': d = atoi(optarg); break; case 'h': { printf("Usage: LU, or \n"); printf(" LU -nMATRIXSIZE -bBLOCKSIZE -pNPROC -tTH_PER_P\n"); armci_msg_barrier(); armci_msg_finalize(); exit(0); } } } if(th_per_p>MAX_THREADS) { th_per_p=MAX_THREADS; if(me==0)printf("Warning: cannot run more than %d threads, adjust MAX_THREADS",MAX_THREADS); } if (d) { fprintf(stderr, "%d: %d\n", me, getpid()); sleep(d); } nthreads = th_per_p * nproc; if(me == 0) { printf("\n Blocked Dense LU Factorization\n"); printf(" %d by %d Matrix\n", n, n); printf(" %d Processors\n", nproc); printf(" %d thread(s) per processor, %d threads total\n", th_per_p, nthreads); printf(" %d by %d Element Blocks\n", block_size, block_size); printf("\n"); } num_rows = (int) sqrt((double) nthreads); for (;;) { num_cols = nthreads/num_rows; if (num_rows*num_cols == nthreads) break; num_rows--; } nblocks = n/block_size; if (block_size * nblocks != n) { nblocks++; } num = (nblocks * nblocks)/nthreads; if((num * nthreads) != (nblocks * nblocks)) num++; edge = n%block_size; if (edge == 0) { edge = block_size; } #ifdef DEBUG if(me == 0) for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) printf("%d ", block_owner(i, j)); printf("\n"); } armci_msg_barrier(); /* armci_msg_finalize(); */ /* exit(0); */ #endif for (l = 0; l < th_per_p; l++) { me_th[l] = me * th_per_p + l; for (i=0;i<nblocks;i++) { for (j=0;j<nblocks;j++) { if(block_owner(i,j) == me_th[l]) { if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == 
nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } thread_doubles[l] += size; } } } proc_bytes += thread_doubles[l] * sizeof(double); } /* initialize ARMCI */ ARMCI_Init(); ptr = (void **)malloc(nproc * sizeof(void *)); ARMCI_Malloc(ptr, proc_bytes); a = (double **)malloc(nblocks*nblocks*sizeof(double *)); if (a == NULL) { fprintf(stderr, "Could not malloc memory for a\n"); exit(-1); } ptr_loc = (double **)malloc(nthreads*sizeof(double *)); for (i = 0; i < nproc; i++) { ptr_loc[i * th_per_p] = (double *)ptr[i]; for (j = 1; j < th_per_p; j++) ptr_loc[i * th_per_p + j] = ptr_loc[i * th_per_p + j - 1] + thread_doubles[j - 1]; } for(i=0; i<nblocks;i ++) { for(j=0; j<nblocks; j++) { a[i+j*nblocks] = ptr_loc[block_owner(i, j)]; if ((i == nblocks-1) && (j == nblocks-1)) { size = edge*edge; } else if ((i == nblocks-1) || (j == nblocks-1)) { size = edge*block_size; } else { size = block_size*block_size; } ptr_loc[block_owner(i, j)] += size; } } #if 0 for(i=0; i<nblocks*nblocks;i ++) printf("%d: a[%d]=%p\n", me, i, a[i]); fflush(stdout); #endif /* initialize the array */ init_array(); /* barrier to ensure all initialization is done */ armci_msg_barrier(); /* to remove cold-start misses, all processors touch their own data */ /* for (l = 0; l < th_per_p; l++) touch_array(block_size, me_th[l]); */ armci_msg_barrier(); if(doprint) { if(me == 0) { printf("Matrix before LU decomposition\n"); print_array(me); } armci_msg_barrier(); } #if 1 for (i = 0; i < nblocks; i++) for (j = 0; j < nblocks; j++) print_block_dbg(a[i + j * nblocks], "proc %d, a[%d, %d]:\n", me, i, j); #endif TH_INIT(nproc,th_per_p); /* Starting the timer */ if(me == 0) start_timer(); for (l = 0; l < th_per_p; l++) { lu_arg[l][0] = n; lu_arg[l][1] = block_size; lu_arg[l][2] = l; THREAD_CREATE(threads + l, lu, lu_arg[l]); } for (l = 0; l < th_per_p; l++) THREAD_JOIN(threads[l], NULL); armci_msg_barrier(); /* Timer Stops here */ if(me == 0) printf("\nRunning time = %lf 
milliseconds.\n\n", elapsed_time()); if(doprint) { if(me == 0) { printf("after LU\n"); print_array(me); } armci_msg_barrier(); } /* done */ ARMCI_Free(ptr[me]); ARMCI_Finalize(); armci_msg_finalize(); THREAD_LOCK_DESTROY(mutex); }
void *lu(void *lu_arg) { int n, bs, th_idx; int i, il, j, jl, k, kl; int I, J, K; double *A, *B, *C, *D; int dimI, dimJ, dimK; int strI, strJ, strK; unsigned int t1, t2, t3, t4, t11, t22; int diagowner; double *buf1, *buf2; n = ((int *)lu_arg)[0]; bs = ((int *)lu_arg)[1]; th_idx = ((int *)lu_arg)[2]; #ifdef DEBUG printf("DBG: starting thread %d(idx=%d) on node %d\n", me_th[th_idx], th_idx, me); fflush(stdout); #endif /* temporary memories */ buf1 = (double *)malloc(block_size*block_size*sizeof(double)); buf2 = (double *)malloc(block_size*block_size*sizeof(double)); for (k=0, K=0; k<n; k+=bs, K++) { kl = k + bs; if (kl > n) { kl = n; strK = kl - k; } else { strK = bs; } /* factor diagonal block */ diagowner = block_owner(K, K); if (diagowner == me_th[th_idx]) { A = a[K+K*nblocks]; print_block_dbg(A, "th=%d, idx=%d: before lu0 a[%d]:\n", me_th[th_idx], th_idx, K+K*nblocks); lu0(A, strK, strK); } MT_BARRIER(); /* divide column k by diagonal block */ if(block_owner(K, K) == me_th[th_idx]) D = a[K+K*nblocks]; else { D = buf1; get_remote(D, K, K); } for (i=kl, I=K+1; i<n; i+=bs, I++) { if (block_owner(I, K) == me_th[th_idx]) { /* parcel out blocks */ il = i + bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } A = a[I+K*nblocks]; bdiv(A, D, strI, strK, strI, strK); } } /* modify row k by diagonal block */ for (j=kl, J=K+1; j<n; j+=bs, J++) { if (block_owner(K, J) == me_th[th_idx]) { /* parcel out blocks */ jl = j+bs; if (jl > n) { jl = n; strJ = jl - j; } else { strJ = bs; } A = a[K+J*nblocks]; bmodd(D, A, strK, strJ, strK, strK); } } MT_BARRIER(); /* modify subsequent block columns */ for (i=kl, I=K+1; i<n; i+=bs, I++) { il = i+bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } if(block_owner(I,K) == me_th[th_idx]) A = a[I+K*nblocks]; else { A = buf1; get_remote(A, I, K); } for (j=kl, J=K+1; j<n; j+=bs, J++) { jl = j + bs; if (jl > n) { jl = n; strJ= jl - j; } else { strJ = bs; } if (block_owner(I, J) == me_th[th_idx]) { /* parcel out blocks */ 
if(block_owner(K,J) == me_th[th_idx]) B = a[K+J*nblocks]; else { B = buf2; get_remote(B, K, J); } C = a[I+J*nblocks]; bmod(A, B, C, strI, strJ, strK, strI, strK, strI); } } } } free(buf1); free(buf2); return lu_arg; }