/*
 * Block LU factorization: performs the calling process's share of the work.
 *
 *   n  - order of the matrix
 *   bs - edge length of a full block; the trailing block is narrower when
 *        bs does not divide n
 *   me - id of the calling process, compared against block_owner()
 *
 * Blocks are taken from the file-scope table a[], indexed as
 * a[row + col*nblocks].  Each step K factors the diagonal block, applies it
 * to block column K and block row K, and then updates the trailing blocks.
 * Only blocks for which block_owner() returns `me` are passed to the
 * kernels as outputs; blocks owned elsewhere are obtained through
 * get_remote() into scratch buffers.
 *
 * Aborts the process if the scratch buffers cannot be allocated.
 */
void lu(int n, int bs, int me)
{
    int i, j, k, kl;
    int I, J, K;
    int strI, strJ, strK;
    double *A, *B, *C, *D;
    double *buf1, *buf2;

    /* Scratch space for blocks owned by other processes.  The size is
     * computed in size_t so the product cannot overflow int.
     * NOTE(review): sized from the global block_size, not from bs;
     * presumably callers pass bs == block_size -- confirm. */
    buf1 = malloc((size_t)block_size * block_size * sizeof *buf1);
    buf2 = malloc((size_t)block_size * block_size * sizeof *buf2);
    if (buf1 == NULL || buf2 == NULL) {
        free(buf1);
        free(buf2);
        abort();
    }

    for (k = 0, K = 0; k < n; k += bs, K++) {
        /* kl is one past the last row/column of block K; strK is its width */
        kl = (k + bs > n) ? n : k + bs;
        strK = kl - k;

        /* factor diagonal block */
        if (block_owner(K, K) == me) {
            A = a[K + K*nblocks];
            lu0(A, strK, strK);
        }
        MP_BARRIER();

        /* divide column k by diagonal block */
        if (block_owner(K, K) == me) {
            D = a[K + K*nblocks];
        } else {
            D = buf1;
            get_remote(D, K, K);
        }
        for (i = kl, I = K + 1; i < n; i += bs, I++) {
            if (block_owner(I, K) != me) {
                continue;
            }
            strI = (i + bs > n) ? n - i : bs;
            A = a[I + K*nblocks];
            bdiv(A, D, strI, strK, strI, strK);
        }

        /* modify row k by diagonal block */
        for (j = kl, J = K + 1; j < n; j += bs, J++) {
            if (block_owner(K, J) != me) {
                continue;
            }
            strJ = (j + bs > n) ? n - j : bs;
            A = a[K + J*nblocks];
            bmodd(D, A, strK, strJ, strK, strK);
        }
        MP_BARRIER();

        /* modify subsequent block columns */
        for (i = kl, I = K + 1; i < n; i += bs, I++) {
            strI = (i + bs > n) ? n - i : bs;
            if (block_owner(I, K) == me) {
                A = a[I + K*nblocks];
            } else {
                /* D is not referenced again in this step, so buf1 is reused */
                A = buf1;
                get_remote(A, I, K);
            }
            for (j = kl, J = K + 1; j < n; j += bs, J++) {
                strJ = (j + bs > n) ? n - j : bs;
                if (block_owner(I, J) != me) {
                    continue;
                }
                if (block_owner(K, J) == me) {
                    B = a[K + J*nblocks];
                } else {
                    B = buf2;
                    get_remote(B, K, J);
                }
                C = a[I + J*nblocks];
                bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
            }
        }
    }

    free(buf1);
    free(buf2);
}
void slave() { double *b; double *buffer; double *workbuf; int i,j,k; int myrow,nextrow,rownum; MPI_Status status; int ntasks,pid; /* get the number of the processes in application. * * Can we define ntasks as shared variable?*/ MPI_Comm_size(MPI_COMM_WORLD,&ntasks); rownum=matrix_size/(block_size*ntasks); /* allocate the local portion of matrix */ b=(double*)malloc(rownum*block_size*matrix_size); /* allocate buffer space, it should be big enough * * to contaion a whole row of block. */ buffer=(double *)malloc(matrix_size*block_size*sizeof(double)); /* receive the initial matrix from process 0 */ for (i=0;i<rownum;i++) MPI_Recv(&b[i*block_size*matrix_size],block_size*matrix_size,MPI_DOUBLE, 0,i*block_size*ntasks+myrank*block_size,MPI_COMM_WORLD,&status); MPI_Barrier(MPI_COMM_WORLD); /* do computation work of this process */ for (i=0;i<matrix_size;i+=block_size) { /* compute the id of the process that owns the row i * * to row i+block_size-1 */ pid=(i/block_size)%ntasks; myrow=((i/block_size)/ntasks)*block_size; if (pid==myrank) { /* My process */ /* factor diagonal */ lu0(&b[myrow+myrow*matrix_size],block_size,matrix_size); /* modify "column" by diagonal */ for (j=myrow+block_size;j<matrix_size;j+=block_size) bdiv(&b[j+myrow*matrix_size],&b[myrow+myrow*matrix_size], block_size,matrix_size); /* send this row to other processes, only need to send the column * * after diagonal? 
*/ for (j=0;j<ntasks;j++) { if (j!=myrank) MPI_Send(&b[myrow*matrix_size],block_size*matrix_size,MPI_DOUBLE, j,i,MPI_COMM_WORLD); } workbuf=&b[myrow*matrix_size]; } else { /* other process */ /* receive row i to row i+block_size-1 from process pid */ MPI_Recv(&buffer,block_size*matrix_size,MPI_DOUBLE,pid,i, MPI_COMM_WORLD,&status); workbuf=buffer; } if (myrank>pid) nextrow=myrow; else nextrow=myrow+block_size; /* modify the "row" using diagonal */ for (j=nextrow;j<matrix_size;j+=block_size) bmodd(&b[i+j*matrix_size],&workbuf[i], block_size,matrix_size); /* modify the internal rows and columns */ for (j=nextrow;j<matrix_size;j+=block_size) for (k=i+block_size;k<matrix_size;k+=block_size) bmod(&b[k+j*matrix_size],&workbuf[k],&b[i+j*matrix_size], block_size,matrix_size); } MPI_Barrier(MPI_COMM_WORLD); /* Send b to process 0. */ for (i=0;i<rownum;i++) MPI_Send(&b[i*block_size*matrix_size],block_size*matrix_size,MPI_DOUBLE, 0,i*block_size*ntasks+myrank*block_size,MPI_COMM_WORLD); }
void lu(int n, int bs, int me) { int i, il, j, jl, k, kl; int I, J, K; double *A, *B, *C, *D; int dimI, dimJ, dimK; int strI, strJ, strK; unsigned int t1, t2, t3, t4, t11, t22; int diagowner, destp, hc, m; double *dbuf; armci_hdl_t handle[2*MAXPROC]; int saved[MAXPROC]; dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double)); for (k=0, K=0; k<n; k+=bs, K++) { kl = k + bs; if (kl > n) { kl = n; strK = kl - k; } else { strK = bs; } /* factor diagonal block */ diagowner = block_owner(K, K); if (diagowner == me) { A = a[K+K*nblocks]; lu0(A, strK, strK); /* impl algo on this diag block */ } MP_BARRIER(); /* divide column k by diagonal block */ if(block_owner(K, K) == me) D = a[K+K*nblocks]; else { D = dbuf; get_remote(D, K, K); } for (i=kl, I=K+1; i<n; i+=bs, I++) { if (block_owner(I, K) == me) { /* parcel out blocks */ il = i + bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } A = a[I+K*nblocks]; bdiv(A, D, strI, strK, strI, strK); /* Pre-put this block to the block-owners of all blocks on the I-th row with a non-blocking put*/ memset (saved, 0, sizeof(saved)); for (m = K+1; m < nblocks; m++) { destp = block_owner (I, m); if (destp != me && !saved[destp]) { ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL); saved[destp] = 1; } } } } /* end of for (i=k1, I=K+1...) 
*/ /* modify row k by diagonal block */ for (j=kl, J=K+1; j<n; j+=bs, J++) { if (block_owner(K, J) == me) { /* parcel out blocks */ jl = j+bs; if (jl > n) { jl = n; strJ = jl - j; } else { strJ = bs; } A = a[K+J*nblocks]; bmodd(D, A, strK, strJ, strK, strK); /* Pre-put this block to the block-owners of all blocks on the J-th column with a non-blocking put*/ memset (saved, 0, sizeof(saved)); for (m = K+1; m < nblocks; m++) { destp = block_owner (m, J); if (destp != me && !saved[destp]) { ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL); saved[destp] = 1; } } } } ARMCI_WaitAll(); ARMCI_AllFence(); MP_BARRIER(); /* modify subsequent block columns */ for (i=kl, I=K+1; i<n; i+=bs, I++) { il = i+bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } for (j=kl, J=K+1; j<n; j+=bs, J++) { jl = j + bs; if (jl > n) { jl = n; strJ= jl - j; } else { strJ = bs; } if (block_owner(I, J) == me) { /* parcel out blocks */ if(block_owner(I,K) == me) A = a[I+K*nblocks]; else { A = bufc[me*nblocks+I]; } if(block_owner(K,J) == me) B = a[K+J*nblocks]; else B = bufr[me*nblocks + J]; C = a[I+J*nblocks]; bmod(A, B, C, strI, strJ, strK, strI, strK, strI); } } } } ARMCI_Free_local(dbuf); }
void master() { double *a; double *rhs; int ntasks,pid; double *buffer; double *workbuf; int i,j,k; MPI_Status status; time_t t0,t1; int ct; /* get the number of the processes in application. */ MPI_Comm_size(MPI_COMM_WORLD,&ntasks); /* allocate matrix, rhs vector */ a = (double *) malloc( matrix_size*matrix_size*sizeof(double) ) ; rhs = (double *) malloc( matrix_size*sizeof(double) ) ; /* initialize the matrix */ /* Do we need to allocate a matrix a or ony a row * * and initilize each row and then send to corresponding process? */ initializeMatrix( matrix_size, a, rhs ); /* Send each row to the corresponding process. */ for (i=block_size;i<matrix_size;i+=block_size) { /* send i row to i+block_size-1 row to process * * (i mod block_size)%ntasks. */ pid=(i/block_size)%ntasks; if (pid!=0) MPI_Send(&a[i*matrix_size],matrix_size*block_size,MPI_DOUBLE, pid,i,MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); /* allocate buffer space, it should be big enough * * to contaion a whole row of block. */ buffer=(double *)malloc(matrix_size*block_size*sizeof(double)); time(&t0); /* Do the computation work of process 0 */ for (i=0;i<matrix_size;i+=block_size) { /* compute the id of the processor that own the row i * * to row i+block_size-1 */ pid=(i/block_size)%ntasks; if (pid==0) { /* matser process. Me! */ /* factor diagonal */ lu0(&a[i+i*matrix_size],block_size,matrix_size); /* modify "column" by diagonal */ for (j=i+block_size;j<matrix_size;j+=block_size) bdiv(&a[j+i*matrix_size],&a[i+i*matrix_size], block_size,matrix_size); /* send this row to other processes, only need to send * * the column after diagonal? 
*/ for (j=1;j<ntasks;j++) { MPI_Send(&a[i*matrix_size],block_size*matrix_size,MPI_DOUBLE, j,i,MPI_COMM_WORLD); } workbuf=&a[i*matrix_size]; } else { /* other process */ /* receive row i to row i+block_size-1 from process pid */ MPI_Recv(&buffer,block_size*matrix_size,MPI_DOUBLE,pid,i, MPI_COMM_WORLD,&status); workbuf=buffer; } /* modify the "row" using diagonal */ for (j=i+(ntasks-pid)*block_size;j<matrix_size;j+=block_size*ntasks) bmodd(&a[i+j*matrix_size],&workbuf[i], block_size,matrix_size); /* modify the internal rows and columns */ for (j=i+(ntasks-pid)*block_size;j<matrix_size;j+=block_size*ntasks) for (k=i+block_size;k<matrix_size;k+=block_size) bmod(&a[k+j*matrix_size],&workbuf[k],&a[i+j*matrix_size], block_size,matrix_size); } MPI_Barrier(MPI_COMM_WORLD); time(&t1); ct=t1-t0; printf("LU decomposition took %d millisecs\n", ct); /* Receive the modified matrix from all other processes. */ for (i=0;i<matrix_size;i+=block_size) { /* compute the id of the processor that own the row i * * to row i+block_size-1 */ pid=(i/block_size)%ntasks; if (pid!=0) MPI_Recv(&a[i*matrix_size],block_size*matrix_size,MPI_DOUBLE,pid,i,MPI_COMM_WORLD,&status); } /* test the resulting decoposition */ checkResult(matrix_size,a,rhs); }
void *lu(void *lu_arg) { int n, bs, th_idx; int i, il, j, jl, k, kl; int I, J, K; double *A, *B, *C, *D; int dimI, dimJ, dimK; int strI, strJ, strK; unsigned int t1, t2, t3, t4, t11, t22; int diagowner; double *buf1, *buf2; n = ((int *)lu_arg)[0]; bs = ((int *)lu_arg)[1]; th_idx = ((int *)lu_arg)[2]; #ifdef DEBUG printf("DBG: starting thread %d(idx=%d) on node %d\n", me_th[th_idx], th_idx, me); fflush(stdout); #endif /* temporary memories */ buf1 = (double *)malloc(block_size*block_size*sizeof(double)); buf2 = (double *)malloc(block_size*block_size*sizeof(double)); for (k=0, K=0; k<n; k+=bs, K++) { kl = k + bs; if (kl > n) { kl = n; strK = kl - k; } else { strK = bs; } /* factor diagonal block */ diagowner = block_owner(K, K); if (diagowner == me_th[th_idx]) { A = a[K+K*nblocks]; print_block_dbg(A, "th=%d, idx=%d: before lu0 a[%d]:\n", me_th[th_idx], th_idx, K+K*nblocks); lu0(A, strK, strK); } MT_BARRIER(); /* divide column k by diagonal block */ if(block_owner(K, K) == me_th[th_idx]) D = a[K+K*nblocks]; else { D = buf1; get_remote(D, K, K); } for (i=kl, I=K+1; i<n; i+=bs, I++) { if (block_owner(I, K) == me_th[th_idx]) { /* parcel out blocks */ il = i + bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } A = a[I+K*nblocks]; bdiv(A, D, strI, strK, strI, strK); } } /* modify row k by diagonal block */ for (j=kl, J=K+1; j<n; j+=bs, J++) { if (block_owner(K, J) == me_th[th_idx]) { /* parcel out blocks */ jl = j+bs; if (jl > n) { jl = n; strJ = jl - j; } else { strJ = bs; } A = a[K+J*nblocks]; bmodd(D, A, strK, strJ, strK, strK); } } MT_BARRIER(); /* modify subsequent block columns */ for (i=kl, I=K+1; i<n; i+=bs, I++) { il = i+bs; if (il > n) { il = n; strI = il - i; } else { strI = bs; } if(block_owner(I,K) == me_th[th_idx]) A = a[I+K*nblocks]; else { A = buf1; get_remote(A, I, K); } for (j=kl, J=K+1; j<n; j+=bs, J++) { jl = j + bs; if (jl > n) { jl = n; strJ= jl - j; } else { strJ = bs; } if (block_owner(I, J) == me_th[th_idx]) { /* parcel out blocks */ 
if(block_owner(K,J) == me_th[th_idx]) B = a[K+J*nblocks]; else { B = buf2; get_remote(B, K, J); } C = a[I+J*nblocks]; bmod(A, B, C, strI, strJ, strK, strI, strK, strI); } } } } free(buf1); free(buf2); return lu_arg; }