void sparselu_seq_call(float **BENCH)
{
  int ii;
  int jj;
  int kk;
  for (kk = 0; kk < bots_arg_size; kk++) {
    lu0((BENCH[(kk * bots_arg_size) + kk]));
    for (jj = (kk + 1); jj < bots_arg_size; jj++)
      if ((BENCH[(kk * bots_arg_size) + jj]) != ((0L))) {
        fwd((BENCH[(kk * bots_arg_size) + kk]),(BENCH[(kk * bots_arg_size) + jj]));
      }
    for (ii = (kk + 1); ii < bots_arg_size; ii++)
      if ((BENCH[(ii * bots_arg_size) + kk]) != ((0L))) {
        bdiv((BENCH[(kk * bots_arg_size) + kk]),(BENCH[(ii * bots_arg_size) + kk]));
      }
    for (ii = (kk + 1); ii < bots_arg_size; ii++)
      if ((BENCH[(ii * bots_arg_size) + kk]) != ((0L)))
        for (jj = (kk + 1); jj < bots_arg_size; jj++)
          if ((BENCH[(kk * bots_arg_size) + jj]) != ((0L))) {
            if ((BENCH[(ii * bots_arg_size) + jj]) == ((0L)))
              BENCH[(ii * bots_arg_size) + jj] = allocate_clean_block();
            bmod((BENCH[(ii * bots_arg_size) + kk]),(BENCH[(kk * bots_arg_size) + jj]),(BENCH[(ii * bots_arg_size) + jj]));
          }
  }
}
void sparselu_seq_call(float **BENCH)
{
   int ii, jj, kk;

   for (kk=0; kk<bots_arg_size; kk++) {
      lu0(BENCH[kk*bots_arg_size+kk]);
      for (jj=kk+1; jj<bots_arg_size; jj++)
         if (BENCH[kk*bots_arg_size+jj] != NULL) {
            fwd(BENCH[kk*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj]);
         }
      for (ii=kk+1; ii<bots_arg_size; ii++)
         if (BENCH[ii*bots_arg_size+kk] != NULL) {
            bdiv(BENCH[kk*bots_arg_size+kk], BENCH[ii*bots_arg_size+kk]);
         }
      for (ii=kk+1; ii<bots_arg_size; ii++)
         if (BENCH[ii*bots_arg_size+kk] != NULL)
            for (jj=kk+1; jj<bots_arg_size; jj++)
               if (BENCH[kk*bots_arg_size+jj] != NULL) {
                  if (BENCH[ii*bots_arg_size+jj]==NULL)
                     BENCH[ii*bots_arg_size+jj] = allocate_clean_block();
                  bmod(BENCH[ii*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj], BENCH[ii*bots_arg_size+jj]);
               }
   }
}
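/* Hedged sketch: the block kernels lu0(), fwd(), bdiv() and bmod() that the
   one-argument BOTS-style variants in this listing call are not shown here.
   The versions below follow the usual BOTS SparseLU shape, assuming each
   non-NULL block is a dense bots_arg_size_1 x bots_arg_size_1 tile stored in
   row-major order. Treat them as an illustration of what the kernels do, not
   as the benchmark's verified source. */
void lu0(float *diag)
{
   int i, j, k;
   for (k = 0; k < bots_arg_size_1; k++)
      for (i = k + 1; i < bots_arg_size_1; i++) {
         /* eliminate row i of the diagonal block using pivot row k */
         diag[i*bots_arg_size_1+k] /= diag[k*bots_arg_size_1+k];
         for (j = k + 1; j < bots_arg_size_1; j++)
            diag[i*bots_arg_size_1+j] -= diag[i*bots_arg_size_1+k] * diag[k*bots_arg_size_1+j];
      }
}

void fwd(float *diag, float *col)
{
   /* forward-eliminate a block in row kk using the factored diagonal block */
   int i, j, k;
   for (k = 0; k < bots_arg_size_1; k++)
      for (i = k + 1; i < bots_arg_size_1; i++)
         for (j = 0; j < bots_arg_size_1; j++)
            col[i*bots_arg_size_1+j] -= diag[i*bots_arg_size_1+k] * col[k*bots_arg_size_1+j];
}

void bdiv(float *diag, float *row)
{
   /* solve for a block in column kk against the factored diagonal block */
   int i, j, k;
   for (i = 0; i < bots_arg_size_1; i++)
      for (k = 0; k < bots_arg_size_1; k++) {
         row[i*bots_arg_size_1+k] /= diag[k*bots_arg_size_1+k];
         for (j = k + 1; j < bots_arg_size_1; j++)
            row[i*bots_arg_size_1+j] -= row[i*bots_arg_size_1+k] * diag[k*bots_arg_size_1+j];
      }
}

void bmod(float *row, float *col, float *inner)
{
   /* trailing update: inner := inner - row * col */
   int i, j, k;
   for (i = 0; i < bots_arg_size_1; i++)
      for (k = 0; k < bots_arg_size_1; k++)
         for (j = 0; j < bots_arg_size_1; j++)
            inner[i*bots_arg_size_1+j] -= row[i*bots_arg_size_1+k] * col[k*bots_arg_size_1+j];
}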
void lu_dependencies( double* M[NB][NB] )
{
   double t_start, t_end;
   double time;
   t_start = mysecond();
   int ii, jj, kk;
   for (kk=0; kk<NB; kk++) {
      {
         double *diag = M[kk][kk];
         #pragma omp task depend(inout: [BSIZE][BSIZE]diag)
         lu0(diag);
      }
      for (jj=kk+1; jj<NB; jj++)
         if (M[kk][jj] != NULL) {
            double *diag = M[kk][kk];
            double *col = M[kk][jj];
            #pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]col)
            fwd(diag, col);
         }
      for (ii=kk+1; ii<NB; ii++) {
         if (M[ii][kk] != NULL) {
            {
               double *row = M[kk][kk];
               double *diag = M[ii][kk];
               #pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]row)
               bdiv (diag, row);
            }
            for (jj=kk+1; jj<NB; jj++) {
               if (M[kk][jj] != NULL) {
                  if (M[ii][jj]==NULL) M[ii][jj]=allocate_clean_block();
                  {
                     double *row = M[ii][kk];
                     double *col = M[kk][jj];
                     double *inner = M[ii][jj];
                     #pragma omp task depend(in: [BSIZE][BSIZE]row, [BSIZE][BSIZE]col) depend(inout: [BSIZE][BSIZE]inner)
                     bmod(row, col, inner);
                  }
               }
            }
         }
      }
   }
   #pragma omp taskwait
   t_end = mysecond();
   time = t_end - t_start;
   printf("Dependencies time to compute = %f sec\n", time);
}
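/* Hedged sketch: mysecond() is used for timing in lu_dependencies() and
   lu_serial() but is not defined in this listing. It is commonly a
   gettimeofday() wrapper returning wall-clock seconds as a double (the
   STREAM-benchmark style shown below); that is an assumption about this
   codebase, not its verified source. */
#include <sys/time.h>

double mysecond(void)
{
   struct timeval tp;
   struct timezone tzp;
   gettimeofday(&tp, &tzp);
   /* seconds plus the microsecond fraction, as a double */
   return (double) tp.tv_sec + (double) tp.tv_usec * 1.0e-6;
}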
static void OUT__1__1527__(void *__out_argv)
{
  float ***BENCH = (float ***)(((struct OUT__1__1527___data *)__out_argv) -> OUT__1__1527___data::BENCH_p);
  int ii = (int )(((struct OUT__1__1527___data *)__out_argv) -> OUT__1__1527___data::ii);
  int jj = (int )(((struct OUT__1__1527___data *)__out_argv) -> OUT__1__1527___data::jj);
  int kk = (int )(((struct OUT__1__1527___data *)__out_argv) -> OUT__1__1527___data::kk);
  int _p_ii = ii;
  int _p_jj = jj;
  int _p_kk = kk;
  if ((( *BENCH)[(_p_ii * bots_arg_size) + _p_jj]) == ((0L)))
    ( *BENCH)[(_p_ii * bots_arg_size) + _p_jj] = allocate_clean_block();
  bmod((( *BENCH)[(_p_ii * bots_arg_size) + _p_kk]),(( *BENCH)[(_p_kk * bots_arg_size) + _p_jj]),(( *BENCH)[(_p_ii * bots_arg_size) + _p_jj]));
}
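/* Hedged sketch: allocate_clean_block() is referenced throughout this listing
   but not shown. In the BOTS-style codes it typically returns a freshly
   zeroed bots_arg_size_1 x bots_arg_size_1 tile; the plain fprintf/exit error
   handling below is our assumption, and the dependency variant further below
   passes the block size explicitly instead of using the no-argument form
   sketched here. */
#include <stdio.h>
#include <stdlib.h>

float *allocate_clean_block(void)
{
   /* calloc gives a zero-initialized block, which is what the trailing updates expect */
   float *p = (float *) calloc((size_t) bots_arg_size_1 * bots_arg_size_1, sizeof(float));
   if (p == NULL) {
      fprintf(stderr, "Error: out of memory\n");
      exit(EXIT_FAILURE);
   }
   return p;
}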
void rec_lobatto( Teuchos::LAPACK<int,Real> &lapack,
                  const double xl1, const double xl2,
                  ROL::Vector<Real> &a, ROL::Vector<Real> &b ) {
  Teuchos::RCP<std::vector<Real> > ap =
    Teuchos::rcp_const_cast<std::vector<Real> >((Teuchos::dyn_cast<ROL::StdVector<Real> >(a)).getVector());
  Teuchos::RCP<std::vector<Real> > bp =
    Teuchos::rcp_const_cast<std::vector<Real> >((Teuchos::dyn_cast<ROL::StdVector<Real> >(b)).getVector());

  const int N = ap->size()-1;

  Teuchos::RCP<std::vector<Real> > amodp = Teuchos::rcp(new std::vector<Real> (N,0.0));
  Teuchos::RCP<std::vector<Real> > bmodp = Teuchos::rcp(new std::vector<Real> (N-1,0.0));
  Teuchos::RCP<std::vector<Real> > enp   = Teuchos::rcp(new std::vector<Real> (N,0.0));
  Teuchos::RCP<std::vector<Real> > gp    = Teuchos::rcp(new std::vector<Real> (N,0.0));

  // Nth canonical vector
  (*enp)[N-1] = 1.0;

  for(int i=0;i<N-1;++i) {
    (*bmodp)[i] = sqrt((*bp)[i+1]);
  }
  for(int i=0;i<N;++i) {
    (*amodp)[i] = (*ap)[i]-xl1;
  }

  ROL::StdVector<Real> amod(amodp);
  ROL::StdVector<Real> bmod(bmodp);
  ROL::StdVector<Real> en(enp);
  ROL::StdVector<Real> g(gp);

  trisolve(lapack,bmod,amod,bmod,en,g);
  Real g1 = (*gp)[N-1];

  for(int i=0;i<N;++i) {
    (*amodp)[i] = (*ap)[i]-xl2;
  }

  trisolve(lapack,bmod,amod,bmod,en,g);
  Real g2 = (*gp)[N-1];

  (*ap)[N] = (g1*xl2-g2*xl1)/(g1-g2);
  (*bp)[N] = (xl2-xl1)/(g1-g2);
}
void lu_serial( double* M[NB][NB] )
{
   double t_start, t_end;
   double time;
   t_start = mysecond();
   int ii, jj, kk;
   for (kk=0; kk<NB; kk++) {
      {
         double *diag = M[kk][kk];
         lu0(diag);
      }
      for (jj=kk+1; jj<NB; jj++)
         if (M[kk][jj] != NULL) {
            double *diag = M[kk][kk];
            double *col = M[kk][jj];
            fwd(diag, col);
         }
      for (ii=kk+1; ii<NB; ii++) {
         if (M[ii][kk] != NULL) {
            {
               double *row = M[kk][kk];
               double *diag = M[ii][kk];
               bdiv (diag, row);
            }
            for (jj=kk+1; jj<NB; jj++) {
               if (M[kk][jj] != NULL) {
                  if (M[ii][jj]==NULL) M[ii][jj]=allocate_clean_block();
                  {
                     double *row = M[ii][kk];
                     double *col = M[kk][jj];
                     double *inner = M[ii][jj];
                     bmod(row, col, inner);
                  }
               }
            }
         }
      }
   }
   t_end = mysecond();
   time = t_end - t_start;
   printf("Serial time to compute = %f sec\n", time);
}
void sparselu_par_call(float **BENCH)
{
   int ii, jj, kk;

   bots_message("Computing SparseLU Factorization (%dx%d matrix with %dx%d blocks) ",
           bots_arg_size,bots_arg_size,bots_arg_size_1,bots_arg_size_1);
#pragma omp parallel private(kk)
   {
      for (kk=0; kk<bots_arg_size; kk++) {
#pragma omp single
         lu0(BENCH[kk*bots_arg_size+kk]);
#pragma omp for nowait
         for (jj=kk+1; jj<bots_arg_size; jj++)
            if (BENCH[kk*bots_arg_size+jj] != NULL)
#pragma omp task untied firstprivate(kk, jj) shared(BENCH)
            {
               fwd(BENCH[kk*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj]);
            }
#pragma omp for
         for (ii=kk+1; ii<bots_arg_size; ii++)
            if (BENCH[ii*bots_arg_size+kk] != NULL)
#pragma omp task untied firstprivate(kk, ii) shared(BENCH)
            {
               bdiv (BENCH[kk*bots_arg_size+kk], BENCH[ii*bots_arg_size+kk]);
            }
#pragma omp for private(jj)
         for (ii=kk+1; ii<bots_arg_size; ii++)
            if (BENCH[ii*bots_arg_size+kk] != NULL)
               for (jj=kk+1; jj<bots_arg_size; jj++)
                  if (BENCH[kk*bots_arg_size+jj] != NULL)
#pragma omp task untied firstprivate(kk, jj, ii) shared(BENCH)
                  {
                     if (BENCH[ii*bots_arg_size+jj]==NULL)
                        BENCH[ii*bots_arg_size+jj] = allocate_clean_block();
                     bmod(BENCH[ii*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj], BENCH[ii*bots_arg_size+jj]);
                  }
      }
   }
   bots_message(" completed!\n");
}
void sparselu_par_call(float **BENCH, int matrix_size, int submatrix_size)
{
   int ii, jj, kk;

#pragma omp parallel private(kk,ii,jj) shared(BENCH)
#pragma omp single /* nowait */
   {
      /*#pragma omp task untied*/
      for (kk=0; kk<matrix_size; kk++) {
#pragma omp task firstprivate(kk) shared(BENCH) \
        depend(inout: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size])
         lu0(BENCH[kk*matrix_size+kk], submatrix_size);

         for (jj=kk+1; jj<matrix_size; jj++)
            if (BENCH[kk*matrix_size+jj] != NULL) {
#pragma omp task firstprivate(kk, jj) shared(BENCH) \
        depend(in: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size]) \
        depend(inout: BENCH[kk*matrix_size+jj:submatrix_size*submatrix_size])
               fwd(BENCH[kk*matrix_size+kk], BENCH[kk*matrix_size+jj], submatrix_size);
            }

         for (ii=kk+1; ii<matrix_size; ii++)
            if (BENCH[ii*matrix_size+kk] != NULL) {
#pragma omp task firstprivate(kk, ii) shared(BENCH) \
        depend(in: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size]) \
        depend(inout: BENCH[ii*matrix_size+kk:submatrix_size*submatrix_size])
               bdiv (BENCH[kk*matrix_size+kk], BENCH[ii*matrix_size+kk], submatrix_size);
            }

         for (ii=kk+1; ii<matrix_size; ii++)
            if (BENCH[ii*matrix_size+kk] != NULL)
               for (jj=kk+1; jj<matrix_size; jj++)
                  if (BENCH[kk*matrix_size+jj] != NULL) {
                     if (BENCH[ii*matrix_size+jj]==NULL)
                        BENCH[ii*matrix_size+jj] = allocate_clean_block(submatrix_size);
#pragma omp task firstprivate(kk, jj, ii) shared(BENCH) \
        depend(in: BENCH[ii*matrix_size+kk:submatrix_size*submatrix_size], BENCH[kk*matrix_size+jj:submatrix_size*submatrix_size]) \
        depend(inout: BENCH[ii*matrix_size+jj:submatrix_size*submatrix_size])
                     bmod(BENCH[ii*matrix_size+kk], BENCH[kk*matrix_size+jj], BENCH[ii*matrix_size+jj], submatrix_size);
                  }
      }
#pragma omp taskwait
   }
}
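/* Hedged usage sketch for the dependency-based sparselu_par_call() above.
   The dense, strongly diagonal initialization below is purely illustrative
   (a real run would use the benchmark's own sparse generator), and the sizes
   are arbitrary assumptions. */
#include <stdlib.h>

void sparselu_par_call(float **BENCH, int matrix_size, int submatrix_size);

int main(void)
{
   const int matrix_size = 8;        /* blocks per side (assumption) */
   const int submatrix_size = 32;    /* entries per block side (assumption) */
   float **BENCH = (float **) calloc((size_t) matrix_size * matrix_size, sizeof(float *));

   for (int b = 0; b < matrix_size * matrix_size; b++) {
      BENCH[b] = (float *) malloc((size_t) submatrix_size * submatrix_size * sizeof(float));
      for (int r = 0; r < submatrix_size; r++)
         for (int c = 0; c < submatrix_size; c++)
            /* strong diagonal keeps the unpivoted factorization away from zero pivots */
            BENCH[b][r * submatrix_size + c] =
               (r == c) ? (float) submatrix_size : 1.0f / (float) (r + c + 1);
   }

   sparselu_par_call(BENCH, matrix_size, submatrix_size);

   for (int b = 0; b < matrix_size * matrix_size; b++)
      free(BENCH[b]);
   free(BENCH);
   return 0;
}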
void lu(int n, int bs, int me)
{
  int i, il, j, jl, k, kl;
  int I, J, K;
  double *A, *B, *C, *D;
  int dimI, dimJ, dimK;
  int strI, strJ, strK;
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner;
  double *buf1, *buf2;

  /* temporary memories */
  buf1 = (double *)malloc(block_size*block_size*sizeof(double));
  buf2 = (double *)malloc(block_size*block_size*sizeof(double));

  for (k=0, K=0; k<n; k+=bs, K++) {
    kl = k + bs;
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    }

    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me) {
      A = a[K+K*nblocks];
      lu0(A, strK, strK);
    }
    MP_BARRIER();

    /* divide column k by diagonal block */
    if (block_owner(K, K) == me)
      D = a[K+K*nblocks];
    else {
      D = buf1;
      get_remote(D, K, K);
    }
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me) {  /* parcel out blocks */
        il = i + bs;
        if (il > n) {
          il = n;
          strI = il - i;
        } else {
          strI = bs;
        }
        A = a[I+K*nblocks];
        bdiv(A, D, strI, strK, strI, strK);
      }
    }

    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me) {  /* parcel out blocks */
        jl = j+bs;
        if (jl > n) {
          jl = n;
          strJ = jl - j;
        } else {
          strJ = bs;
        }
        A = a[K+J*nblocks];
        bmodd(D, A, strK, strJ, strK, strK);
      }
    }
    MP_BARRIER();

    /* modify subsequent block columns */
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs;
      if (il > n) {
        il = n;
        strI = il - i;
      } else {
        strI = bs;
      }
      if (block_owner(I,K) == me)
        A = a[I+K*nblocks];
      else {
        A = buf1;
        get_remote(A, I, K);
      }
      for (j=kl, J=K+1; j<n; j+=bs, J++) {
        jl = j + bs;
        if (jl > n) {
          jl = n;
          strJ = jl - j;
        } else {
          strJ = bs;
        }
        if (block_owner(I, J) == me) {  /* parcel out blocks */
          if (block_owner(K,J) == me)
            B = a[K+J*nblocks];
          else {
            B = buf2;
            get_remote(B, K, J);
          }
          C = a[I+J*nblocks];
          bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
        }
      }
    }
  }
  free(buf1);
  free(buf2);
}
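/* Hedged sketch: block_owner() decides which process (or thread, in the
   threaded variant further below) holds block (I, J). Distributed LU codes of
   this family often use a 2-D cyclic mapping over a process grid; both the
   mapping and the two grid globals below are assumptions for illustration,
   not the verified distribution of this codebase. */
int proc_rows, proc_cols;   /* hypothetical process-grid dimensions */

int block_owner(int I, int J)
{
   /* cyclic in both dimensions, so neighbouring blocks land on different owners */
   return (I % proc_rows) * proc_cols + (J % proc_cols);
}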
void slave()
{
  double *b;
  double *buffer;
  double *workbuf;
  int i,j,k;
  int myrow,nextrow,rownum;
  MPI_Status status;
  int ntasks,pid;

  /* get the number of the processes in the application. *
   * Can we define ntasks as a shared variable?          */
  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);

  rownum=matrix_size/(block_size*ntasks);

  /* allocate the local portion of the matrix */
  b=(double*)malloc(rownum*block_size*matrix_size*sizeof(double));

  /* allocate buffer space, it should be big enough *
   * to contain a whole row of blocks.              */
  buffer=(double *)malloc(matrix_size*block_size*sizeof(double));

  /* receive the initial matrix from process 0 */
  for (i=0;i<rownum;i++)
    MPI_Recv(&b[i*block_size*matrix_size],block_size*matrix_size,MPI_DOUBLE,
             0,i*block_size*ntasks+myrank*block_size,MPI_COMM_WORLD,&status);

  MPI_Barrier(MPI_COMM_WORLD);

  /* do the computation work of this process */
  for (i=0;i<matrix_size;i+=block_size) {
    /* compute the id of the process that owns row i *
     * through row i+block_size-1                     */
    pid=(i/block_size)%ntasks;
    myrow=((i/block_size)/ntasks)*block_size;
    if (pid==myrank) { /* my process */
      /* factor diagonal */
      lu0(&b[myrow+myrow*matrix_size],block_size,matrix_size);

      /* modify "column" by diagonal */
      for (j=myrow+block_size;j<matrix_size;j+=block_size)
        bdiv(&b[j+myrow*matrix_size],&b[myrow+myrow*matrix_size],
             block_size,matrix_size);

      /* send this row to the other processes; only the columns *
       * after the diagonal actually need to be sent?           */
      for (j=0;j<ntasks;j++) {
        if (j!=myrank)
          MPI_Send(&b[myrow*matrix_size],block_size*matrix_size,MPI_DOUBLE,
                   j,i,MPI_COMM_WORLD);
      }
      workbuf=&b[myrow*matrix_size];
    }
    else { /* other process */
      /* receive row i to row i+block_size-1 from process pid */
      MPI_Recv(buffer,block_size*matrix_size,MPI_DOUBLE,pid,i,
               MPI_COMM_WORLD,&status);
      workbuf=buffer;
    }

    if (myrank>pid)
      nextrow=myrow;
    else
      nextrow=myrow+block_size;

    /* modify the "row" using the diagonal */
    for (j=nextrow;j<matrix_size;j+=block_size)
      bmodd(&b[i+j*matrix_size],&workbuf[i],
            block_size,matrix_size);

    /* modify the internal rows and columns */
    for (j=nextrow;j<matrix_size;j+=block_size)
      for (k=i+block_size;k<matrix_size;k+=block_size)
        bmod(&b[k+j*matrix_size],&workbuf[k],&b[i+j*matrix_size],
             block_size,matrix_size);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /* send b back to process 0 */
  for (i=0;i<rownum;i++)
    MPI_Send(&b[i*block_size*matrix_size],block_size*matrix_size,MPI_DOUBLE,
             0,i*block_size*ntasks+myrank*block_size,MPI_COMM_WORLD);
}
void master()
{
  double *a;
  double *rhs;
  int ntasks,pid;
  double *buffer;
  double *workbuf;
  int i,j,k;
  MPI_Status status;
  time_t t0,t1;
  int ct;

  /* get the number of the processes in the application */
  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);

  /* allocate matrix, rhs vector */
  a = (double *) malloc( matrix_size*matrix_size*sizeof(double) );
  rhs = (double *) malloc( matrix_size*sizeof(double) );

  /* initialize the matrix */
  /* Do we need to allocate the whole matrix a, or only one row at a time, *
   * initializing each row and then sending it to the corresponding        *
   * process?                                                               */
  initializeMatrix( matrix_size, a, rhs );

  /* Send each block row to the corresponding process. */
  for (i=block_size;i<matrix_size;i+=block_size) {
    /* send row i to row i+block_size-1 to process (i / block_size) % ntasks */
    pid=(i/block_size)%ntasks;
    if (pid!=0)
      MPI_Send(&a[i*matrix_size],matrix_size*block_size,MPI_DOUBLE,
               pid,i,MPI_COMM_WORLD);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /* allocate buffer space, it should be big enough *
   * to contain a whole row of blocks.              */
  buffer=(double *)malloc(matrix_size*block_size*sizeof(double));

  time(&t0);

  /* Do the computation work of process 0 */
  for (i=0;i<matrix_size;i+=block_size) {
    /* compute the id of the process that owns row i *
     * through row i+block_size-1                     */
    pid=(i/block_size)%ntasks;
    if (pid==0) { /* master process, me! */
      /* factor diagonal */
      lu0(&a[i+i*matrix_size],block_size,matrix_size);

      /* modify "column" by diagonal */
      for (j=i+block_size;j<matrix_size;j+=block_size)
        bdiv(&a[j+i*matrix_size],&a[i+i*matrix_size],
             block_size,matrix_size);

      /* send this row to the other processes; only the columns *
       * after the diagonal actually need to be sent?           */
      for (j=1;j<ntasks;j++) {
        MPI_Send(&a[i*matrix_size],block_size*matrix_size,MPI_DOUBLE,
                 j,i,MPI_COMM_WORLD);
      }
      workbuf=&a[i*matrix_size];
    }
    else { /* other process */
      /* receive row i to row i+block_size-1 from process pid */
      MPI_Recv(buffer,block_size*matrix_size,MPI_DOUBLE,pid,i,
               MPI_COMM_WORLD,&status);
      workbuf=buffer;
    }

    /* modify the "row" using the diagonal */
    for (j=i+(ntasks-pid)*block_size;j<matrix_size;j+=block_size*ntasks)
      bmodd(&a[i+j*matrix_size],&workbuf[i],
            block_size,matrix_size);

    /* modify the internal rows and columns */
    for (j=i+(ntasks-pid)*block_size;j<matrix_size;j+=block_size*ntasks)
      for (k=i+block_size;k<matrix_size;k+=block_size)
        bmod(&a[k+j*matrix_size],&workbuf[k],&a[i+j*matrix_size],
             block_size,matrix_size);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  time(&t1);
  ct=t1-t0;
  printf("LU decomposition took %d seconds\n", ct);

  /* Receive the modified matrix from all other processes. */
  for (i=0;i<matrix_size;i+=block_size) {
    /* compute the id of the process that owns row i *
     * through row i+block_size-1                     */
    pid=(i/block_size)%ntasks;
    if (pid!=0)
      MPI_Recv(&a[i*matrix_size],block_size*matrix_size,MPI_DOUBLE,pid,i,
               MPI_COMM_WORLD,&status);
  }

  /* test the resulting decomposition */
  checkResult(matrix_size,a,rhs);
}
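/* Hedged sketch of a driver for the master()/slave() pair above: rank 0 runs
   master(), every other rank runs slave(). It assumes myrank, matrix_size and
   block_size are globals set elsewhere (e.g. parsed from argv); only the MPI
   boilerplate is shown, not this codebase's actual main(). */
#include <mpi.h>

int main(int argc, char **argv)
{
   MPI_Init(&argc, &argv);
   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);   /* myrank is the global used above */

   if (myrank == 0)
      master();
   else
      slave();

   MPI_Finalize();
   return 0;
}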
void lu(int n, int bs, int me)
{
  int i, il, j, jl, k, kl;
  int I, J, K;
  double *A, *B, *C, *D;
  int dimI, dimJ, dimK;
  int strI, strJ, strK;
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner, destp, hc, m;
  double *dbuf;
  armci_hdl_t handle[2*MAXPROC];
  int saved[MAXPROC];

  dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));

  for (k=0, K=0; k<n; k+=bs, K++) {
    kl = k + bs;
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    }

    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me) {
      A = a[K+K*nblocks];
      lu0(A, strK, strK); /* factor this diagonal block in place */
    }
    MP_BARRIER();

    /* divide column k by diagonal block */
    if (block_owner(K, K) == me)
      D = a[K+K*nblocks];
    else {
      D = dbuf;
      get_remote(D, K, K);
    }
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me) {  /* parcel out blocks */
        il = i + bs;
        if (il > n) {
          il = n;
          strI = il - i;
        } else {
          strI = bs;
        }
        A = a[I+K*nblocks];
        bdiv(A, D, strI, strK, strI, strK);

        /* Pre-put this block to the block-owners of all blocks on the
           I-th row with a non-blocking put */
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
          destp = block_owner (I, m);
          if (destp != me && !saved[destp]) {
            ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
            saved[destp] = 1;
          }
        }
      }
    } /* end of for (i=kl, I=K+1...) */

    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me) {  /* parcel out blocks */
        jl = j+bs;
        if (jl > n) {
          jl = n;
          strJ = jl - j;
        } else {
          strJ = bs;
        }
        A = a[K+J*nblocks];
        bmodd(D, A, strK, strJ, strK, strK);

        /* Pre-put this block to the block-owners of all blocks on the
           J-th column with a non-blocking put */
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
          destp = block_owner (m, J);
          if (destp != me && !saved[destp]) {
            ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
            saved[destp] = 1;
          }
        }
      }
    }

    ARMCI_WaitAll();
    ARMCI_AllFence();
    MP_BARRIER();

    /* modify subsequent block columns */
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs;
      if (il > n) {
        il = n;
        strI = il - i;
      } else {
        strI = bs;
      }
      for (j=kl, J=K+1; j<n; j+=bs, J++) {
        jl = j + bs;
        if (jl > n) {
          jl = n;
          strJ = jl - j;
        } else {
          strJ = bs;
        }
        if (block_owner(I, J) == me) {  /* parcel out blocks */
          if (block_owner(I,K) == me)
            A = a[I+K*nblocks];
          else {
            A = bufc[me*nblocks+I];
          }
          if (block_owner(K,J) == me)
            B = a[K+J*nblocks];
          else
            B = bufr[me*nblocks + J];
          C = a[I+J*nblocks];
          bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
        }
      }
    }
  }
  ARMCI_Free_local(dbuf);
}
void *lu(void *lu_arg)
{
  int n, bs, th_idx;
  int i, il, j, jl, k, kl;
  int I, J, K;
  double *A, *B, *C, *D;
  int dimI, dimJ, dimK;
  int strI, strJ, strK;
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner;
  double *buf1, *buf2;

  n = ((int *)lu_arg)[0];
  bs = ((int *)lu_arg)[1];
  th_idx = ((int *)lu_arg)[2];

#ifdef DEBUG
  printf("DBG: starting thread %d(idx=%d) on node %d\n", me_th[th_idx], th_idx, me);
  fflush(stdout);
#endif

  /* temporary memories */
  buf1 = (double *)malloc(block_size*block_size*sizeof(double));
  buf2 = (double *)malloc(block_size*block_size*sizeof(double));

  for (k=0, K=0; k<n; k+=bs, K++) {
    kl = k + bs;
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    }

    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me_th[th_idx]) {
      A = a[K+K*nblocks];
      print_block_dbg(A, "th=%d, idx=%d: before lu0 a[%d]:\n", me_th[th_idx], th_idx, K+K*nblocks);
      lu0(A, strK, strK);
    }
    MT_BARRIER();

    /* divide column k by diagonal block */
    if (block_owner(K, K) == me_th[th_idx])
      D = a[K+K*nblocks];
    else {
      D = buf1;
      get_remote(D, K, K);
    }
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me_th[th_idx]) {  /* parcel out blocks */
        il = i + bs;
        if (il > n) {
          il = n;
          strI = il - i;
        } else {
          strI = bs;
        }
        A = a[I+K*nblocks];
        bdiv(A, D, strI, strK, strI, strK);
      }
    }

    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me_th[th_idx]) {  /* parcel out blocks */
        jl = j+bs;
        if (jl > n) {
          jl = n;
          strJ = jl - j;
        } else {
          strJ = bs;
        }
        A = a[K+J*nblocks];
        bmodd(D, A, strK, strJ, strK, strK);
      }
    }
    MT_BARRIER();

    /* modify subsequent block columns */
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs;
      if (il > n) {
        il = n;
        strI = il - i;
      } else {
        strI = bs;
      }
      if (block_owner(I,K) == me_th[th_idx])
        A = a[I+K*nblocks];
      else {
        A = buf1;
        get_remote(A, I, K);
      }
      for (j=kl, J=K+1; j<n; j+=bs, J++) {
        jl = j + bs;
        if (jl > n) {
          jl = n;
          strJ = jl - j;
        } else {
          strJ = bs;
        }
        if (block_owner(I, J) == me_th[th_idx]) {  /* parcel out blocks */
          if (block_owner(K,J) == me_th[th_idx])
            B = a[K+J*nblocks];
          else {
            B = buf2;
            get_remote(B, K, J);
          }
          C = a[I+J*nblocks];
          bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
        }
      }
    }
  }
  free(buf1);
  free(buf2);
  return lu_arg;
}