void
get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid,
               int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len)
{
    int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk;
    int_t *xsup;

    i = j = *num_diag_procs = pkk = 0;
    nprow = grid->nprow;
    npcol = grid->npcol;
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;

    do {
        ++(*num_diag_procs);
        i = (i + 1) % nprow;
        j = (j + 1) % npcol;
        pkk = PNUM( i, j, grid );
    } while ( pkk != 0 ); /* Until wrapping back to process 0. */

    if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) )
        ABORT("Malloc fails for diag_procs[]");
    if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) )
        ABORT("Calloc fails for diag_len[]");

    for (i = j = k = 0; k < *num_diag_procs; ++k) {
        pkk = PNUM( i, j, grid );
        (*diag_procs)[k] = pkk;
        i = (i + 1) % nprow;
        j = (j + 1) % npcol;
    }

    for (k = 0; k < nsupers; ++k) {
        knsupc = SuperSize( k );
        i = k % *num_diag_procs;
        (*diag_len)[i] += knsupc;
    }
}
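/*
 * Illustrative sketch (not part of the library build): the wrap-around walk in
 * get_diag_procs() above, replayed on a small 2 x 3 grid.  It assumes the usual
 * row-major rank mapping PNUM(i,j,grid) = i*npcol + j; under that assumption
 * the walk visits lcm(nprow,npcol) distinct ranks before returning to rank 0.
 */
#if 0   /* Standalone demo; compile separately. */
#include <stdio.h>

static int pnum(int i, int j, int npcol) { return i * npcol + j; } /* assumed PNUM() */

int main(void)
{
    int nprow = 2, npcol = 3;             /* a 2 x 3 process grid */
    int i = 0, j = 0, ndiag = 0, pkk = 0;

    do {                                  /* same walk as in get_diag_procs() */
        printf("diagonal process: rank %d\n", pkk);
        ++ndiag;
        i = (i + 1) % nprow;
        j = (j + 1) % npcol;
        pkk = pnum(i, j, npcol);
    } while (pkk != 0);                   /* until the walk wraps back to rank 0 */

    printf("num_diag_procs = %d\n", ndiag);   /* 6 = lcm(2,3) */
    return 0;
}
#endif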
/*
 * r[] is the residual vector distributed the same way as
 * matrix-vector product.
 */
static void
redist_all_to_diag(int_t n, doublecomplex r[], Glu_persist_t *Glu_persist,
                   LocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[],
                   doublecomplex work[])
{
    int_t i, ii, k, lk, lr, nsupers;
    int_t *ilsum, *xsup;
    int   iam, knsupc, psrc, pkk;
    MPI_Status status;

    iam = grid->iam;
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    ilsum = Llu->ilsum;
    lr = 0;

    for (k = 0; k < nsupers; ++k) {
        pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid );
        psrc = mv_sup_to_proc[k];
        knsupc = SuperSize( k );
        lk = LBi( k, grid );
        ii = ilsum[lk] + (lk+1)*XK_H;
        if ( iam == psrc ) {
            if ( iam != pkk ) { /* Send X component. */
                MPI_Send( &r[lr], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, pkk,
                          Xk, grid->comm );
            } else { /* Local copy. */
                for (i = 0; i < knsupc; ++i)
                    work[i + ii] = r[i + lr];
            }
            lr += knsupc;
        } else {
            if ( iam == pkk ) { /* Recv X component. */
                MPI_Recv( &work[ii], knsupc, SuperLU_MPI_DOUBLE_COMPLEX, psrc,
                          Xk, grid->comm, &status );
            }
        }
    }
} /* REDIST_ALL_TO_DIAG */
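/*
 * Sketch of the owner mapping used above: each residual block r[k] travels to
 * the diagonal process pkk = PNUM(PROW(k), PCOL(k)).  The block-cyclic macros
 * are assumed here to be PROW(k) = k % nprow, PCOL(k) = k % npcol and
 * PNUM(i,j) = i*npcol + j; values below are made up for illustration.
 */
#if 0   /* Standalone demo; compile separately. */
#include <stdio.h>

int main(void)
{
    int nprow = 2, npcol = 3;
    int nsupers = 8;
    for (int k = 0; k < nsupers; ++k) {
        int prow = k % nprow;              /* assumed PROW(k, grid)          */
        int pcol = k % npcol;              /* assumed PCOL(k, grid)          */
        int pkk  = prow * npcol + pcol;    /* assumed PNUM(prow, pcol, grid) */
        printf("supernode %d -> diagonal owner rank %d\n", k, pkk);
    }
    return 0;
}
#endif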
/*! \brief * * <pre> * Purpose * ======= * Perform local block modifications: lsum[i] -= L_i,k * X[k]. * </pre> */ void dlsum_fmod /************************************************************************/ ( double *lsum, /* Sum of local modifications. */ double *x, /* X array (local) */ double *xk, /* X[k]. */ double *rtemp, /* Result of full matrix-vector multiply. */ int nrhs, /* Number of right-hand sides. */ int knsupc, /* Size of supernode k. */ int_t k, /* The k-th component of X. */ int_t *fmod, /* Modification count for L-solve. */ int_t nlb, /* Number of L blocks. */ int_t lptr, /* Starting position in lsub[*]. */ int_t luptr, /* Starting position in lusup[*]. */ int_t *xsup, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request send_req[], SuperLUStat_t *stat ) { double alpha = 1.0, beta = 0.0; double *lusup, *lusup1; double *dest; int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; int_t i, ii, ik, il, ikcol, irow, j, lb, lk, rel; int_t *lsub, *lsub1, nlb1, lptr1, luptr1; int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ int_t *frecv = Llu->frecv; int_t **fsendx_plist = Llu->fsendx_plist; MPI_Status status; int test_flag; iam = grid->iam; myrow = MYROW( iam, grid ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Llu->Lrowind_bc_ptr[lk]; lusup = Llu->Lnzval_bc_ptr[lk]; nsupr = lsub[1]; for (lb = 0; lb < nlb; ++lb) { ik = lsub[lptr]; /* Global block number, row-wise. */ nbrow = lsub[lptr+1]; #ifdef _CRAY SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, &alpha, &lusup[luptr], &nsupr, xk, &knsupc, &beta, rtemp, &nbrow ); #else dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, &alpha, &lusup[luptr], &nsupr, xk, &knsupc, &beta, rtemp, &nbrow ); #endif stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; lk = LBi( ik, grid ); /* Local block number, row-wise. */ iknsupc = SuperSize( ik ); il = LSUM_BLK( lk ); dest = &lsum[il]; lptr += LB_DESCRIPTOR; rel = xsup[ik]; /* Global row index of block ik. */ for (i = 0; i < nbrow; ++i) { irow = lsub[lptr++] - rel; /* Relative row. */ RHS_ITERATE(j) dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; } luptr += nbrow; if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ ikcol = PCOL( ik, grid ); p = PNUM( myrow, ikcol, grid ); if ( iam != p ) { #ifdef ISEND_IRECV MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm ); #else MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); #endif } else { /* Diagonal process: X[i] += lsum[i]. */ ii = X_BLK( lk ); RHS_ITERATE(j) for (i = 0; i < iknsupc; ++i) x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( ik, grid );/* Local block number, column-wise. */ lsub1 = Llu->Lrowind_bc_ptr[lk]; lusup1 = Llu->Lnzval_bc_ptr[lk]; nsupr1 = lsub1[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, lusup1, &nsupr1, &x[ii], &iknsupc); #else dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, lusup1, &nsupr1, &x[ii], &iknsupc); #endif stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); #endif /* * Send Xk to process column Pc[k]. 
*/ for (p = 0; p < grid->nprow; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, ikcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #else MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nlb1 = lsub1[0] - 1; lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, lptr1, luptr1, xsup, grid, Llu, send_req, stat); } /* if frecv[lk] == 0 */ } /* if iam == p */ } /* if fmod[lk] == 0 */ } /* for lb ... */ } /* DLSUM_FMOD */
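/*
 * Dense sketch of one dlsum_fmod() block modification: rtemp := L(i,k) * X[k]
 * (the dgemm_ call above), after which the rows of rtemp are scattered and
 * subtracted from the destination lsum block using relative row indices, as
 * the loop over lsub[] does.  Sizes and values here are made up for
 * illustration; plain loops stand in for the BLAS call.
 */
#if 0   /* Standalone demo; compile separately. */
#include <stdio.h>

int main(void)
{
    enum { NBROW = 2, KNSUPC = 2, NRHS = 1, IKNSUPC = 4 };
    /* Column-major block L(i,k); leading dimension NBROW for simplicity. */
    double Lblk[NBROW * KNSUPC] = { 1.0, 2.0,   /* column 0 */
                                    3.0, 4.0 }; /* column 1 */
    double Xk[KNSUPC * NRHS]    = { 1.0, 1.0 };
    double rtemp[NBROW * NRHS];
    double lsum[IKNSUPC * NRHS] = { 0.0 };
    int    rel_row[NBROW] = { 1, 3 };  /* relative rows, like lsub[] - xsup[ik] */

    /* rtemp := L_blk * Xk  (what dgemm_ computes in dlsum_fmod) */
    for (int j = 0; j < NRHS; ++j)
        for (int i = 0; i < NBROW; ++i) {
            double s = 0.0;
            for (int l = 0; l < KNSUPC; ++l)
                s += Lblk[i + l * NBROW] * Xk[l + j * KNSUPC];
            rtemp[i + j * NBROW] = s;
        }

    /* Scatter-subtract into the destination lsum block. */
    for (int i = 0; i < NBROW; ++i)
        for (int j = 0; j < NRHS; ++j)
            lsum[rel_row[i] + j * IKNSUPC] -= rtemp[i + j * NBROW];

    for (int i = 0; i < IKNSUPC; ++i) printf("lsum[%d] = %g\n", i, lsum[i]);
    return 0;
}
#endif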
static void pdgstrs2 /************************************************************************/ #ifdef _CRAY ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 ) #else ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat ) #endif /* * Purpose * ======= * Perform parallel triangular solves * U(k,:) := A(k,:) \ L(k,k). * Only the process row that owns block row *k* participates * in the work. * * Arguments * ========= * * m (input) int (global) * Number of rows in the matrix. * * k (input) int (global) * The row number of the block row to be factorized. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization; * See SuperLUStat_t structure defined in util.h. * */ { int iam, pkk; int incx = 1; int nsupr; /* number of rows in the block L(:,k) (LDA) */ int segsize; int_t nsupc; /* number of columns in the block */ int_t luptr, iukp, rukp; int_t b, gb, j, klst, knsupc, lk, nb; int_t *xsup = Glu_persist->xsup; int_t *usub; double *lusup, *uval; /* Quick return. */ lk = LBi( k, grid ); /* Local block number */ if ( !Llu->Unzval_br_ptr[lk] ) return; /* Initialization. */ iam = grid->iam; pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); klst = FstBlockC( k+1 ); knsupc = SuperSize( k ); usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ uval = Llu->Unzval_br_ptr[lk]; nb = usub[0]; iukp = BR_HEADER; rukp = 0; if ( iam == pkk ) { lk = LBj( k, grid ); nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ lusup = Llu->Lnzval_bc_ptr[lk]; } else { nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ lusup = Llu->Lval_buf_2[k%2]; } /* Loop through all the row blocks. */ for (b = 0; b < nb; ++b) { gb = usub[iukp]; nsupc = SuperSize( gb ); iukp += UB_DESCRIPTOR; /* Loop through all the segments in the block. */ for (j = 0; j < nsupc; ++j) { segsize = klst - usub[iukp++]; if ( segsize ) { /* Nonzero segment. */ luptr = (knsupc - segsize) * (nsupr + 1); #ifdef _CRAY STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #elif defined (USE_VENDOR_BLAS) dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx, 1, 1, 1); #else dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #endif stat->ops[FACT] += segsize * (segsize + 1); rukp += segsize; } } } /* for b ... */ } /* PDGSTRS2 */
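/*
 * Sketch of one segment solve in pdgstrs2() above: dtrsv_("L","N","U") performs
 * forward substitution with a unit-lower-triangular diagonal block of L(k,k),
 * stored column-major with leading dimension nsupr.  A plain-loop equivalent on
 * a tiny 3 x 3 example with made-up data:
 */
#if 0   /* Standalone demo; compile separately. */
#include <stdio.h>

int main(void)
{
    enum { N = 3, LDA = 3 };
    /* Unit-lower-triangular L, column-major; the unit diagonal is implicit. */
    double L[LDA * N] = { 1.0, 2.0, 3.0,    /* column 0 */
                          0.0, 1.0, 4.0,    /* column 1 */
                          0.0, 0.0, 1.0 };  /* column 2 */
    double x[N] = { 1.0, 1.0, 1.0 };        /* one U segment (right-hand side) */

    for (int j = 0; j < N; ++j)             /* forward substitution, unit diag */
        for (int i = j + 1; i < N; ++i)
            x[i] -= L[i + j * LDA] * x[j];

    for (int i = 0; i < N; ++i) printf("x[%d] = %g\n", i, x[i]);
    return 0;
}
#endif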
static void pdgstrf2 /************************************************************************/ ( superlu_options_t *options, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, SuperLUStat_t *stat, int* info ) /* * Purpose * ======= * * Panel factorization -- block column k * Factor diagonal and subdiagonal blocks and test for exact singularity. * Only the column processes that owns block column *k* participate * in the work. * * Arguments * ========= * * k (input) int (global) * The column number of the block column to be factorized. * * thresh (input) double (global) * The threshold value = s_eps * anorm. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * U_diag_blk_send_req (input/output) MPI_Request* * List of send requests to send down the diagonal block of U. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization. * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { int cols_left, iam, l, pkk, pr; int incx = 1, incy = 1; int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t i, krow, j, jfst, jlst, u_diag_cnt; int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; int_t Pr; MPI_Status status; MPI_Comm comm = (grid->cscp).comm; double *lusup, temp; double *ujrow, *ublk_ptr; /* pointer to the U block */ double alpha = -1; *info = 0; /* Quick return. */ /* Initialization. */ iam = grid->iam; Pr = grid->nprow; krow = PROW( k, grid ); pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); j = LBj( k, grid ); /* Local block number */ jfst = FstBlockC( k ); jlst = FstBlockC( k+1 ); lusup = Llu->Lnzval_bc_ptr[j]; nsupc = SuperSize( k ); if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; ublk_ptr = ujrow = Llu->ujrow; luptr = 0; /* point to the diagonal entries. */ cols_left = nsupc; /* supernode size */ u_diag_cnt = 0; if ( iam == pkk ) { /* diagonal process */ if ( U_diag_blk_send_req && U_diag_blk_send_req[krow] ) { /* There are pending sends - wait for all Isend to complete */ for (pr = 0; pr < Pr; ++pr) if (pr != krow) MPI_Wait(U_diag_blk_send_req + pr, &status); } for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ /* Diagonal pivot */ i = luptr; if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { if ( fabs(lusup[i]) < thresh ) { #if ( PRNTlevel>=2 ) printf("(%d) .. col %d, tiny pivot %e ", iam, jfst+j, lusup[i]); #endif /* Keep the new diagonal entry with the same sign. */ if ( lusup[i] < 0 ) lusup[i] = -thresh; else lusup[i] = thresh; #if ( PRNTlevel>=2 ) printf("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } } for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */ if ( ujrow[0] == 0.0 ) { /* Test for singularity. */ *info = j+jfst+1; } else { /* Scale the j-th column. */ temp = 1.0 / ujrow[0]; for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr-j-1; } /* Rank-1 update of the trailing submatrix. 
*/ if ( --cols_left ) { l = nsupr - j - 1; #ifdef _CRAY SGER(&l, &cols_left, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #else dger_(&l, &cols_left, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #endif stat->ops[FACT] += 2 * l * cols_left; } ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */ luptr += nsupr + 1; /* move to next column */ } /* for column j ... */ if ( U_diag_blk_send_req && iam == pkk ) { /* Send the U block */ /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ for (pr = 0; pr < Pr; ++pr) if (pr != krow) MPI_Isend(ublk_ptr, u_diag_cnt, MPI_DOUBLE, pr, ((k<<2)+2)%NTAGS, comm, U_diag_blk_send_req + pr); U_diag_blk_send_req[krow] = 1; /* flag outstanding Isend */ } } else { /* non-diagonal process */ /* Receive the diagonal block of U */ MPI_Recv(ublk_ptr, (nsupc*(nsupc+1))>>1, MPI_DOUBLE, krow, ((k<<2)+2)%NTAGS, comm, &status); for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ u_diag_cnt += cols_left; if ( !lusup ) { /* empty block column */ --cols_left; if ( ujrow[0] == 0.0 ) *info = j+jfst+1; continue; } /* Test for singularity. */ if ( ujrow[0] == 0.0 ) { *info = j+jfst+1; } else { /* Scale the j-th column. */ temp = 1.0 / ujrow[0]; for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr; } /* Rank-1 update of the trailing submatrix. */ if ( --cols_left ) { #ifdef _CRAY SGER(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #else dger_(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #endif stat->ops[FACT] += 2 * nsupr * cols_left; } ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */ luptr += nsupr; /* move to next column */ } /* for column j ... */ } /* end if pkk ... */ } /* PDGSTRF2 */
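/*
 * Sketch of the per-column work in pdgstrf2() above: replace a tiny pivot by
 * +/- thresh, scale the subdiagonal part of the column by the pivot, then apply
 * a rank-1 update (the dger_ call with alpha = -1) to the trailing submatrix.
 * The same steps on a dense column-major 3 x 2 panel with made-up data:
 */
#if 0   /* Standalone demo; compile separately. */
#include <stdio.h>
#include <math.h>

int main(void)
{
    enum { M = 3, N = 2, LDA = 3 };
    double A[LDA * N] = { 4.0, 2.0, 8.0,    /* column 0 */
                          1.0, 3.0, 5.0 };  /* column 1 */
    double thresh = 1e-12;                  /* would be s_eps * anorm */

    for (int j = 0; j < N; ++j) {
        double *ajj = &A[j + j * LDA];
        if (fabs(*ajj) < thresh)            /* tiny-pivot replacement */
            *ajj = (*ajj < 0.0) ? -thresh : thresh;

        for (int i = j + 1; i < M; ++i)     /* scale column j below the diagonal */
            A[i + j * LDA] /= *ajj;

        for (int c = j + 1; c < N; ++c)     /* rank-1 update of the trailing block */
            for (int i = j + 1; i < M; ++i)
                A[i + c * LDA] -= A[i + j * LDA] * A[j + c * LDA];
    }

    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) printf(" %8.4f", A[i + j * LDA]);
        printf("\n");
    }
    return 0;
}
#endif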
static void pdgstrf2 /************************************************************************/ ( superlu_options_t *options, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, int* info ) /* * Purpose * ======= * Factor diagonal and subdiagonal blocks and test for exact singularity. * Only the process column that owns block column *k* participates * in the work. * * Arguments * ========= * * k (input) int (global) * The column number of the block column to be factorized. * * thresh (input) double (global) * The threshold value = s_eps * anorm. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization. * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { int c, iam, l, pkk; int incx = 1, incy = 1; int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t i, krow, j, jfst, jlst; int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; double *lusup, temp; double *ujrow; double alpha = -1; *info = 0; /* Quick return. */ /* Initialization. */ iam = grid->iam; krow = PROW( k, grid ); pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); j = LBj( k, grid ); /* Local block number */ jfst = FstBlockC( k ); jlst = FstBlockC( k+1 ); lusup = Llu->Lnzval_bc_ptr[j]; nsupc = SuperSize( k ); if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; ujrow = Llu->ujrow; luptr = 0; /* Point to the diagonal entries. */ c = nsupc; for (j = 0; j < jlst - jfst; ++j) { /* Broadcast the j-th row (nsupc - j) elements to the process column. */ if ( iam == pkk ) { /* Diagonal process. */ i = luptr; if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ #if ( PRNTlevel>=2 ) printf("(%d) .. col %d, tiny pivot %e ", iam, jfst+j, lusup[i]); #endif /* Keep the replaced diagonal with the same sign. */ if ( lusup[i] < 0 ) lusup[i] = -thresh; else lusup[i] = thresh; #if ( PRNTlevel>=2 ) printf("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } } for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; } #if 0 dbcast_col(ujrow, c, pkk, UjROW, grid, &c); #else MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm); /*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS, grid, COMM_COLUMN, &c);*/ #endif #if ( DEBUGlevel>=2 ) if ( k == 3329 && j == 2 ) { if ( iam == pkk ) { printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); } else { printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); } } #endif if ( !lusup ) { /* Empty block column. */ --c; if ( ujrow[0] == 0.0 ) *info = j+jfst+1; continue; } /* Test for singularity. */ if ( ujrow[0] == 0.0 ) { *info = j+jfst+1; } else { /* Scale the j-th column of the matrix. 
*/ temp = 1.0 / ujrow[0]; if ( iam == pkk ) { for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr-j-1; } else { for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr; } } /* Rank-1 update of the trailing submatrix. */ if ( --c ) { if ( iam == pkk ) { l = nsupr - j - 1; #ifdef _CRAY SGER(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #elif defined (USE_VENDOR_BLAS) dger_(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #else hypre_F90_NAME_BLAS(dger,DGER)(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #endif stat->ops[FACT] += 2 * l * c; } else { #ifdef _CRAY SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #elif defined (USE_VENDOR_BLAS) dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #else hypre_F90_NAME_BLAS(dger,DGER)(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #endif stat->ops[FACT] += 2 * nsupr * c; } } /* Move to the next column. */ if ( iam == pkk ) luptr += nsupr + 1; else luptr += nsupr; } /* for j ... */ } /* PDGSTRF2 */
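/*
 * Sketch of the communication pattern in the pdgstrf2() variant above: the
 * diagonal process fills ujrow[] and MPI_Bcast delivers it to the rest of its
 * process column.  MPI_COMM_WORLD stands in for (grid->cscp).comm here, and the
 * data are made up.  Run with e.g. mpirun -np 3.
 */
#if 0   /* Standalone demo; compile separately with an MPI compiler. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, root = 0;
    double ujrow[4] = { 0.0, 0.0, 0.0, 0.0 };

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == root)                   /* "diagonal" process fills the row */
        for (int l = 0; l < 4; ++l) ujrow[l] = 1.0 + l;

    MPI_Bcast(ujrow, 4, MPI_DOUBLE, root, MPI_COMM_WORLD);

    printf("(%d) ujrow[0] = %g\n", rank, ujrow[0]);
    MPI_Finalize();
    return 0;
}
#endif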
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PZGSTRS solves a system of distributed linear equations * A*X = B with a general N-by-N matrix A using the LU factorization * computed by PZGSTRF. * If the equilibration, and row and column permutations were performed, * the LU factorization was performed for A1 where * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * and the linear system solved is * A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and * the permutation to B1 by Pc*Pr is applied internally in this routine. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures storing L and U factors. * The L and U factors are obtained from PZGSTRF for * the possibly scaled and permuted matrix A. * See superlu_zdefs.h for the definition of 'LUstruct_t'. * A may be scaled and permuted into A1, so that * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_defs.h for the definition of 'gridinfo_t'. * * B (input/output) doublecomplex* * On entry, the distributed right-hand side matrix of the possibly * equilibrated system. That is, B may be overwritten by diag(R)*B. * On exit, the distributed solution matrix Y of the possibly * equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X, * and X is the solution of the original system. * * m_loc (input) int (local) * The local row dimension of matrix B. * * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * SOLVEstruct (output) SOLVEstruct_t* (global) * Contains the information for the communication during the * solution phase. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. 
*/ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. 
*/ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. 
*/ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. 
After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
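/*
 * Sketch of the in-band header used by the Xk and LSUM messages in pzgstrs():
 * the global block number k is stored in the first slot of the buffer
 * (x[ii - XK_H].r = k, lsum[il - LSUM_H].r = k), the payload follows, and the
 * receiver recovers k from recvbuf before dispatching on the MPI tag.  Plain
 * double stands in for doublecomplex; names with the _DEMO suffix are made up.
 */
#if 0   /* Standalone demo; compile separately. */
#include <stdio.h>

#define XK_H_DEMO 1                 /* header length, in payload elements */

int main(void)
{
    enum { KNSUPC = 3, NRHS = 1 };
    double buf[XK_H_DEMO + KNSUPC * NRHS];
    int    k = 7;                   /* global block (supernode) number */

    buf[0] = (double) k;            /* like x[ii - XK_H].r = k          */
    for (int i = 0; i < KNSUPC * NRHS; ++i)
        buf[XK_H_DEMO + i] = 0.5 * i;   /* the X[k] payload             */

    /* ... the solver would MPI_Send/Isend buf with tag Xk here ... */

    /* Receiver side: recover the block number, then the payload. */
    int k_recv = (int) buf[0];      /* like k = (*recvbuf).r            */
    printf("received block %d, first payload entry %g\n",
           k_recv, buf[XK_H_DEMO]);
    return 0;
}
#endif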
int_t pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, int_t fst_row, int_t *ilsum, doublecomplex *x, ScalePermstruct_t *ScalePermstruct, Glu_persist_t *Glu_persist, gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { /* * Purpose * ======= * Re-distribute B on the diagonal processes of the 2D process mesh. * * Note * ==== * This routine can only be called after the routine pxgstrs_init(), * in which the structures of the send and receive buffers are set up. * * Arguments * ========= * * B (input) doublecomplex* * The distributed right-hand side matrix of the possibly * equilibrated system. * * m_loc (input) int (local) * The local row dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * ldb (input) int (local) * Leading dimension of matrix B. * * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ilsum (input) int* (global) * Starting position of each supernode in a full array. * * x (output) doublecomplex* * The solution vector. It is valid only on the diagonal processes. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * grid (input) gridinfo_t* * The 2D process mesh. * * SOLVEstruct (input) SOLVEstruct_t* * Contains the information for the communication during the * solution phase. * * Return value * ============ * */ int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *ptr_to_ibuf, *ptr_to_dbuf; int_t *perm_r, *perm_c; /* row and column permutation vectors */ int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; int_t *xsup, *supno; int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; int p, procs; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()"); #endif /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; xsup = Glu_persist->xsup; supno = Glu_persist->supno; SendCnt = gstrs_comm->B_to_X_SendCnt; SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; /* ------------------------------------------------------------ NOW COMMUNICATE THE ACTUAL DATA. ------------------------------------------------------------*/ k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; for (p = 0; p < procs; ++p) { ptr_to_ibuf[p] = sdispls[p]; ptr_to_dbuf[p] = sdispls[p] * nrhs; } /* Copy the row indices and values to the send buffer. 
*/ for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ gbi = BlockNum( irow ); p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ k = ptr_to_ibuf[p]; send_ibuf[k] = irow; k = ptr_to_dbuf[p]; RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ send_dbuf[k++] = B[i + j*ldb]; } ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } /* Communicate the (permuted) row indices. */ MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); /* Communicate the numerical values. */ MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); /* ------------------------------------------------------------ Copy buffer into X on the diagonal processes. ------------------------------------------------------------*/ ii = 0; for (p = 0; p < procs; ++p) { jj = rdispls_nrhs[p]; for (i = 0; i < RecvCnt[p]; ++i) { /* Only the diagonal processes do this; the off-diagonal processes have 0 RecvCnt. */ irow = recv_ibuf[ii]; /* The permuted row index. */ k = BlockNum( irow ); knsupc = SuperSize( k ); lk = LBi( k, grid ); /* Local block number. */ l = X_BLK( lk ); x[l - XK_H].r = k; /* Block number prepended in the header. */ x[l - XK_H].i = 0; irow = irow - FstBlockC(k); /* Relative row number in X-block */ RHS_ITERATE(j) { x[l + irow + j*knsupc] = recv_dbuf[jj++]; } ++ii; } } SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()"); #endif return 0; } /* pzReDistribute_B_to_X */
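/*
 * Minimal MPI_Alltoallv sketch (toy data): every rank sends one entry to every
 * rank using explicit counts and displacements, the same pattern
 * pzReDistribute_B_to_X() uses with its precomputed SendCnt/sdispls and
 * RecvCnt/rdispls arrays.  Run with e.g. mpirun -np 4.
 */
#if 0   /* Standalone demo; compile separately with an MPI compiler. */
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, procs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);

    int *sendbuf = malloc(procs * sizeof(int));
    int *recvbuf = malloc(procs * sizeof(int));
    int *cnt     = malloc(procs * sizeof(int));
    int *displ   = malloc(procs * sizeof(int));

    for (int p = 0; p < procs; ++p) {
        sendbuf[p] = 100 * rank + p;   /* one entry destined for rank p */
        cnt[p]     = 1;                /* like SendCnt[p] / RecvCnt[p]  */
        displ[p]   = p;                /* like sdispls[p] / rdispls[p]  */
    }

    MPI_Alltoallv(sendbuf, cnt, displ, MPI_INT,
                  recvbuf, cnt, displ, MPI_INT, MPI_COMM_WORLD);

    for (int p = 0; p < procs; ++p)
        printf("(%d) got %d from rank %d\n", rank, recvbuf[p], p);

    free(sendbuf); free(recvbuf); free(cnt); free(displ);
    MPI_Finalize();
    return 0;
}
#endif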
/*! \brief * * <pre> * Purpose * ======= * Perform parallel triangular solves * U(k,:) := A(k,:) \ L(k,k). * Only the process column that owns block column *k* participates * in the work. * * Arguments * ========= * * m (input) int (global) * Number of rows in the matrix. * * k (input) int (global) * The row number of the block row to be factorized. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization; * See SuperLUStat_t structure defined in util.h. * </pre> */ static void pdgstrs2 /************************************************************************/ #ifdef _CRAY ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 ) #else ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat ) #endif { int iam, pkk; int incx = 1; int nsupr; /* number of rows in the block L(:,k) (LDA) */ int segsize; int_t nsupc; /* number of columns in the block */ int_t luptr, iukp, rukp; int_t b, gb, j, klst, knsupc, lk, nb; int_t *xsup = Glu_persist->xsup; int_t *usub; double *lusup, *uval; /* Quick return. */ lk = LBi( k, grid ); /* Local block number */ if ( !Llu->Unzval_br_ptr[lk] ) return; /* Initialization. */ iam = grid->iam; pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); klst = FstBlockC( k+1 ); knsupc = SuperSize( k ); usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ uval = Llu->Unzval_br_ptr[lk]; nb = usub[0]; iukp = BR_HEADER; rukp = 0; if ( iam == pkk ) { lk = LBj( k, grid ); nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ lusup = Llu->Lnzval_bc_ptr[lk]; } else { nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ lusup = Llu->Lval_buf_2[k%2]; } /* Loop through all the row blocks. */ for (b = 0; b < nb; ++b) { gb = usub[iukp]; nsupc = SuperSize( gb ); iukp += UB_DESCRIPTOR; /* Loop through all the segments in the block. */ for (j = 0; j < nsupc; ++j) { segsize = klst - usub[iukp++]; if ( segsize ) { /* Nonzero segment. */ luptr = (knsupc - segsize) * (nsupr + 1); #ifdef _CRAY STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #else dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #endif stat->ops[FACT] += segsize * (segsize + 1); rukp += segsize; } } } /* for b ... */ } /* PDGSTRS2 */
/*! \brief * * <pre> * Purpose * ======= * Factor diagonal and subdiagonal blocks and test for exact singularity. * Only the process column that owns block column *k* participates * in the work. * * Arguments * ========= * * k (input) int (global) * The column number of the block column to be factorized. * * thresh (input) double (global) * The threshold value = s_eps * anorm. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization. * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * </pre> */ static void pdgstrf2 /************************************************************************/ ( superlu_options_t *options, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, int* info ) { int c, iam, l, pkk; int incx = 1, incy = 1; int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t i, krow, j, jfst, jlst; int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; double *lusup, temp; double *ujrow; double alpha = -1; *info = 0; /* Quick return. */ /* Initialization. */ iam = grid->iam; krow = PROW( k, grid ); pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); j = LBj( k, grid ); /* Local block number */ jfst = FstBlockC( k ); jlst = FstBlockC( k+1 ); lusup = Llu->Lnzval_bc_ptr[j]; nsupc = SuperSize( k ); if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; ujrow = Llu->ujrow; luptr = 0; /* Point to the diagonal entries. */ c = nsupc; for (j = 0; j < jlst - jfst; ++j) { /* Broadcast the j-th row (nsupc - j) elements to the process column. */ if ( iam == pkk ) { /* Diagonal process. */ i = luptr; if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ #if ( PRNTlevel>=2 ) printf("(%d) .. col %d, tiny pivot %e ", iam, jfst+j, lusup[i]); #endif /* Keep the replaced diagonal with the same sign. */ if ( lusup[i] < 0 ) lusup[i] = -thresh; else lusup[i] = thresh; #if ( PRNTlevel>=2 ) printf("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } } for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; } #if 0 dbcast_col(ujrow, c, pkk, UjROW, grid, &c); #else MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm); /*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS, grid, COMM_COLUMN, &c);*/ #endif #if ( DEBUGlevel>=2 ) if ( k == 3329 && j == 2 ) { if ( iam == pkk ) { printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); } else { printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); } } #endif if ( !lusup ) { /* Empty block column. */ --c; if ( ujrow[0] == 0.0 ) *info = j+jfst+1; continue; } /* Test for singularity. */ if ( ujrow[0] == 0.0 ) { *info = j+jfst+1; } else { /* Scale the j-th column of the matrix. 
*/ temp = 1.0 / ujrow[0]; if ( iam == pkk ) { for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr-j-1; } else { for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr; } } /* Rank-1 update of the trailing submatrix. */ if ( --c ) { if ( iam == pkk ) { l = nsupr - j - 1; #ifdef _CRAY SGER(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #else dger_(&l, &c, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #endif stat->ops[FACT] += 2 * l * c; } else { #ifdef _CRAY SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #else dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #endif stat->ops[FACT] += 2 * nsupr * c; } } /* Move to the next column. */ if ( iam == pkk ) luptr += nsupr + 1; else luptr += nsupr; } /* for j ... */ } /* PDGSTRF2 */
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; MPI_Request *send_req, recv_req; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. 
*/ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; #if 1 for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol != kcol && fmod[lk] ) mod_bit[lk] = 1; /* contribution from off-diagonal */ } } /*PrintInt10("mod_bit", nlb, mod_bit);*/ #if ( PROFlevel>=2 ) t_reduce_tmp = SuperLU_timer_(); #endif /* Every process receives the count, but it is only useful on the diagonal processes. */ MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); #if ( PROFlevel>=2 ) t_reduce += SuperLU_timer_() - t_reduce_tmp; #endif for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* diagonal process */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; } } } #else /* old */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } #endif } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. 
--------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #if 0 MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #if 0 MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/ for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); Llu->SolveMsgSent = 0; MPI_Barrier( grid->comm ); /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; #if 1 for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); /* root process in this row scope */ if ( mycol != kcol && bmod[lk] ) mod_bit[lk] = 1; /* Contribution from off-diagonal */ } } /* Every process receives the count, but it is only useful on the diagonal processes. 
*/ MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); /* root process in this row scope. */ if ( mycol == kcol ) { /* diagonal process */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } #else /* old */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } #endif } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
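/*
 * Illustrative sketch (not part of the library, kept disabled): the solve
 * code above repeatedly maps a global block number k to its owner on the
 * 2-D process grid and to a local block index through the PROW/PCOL/PNUM/
 * LBi/LBj macros.  The stand-alone program below mimics that mapping,
 * assuming the usual SuperLU_DIST definitions PROW(k) = k % nprow,
 * PCOL(k) = k % npcol, PNUM(i,j) = i*npcol + j, LBi(k) = k / nprow and
 * LBj(k) = k / npcol; see superlu_defs.h for the authoritative macros.
 * Compile it separately if you want to run it.
 */
#if 0
#include <stdio.h>

int main(void)
{
    int nprow = 2, npcol = 3, nsupers = 8;   /* example grid and block count */
    int k;
    for (k = 0; k < nsupers; ++k) {
        int prow = k % nprow;                /* process row owning block row k    */
        int pcol = k % npcol;                /* process column owning block col k */
        int pnum = prow * npcol + pcol;      /* rank of the diagonal owner        */
        int lbi  = k / nprow;                /* local block number, row-wise      */
        int lbj  = k / npcol;                /* local block number, column-wise   */
        printf("block %d -> diag process %d (row %d, col %d), LBi %d, LBj %d\n",
               k, pnum, prow, pcol, lbi, lbj);
    }
    return 0;
}
#endif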
int_t dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, gridinfo_t *grid, int_t *colptr[], int_t *rowind[], double *a[]) { /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * Purpose * ======= * Re-distribute A on the 2D process mesh. * * Arguments * ========= * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * grid (input) gridinfo_t* * The 2D process mesh. * * colptr (output) int* * * rowind (output) int* * * a (output) double* * * Return value * ============ * */ NRformat_loc *Astore; int_t *perm_r; /* row permutation vector */ int_t *perm_c; /* column permutation vector */ int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; int_t nnz_loc; /* number of local nonzeros */ int_t nnz_remote; /* number of remote nonzeros to be sent */ int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; int_t *ia, *ja, **ia_send, *index, *itemp; int_t *ptr_to_send; double *aij, **aij_send, *nzval, *dtemp; double *nzval_a; int iam, it, p, procs; MPI_Request *send_req; MPI_Status status; /* ------------------------------------------------------------ INITIALIZATION. ------------------------------------------------------------*/ iam = grid->iam; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter dReDistribute_A()"); #endif perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; Astore = (NRformat_loc *) A->Store; n = A->ncol; m_loc = Astore->m_loc; fst_row = Astore->fst_row; nnzToRecv = intCalloc_dist(2*procs); nnzToSend = nnzToRecv + procs; /* ------------------------------------------------------------ COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF A. ------------------------------------------------------------*/ for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); ++nnzToSend[p]; } } /* All-to-all communication */ MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, grid->comm); maxnnzToRecv = 0; nnz_loc = SendCnt = RecvCnt = 0; for (p = 0; p < procs; ++p) { if ( p != iam ) { SendCnt += nnzToSend[p]; RecvCnt += nnzToRecv[p]; maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); } else { nnz_loc += nnzToRecv[p]; /*assert(nnzToSend[p] == nnzToRecv[p]);*/ } } k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ /* Allocate space for storing the triplets after redistribution. */ if ( !(ia = intMalloc_dist(2*k)) ) ABORT("Malloc fails for ia[]."); ja = ia + k; if ( !(aij = doubleMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); /* Allocate temporary storage for sending/receiving the A triplets. 
*/ if ( procs > 1 ) { if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) ABORT("Malloc fails for ia_send[]."); if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) ABORT("Malloc fails for aij_send[]."); if ( !(index = intMalloc_dist(2*SendCnt)) ) ABORT("Malloc fails for index[]."); if ( !(nzval = doubleMalloc_dist(SendCnt)) ) ABORT("Malloc fails for nzval[]."); if ( !(ptr_to_send = intCalloc_dist(procs)) ) ABORT("Malloc fails for ptr_to_send[]."); if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) ABORT("Malloc fails for itemp[]."); if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) ABORT("Malloc fails for dtemp[]."); for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } } /* if procs > 1 */ if ( !(*colptr = intCalloc_dist(n+1)) ) ABORT("Malloc fails for *colptr[]."); /* ------------------------------------------------------------ LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. THIS ACCOUNTS FOR THE SECOND PASS OF A. ------------------------------------------------------------*/ nnz_loc = 0; /* Reset the local nonzero count. */ nzval_a = Astore->nzval; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); if ( p != iam ) { /* remote */ k = ptr_to_send[p]; ia_send[p][k] = irow; ia_send[p][k + nnzToSend[p]] = jcol; aij_send[p][k] = nzval_a[j]; ++ptr_to_send[p]; } else { /* local */ ia[nnz_loc] = irow; ja[nnz_loc] = jcol; aij[nnz_loc] = nzval_a[j]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } /* ------------------------------------------------------------ PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. NOTE: Can possibly use MPI_Alltoallv. ------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); it = nnzToSend[p]; MPI_Isend( aij_send[p], it, MPI_DOUBLE, p, iam+procs, grid->comm, &send_req[procs+p] ); } } for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, grid->comm, &status ); for (i = 0; i < nnzToRecv[p]; ++i) { ia[nnz_loc] = itemp[i]; jcol = itemp[i + nnzToRecv[p]]; /*assert(jcol<n);*/ ja[nnz_loc] = jcol; aij[nnz_loc] = dtemp[i]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } for (p = 0; p < procs; ++p) { if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } } /* ------------------------------------------------------------ DEALLOCATE TEMPORARY STORAGE ------------------------------------------------------------*/ SUPERLU_FREE(nnzToRecv); if ( procs > 1 ) { SUPERLU_FREE(send_req); SUPERLU_FREE(ia_send); SUPERLU_FREE(aij_send); SUPERLU_FREE(index); SUPERLU_FREE(nzval); SUPERLU_FREE(ptr_to_send); SUPERLU_FREE(itemp); SUPERLU_FREE(dtemp); } /* ------------------------------------------------------------ CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. 
   ------------------------------------------------------------*/
    if ( !(*rowind = intMalloc_dist(nnz_loc)) )
        ABORT("Malloc fails for *rowind[].");
    if ( !(*a = doubleMalloc_dist(nnz_loc)) )
        ABORT("Malloc fails for *a[].");

    /* Initialize the array of column pointers */
    k = 0; jsize = (*colptr)[0]; (*colptr)[0] = 0;
    for (j = 1; j < n; ++j) {
        k += jsize;
        jsize = (*colptr)[j];
        (*colptr)[j] = k;
    }

    /* Copy the triplets into the column-oriented storage */
    for (i = 0; i < nnz_loc; ++i) {
        j = ja[i];
        k = (*colptr)[j];
        (*rowind)[k] = ia[i];
        (*a)[k] = aij[i];
        ++(*colptr)[j];
    }

    /* Reset the column pointers to the beginning of each column */
    for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1];
    (*colptr)[0] = 0;

    SUPERLU_FREE(ia);
    SUPERLU_FREE(aij);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit dReDistribute_A()");
#endif

    return 0;  /* The routine is declared int_t; return 0 on normal exit. */
} /* dReDistribute_A */
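/*
 * Illustrative sketch (not part of the library, kept disabled): the tail of
 * dReDistribute_A turns the locally collected (ia, ja, aij) triplets into
 * compressed-column storage with the classic count / prefix-sum / scatter /
 * shift-back steps.  The self-contained version below repeats that logic on
 * a tiny 3-by-3 example; all names are local to this sketch.  Compile it
 * separately if you want to run it.
 */
#if 0
#include <stdio.h>

int main(void)
{
    /* 3-by-3 example with 4 nonzeros given as (row, col, value) triplets. */
    int    n = 3, nnz = 4;
    int    ia[]  = {0, 2, 1, 0};
    int    ja[]  = {0, 0, 1, 2};
    double aij[] = {4.0, -1.0, 3.0, 2.0};
    int    colptr[4] = {0}, rowind[4];
    double a[4];
    int    i, j, k;

    for (i = 0; i < nnz; ++i) ++colptr[ja[i]];        /* count nonzeros per column */
    for (j = 0, k = 0; j < n; ++j) {                  /* exclusive prefix sum      */
        int len = colptr[j];
        colptr[j] = k;
        k += len;
    }
    for (i = 0; i < nnz; ++i) {                       /* scatter triplets          */
        k = colptr[ja[i]]++;
        rowind[k] = ia[i];
        a[k] = aij[i];
    }
    for (j = n; j > 0; --j) colptr[j] = colptr[j-1];  /* shift pointers back       */
    colptr[0] = 0;

    for (j = 0; j < n; ++j)
        for (k = colptr[j]; k < colptr[j+1]; ++k)
            printf("A(%d,%d) = %g\n", rowind[k], j, a[k]);
    return 0;
}
#endif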
/*! \brief * * <pre> * Purpose * ======= * Set up the communication pattern for the triangular solution. * * Arguments * ========= * * n (input) int (global) * The dimension of the linear system. * * m_loc (input) int (local) * The local row dimension of the distributed input matrix. * * nrhs (input) int (global) * Number of right-hand sides. * * fst_row (input) int (global) * The row number of matrix B's first row in the global matrix. * * perm_r (input) int* (global) * The row permutation vector. * * perm_c (input) int* (global) * The column permutation vector. * * grid (input) gridinfo_t* * The 2D process mesh. * </pre> */ int_t pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, int_t perm_r[], int_t perm_c[], gridinfo_t *grid, Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct) { int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; int_t *row_to_proc; int_t i, gbi, k, l, num_diag_procs, *diag_procs; int_t irow, lk, q, knsupc, nsupers, *xsup, *supno; int iam, p, pkk, procs; pxgstrs_comm_t *gstrs_comm; procs = grid->nprow * grid->npcol; iam = grid->iam; gstrs_comm = SOLVEstruct->gstrs_comm; xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = Glu_persist->supno[n-1] + 1; row_to_proc = SOLVEstruct->row_to_proc; /* ------------------------------------------------------------ SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. ------------------------------------------------------------*/ if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) ABORT("Malloc fails for B_to_X_itemp[]."); SendCnt = itemp; SendCnt_nrhs = itemp + procs; RecvCnt = itemp + 2*procs; RecvCnt_nrhs = itemp + 3*procs; sdispls = itemp + 4*procs; sdispls_nrhs = itemp + 5*procs; rdispls = itemp + 6*procs; rdispls_nrhs = itemp + 7*procs; /* Count the number of elements to be sent to each diagonal process.*/ for (p = 0; p < procs; ++p) SendCnt[p] = 0; for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ gbi = BlockNum( irow ); p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ ++SendCnt[p]; } /* Set up the displacements for alltoall. */ MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; for (p = 1; p < procs; ++p) { sdispls[p] = sdispls[p-1] + SendCnt[p-1]; rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; } for (p = 0; p < procs; ++p) { SendCnt_nrhs[p] = SendCnt[p] * nrhs; sdispls_nrhs[p] = sdispls[p] * nrhs; RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; rdispls_nrhs[p] = rdispls[p] * nrhs; } /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ gstrs_comm->B_to_X_SendCnt = SendCnt; /* ------------------------------------------------------------ SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. ------------------------------------------------------------*/ /* This is freed in pxgstrs_finalize(). 
*/ if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) ABORT("Malloc fails for X_to_B_itemp[]."); SendCnt = itemp; SendCnt_nrhs = itemp + procs; RecvCnt = itemp + 2*procs; RecvCnt_nrhs = itemp + 3*procs; sdispls = itemp + 4*procs; sdispls_nrhs = itemp + 5*procs; rdispls = itemp + 6*procs; rdispls_nrhs = itemp + 7*procs; /* Count the number of X entries to be sent to each process.*/ for (p = 0; p < procs; ++p) SendCnt[p] = 0; num_diag_procs = SOLVEstruct->num_diag_procs; diag_procs = SOLVEstruct->diag_procs; for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ pkk = diag_procs[p]; if ( iam == pkk ) { for (k = p; k < nsupers; k += num_diag_procs) { knsupc = SuperSize( k ); lk = LBi( k, grid ); /* Local block number */ irow = FstBlockC( k ); for (i = 0; i < knsupc; ++i) { #if 0 q = row_to_proc[inv_perm_c[irow]]; #else q = row_to_proc[irow]; #endif ++SendCnt[q]; ++irow; } } } } MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; sdispls_nrhs[0] = rdispls_nrhs[0] = 0; SendCnt_nrhs[0] = SendCnt[0] * nrhs; RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; for (p = 1; p < procs; ++p) { sdispls[p] = sdispls[p-1] + SendCnt[p-1]; rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; sdispls_nrhs[p] = sdispls[p] * nrhs; rdispls_nrhs[p] = rdispls[p] * nrhs; SendCnt_nrhs[p] = SendCnt[p] * nrhs; RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; } /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ gstrs_comm->X_to_B_SendCnt = SendCnt; if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) ABORT("Malloc fails for ptr_to_ibuf[]."); gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; } /* PXGSTRS_INIT */
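/*
 * Illustrative sketch (not part of the library, kept disabled): pxgstrs_init
 * derives the all-to-all displacement arrays from per-process element counts
 * and then scales both counts and displacements by nrhs, so the same
 * communication pattern can move nrhs right-hand sides at once.  The fragment
 * below redoes that arithmetic on made-up counts; in the real routine the
 * receive counts come from an MPI_Alltoall of SendCnt over grid->comm.
 * Compile it separately if you want to run it.
 */
#if 0
#include <stdio.h>

int main(void)
{
    int procs = 4, nrhs = 2;
    int SendCnt[4] = {3, 0, 5, 2};          /* hypothetical per-process counts */
    int sdispls[4], SendCnt_nrhs[4], sdispls_nrhs[4];
    int p;

    sdispls[0] = 0;
    for (p = 1; p < procs; ++p)             /* exclusive prefix sum of counts  */
        sdispls[p] = sdispls[p-1] + SendCnt[p-1];
    for (p = 0; p < procs; ++p) {           /* scale counts/offsets by nrhs    */
        SendCnt_nrhs[p] = SendCnt[p] * nrhs;
        sdispls_nrhs[p] = sdispls[p] * nrhs;
    }
    for (p = 0; p < procs; ++p)
        printf("p=%d cnt=%d disp=%d cnt_nrhs=%d disp_nrhs=%d\n",
               p, SendCnt[p], sdispls[p], SendCnt_nrhs[p], sdispls_nrhs[p]);
    return 0;
}
#endif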
/*! \brief * * <pre> * Purpose * ======= * Panel factorization -- block column k * * Factor diagonal and subdiagonal blocks and test for exact singularity. * Only the column processes that own block column *k* participate * in the work. * * Arguments * ========= * * k (input) int (global) * The column number of the block column to be factorized. * * thresh (input) double (global) * The threshold value = s_eps * anorm. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * U_diag_blk_send_req (input/output) MPI_Request* * List of send requests to send down the diagonal block of U. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization. * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * </pre> */ static void pdgstrf2 ( superlu_options_t *options, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, SuperLUStat_t *stat, int* info ) { int cols_left, iam, l, pkk, pr; int incx = 1, incy = 1; int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t i, krow, j, jfst, jlst, u_diag_cnt; int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; double *lusup, temp; double *ujrow, *ublk_ptr; /* pointer to the U block */ double alpha = -1, zero = 0.0; int_t Pr; MPI_Status status; MPI_Comm comm = (grid->cscp).comm; /* Quick return. */ *info = 0; /* Initialization. */ iam = grid->iam; Pr = grid->nprow; krow = PROW( k, grid ); pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); j = LBj( k, grid ); /* Local block number */ jfst = FstBlockC( k ); jlst = FstBlockC( k+1 ); lusup = Llu->Lnzval_bc_ptr[j]; nsupc = SuperSize( k ); if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; ublk_ptr = ujrow = Llu->ujrow; luptr = 0; /* Point to the diagonal entries. */ cols_left = nsupc; /* supernode size */ u_diag_cnt = 0; if ( iam == pkk ) { /* diagonal process */ if ( U_diag_blk_send_req && U_diag_blk_send_req[krow] ) { /* There are pending sends - wait for all Isend to complete */ for (pr = 0; pr < Pr; ++pr) if (pr != krow) MPI_Wait(U_diag_blk_send_req + pr, &status); } for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ /* Diagonal pivot */ i = luptr; if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ #if ( PRNTlevel>=2 ) printf("(%d) .. col %d, tiny pivot %e ", iam, jfst+j, lusup[i]); #endif /* Keep the new diagonal entry with the same sign. */ if ( lusup[i] < 0 ) lusup[i] = -thresh; else lusup[i] = thresh; #if ( PRNTlevel>=2 ) printf("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } } for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */ if ( ujrow[0] == zero ) { /* Test for singularity. */ *info = j+jfst+1; } else { /* Scale the j-th column. */ temp = 1.0 / ujrow[0]; for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr-j-1; } /* Rank-1 update of the trailing submatrix. 
*/ if ( --cols_left ) { l = nsupr - j - 1; #ifdef _CRAY SGER(&l, &cols_left, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #else dger_(&l, &cols_left, &alpha, &lusup[luptr+1], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); #endif stat->ops[FACT] += 2 * l * cols_left; } ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */ luptr += nsupr + 1; /* move to next column */ } /* for column j ... */ if ( U_diag_blk_send_req && iam == pkk ) { /* Send the U block */ /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ for (pr = 0; pr < Pr; ++pr) if (pr != krow) MPI_Isend(ublk_ptr, u_diag_cnt, MPI_DOUBLE, pr, ((k<<2)+2)%NTAGS, comm, U_diag_blk_send_req + pr); U_diag_blk_send_req[krow] = 1; /* flag outstanding Isend */ } } else { /* non-diagonal process */ /* Receive the diagonal block of U */ MPI_Recv(ublk_ptr, (nsupc*(nsupc+1))>>1, MPI_DOUBLE, krow, ((k<<2)+2)%NTAGS, comm, &status); for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ u_diag_cnt += cols_left; if ( !lusup ) { /* empty block column */ --cols_left; if ( ujrow[0] == zero ) *info = j+jfst+1; continue; } /* Test for singularity. */ if ( ujrow[0] == zero ) { *info = j+jfst+1; } else { /* Scale the j-th column. */ temp = 1.0 / ujrow[0]; for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupr; } /* Rank-1 update of the trailing submatrix. */ if ( --cols_left ) { #ifdef _CRAY SGER(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #else dger_(&nsupr, &cols_left, &alpha, &lusup[luptr], &incx, &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); #endif stat->ops[FACT] += 2 * nsupr * cols_left; } ujrow = ublk_ptr + u_diag_cnt; /* move to next row of U */ luptr += nsupr; /* move to next column */ } /* for column j ... */ } /* end if pkk ... */ } /* PDGSTRF2 */
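/*
 * Illustrative sketch (not part of the library, kept disabled): the per-column
 * work inside pdgstrf2 is an unblocked right-looking LU step -- scale the
 * subdiagonal of column j by the reciprocal of the pivot, then apply a rank-1
 * update to the trailing submatrix (the dger_ call).  The serial routine below
 * performs the same steps on a dense column-major panel with leading dimension
 * lda and no pivoting; it is a teaching aid, not the parallel algorithm, and
 * its names are local to this sketch.  Compile it separately to run it.
 */
#if 0
#include <stdio.h>

/* Factor an m-by-n panel (m >= n) in place: A = L*U, L unit lower triangular. */
static void panel_lu(int m, int n, double *A, int lda)
{
    int i, j, k;
    for (j = 0; j < n; ++j) {
        double pivot = A[j + j*lda];
        for (i = j+1; i < m; ++i)              /* scale the j-th column   */
            A[i + j*lda] /= pivot;
        for (k = j+1; k < n; ++k)              /* rank-1 trailing update  */
            for (i = j+1; i < m; ++i)
                A[i + k*lda] -= A[i + j*lda] * A[j + k*lda];
    }
}

int main(void)
{
    int m = 3, n = 3, lda = 3, i, j;
    double A[9] = {4, 2, 8,   2, 5, 7,   6, 1, 9};  /* column-major 3x3 */
    panel_lu(m, n, A, lda);
    for (i = 0; i < m; ++i) {
        for (j = 0; j < n; ++j) printf(" %8.4f", A[i + j*lda]);
        printf("\n");
    }
    return 0;
}
#endif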
void pdgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *x, int nrhs, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PDGSTRS1 solves a system of distributed linear equations * * op( sub(A) ) * X = sub( B ) * * with a general N-by-N distributed matrix sub( A ) using the LU * factorization computed by PDGSTRF. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures to store L and U factors, * and the permutation vectors. * See superlu_ddefs.h for the definition of 'LUstruct_t' structure. * * grid (input) gridinfo_t* * The 2D process mesh. * * x (input/output) double* * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * NOTE: the right-hand side matrix is already distributed on * the diagonal processes. * * nrhs (input) int (global) * Number of right-hand sides. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves; * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; double alpha = 1.0; double *lsum; /* Local running sum of the updates to B-components */ double *lusup, *dest; double *recvbuf, *tempv; double *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; double **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for L-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -8; if ( *info ) { pxerbla("PDGSTRS1", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. 
*/ Llu->SolveMsgSent = 0; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrs1()"); #endif /* Save the count to be altered so it can be used by subsequent call to PDGSTRS1. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Compute ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)) ) ABORT("Calloc fails for lsum[]."); maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* * Prepended the block number in the header for lsum[]. */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H] = k; } } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( !frecv[lk] && !fmod[lk] ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. 
*/ for (p = 0; p < Pr; ++p) if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* * Compute the internal nodes asynchronously by all processes. */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = *recvbuf; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) x[i + ii + j*knsupc] += tempv[i + j*knsupc]; if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). 
     */
		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
			   fmod, nb, lptr, luptr, xsup, grid, Llu,
			   send_req, stat);
	    } /* if */

	    break;

#if ( DEBUGlevel>=2 )
	  default:
	    printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
	    break;
#endif
	} /* switch */

    } /* while not finished ... */

#if ( PRNTlevel>=2 )
    t = SuperLU_timer_() - t;
    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
    t = SuperLU_timer_();
#endif

#if ( DEBUGlevel>=2 )
    if ( !iam ) printf("\n.. After L-solve: y =\n");
    for (i = 0, k = 0; k < nsupers; ++k) {
	krow = PROW( k, grid );
	kcol = PCOL( k, grid );
	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
	    knsupc = SuperSize( k );
	    lk = LBi( k, grid );
	    ii = X_BLK( lk );
	    for (j = 0; j < knsupc; ++j)
		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
	}
	MPI_Barrier( grid->comm );
    }
#endif

    SUPERLU_FREE(fmod);
    SUPERLU_FREE(frecv);
    SUPERLU_FREE(rtemp);

#ifdef ISEND_IRECV
    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
    Llu->SolveMsgSent = 0;
#endif

    /*---------------------------------------------------
     * Back solve Ux = y.
     *
     * The Y components from the forward solve are already
     * on the diagonal processes.
     *---------------------------------------------------*/

    /* Save the count to be altered so it can be used by a
       subsequent call to PDGSTRS1. */
    if ( !(bmod = intMalloc_dist(nlb)) )
	ABORT("Malloc fails for bmod[].");
    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
    if ( !(brecv = intMalloc_dist(nlb)) )
	ABORT("Malloc fails for brecv[].");
    Llu->brecv = brecv;

    /*
     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
     */
    {
	superlu_scope_t *scp = &grid->rscp;

	for (k = 0; k < nsupers; ++k) {
	    krow = PROW( k, grid );
	    if ( myrow == krow ) {
		lk = LBi( k, grid );    /* Local block number. */
		kcol = PCOL( k, grid ); /* Root process in this row scope. */
		if ( mycol != kcol && bmod[lk] )
		    i = 1;  /* Contribution from non-diagonal process. */
		else i = 0;
		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
			    MPI_SUM, kcol, scp->comm );
		if ( mycol == kcol ) { /* Diagonal process. */
		    nbrecvmod += brecv[lk];
		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
#if ( DEBUGlevel>=2 )
		    printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]);
		    assert( brecv[lk] < Pc );
#endif
		}
	    }
	}
    }

    /* Re-initialize lsum to zero. Each block header is already in place. */
    for (k = 0; k < nsupers; ++k) {
	krow = PROW( k, grid );
	if ( myrow == krow ) {
	    knsupc = SuperSize( k );
	    lk = LBi( k, grid );
	    il = LSUM_BLK( lk );
	    dest = &lsum[il];
	    RHS_ITERATE(j)
		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
	}
    }
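/*
 * Illustrative sketch (not part of the library, kept disabled): the
 * diagonal-block solves in the L-solve above call dtrsm_("L","L","N","U",...),
 * which by the standard BLAS convention solves L(k,k) * X(k) = B with a unit
 * lower triangular block, while dlsum_fmod accumulates the off-diagonal
 * products into lsum.  The serial routine below performs the same unit-lower
 * forward substitution on one dense column-major block for a single
 * right-hand side; names are local to this sketch.  Compile it separately
 * if you want to run it.
 */
#if 0
#include <stdio.h>

/* Solve L*x = b in place; L is n-by-n, unit lower triangular, column-major. */
static void unit_lower_solve(int n, const double *L, int ldl, double *x)
{
    int i, j;
    for (j = 0; j < n; ++j)              /* x[j] is final once reached     */
        for (i = j+1; i < n; ++i)        /* eliminate x[j] from later rows */
            x[i] -= L[i + j*ldl] * x[j];
}

int main(void)
{
    /* Unit lower triangular 3x3 block; only the strict lower part is read. */
    double L[9] = {1, 0.5, 2,   0, 1, 0.75,   0, 0, 1};
    double b[3] = {4.0, 7.0, 17.0};
    int i;
    unit_lower_solve(3, L, 3, b);        /* expected solution: 4, 5, 5.25   */
    for (i = 0; i < 3; ++i) printf("x[%d] = %g\n", i, b[i]);
    return 0;
}
#endif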