/*! \brief * * <pre> * Purpose * ======= * Perform local block modifications: lsum[i] -= L_i,k * X[k]. * </pre> */ void dlsum_fmod /************************************************************************/ ( double *lsum, /* Sum of local modifications. */ double *x, /* X array (local) */ double *xk, /* X[k]. */ double *rtemp, /* Result of full matrix-vector multiply. */ int nrhs, /* Number of right-hand sides. */ int knsupc, /* Size of supernode k. */ int_t k, /* The k-th component of X. */ int_t *fmod, /* Modification count for L-solve. */ int_t nlb, /* Number of L blocks. */ int_t lptr, /* Starting position in lsub[*]. */ int_t luptr, /* Starting position in lusup[*]. */ int_t *xsup, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request send_req[], SuperLUStat_t *stat ) { double alpha = 1.0, beta = 0.0; double *lusup, *lusup1; double *dest; int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; int_t i, ii, ik, il, ikcol, irow, j, lb, lk, rel; int_t *lsub, *lsub1, nlb1, lptr1, luptr1; int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ int_t *frecv = Llu->frecv; int_t **fsendx_plist = Llu->fsendx_plist; MPI_Status status; int test_flag; iam = grid->iam; myrow = MYROW( iam, grid ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Llu->Lrowind_bc_ptr[lk]; lusup = Llu->Lnzval_bc_ptr[lk]; nsupr = lsub[1]; for (lb = 0; lb < nlb; ++lb) { ik = lsub[lptr]; /* Global block number, row-wise. */ nbrow = lsub[lptr+1]; #ifdef _CRAY SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, &alpha, &lusup[luptr], &nsupr, xk, &knsupc, &beta, rtemp, &nbrow ); #else dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, &alpha, &lusup[luptr], &nsupr, xk, &knsupc, &beta, rtemp, &nbrow ); #endif stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; lk = LBi( ik, grid ); /* Local block number, row-wise. */ iknsupc = SuperSize( ik ); il = LSUM_BLK( lk ); dest = &lsum[il]; lptr += LB_DESCRIPTOR; rel = xsup[ik]; /* Global row index of block ik. 
*/ for (i = 0; i < nbrow; ++i) { irow = lsub[lptr++] - rel; /* Relative row. */ RHS_ITERATE(j) dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; } luptr += nbrow; if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ ikcol = PCOL( ik, grid ); p = PNUM( myrow, ikcol, grid ); if ( iam != p ) { #ifdef ISEND_IRECV MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm ); #else MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, MPI_DOUBLE, p, LSUM, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); #endif } else { /* Diagonal process: X[i] += lsum[i]. */ ii = X_BLK( lk ); RHS_ITERATE(j) for (i = 0; i < iknsupc; ++i) x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( ik, grid );/* Local block number, column-wise. */ lsub1 = Llu->Lrowind_bc_ptr[lk]; lusup1 = Llu->Lnzval_bc_ptr[lk]; nsupr1 = lsub1[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, lusup1, &nsupr1, &x[ii], &iknsupc); #else dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, lusup1, &nsupr1, &x[ii], &iknsupc); #endif stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); #endif /* * Send Xk to process column Pc[k]. 
*/ for (p = 0; p < grid->nprow; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, ikcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #else MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nlb1 = lsub1[0] - 1; lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, lptr1, luptr1, xsup, grid, Llu, send_req, stat); } /* if frecv[lk] == 0 */ } /* if iam == p */ } /* if fmod[lk] == 0 */ } /* for lb ... */
/*
 * -- SuperLU MT routine (version 1.0) --
 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
 * and Lawrence Berkeley National Lab.
 * August 15, 1997
 *
 * Purpose
 * =======
 *
 * dgstrs() solves a system of linear equations A*X=B or A'*X=B
 * with A sparse and B dense, using the LU factorization computed by
 * pdgstrf().
 *
 * Arguments
 * =========
 *
 * trans   (input) Specifies the form of the system of equations:
 *          = NOTRANS: A * X = B  (No transpose)
 *          = TRANS:   A'* X = B  (Transpose)
 *
 * L       (input) SuperMatrix*
 *         The factor L from the factorization Pr*A*Pc=L*U as computed by
 *         pdgstrf(). Use compressed row subscripts storage for supernodes,
 *         i.e., L has types: Stype = SCP, Dtype = _D, Mtype = TRLU.
 *
 * U       (input) SuperMatrix*
 *         The factor U from the factorization Pr*A*Pc=L*U as computed by
 *         pdgstrf(). Use column-wise storage scheme, i.e., U has types:
 *         Stype = NCP, Dtype = _D, Mtype = TRU.
 *
 * perm_r  (input) int*
 *         Row permutation vector of size L->nrow, which defines the
 *         permutation matrix Pr; perm_r[i] = j means row i of A is in
 *         position j in Pr*A.
 *
 * perm_c  (input) int*, dimension A->ncol
 *         Column permutation vector, which defines the
 *         permutation matrix Pc; perm_c[i] = j means column i of A is
 *         in position j in A*Pc.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = DN, Dtype = _D, Mtype = GE.
 *         On entry, the right hand side matrix.
 *         On exit, the solution matrix if info = 0;
 *
 * Gstat   (output) Gstat_t*
 *         Record all the statistics about the triangular solves;
 *         See Gstat_t structure defined in util.h.
 *         ops[TRISOLVE] is overwritten with the operation count gathered
 *         here (it stays 0 for the transposed solve, which delegates to
 *         sp_dtrsv()).
 *
 * info    (output) Diagnostics
 *          = 0: successful exit
 *          < 0: if info = -i, the i-th argument had an illegal value
 *               (-1: trans, -2: L, -3: U, -6: B).
 *
 */
void
dgstrs(trans_t trans, SuperMatrix *L, SuperMatrix *U,
       int *perm_r, int *perm_c, SuperMatrix *B, Gstat_t *Gstat, int *info)
{
#if ( MACH==CRAY_PVP )
    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
#endif
#ifdef USE_VENDOR_BLAS
    double   alpha = 1.0, beta = 1.0;
#endif
    int      j, k, jcol, iptr, luptr, ksupno, istart, irow, bptr;
    int      fsupc, nsuper;
    int      i, n, nsupc, nsupr, nrow, nrhs, ldb;
    DNformat *Bstore;
    SCPformat *Lstore;
    NCPformat *Ustore;
    double   *Lval, *Uval, *Bmat;
    double   *work, *work_col, *rhs_work, *soln;
    flops_t  solve_ops;
    void dprint_soln();

    /* Test input parameters ... */
    *info = 0;
    Bstore = B->Store;
    ldb = Bstore->lda;
    nrhs = B->ncol;
    /* The reported number is the position of the offending argument:
       L is argument 2 and U is argument 3. */
    if ( trans != NOTRANS && trans != TRANS ) *info = -1;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -2;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -3;
    else if ( ldb < MAX(0, L->nrow) ) *info = -6;
    if ( *info ) {
        i = -(*info);
        xerbla_("dgstrs", &i);
        return;
    }

    n = L->nrow;
    work = doubleCalloc(n * nrhs);
    if ( !work ) ABORT("Malloc fails for local work[].");
    soln = doubleMalloc(n);
    if ( !soln ) ABORT("Malloc fails for local soln[].");

    Bmat = Bstore->nzval;
    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    nsuper = Lstore->nsuper;
    solve_ops = 0;

    if ( trans == NOTRANS ) {
        /* Permute right hand sides to form Pr*B */
        for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) {
            rhs_work = &Bmat[bptr];
            for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
            for (k = 0; k < n; k++) rhs_work[k] = soln[k];
        }

        /* Forward solve PLy=Pb. */
        for (ksupno = 0; ksupno <= nsuper; ++ksupno) {
            fsupc = L_FST_SUPC(ksupno);
            istart = L_SUB_START(fsupc);
            nsupr = L_SUB_END(fsupc) - istart;
            nsupc = L_LAST_SUPC(ksupno) - fsupc;
            nrow = nsupr - nsupc;

            solve_ops += nsupc * (nsupc - 1) * nrhs;
            solve_ops += 2 * nrow * nsupc * nrhs;

            if ( nsupc == 1 ) {
                /* Single-column supernode: plain column update. */
                for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) {
                    rhs_work = &Bmat[bptr];
                    luptr = L_NZ_START(fsupc);
                    for (iptr=istart+1; iptr < L_SUB_END(fsupc); iptr++){
                        irow = L_SUB(iptr);
                        ++luptr;
                        rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr];
                    }
                }
            } else {
                luptr = L_NZ_START(fsupc);
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
                ftcs1 = _cptofcd("L", strlen("L"));
                ftcs2 = _cptofcd("N", strlen("N"));
                ftcs3 = _cptofcd("U", strlen("U"));
                STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
                      &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

                SGEMM(ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha,
                      &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
                      &beta, &work[0], &n );
#else
                dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

                dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha,
                        &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
                        &beta, &work[0], &n );
#endif
                /* beta is 1, so work[] must be zero on entry to the GEMM;
                   the scatter below re-zeroes every entry it consumes. */
                for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) {
                    rhs_work = &Bmat[bptr];
                    work_col = &work[j*n];
                    iptr = istart + nsupc;
                    for (i = 0; i < nrow; i++) {
                        irow = L_SUB(iptr);
                        rhs_work[irow] -= work_col[i]; /* Scatter */
                        work_col[i] = 0.0;
                        iptr++;
                    }
                }
#else
                for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) {
                    rhs_work = &Bmat[bptr];
                    dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
                    dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
                             &rhs_work[fsupc], &work[0] );

                    iptr = istart + nsupc;
                    for (i = 0; i < nrow; i++) {
                        irow = L_SUB(iptr);
                        rhs_work[irow] -= work[i];
                        work[i] = 0.0;
                        iptr++;
                    }
                }
#endif
            } /* if-else: nsupc == 1 ... */
        } /* for L-solve */

#if ( DEBUGlevel>=2 )
        printf("After L-solve: y=\n");
        dprint_soln(n, nrhs, Bmat);
#endif

        /*
         * Back solve Ux=y.
         */
        for (ksupno = nsuper; ksupno >= 0; --ksupno) {
            fsupc = L_FST_SUPC(ksupno);
            istart = L_SUB_START(fsupc);
            nsupr = L_SUB_END(fsupc) - istart;
            nsupc = L_LAST_SUPC(ksupno) - fsupc;
            luptr = L_NZ_START(fsupc);

            solve_ops += nsupc * (nsupc + 1) * nrhs;

            /* dense triangular matrix */
            if ( nsupc == 1 ) {
                rhs_work = &Bmat[0];
                for (j = 0; j < nrhs; j++) {
                    rhs_work[fsupc] /= Lval[luptr];
                    rhs_work += ldb;
                }
            } else {
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
                ftcs1 = _cptofcd("L", strlen("L"));
                ftcs2 = _cptofcd("U", strlen("U"));
                ftcs3 = _cptofcd("N", strlen("N"));
                STRSM(ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
                      &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#else
                dtrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#endif
#else
                for (j = 0, bptr = fsupc; j < nrhs; j++, bptr += ldb) {
                    dusolve (nsupr, nsupc, &Lval[luptr], &Bmat[bptr]);
                }
#endif
            }

            /* matrix-vector update */
            for (j = 0, bptr = 0; j < nrhs; ++j, bptr += ldb) {
                rhs_work = &Bmat[bptr];
                for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
                    solve_ops += 2*(U_NZ_END(jcol) - U_NZ_START(jcol));
                    for (i = U_NZ_START(jcol); i < U_NZ_END(jcol); i++ ){
                        irow = U_SUB(i);
                        rhs_work[irow] -= rhs_work[jcol] * Uval[i];
                    }
                }
            }
        } /* for U-solve */

#if ( DEBUGlevel>=2 )
        printf("After U-solve: x=\n");
        dprint_soln(n, nrhs, Bmat);
#endif

        /* Compute the final solution X <= Pc*X. */
        for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) {
            rhs_work = &Bmat[bptr];
            for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]];
            for (k = 0; k < n; k++) rhs_work[k] = soln[k];
        }

    } else { /* Solve A'*X=B */
        /* Permute right hand sides to form Pc'*B. */
        for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) {
            rhs_work = &Bmat[bptr];
            for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k];
            for (k = 0; k < n; k++) rhs_work[k] = soln[k];
        }

        for (k = 0; k < nrhs; ++k) {
            /* Multiply by inv(U'). */
            sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], info);

            /* Multiply by inv(L'). */
            sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], info);
        }

        /* Compute the final solution X <= Pr'*X (=inv(Pr)*X) */
        for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) {
            rhs_work = &Bmat[bptr];
            for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]];
            for (k = 0; k < n; k++) rhs_work[k] = soln[k];
        }
    } /* if-else trans */

    Gstat->ops[TRISOLVE] = solve_ops;
    SUPERLU_FREE(work);
    SUPERLU_FREE(soln);
}
/*
 * Purpose
 * =======
 *
 * SGSTRS solves a system of linear equations A*X=B or A'*X=B
 * with A sparse and B dense, using the LU factorization computed by
 * SGSTRF.
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * trans   (input) trans_t
 *          Specifies the form of the system of equations:
 *          = NOTRANS: A * X = B     (No transpose)
 *          = TRANS:   A'* X = B     (Transpose)
 *          = CONJ:    A**H * X = B  (Conjugate transpose)
 *
 * L       (input) SuperMatrix*
 *         The factor L from the factorization Pr*A*Pc=L*U as computed by
 *         sgstrf(). Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
 *
 * U       (input) SuperMatrix*
 *         The factor U from the factorization Pr*A*Pc=L*U as computed by
 *         sgstrf(). Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
 *
 * perm_c  (input) int*, dimension (L->ncol)
 *         Column permutation vector defining Pc; perm_c[i] = j means
 *         column i of A is in position j in A*Pc.
 *
 * perm_r  (input) int*, dimension (L->nrow)
 *         Row permutation vector defining Pr; perm_r[i] = j means
 *         row i of A is in position j in Pr*A.
 *
 * B       (input/output) SuperMatrix*
 *         Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
 *         On entry, the right hand side matrix.
 *         On exit, the solution matrix if info = 0;
 *
 * stat    (output) SuperLUStat_t*
 *         Record the statistics on runtime and floating-point operation
 *         count. See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 *          = 0: successful exit
 *          < 0: if info = -i, the i-th argument had an illegal value
 *
 */
void
sgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U,
        int *perm_c, int *perm_r, SuperMatrix *B,
        SuperLUStat_t *stat, int *info)
{
#ifdef _CRAY
    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
#endif
#ifdef USE_VENDOR_BLAS
    float    alpha = 1.0, beta = 1.0;
    float    *scratch_col;
#endif
    DNformat *Bstore = B->Store;
    /* NOTE(review): the L_* / U_* accessor macros presumably expand to
       expressions on `Lstore` / `Ustore`; keep these two names. */
    SCformat *Lstore;
    NCformat *Ustore;
    float    *Bmat, *Lval, *Uval;
    float    *work, *soln, *col;
    int      ldb = Bstore->lda;
    int      nrhs = B->ncol;
    int      n, sn, r, t, q, jcol, irow;
    int      fsupc, istart, nsupr, nsupc, nrow, luptr;
    int      bad_arg = 0;
    flops_t  solve_ops;
    void     sprint_soln();

    /* Argument checks; bad_arg holds the 1-based position of the first
       argument found to be invalid (0 if none). */
    if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) {
        bad_arg = 1;
    } else if ( L->nrow != L->ncol || L->nrow < 0 ||
                L->Stype != SLU_SC || L->Dtype != SLU_S ||
                L->Mtype != SLU_TRLU ) {
        bad_arg = 2;
    } else if ( U->nrow != U->ncol || U->nrow < 0 ||
                U->Stype != SLU_NC || U->Dtype != SLU_S ||
                U->Mtype != SLU_TRU ) {
        bad_arg = 3;
    } else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
                B->Stype != SLU_DN || B->Dtype != SLU_S ||
                B->Mtype != SLU_GE ) {
        bad_arg = 6;
    }
    *info = -bad_arg;
    if ( bad_arg ) {
        xerbla_("sgstrs", &bad_arg);
        return;
    }

    n = L->nrow;
    work = floatCalloc(n * nrhs);
    if ( !work ) ABORT("Malloc fails for local work[].");
    soln = floatMalloc(n);
    if ( !soln ) ABORT("Malloc fails for local soln[].");

    Bmat = Bstore->nzval;
    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( trans == NOTRANS ) {
        /* B := Pr*B, one right-hand side at a time via soln[]. */
        for (r = 0; r < nrhs; r++) {
            col = &Bmat[r*ldb];
            for (t = 0; t < n; t++) soln[perm_r[t]] = col[t];
            for (t = 0; t < n; t++) col[t] = soln[t];
        }

        /* Forward substitution, supernode by supernode. */
        for (sn = 0; sn <= Lstore->nsuper; sn++) {
            fsupc = L_FST_SUPC(sn);
            istart = L_SUB_START(fsupc);
            nsupr = L_SUB_START(fsupc+1) - istart;
            nsupc = L_FST_SUPC(sn+1) - fsupc;
            nrow = nsupr - nsupc;

            solve_ops += nsupc * (nsupc - 1) * nrhs;
            solve_ops += 2 * nrow * nsupc * nrhs;

            luptr = L_NZ_START(fsupc);

            if ( nsupc == 1 ) {
                /* One column: entry t of the subscript list pairs with
                   entry t of the value list; t == 0 is the diagonal. */
                for (r = 0; r < nrhs; r++) {
                    col = &Bmat[r*ldb];
                    for (t = 1; istart + t < L_SUB_START(fsupc+1); t++) {
                        irow = L_SUB(istart + t);
                        col[irow] -= col[fsupc] * Lval[luptr + t];
                    }
                }
            } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                ftcs1 = _cptofcd("L", strlen("L"));
                ftcs2 = _cptofcd("N", strlen("N"));
                ftcs3 = _cptofcd("U", strlen("U"));
                STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

                SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha,
                       &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
                       &beta, &work[0], &n );
#else
                strsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

                sgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha,
                        &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
                        &beta, &work[0], &n );
#endif
                /* Scatter the product into B and clear the scratch. */
                for (r = 0; r < nrhs; r++) {
                    col = &Bmat[r*ldb];
                    scratch_col = &work[r*n];
                    for (t = 0; t < nrow; t++) {
                        irow = L_SUB(istart + nsupc + t);
                        col[irow] -= scratch_col[t];
                        scratch_col[t] = 0.0;
                    }
                }
#else
                for (r = 0; r < nrhs; r++) {
                    col = &Bmat[r*ldb];
                    slsolve (nsupr, nsupc, &Lval[luptr], &col[fsupc]);
                    smatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
                             &col[fsupc], &work[0] );

                    for (t = 0; t < nrow; t++) {
                        irow = L_SUB(istart + nsupc + t);
                        col[irow] -= work[t];
                        work[t] = 0.0;
                    }
                }
#endif
            } /* else ... */
        } /* for L-solve */

#ifdef DEBUG
        printf("After L-solve: y=\n");
        sprint_soln(n, nrhs, Bmat);
#endif

        /* Back substitution, last supernode first. */
        for (sn = Lstore->nsuper; sn >= 0; sn--) {
            fsupc = L_FST_SUPC(sn);
            istart = L_SUB_START(fsupc);
            nsupr = L_SUB_START(fsupc+1) - istart;
            nsupc = L_FST_SUPC(sn+1) - fsupc;
            luptr = L_NZ_START(fsupc);

            solve_ops += nsupc * (nsupc + 1) * nrhs;

            if ( nsupc == 1 ) {
                for (r = 0; r < nrhs; r++)
                    Bmat[fsupc + r*ldb] /= Lval[luptr];
            } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                ftcs1 = _cptofcd("L", strlen("L"));
                ftcs2 = _cptofcd("U", strlen("U"));
                ftcs3 = _cptofcd("N", strlen("N"));
                STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#else
                strsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#endif
#else
                for (r = 0; r < nrhs; r++)
                    susolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+r*ldb] );
#endif
            }

            /* Apply the entries of U stored for these columns. */
            for (r = 0; r < nrhs; ++r) {
                col = &Bmat[r*ldb];
                for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
                    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                    for (q = U_NZ_START(jcol); q < U_NZ_START(jcol+1); q++ ){
                        irow = U_SUB(q);
                        col[irow] -= col[jcol] * Uval[q];
                    }
                }
            }
        } /* for U-solve */

#ifdef DEBUG
        printf("After U-solve: x=\n");
        sprint_soln(n, nrhs, Bmat);
#endif

        /* X := Pc*X. */
        for (r = 0; r < nrhs; r++) {
            col = &Bmat[r*ldb];
            for (t = 0; t < n; t++) soln[t] = col[perm_c[t]];
            for (t = 0; t < n; t++) col[t] = soln[t];
        }

        stat->ops[SOLVE] = solve_ops;

    } else { /* Solve A'*X=B or CONJ(A)*X=B */
        /* B := Pc'*B. */
        for (r = 0; r < nrhs; r++) {
            col = &Bmat[r*ldb];
            for (t = 0; t < n; t++) soln[perm_c[t]] = col[t];
            for (t = 0; t < n; t++) col[t] = soln[t];
        }

        stat->ops[SOLVE] = 0;
        for (r = 0; r < nrhs; ++r) {
            col = &Bmat[r*ldb];

            /* Multiply by inv(U'). */
            sp_strsv("U", "T", "N", L, U, col, stat, info);

            /* Multiply by inv(L'). */
            sp_strsv("L", "T", "U", L, U, col, stat, info);
        }

        /* X := Pr'*X (=inv(Pr)*X). */
        for (r = 0; r < nrhs; r++) {
            col = &Bmat[r*ldb];
            for (t = 0; t < n; t++) soln[t] = col[perm_r[t]];
            for (t = 0; t < n; t++) col[t] = soln[t];
        }
    }

    SUPERLU_FREE(work);
    SUPERLU_FREE(soln);
}
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *
 * dgstrsU only performs the U-solve using the LU factorization computed
 * by DGSTRF.
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * trans   (input) trans_t
 *          Specifies the form of the system of equations:
 *          = NOTRANS: A * X = B  (No transpose)
 *          = TRANS:   A'* X = B  (Transpose)
 *          = CONJ:    A**H * X = B  (Conjugate transpose)
 *
 * L       (input) SuperMatrix*
 *         The factor L from the factorization Pr*A*Pc=L*U as computed by
 *         dgstrf(). Use compressed row subscripts storage for supernodes,
 *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
 *
 * U       (input) SuperMatrix*
 *         The factor U from the factorization Pr*A*Pc=L*U as computed by
 *         dgstrf(). Use column-wise storage scheme, i.e., U has types:
 *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
 *
 * perm_c  (input) int*, dimension (L->ncol)
 *	   Column permutation vector, which defines the
 *         permutation matrix Pc; perm_c[i] = j means column i of A is
 *         in position j in A*Pc.
 *
 * perm_r  (input) int*, dimension (L->nrow)
 *         Row permutation vector, which defines the permutation matrix Pr;
 *         perm_r[i] = j means row i of A is in position j in Pr*A.
 *         Not referenced by this routine.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
 *         On entry, the right hand side matrix.
 *         On exit, the solution matrix if info = 0;
 *
 * stat     (output) SuperLUStat_t*
 *          Record the statistics on runtime and floating-point operation count.
 *          See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 * 	   = 0: successful exit
 *	   < 0: if info = -i, the i-th argument had an illegal value
 * </pre>
 */
void
dgstrsU(trans_t trans, SuperMatrix *L, SuperMatrix *U,
        int *perm_c, int *perm_r, SuperMatrix *B,
        SuperLUStat_t *stat, int *info)
{
#ifdef _CRAY
    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
#endif
#ifdef USE_VENDOR_BLAS
    double   alpha = 1.0;
#endif
    DNformat *Bstore;
    double   *Bmat;
    SCformat *Lstore;
    NCformat *Ustore;
    double   *Lval, *Uval;
    int      fsupc, nsupr, nsupc, luptr, istart, irow;
    int      i, j, k, jcol, n, ldb, nrhs;
    double   *rhs_work, *soln;
    flops_t  solve_ops;
    void dprint_soln();

    /* Test input parameters ... */
    *info = 0;
    Bstore = B->Store;
    ldb = Bstore->lda;
    nrhs = B->ncol;
    if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1;
    else if ( L->nrow != L->ncol || L->nrow < 0 ||
              L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU )
        *info = -2;
    else if ( U->nrow != U->ncol || U->nrow < 0 ||
              U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU )
        *info = -3;
    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
              B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE )
        *info = -6;
    if ( *info ) {
        i = -(*info);
        /* Report this routine's own name so the message is traceable. */
        xerbla_("dgstrsU", &i);
        return;
    }

    n = L->nrow;
    soln = doubleMalloc(n);
    if ( !soln ) ABORT("Malloc fails for local soln[].");

    Bmat = Bstore->nzval;
    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( trans == NOTRANS ) {
        /*
         * Back solve Ux=y.
         */
        for (k = Lstore->nsuper; k >= 0; k--) {
            fsupc = L_FST_SUPC(k);
            istart = L_SUB_START(fsupc);
            nsupr = L_SUB_START(fsupc+1) - istart;
            nsupc = L_FST_SUPC(k+1) - fsupc;
            luptr = L_NZ_START(fsupc);

            solve_ops += nsupc * (nsupc + 1) * nrhs;

            if ( nsupc == 1 ) {
                /* 1x1 diagonal block: divide by the diagonal entry. */
                rhs_work = &Bmat[0];
                for (j = 0; j < nrhs; j++) {
                    rhs_work[fsupc] /= Lval[luptr];
                    rhs_work += ldb;
                }
            } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                ftcs1 = _cptofcd("L", strlen("L"));
                ftcs2 = _cptofcd("U", strlen("U"));
                ftcs3 = _cptofcd("N", strlen("N"));
                STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#else
                dtrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#endif
#else
                for (j = 0; j < nrhs; j++)
                    dusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] );
#endif
            }

            /* Update the remaining rows with the entries stored in U. */
            for (j = 0; j < nrhs; ++j) {
                rhs_work = &Bmat[j*ldb];
                for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
                    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){
                        irow = U_SUB(i);
                        rhs_work[irow] -= rhs_work[jcol] * Uval[i];
                    }
                }
            }
        } /* for U-solve */

#ifdef DEBUG
        printf("After U-solve: x=\n");
        dprint_soln(n, nrhs, Bmat);
#endif

        /* Compute the final solution X := Pc*X. */
        for (i = 0; i < nrhs; i++) {
            rhs_work = &Bmat[i*ldb];
            for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]];
            for (k = 0; k < n; k++) rhs_work[k] = soln[k];
        }

        stat->ops[SOLVE] = solve_ops;

    } else { /* Solve U'x = b */
        /* Permute right hand sides to form Pc'*B. */
        for (i = 0; i < nrhs; i++) {
            rhs_work = &Bmat[i*ldb];
            for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k];
            for (k = 0; k < n; k++) rhs_work[k] = soln[k];
        }

        for (k = 0; k < nrhs; ++k) {
            /* Multiply by inv(U'). */
            sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info);
        }
    }

    SUPERLU_FREE(soln);
}
void pdgstrsL(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, double *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; double alpha = 1.0; double zero = 0.0; double *lsum; /* Local running sum of the updates to B-components */ double *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ double *lusup, *dest; double *recvbuf, *tempv; double *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; double **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. 
*/ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PDGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrsL()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. 
*/ pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol != kcol && fmod[lk] ) mod_bit[lk] = 1; /* contribution from off-diagonal */ } } /*PrintInt10("mod_bit", nlb, mod_bit);*/ #if ( PROFlevel>=2 ) t_reduce_tmp = SuperLU_timer_(); #endif /* Every process receives the count, but it is only useful on the diagonal processes. */ MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); #if ( PROFlevel>=2 ) t_reduce += SuperLU_timer_() - t_reduce_tmp; #endif for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* diagonal process */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. 
*/ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. 
*/ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = *recvbuf; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) x[i + ii + j*knsupc] += tempv[i + j*knsupc]; } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. 
*/ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /* Re-distribute X on the diagonal processes to B distributed on all the processes. */ pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Deallocate storage. 
*/ SUPERLU_FREE(lsum); SUPERLU_FREE(x); SUPERLU_FREE(recvbuf); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); SUPERLU_FREE(send_req); #endif stat->utime[SOLVE] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgstrsL()"); #endif } /* PDGSTRS */
void
dgstrsL(char *trans, SuperMatrix *L, int *perm_r, SuperMatrix *B, int *info)
{
/*
 * Purpose
 * =======
 *
 * DGSTRSL performs only the forward (L) solve, using the unit lower
 * triangular factor L of an LU factorization: B is first overwritten
 * with Pr*B and then with the solution y of L*y = Pr*B.
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * trans   (input) char*
 *         Specifies the form of the system of equations:
 *          = 'N':  A * X = B     (No transpose)
 *          = 'T':  A'* X = B     (Transpose)
 *          = 'C':  A**H * X = B  (Conjugate transpose)
 *         Only 'N' is implemented.  'T' and 'C' are recognised but
 *         rejected: a message is printed, info is set to -1 and the
 *         routine returns without touching B.
 *
 * L       (input) SuperMatrix*
 *         The factor L from the factorization Pr*A*Pc=L*U.
 *         Must be square, with Stype = SLU_SC, Dtype = SLU_D,
 *         Mtype = SLU_TRLU.
 *
 * perm_r  (input) int*, dimension (L->nrow)
 *         Row permutation vector, which defines the permutation matrix Pr;
 *         perm_r[i] = j means row i of A is in position j in Pr*A.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE,
 *         with leading dimension >= max(0, L->nrow).
 *         On entry, the right hand side matrix.
 *         On exit, the result of the L-solve if info = 0.
 *
 * info    (output) int*
 *	   = 0: successful exit
 *	   < 0: if info = -i, the i-th argument had an illegal
 *	        (or, for trans, unsupported) value
 *
 * Side effects
 * ============
 * On success the global SuperLUStat.ops[SOLVE] is overwritten with the
 * operation count of this solve.  ABORT is invoked if a work array
 * cannot be allocated.
 *
 */
#ifdef _CRAY
    _fcd ftcs1, ftcs2, ftcs3;
#endif
    double   alpha = 1.0, beta = 1.0;
    DNformat *Bstore;
    double   *Bmat;
    /* NOTE(review): the L_FST_SUPC / L_SUB_START / L_NZ_START / L_SUB
       macros take no store argument, so they presumably refer to this
       variable by name -- confirm before renaming it. */
    SCformat *Lstore;
    double   *Lval;
    int      nrow, notran;
    int      fsupc, nsupr, nsupc, luptr, istart, irow;
    int      i, j, k, iptr, n, ldb, nrhs;
    double   *work, *work_col, *rhs_work, *soln;
    flops_t  solve_ops;
    extern SuperLUStat_t SuperLUStat;
#ifdef DEBUG
    void dprint_soln();
#endif

    /* Test input parameters ... */
    *info = 0;
    Bstore = B->Store;
    ldb = Bstore->lda;
    nrhs = B->ncol;
    notran = lsame_(trans, "N");
    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C") ) *info = -1;
    else if ( L->nrow != L->ncol || L->nrow < 0 ||
	      L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU )
	*info = -2;
    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
	      B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE )
	*info = -4;
    if ( *info ) {
	i = -(*info);
	xerbla_("dgstrsL", &i);
	return;
    }

    /*
     * The transposed solve does not exist yet.  Report it through the
     * normal error channel *before* anything is allocated or B is
     * modified, instead of terminating the whole process with a
     * "success" exit status.
     */
    if ( !notran ) {
	printf("Transposed solve not implemented.\n");
	*info = -1;
	i = 1;
	xerbla_("dgstrsL", &i);
	return;
    }

    n = L->nrow;
    /* work[] must start out zeroed: the products below are accumulated
       into it and every used entry is reset to zero after the scatter. */
    work = doubleCalloc(n * nrhs);
    if ( !work ) ABORT("Malloc fails for local work[].");
    soln = doubleMalloc(n);
    if ( !soln ) ABORT("Malloc fails for local soln[].");

    Bmat = Bstore->nzval;
    Lstore = L->Store;
    Lval = Lstore->nzval;
    solve_ops = 0;

    /* Permute right hand sides to form Pr*B */
    for (i = 0; i < nrhs; i++) {
	rhs_work = &Bmat[i*ldb];
	for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
	for (k = 0; k < n; k++) rhs_work[k] = soln[k];
    }

    /* Forward solve PLy=Pb. */
    for (k = 0; k <= Lstore->nsuper; k++) {
	fsupc = L_FST_SUPC(k);                  /* first column of supernode k */
	istart = L_SUB_START(fsupc);
	nsupr = L_SUB_START(fsupc+1) - istart;  /* rows stored for the supernode */
	nsupc = L_FST_SUPC(k+1) - fsupc;        /* columns in the supernode */
	nrow = nsupr - nsupc;                   /* rows below the diagonal block */

	solve_ops += nsupc * (nsupc - 1) * nrhs;
	solve_ops += 2 * nrow * nsupc * nrhs;

	if ( nsupc == 1 ) {
	    /* Single-column supernode: plain column update.  The first
	       stored entry (the diagonal) is skipped by pre-incrementing
	       luptr and starting iptr at istart+1. */
	    for (j = 0; j < nrhs; j++) {
		rhs_work = &Bmat[j*ldb];
		luptr = L_NZ_START(fsupc);
		for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){
		    irow = L_SUB(iptr);
		    ++luptr;
		    rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr];
		}
	    }
	} else {
	    /* Dense unit-lower-triangular solve with the diagonal block,
	       then multiply by the rectangular part below it and
	       scatter-subtract the product into the right-hand sides. */
	    luptr = L_NZ_START(fsupc);
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
	    ftcs1 = _cptofcd("L", strlen("L"));
	    ftcs2 = _cptofcd("N", strlen("N"));
	    ftcs3 = _cptofcd("U", strlen("U"));
	    STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
		   &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

	    SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha,
		   &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
		   &beta, &work[0], &n );
#else
	    dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
		   &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

	    dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha,
		    &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
		    &beta, &work[0], &n );
#endif
	    for (j = 0; j < nrhs; j++) {
		rhs_work = &Bmat[j*ldb];
		work_col = &work[j*n];
		iptr = istart + nsupc;
		for (i = 0; i < nrow; i++) {
		    irow = L_SUB(iptr);
		    rhs_work[irow] -= work_col[i]; /* Scatter */
		    work_col[i] = 0.0;
		    iptr++;
		}
	    }
#else
	    for (j = 0; j < nrhs; j++) {
		rhs_work = &Bmat[j*ldb];
		dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
		dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
			 &rhs_work[fsupc], &work[0] );

		iptr = istart + nsupc;
		for (i = 0; i < nrow; i++) {
		    irow = L_SUB(iptr);
		    rhs_work[irow] -= work[i];
		    work[i] = 0.0;
		    iptr++;
		}
	    }
#endif
	} /* else ... */
    } /* for L-solve */

#ifdef DEBUG
    printf("After L-solve: y=\n");
    dprint_soln(n, nrhs, Bmat);
#endif

    SuperLUStat.ops[SOLVE] = solve_ops;

    SUPERLU_FREE(work);
    SUPERLU_FREE(soln);
}
void pdgstrs1(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *x, int nrhs, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PDGSTRS1 solves a system of distributed linear equations * * op( sub(A) ) * X = sub( B ) * * with a general N-by-N distributed matrix sub( A ) using the LU * factorization computed by PDGSTRF. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures to store L and U factors, * and the permutation vectors. * See superlu_ddefs.h for the definition of 'LUstruct_t' structure. * * grid (input) gridinfo_t* * The 2D process mesh. * * x (input/output) double* * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * NOTE: the right-hand side matrix is already distributed on * the diagonal processes. * * nrhs (input) int (global) * Number of right-hand sides. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves; * See SuperLUStat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; double alpha = 1.0; double *lsum; /* Local running sum of the updates to B-components */ double *lusup, *dest; double *recvbuf, *tempv; double *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. 
*/ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; double **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for L-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -8; if ( *info ) { pxerbla("PDGSTRS1", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. 
*/ Llu->SolveMsgSent = 0; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrs1()"); #endif /* Save the count to be altered so it can be used by subsequent call to PDGSTRS1. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Compute ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)) ) ABORT("Calloc fails for lsum[]."); maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* * Prepended the block number in the header for lsum[]. */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H] = k; } } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. 
*/ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( !frecv[lk] && !fmod[lk] ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. 
*/ for (p = 0; p < Pr; ++p) if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* * Compute the internal nodes asynchronously by all processes. */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = *recvbuf; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. 
*/ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) x[i + ii + j*knsupc] += tempv[i + j*knsupc]; if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++] ); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, MPI_DOUBLE, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel>=2 ) if ( !iam ) printf("\n.. 
After L-solve: y =\n"); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); } MPI_Barrier( grid->comm ); } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PDGSTRS1. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. 
*/ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; } }
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *
 * dgstrsL performs only the forward (L) solve, using the unit lower
 * triangular factor L of an LU factorization: B is first overwritten
 * with Pr*B and then with the solution y of L*y = Pr*B.
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * trans   (input) char*
 *         Specifies the form of the system of equations:
 *          = 'N':  A * X = B     (No transpose)
 *          = 'T':  A'* X = B     (Transpose)
 *          = 'C':  A**H * X = B  (Conjugate transpose)
 *         Only 'N' is implemented.  'T' and 'C' are recognised but
 *         rejected: a message is printed, info is set to -1 and the
 *         routine returns without touching B.
 *
 * L       (input) SuperMatrix*
 *         The factor L from the factorization Pr*A*Pc=L*U.
 *         Must be square, with Stype = SLU_SC, Dtype = SLU_D,
 *         Mtype = SLU_TRLU.
 *
 * perm_r  (input) int*, dimension (L->nrow)
 *         Row permutation vector, which defines the permutation matrix Pr;
 *         perm_r[i] = j means row i of A is in position j in Pr*A.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE,
 *         with leading dimension >= max(0, L->nrow).
 *         On entry, the right hand side matrix.
 *         On exit, the result of the L-solve if info = 0.
 *
 * info    (output) int*
 *	   = 0: successful exit
 *	   < 0: if info = -i, the i-th argument had an illegal
 *	        (or, for trans, unsupported) value
 *
 * Side effects
 * ============
 * On success the global SuperLUStat.ops[SOLVE] is overwritten with the
 * operation count of this solve.  ABORT is invoked if a work array
 * cannot be allocated.
 * </pre>
 */
void
dgstrsL(char *trans, SuperMatrix *L, int *perm_r, SuperMatrix *B, int *info)
{
#ifdef _CRAY
    _fcd ftcs1, ftcs2, ftcs3;
#endif
    double   alpha = 1.0, beta = 1.0;
    DNformat *Bstore;
    double   *Bmat;
    /* NOTE(review): the L_FST_SUPC / L_SUB_START / L_NZ_START / L_SUB
       macros take no store argument, so they presumably refer to this
       variable by name -- confirm before renaming it. */
    SCformat *Lstore;
    double   *Lval;
    int      nrow, notran;
    int      fsupc, nsupr, nsupc, luptr, istart, irow;
    int      i, j, k, iptr, n, ldb, nrhs;
    double   *work, *work_col, *rhs_work, *soln;
    flops_t  solve_ops;
    extern SuperLUStat_t SuperLUStat;
#ifdef DEBUG
    void dprint_soln();
#endif

    /* Test input parameters ... */
    *info = 0;
    Bstore = B->Store;
    ldb = Bstore->lda;
    nrhs = B->ncol;
    notran = lsame_(trans, "N");
    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C") ) *info = -1;
    else if ( L->nrow != L->ncol || L->nrow < 0 ||
	      L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU )
	*info = -2;
    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
	      B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE )
	*info = -4;
    if ( *info ) {
	i = -(*info);
	xerbla_("dgstrsL", &i);
	return;
    }

    /*
     * The transposed solve does not exist yet.  Report it through the
     * normal error channel *before* anything is allocated or B is
     * modified, instead of terminating the whole process with a
     * "success" exit status.
     */
    if ( !notran ) {
	printf("Transposed solve not implemented.\n");
	*info = -1;
	i = 1;
	xerbla_("dgstrsL", &i);
	return;
    }

    n = L->nrow;
    /* work[] must start out zeroed: the products below are accumulated
       into it and every used entry is reset to zero after the scatter. */
    work = doubleCalloc(n * nrhs);
    if ( !work ) ABORT("Malloc fails for local work[].");
    soln = doubleMalloc(n);
    if ( !soln ) ABORT("Malloc fails for local soln[].");

    Bmat = Bstore->nzval;
    Lstore = L->Store;
    Lval = Lstore->nzval;
    solve_ops = 0;

    /* Permute right hand sides to form Pr*B */
    for (i = 0; i < nrhs; i++) {
	rhs_work = &Bmat[i*ldb];
	for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
	for (k = 0; k < n; k++) rhs_work[k] = soln[k];
    }

    /* Forward solve PLy=Pb. */
    for (k = 0; k <= Lstore->nsuper; k++) {
	fsupc = L_FST_SUPC(k);                  /* first column of supernode k */
	istart = L_SUB_START(fsupc);
	nsupr = L_SUB_START(fsupc+1) - istart;  /* rows stored for the supernode */
	nsupc = L_FST_SUPC(k+1) - fsupc;        /* columns in the supernode */
	nrow = nsupr - nsupc;                   /* rows below the diagonal block */

	solve_ops += nsupc * (nsupc - 1) * nrhs;
	solve_ops += 2 * nrow * nsupc * nrhs;

	if ( nsupc == 1 ) {
	    /* Single-column supernode: plain column update.  The first
	       stored entry (the diagonal) is skipped by pre-incrementing
	       luptr and starting iptr at istart+1. */
	    for (j = 0; j < nrhs; j++) {
		rhs_work = &Bmat[j*ldb];
		luptr = L_NZ_START(fsupc);
		for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){
		    irow = L_SUB(iptr);
		    ++luptr;
		    rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr];
		}
	    }
	} else {
	    /* Dense unit-lower-triangular solve with the diagonal block,
	       then multiply by the rectangular part below it and
	       scatter-subtract the product into the right-hand sides. */
	    luptr = L_NZ_START(fsupc);
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
	    ftcs1 = _cptofcd("L", strlen("L"));
	    ftcs2 = _cptofcd("N", strlen("N"));
	    ftcs3 = _cptofcd("U", strlen("U"));
	    STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
		   &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

	    SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha,
		   &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
		   &beta, &work[0], &n );
#else
	    dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
		   &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

	    dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha,
		    &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
		    &beta, &work[0], &n );
#endif
	    for (j = 0; j < nrhs; j++) {
		rhs_work = &Bmat[j*ldb];
		work_col = &work[j*n];
		iptr = istart + nsupc;
		for (i = 0; i < nrow; i++) {
		    irow = L_SUB(iptr);
		    rhs_work[irow] -= work_col[i]; /* Scatter */
		    work_col[i] = 0.0;
		    iptr++;
		}
	    }
#else
	    for (j = 0; j < nrhs; j++) {
		rhs_work = &Bmat[j*ldb];
		dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
		dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
			 &rhs_work[fsupc], &work[0] );

		iptr = istart + nsupc;
		for (i = 0; i < nrow; i++) {
		    irow = L_SUB(iptr);
		    rhs_work[irow] -= work[i];
		    work[i] = 0.0;
		    iptr++;
		}
	    }
#endif
	} /* else ... */
    } /* for L-solve */

#ifdef DEBUG
    printf("After L-solve: y=\n");
    dprint_soln(n, nrhs, Bmat);
#endif

    SuperLUStat.ops[SOLVE] = solve_ops;

    SUPERLU_FREE(work);
    SUPERLU_FREE(soln);
}