void pdgssvx(superlu_options_t *options, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, double B[], int ldb, int nrhs, gridinfo_t *grid, LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr, SuperLUStat_t *stat, int *info) { /* * -- Distributed SuperLU routine (version 2.2) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * November 1, 2007 * Feburary 20, 2008 * * * Purpose * ======= * * PDGSSVX solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * * Static pivoting is a technique that combines the numerical stability * of partial pivoting with the scalability of Cholesky (no pivoting), * to run accurately and efficiently on large numbers of processors. * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed * description of the parallel algorithms. * * The input matrices A and B are distributed by block rows. * Here is a graphical illustration (0-based indexing): * * A B * 0 --------------- ------ * | | | | * | | P0 | | * | | | | * --------------- ------ * - fst_row->| | | | * | | | | | * m_loc | | P1 | | * | | | | | * - | | | | * --------------- ------ * | . | |. | * | . | |. | * | . | |. | * --------------- ------ * * where, fst_row is the row number of the first row, * m_loc is the number of rows local to this processor * These are defined in the 'SuperMatrix' structure, see supermatrix.h. * * * Here are the options for using this code: * * 1. Independent of all the other options specified below, the * user must supply * * - B, the matrix of right-hand sides, distributed by block rows, * and its dimensions ldb (local) and nrhs (global) * - grid, a structure describing the 2D processor mesh * - options->IterRefine, which determines whether or not to * improve the accuracy of the computed solution using * iterative refinement * * On output, B is overwritten with the solution X. * * 2. Depending on options->Fact, the user has four options * for solving A*X=B. The standard option is for factoring * A "from scratch". (The other options, described below, * are used when A is sufficiently similar to a previously * solved problem to save time by reusing part or all of * the previous factorization.) * * - options->Fact = DOFACT: A is factored "from scratch" * * In this case the user must also supply * * o A, the input matrix * * as well as the following options to determine what matrix to * factorize. * * o options->Equil, to specify how to scale the rows and columns * of A to "equilibrate" it (to try to reduce its * condition number and so improve the * accuracy of the computed solution) * * o options->RowPerm, to specify how to permute the rows of A * (typically to control numerical stability) * * o options->ColPerm, to specify how to permute the columns of A * (typically to control fill-in and enhance * parallelism during factorization) * * o options->ReplaceTinyPivot, to specify how to deal with tiny * pivots encountered during factorization * (to control numerical stability) * * The outputs returned include * * o ScalePermstruct, modified to describe how the input matrix A * was equilibrated and permuted: * . ScalePermstruct->DiagScale, indicates whether the rows and/or * columns of A were scaled * . ScalePermstruct->R, array of row scale factors * . ScalePermstruct->C, array of column scale factors * . ScalePermstruct->perm_r, row permutation vector * . ScalePermstruct->perm_c, column permutation vector * * (part of ScalePermstruct may also need to be supplied on input, * depending on options->RowPerm and options->ColPerm as described * later). * * o A, the input matrix A overwritten by the scaled and permuted * matrix diag(R)*A*diag(C)*Pc^T, where * Pc is the row permutation matrix determined by * ScalePermstruct->perm_c * diag(R) and diag(C) are diagonal scaling matrices determined * by ScalePermstruct->DiagScale, ScalePermstruct->R and * ScalePermstruct->C * * o LUstruct, which contains the L and U factorization of A1 where * * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored * in A on output.) * * 3. The second value of options->Fact assumes that a matrix with the same * sparsity pattern as A has already been factored: * * - options->Fact = SamePattern: A is factored, assuming that it has * the same nonzero pattern as a previously factored matrix. In * this case the algorithm saves time by reusing the previously * computed column permutation vector stored in * ScalePermstruct->perm_c and the "elimination tree" of A * stored in LUstruct->etree * * In this case the user must still specify the following options * as before: * * o options->Equil * o options->RowPerm * o options->ReplaceTinyPivot * * but not options->ColPerm, whose value is ignored. This is because the * previous column permutation from ScalePermstruct->perm_c is used as * input. The user must also supply * * o A, the input matrix * o ScalePermstruct->perm_c, the column permutation * o LUstruct->etree, the elimination tree * * The outputs returned include * * o A, the input matrix A overwritten by the scaled and permuted * matrix as described above * o ScalePermstruct, modified to describe how the input matrix A was * equilibrated and row permuted * o LUstruct, modified to contain the new L and U factors * * 4. The third value of options->Fact assumes that a matrix B with the same * sparsity pattern as A has already been factored, and where the * row permutation of B can be reused for A. This is useful when A and B * have similar numerical values, so that the same row permutation * will make both factorizations numerically stable. This lets us reuse * all of the previously computed structure of L and U. * * - options->Fact = SamePattern_SameRowPerm: A is factored, * assuming not only the same nonzero pattern as the previously * factored matrix B, but reusing B's row permutation. * * In this case the user must still specify the following options * as before: * * o options->Equil * o options->ReplaceTinyPivot * * but not options->RowPerm or options->ColPerm, whose values are * ignored. This is because the permutations from ScalePermstruct->perm_r * and ScalePermstruct->perm_c are used as input. * * The user must also supply * * o A, the input matrix * o ScalePermstruct->DiagScale, how the previous matrix was row * and/or column scaled * o ScalePermstruct->R, the row scalings of the previous matrix, * if any * o ScalePermstruct->C, the columns scalings of the previous matrix, * if any * o ScalePermstruct->perm_r, the row permutation of the previous * matrix * o ScalePermstruct->perm_c, the column permutation of the previous * matrix * o all of LUstruct, the previously computed information about * L and U (the actual numerical values of L and U * stored in LUstruct->Llu are ignored) * * The outputs returned include * * o A, the input matrix A overwritten by the scaled and permuted * matrix as described above * o ScalePermstruct, modified to describe how the input matrix A was * equilibrated (thus ScalePermstruct->DiagScale, * R and C may be modified) * o LUstruct, modified to contain the new L and U factors * * 5. The fourth and last value of options->Fact assumes that A is * identical to a matrix that has already been factored on a previous * call, and reuses its entire LU factorization * * - options->Fact = Factored: A is identical to a previously * factorized matrix, so the entire previous factorization * can be reused. * * In this case all the other options mentioned above are ignored * (options->Equil, options->RowPerm, options->ColPerm, * options->ReplaceTinyPivot) * * The user must also supply * * o A, the unfactored matrix, only in the case that iterative * refinment is to be done (specifically A must be the output * A from the previous call, so that it has been scaled and permuted) * o all of ScalePermstruct * o all of LUstruct, including the actual numerical values of * L and U * * all of which are unmodified on output. * * Arguments * ========= * * options (input) superlu_options_t* (global) * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following fields should be defined for this structure: * * o Fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, how the matrix A should * be factorized based on the previous history. * * = DOFACT: The matrix A will be factorized from scratch. * Inputs: A * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * all of ScalePermstruct * all of LUstruct * * = SamePattern: the matrix A will be factorized assuming * that a factorization of a matrix with the same sparsity * pattern was performed prior to this one. Therefore, this * factorization will reuse column permutation vector * ScalePermstruct->perm_c and the elimination tree * LUstruct->etree * Inputs: A * options->Equil, RowPerm, ReplaceTinyPivot * ScalePermstruct->perm_c * LUstruct->etree * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * rest of ScalePermstruct (DiagScale, R, C, perm_r) * rest of LUstruct (GLU_persist, Llu) * * = SamePattern_SameRowPerm: the matrix A will be factorized * assuming that a factorization of a matrix with the same * sparsity pattern and similar numerical values was performed * prior to this one. Therefore, this factorization will reuse * both row and column scaling factors R and C, and the * both row and column permutation vectors perm_r and perm_c, * distributed data structure set up from the previous symbolic * factorization. * Inputs: A * options->Equil, ReplaceTinyPivot * all of ScalePermstruct * all of LUstruct * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * modified LUstruct->Llu * = FACTORED: the matrix A is already factored. * Inputs: all of ScalePermstruct * all of LUstruct * * o Equil (yes_no_t) * Specifies whether to equilibrate the system. * = NO: no equilibration. * = YES: scaling factors are computed to equilibrate the system: * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. * Whether or not the system will be equilibrated depends * on the scaling of the matrix A, but if equilibration is * used, A is overwritten by diag(R)*A*diag(C) and B by * diag(R)*B. * * o RowPerm (rowperm_t) * Specifies how to permute rows of the matrix A. * = NATURAL: use the natural ordering. * = LargeDiag: use the Duff/Koster algorithm to permute rows of * the original matrix to make the diagonal large * relative to the off-diagonal. * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r * input by the user. * * o ColPerm (colperm_t) * Specifies what type of column permutation to use to reduce fill. * = NATURAL: natural ordering. * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. * = MMD_ATA: minimum degree ordering on structure of A'*A. * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. * * o ReplaceTinyPivot (yes_no_t) * = NO: do not modify pivots * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during * LU factorization. * * o IterRefine (IterRefine_t) * Specifies how to perform iterative refinement. * = NO: no iterative refinement. * = DOUBLE: accumulate residual in double precision. * = EXTRA: accumulate residual in extra precision. * * NOTE: all options must be indentical on all processes when * calling this routine. * * A (input/output) SuperMatrix* (local) * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. * That is, A is stored in distributed compressed row format. * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization * routine PDGSTRF can factorize rectangular matrices. * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, * depending on ScalePermstruct->DiagScale and options->ColPerm: * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by * diag(R)*A*diag(C). * if options->ColPerm != NATURAL, A is further overwritten by * diag(R)*A*diag(C)*Pc^T. * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. * * ScalePermstruct (input/output) ScalePermstruct_t* (global) * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * It contains the following fields: * * o DiagScale (DiagScale_t) * Specifies the form of equilibration that was done. * = NOEQUIL: no equilibration. * = ROW: row equilibration, i.e., A was premultiplied by * diag(R). * = COL: Column equilibration, i.e., A was postmultiplied * by diag(C). * = BOTH: both row and column equilibration, i.e., A was * replaced by diag(R)*A*diag(C). * If options->Fact = FACTORED or SamePattern_SameRowPerm, * DiagScale is an input argument; otherwise it is an output * argument. * * o perm_r (int*) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * If options->RowPerm = MY_PERMR, or * options->Fact = SamePattern_SameRowPerm, perm_r is an * input argument; otherwise it is an output argument. * * o perm_c (int*) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * If options->ColPerm = MY_PERMC or options->Fact = SamePattern * or options->Fact = SamePattern_SameRowPerm, perm_c is an * input argument; otherwise, it is an output argument. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * * o R (double*) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). * If DiagScale = NOEQUIL or COL, R is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * * o C (double*) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). * If DiagScale = NOEQUIL or ROW, C is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is * an input argument; otherwise, C is an output argument. * * B (input/output) double* (local) * On entry, the right-hand side matrix of dimension (m_loc, nrhs), * where, m_loc is the number of rows stored locally on my * process and is defined in the data structure of matrix A. * On exit, the solution matrix if info = 0; * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * The number of right-hand sides. * If nrhs = 0, only LU decomposition is performed, the forward * and back substitutions are skipped. * * grid (input) gridinfo_t* (global) * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * It contains the following fields: * * o etree (int*) dimension (A->ncol) (global) * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'. * It is computed in sp_colorder() during the first factorization, * and is reused in the subsequent factorizations of the matrices * with the same nonzero pattern. * On exit of sp_colorder(), the columns of A are permuted so that * the etree is in a certain postorder. This postorder is reflected * in ScalePermstruct->perm_c. * NOTE: * Etree is a vector of parent pointers for a forest whose vertices * are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * o Glu_persist (Glu_persist_t*) (global) * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (LocalLU_t*) (local) * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * SOLVEstruct (input/output) SOLVEstruct_t* * The data structure to hold the communication pattern used * in the phases of triangular solution and iterative refinement. * This pattern should be intialized only once for repeated solutions. * If options->SolveInitialized = YES, it is an input argument. * If options->SolveInitialized = NO and nrhs != 0, it is an output * argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'. * * berr (output) double*, dimension (nrhs) (global) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * so the solution could not be computed. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * * See superlu_ddefs.h for the definitions of varioous data types. * */ NRformat_loc *Astore; SuperMatrix GA; /* Global A in NC format */ NCformat *GAstore; double *a_GA; SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ NCPformat *GACstore; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t *Glu_freeable; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of supernodes in L. (usub, xusub) contains the compressed subscript of nonzero segments in U. If options->Fact != SamePattern_SameRowPerm, they are computed by SYMBFACT routine, and then used by PDDISTRIBUTE routine. They will be freed after PDDISTRIBUTE routine. If options->Fact == SamePattern_SameRowPerm, these structures are not used. */ fact_t Fact; double *a; int_t *colptr, *rowind; int_t *perm_r; /* row permutations from partial pivoting */ int_t *perm_c; /* column permutation vector */ int_t *etree; /* elimination tree */ int_t *rowptr, *colind; /* Local A in NR*/ int_t *rowind_loc, *colptr_loc; int_t colequ, Equil, factored, job, notran, rowequ, need_value; int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; int_t nnz_loc, m_loc, fst_row, icol; int iam; int ldx; /* LDA for matrix X (local). */ char equed[1], norm[1]; double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; double *X, *b_col, *b_work, *x_col; double t; static mem_usage_t num_mem_usage, symb_mem_usage; #if ( PRNTlevel>= 2 ) double dmin, dsum, dprod; #endif int_t procs; /* Structures needed for parallel symbolic factorization */ int_t *sizes, *fstVtxSep, parSymbFact; int noDomains, nprocs_num; MPI_Comm symb_comm; /* communicator for symbolic factorization */ int col, key; /* parameters for creating a new communicator */ Pslu_freeable_t Pslu_freeable; float flinfo; /* Initialization. */ m = A->nrow; n = A->ncol; Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; a = (double *) Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; sizes = NULL; fstVtxSep = NULL; symb_comm = MPI_COMM_NULL; /* Test the input parameters. */ *info = 0; Fact = options->Fact; if ( Fact < 0 || Fact > FACTORED ) *info = -1; else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) *info = -1; else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) *info = -1; else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) *info = -1; else if ( options->IterRefine == EXTRA ) { *info = -1; fprintf(stderr, "Extra precise iterative refinement yet to support."); } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < m_loc ) *info = -5; else if ( nrhs < 0 ) *info = -6; if ( *info ) { i = -(*info); pxerbla("pdgssvx", grid, -*info); return; } factored = (Fact == FACTORED); Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); iam = grid->iam; job = 5; if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { rowequ = (ScalePermstruct->DiagScale == ROW) || (ScalePermstruct->DiagScale == BOTH); colequ = (ScalePermstruct->DiagScale == COL) || (ScalePermstruct->DiagScale == BOTH); } else rowequ = colequ = FALSE; /* The following arrays are replicated on all processes. */ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; /********/ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgssvx()"); #endif /* Not factored & ask for equilibration */ if ( Equil && Fact != SamePattern_SameRowPerm ) { /* Allocate storage if not done so before. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; } } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( Equil ) { #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter equil"); #endif t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm ) { /* Reuse R and C. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: break; case ROW: irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { a[i] *= R[irow]; /* Scale rows. */ } ++irow; } break; case COL: for (j = 0; j < m_loc; ++j) for (i = rowptr[j]; i < rowptr[j+1]; ++i){ icol = colind[i]; a[i] *= C[icol]; /* Scale columns. */ } break; case BOTH: irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ } ++irow; } break; } } else { /* Compute R & C from scratch */ /* Compute the row and column scalings. */ pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); /* Equilibrate matrix A if it is badly-scaled. */ pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); if ( lsame_(equed, "R") ) { ScalePermstruct->DiagScale = rowequ = ROW; } else if ( lsame_(equed, "C") ) { ScalePermstruct->DiagScale = colequ = COL; } else if ( lsame_(equed, "B") ) { ScalePermstruct->DiagScale = BOTH; rowequ = ROW; colequ = COL; } else ScalePermstruct->DiagScale = NOEQUIL; #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. equilibrated? *equed = %c\n", *equed); /*fflush(stdout);*/ } #endif } /* if Fact ... */ stat->utime[EQUIL] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit equil"); #endif } /* if Equil ... */ if ( !factored ) { /* Skip this if already factored. */ /* * Gather A from the distributed compressed row format to * global A in compressed column format. * Numerical values are gathered only when a row permutation * for large diagonal is sought after. */ if ( Fact != SamePattern_SameRowPerm ) { need_value = (options->RowPerm == LargeDiag); pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); GAstore = (NCformat *) GA.Store; colptr = GAstore->colptr; rowind = GAstore->rowind; nnz = GAstore->nnz; if ( need_value ) a_GA = (double *) GAstore->nzval; else assert(GAstore->nzval == NULL); } /* ------------------------------------------------------------ Find the row permutation for A. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { t = SuperLU_timer_(); if ( Fact != SamePattern_SameRowPerm ) { if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ /* Permute the global matrix GA for symbfact() */ for (i = 0; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } else { /* options->RowPerm == LargeDiag */ /* Get a new perm_r[] */ if ( job == 5 ) { /* Allocate storage for scaling factors. */ if ( !(R1 = doubleMalloc_dist(m)) ) ABORT("SUPERLU_MALLOC fails for R1[]"); if ( !(C1 = doubleMalloc_dist(n)) ) ABORT("SUPERLU_MALLOC fails for C1[]"); } if ( !iam ) { /* Process 0 finds a row permutation */ dldperm(job, m, nnz, colptr, rowind, a_GA, perm_r, R1, C1); MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } else { MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } #if ( PRNTlevel>=2 ) dmin = dlamch_("Overflow"); dsum = 0.0; dprod = 1.0; #endif if ( job == 5 ) { if ( Equil ) { for (i = 0; i < n; ++i) { R1[i] = exp(R1[i]); C1[i] = exp(C1[i]); } /* Scale the distributed matrix */ irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= R1[irow] * C1[icol]; #if ( PRNTlevel>=2 ) if ( perm_r[irow] == icol ) { /* New diagonal */ if ( job == 2 || job == 3 ) dmin = SUPERLU_MIN(dmin, fabs(a[i])); else if ( job == 4 ) dsum += fabs(a[i]); else if ( job == 5 ) dprod *= fabs(a[i]); } #endif } ++irow; } /* Multiply together the scaling factors. */ if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; else for (i = 0; i < m; ++i) R[i] = R1[i]; if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; else for (i = 0; i < n; ++i) C[i] = C1[i]; ScalePermstruct->DiagScale = BOTH; rowequ = colequ = 1; } /* end Equil */ /* Now permute global A to prepare for symbfact() */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } SUPERLU_FREE (R1); SUPERLU_FREE (C1); } else { /* job = 2,3,4 */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } /* end for i ... */ } /* end for j ... */ } /* end else job ... */ #if ( PRNTlevel>=2 ) if ( job == 2 || job == 3 ) { if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); } else if ( job == 4 ) { if ( !iam ) printf("\tsum of diagonal %e\n", dsum); } else if ( job == 5 ) { if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif } /* end if options->RowPerm ... */ t = SuperLU_timer_() - t; stat->utime[ROWPERM] = t; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); #endif } /* end if Fact ... */ } else { /* options->RowPerm == NOROWPERM */ for (i = 0; i < m; ++i) perm_r[i] = i; } #if ( DEBUGlevel>=2 ) if ( !iam ) PrintInt10("perm_r", m, perm_r); #endif } /* end if (!factored) */ if ( !factored || options->IterRefine ) { /* Compute norm(A), which will be used to adjust small diagonal. */ if ( notran ) *(unsigned char *)norm = '1'; else *(unsigned char *)norm = 'I'; anorm = pdlangs(norm, A, grid); #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. anorm %e\n", anorm); #endif } /* ------------------------------------------------------------ Perform the LU factorization. ------------------------------------------------------------*/ if ( !factored ) { t = SuperLU_timer_(); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A * permc_spec = PARMETIS: parallel METIS on structure of A'+A * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; parSymbFact = options->ParSymbFact; #if ( PRNTlevel>=1 ) if ( parSymbFact && permc_spec != PARMETIS ) if ( !iam ) printf(".. Parallel symbolic factorization" " only works wth ParMetis!\n"); #endif if ( parSymbFact == YES || permc_spec == PARMETIS ) { nprocs_num = grid->nprow * grid->npcol; noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num )))); /* create a new communicator for the first noDomains processors in grid->comm */ key = iam; if (iam < noDomains) col = 0; else col = MPI_UNDEFINED; MPI_Comm_split (grid->comm, col, key, &symb_comm ); permc_spec = PARMETIS; /* only works with PARMETIS */ } if ( permc_spec != MY_PERMC && Fact == DOFACT ) { if ( permc_spec == PARMETIS ) { /* Get column permutation vector in perm_c. * * This routine takes as input the distributed input matrix A * * and does not modify it. It also allocates memory for * * sizes[] and fstVtxSep[] arrays, that contain information * * on the separator tree computed by ParMETIS. */ flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, noDomains, &sizes, &fstVtxSep, grid, &symb_comm); if (flinfo > 0) ABORT("ERROR in get perm_c parmetis."); } else { get_perm_c_dist(iam, permc_spec, &GA, perm_c); } } stat->utime[COLPERM] = SuperLU_timer_() - t; /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' (a.k.a. column etree), depending on the choice of ColPerm. Adjust perm_c[] to be consistent with a postorder of etree. Permute columns of A to form A*Pc'. */ if ( Fact != SamePattern_SameRowPerm ) { if ( parSymbFact == NO ) { int_t *GACcolbeg, *GACcolend, *GACrowind; sp_colorder(options, &GA, perm_c, etree, &GAC); /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ GACstore = (NCPformat *) GAC.Store; GACcolbeg = GACstore->colbeg; GACcolend = GACstore->colend; GACrowind = GACstore->rowind; for (j = 0; j < n; ++j) { for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { irow = GACrowind[i]; GACrowind[i] = perm_c[irow]; } } /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); if ( !(Glu_freeable = (Glu_freeable_t *) SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) ABORT("Malloc fails for Glu_freeable."); /* Every process does this. */ iinfo = symbfact(options, iam, &GAC, perm_c, etree, Glu_persist, Glu_freeable); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if ( iinfo < 0 ) { /* Successful return */ QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); #if ( PRNTlevel>=1 ) if ( !iam ) { printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); printf("\tint %d, short %d, float %d, double %d\n", sizeof(int_t), sizeof(short), sizeof(float), sizeof(double)); printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); } #endif } else { if ( !iam ) { fprintf(stderr,"symbfact() error returns %d\n",iinfo); exit(-1); } } } /* end if serial symbolic factorization */ else { /* parallel symbolic factorization */ t = SuperLU_timer_(); flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, sizes, fstVtxSep, &Pslu_freeable, &(grid->comm), &symb_comm, &symb_mem_usage); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if (flinfo > 0) ABORT("Insufficient memory for parallel symbolic factorization."); } } /* end if Fact ... */ #if ( PRNTlevel>=1 ) if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]); #endif if (sizes) SUPERLU_FREE (sizes); if (fstVtxSep) SUPERLU_FREE (fstVtxSep); if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { /* Apply column permutation to the original distributed A */ for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. */ t = SuperLU_timer_(); dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct, Glu_freeable, LUstruct, grid); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factorization. */ if ( Fact != SamePattern_SameRowPerm ) { iinfo = symbfact_SubFree(Glu_freeable); SUPERLU_FREE(Glu_freeable); } } else { /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. */ /* Apply column permutation to the original distributed A */ for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; t = SuperLU_timer_(); dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct, &Pslu_freeable, LUstruct, grid); if (dist_mem_use > 0) ABORT ("Not enough memory available for dist_psymbtonum\n"); stat->utime[DIST] = SuperLU_timer_() - t; } #if ( PRNTlevel>=1 ) if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); #endif /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); stat->utime[FACT] = SuperLU_timer_() - t; #if ( PRNTlevel>=1 ) { int_t TinyPivots; float for_lu, total, max, avg, temp; dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); MPI_Reduce( &num_mem_usage.for_lu, &for_lu, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &num_mem_usage.total, &total, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); temp = SUPERLU_MAX(symb_mem_usage.total, symb_mem_usage.for_lu + (float)dist_mem_use + num_mem_usage.for_lu); if (parSymbFact == TRUE) /* The memory used in the redistribution routine includes the memory used for storing the symbolic structure and the memory allocated for numerical factorization */ temp = SUPERLU_MAX(symb_mem_usage.total, (float)dist_mem_use); temp = SUPERLU_MAX(temp, num_mem_usage.total); MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, MPI_SUM, grid->comm ); stat->TinyPivots = TinyPivots; if ( !iam ) { printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", for_lu*1e-6, total*1e-6); printf("\tAll space (MB):" "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); } } #endif /* Destroy GA */ if ( Fact != SamePattern_SameRowPerm ) Destroy_CompCol_Matrix_dist(&GA); } /* end if (!factored) */ /* ------------------------------------------------------------ Compute the solution matrix X. ------------------------------------------------------------*/ if ( nrhs ) { if ( !(b_work = doubleMalloc_dist(n)) ) ABORT("Malloc fails for b_work[]"); /* ------------------------------------------------------------ Scale the right-hand side if equilibration was performed. ------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= R[irow]; ++irow; } b_col += ldb; } } } else if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= C[irow]; ++irow; } b_col += ldb; } } /* Save a copy of the right-hand side. */ ldx = ldb; if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) ABORT("Malloc fails for X[]"); x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; x_col += ldx; b_col += ldb; } /* ------------------------------------------------------------ Solve the linear system. ------------------------------------------------------------*/ if ( options->SolveInitialized == NO ) { dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, SOLVEstruct); } pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, fst_row, ldb, nrhs, SOLVEstruct, stat, info); /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ if ( options->IterRefine ) { /* Improve the solution by iterative refinement. */ int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ t = SuperLU_timer_(); if ( options->RefineInitialized == NO || Fact == DOFACT ) { /* All these cases need to re-initialize gsmv structure */ if ( options->RefineInitialized ) pdgsmv_finalize(SOLVEstruct->gsmv_comm); pdgsmv_init(A, SOLVEstruct->row_to_proc, grid, SOLVEstruct->gsmv_comm); /* Save a copy of the transformed local col indices in colind_gsmv[]. */ if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); if ( !(it = intMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for colind_gsmv[]"); colind_gsmv = SOLVEstruct->A_colind_gsmv = it; for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; options->RefineInitialized = YES; } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { double at; int_t k, jcol, p; /* Swap to beginning the part of A corresponding to the local part of X, as was done in pdgsmv_init() */ for (i = 0; i < m_loc; ++i) { /* Loop through each row */ k = rowptr[i]; for (j = rowptr[i]; j < rowptr[i+1]; ++j) { jcol = colind[j]; p = SOLVEstruct->row_to_proc[jcol]; if ( p == iam ) { /* Local */ at = a[k]; a[k] = a[j]; a[j] = at; ++k; } } } /* Re-use the local col indices of A obtained from the previous call to pdgsmv_init() */ for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; } if ( nrhs == 1 ) { /* Use the existing solve structure */ SOLVEstruct1 = SOLVEstruct; } else { /* For nrhs > 1, since refinement is performed for RHS one at a time, the communication structure for pdgstrs is different than the solve with nrhs RHS. So we use SOLVEstruct1 for the refinement step. */ if ( !(SOLVEstruct1 = (SOLVEstruct_t *) SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) ) ABORT("Malloc fails for SOLVEstruct1"); /* Copy the same stuff */ SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; SOLVEstruct1->diag_len = SOLVEstruct->diag_len; SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; /* Initialize the *gstrs_comm for 1 RHS. */ if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, Glu_persist, SOLVEstruct1); } pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); /* Deallocate the storage associated with SOLVEstruct1 */ if ( nrhs > 1 ) { pxgstrs_finalize(SOLVEstruct1->gstrs_comm); SUPERLU_FREE(SOLVEstruct1); } stat->utime[REFINE] = SuperLU_timer_() - t; } /* Permute the solution matrix B <= Pc'*X. */ pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, SOLVEstruct->inv_perm_c, X, ldx, B, ldb, nrhs, grid); #if ( DEBUGlevel>=2 ) printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); for (i = 0; i < m_loc; ++i) printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); #endif /* Transform the solution matrix X to a solution of the original system before the equilibration. */ if ( notran ) { if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= C[irow]; ++irow; } b_col += ldb; } } } else if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= R[irow]; ++irow; } b_col += ldb; } } SUPERLU_FREE(b_work); SUPERLU_FREE(X); } /* end if nrhs != 0 */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); #endif /* Deallocate R and/or C if it was not used. */ if ( Equil && Fact != SamePattern_SameRowPerm ) { switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: SUPERLU_FREE(R); SUPERLU_FREE(C); break; case ROW: SUPERLU_FREE(C); break; case COL: SUPERLU_FREE(R); break; } } if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) Destroy_CompCol_Permuted_dist(&GAC); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgssvx()"); #endif }
void pdgssvx_ABglobal(superlu_options_t_Distributed *options, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, double B[], int ldb, int nrhs, gridinfo_t *grid, LUstruct_t *LUstruct, double *berr, SuperLUStat_t *stat, int *info) { /* * -- Distributed SuperLU routine (version 1.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * September 1, 1999 * * * Purpose * ======= * * pdgssvx_ABglobal solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * * Static pivoting is a technique that combines the numerical stability * of partial pivoting with the scalability of Cholesky (no pivoting), * to run accurately and efficiently on large numbers of processors. * * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed * description of the parallel algorithms. * * Here are the options for using this code: * * 1. Independent of all the other options specified below, the * user must supply * * - B, the matrix of right hand sides, and its dimensions ldb and nrhs * - grid, a structure describing the 2D processor mesh * - options->IterRefine, which determines whether or not to * improve the accuracy of the computed solution using * iterative refinement * * On output, B is overwritten with the solution X. * * 2. Depending on options->Fact, the user has several options * for solving A*X=B. The standard option is for factoring * A "from scratch". (The other options, described below, * are used when A is sufficiently similar to a previously * solved problem to save time by reusing part or all of * the previous factorization.) * * - options->Fact = DOFACT: A is factored "from scratch" * * In this case the user must also supply * * - A, the input matrix * * as well as the following options, which are described in more * detail below: * * - options->Equil, to specify how to scale the rows and columns * of A to "equilibrate" it (to try to reduce its * condition number and so improve the * accuracy of the computed solution) * * - options->RowPerm, to specify how to permute the rows of A * (typically to control numerical stability) * * - options->ColPerm, to specify how to permute the columns of A * (typically to control fill-in and enhance * parallelism during factorization) * * - options->ReplaceTinyPivot, to specify how to deal with tiny * pivots encountered during factorization * (to control numerical stability) * * The outputs returned include * * - ScalePermstruct, modified to describe how the input matrix A * was equilibrated and permuted: * - ScalePermstruct->DiagScale, indicates whether the rows and/or * columns of A were scaled * - ScalePermstruct->R, array of row scale factors * - ScalePermstruct->C, array of column scale factors * - ScalePermstruct->perm_r, row permutation vector * - ScalePermstruct->perm_c, column permutation vector * * (part of ScalePermstruct may also need to be supplied on input, * depending on options->RowPerm and options->ColPerm as described * later). * * - A, the input matrix A overwritten by the scaled and permuted matrix * Pc*Pr*diag(R)*A*diag(C) * where * Pr and Pc are row and columns permutation matrices determined * by ScalePermstruct->perm_r and ScalePermstruct->perm_c, * respectively, and * diag(R) and diag(C) are diagonal scaling matrices determined * by ScalePermstruct->DiagScale, ScalePermstruct->R and * ScalePermstruct->C * * - LUstruct, which contains the L and U factorization of A1 where * * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * (Note that A1 = Aout * Pc^T, where Aout is the matrix stored * in A on output.) * * 3. The second value of options->Fact assumes that a matrix with the same * sparsity pattern as A has already been factored: * * - options->Fact = SamePattern: A is factored, assuming that it has * the same nonzero pattern as a previously factored matrix. In this * case the algorithm saves time by reusing the previously computed * column permutation vector stored in ScalePermstruct->perm_c * and the "elimination tree" of A stored in LUstruct->etree. * * In this case the user must still specify the following options * as before: * * - options->Equil * - options->RowPerm * - options->ReplaceTinyPivot * * but not options->ColPerm, whose value is ignored. This is because the * previous column permutation from ScalePermstruct->perm_c is used as * input. The user must also supply * * - A, the input matrix * - ScalePermstruct->perm_c, the column permutation * - LUstruct->etree, the elimination tree * * The outputs returned include * * - A, the input matrix A overwritten by the scaled and permuted matrix * as described above * - ScalePermstruct, modified to describe how the input matrix A was * equilibrated and row permuted * - LUstruct, modified to contain the new L and U factors * * 4. The third value of options->Fact assumes that a matrix B with the same * sparsity pattern as A has already been factored, and where the * row permutation of B can be reused for A. This is useful when A and B * have similar numerical values, so that the same row permutation * will make both factorizations numerically stable. This lets us reuse * all of the previously computed structure of L and U. * * - options->Fact = SamePattern_SameRowPerm: A is factored, * assuming not only the same nonzero pattern as the previously * factored matrix B, but reusing B's row permutation. * * In this case the user must still specify the following options * as before: * * - options->Equil * - options->ReplaceTinyPivot * * but not options->RowPerm or options->ColPerm, whose values are ignored. * This is because the permutations from ScalePermstruct->perm_r and * ScalePermstruct->perm_c are used as input. * * The user must also supply * * - A, the input matrix * - ScalePermstruct->DiagScale, how the previous matrix was row and/or * column scaled * - ScalePermstruct->R, the row scalings of the previous matrix, if any * - ScalePermstruct->C, the columns scalings of the previous matrix, * if any * - ScalePermstruct->perm_r, the row permutation of the previous matrix * - ScalePermstruct->perm_c, the column permutation of the previous * matrix * - all of LUstruct, the previously computed information about L and U * (the actual numerical values of L and U stored in * LUstruct->Llu are ignored) * * The outputs returned include * * - A, the input matrix A overwritten by the scaled and permuted matrix * as described above * - ScalePermstruct, modified to describe how the input matrix A was * equilibrated * (thus ScalePermstruct->DiagScale, R and C may be modified) * - LUstruct, modified to contain the new L and U factors * * 5. The fourth and last value of options->Fact assumes that A is * identical to a matrix that has already been factored on a previous * call, and reuses its entire LU factorization * * - options->Fact = Factored: A is identical to a previously * factorized matrix, so the entire previous factorization * can be reused. * * In this case all the other options mentioned above are ignored * (options->Equil, options->RowPerm, options->ColPerm, * options->ReplaceTinyPivot) * * The user must also supply * * - A, the unfactored matrix, only in the case that iterative refinment * is to be done (specifically A must be the output A from * the previous call, so that it has been scaled and permuted) * - all of ScalePermstruct * - all of LUstruct, including the actual numerical values of L and U * * all of which are unmodified on output. * * Arguments * ========= * * options (input) superlu_options_t_Distributed* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following fields should be defined for this structure: * * o Fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, how the matrix A should * be factorized based on the previous history. * * = DOFACT: The matrix A will be factorized from scratch. * Inputs: A * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * all of ScalePermstruct * all of LUstruct * * = SamePattern: the matrix A will be factorized assuming * that a factorization of a matrix with the same sparsity * pattern was performed prior to this one. Therefore, this * factorization will reuse column permutation vector * ScalePermstruct->perm_c and the elimination tree * LUstruct->etree * Inputs: A * options->Equil, RowPerm, ReplaceTinyPivot * ScalePermstruct->perm_c * LUstruct->etree * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * rest of ScalePermstruct (DiagScale, R, C, perm_r) * rest of LUstruct (GLU_persist, Llu) * * = SamePattern_SameRowPerm: the matrix A will be factorized * assuming that a factorization of a matrix with the same * sparsity pattern and similar numerical values was performed * prior to this one. Therefore, this factorization will reuse * both row and column scaling factors R and C, and the * both row and column permutation vectors perm_r and perm_c, * distributed data structure set up from the previous symbolic * factorization. * Inputs: A * options->Equil, ReplaceTinyPivot * all of ScalePermstruct * all of LUstruct * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * modified LUstruct->Llu * = FACTORED: the matrix A is already factored. * Inputs: all of ScalePermstruct * all of LUstruct * * o Equil (yes_no_t) * Specifies whether to equilibrate the system. * = NO: no equilibration. * = YES: scaling factors are computed to equilibrate the system: * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. * Whether or not the system will be equilibrated depends * on the scaling of the matrix A, but if equilibration is * used, A is overwritten by diag(R)*A*diag(C) and B by * diag(R)*B. * * o RowPerm (rowperm_t) * Specifies how to permute rows of the matrix A. * = NATURAL: use the natural ordering. * = LargeDiag: use the Duff/Koster algorithm to permute rows of * the original matrix to make the diagonal large * relative to the off-diagonal. * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r * input by the user. * * o ColPerm (colperm_t) * Specifies what type of column permutation to use to reduce fill. * = NATURAL: natural ordering. * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. * = MMD_ATA: minimum degree ordering on structure of A'*A. * = COLAMD: approximate minimum degree column ordering. * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. * * o ReplaceTinyPivot (yes_no_t) * = NO: do not modify pivots * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during * LU factorization. * * o IterRefine (IterRefine_t) * Specifies how to perform iterative refinement. * = NO: no iterative refinement. * = DOUBLE: accumulate residual in double precision. * = EXTRA: accumulate residual in extra precision. * * NOTE: all options must be indentical on all processes when * calling this routine. * * A (input/output) SuperMatrix* * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = NC; Dtype = D; Mtype = GE. That is, A is stored in * compressed column format (also known as Harwell-Boeing format). * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization * routine pdgstrf can factorize rectangular matrices. * On exit, A may be overwtirren by Pc*Pr*diag(R)*A*diag(C), * depending on ScalePermstruct->DiagScale, options->RowPerm and * options->colpem: * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by * diag(R)*A*diag(C). * if options->RowPerm != NATURAL, A is further overwritten by * Pr*diag(R)*A*diag(C). * if options->ColPerm != NATURAL, A is further overwritten by * Pc*Pr*diag(R)*A*diag(C). * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. * * NOTE: Currently, A must reside in all processes when calling * this routine. * * ScalePermstruct (input/output) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * It contains the following fields: * * o DiagScale (DiagScale_t) * Specifies the form of equilibration that was done. * = NOEQUIL: no equilibration. * = ROW: row equilibration, i.e., A was premultiplied by * diag(R). * = COL: Column equilibration, i.e., A was postmultiplied * by diag(C). * = BOTH: both row and column equilibration, i.e., A was * replaced by diag(R)*A*diag(C). * If options->Fact = FACTORED or SamePattern_SameRowPerm, * DiagScale is an input argument; otherwise it is an output * argument. * * o perm_r (int*) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * If options->RowPerm = MY_PERMR, or * options->Fact = SamePattern_SameRowPerm, perm_r is an * input argument; otherwise it is an output argument. * * o perm_c (int*) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * If options->ColPerm = MY_PERMC or options->Fact = SamePattern * or options->Fact = SamePattern_SameRowPerm, perm_c is an * input argument; otherwise, it is an output argument. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * * o R (double*) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). * If DiagScale = NOEQUIL or COL, R is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * * o C (double*) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). * If DiagScale = NOEQUIL or ROW, C is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is * an input argument; otherwise, C is an output argument. * * B (input/output) double* * On entry, the right-hand side matrix of dimension (A->nrow, nrhs). * On exit, the solution matrix if info = 0; * * NOTE: Currently, B must reside in all processes when calling * this routine. * * ldb (input) int (global) * The leading dimension of matrix B. * * nrhs (input) int (global) * The number of right-hand sides. * If nrhs = 0, only LU decomposition is performed, the forward * and back substitutions are skipped. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * It contains the following fields: * * o etree (int*) dimension (A->ncol) * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol. * It is computed in sp_colorder() during the first factorization, * and is reused in the subsequent factorizations of the matrices * with the same nonzero pattern. * On exit of sp_colorder(), the columns of A are permuted so that * the etree is in a certain postorder. This postorder is reflected * in ScalePermstruct->perm_c. * NOTE: * Etree is a vector of parent pointers for a forest whose vertices * are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * o Glu_persist (Glu_persist_t*) * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (LocalLU_t*) * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * berr (output) double*, dimension (nrhs) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * so the solution could not be computed. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * * * See superlu_ddefs.h for the definitions of various data types. * */ SuperMatrix AC; NCformat *Astore; NCPformat *ACstore; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t *Glu_freeable; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of supernodes in L. (usub, xusub) contains the compressed subscript of nonzero segments in U. If options->Fact != SamePattern_SameRowPerm, they are computed by SYMBFACT routine, and then used by DDISTRIBUTE routine. They will be freed after DDISTRIBUTE routine. If options->Fact == SamePattern_SameRowPerm, these structures are not used. */ fact_t Fact; double *a; int_t *perm_r; /* row permutations from partial pivoting */ int_t *perm_c; /* column permutation vector */ int_t *etree; /* elimination tree */ int_t *colptr, *rowind; int_t colequ, Equil, factored, job, notran, rowequ; int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; int iam; int ldx; /* LDA for matrix X (global). */ char equed[1], norm[1]; double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; double *X, *b_col, *b_work, *x_col; double t; static mem_usage_t_Distributed num_mem_usage, symb_mem_usage; #if ( PRNTlevel>= 2 ) double dmin, dsum, dprod; #endif /* Test input parameters. */ *info = 0; Fact = options->Fact; if ( Fact < 0 || Fact > FACTORED ) *info = -1; else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) *info = -1; else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) *info = -1; else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) *info = -1; else if ( options->IterRefine == EXTRA ) { *info = -1; fprintf(stderr, "Extra precise iterative refinement yet to support."); } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < A->nrow ) *info = -5; else if ( nrhs < 0 ) *info = -6; if ( *info ) { i = -(*info); pxerbla("pdgssvx_ABglobal", grid, -*info); return; } /* Initialization */ factored = (Fact == FACTORED); Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); iam = grid->iam; job = 5; m = A->nrow; n = A->ncol; Astore = A->Store; nnz = Astore->nnz; a = Astore->nzval; colptr = Astore->colptr; rowind = Astore->rowind; if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { rowequ = (ScalePermstruct->DiagScale == ROW) || (ScalePermstruct->DiagScale == BOTH); colequ = (ScalePermstruct->DiagScale == COL) || (ScalePermstruct->DiagScale == BOTH); } else rowequ = colequ = FALSE; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgssvx_ABglobal()"); #endif perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; if ( Equil ) { /* Allocate storage if not done so before. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; } } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( Equil ) { #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter equil"); #endif t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm ) { /* Reuse R and C. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: break; case ROW: for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; a[i] *= R[irow]; /* Scale rows. */ } } break; case COL: for (j = 0; j < n; ++j) for (i = colptr[j]; i < colptr[j+1]; ++i) a[i] *= C[j]; /* Scale columns. */ break; case BOTH: for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; a[i] *= R[irow] * C[j]; /* Scale rows and columns. */ } } break; } } else { if ( !iam ) { /* Compute row and column scalings to equilibrate matrix A. */ dgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo); MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); if ( iinfo == 0 ) { MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); } else { if ( iinfo > 0 ) { if ( iinfo <= m ) fprintf(stderr, "The %d-th row of A is exactly zero\n", iinfo); else fprintf(stderr, "The %d-th column of A is exactly zero\n", iinfo-n); exit(-1); } } } else { MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); if ( iinfo == 0 ) { MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); } else { ABORT("DGSEQU failed\n"); } } /* Equilibrate matrix A. */ dlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed); if ( lsame_(equed, "R") ) { ScalePermstruct->DiagScale = rowequ = ROW; } else if ( lsame_(equed, "C") ) { ScalePermstruct->DiagScale = colequ = COL; } else if ( lsame_(equed, "B") ) { ScalePermstruct->DiagScale = BOTH; rowequ = ROW; colequ = COL; } else ScalePermstruct->DiagScale = NOEQUIL; #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. equilibrated? *equed = %c\n", *equed); /*fflush(stdout);*/ } #endif } /* if Fact ... */ stat->utime[EQUIL] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit equil"); #endif } /* if Equil ... */ /* ------------------------------------------------------------ Permute rows of A. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */ || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */ /* for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) {*/ for (i = 0; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; /* }*/ } } else if ( !factored ) { if ( job == 5 ) { /* Allocate storage for scaling factors. */ if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for R1[]"); if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for C1[]"); } if ( !iam ) { /* Process 0 finds a row permutation for large diagonal. */ dldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1); MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } else { MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } #if ( PRNTlevel>=2 ) dmin = dlamch_("Overflow"); dsum = 0.0; dprod = 1.0; #endif if ( job == 5 ) { if ( Equil ) { for (i = 0; i < n; ++i) { R1[i] = exp(R1[i]); C1[i] = exp(C1[i]); } for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */ rowind[i] = perm_r[irow]; #if ( PRNTlevel>=2 ) if ( rowind[i] == j ) /* New diagonal */ dprod *= fabs(a[i]); #endif } } /* Multiply together the scaling factors. */ if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; else for (i = 0; i < m; ++i) R[i] = R1[i]; if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; else for (i = 0; i < n; ++i) C[i] = C1[i]; ScalePermstruct->DiagScale = BOTH; rowequ = colequ = 1; } else { /* No equilibration. */ /* for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) {*/ for (i = colptr[0]; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } /* }*/ } SUPERLU_FREE (R1); SUPERLU_FREE (C1); } else { /* job = 2,3,4 */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; #if ( PRNTlevel>=2 ) if ( rowind[i] == j ) { /* New diagonal */ if ( job == 2 || job == 3 ) dmin = SUPERLU_MIN(dmin, fabs(a[i])); else if ( job == 4 ) dsum += fabs(a[i]); else if ( job == 5 ) dprod *= fabs(a[i]); } #endif } } } #if ( PRNTlevel>=2 ) if ( job == 2 || job == 3 ) { if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); } else if ( job == 4 ) { if ( !iam ) printf("\tsum of diagonal %e\n", dsum); } else if ( job == 5 ) { if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif } /* else !factored */ t = SuperLU_timer_() - t; stat->utime[ROWPERM] = t; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); #endif } /* if options->RowPerm ... */ if ( !factored || options->IterRefine ) { /* Compute norm(A), which will be used to adjust small diagonal. */ if ( notran ) *(unsigned char *)norm = '1'; else *(unsigned char *)norm = 'I'; anorm = dlangs_dist(norm, A); #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. anorm %e\n", anorm); #endif } /* ------------------------------------------------------------ Perform the LU factorization. ------------------------------------------------------------*/ if ( !factored ) { t = SuperLU_timer_(); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && Fact == DOFACT ) /* Use an ordering provided by SuperLU */ get_perm_c_dist(iam, permc_spec, A, perm_c); /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' (a.k.a. column etree), depending on the choice of ColPerm. Adjust perm_c[] to be consistent with a postorder of etree. Permute columns of A to form A*Pc'. */ sp_colorder(options, A, perm_c, etree, &AC); /* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */ ACstore = AC.Store; for (j = 0; j < n; ++j) for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) { irow = ACstore->rowind[i]; ACstore->rowind[i] = perm_c[irow]; } stat->utime[COLPERM] = SuperLU_timer_() - t; /* Perform a symbolic factorization on matrix A and set up the nonzero data structures which are suitable for supernodal GENP. */ if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); if ( !(Glu_freeable = (Glu_freeable_t *) SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) ABORT("Malloc fails for Glu_freeable."); iinfo = symbfact(iam, &AC, perm_c, etree, Glu_persist, Glu_freeable); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if ( iinfo < 0 ) { QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); #if ( PRNTlevel>=1 ) if ( !iam ) { printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); printf("\tint %d, short %d, float %d, double %d\n", sizeof(int_t), sizeof(short), sizeof(float), sizeof(double)); printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); } #endif } else { if ( !iam ) { fprintf(stderr, "symbfact() error returns %d\n", iinfo); exit(-1); } } } /* Distribute the L and U factors onto the process grid. */ t = SuperLU_timer_(); dist_mem_use = ddistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factor. */ if ( Fact != SamePattern_SameRowPerm ) { iinfo = symbfact_SubFree(Glu_freeable); SUPERLU_FREE(Glu_freeable); } /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); stat->utime[FACT] = SuperLU_timer_() - t; #if ( PRNTlevel>=1 ) { int_t TinyPivots; float for_lu, total, max, avg, temp; dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); MPI_Reduce( &num_mem_usage.for_lu, &for_lu, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &num_mem_usage.total, &total, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); temp = SUPERLU_MAX(symb_mem_usage.total, symb_mem_usage.for_lu + (float)dist_mem_use + num_mem_usage.for_lu); temp = SUPERLU_MAX(temp, num_mem_usage.total); MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, MPI_SUM, grid->comm ); stat->TinyPivots = TinyPivots; if ( !iam ) { printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", for_lu*1e-6, total*1e-6); printf("\tAll space (MB):" "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); } } #endif #if ( PRNTlevel>=2 ) if ( !iam ) printf(".. pdgstrf INFO = %d\n", *info); #endif } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ /* Permute columns of A to form A*Pc' using the existing perm_c. * NOTE: rows of A were previously permuted to Pc*A. */ sp_colorder(options, A, perm_c, NULL, &AC); } /* if !factored ... */ /* ------------------------------------------------------------ Compute the solution matrix X. ------------------------------------------------------------*/ if ( nrhs ) { if ( !(b_work = doubleMalloc_dist(n)) ) ABORT("Malloc fails for b_work[]"); /* ------------------------------------------------------------ Scale the right-hand side if equilibration was performed. ------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_col[i] *= R[i]; b_col += ldb; } } } else if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_col[i] *= C[i]; b_col += ldb; } } /* ------------------------------------------------------------ Permute the right-hand side to form Pr*B. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { if ( notran ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i]; for (i = 0; i < m; ++i) b_col[i] = b_work[i]; b_col += ldb; } } } /* ------------------------------------------------------------ Permute the right-hand side to form Pc*B. ------------------------------------------------------------*/ if ( notran ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i]; for (i = 0; i < m; ++i) b_col[i] = b_work[i]; b_col += ldb; } } /* Save a copy of the right-hand side. */ ldx = ldb; if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) ABORT("Malloc fails for X[]"); x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < ldb; ++i) x_col[i] = b_col[i]; x_col += ldx; b_col += ldb; } /* ------------------------------------------------------------ Solve the linear system. ------------------------------------------------------------*/ pdgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info); /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ if ( options->IterRefine ) { /* Improve the solution by iterative refinement. */ t = SuperLU_timer_(); pdgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb, X, ldx, nrhs, berr, stat, info); stat->utime[REFINE] = SuperLU_timer_() - t; } /* Permute the solution matrix X <= Pc'*X. */ for (j = 0; j < nrhs; j++) { b_col = &B[j*ldb]; x_col = &X[j*ldx]; for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]]; } /* Transform the solution matrix X to a solution of the original system before the equilibration. */ if ( notran ) { if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < n; ++i) b_col[i] *= C[i]; b_col += ldb; } } } else if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < n; ++i) b_col[i] *= R[i]; b_col += ldb; } } SUPERLU_FREE(b_work); SUPERLU_FREE(X); } /* end if nrhs != 0 */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); #endif /* Deallocate storage. */ if ( Equil && Fact != SamePattern_SameRowPerm ) { switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: SUPERLU_FREE(R); SUPERLU_FREE(C); break; case ROW: SUPERLU_FREE(C); break; case COL: SUPERLU_FREE(R); break; } } if ( !factored || (factored && options->IterRefine) ) Destroy_CompCol_Permuted_dist(&AC); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgssvx_ABglobal()"); #endif }
void dgsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree, char *equed, double *R, double *C, SuperMatrix *L, SuperMatrix *U, void *work, int lwork, SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, double *rcond, GlobalLU_t *Glu, mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info) { DNformat *Bstore, *Xstore; double *Bmat, *Xmat; int ldb, ldx, nrhs, n; SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, nofact, notran, rowequ, permc_spec, mc64; trans_t trant; char norm[1]; int i, j, info1; double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int relax, panel_size; double diag_pivot_thresh; double t0; /* temporary time */ double *utime; int *perm = NULL; /* permutation returned from MC64 */ /* External functions */ extern double dlangs(char *, SuperMatrix *); Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; n = B->nrow; *info = 0; nofact = (options->Fact != FACTORED); equil = (options->Equil == YES); notran = (options->Trans == NOTRANS); mc64 = (options->RowPerm == LargeDiag); if ( nofact ) { *(unsigned char *)equed = 'N'; rowequ = FALSE; colequ = FALSE; } else { rowequ = strncmp(equed, "R", 1)==0 || strncmp(equed, "B", 1)==0; colequ = strncmp(equed, "C", 1)==0 || strncmp(equed, "B", 1)==0; smlnum = dmach("Safe minimum"); /* lamch_("Safe minimum"); */ bignum = 1. / smlnum; } /* Test the input parameters */ if (options->Fact != DOFACT && options->Fact != SamePattern && options->Fact != SamePattern_SameRowPerm && options->Fact != FACTORED && options->Trans != NOTRANS && options->Trans != TRANS && options->Trans != CONJ && options->Equil != NO && options->Equil != YES) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( options->Fact == FACTORED && !(rowequ || colequ || strncmp(equed, "N", 1)==0) ) *info = -6; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, R[j]); rcmax = SUPERLU_MAX(rcmax, R[j]); } if (rcmin <= 0.) *info = -7; else if ( A->nrow > 0) rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, C[j]); rcmax = SUPERLU_MAX(rcmax, C[j]); } if (rcmin <= 0.) *info = -8; else if (A->nrow > 0) colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( lwork < -1 ) *info = -12; else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -13; else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || (B->ncol != 0 && B->ncol != X->ncol) || X->Stype != SLU_DN || X->Dtype != SLU_D || X->Mtype != SLU_GE ) *info = -14; } } if (*info != 0) { i = -(*info); input_error("dgsisx", &i); return; } /* Initialization for factor parameters */ panel_size = sp_ienv(1); relax = sp_ienv(2); diag_pivot_thresh = options->DiagPivotThresh; utime = stat->utime; /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ trant = TRANS; notran = 0; } else { trant = NOTRANS; notran = 1; } } else { /* A->Stype == SLU_NC */ trant = options->Trans; AA = A; } if ( nofact ) { register int i, j; NCformat *Astore = AA->Store; int nnz = Astore->nnz; int *colptr = Astore->colptr; int *rowind = Astore->rowind; double *nzval = (double *)Astore->nzval; if ( mc64 ) { t0 = SuperLU_timer_(); if ((perm = intMalloc(n)) == NULL) ABORT("SUPERLU_MALLOC fails for perm[]"); info1 = dldperm(5, n, nnz, colptr, rowind, nzval, perm, R, C); if (info1 != 0) { /* MC64 fails, call dgsequ() later */ mc64 = 0; SUPERLU_FREE(perm); perm = NULL; } else { if ( equil ) { rowequ = colequ = 1; for (i = 0; i < n; i++) { R[i] = exp(R[i]); C[i] = exp(C[i]); } /* scale the matrix */ for (j = 0; j < n; j++) { for (i = colptr[j]; i < colptr[j + 1]; i++) { nzval[i] *= R[rowind[i]] * C[j]; } } *equed = 'B'; } /* permute the matrix */ for (j = 0; j < n; j++) { for (i = colptr[j]; i < colptr[j + 1]; i++) { /*nzval[i] *= R[rowind[i]] * C[j];*/ rowind[i] = perm[rowind[i]]; } } } utime[EQUIL] = SuperLU_timer_() - t0; } if ( !mc64 & equil ) { /* Only perform equilibration, no row perm */ t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. */ dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. */ dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = strncmp(equed, "R", 1)==0 || strncmp(equed, "B", 1)==0; colequ = strncmp(equed, "C", 1)==0 || strncmp(equed, "B", 1)==0; } utime[EQUIL] = SuperLU_timer_() - t0; } } if ( nofact ) { t0 = SuperLU_timer_(); /* * Gnet column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) get_perm_c(permc_spec, AA, perm_c); utime[COLPERM] = SuperLU_timer_() - t0; t0 = SuperLU_timer_(); sp_preorder(options, AA, perm_c, etree, &AC); utime[ETREE] = SuperLU_timer_() - t0; /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); dgsitrf(options, &AC, relax, panel_size, etree, work, lwork, perm_c, perm_r, L, U, Glu, stat, info); utime[FACT] = SuperLU_timer_() - t0; if ( lwork == -1 ) { mem_usage->total_needed = *info - A->ncol; return; } if ( mc64 ) { /* Fold MC64's perm[] into perm_r[]. */ NCformat *Astore = AA->Store; int nnz = Astore->nnz, *rowind = Astore->rowind; int *perm_tmp, *iperm; if ((perm_tmp = intMalloc(2*n)) == NULL) ABORT("SUPERLU_MALLOC fails for perm_tmp[]"); iperm = perm_tmp + n; for (i = 0; i < n; ++i) perm_tmp[i] = perm_r[perm[i]]; for (i = 0; i < n; ++i) { perm_r[i] = perm_tmp[i]; iperm[perm[i]] = i; } /* Restore A's original row indices. */ for (i = 0; i < nnz; ++i) rowind[i] = iperm[rowind[i]]; SUPERLU_FREE(perm); /* MC64 permutation */ SUPERLU_FREE(perm_tmp); } } if ( options->PivotGrowth ) { if ( *info > 0 ) return; /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U); } if ( options->ConditionNumber ) { /* Estimate the reciprocal of the condition number of A. */ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = dlangs(norm, AA); dgscon(norm, L, U, anorm, rcond, stat, &info1); utime[RCOND] = SuperLU_timer_() - t0; } if ( nrhs > 0 ) { /* Solve the system */ double *rhs_work; /* Scale and permute the right-hand side if equilibration and permutation from MC64 were performed. */ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) Bmat[i + j*ldb] *= R[i]; } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) { Bmat[i + j*ldb] *= C[i]; } } /* Compute the solution matrix X. */ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); dgstrs (trant, L, U, perm_c, perm_r, X, stat, &info1); utime[SOLVE] = SuperLU_timer_() - t0; /* Transform the solution matrix X to a solution of the original system. */ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) { Xmat[i + j*ldx] *= C[i]; } } } else { /* transposed system */ if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { Xmat[i + j*ldx] *= R[i]; } } } } /* end if nrhs > 0 */ if ( options->ConditionNumber ) { /* The matrix is singular to working precision. */ /* if ( *rcond < dlamch_("E") && *info == 0) *info = A->ncol + 1; */ if ( *rcond < dmach("E") && *info == 0) *info = A->ncol + 1; } if ( nofact ) { ilu_dQuerySpace(L, U, mem_usage); Destroy_CompCol_Permuted(&AC); } if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } }
/*! \brief * * <pre> * Purpose * ======= * * pdgssvx_ABglobal solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * * Static pivoting is a technique that combines the numerical stability * of partial pivoting with the scalability of Cholesky (no pivoting), * to run accurately and efficiently on large numbers of processors. * * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed * description of the parallel algorithms. * * Here are the options for using this code: * * 1. Independent of all the other options specified below, the * user must supply * * - B, the matrix of right hand sides, and its dimensions ldb and nrhs * - grid, a structure describing the 2D processor mesh * - options->IterRefine, which determines whether or not to * improve the accuracy of the computed solution using * iterative refinement * * On output, B is overwritten with the solution X. * * 2. Depending on options->Fact, the user has several options * for solving A*X=B. The standard option is for factoring * A "from scratch". (The other options, described below, * are used when A is sufficiently similar to a previously * solved problem to save time by reusing part or all of * the previous factorization.) * * - options->Fact = DOFACT: A is factored "from scratch" * * In this case the user must also supply * * - A, the input matrix * * as well as the following options, which are described in more * detail below: * * - options->Equil, to specify how to scale the rows and columns * of A to "equilibrate" it (to try to reduce its * condition number and so improve the * accuracy of the computed solution) * * - options->RowPerm, to specify how to permute the rows of A * (typically to control numerical stability) * * - options->ColPerm, to specify how to permute the columns of A * (typically to control fill-in and enhance * parallelism during factorization) * * - options->ReplaceTinyPivot, to specify how to deal with tiny * pivots encountered during factorization * (to control numerical stability) * * The outputs returned include * * - ScalePermstruct, modified to describe how the input matrix A * was equilibrated and permuted: * - ScalePermstruct->DiagScale, indicates whether the rows and/or * columns of A were scaled * - ScalePermstruct->R, array of row scale factors * - ScalePermstruct->C, array of column scale factors * - ScalePermstruct->perm_r, row permutation vector * - ScalePermstruct->perm_c, column permutation vector * * (part of ScalePermstruct may also need to be supplied on input, * depending on options->RowPerm and options->ColPerm as described * later). * * - A, the input matrix A overwritten by the scaled and permuted matrix * Pc*Pr*diag(R)*A*diag(C) * where * Pr and Pc are row and columns permutation matrices determined * by ScalePermstruct->perm_r and ScalePermstruct->perm_c, * respectively, and * diag(R) and diag(C) are diagonal scaling matrices determined * by ScalePermstruct->DiagScale, ScalePermstruct->R and * ScalePermstruct->C * * - LUstruct, which contains the L and U factorization of A1 where * * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * (Note that A1 = Aout * Pc^T, where Aout is the matrix stored * in A on output.) * * 3. The second value of options->Fact assumes that a matrix with the same * sparsity pattern as A has already been factored: * * - options->Fact = SamePattern: A is factored, assuming that it has * the same nonzero pattern as a previously factored matrix. In this * case the algorithm saves time by reusing the previously computed * column permutation vector stored in ScalePermstruct->perm_c * and the "elimination tree" of A stored in LUstruct->etree. * * In this case the user must still specify the following options * as before: * * - options->Equil * - options->RowPerm * - options->ReplaceTinyPivot * * but not options->ColPerm, whose value is ignored. This is because the * previous column permutation from ScalePermstruct->perm_c is used as * input. The user must also supply * * - A, the input matrix * - ScalePermstruct->perm_c, the column permutation * - LUstruct->etree, the elimination tree * * The outputs returned include * * - A, the input matrix A overwritten by the scaled and permuted matrix * as described above * - ScalePermstruct, modified to describe how the input matrix A was * equilibrated and row permuted * - LUstruct, modified to contain the new L and U factors * * 4. The third value of options->Fact assumes that a matrix B with the same * sparsity pattern as A has already been factored, and where the * row permutation of B can be reused for A. This is useful when A and B * have similar numerical values, so that the same row permutation * will make both factorizations numerically stable. This lets us reuse * all of the previously computed structure of L and U. * * - options->Fact = SamePattern_SameRowPerm: A is factored, * assuming not only the same nonzero pattern as the previously * factored matrix B, but reusing B's row permutation. * * In this case the user must still specify the following options * as before: * * - options->Equil * - options->ReplaceTinyPivot * * but not options->RowPerm or options->ColPerm, whose values are ignored. * This is because the permutations from ScalePermstruct->perm_r and * ScalePermstruct->perm_c are used as input. * * The user must also supply * * - A, the input matrix * - ScalePermstruct->DiagScale, how the previous matrix was row and/or * column scaled * - ScalePermstruct->R, the row scalings of the previous matrix, if any * - ScalePermstruct->C, the columns scalings of the previous matrix, * if any * - ScalePermstruct->perm_r, the row permutation of the previous matrix * - ScalePermstruct->perm_c, the column permutation of the previous * matrix * - all of LUstruct, the previously computed information about L and U * (the actual numerical values of L and U stored in * LUstruct->Llu are ignored) * * The outputs returned include * * - A, the input matrix A overwritten by the scaled and permuted matrix * as described above * - ScalePermstruct, modified to describe how the input matrix A was * equilibrated * (thus ScalePermstruct->DiagScale, R and C may be modified) * - LUstruct, modified to contain the new L and U factors * * 5. The fourth and last value of options->Fact assumes that A is * identical to a matrix that has already been factored on a previous * call, and reuses its entire LU factorization * * - options->Fact = Factored: A is identical to a previously * factorized matrix, so the entire previous factorization * can be reused. * * In this case all the other options mentioned above are ignored * (options->Equil, options->RowPerm, options->ColPerm, * options->ReplaceTinyPivot) * * The user must also supply * * - A, the unfactored matrix, only in the case that iterative refinment * is to be done (specifically A must be the output A from * the previous call, so that it has been scaled and permuted) * - all of ScalePermstruct * - all of LUstruct, including the actual numerical values of L and U * * all of which are unmodified on output. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following fields should be defined for this structure: * * o Fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, how the matrix A should * be factorized based on the previous history. * * = DOFACT: The matrix A will be factorized from scratch. * Inputs: A * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * all of ScalePermstruct * all of LUstruct * * = SamePattern: the matrix A will be factorized assuming * that a factorization of a matrix with the same sparsity * pattern was performed prior to this one. Therefore, this * factorization will reuse column permutation vector * ScalePermstruct->perm_c and the elimination tree * LUstruct->etree * Inputs: A * options->Equil, RowPerm, ReplaceTinyPivot * ScalePermstruct->perm_c * LUstruct->etree * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * rest of ScalePermstruct (DiagScale, R, C, perm_r) * rest of LUstruct (GLU_persist, Llu) * * = SamePattern_SameRowPerm: the matrix A will be factorized * assuming that a factorization of a matrix with the same * sparsity pattern and similar numerical values was performed * prior to this one. Therefore, this factorization will reuse * both row and column scaling factors R and C, and the * both row and column permutation vectors perm_r and perm_c, * distributed data structure set up from the previous symbolic * factorization. * Inputs: A * options->Equil, ReplaceTinyPivot * all of ScalePermstruct * all of LUstruct * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * modified LUstruct->Llu * = FACTORED: the matrix A is already factored. * Inputs: all of ScalePermstruct * all of LUstruct * * o Equil (yes_no_t) * Specifies whether to equilibrate the system. * = NO: no equilibration. * = YES: scaling factors are computed to equilibrate the system: * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. * Whether or not the system will be equilibrated depends * on the scaling of the matrix A, but if equilibration is * used, A is overwritten by diag(R)*A*diag(C) and B by * diag(R)*B. * * o RowPerm (rowperm_t) * Specifies how to permute rows of the matrix A. * = NATURAL: use the natural ordering. * = LargeDiag: use the Duff/Koster algorithm to permute rows of * the original matrix to make the diagonal large * relative to the off-diagonal. * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r * input by the user. * * o ColPerm (colperm_t) * Specifies what type of column permutation to use to reduce fill. * = NATURAL: natural ordering. * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. * = MMD_ATA: minimum degree ordering on structure of A'*A. * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. * * o ReplaceTinyPivot (yes_no_t) * = NO: do not modify pivots * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during * LU factorization. * * o IterRefine (IterRefine_t) * Specifies how to perform iterative refinement. * = NO: no iterative refinement. * = DOUBLE: accumulate residual in double precision. * = EXTRA: accumulate residual in extra precision. * * NOTE: all options must be indentical on all processes when * calling this routine. * * A (input/output) SuperMatrix* * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. That is, A is stored in * compressed column format (also known as Harwell-Boeing format). * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization * routine pdgstrf can factorize rectangular matrices. * On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C), * depending on ScalePermstruct->DiagScale, options->RowPerm and * options->colpem: * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by * diag(R)*A*diag(C). * if options->RowPerm != NATURAL, A is further overwritten by * Pr*diag(R)*A*diag(C). * if options->ColPerm != NATURAL, A is further overwritten by * Pc*Pr*diag(R)*A*diag(C). * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. * * NOTE: Currently, A must reside in all processes when calling * this routine. * * ScalePermstruct (input/output) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * It contains the following fields: * * o DiagScale (DiagScale_t) * Specifies the form of equilibration that was done. * = NOEQUIL: no equilibration. * = ROW: row equilibration, i.e., A was premultiplied by * diag(R). * = COL: Column equilibration, i.e., A was postmultiplied * by diag(C). * = BOTH: both row and column equilibration, i.e., A was * replaced by diag(R)*A*diag(C). * If options->Fact = FACTORED or SamePattern_SameRowPerm, * DiagScale is an input argument; otherwise it is an output * argument. * * o perm_r (int*) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * If options->RowPerm = MY_PERMR, or * options->Fact = SamePattern_SameRowPerm, perm_r is an * input argument; otherwise it is an output argument. * * o perm_c (int*) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * If options->ColPerm = MY_PERMC or options->Fact = SamePattern * or options->Fact = SamePattern_SameRowPerm, perm_c is an * input argument; otherwise, it is an output argument. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * * o R (double*) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). * If DiagScale = NOEQUIL or COL, R is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * * o C (double*) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). * If DiagScale = NOEQUIL or ROW, C is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is * an input argument; otherwise, C is an output argument. * * B (input/output) double* * On entry, the right-hand side matrix of dimension (A->nrow, nrhs). * On exit, the solution matrix if info = 0; * * NOTE: Currently, B must reside in all processes when calling * this routine. * * ldb (input) int (global) * The leading dimension of matrix B. * * nrhs (input) int (global) * The number of right-hand sides. * If nrhs = 0, only LU decomposition is performed, the forward * and back substitutions are skipped. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * It contains the following fields: * * o etree (int*) dimension (A->ncol) * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol. * It is computed in sp_colorder() during the first factorization, * and is reused in the subsequent factorizations of the matrices * with the same nonzero pattern. * On exit of sp_colorder(), the columns of A are permuted so that * the etree is in a certain postorder. This postorder is reflected * in ScalePermstruct->perm_c. * NOTE: * Etree is a vector of parent pointers for a forest whose vertices * are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * o Glu_persist (Glu_persist_t*) * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (LocalLU_t*) * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * berr (output) double*, dimension (nrhs) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * so the solution could not be computed. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * * * See superlu_ddefs.h for the definitions of various data types. * </pre> */ void pdgssvx_ABglobal(superlu_options_t *options, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, double B[], int ldb, int nrhs, gridinfo_t *grid, LUstruct_t *LUstruct, double *berr, SuperLUStat_t *stat, int *info) { SuperMatrix AC; NCformat *Astore; NCPformat *ACstore; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t *Glu_freeable; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of supernodes in L. (usub, xusub) contains the compressed subscript of nonzero segments in U. If options->Fact != SamePattern_SameRowPerm, they are computed by SYMBFACT routine, and then used by DDISTRIBUTE routine. They will be freed after DDISTRIBUTE routine. If options->Fact == SamePattern_SameRowPerm, these structures are not used. */ fact_t Fact; double *a; int_t *perm_r; /* row permutations from partial pivoting */ int_t *perm_c; /* column permutation vector */ int_t *etree; /* elimination tree */ int_t *colptr, *rowind; int_t colequ, Equil, factored, job, notran, rowequ; int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; int iam; int ldx; /* LDA for matrix X (global). */ char equed[1], norm[1]; double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; double *X, *b_col, *b_work, *x_col; double t; static mem_usage_t num_mem_usage, symb_mem_usage; #if ( PRNTlevel>= 2 ) double dmin, dsum, dprod; #endif /* Test input parameters. */ *info = 0; Fact = options->Fact; if ( Fact < 0 || Fact > FACTORED ) *info = -1; else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) *info = -1; else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) *info = -1; else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) *info = -1; else if ( options->IterRefine == EXTRA ) { *info = -1; fprintf(stderr, "Extra precise iterative refinement yet to support."); } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < A->nrow ) *info = -5; else if ( nrhs < 0 ) *info = -6; if ( *info ) { i = -(*info); pxerbla("pdgssvx_ABglobal", grid, -*info); return; } /* Initialization */ factored = (Fact == FACTORED); Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); iam = grid->iam; job = 5; m = A->nrow; n = A->ncol; Astore = A->Store; nnz = Astore->nnz; a = Astore->nzval; colptr = Astore->colptr; rowind = Astore->rowind; if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { rowequ = (ScalePermstruct->DiagScale == ROW) || (ScalePermstruct->DiagScale == BOTH); colequ = (ScalePermstruct->DiagScale == COL) || (ScalePermstruct->DiagScale == BOTH); } else rowequ = colequ = FALSE; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgssvx_ABglobal()"); #endif perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; if ( Equil && Fact != SamePattern_SameRowPerm ) { /* Allocate storage if not done so before. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; } } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( Equil ) { #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter equil"); #endif t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm ) { /* Reuse R and C. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: break; case ROW: for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; a[i] *= R[irow]; /* Scale rows. */ } } break; case COL: for (j = 0; j < n; ++j) for (i = colptr[j]; i < colptr[j+1]; ++i) a[i] *= C[j]; /* Scale columns. */ break; case BOTH: for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; a[i] *= R[irow] * C[j]; /* Scale rows and columns. */ } } break; } } else { if ( !iam ) { /* Compute row and column scalings to equilibrate matrix A. */ dgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo); MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); if ( iinfo == 0 ) { MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); } else { if ( iinfo > 0 ) { if ( iinfo <= m ) fprintf(stderr, "The %d-th row of A is exactly zero\n", iinfo); else fprintf(stderr, "The %d-th column of A is exactly zero\n", iinfo-n); exit(-1); } } } else { MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); if ( iinfo == 0 ) { MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); } else { ABORT("DGSEQU failed\n"); } } /* Equilibrate matrix A. */ dlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed); if ( lsame_(equed, "R") ) { ScalePermstruct->DiagScale = rowequ = ROW; } else if ( lsame_(equed, "C") ) { ScalePermstruct->DiagScale = colequ = COL; } else if ( lsame_(equed, "B") ) { ScalePermstruct->DiagScale = BOTH; rowequ = ROW; colequ = COL; } else ScalePermstruct->DiagScale = NOEQUIL; #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. equilibrated? *equed = %c\n", *equed); /*fflush(stdout);*/ } #endif } /* if Fact ... */ stat->utime[EQUIL] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit equil"); #endif } /* end if Equil ... */ /* ------------------------------------------------------------ Permute rows of A. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */ || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */ for (i = 0; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } else if ( !factored ) { if ( job == 5 ) { /* Allocate storage for scaling factors. */ if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for R1[]"); if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for C1[]"); } if ( !iam ) { /* Process 0 finds a row permutation for large diagonal. */ dldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1); MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } else { MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } #if ( PRNTlevel>=2 ) dmin = dlamch_("Overflow"); dsum = 0.0; dprod = 1.0; #endif if ( job == 5 ) { if ( Equil ) { for (i = 0; i < n; ++i) { R1[i] = exp(R1[i]); C1[i] = exp(C1[i]); } for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */ rowind[i] = perm_r[irow]; #if ( PRNTlevel>=2 ) if ( rowind[i] == j ) /* New diagonal */ dprod *= fabs(a[i]); #endif } } /* Multiply together the scaling factors. */ if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; else for (i = 0; i < m; ++i) R[i] = R1[i]; if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; else for (i = 0; i < n; ++i) C[i] = C1[i]; ScalePermstruct->DiagScale = BOTH; rowequ = colequ = 1; } else { /* No equilibration. */ for (i = colptr[0]; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } SUPERLU_FREE (R1); SUPERLU_FREE (C1); } else { /* job = 2,3,4 */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; #if ( PRNTlevel>=2 ) if ( rowind[i] == j ) { /* New diagonal */ if ( job == 2 || job == 3 ) dmin = SUPERLU_MIN(dmin, fabs(a[i])); else if ( job == 4 ) dsum += fabs(a[i]); else if ( job == 5 ) dprod *= fabs(a[i]); } #endif } } } #if ( PRNTlevel>=2 ) if ( job == 2 || job == 3 ) { if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); } else if ( job == 4 ) { if ( !iam ) printf("\tsum of diagonal %e\n", dsum); } else if ( job == 5 ) { if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif } /* else !factored */ t = SuperLU_timer_() - t; stat->utime[ROWPERM] = t; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); #endif } else { /* options->RowPerm == NOROWPERM */ for (i = 0; i < m; ++i) perm_r[i] = i; } if ( !factored || options->IterRefine ) { /* Compute norm(A), which will be used to adjust small diagonal. */ if ( notran ) *(unsigned char *)norm = '1'; else *(unsigned char *)norm = 'I'; anorm = dlangs_dist(norm, A); #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. anorm %e\n", anorm); #endif } /* ------------------------------------------------------------ Perform the LU factorization. ------------------------------------------------------------*/ if ( !factored ) { t = SuperLU_timer_(); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && Fact == DOFACT ) /* Use an ordering provided by SuperLU */ get_perm_c_dist(iam, permc_spec, A, perm_c); /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' (a.k.a. column etree), depending on the choice of ColPerm. Adjust perm_c[] to be consistent with a postorder of etree. Permute columns of A to form A*Pc'. */ sp_colorder(options, A, perm_c, etree, &AC); /* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */ ACstore = AC.Store; for (j = 0; j < n; ++j) for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) { irow = ACstore->rowind[i]; ACstore->rowind[i] = perm_c[irow]; } stat->utime[COLPERM] = SuperLU_timer_() - t; /* Perform a symbolic factorization on matrix A and set up the nonzero data structures which are suitable for supernodal GENP. */ if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); if ( !(Glu_freeable = (Glu_freeable_t *) SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) ABORT("Malloc fails for Glu_freeable."); iinfo = symbfact(options, iam, &AC, perm_c, etree, Glu_persist, Glu_freeable); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if ( iinfo < 0 ) { QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); #if ( PRNTlevel>=1 ) if ( !iam ) { printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); printf("\tint %d, short %d, float %d, double %d\n", sizeof(int_t), sizeof(short), sizeof(float), sizeof(double)); printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); } #endif } else { if ( !iam ) { fprintf(stderr, "symbfact() error returns %d\n", iinfo); exit(-1); } } } /* Distribute the L and U factors onto the process grid. */ t = SuperLU_timer_(); dist_mem_use = ddistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factor. */ if ( Fact != SamePattern_SameRowPerm ) { iinfo = symbfact_SubFree(Glu_freeable); SUPERLU_FREE(Glu_freeable); } /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); stat->utime[FACT] = SuperLU_timer_() - t; #if ( PRNTlevel>=1 ) { int_t TinyPivots; float for_lu, total, max, avg, temp; dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); MPI_Reduce( &num_mem_usage.for_lu, &for_lu, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &num_mem_usage.total, &total, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); temp = SUPERLU_MAX(symb_mem_usage.total, symb_mem_usage.for_lu + (float)dist_mem_use + num_mem_usage.for_lu); temp = SUPERLU_MAX(temp, num_mem_usage.total); MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, MPI_SUM, grid->comm ); stat->TinyPivots = TinyPivots; if ( !iam ) { printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", for_lu*1e-6, total*1e-6); printf("\tAll space (MB):" "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); } } #endif #if ( PRNTlevel>=2 ) if ( !iam ) printf(".. pdgstrf INFO = %d\n", *info); #endif } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ /* Permute columns of A to form A*Pc' using the existing perm_c. * NOTE: rows of A were previously permuted to Pc*A. */ sp_colorder(options, A, perm_c, NULL, &AC); } /* if !factored ... */ /* ------------------------------------------------------------ Compute the solution matrix X. ------------------------------------------------------------*/ if ( nrhs ) { if ( !(b_work = doubleMalloc_dist(n)) ) ABORT("Malloc fails for b_work[]"); /* ------------------------------------------------------------ Scale the right-hand side if equilibration was performed. ------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_col[i] *= R[i]; b_col += ldb; } } } else if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_col[i] *= C[i]; b_col += ldb; } } /* ------------------------------------------------------------ Permute the right-hand side to form Pr*B. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { if ( notran ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i]; for (i = 0; i < m; ++i) b_col[i] = b_work[i]; b_col += ldb; } } } /* ------------------------------------------------------------ Permute the right-hand side to form Pc*B. ------------------------------------------------------------*/ if ( notran ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i]; for (i = 0; i < m; ++i) b_col[i] = b_work[i]; b_col += ldb; } } /* Save a copy of the right-hand side. */ ldx = ldb; if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) ABORT("Malloc fails for X[]"); x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < ldb; ++i) x_col[i] = b_col[i]; x_col += ldx; b_col += ldb; } /* ------------------------------------------------------------ Solve the linear system. ------------------------------------------------------------*/ pdgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info); /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ if ( options->IterRefine ) { /* Improve the solution by iterative refinement. */ t = SuperLU_timer_(); pdgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb, X, ldx, nrhs, berr, stat, info); stat->utime[REFINE] = SuperLU_timer_() - t; } /* Permute the solution matrix X <= Pc'*X. */ for (j = 0; j < nrhs; j++) { b_col = &B[j*ldb]; x_col = &X[j*ldx]; for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]]; } /* Transform the solution matrix X to a solution of the original system before the equilibration. */ if ( notran ) { if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < n; ++i) b_col[i] *= C[i]; b_col += ldb; } } } else if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < n; ++i) b_col[i] *= R[i]; b_col += ldb; } } SUPERLU_FREE(b_work); SUPERLU_FREE(X); } /* end if nrhs != 0 */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); #endif /* Deallocate R and/or C if it is not used. */ if ( Equil && Fact != SamePattern_SameRowPerm ) { switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: SUPERLU_FREE(R); SUPERLU_FREE(C); break; case ROW: SUPERLU_FREE(C); break; case COL: SUPERLU_FREE(R); break; } } if ( !factored || (factored && options->IterRefine) ) Destroy_CompCol_Permuted_dist(&AC); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgssvx_ABglobal()"); #endif }