/*! \brief Initialize the data structure for the solution phase. */ int zSolveInit(superlu_options_t *options, SuperMatrix *A, int_t perm_r[], int_t perm_c[], int_t nrhs, LUstruct_t *LUstruct, gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { int_t *row_to_proc, *inv_perm_c, *itemp; NRformat_loc *Astore; int_t i, fst_row, m_loc, p; int procs; /* prototypes */ extern int_t pxgstrs_init(int_t, int_t, int_t, int_t, int_t [], int_t [], gridinfo_t *grid, Glu_persist_t *, SOLVEstruct_t *); Astore = (NRformat_loc *) A->Store; fst_row = Astore->fst_row; m_loc = Astore->m_loc; procs = grid->nprow * grid->npcol; if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) ABORT("Malloc fails for row_to_proc[]"); SOLVEstruct->row_to_proc = row_to_proc; if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) ABORT("Malloc fails for inv_perm_c[]."); for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; SOLVEstruct->inv_perm_c = inv_perm_c; /* ------------------------------------------------------------ EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. NOTE: For those processes that do not own any row, it must must be set so that fst_row == A->nrow. ------------------------------------------------------------*/ if ( !(itemp = intMalloc_dist(procs+1)) ) ABORT("Malloc fails for itemp[]"); MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); itemp[procs] = A->nrow; for (p = 0; p < procs; ++p) { for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; } #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("fst_row = %d\n", fst_row); PrintInt10("row_to_proc", A->nrow, row_to_proc); PrintInt10("inv_perm_c", A->ncol, inv_perm_c); } #endif SUPERLU_FREE(itemp); #if 0 /* Compute the mapping between rows and processes. */ /* XSL NOTE: What happens if # of mapped processes is smaller than total Procs? For the processes without any row, let fst_row be EMPTY (-1). Make sure this case works! */ MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); itemp[procs] = n; for (p = 0; p < procs; ++p) { j = itemp[p]; if ( j != EMPTY ) { k = itemp[p+1]; if ( k == EMPTY ) k = n; for (i = j ; i < k; ++i) row_to_proc[i] = p; } } #endif get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, &SOLVEstruct->num_diag_procs, &SOLVEstruct->diag_procs, &SOLVEstruct->diag_len); if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, LUstruct->Glu_persist, SOLVEstruct); if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *) SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) ) ABORT("Malloc fails for gsmv_comm[]"); SOLVEstruct->A_colind_gsmv = NULL; options->SolveInitialized = YES; return 0; } /* zSolveInit */
void pdgssvx(superlu_options_t *options, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, double B[], int ldb, int nrhs, gridinfo_t *grid, LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr, SuperLUStat_t *stat, int *info) { /* * -- Distributed SuperLU routine (version 2.2) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * November 1, 2007 * Feburary 20, 2008 * * * Purpose * ======= * * PDGSSVX solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * * Static pivoting is a technique that combines the numerical stability * of partial pivoting with the scalability of Cholesky (no pivoting), * to run accurately and efficiently on large numbers of processors. * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed * description of the parallel algorithms. * * The input matrices A and B are distributed by block rows. * Here is a graphical illustration (0-based indexing): * * A B * 0 --------------- ------ * | | | | * | | P0 | | * | | | | * --------------- ------ * - fst_row->| | | | * | | | | | * m_loc | | P1 | | * | | | | | * - | | | | * --------------- ------ * | . | |. | * | . | |. | * | . | |. | * --------------- ------ * * where, fst_row is the row number of the first row, * m_loc is the number of rows local to this processor * These are defined in the 'SuperMatrix' structure, see supermatrix.h. * * * Here are the options for using this code: * * 1. Independent of all the other options specified below, the * user must supply * * - B, the matrix of right-hand sides, distributed by block rows, * and its dimensions ldb (local) and nrhs (global) * - grid, a structure describing the 2D processor mesh * - options->IterRefine, which determines whether or not to * improve the accuracy of the computed solution using * iterative refinement * * On output, B is overwritten with the solution X. * * 2. Depending on options->Fact, the user has four options * for solving A*X=B. The standard option is for factoring * A "from scratch". (The other options, described below, * are used when A is sufficiently similar to a previously * solved problem to save time by reusing part or all of * the previous factorization.) * * - options->Fact = DOFACT: A is factored "from scratch" * * In this case the user must also supply * * o A, the input matrix * * as well as the following options to determine what matrix to * factorize. * * o options->Equil, to specify how to scale the rows and columns * of A to "equilibrate" it (to try to reduce its * condition number and so improve the * accuracy of the computed solution) * * o options->RowPerm, to specify how to permute the rows of A * (typically to control numerical stability) * * o options->ColPerm, to specify how to permute the columns of A * (typically to control fill-in and enhance * parallelism during factorization) * * o options->ReplaceTinyPivot, to specify how to deal with tiny * pivots encountered during factorization * (to control numerical stability) * * The outputs returned include * * o ScalePermstruct, modified to describe how the input matrix A * was equilibrated and permuted: * . ScalePermstruct->DiagScale, indicates whether the rows and/or * columns of A were scaled * . ScalePermstruct->R, array of row scale factors * . ScalePermstruct->C, array of column scale factors * . ScalePermstruct->perm_r, row permutation vector * . ScalePermstruct->perm_c, column permutation vector * * (part of ScalePermstruct may also need to be supplied on input, * depending on options->RowPerm and options->ColPerm as described * later). * * o A, the input matrix A overwritten by the scaled and permuted * matrix diag(R)*A*diag(C)*Pc^T, where * Pc is the row permutation matrix determined by * ScalePermstruct->perm_c * diag(R) and diag(C) are diagonal scaling matrices determined * by ScalePermstruct->DiagScale, ScalePermstruct->R and * ScalePermstruct->C * * o LUstruct, which contains the L and U factorization of A1 where * * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored * in A on output.) * * 3. The second value of options->Fact assumes that a matrix with the same * sparsity pattern as A has already been factored: * * - options->Fact = SamePattern: A is factored, assuming that it has * the same nonzero pattern as a previously factored matrix. In * this case the algorithm saves time by reusing the previously * computed column permutation vector stored in * ScalePermstruct->perm_c and the "elimination tree" of A * stored in LUstruct->etree * * In this case the user must still specify the following options * as before: * * o options->Equil * o options->RowPerm * o options->ReplaceTinyPivot * * but not options->ColPerm, whose value is ignored. This is because the * previous column permutation from ScalePermstruct->perm_c is used as * input. The user must also supply * * o A, the input matrix * o ScalePermstruct->perm_c, the column permutation * o LUstruct->etree, the elimination tree * * The outputs returned include * * o A, the input matrix A overwritten by the scaled and permuted * matrix as described above * o ScalePermstruct, modified to describe how the input matrix A was * equilibrated and row permuted * o LUstruct, modified to contain the new L and U factors * * 4. The third value of options->Fact assumes that a matrix B with the same * sparsity pattern as A has already been factored, and where the * row permutation of B can be reused for A. This is useful when A and B * have similar numerical values, so that the same row permutation * will make both factorizations numerically stable. This lets us reuse * all of the previously computed structure of L and U. * * - options->Fact = SamePattern_SameRowPerm: A is factored, * assuming not only the same nonzero pattern as the previously * factored matrix B, but reusing B's row permutation. * * In this case the user must still specify the following options * as before: * * o options->Equil * o options->ReplaceTinyPivot * * but not options->RowPerm or options->ColPerm, whose values are * ignored. This is because the permutations from ScalePermstruct->perm_r * and ScalePermstruct->perm_c are used as input. * * The user must also supply * * o A, the input matrix * o ScalePermstruct->DiagScale, how the previous matrix was row * and/or column scaled * o ScalePermstruct->R, the row scalings of the previous matrix, * if any * o ScalePermstruct->C, the columns scalings of the previous matrix, * if any * o ScalePermstruct->perm_r, the row permutation of the previous * matrix * o ScalePermstruct->perm_c, the column permutation of the previous * matrix * o all of LUstruct, the previously computed information about * L and U (the actual numerical values of L and U * stored in LUstruct->Llu are ignored) * * The outputs returned include * * o A, the input matrix A overwritten by the scaled and permuted * matrix as described above * o ScalePermstruct, modified to describe how the input matrix A was * equilibrated (thus ScalePermstruct->DiagScale, * R and C may be modified) * o LUstruct, modified to contain the new L and U factors * * 5. The fourth and last value of options->Fact assumes that A is * identical to a matrix that has already been factored on a previous * call, and reuses its entire LU factorization * * - options->Fact = Factored: A is identical to a previously * factorized matrix, so the entire previous factorization * can be reused. * * In this case all the other options mentioned above are ignored * (options->Equil, options->RowPerm, options->ColPerm, * options->ReplaceTinyPivot) * * The user must also supply * * o A, the unfactored matrix, only in the case that iterative * refinment is to be done (specifically A must be the output * A from the previous call, so that it has been scaled and permuted) * o all of ScalePermstruct * o all of LUstruct, including the actual numerical values of * L and U * * all of which are unmodified on output. * * Arguments * ========= * * options (input) superlu_options_t* (global) * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following fields should be defined for this structure: * * o Fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, how the matrix A should * be factorized based on the previous history. * * = DOFACT: The matrix A will be factorized from scratch. * Inputs: A * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * all of ScalePermstruct * all of LUstruct * * = SamePattern: the matrix A will be factorized assuming * that a factorization of a matrix with the same sparsity * pattern was performed prior to this one. Therefore, this * factorization will reuse column permutation vector * ScalePermstruct->perm_c and the elimination tree * LUstruct->etree * Inputs: A * options->Equil, RowPerm, ReplaceTinyPivot * ScalePermstruct->perm_c * LUstruct->etree * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * rest of ScalePermstruct (DiagScale, R, C, perm_r) * rest of LUstruct (GLU_persist, Llu) * * = SamePattern_SameRowPerm: the matrix A will be factorized * assuming that a factorization of a matrix with the same * sparsity pattern and similar numerical values was performed * prior to this one. Therefore, this factorization will reuse * both row and column scaling factors R and C, and the * both row and column permutation vectors perm_r and perm_c, * distributed data structure set up from the previous symbolic * factorization. * Inputs: A * options->Equil, ReplaceTinyPivot * all of ScalePermstruct * all of LUstruct * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * modified LUstruct->Llu * = FACTORED: the matrix A is already factored. * Inputs: all of ScalePermstruct * all of LUstruct * * o Equil (yes_no_t) * Specifies whether to equilibrate the system. * = NO: no equilibration. * = YES: scaling factors are computed to equilibrate the system: * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. * Whether or not the system will be equilibrated depends * on the scaling of the matrix A, but if equilibration is * used, A is overwritten by diag(R)*A*diag(C) and B by * diag(R)*B. * * o RowPerm (rowperm_t) * Specifies how to permute rows of the matrix A. * = NATURAL: use the natural ordering. * = LargeDiag: use the Duff/Koster algorithm to permute rows of * the original matrix to make the diagonal large * relative to the off-diagonal. * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r * input by the user. * * o ColPerm (colperm_t) * Specifies what type of column permutation to use to reduce fill. * = NATURAL: natural ordering. * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. * = MMD_ATA: minimum degree ordering on structure of A'*A. * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. * * o ReplaceTinyPivot (yes_no_t) * = NO: do not modify pivots * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during * LU factorization. * * o IterRefine (IterRefine_t) * Specifies how to perform iterative refinement. * = NO: no iterative refinement. * = DOUBLE: accumulate residual in double precision. * = EXTRA: accumulate residual in extra precision. * * NOTE: all options must be indentical on all processes when * calling this routine. * * A (input/output) SuperMatrix* (local) * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. * That is, A is stored in distributed compressed row format. * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization * routine PDGSTRF can factorize rectangular matrices. * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, * depending on ScalePermstruct->DiagScale and options->ColPerm: * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by * diag(R)*A*diag(C). * if options->ColPerm != NATURAL, A is further overwritten by * diag(R)*A*diag(C)*Pc^T. * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. * * ScalePermstruct (input/output) ScalePermstruct_t* (global) * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * It contains the following fields: * * o DiagScale (DiagScale_t) * Specifies the form of equilibration that was done. * = NOEQUIL: no equilibration. * = ROW: row equilibration, i.e., A was premultiplied by * diag(R). * = COL: Column equilibration, i.e., A was postmultiplied * by diag(C). * = BOTH: both row and column equilibration, i.e., A was * replaced by diag(R)*A*diag(C). * If options->Fact = FACTORED or SamePattern_SameRowPerm, * DiagScale is an input argument; otherwise it is an output * argument. * * o perm_r (int*) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * If options->RowPerm = MY_PERMR, or * options->Fact = SamePattern_SameRowPerm, perm_r is an * input argument; otherwise it is an output argument. * * o perm_c (int*) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * If options->ColPerm = MY_PERMC or options->Fact = SamePattern * or options->Fact = SamePattern_SameRowPerm, perm_c is an * input argument; otherwise, it is an output argument. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * * o R (double*) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). * If DiagScale = NOEQUIL or COL, R is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * * o C (double*) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). * If DiagScale = NOEQUIL or ROW, C is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is * an input argument; otherwise, C is an output argument. * * B (input/output) double* (local) * On entry, the right-hand side matrix of dimension (m_loc, nrhs), * where, m_loc is the number of rows stored locally on my * process and is defined in the data structure of matrix A. * On exit, the solution matrix if info = 0; * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * The number of right-hand sides. * If nrhs = 0, only LU decomposition is performed, the forward * and back substitutions are skipped. * * grid (input) gridinfo_t* (global) * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * It contains the following fields: * * o etree (int*) dimension (A->ncol) (global) * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'. * It is computed in sp_colorder() during the first factorization, * and is reused in the subsequent factorizations of the matrices * with the same nonzero pattern. * On exit of sp_colorder(), the columns of A are permuted so that * the etree is in a certain postorder. This postorder is reflected * in ScalePermstruct->perm_c. * NOTE: * Etree is a vector of parent pointers for a forest whose vertices * are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * o Glu_persist (Glu_persist_t*) (global) * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (LocalLU_t*) (local) * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * SOLVEstruct (input/output) SOLVEstruct_t* * The data structure to hold the communication pattern used * in the phases of triangular solution and iterative refinement. * This pattern should be intialized only once for repeated solutions. * If options->SolveInitialized = YES, it is an input argument. * If options->SolveInitialized = NO and nrhs != 0, it is an output * argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'. * * berr (output) double*, dimension (nrhs) (global) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * so the solution could not be computed. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * * See superlu_ddefs.h for the definitions of varioous data types. * */ NRformat_loc *Astore; SuperMatrix GA; /* Global A in NC format */ NCformat *GAstore; double *a_GA; SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ NCPformat *GACstore; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t *Glu_freeable; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of supernodes in L. (usub, xusub) contains the compressed subscript of nonzero segments in U. If options->Fact != SamePattern_SameRowPerm, they are computed by SYMBFACT routine, and then used by PDDISTRIBUTE routine. They will be freed after PDDISTRIBUTE routine. If options->Fact == SamePattern_SameRowPerm, these structures are not used. */ fact_t Fact; double *a; int_t *colptr, *rowind; int_t *perm_r; /* row permutations from partial pivoting */ int_t *perm_c; /* column permutation vector */ int_t *etree; /* elimination tree */ int_t *rowptr, *colind; /* Local A in NR*/ int_t *rowind_loc, *colptr_loc; int_t colequ, Equil, factored, job, notran, rowequ, need_value; int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; int_t nnz_loc, m_loc, fst_row, icol; int iam; int ldx; /* LDA for matrix X (local). */ char equed[1], norm[1]; double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; double *X, *b_col, *b_work, *x_col; double t; static mem_usage_t num_mem_usage, symb_mem_usage; #if ( PRNTlevel>= 2 ) double dmin, dsum, dprod; #endif int_t procs; /* Structures needed for parallel symbolic factorization */ int_t *sizes, *fstVtxSep, parSymbFact; int noDomains, nprocs_num; MPI_Comm symb_comm; /* communicator for symbolic factorization */ int col, key; /* parameters for creating a new communicator */ Pslu_freeable_t Pslu_freeable; float flinfo; /* Initialization. */ m = A->nrow; n = A->ncol; Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; a = (double *) Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; sizes = NULL; fstVtxSep = NULL; symb_comm = MPI_COMM_NULL; /* Test the input parameters. */ *info = 0; Fact = options->Fact; if ( Fact < 0 || Fact > FACTORED ) *info = -1; else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) *info = -1; else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) *info = -1; else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) *info = -1; else if ( options->IterRefine == EXTRA ) { *info = -1; fprintf(stderr, "Extra precise iterative refinement yet to support."); } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < m_loc ) *info = -5; else if ( nrhs < 0 ) *info = -6; if ( *info ) { i = -(*info); pxerbla("pdgssvx", grid, -*info); return; } factored = (Fact == FACTORED); Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); iam = grid->iam; job = 5; if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { rowequ = (ScalePermstruct->DiagScale == ROW) || (ScalePermstruct->DiagScale == BOTH); colequ = (ScalePermstruct->DiagScale == COL) || (ScalePermstruct->DiagScale == BOTH); } else rowequ = colequ = FALSE; /* The following arrays are replicated on all processes. */ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; /********/ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgssvx()"); #endif /* Not factored & ask for equilibration */ if ( Equil && Fact != SamePattern_SameRowPerm ) { /* Allocate storage if not done so before. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; } } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( Equil ) { #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter equil"); #endif t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm ) { /* Reuse R and C. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: break; case ROW: irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { a[i] *= R[irow]; /* Scale rows. */ } ++irow; } break; case COL: for (j = 0; j < m_loc; ++j) for (i = rowptr[j]; i < rowptr[j+1]; ++i){ icol = colind[i]; a[i] *= C[icol]; /* Scale columns. */ } break; case BOTH: irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ } ++irow; } break; } } else { /* Compute R & C from scratch */ /* Compute the row and column scalings. */ pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); /* Equilibrate matrix A if it is badly-scaled. */ pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); if ( lsame_(equed, "R") ) { ScalePermstruct->DiagScale = rowequ = ROW; } else if ( lsame_(equed, "C") ) { ScalePermstruct->DiagScale = colequ = COL; } else if ( lsame_(equed, "B") ) { ScalePermstruct->DiagScale = BOTH; rowequ = ROW; colequ = COL; } else ScalePermstruct->DiagScale = NOEQUIL; #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. equilibrated? *equed = %c\n", *equed); /*fflush(stdout);*/ } #endif } /* if Fact ... */ stat->utime[EQUIL] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit equil"); #endif } /* if Equil ... */ if ( !factored ) { /* Skip this if already factored. */ /* * Gather A from the distributed compressed row format to * global A in compressed column format. * Numerical values are gathered only when a row permutation * for large diagonal is sought after. */ if ( Fact != SamePattern_SameRowPerm ) { need_value = (options->RowPerm == LargeDiag); pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); GAstore = (NCformat *) GA.Store; colptr = GAstore->colptr; rowind = GAstore->rowind; nnz = GAstore->nnz; if ( need_value ) a_GA = (double *) GAstore->nzval; else assert(GAstore->nzval == NULL); } /* ------------------------------------------------------------ Find the row permutation for A. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { t = SuperLU_timer_(); if ( Fact != SamePattern_SameRowPerm ) { if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ /* Permute the global matrix GA for symbfact() */ for (i = 0; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } else { /* options->RowPerm == LargeDiag */ /* Get a new perm_r[] */ if ( job == 5 ) { /* Allocate storage for scaling factors. */ if ( !(R1 = doubleMalloc_dist(m)) ) ABORT("SUPERLU_MALLOC fails for R1[]"); if ( !(C1 = doubleMalloc_dist(n)) ) ABORT("SUPERLU_MALLOC fails for C1[]"); } if ( !iam ) { /* Process 0 finds a row permutation */ dldperm(job, m, nnz, colptr, rowind, a_GA, perm_r, R1, C1); MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } else { MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } #if ( PRNTlevel>=2 ) dmin = dlamch_("Overflow"); dsum = 0.0; dprod = 1.0; #endif if ( job == 5 ) { if ( Equil ) { for (i = 0; i < n; ++i) { R1[i] = exp(R1[i]); C1[i] = exp(C1[i]); } /* Scale the distributed matrix */ irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= R1[irow] * C1[icol]; #if ( PRNTlevel>=2 ) if ( perm_r[irow] == icol ) { /* New diagonal */ if ( job == 2 || job == 3 ) dmin = SUPERLU_MIN(dmin, fabs(a[i])); else if ( job == 4 ) dsum += fabs(a[i]); else if ( job == 5 ) dprod *= fabs(a[i]); } #endif } ++irow; } /* Multiply together the scaling factors. */ if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; else for (i = 0; i < m; ++i) R[i] = R1[i]; if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; else for (i = 0; i < n; ++i) C[i] = C1[i]; ScalePermstruct->DiagScale = BOTH; rowequ = colequ = 1; } /* end Equil */ /* Now permute global A to prepare for symbfact() */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } SUPERLU_FREE (R1); SUPERLU_FREE (C1); } else { /* job = 2,3,4 */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } /* end for i ... */ } /* end for j ... */ } /* end else job ... */ #if ( PRNTlevel>=2 ) if ( job == 2 || job == 3 ) { if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); } else if ( job == 4 ) { if ( !iam ) printf("\tsum of diagonal %e\n", dsum); } else if ( job == 5 ) { if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif } /* end if options->RowPerm ... */ t = SuperLU_timer_() - t; stat->utime[ROWPERM] = t; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); #endif } /* end if Fact ... */ } else { /* options->RowPerm == NOROWPERM */ for (i = 0; i < m; ++i) perm_r[i] = i; } #if ( DEBUGlevel>=2 ) if ( !iam ) PrintInt10("perm_r", m, perm_r); #endif } /* end if (!factored) */ if ( !factored || options->IterRefine ) { /* Compute norm(A), which will be used to adjust small diagonal. */ if ( notran ) *(unsigned char *)norm = '1'; else *(unsigned char *)norm = 'I'; anorm = pdlangs(norm, A, grid); #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. anorm %e\n", anorm); #endif } /* ------------------------------------------------------------ Perform the LU factorization. ------------------------------------------------------------*/ if ( !factored ) { t = SuperLU_timer_(); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A * permc_spec = PARMETIS: parallel METIS on structure of A'+A * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; parSymbFact = options->ParSymbFact; #if ( PRNTlevel>=1 ) if ( parSymbFact && permc_spec != PARMETIS ) if ( !iam ) printf(".. Parallel symbolic factorization" " only works wth ParMetis!\n"); #endif if ( parSymbFact == YES || permc_spec == PARMETIS ) { nprocs_num = grid->nprow * grid->npcol; noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num )))); /* create a new communicator for the first noDomains processors in grid->comm */ key = iam; if (iam < noDomains) col = 0; else col = MPI_UNDEFINED; MPI_Comm_split (grid->comm, col, key, &symb_comm ); permc_spec = PARMETIS; /* only works with PARMETIS */ } if ( permc_spec != MY_PERMC && Fact == DOFACT ) { if ( permc_spec == PARMETIS ) { /* Get column permutation vector in perm_c. * * This routine takes as input the distributed input matrix A * * and does not modify it. It also allocates memory for * * sizes[] and fstVtxSep[] arrays, that contain information * * on the separator tree computed by ParMETIS. */ flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, noDomains, &sizes, &fstVtxSep, grid, &symb_comm); if (flinfo > 0) ABORT("ERROR in get perm_c parmetis."); } else { get_perm_c_dist(iam, permc_spec, &GA, perm_c); } } stat->utime[COLPERM] = SuperLU_timer_() - t; /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' (a.k.a. column etree), depending on the choice of ColPerm. Adjust perm_c[] to be consistent with a postorder of etree. Permute columns of A to form A*Pc'. */ if ( Fact != SamePattern_SameRowPerm ) { if ( parSymbFact == NO ) { int_t *GACcolbeg, *GACcolend, *GACrowind; sp_colorder(options, &GA, perm_c, etree, &GAC); /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ GACstore = (NCPformat *) GAC.Store; GACcolbeg = GACstore->colbeg; GACcolend = GACstore->colend; GACrowind = GACstore->rowind; for (j = 0; j < n; ++j) { for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { irow = GACrowind[i]; GACrowind[i] = perm_c[irow]; } } /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); if ( !(Glu_freeable = (Glu_freeable_t *) SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) ABORT("Malloc fails for Glu_freeable."); /* Every process does this. */ iinfo = symbfact(options, iam, &GAC, perm_c, etree, Glu_persist, Glu_freeable); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if ( iinfo < 0 ) { /* Successful return */ QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); #if ( PRNTlevel>=1 ) if ( !iam ) { printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); printf("\tint %d, short %d, float %d, double %d\n", sizeof(int_t), sizeof(short), sizeof(float), sizeof(double)); printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); } #endif } else { if ( !iam ) { fprintf(stderr,"symbfact() error returns %d\n",iinfo); exit(-1); } } } /* end if serial symbolic factorization */ else { /* parallel symbolic factorization */ t = SuperLU_timer_(); flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, sizes, fstVtxSep, &Pslu_freeable, &(grid->comm), &symb_comm, &symb_mem_usage); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if (flinfo > 0) ABORT("Insufficient memory for parallel symbolic factorization."); } } /* end if Fact ... */ #if ( PRNTlevel>=1 ) if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]); #endif if (sizes) SUPERLU_FREE (sizes); if (fstVtxSep) SUPERLU_FREE (fstVtxSep); if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { /* Apply column permutation to the original distributed A */ for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. */ t = SuperLU_timer_(); dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct, Glu_freeable, LUstruct, grid); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factorization. */ if ( Fact != SamePattern_SameRowPerm ) { iinfo = symbfact_SubFree(Glu_freeable); SUPERLU_FREE(Glu_freeable); } } else { /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. */ /* Apply column permutation to the original distributed A */ for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; t = SuperLU_timer_(); dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct, &Pslu_freeable, LUstruct, grid); if (dist_mem_use > 0) ABORT ("Not enough memory available for dist_psymbtonum\n"); stat->utime[DIST] = SuperLU_timer_() - t; } #if ( PRNTlevel>=1 ) if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); #endif /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); stat->utime[FACT] = SuperLU_timer_() - t; #if ( PRNTlevel>=1 ) { int_t TinyPivots; float for_lu, total, max, avg, temp; dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); MPI_Reduce( &num_mem_usage.for_lu, &for_lu, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &num_mem_usage.total, &total, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); temp = SUPERLU_MAX(symb_mem_usage.total, symb_mem_usage.for_lu + (float)dist_mem_use + num_mem_usage.for_lu); if (parSymbFact == TRUE) /* The memory used in the redistribution routine includes the memory used for storing the symbolic structure and the memory allocated for numerical factorization */ temp = SUPERLU_MAX(symb_mem_usage.total, (float)dist_mem_use); temp = SUPERLU_MAX(temp, num_mem_usage.total); MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, MPI_SUM, grid->comm ); stat->TinyPivots = TinyPivots; if ( !iam ) { printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", for_lu*1e-6, total*1e-6); printf("\tAll space (MB):" "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); } } #endif /* Destroy GA */ if ( Fact != SamePattern_SameRowPerm ) Destroy_CompCol_Matrix_dist(&GA); } /* end if (!factored) */ /* ------------------------------------------------------------ Compute the solution matrix X. ------------------------------------------------------------*/ if ( nrhs ) { if ( !(b_work = doubleMalloc_dist(n)) ) ABORT("Malloc fails for b_work[]"); /* ------------------------------------------------------------ Scale the right-hand side if equilibration was performed. ------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= R[irow]; ++irow; } b_col += ldb; } } } else if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= C[irow]; ++irow; } b_col += ldb; } } /* Save a copy of the right-hand side. */ ldx = ldb; if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) ABORT("Malloc fails for X[]"); x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; x_col += ldx; b_col += ldb; } /* ------------------------------------------------------------ Solve the linear system. ------------------------------------------------------------*/ if ( options->SolveInitialized == NO ) { dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, SOLVEstruct); } pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, fst_row, ldb, nrhs, SOLVEstruct, stat, info); /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ if ( options->IterRefine ) { /* Improve the solution by iterative refinement. */ int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ t = SuperLU_timer_(); if ( options->RefineInitialized == NO || Fact == DOFACT ) { /* All these cases need to re-initialize gsmv structure */ if ( options->RefineInitialized ) pdgsmv_finalize(SOLVEstruct->gsmv_comm); pdgsmv_init(A, SOLVEstruct->row_to_proc, grid, SOLVEstruct->gsmv_comm); /* Save a copy of the transformed local col indices in colind_gsmv[]. */ if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); if ( !(it = intMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for colind_gsmv[]"); colind_gsmv = SOLVEstruct->A_colind_gsmv = it; for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; options->RefineInitialized = YES; } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { double at; int_t k, jcol, p; /* Swap to beginning the part of A corresponding to the local part of X, as was done in pdgsmv_init() */ for (i = 0; i < m_loc; ++i) { /* Loop through each row */ k = rowptr[i]; for (j = rowptr[i]; j < rowptr[i+1]; ++j) { jcol = colind[j]; p = SOLVEstruct->row_to_proc[jcol]; if ( p == iam ) { /* Local */ at = a[k]; a[k] = a[j]; a[j] = at; ++k; } } } /* Re-use the local col indices of A obtained from the previous call to pdgsmv_init() */ for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; } if ( nrhs == 1 ) { /* Use the existing solve structure */ SOLVEstruct1 = SOLVEstruct; } else { /* For nrhs > 1, since refinement is performed for RHS one at a time, the communication structure for pdgstrs is different than the solve with nrhs RHS. So we use SOLVEstruct1 for the refinement step. */ if ( !(SOLVEstruct1 = (SOLVEstruct_t *) SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) ) ABORT("Malloc fails for SOLVEstruct1"); /* Copy the same stuff */ SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; SOLVEstruct1->diag_len = SOLVEstruct->diag_len; SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; /* Initialize the *gstrs_comm for 1 RHS. */ if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, Glu_persist, SOLVEstruct1); } pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); /* Deallocate the storage associated with SOLVEstruct1 */ if ( nrhs > 1 ) { pxgstrs_finalize(SOLVEstruct1->gstrs_comm); SUPERLU_FREE(SOLVEstruct1); } stat->utime[REFINE] = SuperLU_timer_() - t; } /* Permute the solution matrix B <= Pc'*X. */ pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, SOLVEstruct->inv_perm_c, X, ldx, B, ldb, nrhs, grid); #if ( DEBUGlevel>=2 ) printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); for (i = 0; i < m_loc; ++i) printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); #endif /* Transform the solution matrix X to a solution of the original system before the equilibration. */ if ( notran ) { if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= C[irow]; ++irow; } b_col += ldb; } } } else if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= R[irow]; ++irow; } b_col += ldb; } } SUPERLU_FREE(b_work); SUPERLU_FREE(X); } /* end if nrhs != 0 */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); #endif /* Deallocate R and/or C if it was not used. */ if ( Equil && Fact != SamePattern_SameRowPerm ) { switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: SUPERLU_FREE(R); SUPERLU_FREE(C); break; case ROW: SUPERLU_FREE(C); break; case COL: SUPERLU_FREE(R); break; } } if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) Destroy_CompCol_Permuted_dist(&GAC); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgssvx()"); #endif }