Ejemplo n.º 1
0
/*
 * Generate a banded square matrix A, with dimension n and semi-bandwidth b.
 */
void
zband(int n, int b, int nonz, doublecomplex **nzval, int **rowind, int **colptr)
{
    int iseed[] = {1992,1993,1994,1995};    
    register int i, j, ub, lb, ilow, ihigh, lasta = 0;
    doublecomplex *a;
    int    *asub, *xa;
    doublecomplex *val;
    int    *row;
    extern double dlaran_();
    
    printf("A banded matrix.");
    zallocateA(n, nonz, nzval, rowind, colptr); /* Allocate storage */
    a    = *nzval;
    asub = *rowind;
    xa   = *colptr;
    ub = lb = b;
    
    for (i = 0; i < 4; ++i) iseed[i] = abs( iseed[i] ) % 4096;
    if ( iseed[3] % 2 != 1 ) ++iseed[3];

    for (j = 0; j < n; ++j) {
	xa[j] = lasta;
	val = &a[lasta];
	row = &asub[lasta];
	ilow = SUPERLU_MAX(0, j - ub);
	ihigh = SUPERLU_MIN(n-1, j + lb);
	for (i = ilow; i <= ihigh; ++i) {
	    val[i-ilow].r = dlaran_(iseed);
	    row[i-ilow] = i;
	}
	lasta += ihigh - ilow + 1;
    } /* for j ... */
    xa[n] = lasta;
}
/*
 * Convert a full matrix into a sparse matrix format. 
 */
int
sp_dconvert(int m, int n, double *A, int lda, int kl, int ku,
	   double *a, int *asub, int *xa, int *nnz)
{
    int     lasta = 0;
    int     i, j, ilow, ihigh;
    int     *row;
    double  *val;

    for (j = 0; j < n; ++j) {
	xa[j] = lasta;
	val = &a[xa[j]];
	row = &asub[xa[j]];

	ilow = SUPERLU_MAX(0, j - ku);
	ihigh = SUPERLU_MIN(n-1, j + kl);
	for (i = ilow; i <= ihigh; ++i) {
	    val[i-ilow] = A[i + j*lda];
	    row[i-ilow] = i;
	}
	lasta += ihigh - ilow + 1;
    }

    xa[n] = *nnz = lasta;
    return 0;
}
Ejemplo n.º 3
0
float sqselect(int n, float A[], int k)
{
    register int i, j, p;
    register float val;

    k = SUPERLU_MAX(k, 0);
    k = SUPERLU_MIN(k, n - 1);
    while (n > 1)
    {
	i = 0; j = n-1;
	p = j; val = A[p];
	while (i < j)
	{
	    for (; A[i] >= val && i < p; i++);
	    if (A[i] < val) { A[p] = A[i]; p = i; }
	    for (; A[j] <= val && j > p; j--);
	    if (A[j] > val) { A[p] = A[j]; p = j; }
	}
	A[p] = val;
	if (p == k) return val;
	else if (p > k) n = p;
	else
	{
	    p++;
	    n -= p; A += p; k -= p;
	}
    }

    return A[0];
}
Ejemplo n.º 4
0
/*! \brief

<pre> 
    Purpose   
    =======   

    DLANGS_dist returns the value of the one norm, or the Frobenius norm, or 
    the infinity norm, or the element of largest absolute value of a 
    real matrix A.   

    Description   
    ===========   

    DLANGE returns the value   

       DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
                (   
                ( norm1(A),         NORM = '1', 'O' or 'o'   
                (   
                ( normI(A),         NORM = 'I' or 'i'   
                (   
                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   

    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   

    Arguments   
    =========   

    NORM    (input) CHARACTER*1   
            Specifies the value to be returned in DLANGE as described above.   
    A       (input) SuperMatrix*
            The M by N sparse matrix A. 

   ===================================================================== 
</pre>
*/
double dlangs_dist(char *norm, SuperMatrix *A)
{

    
    /* Local variables */
    NCformat *Astore;
    double   *Aval;
    int_t    i, j, irow;
    double   value=0., sum;
    double   *rwork;

    Astore = (NCformat *) A->Store;
    Aval   = (double *) Astore->nzval;
    
    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
	value = 0.;
	
    } else if ( strncmp(norm, "M", 1)==0 ) {
	/* Find max(abs(A(i,j))). */
	value = 0.;
	for (j = 0; j < A->ncol; ++j)
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
		value = SUPERLU_MAX( value, fabs( Aval[i]) );
	
    } else if ( strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') {
	/* Find norm1(A). */
	value = 0.;
	for (j = 0; j < A->ncol; ++j) {
	    sum = 0.;
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
		sum += fabs(Aval[i]);
	    value = SUPERLU_MAX(value, sum);
	}
	
    } else if ( strncmp(norm, "I", 1)==0 ) {
	/* Find normI(A). */
	if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) )
	    ABORT("SUPERLU_MALLOC fails for rwork.");
	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
	for (j = 0; j < A->ncol; ++j)
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
		irow = Astore->rowind[i];
		rwork[irow] += fabs(Aval[i]);
	    }
	value = 0.;
	for (i = 0; i < A->nrow; ++i)
	    value = SUPERLU_MAX(value, rwork[i]);
	
	SUPERLU_FREE (rwork);
	
    } else if ( strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0 ) {
	/* Find normF(A). */
	ABORT("Not implemented.");
    } else
	ABORT("Illegal norm specified.");

    return (value);

} /* dlangs_dist */
Ejemplo n.º 5
0
/*
 * Nonsymmetric elimination tree
 */
int
sp_coletree(
	    int *acolst, int *acolend, /* column start and end past 1 */
	    int *arow,                 /* row indices of A */
	    int nr, int nc,            /* dimension of A */
	    int *parent	               /* parent in elim tree */
	    )
{
	int	*root;			/* root of subtee of etree 	*/
	int     *firstcol;		/* first nonzero col in each row*/
	int	rset, cset;             
	int	row, col;
	int	rroot;
	int	p;
	int     *pp;

	root = mxCallocInt (nc);
	initialize_disjoint_sets (nc, &pp);

	/* Compute firstcol[row] = first nonzero column in row */

	firstcol = mxCallocInt (nr);
	for (row = 0; row < nr; firstcol[row++] = nc);
	for (col = 0; col < nc; col++) 
		for (p = acolst[col]; p < acolend[col]; p++) {
			row = arow[p];
			firstcol[row] = SUPERLU_MIN(firstcol[row], col);
		}

	/* Compute etree by Liu's algorithm for symmetric matrices,
           except use (firstcol[r],c) in place of an edge (r,c) of A.
	   Thus each row clique in A'*A is replaced by a star
	   centered at its first vertex, which has the same fill. */

	for (col = 0; col < nc; col++) {
		cset = make_set (col, pp);
		root[cset] = col;
		parent[col] = nc; /* Matlab */
		for (p = acolst[col]; p < acolend[col]; p++) {
			row = firstcol[arow[p]];
			if (row >= col) continue;
			rset = find (row, pp);
			rroot = root[rset];
			if (rroot != col) {
				parent[rroot] = col;
				cset = link (cset, rset, pp);
				root[cset] = col;
			}
		}
	}

	SUPERLU_FREE (root);
	SUPERLU_FREE (firstcol);
	finalize_disjoint_sets (pp);
	return 0;
}
Ejemplo n.º 6
0
float
sPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, 
             SuperMatrix *L, SuperMatrix *U)
{

    NCformat *Astore;
    SCformat *Lstore;
    NCformat *Ustore;
    float  *Aval, *Lval, *Uval;
    int      fsupc, nsupr, luptr, nz_in_U;
    int      i, j, k, oldcol;
    int      *inv_perm_c;
    float   rpg, maxaj, maxuj;
    extern   double slamch_(char *);
    float   smlnum;
    float   *luval;
   
    /* Get machine constants. */
    smlnum = slamch_("S");
    rpg = 1. / smlnum;

    Astore = A->Store;
    Lstore = L->Store;
    Ustore = U->Store;
    Aval = Astore->nzval;
    Lval = Lstore->nzval;
    Uval = Ustore->nzval;
    
    inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int));
    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;

    for (k = 0; k <= Lstore->nsuper; ++k) {
	fsupc = L_FST_SUPC(k);
	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
	luptr = L_NZ_START(fsupc);
	luval = &Lval[luptr];
	nz_in_U = 1;
	
	for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) {
	    maxaj = 0.;
            oldcol = inv_perm_c[j];
	    for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i)
		maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) );
	
	    maxuj = 0.;
	    for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++)
		maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) );
	    
	    /* Supernode */
	    for (i = 0; i < nz_in_U; ++i)
		maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) );

	    ++nz_in_U;
	    luval += nsupr;

	    if ( maxuj == 0. )
		rpg = SUPERLU_MIN( rpg, 1.);
	    else
		rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
	}
	
	if ( j >= ncols ) break;
    }

    SUPERLU_FREE(inv_perm_c);
    return (rpg);
}
Ejemplo n.º 7
0
void
pdgssvx(superlu_options_t *options, SuperMatrix *A, 
	ScalePermstruct_t *ScalePermstruct,
	double B[], int ldb, int nrhs, gridinfo_t *grid,
	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
	SuperLUStat_t *stat, int *info)
{
/* 
 * -- Distributed SuperLU routine (version 2.2) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * November 1, 2007
 * Feburary 20, 2008
 *
 *
 * Purpose
 * =======
 *
 * PDGSSVX solves a system of linear equations A*X=B,
 * by using Gaussian elimination with "static pivoting" to
 * compute the LU factorization of A.
 *
 * Static pivoting is a technique that combines the numerical stability
 * of partial pivoting with the scalability of Cholesky (no pivoting),
 * to run accurately and efficiently on large numbers of processors.
 * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
 * description of the parallel algorithms.
 *
 * The input matrices A and B are distributed by block rows.
 * Here is a graphical illustration (0-based indexing):
 *
 *                        A                B
 *               0 ---------------       ------
 *                   |           |        |  |
 *                   |           |   P0   |  |
 *                   |           |        |  |
 *                 ---------------       ------
 *        - fst_row->|           |        |  |
 *        |          |           |        |  |
 *       m_loc       |           |   P1   |  |
 *        |          |           |        |  |
 *        -          |           |        |  |
 *                 ---------------       ------
 *                   |    .      |        |. |
 *                   |    .      |        |. |
 *                   |    .      |        |. |
 *                 ---------------       ------
 * 
 * where, fst_row is the row number of the first row,
 *        m_loc is the number of rows local to this processor
 * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
 *
 *
 * Here are the options for using this code:
 *
 *   1. Independent of all the other options specified below, the
 *      user must supply
 *
 *      -  B, the matrix of right-hand sides, distributed by block rows,
 *            and its dimensions ldb (local) and nrhs (global)
 *      -  grid, a structure describing the 2D processor mesh
 *      -  options->IterRefine, which determines whether or not to
 *            improve the accuracy of the computed solution using 
 *            iterative refinement
 *
 *      On output, B is overwritten with the solution X.
 *
 *   2. Depending on options->Fact, the user has four options
 *      for solving A*X=B. The standard option is for factoring
 *      A "from scratch". (The other options, described below,
 *      are used when A is sufficiently similar to a previously 
 *      solved problem to save time by reusing part or all of 
 *      the previous factorization.)
 *
 *      -  options->Fact = DOFACT: A is factored "from scratch"
 *
 *      In this case the user must also supply
 *
 *        o  A, the input matrix
 *
 *        as well as the following options to determine what matrix to
 *        factorize.
 *
 *        o  options->Equil,   to specify how to scale the rows and columns
 *                             of A to "equilibrate" it (to try to reduce its
 *                             condition number and so improve the
 *                             accuracy of the computed solution)
 *
 *        o  options->RowPerm, to specify how to permute the rows of A
 *                             (typically to control numerical stability)
 *
 *        o  options->ColPerm, to specify how to permute the columns of A
 *                             (typically to control fill-in and enhance
 *                             parallelism during factorization)
 *
 *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
 *                             pivots encountered during factorization
 *                             (to control numerical stability)
 *
 *      The outputs returned include
 *         
 *        o  ScalePermstruct,  modified to describe how the input matrix A
 *                             was equilibrated and permuted:
 *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
 *                                         columns of A were scaled
 *          .  ScalePermstruct->R, array of row scale factors
 *          .  ScalePermstruct->C, array of column scale factors
 *          .  ScalePermstruct->perm_r, row permutation vector
 *          .  ScalePermstruct->perm_c, column permutation vector
 *
 *          (part of ScalePermstruct may also need to be supplied on input,
 *           depending on options->RowPerm and options->ColPerm as described 
 *           later).
 *
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix diag(R)*A*diag(C)*Pc^T, where 
 *              Pc is the row permutation matrix determined by
 *                  ScalePermstruct->perm_c
 *              diag(R) and diag(C) are diagonal scaling matrices determined
 *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
 *                  ScalePermstruct->C
 *
 *        o  LUstruct, which contains the L and U factorization of A1 where
 *
 *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
 *
 *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
 *                in A on output.)
 *
 *   3. The second value of options->Fact assumes that a matrix with the same
 *      sparsity pattern as A has already been factored:
 *     
 *      -  options->Fact = SamePattern: A is factored, assuming that it has
 *            the same nonzero pattern as a previously factored matrix. In
 *            this case the algorithm saves time by reusing the previously
 *            computed column permutation vector stored in
 *            ScalePermstruct->perm_c and the "elimination tree" of A
 *            stored in LUstruct->etree
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *        o  options->Equil
 *        o  options->RowPerm
 *        o  options->ReplaceTinyPivot
 *
 *      but not options->ColPerm, whose value is ignored. This is because the
 *      previous column permutation from ScalePermstruct->perm_c is used as
 *      input. The user must also supply 
 *
 *        o  A, the input matrix
 *        o  ScalePermstruct->perm_c, the column permutation
 *        o  LUstruct->etree, the elimination tree
 *
 *      The outputs returned include
 *         
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix as described above
 *        o  ScalePermstruct, modified to describe how the input matrix A was
 *                            equilibrated and row permuted
 *        o  LUstruct, modified to contain the new L and U factors
 *
 *   4. The third value of options->Fact assumes that a matrix B with the same
 *      sparsity pattern as A has already been factored, and where the
 *      row permutation of B can be reused for A. This is useful when A and B
 *      have similar numerical values, so that the same row permutation
 *      will make both factorizations numerically stable. This lets us reuse
 *      all of the previously computed structure of L and U.
 *
 *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
 *            assuming not only the same nonzero pattern as the previously
 *            factored matrix B, but reusing B's row permutation.
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *        o  options->Equil
 *        o  options->ReplaceTinyPivot
 *
 *      but not options->RowPerm or options->ColPerm, whose values are
 *      ignored. This is because the permutations from ScalePermstruct->perm_r
 *      and ScalePermstruct->perm_c are used as input.
 *
 *      The user must also supply 
 *
 *        o  A, the input matrix
 *        o  ScalePermstruct->DiagScale, how the previous matrix was row
 *                                       and/or column scaled
 *        o  ScalePermstruct->R, the row scalings of the previous matrix,
 *                               if any
 *        o  ScalePermstruct->C, the columns scalings of the previous matrix, 
 *                               if any
 *        o  ScalePermstruct->perm_r, the row permutation of the previous
 *                                    matrix
 *        o  ScalePermstruct->perm_c, the column permutation of the previous 
 *                                    matrix
 *        o  all of LUstruct, the previously computed information about
 *                            L and U (the actual numerical values of L and U
 *                            stored in LUstruct->Llu are ignored)
 *
 *      The outputs returned include
 *         
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix as described above
 *        o  ScalePermstruct,  modified to describe how the input matrix A was
 *                             equilibrated (thus ScalePermstruct->DiagScale,
 *                             R and C may be modified)
 *        o  LUstruct, modified to contain the new L and U factors
 *
 *   5. The fourth and last value of options->Fact assumes that A is
 *      identical to a matrix that has already been factored on a previous 
 *      call, and reuses its entire LU factorization
 *
 *      -  options->Fact = Factored: A is identical to a previously
 *            factorized matrix, so the entire previous factorization
 *            can be reused.
 *
 *      In this case all the other options mentioned above are ignored
 *      (options->Equil, options->RowPerm, options->ColPerm, 
 *       options->ReplaceTinyPivot)
 *
 *      The user must also supply 
 *
 *        o  A, the unfactored matrix, only in the case that iterative
 *              refinment is to be done (specifically A must be the output
 *              A from the previous call, so that it has been scaled and permuted)
 *        o  all of ScalePermstruct
 *        o  all of LUstruct, including the actual numerical values of
 *           L and U
 *
 *      all of which are unmodified on output.
 *         
 * Arguments
 * =========
 *
 * options (input) superlu_options_t* (global)
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *         The following fields should be defined for this structure:
 *         
 *         o Fact (fact_t)
 *           Specifies whether or not the factored form of the matrix
 *           A is supplied on entry, and if not, how the matrix A should
 *           be factorized based on the previous history.
 *
 *           = DOFACT: The matrix A will be factorized from scratch.
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *
 *           = SamePattern: the matrix A will be factorized assuming
 *             that a factorization of a matrix with the same sparsity
 *             pattern was performed prior to this one. Therefore, this
 *             factorization will reuse column permutation vector 
 *             ScalePermstruct->perm_c and the elimination tree
 *             LUstruct->etree
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ReplaceTinyPivot
 *                          ScalePermstruct->perm_c
 *                          LUstruct->etree
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
 *                          rest of LUstruct (GLU_persist, Llu)
 *
 *           = SamePattern_SameRowPerm: the matrix A will be factorized
 *             assuming that a factorization of a matrix with the same
 *             sparsity	pattern and similar numerical values was performed
 *             prior to this one. Therefore, this factorization will reuse
 *             both row and column scaling factors R and C, and the
 *             both row and column permutation vectors perm_r and perm_c,
 *             distributed data structure set up from the previous symbolic
 *             factorization.
 *                 Inputs:  A
 *                          options->Equil, ReplaceTinyPivot
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          modified LUstruct->Llu
 *           = FACTORED: the matrix A is already factored.
 *                 Inputs:  all of ScalePermstruct
 *                          all of LUstruct
 *
 *         o Equil (yes_no_t)
 *           Specifies whether to equilibrate the system.
 *           = NO:  no equilibration.
 *           = YES: scaling factors are computed to equilibrate the system:
 *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
 *                  Whether or not the system will be equilibrated depends
 *                  on the scaling of the matrix A, but if equilibration is
 *                  used, A is overwritten by diag(R)*A*diag(C) and B by
 *                  diag(R)*B.
 *
 *         o RowPerm (rowperm_t)
 *           Specifies how to permute rows of the matrix A.
 *           = NATURAL:   use the natural ordering.
 *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
 *                        the original matrix to make the diagonal large
 *                        relative to the off-diagonal.
 *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
 *                        input by the user.
 *           
 *         o ColPerm (colperm_t)
 *           Specifies what type of column permutation to use to reduce fill.
 *           = NATURAL:       natural ordering.
 *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
 *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
 *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
 *         
 *         o ReplaceTinyPivot (yes_no_t)
 *           = NO:  do not modify pivots
 *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
 *                  LU factorization.
 *
 *         o IterRefine (IterRefine_t)
 *           Specifies how to perform iterative refinement.
 *           = NO:     no iterative refinement.
 *           = DOUBLE: accumulate residual in double precision.
 *           = EXTRA:  accumulate residual in extra precision.
 *
 *         NOTE: all options must be indentical on all processes when
 *               calling this routine.
 *
 * A (input/output) SuperMatrix* (local)
 *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
 *           The number of linear equations is A->nrow. The type of A must be:
 *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
 *           That is, A is stored in distributed compressed row format.
 *           See supermatrix.h for the definition of 'SuperMatrix'.
 *           This routine only handles square A, however, the LU factorization
 *           routine PDGSTRF can factorize rectangular matrices.
 *         On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T,
 *           depending on ScalePermstruct->DiagScale and options->ColPerm:
 *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
 *                diag(R)*A*diag(C).
 *             if options->ColPerm != NATURAL, A is further overwritten by
 *                diag(R)*A*diag(C)*Pc^T.
 *           If all the above condition are true, the LU decomposition is
 *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
 *
 * ScalePermstruct (input/output) ScalePermstruct_t* (global)
 *         The data structure to store the scaling and permutation vectors
 *         describing the transformations performed to the matrix A.
 *         It contains the following fields:
 *
 *         o DiagScale (DiagScale_t)
 *           Specifies the form of equilibration that was done.
 *           = NOEQUIL: no equilibration.
 *           = ROW:     row equilibration, i.e., A was premultiplied by
 *                      diag(R).
 *           = COL:     Column equilibration, i.e., A was postmultiplied
 *                      by diag(C).
 *           = BOTH:    both row and column equilibration, i.e., A was 
 *                      replaced by diag(R)*A*diag(C).
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
 *           DiagScale is an input argument; otherwise it is an output
 *           argument.
 *
 *         o perm_r (int*)
 *           Row permutation vector, which defines the permutation matrix Pr;
 *           perm_r[i] = j means row i of A is in position j in Pr*A.
 *           If options->RowPerm = MY_PERMR, or
 *           options->Fact = SamePattern_SameRowPerm, perm_r is an
 *           input argument; otherwise it is an output argument.
 *
 *         o perm_c (int*)
 *           Column permutation vector, which defines the 
 *           permutation matrix Pc; perm_c[i] = j means column i of A is 
 *           in position j in A*Pc.
 *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
 *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
 *           input argument; otherwise, it is an output argument.
 *           On exit, perm_c may be overwritten by the product of the input
 *           perm_c and a permutation that postorders the elimination tree
 *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
 *           is already in postorder.
 *
 *         o R (double*) dimension (A->nrow)
 *           The row scale factors for A.
 *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
 *                          diag(R).
 *           If DiagScale = NOEQUIL or COL, R is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
 *           an input argument; otherwise, R is an output argument.
 *
 *         o C (double*) dimension (A->ncol)
 *           The column scale factors for A.
 *           If DiagScale = COL or BOTH, A is multiplied on the right by 
 *                          diag(C).
 *           If DiagScale = NOEQUIL or ROW, C is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
 *           an input argument; otherwise, C is an output argument.
 *         
 * B       (input/output) double* (local)
 *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
 *           where, m_loc is the number of rows stored locally on my
 *           process and is defined in the data structure of matrix A.
 *         On exit, the solution matrix if info = 0;
 *
 * ldb     (input) int (local)
 *         The leading dimension of matrix B.
 *
 * nrhs    (input) int (global)
 *         The number of right-hand sides.
 *         If nrhs = 0, only LU decomposition is performed, the forward
 *         and back substitutions are skipped.
 *
 * grid    (input) gridinfo_t* (global)
 *         The 2D process mesh. It contains the MPI communicator, the number
 *         of process rows (NPROW), the number of process columns (NPCOL),
 *         and my process rank. It is an input argument to all the
 *         parallel routines.
 *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
 *
 * LUstruct (input/output) LUstruct_t*
 *         The data structures to store the distributed L and U factors.
 *         It contains the following fields:
 *
 *         o etree (int*) dimension (A->ncol) (global)
 *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
 *           It is computed in sp_colorder() during the first factorization,
 *           and is reused in the subsequent factorizations of the matrices
 *           with the same nonzero pattern.
 *           On exit of sp_colorder(), the columns of A are permuted so that
 *           the etree is in a certain postorder. This postorder is reflected
 *           in ScalePermstruct->perm_c.
 *           NOTE:
 *           Etree is a vector of parent pointers for a forest whose vertices
 *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *
 *         o Glu_persist (Glu_persist_t*) (global)
 *           Global data structure (xsup, supno) replicated on all processes,
 *           describing the supernode partition in the factored matrices
 *           L and U:
 *	       xsup[s] is the leading column of the s-th supernode,
 *             supno[i] is the supernode number to which column i belongs.
 *
 *         o Llu (LocalLU_t*) (local)
 *           The distributed data structures to store L and U factors.
 *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
 *
 * SOLVEstruct (input/output) SOLVEstruct_t*
 *         The data structure to hold the communication pattern used
 *         in the phases of triangular solution and iterative refinement.
 *         This pattern should be intialized only once for repeated solutions.
 *         If options->SolveInitialized = YES, it is an input argument.
 *         If options->SolveInitialized = NO and nrhs != 0, it is an output
 *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
 *
 * berr    (output) double*, dimension (nrhs) (global)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics on runtime and floating-point operation count.
 *        See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 *         = 0: successful exit
 *         > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                so the solution could not be computed.
 *             > A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol.
 *
 * See superlu_ddefs.h for the definitions of varioous data types.
 *
 */
    NRformat_loc *Astore;
    SuperMatrix GA;      /* Global A in NC format */
    NCformat *GAstore;
    double   *a_GA;
    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
    NCPformat *GACstore;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    Glu_freeable_t *Glu_freeable;
            /* The nonzero structures of L and U factors, which are
	       replicated on all processrs.
	           (lsub, xlsub) contains the compressed subscript of
		                 supernodes in L.
          	   (usub, xusub) contains the compressed subscript of
		                 nonzero segments in U.
	      If options->Fact != SamePattern_SameRowPerm, they are 
	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
	      routine. They will be freed after PDDISTRIBUTE routine.
	      If options->Fact == SamePattern_SameRowPerm, these
	      structures are not used.                                  */
    fact_t   Fact;
    double   *a;
    int_t    *colptr, *rowind;
    int_t    *perm_r; /* row permutations from partial pivoting */
    int_t    *perm_c; /* column permutation vector */
    int_t    *etree;  /* elimination tree */
    int_t    *rowptr, *colind;  /* Local A in NR*/
    int_t    *rowind_loc, *colptr_loc;
    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
    int_t    nnz_loc, m_loc, fst_row, icol;
    int      iam;
    int      ldx;  /* LDA for matrix X (local). */
    char     equed[1], norm[1];
    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
    double   *X, *b_col, *b_work, *x_col;
    double   t;
    static mem_usage_t num_mem_usage, symb_mem_usage;
#if ( PRNTlevel>= 2 )
    double   dmin, dsum, dprod;
#endif
    int_t procs;

    /* Structures needed for parallel symbolic factorization */
    int_t *sizes, *fstVtxSep, parSymbFact;
    int   noDomains, nprocs_num;
    MPI_Comm symb_comm; /* communicator for symbolic factorization */
    int   col, key; /* parameters for creating a new communicator */
    Pslu_freeable_t Pslu_freeable;
    float  flinfo;

    /* Initialization. */
    m       = A->nrow;
    n       = A->ncol;
    Astore  = (NRformat_loc *) A->Store;
    nnz_loc = Astore->nnz_loc;
    m_loc   = Astore->m_loc;
    fst_row = Astore->fst_row;
    a       = (double *) Astore->nzval;
    rowptr  = Astore->rowptr;
    colind  = Astore->colind;
    sizes   = NULL;
    fstVtxSep = NULL;
    symb_comm = MPI_COMM_NULL;

    /* Test the input parameters. */
    *info = 0;
    Fact = options->Fact;
    if ( Fact < 0 || Fact > FACTORED )
	*info = -1;
    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
	*info = -1;
    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
	*info = -1;
    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
	*info = -1;
    else if ( options->IterRefine == EXTRA ) {
	*info = -1;
	fprintf(stderr, "Extra precise iterative refinement yet to support.");
    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
		|| A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -2;
    else if ( ldb < m_loc )
	*info = -5;
    else if ( nrhs < 0 )
	*info = -6;
    if ( *info ) {
	i = -(*info);
	pxerbla("pdgssvx", grid, -*info);
	return;
    }

    factored = (Fact == FACTORED);
    Equil = (!factored && options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    iam = grid->iam;
    job = 5;
    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
	rowequ = (ScalePermstruct->DiagScale == ROW) ||
	         (ScalePermstruct->DiagScale == BOTH);
	colequ = (ScalePermstruct->DiagScale == COL) ||
	         (ScalePermstruct->DiagScale == BOTH);
    } else rowequ = colequ = FALSE;

    /* The following arrays are replicated on all processes. */
    perm_r = ScalePermstruct->perm_r;
    perm_c = ScalePermstruct->perm_c;
    etree = LUstruct->etree;
    R = ScalePermstruct->R;
    C = ScalePermstruct->C;
    /********/

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgssvx()");
#endif

    /* Not factored & ask for equilibration */
    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
	/* Allocate storage if not done so before. */
	switch ( ScalePermstruct->DiagScale ) {
	    case NOEQUIL:
		if ( !(R = (double *) doubleMalloc_dist(m)) )
		    ABORT("Malloc fails for R[].");
	        if ( !(C = (double *) doubleMalloc_dist(n)) )
		    ABORT("Malloc fails for C[].");
		ScalePermstruct->R = R;
		ScalePermstruct->C = C;
		break;
	    case ROW: 
	        if ( !(C = (double *) doubleMalloc_dist(n)) )
		    ABORT("Malloc fails for C[].");
		ScalePermstruct->C = C;
		break;
	    case COL: 
		if ( !(R = (double *) doubleMalloc_dist(m)) )
		    ABORT("Malloc fails for R[].");
		ScalePermstruct->R = R;
		break;
	}
    }

    /* ------------------------------------------------------------
       Diagonal scaling to equilibrate the matrix.
       ------------------------------------------------------------*/
    if ( Equil ) {
#if ( DEBUGlevel>=1 )
	CHECK_MALLOC(iam, "Enter equil");
#endif
	t = SuperLU_timer_();

	if ( Fact == SamePattern_SameRowPerm ) {
	    /* Reuse R and C. */
	    switch ( ScalePermstruct->DiagScale ) {
	      case NOEQUIL:
		break;
	      case ROW:
		irow = fst_row;
		for (j = 0; j < m_loc; ++j) {
		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
			a[i] *= R[irow];       /* Scale rows. */
		    }
		    ++irow;
		}
		break;
	      case COL:
		for (j = 0; j < m_loc; ++j)
		    for (i = rowptr[j]; i < rowptr[j+1]; ++i){
		        icol = colind[i];
			a[i] *= C[icol];          /* Scale columns. */
		    }
		break;
	      case BOTH:
		irow = fst_row;
		for (j = 0; j < m_loc; ++j) {
		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
			icol = colind[i];
			a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */
		    }
		    ++irow;
		}
	        break;
	    }
	} else { /* Compute R & C from scratch */
            /* Compute the row and column scalings. */
	    pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);

	    /* Equilibrate matrix A if it is badly-scaled. */
	    pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed);

	    if ( lsame_(equed, "R") ) {
		ScalePermstruct->DiagScale = rowequ = ROW;
	    } else if ( lsame_(equed, "C") ) {
		ScalePermstruct->DiagScale = colequ = COL;
	    } else if ( lsame_(equed, "B") ) {
		ScalePermstruct->DiagScale = BOTH;
		rowequ = ROW;
		colequ = COL;
	    } else ScalePermstruct->DiagScale = NOEQUIL;

#if ( PRNTlevel>=1 )
	    if ( !iam ) {
		printf(".. equilibrated? *equed = %c\n", *equed);
		/*fflush(stdout);*/
	    }
#endif
	} /* if Fact ... */

	stat->utime[EQUIL] = SuperLU_timer_() - t;
#if ( DEBUGlevel>=1 )
	CHECK_MALLOC(iam, "Exit equil");
#endif
    } /* if Equil ... */

    if ( !factored ) { /* Skip this if already factored. */
        /*
         * Gather A from the distributed compressed row format to
         * global A in compressed column format.
         * Numerical values are gathered only when a row permutation
         * for large diagonal is sought after.
         */
	if ( Fact != SamePattern_SameRowPerm ) {
            need_value = (options->RowPerm == LargeDiag);
            pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
            GAstore = (NCformat *) GA.Store;
            colptr = GAstore->colptr;
            rowind = GAstore->rowind;
            nnz = GAstore->nnz;
            if ( need_value ) a_GA = (double *) GAstore->nzval;
            else assert(GAstore->nzval == NULL);
	}

        /* ------------------------------------------------------------
           Find the row permutation for A.
           ------------------------------------------------------------*/
        if ( options->RowPerm != NO ) {
	    t = SuperLU_timer_();
	    if ( Fact != SamePattern_SameRowPerm ) {
	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
	            /* Permute the global matrix GA for symbfact() */
	            for (i = 0; i < colptr[n]; ++i) {
	            	irow = rowind[i]; 
		    	rowind[i] = perm_r[irow];
	            }
	        } else { /* options->RowPerm == LargeDiag */
	            /* Get a new perm_r[] */
	            if ( job == 5 ) {
		        /* Allocate storage for scaling factors. */
		        if ( !(R1 = doubleMalloc_dist(m)) )
		            ABORT("SUPERLU_MALLOC fails for R1[]");
		    	if ( !(C1 = doubleMalloc_dist(n)) )
		            ABORT("SUPERLU_MALLOC fails for C1[]");
	            }

	            if ( !iam ) {
		        /* Process 0 finds a row permutation */
		        dldperm(job, m, nnz, colptr, rowind, a_GA,
		                perm_r, R1, C1);
		
		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
		        if ( job == 5 && Equil ) {
		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
		        }
	            } else {
		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
		        if ( job == 5 && Equil ) {
		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
		        }
	            }

#if ( PRNTlevel>=2 )
	            dmin = dlamch_("Overflow");
	            dsum = 0.0;
	            dprod = 1.0;
#endif
	            if ( job == 5 ) {
		        if ( Equil ) {
		            for (i = 0; i < n; ++i) {
			        R1[i] = exp(R1[i]);
			        C1[i] = exp(C1[i]);
		            }

		            /* Scale the distributed matrix */
		            irow = fst_row;
		            for (j = 0; j < m_loc; ++j) {
			        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
			            icol = colind[i];
			            a[i] *= R1[irow] * C1[icol];
#if ( PRNTlevel>=2 )
			            if ( perm_r[irow] == icol ) { /* New diagonal */
			              if ( job == 2 || job == 3 )
				        dmin = SUPERLU_MIN(dmin, fabs(a[i]));
			              else if ( job == 4 )
				        dsum += fabs(a[i]);
			              else if ( job == 5 )
				        dprod *= fabs(a[i]);
			            }
#endif
			        }
			        ++irow;
		            }

		            /* Multiply together the scaling factors. */
		            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
		            else for (i = 0; i < m; ++i) R[i] = R1[i];
		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
		            else for (i = 0; i < n; ++i) C[i] = C1[i];
		    
		            ScalePermstruct->DiagScale = BOTH;
		            rowequ = colequ = 1;

		        } /* end Equil */

                        /* Now permute global A to prepare for symbfact() */
                        for (j = 0; j < n; ++j) {
		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
	                        irow = rowind[i];
		                rowind[i] = perm_r[irow];
		            }
		        }
		        SUPERLU_FREE (R1);
		        SUPERLU_FREE (C1);
	            } else { /* job = 2,3,4 */
		        for (j = 0; j < n; ++j) {
		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
			        irow = rowind[i];
			        rowind[i] = perm_r[irow];
		            } /* end for i ... */
		        } /* end for j ... */
	            } /* end else job ... */

#if ( PRNTlevel>=2 )
	            if ( job == 2 || job == 3 ) {
		        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
	            } else if ( job == 4 ) {
		        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
	            } else if ( job == 5 ) {
		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
	            }
#endif
	    
                } /* end if options->RowPerm ... */

	        t = SuperLU_timer_() - t;
	        stat->utime[ROWPERM] = t;
#if ( PRNTlevel>=1 )
	        if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
#endif
            } /* end if Fact ... */
        } else { /* options->RowPerm == NOROWPERM */
            for (i = 0; i < m; ++i) perm_r[i] = i;
        }

#if ( DEBUGlevel>=2 )
        if ( !iam ) PrintInt10("perm_r",  m, perm_r);
#endif
    } /* end if (!factored) */

    if ( !factored || options->IterRefine ) {
	/* Compute norm(A), which will be used to adjust small diagonal. */
	if ( notran ) *(unsigned char *)norm = '1';
	else *(unsigned char *)norm = 'I';
	anorm = pdlangs(norm, A, grid);
#if ( PRNTlevel>=1 )
	if ( !iam ) printf(".. anorm %e\n", anorm);
#endif
    }

    /* ------------------------------------------------------------
       Perform the LU factorization.
       ------------------------------------------------------------*/
    if ( !factored ) {
	t = SuperLU_timer_();
	/*
	 * Get column permutation vector perm_c[], according to permc_spec:
	 *   permc_spec = NATURAL:  natural ordering 
	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
	 *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
	 */
	permc_spec = options->ColPerm;
	parSymbFact = options->ParSymbFact;

#if ( PRNTlevel>=1 )
	if ( parSymbFact && permc_spec != PARMETIS )
	    if ( !iam ) printf(".. Parallel symbolic factorization"
			       " only works wth ParMetis!\n");
#endif

	if ( parSymbFact == YES || permc_spec == PARMETIS ) {	
	    nprocs_num = grid->nprow * grid->npcol;
  	    noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num ))));

	    /* create a new communicator for the first noDomains processors in
	       grid->comm */
	    key = iam;
    	    if (iam < noDomains) col = 0;
	    else col = MPI_UNDEFINED;
	    MPI_Comm_split (grid->comm, col, key, &symb_comm );

	    permc_spec = PARMETIS; /* only works with PARMETIS */
        }

	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
	  if ( permc_spec == PARMETIS ) {
	      /* Get column permutation vector in perm_c.                    *
	       * This routine takes as input the distributed input matrix A  *
	       * and does not modify it.  It also allocates memory for       *
	       * sizes[] and fstVtxSep[] arrays, that contain information    *
	       * on the separator tree computed by ParMETIS.                 */
	      flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
                                  	   noDomains, &sizes, &fstVtxSep,
                                           grid, &symb_comm);
	      if (flinfo > 0)
	          ABORT("ERROR in get perm_c parmetis.");
	  } else {
	      get_perm_c_dist(iam, permc_spec, &GA, perm_c);
          }
        }

	stat->utime[COLPERM] = SuperLU_timer_() - t;

	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
	   (a.k.a. column etree), depending on the choice of ColPerm.
	   Adjust perm_c[] to be consistent with a postorder of etree.
	   Permute columns of A to form A*Pc'. */
	if ( Fact != SamePattern_SameRowPerm ) {
	    if ( parSymbFact == NO ) {
	        int_t *GACcolbeg, *GACcolend, *GACrowind;

	        sp_colorder(options, &GA, perm_c, etree, &GAC); 

	        /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
	        GACstore = (NCPformat *) GAC.Store;
	        GACcolbeg = GACstore->colbeg;
	        GACcolend = GACstore->colend;
	        GACrowind = GACstore->rowind;
	        for (j = 0; j < n; ++j) {
	            for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
		        irow = GACrowind[i];
		        GACrowind[i] = perm_c[irow];
	            }
	        }

	        /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
                   the nonzero data structures for L & U. */
#if ( PRNTlevel>=1 ) 
                if ( !iam )
		  printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
#endif
  	        t = SuperLU_timer_();
	        if ( !(Glu_freeable = (Glu_freeable_t *)
		      SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
		    ABORT("Malloc fails for Glu_freeable.");

	    	/* Every process does this. */
	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
			     	 Glu_persist, Glu_freeable);

	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
	    	if ( iinfo < 0 ) { /* Successful return */
		    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
#if ( PRNTlevel>=1 )
		    if ( !iam ) {
		    	printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
		    	printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
		    	printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
		    	printf("\tint %d, short %d, float %d, double %d\n", 
			       sizeof(int_t), sizeof(short), sizeof(float),
			       sizeof(double));
		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
			   	symb_mem_usage.for_lu*1e-6, 
			   	symb_mem_usage.total*1e-6,
			   	symb_mem_usage.expansions);
		    }
#endif
	    	} else {
		    if ( !iam ) {
		        fprintf(stderr,"symbfact() error returns %d\n",iinfo);
		    	exit(-1);
		    }
	        }
	    } /* end if serial symbolic factorization */
	    else {  /* parallel symbolic factorization */
	    	t = SuperLU_timer_();
	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
				       sizes, fstVtxSep, &Pslu_freeable, 
				       &(grid->comm), &symb_comm,
				       &symb_mem_usage); 
	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
	    	if (flinfo > 0) 
	      	    ABORT("Insufficient memory for parallel symbolic factorization.");
	    }
	} /* end if Fact ... */

#if ( PRNTlevel>=1 )
	if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]);
#endif
        if (sizes) SUPERLU_FREE (sizes);
        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
	if (symb_comm != MPI_COMM_NULL)
	  MPI_Comm_free (&symb_comm); 

	if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
  	    /* Apply column permutation to the original distributed A */
	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];

	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
	       NOTE: the row permutation Pc*Pr is applied internally in the
  	       distribution routine. */
	    t = SuperLU_timer_();
	    dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct,
                                      Glu_freeable, LUstruct, grid);
	    stat->utime[DIST] = SuperLU_timer_() - t;

  	    /* Deallocate storage used in symbolic factorization. */
	    if ( Fact != SamePattern_SameRowPerm ) {
	        iinfo = symbfact_SubFree(Glu_freeable);
	        SUPERLU_FREE(Glu_freeable);
	    }
	} else {
	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
	       NOTE: the row permutation Pc*Pr is applied internally in the
	       distribution routine. */
	    /* Apply column permutation to the original distributed A */
	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];

    	    t = SuperLU_timer_();
	    dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct,
		  			   &Pslu_freeable, LUstruct, grid);
	    if (dist_mem_use > 0)
	        ABORT ("Not enough memory available for dist_psymbtonum\n");
	    stat->utime[DIST] = SuperLU_timer_() - t;
	}

#if ( PRNTlevel>=1 )
	if (!iam) printf ("\tDISTRIBUTE time    %8.2f\n", stat->utime[DIST]);
#endif

	/* Perform numerical factorization in parallel. */
	t = SuperLU_timer_();
	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
	stat->utime[FACT] = SuperLU_timer_() - t;

#if ( PRNTlevel>=1 )
	{
	    int_t TinyPivots;
	    float for_lu, total, max, avg, temp;
	    dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    MPI_Reduce( &num_mem_usage.total, &total,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    temp = SUPERLU_MAX(symb_mem_usage.total,
			       symb_mem_usage.for_lu +
			       (float)dist_mem_use + num_mem_usage.for_lu);
	    if (parSymbFact == TRUE)
	      /* The memory used in the redistribution routine
		 includes the memory used for storing the symbolic
		 structure and the memory allocated for numerical
		 factorization */
	      temp = SUPERLU_MAX(symb_mem_usage.total,
				 (float)dist_mem_use);
	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
	    MPI_Reduce( &temp, &max,
		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
	    MPI_Reduce( &temp, &avg,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
			  MPI_SUM, grid->comm );
	    stat->TinyPivots = TinyPivots;
	    if ( !iam ) {
		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
		       for_lu*1e-6, total*1e-6);
		printf("\tAll space (MB):"
		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
	    }
	}
#endif
    
        /* Destroy GA */
        if ( Fact != SamePattern_SameRowPerm )
            Destroy_CompCol_Matrix_dist(&GA);
    } /* end if (!factored) */
	
    /* ------------------------------------------------------------
       Compute the solution matrix X.
       ------------------------------------------------------------*/
    if ( nrhs ) {

	if ( !(b_work = doubleMalloc_dist(n)) )
	    ABORT("Malloc fails for b_work[]");

	/* ------------------------------------------------------------
	   Scale the right-hand side if equilibration was performed. 
	   ------------------------------------------------------------*/
	if ( notran ) {
	    if ( rowequ ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    irow = fst_row;
		    for (i = 0; i < m_loc; ++i) {
		        b_col[i] *= R[irow];
		        ++irow;
		    }
		    b_col += ldb;
		}
	    }
	} else if ( colequ ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
	        irow = fst_row;
		for (i = 0; i < m_loc; ++i) {
		    b_col[i] *= C[irow];
		    ++irow;
		}
		b_col += ldb;
	    }
	}

	/* Save a copy of the right-hand side. */
	ldx = ldb;
	if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
	    ABORT("Malloc fails for X[]");
	x_col = X;  b_col = B;
	for (j = 0; j < nrhs; ++j) {
	    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
	    x_col += ldx;  b_col += ldb;
	}

	/* ------------------------------------------------------------
	   Solve the linear system.
	   ------------------------------------------------------------*/
	if ( options->SolveInitialized == NO ) {
	    dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
		       SOLVEstruct);
	}

	pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
		fst_row, ldb, nrhs, SOLVEstruct, stat, info);

	/* ------------------------------------------------------------
	   Use iterative refinement to improve the computed solution and
	   compute error bounds and backward error estimates for it.
	   ------------------------------------------------------------*/
	if ( options->IterRefine ) {
	    /* Improve the solution by iterative refinement. */
	    int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
	    SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */

	    t = SuperLU_timer_();
	    if ( options->RefineInitialized == NO || Fact == DOFACT ) {
	        /* All these cases need to re-initialize gsmv structure */
	        if ( options->RefineInitialized )
		    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
	        pdgsmv_init(A, SOLVEstruct->row_to_proc, grid,
			    SOLVEstruct->gsmv_comm);
	       
                /* Save a copy of the transformed local col indices
		   in colind_gsmv[]. */
	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
	        if ( !(it = intMalloc_dist(nnz_loc)) )
		    ABORT("Malloc fails for colind_gsmv[]");
	        colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
	        for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
	        options->RefineInitialized = YES;
	    } else if ( Fact == SamePattern ||
			Fact == SamePattern_SameRowPerm ) {
	        double at;
	        int_t k, jcol, p;
	        /* Swap to beginning the part of A corresponding to the
		   local part of X, as was done in pdgsmv_init() */
	        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
		    k = rowptr[i];
		    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
		        jcol = colind[j];
		        p = SOLVEstruct->row_to_proc[jcol];
		        if ( p == iam ) { /* Local */
		            at = a[k]; a[k] = a[j]; a[j] = at;
		            ++k;
		        }
		    }
	        }
	      
	        /* Re-use the local col indices of A obtained from the
		   previous call to pdgsmv_init() */
	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
	    }

	    if ( nrhs == 1 ) { /* Use the existing solve structure */
	        SOLVEstruct1 = SOLVEstruct;
	    } else { /* For nrhs > 1, since refinement is performed for RHS
			one at a time, the communication structure for pdgstrs
			is different than the solve with nrhs RHS. 
			So we use SOLVEstruct1 for the refinement step.
		     */
	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
		    ABORT("Malloc fails for SOLVEstruct1");
	        /* Copy the same stuff */
	        SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
	        SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
	        SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
	        SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
		
		/* Initialize the *gstrs_comm for 1 RHS. */
		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
		    ABORT("Malloc fails for gstrs_comm[]");
		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
			     Glu_persist, SOLVEstruct1);
	    }

	    pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
		    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);

            /* Deallocate the storage associated with SOLVEstruct1 */
	    if ( nrhs > 1 ) {
	        pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
	        SUPERLU_FREE(SOLVEstruct1);
	    }

	    stat->utime[REFINE] = SuperLU_timer_() - t;
	}

	/* Permute the solution matrix B <= Pc'*X. */
	pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
			       SOLVEstruct->inv_perm_c,
			       X, ldx, B, ldb, nrhs, grid);
#if ( DEBUGlevel>=2 )
	printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
	for (i = 0; i < m_loc; ++i)
	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
#endif
	
	/* Transform the solution matrix X to a solution of the original
	   system before the equilibration. */
	if ( notran ) {
	    if ( colequ ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    irow = fst_row;
		    for (i = 0; i < m_loc; ++i) {
		        b_col[i] *= C[irow];
		        ++irow;
		    }
		    b_col += ldb;
		}
	    }
	} else if ( rowequ ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
	        irow = fst_row;
		for (i = 0; i < m_loc; ++i) {
		    b_col[i] *= R[irow];
		    ++irow;
		}
		b_col += ldb;
	    }
	}

	SUPERLU_FREE(b_work);
	SUPERLU_FREE(X);

    } /* end if nrhs != 0 */

#if ( PRNTlevel>=1 )
    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
#endif

    /* Deallocate R and/or C if it was not used. */
    if ( Equil && Fact != SamePattern_SameRowPerm ) {
	switch ( ScalePermstruct->DiagScale ) {
	    case NOEQUIL:
	        SUPERLU_FREE(R);
		SUPERLU_FREE(C);
		break;
	    case ROW: 
		SUPERLU_FREE(C);
		break;
	    case COL: 
		SUPERLU_FREE(R);
		break;
	}
    }
    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
 	Destroy_CompCol_Permuted_dist(&GAC);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgssvx()");
#endif

}
Ejemplo n.º 8
0
void
dgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
        double *colcnd, double *amax, int *info)
{
/*    
    Purpose   
    =======   

    dgsequ() computes row and column scalings intended to equilibrate an   
    M-by-N sparse matrix A and reduce its condition number. R returns the row
    scale factors and C the column scale factors, chosen to try to make   
    the largest element in each row and column of the matrix B with   
    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   

    R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
    number and BIGNUM = largest safe number.  Use of these scaling   
    factors is not guaranteed to reduce the condition number of A but   
    works well in practice.   

    See supermatrix.h for the definition of 'SuperMatrix' structure.
 
    Arguments   
    =========   

    A       (input) SuperMatrix*
            The matrix of dimension (A->nrow, A->ncol) whose equilibration
            factors are to be computed. The type of A can be:
            Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
	    
    R       (output) double*, size A->nrow
            If INFO = 0 or INFO > M, R contains the row scale factors   
            for A.
	    
    C       (output) double*, size A->ncol
            If INFO = 0,  C contains the column scale factors for A.
	    
    ROWCND  (output) double*
            If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
            AMAX is neither too large nor too small, it is not worth   
            scaling by R.
	    
    COLCND  (output) double*
            If INFO = 0, COLCND contains the ratio of the smallest   
            C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
            worth scaling by C.
	    
    AMAX    (output) double*
            Absolute value of largest matrix element.  If AMAX is very   
            close to overflow or very close to underflow, the matrix   
            should be scaled.
	    
    INFO    (output) int*
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i,  and i is   
                  <= M:  the i-th row of A is exactly zero   
                  >  M:  the (i-M)-th column of A is exactly zero   

    ===================================================================== 
*/

    /* Local variables */
    NCformat *Astore;
    double   *Aval;
    int i, j, irow;
    double rcmin, rcmax;
    double bignum, smlnum;
    extern double dlamch_(char *);
    
    /* Test the input parameters. */
    *info = 0;
    if ( A->nrow < 0 || A->ncol < 0 ||
	 A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -1;
    if (*info != 0) {
	i = -(*info);
	xerbla_("dgsequ", &i);
	return;
    }

    /* Quick return if possible */
    if ( A->nrow == 0 || A->ncol == 0 ) {
	*rowcnd = 1.;
	*colcnd = 1.;
	*amax = 0.;
	return;
    }

    Astore = A->Store;
    Aval = Astore->nzval;
    
    /* Get machine constants. */
    smlnum = dlamch_("S");
    bignum = 1. / smlnum;

    /* Compute row scale factors. */
    for (i = 0; i < A->nrow; ++i) r[i] = 0.;

    /* Find the maximum element in each row. */
    for (j = 0; j < A->ncol; ++j)
        for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
            irow = Astore->rowind[i];
            r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) );
	}

    /* Find the maximum and minimum scale factors. */
    rcmin = bignum;
    rcmax = 0.;
    for (i = 0; i < A->nrow; ++i) {
	rcmax = SUPERLU_MAX(rcmax, r[i]);
	rcmin = SUPERLU_MIN(rcmin, r[i]);
    }
    *amax = rcmax;

    if (rcmin == 0.) {
	/* Find the first zero scale factor and return an error code. */
	for (i = 0; i < A->nrow; ++i)
	    if (r[i] == 0.) {
		*info = i + 1;
		return;
	    }
    } else {
	/* Invert the scale factors. */
	for (i = 0; i < A->nrow; ++i)
	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
	/* Compute ROWCND = min(R(I)) / max(R(I)) */
	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
    }

    /* Compute column scale factors */
    for (j = 0; j < A->ncol; ++j) c[j] = 0.;

    /* Find the maximum element in each column, assuming the row
       scalings computed above. */
    for (j = 0; j < A->ncol; ++j)
	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
	    irow = Astore->rowind[i];
            c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] );
	}

    /* Find the maximum and minimum scale factors. */
    rcmin = bignum;
    rcmax = 0.;
    for (j = 0; j < A->ncol; ++j) {
	rcmax = SUPERLU_MAX(rcmax, c[j]);
	rcmin = SUPERLU_MIN(rcmin, c[j]);
    }

    if (rcmin == 0.) {
	/* Find the first zero scale factor and return an error code. */
	for (j = 0; j < A->ncol; ++j)
	    if ( c[j] == 0. ) {
		*info = A->nrow + j + 1;
		return;
	    }
    } else {
	/* Invert the scale factors. */
	for (j = 0; j < A->ncol; ++j)
	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
	/* Compute COLCND = min(C(J)) / max(C(J)) */
	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
    }

    return;

} /* dgsequ */
Ejemplo n.º 9
0
void
zgstrf (char *refact, SuperMatrix *A, double diag_pivot_thresh, 
	double drop_tol, int relax, int panel_size, int *etree, 
	void *work, int lwork, int *perm_r, int *perm_c, 
	SuperMatrix *L, SuperMatrix *U, int *info)
{
/*
 * Purpose
 * =======
 *
 * ZGSTRF computes an LU factorization of a general sparse m-by-n
 * matrix A using partial pivoting with row interchanges.
 * The factorization has the form
 *     Pr * A = L * U
 * where Pr is a row permutation matrix, L is lower triangular with unit
 * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper 
 * triangular (upper trapezoidal if A->nrow < A->ncol).
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * refact (input) char*
 *          Specifies whether we want to use perm_r from a previous factor.
 *          = 'Y': re-use perm_r; perm_r is input, and may be modified due to
 *                 different pivoting determined by diagonal threshold.
 *          = 'N': perm_r is determined by partial pivoting, and output.
 *
 * A        (input) SuperMatrix*
 *	    Original matrix A, permuted by columns, of dimension
 *          (A->nrow, A->ncol). The type of A can be:
 *          Stype = SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE.
 *
 * diag_pivot_thresh (input) double
 *	    Diagonal pivoting threshold. At step j of the Gaussian elimination,
 *          if abs(A_jj) >= thresh * (max_(i>=j) abs(A_ij)), use A_jj as pivot.
 *	    0 <= thresh <= 1. The default value of thresh is 1, corresponding
 *          to partial pivoting.
 *
 * drop_tol (input) double (NOT IMPLEMENTED)
 *	    Drop tolerance parameter. At step j of the Gaussian elimination,
 *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
 *          0 <= drop_tol <= 1. The default value of drop_tol is 0.
 *
 * relax    (input) int
 *          To control degree of relaxing supernodes. If the number
 *          of nodes (columns) in a subtree of the elimination tree is less
 *          than relax, this subtree is considered as one supernode,
 *          regardless of the row structures of those columns.
 *
 * panel_size (input) int
 *          A panel consists of at most panel_size consecutive columns.
 *
 * etree    (input) int*, dimension (A->ncol)
 *          Elimination tree of A'*A.
 *          Note: etree is a vector of parent pointers for a forest whose
 *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *          On input, the columns of A should be permuted so that the
 *          etree is in a certain postorder.
 *
 * work     (input/output) void*, size (lwork) (in bytes)
 *          User-supplied work space and space for the output data structures.
 *          Not referenced if lwork = 0;
 *
 * lwork   (input) int
 *         Specifies the size of work array in bytes.
 *         = 0:  allocate space internally by system malloc;
 *         > 0:  use user-supplied work array of length lwork in bytes,
 *               returns error if space runs out.
 *         = -1: the routine guesses the amount of space needed without
 *               performing the factorization, and returns it in
 *               *info; no other side effects.
 *
 * perm_r   (input/output) int*, dimension (A->nrow)
 *          Row permutation vector which defines the permutation matrix Pr,
 *          perm_r[i] = j means row i of A is in position j in Pr*A.
 *          If refact is not 'Y', perm_r is output argument;
 *          If refact = 'Y', the pivoting routine will try to use the input
 *          perm_r, unless a certain threshold criterion is violated.
 *          In that case, perm_r is overwritten by a new permutation
 *          determined by partial pivoting or diagonal threshold pivoting.
 *
 * perm_c   (input) int*, dimension (A->ncol)
 *	    Column permutation vector, which defines the 
 *          permutation matrix Pc; perm_c[i] = j means column i of A is 
 *          in position j in A*Pc.
 *          When searching for diagonal, perm_c[*] is applied to the
 *          row subscripts of A, so that diagonal threshold pivoting
 *          can find the diagonal of A, rather than that of A*Pc.
 *
 * L        (output) SuperMatrix*
 *          The factor L from the factorization Pr*A=L*U; use compressed row 
 *          subscripts storage for supernodes, i.e., L has type: 
 *          Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
 *
 * U        (output) SuperMatrix*
 *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
 *          storage scheme, i.e., U has types: Stype = SLU_NC, 
 *          Dtype = SLU_Z, Mtype = SLU_TRU.
 *
 * info     (output) int*
 *          = 0: successful exit
 *          < 0: if info = -i, the i-th argument had an illegal value
 *          > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                and division by zero will occur if it is used to solve a
 *                system of equations.
 *             > A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol. If lwork = -1, it is
 *                the estimated amount of space needed, plus A->ncol.
 *
 * ======================================================================
 *
 * Local Working Arrays: 
 * ======================
 *   m = number of rows in the matrix
 *   n = number of columns in the matrix
 *
 *   xprune[0:n-1]: xprune[*] points to locations in subscript 
 *	vector lsub[*]. For column i, xprune[i] denotes the point where 
 *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need 
 *	to be traversed for symbolic factorization.
 *
 *   marker[0:3*m-1]: marker[i] = j means that node i has been 
 *	reached when working on column j.
 *	Storage: relative to original row subscripts
 *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs, 
 *	      see zpanel_dfs.c; marker2 is used for inner-factorization,
 *            see zcolumn_dfs.c.
 *
 *   parent[0:m-1]: parent vector used during dfs
 *      Storage: relative to new row subscripts
 *
 *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs) 
 *	unexplored neighbor of i in lsub[*]
 *
 *   segrep[0:nseg-1]: contains the list of supernodal representatives
 *	in topological order of the dfs. A supernode representative is the 
 *	last column of a supernode.
 *      The maximum size of segrep[] is n.
 *
 *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a 
 *	supernodal representative r, repfnz[r] is the location of the first 
 *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
 *	indicates the supernode r has been explored.
 *	NOTE: There are W of them, each used for one column of a panel. 
 *
 *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below 
 *      the panel diagonal. These are filled in during zpanel_dfs(), and are
 *      used later in the inner LU factorization within the panel.
 *	panel_lsub[]/dense[] pair forms the SPA data structure.
 *	NOTE: There are W of them.
 *
 *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
 *	    	   NOTE: there are W of them.
 *
 *   tempv[0:*]: real temporary used for dense numeric kernels;
 *	The size of this array is defined by NUM_TEMPV() in zsp_defs.h.
 *
 */
    /* Local working arrays */
    NCPformat *Astore;
    int       *iperm_r; /* inverse of perm_r; not used if refact = 'N' */
    int       *iperm_c; /* inverse of perm_c */
    int       *iwork;
    doublecomplex    *zwork;
    int	      *segrep, *repfnz, *parent, *xplore;
    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
    int	      *xprune;
    int	      *marker;
    doublecomplex    *dense, *tempv;
    int       *relax_end;
    doublecomplex    *a;
    int       *asub;
    int       *xa_begin, *xa_end;
    int       *xsup, *supno;
    int       *xlsub, *xlusup, *xusub;
    int       nzlumax;
    static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */

    /* Local scalars */
    int       pivrow;   /* pivotal row number in the original matrix A */
    int       nseg1;	/* no of segments in U-column above panel row jcol */
    int       nseg;	/* no of segments in each U-column */
    register int jcol;	
    register int kcol;	/* end column of a relaxed snode */
    register int icol;
    register int i, k, jj, new_next, iinfo;
    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
    int       w_def;	/* upper bound on panel width */
    int       usepr, iperm_r_allocated = 0;
    int       nnzL, nnzU;
    extern SuperLUStat_t SuperLUStat;
    int       *panel_histo = SuperLUStat.panel_histo;
    flops_t   *ops = SuperLUStat.ops;

    iinfo    = 0;
    m        = A->nrow;
    n        = A->ncol;
    min_mn   = SUPERLU_MIN(m, n);
    Astore   = A->Store;
    a        = Astore->nzval;
    asub     = Astore->rowind;
    xa_begin = Astore->colbeg;
    xa_end   = Astore->colend;

    /* Allocate storage common to the factor routines */
    *info = zLUMemInit(refact, work, lwork, m, n, Astore->nnz,
		      panel_size, L, U, &Glu, &iwork, &zwork);
    if ( *info ) return;
    
    xsup    = Glu.xsup;
    supno   = Glu.supno;
    xlsub   = Glu.xlsub;
    xlusup  = Glu.xlusup;
    xusub   = Glu.xusub;
    
    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
	     &repfnz, &panel_lsub, &xprune, &marker);
    zSetRWork(m, panel_size, zwork, &dense, &tempv);
    
    usepr = lsame_(refact, "Y");
    if ( usepr ) {
	/* Compute the inverse of perm_r */
	iperm_r = (int *) intMalloc(m);
	for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
	iperm_r_allocated = 1;
    }
    iperm_c = (int *) intMalloc(n);
    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;

    /* Identify relaxed snodes */
    relax_end = (int *) intMalloc(n);
    relax_snode(n, etree, relax, marker, relax_end); 
    
    ifill (perm_r, m, EMPTY);
    ifill (marker, m * NO_MARKER, EMPTY);
    supno[0] = -1;
    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
    w_def    = panel_size;

    /* 
     * Work on one "panel" at a time. A panel is one of the following: 
     *	   (a) a relaxed supernode at the bottom of the etree, or
     *	   (b) panel_size contiguous columns, defined by the user
     */
    for (jcol = 0; jcol < min_mn; ) {

	if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
   	    kcol = relax_end[jcol];	  /* end of the relaxed snode */
	    panel_histo[kcol-jcol+1]++;

	    /* --------------------------------------
	     * Factorize the relaxed supernode(jcol:kcol) 
	     * -------------------------------------- */
	    /* Determine the union of the row structure of the snode */
	    if ( (*info = zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
				    xprune, marker, &Glu)) != 0 )
		return;

            nextu    = xusub[jcol];
	    nextlu   = xlusup[jcol];
	    jsupno   = supno[jcol];
	    fsupc    = xsup[jsupno];
	    new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
	    nzlumax = Glu.nzlumax;
	    while ( new_next > nzlumax ) {
		if ( *info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu) )
		    return;
	    }
    
	    for (icol = jcol; icol<= kcol; icol++) {
		xusub[icol+1] = nextu;
		
    		/* Scatter into SPA dense[*] */
    		for (k = xa_begin[icol]; k < xa_end[icol]; k++)
        	    dense[asub[k]] = a[k];

	       	/* Numeric update within the snode */
	        zsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu);

		if ( *info = zpivotL(icol, diag_pivot_thresh, &usepr, perm_r,
				    iperm_r, iperm_c, &pivrow, &Glu) )
		    if ( iinfo == 0 ) iinfo = *info;
		
#ifdef DEBUG
		zprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu);
#endif

	    }

	    jcol = icol;

	} else { /* Work on one panel of panel_size columns */
	    
	    /* Adjust panel_size so that a panel won't overlap with the next 
	     * relaxed snode.
	     */
	    panel_size = w_def;
	    for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) 
		if ( relax_end[k] != EMPTY ) {
		    panel_size = k - jcol;
		    break;
		}
	    if ( k == min_mn ) panel_size = min_mn - jcol;
	    panel_histo[panel_size]++;

	    /* symbolic factor on a panel of columns */
	    zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
		      dense, panel_lsub, segrep, repfnz, xprune,
		      marker, parent, xplore, &Glu);
	    
	    /* numeric sup-panel updates in topological order */
	    zpanel_bmod(m, panel_size, jcol, nseg1, dense,
		       tempv, segrep, repfnz, &Glu);
	    
	    /* Sparse LU within the panel, and below panel diagonal */
    	    for ( jj = jcol; jj < jcol + panel_size; jj++) {
 		k = (jj - jcol) * m; /* column index for w-wide arrays */

		nseg = nseg1;	/* Begin after all the panel segments */

	    	if ((*info = zcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
					segrep, &repfnz[k], xprune, marker,
					parent, xplore, &Glu)) != 0) return;

	      	/* Numeric updates */
	    	if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k],
					 tempv, &segrep[nseg1], &repfnz[k],
					 jcol, &Glu)) != 0) return;
		
	        /* Copy the U-segments to ucol[*] */
		if ((*info = zcopy_to_ucol(jj, nseg, segrep, &repfnz[k],
					  perm_r, &dense[k], &Glu)) != 0)
		    return;

	    	if ( *info = zpivotL(jj, diag_pivot_thresh, &usepr, perm_r,
				    iperm_r, iperm_c, &pivrow, &Glu) )
		    if ( iinfo == 0 ) iinfo = *info;

		/* Prune columns (0:jj-1) using column jj */
	    	zpruneL(jj, perm_r, pivrow, nseg, segrep,
		       &repfnz[k], xprune, &Glu);

		/* Reset repfnz[] for this column */
	    	resetrep_col (nseg, segrep, &repfnz[k]);
		
#ifdef DEBUG
		zprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu);
#endif

	    }

   	    jcol += panel_size;	/* Move to the next panel */

	} /* else */

    } /* for */

    *info = iinfo;
    
    if ( m > n ) {
	k = 0;
        for (i = 0; i < m; ++i) 
            if ( perm_r[i] == EMPTY ) {
    		perm_r[i] = n + k;
		++k;
	    }
    }

    countnz(min_mn, xprune, &nnzL, &nnzU, &Glu);
    fixupL(min_mn, perm_r, &Glu);

    zLUWorkFree(iwork, zwork, &Glu); /* Free work space and compress storage */

    if ( lsame_(refact, "Y") ) {
        /* L and U structures may have changed due to possibly different
	   pivoting, although the storage is available.
	   There could also be memory expansions, so the array locations
           may have changed, */
        ((SCformat *)L->Store)->nnz = nnzL;
	((SCformat *)L->Store)->nsuper = Glu.supno[n];
	((SCformat *)L->Store)->nzval = Glu.lusup;
	((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
	((SCformat *)L->Store)->rowind = Glu.lsub;
	((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
	((NCformat *)U->Store)->nnz = nnzU;
	((NCformat *)U->Store)->nzval = Glu.ucol;
	((NCformat *)U->Store)->rowind = Glu.usub;
	((NCformat *)U->Store)->colptr = Glu.xusub;
    } else {
        zCreate_SuperNode_Matrix(L, A->nrow, A->ncol, nnzL, Glu.lusup, 
	                         Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
			         Glu.xsup, SLU_SC, SLU_Z, SLU_TRLU);
    	zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, 
			       Glu.usub, Glu.xusub, SLU_NC, SLU_Z, SLU_TRU);
    }
    
    ops[FACT] += ops[TRSV] + ops[GEMV];	
    
    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
    SUPERLU_FREE (iperm_c);
    SUPERLU_FREE (relax_end);

}
Ejemplo n.º 10
0
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *
 * pzgssvx_ABglobal solves a system of linear equations A*X=B,
 * by using Gaussian elimination with "static pivoting" to
 * compute the LU factorization of A.
 *
 * Static pivoting is a technique that combines the numerical stability
 * of partial pivoting with the scalability of Cholesky (no pivoting),
 * to run accurately and efficiently on large numbers of processors.
 *
 * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
 * description of the parallel algorithms.
 *
 * Here are the options for using this code:
 *
 *   1. Independent of all the other options specified below, the
 *      user must supply
 *
 *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
 *      -  grid, a structure describing the 2D processor mesh
 *      -  options->IterRefine, which determines whether or not to
 *            improve the accuracy of the computed solution using 
 *            iterative refinement
 *
 *      On output, B is overwritten with the solution X.
 *
 *   2. Depending on options->Fact, the user has several options
 *      for solving A*X=B. The standard option is for factoring
 *      A "from scratch". (The other options, described below,
 *      are used when A is sufficiently similar to a previously 
 *      solved problem to save time by reusing part or all of 
 *      the previous factorization.)
 *
 *      -  options->Fact = DOFACT: A is factored "from scratch"
 *
 *      In this case the user must also supply
 *
 *      -  A, the input matrix
 *
 *      as well as the following options, which are described in more 
 *      detail below:
 *
 *      -  options->Equil,   to specify how to scale the rows and columns
 *                           of A to "equilibrate" it (to try to reduce its
 *                           condition number and so improve the
 *                           accuracy of the computed solution)
 *
 *      -  options->RowPerm, to specify how to permute the rows of A
 *                           (typically to control numerical stability)
 *
 *      -  options->ColPerm, to specify how to permute the columns of A
 *                           (typically to control fill-in and enhance
 *                           parallelism during factorization)
 *
 *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
 *                           pivots encountered during factorization
 *                           (to control numerical stability)
 *
 *      The outputs returned include
 *         
 *      -  ScalePermstruct,  modified to describe how the input matrix A
 *                           was equilibrated and permuted:
 *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
 *                                        columns of A were scaled
 *         -  ScalePermstruct->R, array of row scale factors
 *         -  ScalePermstruct->C, array of column scale factors
 *         -  ScalePermstruct->perm_r, row permutation vector
 *         -  ScalePermstruct->perm_c, column permutation vector
 *
 *            (part of ScalePermstruct may also need to be supplied on input,
 *             depending on options->RowPerm and options->ColPerm as described 
 *             later).
 *
 *      -  A, the input matrix A overwritten by the scaled and permuted matrix
 *                Pc*Pr*diag(R)*A*diag(C)
 *             where 
 *                Pr and Pc are row and columns permutation matrices determined
 *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c, 
 *                  respectively, and 
 *                diag(R) and diag(C) are diagonal scaling matrices determined
 *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
 *                  ScalePermstruct->C
 *
 *      -  LUstruct, which contains the L and U factorization of A1 where
 *
 *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
 *
 *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
 *               in A on output.)
 *
 *   3. The second value of options->Fact assumes that a matrix with the same
 *      sparsity pattern as A has already been factored:
 *     
 *      -  options->Fact = SamePattern: A is factored, assuming that it has
 *            the same nonzero pattern as a previously factored matrix. In this
 *            case the algorithm saves time by reusing the previously computed
 *            column permutation vector stored in ScalePermstruct->perm_c
 *            and the "elimination tree" of A stored in LUstruct->etree.
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *      -  options->Equil
 *      -  options->RowPerm
 *      -  options->ReplaceTinyPivot
 *
 *      but not options->ColPerm, whose value is ignored. This is because the
 *      previous column permutation from ScalePermstruct->perm_c is used as
 *      input. The user must also supply 
 *
 *      -  A, the input matrix
 *      -  ScalePermstruct->perm_c, the column permutation
 *      -  LUstruct->etree, the elimination tree
 *
 *      The outputs returned include
 *         
 *      -  A, the input matrix A overwritten by the scaled and permuted matrix
 *            as described above
 *      -  ScalePermstruct,  modified to describe how the input matrix A was
 *                           equilibrated and row permuted
 *      -  LUstruct, modified to contain the new L and U factors
 *
 *   4. The third value of options->Fact assumes that a matrix B with the same
 *      sparsity pattern as A has already been factored, and where the
 *      row permutation of B can be reused for A. This is useful when A and B
 *      have similar numerical values, so that the same row permutation
 *      will make both factorizations numerically stable. This lets us reuse
 *      all of the previously computed structure of L and U.
 *
 *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
 *            assuming not only the same nonzero pattern as the previously
 *            factored matrix B, but reusing B's row permutation.
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *      -  options->Equil
 *      -  options->ReplaceTinyPivot
 *
 *      but not options->RowPerm or options->ColPerm, whose values are ignored.
 *      This is because the permutations from ScalePermstruct->perm_r and
 *      ScalePermstruct->perm_c are used as input.
 *
 *      The user must also supply 
 *
 *      -  A, the input matrix
 *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
 *                                     column scaled
 *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
 *      -  ScalePermstruct->C, the columns scalings of the previous matrix, 
 *                             if any
 *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
 *      -  ScalePermstruct->perm_c, the column permutation of the previous 
 *                                  matrix
 *      -  all of LUstruct, the previously computed information about L and U
 *                (the actual numerical values of L and U stored in
 *                 LUstruct->Llu are ignored)
 *
 *      The outputs returned include
 *         
 *      -  A, the input matrix A overwritten by the scaled and permuted matrix
 *            as described above
 *      -  ScalePermstruct,  modified to describe how the input matrix A was
 *                           equilibrated 
 *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
 *      -  LUstruct, modified to contain the new L and U factors
 *
 *   5. The fourth and last value of options->Fact assumes that A is
 *      identical to a matrix that has already been factored on a previous 
 *      call, and reuses its entire LU factorization
 *
 *      -  options->Fact = Factored: A is identical to a previously
 *            factorized matrix, so the entire previous factorization
 *            can be reused.
 *
 *      In this case all the other options mentioned above are ignored
 *      (options->Equil, options->RowPerm, options->ColPerm, 
 *       options->ReplaceTinyPivot)
 *
 *      The user must also supply 
 *
 *      -  A, the unfactored matrix, only in the case that iterative refinment
 *            is to be done (specifically A must be the output A from 
 *            the previous call, so that it has been scaled and permuted)
 *      -  all of ScalePermstruct
 *      -  all of LUstruct, including the actual numerical values of L and U
 *
 *      all of which are unmodified on output.
 *         
 * Arguments
 * =========
 *
 * options (input) superlu_options_t*
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *         The following fields should be defined for this structure:
 *         
 *         o Fact (fact_t)
 *           Specifies whether or not the factored form of the matrix
 *           A is supplied on entry, and if not, how the matrix A should
 *           be factorized based on the previous history.
 *
 *           = DOFACT: The matrix A will be factorized from scratch.
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *
 *           = SamePattern: the matrix A will be factorized assuming
 *             that a factorization of a matrix with the same sparsity
 *             pattern was performed prior to this one. Therefore, this
 *             factorization will reuse column permutation vector 
 *             ScalePermstruct->perm_c and the elimination tree
 *             LUstruct->etree
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ReplaceTinyPivot
 *                          ScalePermstruct->perm_c
 *                          LUstruct->etree
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
 *                          rest of LUstruct (GLU_persist, Llu)
 *
 *           = SamePattern_SameRowPerm: the matrix A will be factorized
 *             assuming that a factorization of a matrix with the same
 *             sparsity	pattern and similar numerical values was performed
 *             prior to this one. Therefore, this factorization will reuse
 *             both row and column scaling factors R and C, and the
 *             both row and column permutation vectors perm_r and perm_c,
 *             distributed data structure set up from the previous symbolic
 *             factorization.
 *                 Inputs:  A
 *                          options->Equil, ReplaceTinyPivot
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          modified LUstruct->Llu
 *           = FACTORED: the matrix A is already factored.
 *                 Inputs:  all of ScalePermstruct
 *                          all of LUstruct
 *
 *         o Equil (yes_no_t)
 *           Specifies whether to equilibrate the system.
 *           = NO:  no equilibration.
 *           = YES: scaling factors are computed to equilibrate the system:
 *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
 *                  Whether or not the system will be equilibrated depends
 *                  on the scaling of the matrix A, but if equilibration is
 *                  used, A is overwritten by diag(R)*A*diag(C) and B by
 *                  diag(R)*B.
 *
 *         o RowPerm (rowperm_t)
 *           Specifies how to permute rows of the matrix A.
 *           = NATURAL:   use the natural ordering.
 *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
 *                        the original matrix to make the diagonal large
 *                        relative to the off-diagonal.
 *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
 *                        input by the user.
 *           
 *         o ColPerm (colperm_t)
 *           Specifies what type of column permutation to use to reduce fill.
 *           = NATURAL:       natural ordering.
 *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
 *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
 *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
 *         
 *         o ReplaceTinyPivot (yes_no_t)
 *           = NO:  do not modify pivots
 *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
 *                  LU factorization.
 *
 *         o IterRefine (IterRefine_t)
 *           Specifies how to perform iterative refinement.
 *           = NO:     no iterative refinement.
 *           = SLU_DOUBLE: accumulate residual in double precision.
 *           = SLU_EXTRA:  accumulate residual in extra precision.
 *
 *         NOTE: all options must be indentical on all processes when
 *               calling this routine.
 *
 * A (input/output) SuperMatrix*
 *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
 *         The number of linear equations is A->nrow. The type of A must be:
 *         Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. That is, A is stored in
 *         compressed column format (also known as Harwell-Boeing format).
 *         See supermatrix.h for the definition of 'SuperMatrix'.
 *         This routine only handles square A, however, the LU factorization
 *         routine pzgstrf can factorize rectangular matrices.
 *         On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C),
 *         depending on ScalePermstruct->DiagScale, options->RowPerm and
 *         options->colpem:
 *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
 *                diag(R)*A*diag(C).
 *             if options->RowPerm != NATURAL, A is further overwritten by
 *                Pr*diag(R)*A*diag(C).
 *             if options->ColPerm != NATURAL, A is further overwritten by
 *                Pc*Pr*diag(R)*A*diag(C).
 *         If all the above condition are true, the LU decomposition is
 *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
 *
 *         NOTE: Currently, A must reside in all processes when calling
 *               this routine.
 *
 * ScalePermstruct (input/output) ScalePermstruct_t*
 *         The data structure to store the scaling and permutation vectors
 *         describing the transformations performed to the matrix A.
 *         It contains the following fields:
 *
 *         o DiagScale (DiagScale_t)
 *           Specifies the form of equilibration that was done.
 *           = NOEQUIL: no equilibration.
 *           = ROW:     row equilibration, i.e., A was premultiplied by
 *                      diag(R).
 *           = COL:     Column equilibration, i.e., A was postmultiplied
 *                      by diag(C).
 *           = BOTH:    both row and column equilibration, i.e., A was 
 *                      replaced by diag(R)*A*diag(C).
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
 *           DiagScale is an input argument; otherwise it is an output
 *           argument.
 *
 *         o perm_r (int*)
 *           Row permutation vector, which defines the permutation matrix Pr;
 *           perm_r[i] = j means row i of A is in position j in Pr*A.
 *           If options->RowPerm = MY_PERMR, or
 *           options->Fact = SamePattern_SameRowPerm, perm_r is an
 *           input argument; otherwise it is an output argument.
 *
 *         o perm_c (int*)
 *           Column permutation vector, which defines the 
 *           permutation matrix Pc; perm_c[i] = j means column i of A is 
 *           in position j in A*Pc.
 *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
 *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
 *           input argument; otherwise, it is an output argument.
 *           On exit, perm_c may be overwritten by the product of the input
 *           perm_c and a permutation that postorders the elimination tree
 *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
 *           is already in postorder.
 *
 *         o R (double*) dimension (A->nrow)
 *           The row scale factors for A.
 *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
 *                          diag(R).
 *           If DiagScale = NOEQUIL or COL, R is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
 *           an input argument; otherwise, R is an output argument.
 *
 *         o C (double*) dimension (A->ncol)
 *           The column scale factors for A.
 *           If DiagScale = COL or BOTH, A is multiplied on the right by 
 *                          diag(C).
 *           If DiagScale = NOEQUIL or ROW, C is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
 *           an input argument; otherwise, C is an output argument.
 *         
 * B       (input/output) doublecomplex*
 *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
 *         On exit, the solution matrix if info = 0;
 *
 *         NOTE: Currently, B must reside in all processes when calling
 *               this routine.
 *
 * ldb     (input) int (global)
 *         The leading dimension of matrix B.
 *
 * nrhs    (input) int (global)
 *         The number of right-hand sides.
 *         If nrhs = 0, only LU decomposition is performed, the forward
 *         and back substitutions are skipped.
 *
 * grid    (input) gridinfo_t*
 *         The 2D process mesh. It contains the MPI communicator, the number
 *         of process rows (NPROW), the number of process columns (NPCOL),
 *         and my process rank. It is an input argument to all the
 *         parallel routines.
 *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *         See superlu_zdefs.h for the definition of 'gridinfo_t'.
 *
 * LUstruct (input/output) LUstruct_t*
 *         The data structures to store the distributed L and U factors.
 *         It contains the following fields:
 *
 *         o etree (int*) dimension (A->ncol)
 *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol.
 *           It is computed in sp_colorder() during the first factorization,
 *           and is reused in the subsequent factorizations of the matrices
 *           with the same nonzero pattern.
 *           On exit of sp_colorder(), the columns of A are permuted so that
 *           the etree is in a certain postorder. This postorder is reflected
 *           in ScalePermstruct->perm_c.
 *           NOTE:
 *           Etree is a vector of parent pointers for a forest whose vertices
 *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *
 *         o Glu_persist (Glu_persist_t*)
 *           Global data structure (xsup, supno) replicated on all processes,
 *           describing the supernode partition in the factored matrices
 *           L and U:
 *	       xsup[s] is the leading column of the s-th supernode,
 *             supno[i] is the supernode number to which column i belongs.
 *
 *         o Llu (LocalLU_t*)
 *           The distributed data structures to store L and U factors.
 *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
 *
 * berr    (output) double*, dimension (nrhs)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics on runtime and floating-point operation count.
 *        See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 *         = 0: successful exit
 *         > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                so the solution could not be computed.
 *             > A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol.
 *
 *
 * See superlu_zdefs.h for the definitions of various data types.
 * </pre>
 */
void
pzgssvx_ABglobal(superlu_options_t *options, SuperMatrix *A, 
		 ScalePermstruct_t *ScalePermstruct,
		 doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid,
		 LUstruct_t *LUstruct, double *berr,
		 SuperLUStat_t *stat, int *info)
{
    SuperMatrix AC;
    NCformat *Astore;
    NCPformat *ACstore;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    Glu_freeable_t *Glu_freeable;
            /* The nonzero structures of L and U factors, which are
	       replicated on all processrs.
	           (lsub, xlsub) contains the compressed subscript of
		                 supernodes in L.
          	   (usub, xusub) contains the compressed subscript of
		                 nonzero segments in U.
	      If options->Fact != SamePattern_SameRowPerm, they are 
	      computed by SYMBFACT routine, and then used by DDISTRIBUTE
	      routine. They will be freed after DDISTRIBUTE routine.
	      If options->Fact == SamePattern_SameRowPerm, these
	      structures are not used.                                  */
    fact_t   Fact;
    doublecomplex   *a;
    int_t    *perm_r; /* row permutations from partial pivoting */
    int_t    *perm_c; /* column permutation vector */
    int_t    *etree;  /* elimination tree */
    int_t    *colptr, *rowind;
    int_t    colequ, Equil, factored, job, notran, rowequ;
    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
    int      iam;
    int      ldx;  /* LDA for matrix X (global). */
    char     equed[1], norm[1];
    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
    doublecomplex   *X, *b_col, *b_work, *x_col;
    double   t;
    static mem_usage_t num_mem_usage, symb_mem_usage;
#if ( PRNTlevel>= 2 )
    double   dmin, dsum, dprod;
#endif

    /* Test input parameters. */
    *info = 0;
    Fact = options->Fact;
    if ( Fact < 0 || Fact > FACTORED )
	*info = -1;
    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
	*info = -1;
    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
	*info = -1;
    else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA )
	*info = -1;
    else if ( options->IterRefine == SLU_EXTRA ) {
	*info = -1;
	fprintf(stderr, "Extra precise iterative refinement yet to support.");
    } else if ( A->nrow != A->ncol || A->nrow < 0 ||
         A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
	*info = -2;
    else if ( ldb < A->nrow )
	*info = -5;
    else if ( nrhs < 0 )
	*info = -6;
    if ( *info ) {
	i = -(*info);
	pxerbla("pzgssvx_ABglobal", grid, -*info);
	return;
    }

    /* Initialization */
    factored = (Fact == FACTORED);
    Equil = (!factored && options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    iam = grid->iam;
    job = 5;
    m = A->nrow;
    n = A->ncol;
    Astore = A->Store;
    nnz = Astore->nnz;
    a = Astore->nzval;
    colptr = Astore->colptr;
    rowind = Astore->rowind;
    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
	rowequ = (ScalePermstruct->DiagScale == ROW) ||
	         (ScalePermstruct->DiagScale == BOTH);
	colequ = (ScalePermstruct->DiagScale == COL) ||
	         (ScalePermstruct->DiagScale == BOTH);
    } else rowequ = colequ = FALSE;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pzgssvx_ABglobal()");
#endif

    perm_r = ScalePermstruct->perm_r;
    perm_c = ScalePermstruct->perm_c;
    etree = LUstruct->etree;
    R = ScalePermstruct->R;
    C = ScalePermstruct->C;
    if ( Equil && Fact != SamePattern_SameRowPerm ) {
	/* Allocate storage if not done so before. */
	switch ( ScalePermstruct->DiagScale ) {
	    case NOEQUIL:
		if ( !(R = (double *) doubleMalloc_dist(m)) )
		    ABORT("Malloc fails for R[].");
	        if ( !(C = (double *) doubleMalloc_dist(n)) )
		    ABORT("Malloc fails for C[].");
		ScalePermstruct->R = R;
		ScalePermstruct->C = C;
		break;
	    case ROW: 
	        if ( !(C = (double *) doubleMalloc_dist(n)) )
		    ABORT("Malloc fails for C[].");
		ScalePermstruct->C = C;
		break;
	    case COL: 
		if ( !(R = (double *) doubleMalloc_dist(m)) )
		    ABORT("Malloc fails for R[].");
		ScalePermstruct->R = R;
		break;
	}
    }

    /* ------------------------------------------------------------
       Diagonal scaling to equilibrate the matrix.
       ------------------------------------------------------------*/
    if ( Equil ) {
#if ( DEBUGlevel>=1 )
	CHECK_MALLOC(iam, "Enter equil");
#endif
	t = SuperLU_timer_();

	if ( Fact == SamePattern_SameRowPerm ) {
	    /* Reuse R and C. */
	    switch ( ScalePermstruct->DiagScale ) {
	      case NOEQUIL:
		break;
	      case ROW:
		for (j = 0; j < n; ++j) {
		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
			irow = rowind[i];
			zd_mult(&a[i], &a[i], R[i]); /* Scale rows. */
		    }
		}
		break;
	      case COL:
		for (j = 0; j < n; ++j)
		    for (i = colptr[j]; i < colptr[j+1]; ++i)
			zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */
		break;
	      case BOTH: 
		for (j = 0; j < n; ++j) {
		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
			irow = rowind[i];
			zd_mult(&a[i], &a[i], R[irow]); /* Scale rows. */
			zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */
		    }
		}
	        break;
	    }
	} else {
	    if ( !iam ) {
		/* Compute row and column scalings to equilibrate matrix A. */
		zgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo);
	    
		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
		if ( iinfo == 0 ) {
		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
		} else {
		    if ( iinfo > 0 ) {
			if ( iinfo <= m )
			    fprintf(stderr, "The %d-th row of A is exactly zero\n", 
				    iinfo);
			else fprintf(stderr, "The %d-th column of A is exactly zero\n", 
				     iinfo-n);
			exit(-1);
		    }
		}
	    } else {
		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
		if ( iinfo == 0 ) {
		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
		} else {
		    ABORT("ZGSEQU failed\n");
		}
	    }
	
	    /* Equilibrate matrix A. */
	    zlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed);
	    if ( lsame_(equed, "R") ) {
		ScalePermstruct->DiagScale = rowequ = ROW;
	    } else if ( lsame_(equed, "C") ) {
		ScalePermstruct->DiagScale = colequ = COL;
	    } else if ( lsame_(equed, "B") ) {
		ScalePermstruct->DiagScale = BOTH;
		rowequ = ROW;
		colequ = COL;
	    } else ScalePermstruct->DiagScale = NOEQUIL;

#if ( PRNTlevel>=1 )
	    if ( !iam ) {
		printf(".. equilibrated? *equed = %c\n", *equed);
		/*fflush(stdout);*/
	    }
#endif
	} /* if Fact ... */

	stat->utime[EQUIL] = SuperLU_timer_() - t;
#if ( DEBUGlevel>=1 )
	CHECK_MALLOC(iam, "Exit equil");
#endif
    } /* end if Equil ... */
    
    /* ------------------------------------------------------------
       Permute rows of A. 
       ------------------------------------------------------------*/
    if ( options->RowPerm != NO ) {
	t = SuperLU_timer_();

	if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */
	    || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */
	    for (j = 0; j < n; ++j) {
		for (i = colptr[j]; i < colptr[j+1]; ++i) {
		    irow = rowind[i];
		    rowind[i] = perm_r[irow];
		}
	    }
	} else if ( !factored ) {
	    if ( job == 5 ) {
		/* Allocate storage for scaling factors. */
		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) 
		    ABORT("SUPERLU_MALLOC fails for R1[]");
		if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
		    ABORT("SUPERLU_MALLOC fails for C1[]");
	    }

	    if ( !iam ) {
		/* Process 0 finds a row permutation for large diagonal. */
		zldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1);
		
		MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
		if ( job == 5 && Equil ) {
		    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
		}
	    } else {
		MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
		if ( job == 5 && Equil ) {
		    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
		}
	    }

#if ( PRNTlevel>=2 )
	    dmin = dlamch_("Overflow");
	    dsum = 0.0;
	    dprod = 1.0;
#endif
	    if ( job == 5 ) {
		if ( Equil ) {
		    for (i = 0; i < n; ++i) {
			R1[i] = exp(R1[i]);
			C1[i] = exp(C1[i]);
		    }
		    for (j = 0; j < n; ++j) {
			for (i = colptr[j]; i < colptr[j+1]; ++i) {
			    irow = rowind[i];
			    zd_mult(&a[i], &a[i], R1[irow]); /* Scale rows. */
			    zd_mult(&a[i], &a[i], C1[j]); /* Scale columns. */
			    rowind[i] = perm_r[irow];
#if ( PRNTlevel>=2 )
			    if ( rowind[i] == j ) /* New diagonal */
				dprod *= slud_z_abs1(&a[i]);
#endif
			}
		    }

		    /* Multiply together the scaling factors. */
		    if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
		    else for (i = 0; i < m; ++i) R[i] = R1[i];
		    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
		    else for (i = 0; i < n; ++i) C[i] = C1[i];
		    
		    ScalePermstruct->DiagScale = BOTH;
		    rowequ = colequ = 1;
		} else { /* No equilibration. */
		    for (j = 0; j < n; ++j) {
			for (i = colptr[j]; i < colptr[j+1]; ++i) {
			    irow = rowind[i];
			    rowind[i] = perm_r[irow];
			}
		    }
		}
		SUPERLU_FREE (R1);
		SUPERLU_FREE (C1);
	    } else { /* job = 2,3,4 */
		for (j = 0; j < n; ++j) {
		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
			irow = rowind[i];
			rowind[i] = perm_r[irow];
#if ( PRNTlevel>=2 )
			if ( rowind[i] == j ) { /* New diagonal */
			    if ( job == 2 || job == 3 )
				dmin = SUPERLU_MIN(dmin, slud_z_abs1(&a[i]));
			    else if ( job == 4 )
				dsum += slud_z_abs1(&a[i]);
			    else if ( job == 5 )
				dprod *= slud_z_abs1(&a[i]);
			}
#endif
		    }
		}
	    }

#if ( PRNTlevel>=2 )
	    if ( job == 2 || job == 3 ) {
		if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
	    } else if ( job == 4 ) {
		if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
	    } else if ( job == 5 ) {
		if ( !iam ) printf("\t product of diagonal %e\n", dprod);
	    }
#endif
	    
        } /* else !factored */

	t = SuperLU_timer_() - t;
	stat->utime[ROWPERM] = t;
    
    } else { /* options->RowPerm == NOROWPERM */
        for (i = 0; i < m; ++i) perm_r[i] = i;
    }

    if ( !factored || options->IterRefine ) {
	/* Compute norm(A), which will be used to adjust small diagonal. */
	if ( notran ) *(unsigned char *)norm = '1';
	else *(unsigned char *)norm = 'I';
	anorm = zlangs_dist(norm, A);
    }

    /* ------------------------------------------------------------
       Perform the LU factorization.
       ------------------------------------------------------------*/
    if ( !factored ) {
	t = SuperLU_timer_();
	/*
	 * Get column permutation vector perm_c[], according to permc_spec:
	 *   permc_spec = NATURAL:  natural ordering 
	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
	 */
	permc_spec = options->ColPerm;
	if ( permc_spec != MY_PERMC && Fact == DOFACT )
	    /* Use an ordering provided by SuperLU */
	    get_perm_c_dist(iam, permc_spec, A, perm_c);

	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
	   (a.k.a. column etree), depending on the choice of ColPerm.
	   Adjust perm_c[] to be consistent with a postorder of etree.
	   Permute columns of A to form A*Pc'. */
	sp_colorder(options, A, perm_c, etree, &AC);

	/* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */
	ACstore = AC.Store;
	for (j = 0; j < n; ++j) 
	    for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) {
		irow = ACstore->rowind[i];
		ACstore->rowind[i] = perm_c[irow];
	    }
	stat->utime[COLPERM] = SuperLU_timer_() - t;

	/* Perform a symbolic factorization on matrix A and set up the
	   nonzero data structures which are suitable for supernodal GENP. */
	if ( Fact != SamePattern_SameRowPerm ) {
#if ( PRNTlevel>=1 ) 
	    if ( !iam ) 
		printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
		       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
#endif
	    t = SuperLU_timer_();
	    if ( !(Glu_freeable = (Glu_freeable_t *)
		   SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
		ABORT("Malloc fails for Glu_freeable.");

	    iinfo = symbfact(options, iam, &AC, perm_c, etree, 
			     Glu_persist, Glu_freeable);

	    stat->utime[SYMBFAC] = SuperLU_timer_() - t;

	    if ( iinfo < 0 ) {
		QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
#if ( PRNTlevel>=1 ) 
		if ( !iam ) {
		    printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
		    printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
		    printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
		    printf("\tint %d, short %d, float %d, double %d\n", 
			   sizeof(int_t), sizeof(short), sizeof(float),
			   sizeof(double));
		    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
			   symb_mem_usage.for_lu*1e-6, 
			   symb_mem_usage.total*1e-6,
			   symb_mem_usage.expansions);
		}
#endif
	    } else {
		if ( !iam ) {
		    fprintf(stderr, "symbfact() error returns %d\n", iinfo);
		    exit(-1);
		}
	    }
	}

	/* Distribute the L and U factors onto the process grid. */
	t = SuperLU_timer_();
	dist_mem_use = zdistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid);
	stat->utime[DIST] = SuperLU_timer_() - t;

	/* Deallocate storage used in symbolic factor. */
	if ( Fact != SamePattern_SameRowPerm ) {
	    iinfo = symbfact_SubFree(Glu_freeable);
	    SUPERLU_FREE(Glu_freeable);
	}

	/* Perform numerical factorization in parallel. */
	t = SuperLU_timer_();
	pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
	stat->utime[FACT] = SuperLU_timer_() - t;

#if ( PRNTlevel>=1 )
	{
	    int_t TinyPivots;
	    float for_lu, total, max, avg, temp;
	    zQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    MPI_Reduce( &num_mem_usage.total, &total,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    temp = SUPERLU_MAX(symb_mem_usage.total,
			       symb_mem_usage.for_lu +
			       (float)dist_mem_use + num_mem_usage.for_lu);
	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
	    MPI_Reduce( &temp, &max,
		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
	    MPI_Reduce( &temp, &avg,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
			  MPI_SUM, grid->comm );
	    stat->TinyPivots = TinyPivots;
	    if ( !iam ) {
		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
		       for_lu*1e-6, total*1e-6);
		printf("\tAll space (MB):"
		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
	    }
	}
#endif
    
#if ( PRNTlevel>=2 )
	if ( !iam ) printf(".. pzgstrf INFO = %d\n", *info);
#endif

    } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
	/* Permute columns of A to form A*Pc' using the existing perm_c.
	 * NOTE: rows of A were previously permuted to Pc*A.
	 */
	sp_colorder(options, A, perm_c, NULL, &AC);
    } /* if !factored ... */
	
    /* ------------------------------------------------------------
       Compute the solution matrix X.
       ------------------------------------------------------------*/
    if ( nrhs ) {

	if ( !(b_work = doublecomplexMalloc_dist(n)) )
	    ABORT("Malloc fails for b_work[]");

	/* ------------------------------------------------------------
	   Scale the right-hand side if equilibration was performed. 
	   ------------------------------------------------------------*/
	if ( notran ) {
	    if ( rowequ ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], R[i]);
		    b_col += ldb;
		}
	    }
	} else if ( colequ ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
		for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], C[i]);
		b_col += ldb;
	    }
	}

	/* ------------------------------------------------------------
	   Permute the right-hand side to form Pr*B.
	   ------------------------------------------------------------*/
	if ( options->RowPerm != NO ) {
	    if ( notran ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i];
		    for (i = 0; i < m; ++i) b_col[i] = b_work[i];
		    b_col += ldb;
		}
	    }
	}


	/* ------------------------------------------------------------
	   Permute the right-hand side to form Pc*B.
	   ------------------------------------------------------------*/
	if ( notran ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
		for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i];
		for (i = 0; i < m; ++i) b_col[i] = b_work[i];
		b_col += ldb;
	    }
	}

	/* Save a copy of the right-hand side. */
	ldx = ldb;
	if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) )
	    ABORT("Malloc fails for X[]");
	x_col = X;  b_col = B;
	for (j = 0; j < nrhs; ++j) {
	    for (i = 0; i < ldb; ++i) x_col[i] = b_col[i];
	    x_col += ldx;  b_col += ldb;
	}

	/* ------------------------------------------------------------
	   Solve the linear system.
	   ------------------------------------------------------------*/
	pzgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info);

	/* ------------------------------------------------------------
	   Use iterative refinement to improve the computed solution and
	   compute error bounds and backward error estimates for it.
	   ------------------------------------------------------------*/
	if ( options->IterRefine ) {
	    /* Improve the solution by iterative refinement. */
	    t = SuperLU_timer_();
	    pzgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb,
			      X, ldx, nrhs, berr, stat, info);
	    stat->utime[REFINE] = SuperLU_timer_() - t;
	}

	/* Permute the solution matrix X <= Pc'*X. */
	for (j = 0; j < nrhs; j++) {
	    b_col = &B[j*ldb];
	    x_col = &X[j*ldx];
	    for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]];
	}
	
	/* Transform the solution matrix X to a solution of the original system
	   before the equilibration. */
	if ( notran ) {
	    if ( colequ ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], C[i]);
		    b_col += ldb;
		}
	    }
	} else if ( rowequ ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
		for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], R[i]);
		b_col += ldb;
	    }
	}

	SUPERLU_FREE(b_work);
	SUPERLU_FREE(X);

    } /* end if nrhs != 0 */

#if ( PRNTlevel>=1 )
    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
#endif

    /* Deallocate R and/or C if it is not used. */
    if ( Equil && Fact != SamePattern_SameRowPerm ) {
	switch ( ScalePermstruct->DiagScale ) {
	    case NOEQUIL:
	        SUPERLU_FREE(R);
		SUPERLU_FREE(C);
		break;
	    case ROW: 
		SUPERLU_FREE(C);
		break;
	    case COL: 
		SUPERLU_FREE(R);
		break;
	}
    }
    if ( !factored || (factored && options->IterRefine) )
	Destroy_CompCol_Permuted_dist(&AC);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pzgssvx_ABglobal()");
#endif
}
Ejemplo n.º 11
0
void
sgstrf (superlu_options_t *options, SuperMatrix *A,
        int relax, int panel_size, int *etree, void *work, int lwork,
        int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
    	GlobalLU_t *Glu, /* persistent to facilitate multiple factorizations */
        SuperLUStat_t *stat, int *info)
{
    /* Local working arrays */
    NCPformat *Astore;
    int       *iperm_r = NULL; /* inverse of perm_r; used when 
                                  options->Fact == SamePattern_SameRowPerm */
    int       *iperm_c; /* inverse of perm_c */
    int       *iwork;
    float    *swork;
    int	      *segrep, *repfnz, *parent, *xplore;
    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
    int	      *xprune;
    int	      *marker;
    float    *dense, *tempv;
    int       *relax_end;
    float    *a;
    int       *asub;
    int       *xa_begin, *xa_end;
    int       *xsup, *supno;
    int       *xlsub, *xlusup, *xusub;
    int       nzlumax;
    float fill_ratio = sp_ienv(6);  /* estimated fill ratio */

    /* Local scalars */
    fact_t    fact = options->Fact;
    double    diag_pivot_thresh = options->DiagPivotThresh;
    int       pivrow;   /* pivotal row number in the original matrix A */
    int       nseg1;	/* no of segments in U-column above panel row jcol */
    int       nseg;	/* no of segments in each U-column */
    register int jcol;	
    register int kcol;	/* end column of a relaxed snode */
    register int icol;
    register int i, k, jj, new_next, iinfo;
    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
    int       w_def;	/* upper bound on panel width */
    int       usepr, iperm_r_allocated = 0;
    int       nnzL, nnzU;
    int       *panel_histo = stat->panel_histo;
    flops_t   *ops = stat->ops;

    iinfo    = 0;
    m        = A->nrow;
    n        = A->ncol;
    min_mn   = SUPERLU_MIN(m, n);
    Astore   = A->Store;
    a        = Astore->nzval;
    asub     = Astore->rowind;
    xa_begin = Astore->colbeg;
    xa_end   = Astore->colend;

    /* Allocate storage common to the factor routines */
    *info = sLUMemInit(fact, work, lwork, m, n, Astore->nnz,
                       panel_size, fill_ratio, L, U, Glu, &iwork, &swork);
    if ( *info ) return;
    
    xsup    = Glu->xsup;
    supno   = Glu->supno;
    xlsub   = Glu->xlsub;
    xlusup  = Glu->xlusup;
    xusub   = Glu->xusub;
    
    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
	     &repfnz, &panel_lsub, &xprune, &marker);
    sSetRWork(m, panel_size, swork, &dense, &tempv);
    
    usepr = (fact == SamePattern_SameRowPerm);
    if ( usepr ) {
	/* Compute the inverse of perm_r */
	iperm_r = (int *) intMalloc(m);
	for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
	iperm_r_allocated = 1;
    }
    iperm_c = (int *) intMalloc(n);
    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;

    /* Identify relaxed snodes */
    relax_end = (int *) intMalloc(n);
    if ( options->SymmetricMode == YES ) {
        heap_relax_snode(n, etree, relax, marker, relax_end); 
    } else {
        relax_snode(n, etree, relax, marker, relax_end); 
    }
    
    ifill (perm_r, m, EMPTY);
    ifill (marker, m * NO_MARKER, EMPTY);
    supno[0] = -1;
    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
    w_def    = panel_size;

    /* 
     * Work on one "panel" at a time. A panel is one of the following: 
     *	   (a) a relaxed supernode at the bottom of the etree, or
     *	   (b) panel_size contiguous columns, defined by the user
     */
    for (jcol = 0; jcol < min_mn; ) {

	if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
   	    kcol = relax_end[jcol];	  /* end of the relaxed snode */
	    panel_histo[kcol-jcol+1]++;

	    /* --------------------------------------
	     * Factorize the relaxed supernode(jcol:kcol) 
	     * -------------------------------------- */
	    /* Determine the union of the row structure of the snode */
	    if ( (*info = ssnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
				    xprune, marker, Glu)) != 0 )
		return;

            nextu    = xusub[jcol];
	    nextlu   = xlusup[jcol];
	    jsupno   = supno[jcol];
	    fsupc    = xsup[jsupno];
	    new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
	    nzlumax = Glu->nzlumax;
	    while ( new_next > nzlumax ) {
		if ( (*info = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) )
		    return;
	    }
    
	    for (icol = jcol; icol<= kcol; icol++) {
		xusub[icol+1] = nextu;
		
    		/* Scatter into SPA dense[*] */
    		for (k = xa_begin[icol]; k < xa_end[icol]; k++)
        	    dense[asub[k]] = a[k];

	       	/* Numeric update within the snode */
	        ssnode_bmod(icol, jsupno, fsupc, dense, tempv, Glu, stat);

		if ( (*info = spivotL(icol, diag_pivot_thresh, &usepr, perm_r,
				      iperm_r, iperm_c, &pivrow, Glu, stat)) )
		    if ( iinfo == 0 ) iinfo = *info;
		
#ifdef DEBUG
		sprint_lu_col("[1]: ", icol, pivrow, xprune, Glu);
#endif

	    }

	    jcol = icol;

	} else { /* Work on one panel of panel_size columns */
	    
	    /* Adjust panel_size so that a panel won't overlap with the next 
	     * relaxed snode.
	     */
	    panel_size = w_def;
	    for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) 
		if ( relax_end[k] != EMPTY ) {
		    panel_size = k - jcol;
		    break;
		}
	    if ( k == min_mn ) panel_size = min_mn - jcol;
	    panel_histo[panel_size]++;

	    /* symbolic factor on a panel of columns */
	    spanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
		      dense, panel_lsub, segrep, repfnz, xprune,
		      marker, parent, xplore, Glu);
	    
	    /* numeric sup-panel updates in topological order */
	    spanel_bmod(m, panel_size, jcol, nseg1, dense,
		        tempv, segrep, repfnz, Glu, stat);
	    
	    /* Sparse LU within the panel, and below panel diagonal */
    	    for ( jj = jcol; jj < jcol + panel_size; jj++) {
 		k = (jj - jcol) * m; /* column index for w-wide arrays */

		nseg = nseg1;	/* Begin after all the panel segments */

	    	if ((*info = scolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
					segrep, &repfnz[k], xprune, marker,
					parent, xplore, Glu)) != 0) return;

	      	/* Numeric updates */
	    	if ((*info = scolumn_bmod(jj, (nseg - nseg1), &dense[k],
					 tempv, &segrep[nseg1], &repfnz[k],
					 jcol, Glu, stat)) != 0) return;
		
	        /* Copy the U-segments to ucol[*] */
		if ((*info = scopy_to_ucol(jj, nseg, segrep, &repfnz[k],
					  perm_r, &dense[k], Glu)) != 0)
		    return;

	    	if ( (*info = spivotL(jj, diag_pivot_thresh, &usepr, perm_r,
				      iperm_r, iperm_c, &pivrow, Glu, stat)) )
		    if ( iinfo == 0 ) iinfo = *info;

		/* Prune columns (0:jj-1) using column jj */
	    	spruneL(jj, perm_r, pivrow, nseg, segrep,
                        &repfnz[k], xprune, Glu);

		/* Reset repfnz[] for this column */
	    	resetrep_col (nseg, segrep, &repfnz[k]);
		
#ifdef DEBUG
		sprint_lu_col("[2]: ", jj, pivrow, xprune, Glu);
#endif

	    }

   	    jcol += panel_size;	/* Move to the next panel */

	} /* else */

    } /* for */

    *info = iinfo;
    
    if ( m > n ) {
	k = 0;
        for (i = 0; i < m; ++i) 
            if ( perm_r[i] == EMPTY ) {
    		perm_r[i] = n + k;
		++k;
	    }
    }

    countnz(min_mn, xprune, &nnzL, &nnzU, Glu);
    fixupL(min_mn, perm_r, Glu);

    sLUWorkFree(iwork, swork, Glu); /* Free work space and compress storage */

    if ( fact == SamePattern_SameRowPerm ) {
        /* L and U structures may have changed due to possibly different
	   pivoting, even though the storage is available.
	   There could also be memory expansions, so the array locations
           may have changed, */
        ((SCformat *)L->Store)->nnz = nnzL;
	((SCformat *)L->Store)->nsuper = Glu->supno[n];
	((SCformat *)L->Store)->nzval = Glu->lusup;
	((SCformat *)L->Store)->nzval_colptr = Glu->xlusup;
	((SCformat *)L->Store)->rowind = Glu->lsub;
	((SCformat *)L->Store)->rowind_colptr = Glu->xlsub;
	((NCformat *)U->Store)->nnz = nnzU;
	((NCformat *)U->Store)->nzval = Glu->ucol;
	((NCformat *)U->Store)->rowind = Glu->usub;
	((NCformat *)U->Store)->colptr = Glu->xusub;
    } else {
        sCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu->lusup, 
	                         Glu->xlusup, Glu->lsub, Glu->xlsub, Glu->supno,
			         Glu->xsup, SLU_SC, SLU_S, SLU_TRLU);
    	sCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu->ucol, 
			       Glu->usub, Glu->xusub, SLU_NC, SLU_S, SLU_TRU);
    }
    
    ops[FACT] += ops[TRSV] + ops[GEMV];	
    stat->expansions = --(Glu->num_expansions);
    
    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
    SUPERLU_FREE (iperm_c);
    SUPERLU_FREE (relax_end);

}
Ejemplo n.º 12
0
void
heap_relax_snode (
    const     int n,
    int       *et,           /* column elimination tree */
    const int relax_columns, /* max no of columns allowed in a
					 relaxed snode */
    int       *descendants,  /* no of descendants of each node
					 in the etree */
    int       *relax_end     /* last column in a supernode */
)
{
    /*
     * Purpose
     * =======
     *    relax_snode() - Identify the initial relaxed supernodes, assuming that
     *    the matrix has been reordered according to the postorder of the etree.
     *
     */
    register int i, j, k, l, parent;
    register int snode_start;	/* beginning of a snode */
    int *et_save, *post, *inv_post, *iwork;
    int nsuper_et = 0, nsuper_et_post = 0;

    /* The etree may not be postordered, but is heap ordered. */

    iwork = (int*) intMalloc(3*n+2);
    if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]");
    inv_post = iwork + n+1;
    et_save = inv_post + n+1;

    /* Post order etree */
    post = (int *) TreePostorder(n, et);
    for (i = 0; i < n+1; ++i) inv_post[post[i]] = i;

    /* Renumber etree in postorder */
    for (i = 0; i < n; ++i) {
        iwork[post[i]] = post[et[i]];
        et_save[i] = et[i]; /* Save the original etree */
    }
    for (i = 0; i < n; ++i) et[i] = iwork[i];

    /* Compute the number of descendants of each node in the etree */
    ifill (relax_end, n, EMPTY);
    for (j = 0; j < n; j++) descendants[j] = 0;
    for (j = 0; j < n; j++) {
        parent = et[j];
        if ( parent != n )  /* not the dummy root */
            descendants[parent] += descendants[j] + 1;
    }

    /* Identify the relaxed supernodes by postorder traversal of the etree. */
    for (j = 0; j < n; ) {
        parent = et[j];
        snode_start = j;
        while ( parent != n && descendants[parent] < relax_columns ) {
            j = parent;
            parent = et[j];
        }
        /* Found a supernode in postordered etree; j is the last column. */
        ++nsuper_et_post;
        k = n;
        for (i = snode_start; i <= j; ++i)
            k = SUPERLU_MIN(k, inv_post[i]);
        l = inv_post[j];
        if ( (l - k) == (j - snode_start) ) {
            /* It's also a supernode in the original etree */
            relax_end[k] = l;		/* Last column is recorded */
            ++nsuper_et;
        } else {
            for (i = snode_start; i <= j; ++i) {
                l = inv_post[i];
                if ( descendants[i] == 0 ) relax_end[l] = l;
            }
        }
        j++;
        /* Search for a new leaf */
        while ( descendants[j] != 0 && j < n ) j++;
    }

#if ( PRNTlevel>=1 )
    printf(".. heap_snode_relax:\n"
           "\tNo of relaxed snodes in postordered etree:\t%d\n"
           "\tNo of relaxed snodes in original etree:\t%d\n",
           nsuper_et_post, nsuper_et);
#endif

    /* Recover the original etree */
    for (i = 0; i < n; ++i) et[i] = et_save[i];

    SUPERLU_FREE(post);
    SUPERLU_FREE(iwork);
}
Ejemplo n.º 13
0
void
psgssvx(int nprocs, superlumt_options_t *superlumt_options, SuperMatrix *A,
        int *perm_c, int *perm_r, equed_t *equed, float *R, float *C,
        SuperMatrix *L, SuperMatrix *U,
        SuperMatrix *B, SuperMatrix *X, float *recip_pivot_growth,
        float *rcond, float *ferr, float *berr,
        superlu_memusage_t *superlu_memusage, int *info)
{
    /*
     * -- SuperLU MT routine (version 2.0) --
     * Lawrence Berkeley National Lab, Univ. of California Berkeley,
     * and Xerox Palo Alto Research Center.
     * September 10, 2007
     *
     * Purpose
     * =======
     *
     * psgssvx() solves the system of linear equations A*X=B or A'*X=B, using
     * the LU factorization from sgstrf(). Error bounds on the solution and
     * a condition estimate are also provided. It performs the following steps:
     *
     * 1. If A is stored column-wise (A->Stype = NC):
     *
     *    1.1. If fact = EQUILIBRATE, scaling factors are computed to equilibrate
     *         the system:
     *           trans = NOTRANS: diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B
     *           trans = TRANS:  (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
     *           trans = CONJ:   (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
     *         Whether or not the system will be equilibrated depends on the
     *         scaling of the matrix A, but if equilibration is used, A is
     *         overwritten by diag(R)*A*diag(C) and B by diag(R)*B
     *         (if trans = NOTRANS) or diag(C)*B (if trans = TRANS or CONJ).
     *
     *    1.2. Permute columns of A, forming A*Pc, where Pc is a permutation matrix
     *         that usually preserves sparsity.
     *         For more details of this step, see ssp_colorder.c.
     *
     *    1.3. If fact = DOFACT or EQUILIBRATE, the LU decomposition is used to
     *         factor the matrix A (after equilibration if fact = EQUILIBRATE) as
     *         Pr*A*Pc = L*U, with Pr determined by partial pivoting.
     *
     *    1.4. Compute the reciprocal pivot growth factor.
     *
     *    1.5. If some U(i,i) = 0, so that U is exactly singular, then the routine
     *         returns with info = i. Otherwise, the factored form of A is used to
     *         estimate the condition number of the matrix A. If the reciprocal of
     *         the condition number is less than machine precision,
     *         info = A->ncol+1 is returned as a warning, but the routine still
     *         goes on to solve for X and computes error bounds as described below.
     *
     *    1.6. The system of equations is solved for X using the factored form
     *         of A.
     *
     *    1.7. Iterative refinement is applied to improve the computed solution
     *         matrix and calculate error bounds and backward error estimates
     *         for it.
     *
     *    1.8. If equilibration was used, the matrix X is premultiplied by
     *         diag(C) (if trans = NOTRANS) or diag(R) (if trans = TRANS or CONJ)
     *         so that it solves the original system before equilibration.
     *
     * 2. If A is stored row-wise (A->Stype = NR), apply the above algorithm
     *    to the tranpose of A:
     *
     *    2.1. If fact = EQUILIBRATE, scaling factors are computed to equilibrate
     *         the system:
     *           trans = NOTRANS:diag(R)*A'*diag(C)*inv(diag(C))*X = diag(R)*B
     *           trans = TRANS: (diag(R)*A'*diag(C))**T *inv(diag(R))*X = diag(C)*B
     *           trans = CONJ:  (diag(R)*A'*diag(C))**H *inv(diag(R))*X = diag(C)*B
     *         Whether or not the system will be equilibrated depends on the
     *         scaling of the matrix A, but if equilibration is used, A' is
     *         overwritten by diag(R)*A'*diag(C) and B by diag(R)*B
     *         (if trans = NOTRANS) or diag(C)*B (if trans = TRANS or CONJ).
     *
     *    2.2. Permute columns of transpose(A) (rows of A),
     *         forming transpose(A)*Pc, where Pc is a permutation matrix that
     *         usually preserves sparsity.
     *         For more details of this step, see ssp_colorder.c.
     *
     *    2.3. If fact = DOFACT or EQUILIBRATE, the LU decomposition is used to
     *         factor the matrix A (after equilibration if fact = EQUILIBRATE) as
     *         Pr*transpose(A)*Pc = L*U, with the permutation Pr determined by
     *         partial pivoting.
     *
     *    2.4. Compute the reciprocal pivot growth factor.
     *
     *    2.5. If some U(i,i) = 0, so that U is exactly singular, then the routine
     *         returns with info = i. Otherwise, the factored form of transpose(A)
     *         is used to estimate the condition number of the matrix A.
     *         If the reciprocal of the condition number is less than machine
     *         precision, info = A->nrow+1 is returned as a warning, but the
     *         routine still goes on to solve for X and computes error bounds
     *         as described below.
     *
     *    2.6. The system of equations is solved for X using the factored form
     *         of transpose(A).
     *
     *    2.7. Iterative refinement is applied to improve the computed solution
     *         matrix and calculate error bounds and backward error estimates
     *         for it.
     *
     *    2.8. If equilibration was used, the matrix X is premultiplied by
     *         diag(C) (if trans = NOTRANS) or diag(R) (if trans = TRANS or CONJ)
     *         so that it solves the original system before equilibration.
     *
     * See supermatrix.h for the definition of 'SuperMatrix' structure.
     *
     * Arguments
     * =========
     *
     * nprocs (input) int
     *         Number of processes (or threads) to be spawned and used to perform
     *         the LU factorization by psgstrf(). There is a single thread of
     *         control to call psgstrf(), and all threads spawned by psgstrf()
     *         are terminated before returning from psgstrf().
     *
     * superlumt_options (input) superlumt_options_t*
     *         The structure defines the input parameters and data structure
     *         to control how the LU factorization will be performed.
     *         The following fields should be defined for this structure:
     *
     *         o fact (fact_t)
     *           Specifies whether or not the factored form of the matrix
     *           A is supplied on entry, and if not, whether the matrix A should
     *           be equilibrated before it is factored.
     *           = FACTORED: On entry, L, U, perm_r and perm_c contain the
     *             factored form of A. If equed is not NOEQUIL, the matrix A has
     *             been equilibrated with scaling factors R and C.
     *             A, L, U, perm_r are not modified.
     *           = DOFACT: The matrix A will be factored, and the factors will be
     *             stored in L and U.
     *           = EQUILIBRATE: The matrix A will be equilibrated if necessary,
     *             then factored into L and U.
     *
     *         o trans (trans_t)
     *           Specifies the form of the system of equations:
     *           = NOTRANS: A * X = B        (No transpose)
     *           = TRANS:   A**T * X = B     (Transpose)
     *           = CONJ:    A**H * X = B     (Transpose)
     *
     *         o refact (yes_no_t)
     *           Specifies whether this is first time or subsequent factorization.
     *           = NO:  this factorization is treated as the first one;
     *           = YES: it means that a factorization was performed prior to this
     *               one. Therefore, this factorization will re-use some
     *               existing data structures, such as L and U storage, column
     *               elimination tree, and the symbolic information of the
     *               Householder matrix.
     *
     *         o panel_size (int)
     *           A panel consists of at most panel_size consecutive columns.
     *
     *         o relax (int)
     *           To control degree of relaxing supernodes. If the number
     *           of nodes (columns) in a subtree of the elimination tree is less
     *           than relax, this subtree is considered as one supernode,
     *           regardless of the row structures of those columns.
     *
     *         o diag_pivot_thresh (float)
     *           Diagonal pivoting threshold. At step j of the Gaussian
     *           elimination, if
     *               abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)),
     *           use A_jj as pivot, else use A_ij with maximum magnitude.
     *           0 <= diag_pivot_thresh <= 1. The default value is 1,
     *           corresponding to partial pivoting.
     *
     *         o usepr (yes_no_t)
     *           Whether the pivoting will use perm_r specified by the user.
     *           = YES: use perm_r; perm_r is input, unchanged on exit.
     *           = NO:  perm_r is determined by partial pivoting, and is output.
     *
     *         o drop_tol (double) (NOT IMPLEMENTED)
     *	     Drop tolerance parameter. At step j of the Gaussian elimination,
     *           if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
     *           0 <= drop_tol <= 1. The default value of drop_tol is 0,
     *           corresponding to not dropping any entry.
     *
     *         o work (void*) of size lwork
     *           User-supplied work space and space for the output data structures.
     *           Not referenced if lwork = 0;
     *
     *         o lwork (int)
     *           Specifies the length of work array.
     *           = 0:  allocate space internally by system malloc;
     *           > 0:  use user-supplied work array of length lwork in bytes,
     *                 returns error if space runs out.
     *           = -1: the routine guesses the amount of space needed without
     *                 performing the factorization, and returns it in
     *                 superlu_memusage->total_needed; no other side effects.
     *
     * A       (input/output) SuperMatrix*
     *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol), where
     *         A->nrow = A->ncol. Currently, the type of A can be:
     *         Stype = NC or NR, Dtype = _D, Mtype = GE. In the future,
     *         more general A will be handled.
     *
     *         On entry, If superlumt_options->fact = FACTORED and equed is not
     *         NOEQUIL, then A must have been equilibrated by the scaling factors
     *         in R and/or C.  On exit, A is not modified
     *         if superlumt_options->fact = FACTORED or DOFACT, or
     *         if superlumt_options->fact = EQUILIBRATE and equed = NOEQUIL.
     *
     *         On exit, if superlumt_options->fact = EQUILIBRATE and equed is not
     *         NOEQUIL, A is scaled as follows:
     *         If A->Stype = NC:
     *           equed = ROW:  A := diag(R) * A
     *           equed = COL:  A := A * diag(C)
     *           equed = BOTH: A := diag(R) * A * diag(C).
     *         If A->Stype = NR:
     *           equed = ROW:  transpose(A) := diag(R) * transpose(A)
     *           equed = COL:  transpose(A) := transpose(A) * diag(C)
     *           equed = BOTH: transpose(A) := diag(R) * transpose(A) * diag(C).
     *
     * perm_c  (input/output) int*
     *	   If A->Stype = NC, Column permutation vector of size A->ncol,
     *         which defines the permutation matrix Pc; perm_c[i] = j means
     *         column i of A is in position j in A*Pc.
     *         On exit, perm_c may be overwritten by the product of the input
     *         perm_c and a permutation that postorders the elimination tree
     *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
     *         is already in postorder.
     *
     *         If A->Stype = NR, column permutation vector of size A->nrow,
     *         which describes permutation of columns of tranpose(A)
     *         (rows of A) as described above.
     *
     * perm_r  (input/output) int*
     *         If A->Stype = NC, row permutation vector of size A->nrow,
     *         which defines the permutation matrix Pr, and is determined
     *         by partial pivoting.  perm_r[i] = j means row i of A is in
     *         position j in Pr*A.
     *
     *         If A->Stype = NR, permutation vector of size A->ncol, which
     *         determines permutation of rows of transpose(A)
     *         (columns of A) as described above.
     *
     *         If superlumt_options->usepr = NO, perm_r is output argument;
     *         If superlumt_options->usepr = YES, the pivoting routine will try
     *            to use the input perm_r, unless a certain threshold criterion
     *            is violated. In that case, perm_r is overwritten by a new
     *            permutation determined by partial pivoting or diagonal
     *            threshold pivoting.
     *
     * equed   (input/output) equed_t*
     *         Specifies the form of equilibration that was done.
     *         = NOEQUIL: No equilibration.
     *         = ROW:  Row equilibration, i.e., A was premultiplied by diag(R).
     *         = COL:  Column equilibration, i.e., A was postmultiplied by diag(C).
     *         = BOTH: Both row and column equilibration, i.e., A was replaced
     *                 by diag(R)*A*diag(C).
     *         If superlumt_options->fact = FACTORED, equed is an input argument,
     *         otherwise it is an output argument.
     *
     * R       (input/output) double*, dimension (A->nrow)
     *         The row scale factors for A or transpose(A).
     *         If equed = ROW or BOTH, A (if A->Stype = NC) or transpose(A)
     *            (if A->Stype = NR) is multiplied on the left by diag(R).
     *         If equed = NOEQUIL or COL, R is not accessed.
     *         If fact = FACTORED, R is an input argument; otherwise, R is output.
     *         If fact = FACTORED and equed = ROW or BOTH, each element of R must
     *            be positive.
     *
     * C       (input/output) double*, dimension (A->ncol)
     *         The column scale factors for A or transpose(A).
     *         If equed = COL or BOTH, A (if A->Stype = NC) or trnspose(A)
     *            (if A->Stype = NR) is multiplied on the right by diag(C).
     *         If equed = NOEQUIL or ROW, C is not accessed.
     *         If fact = FACTORED, C is an input argument; otherwise, C is output.
     *         If fact = FACTORED and equed = COL or BOTH, each element of C must
     *            be positive.
     *
     * L       (output) SuperMatrix*
     *	   The factor L from the factorization
     *             Pr*A*Pc=L*U              (if A->Stype = NC) or
     *             Pr*transpose(A)*Pc=L*U   (if A->Stype = NR).
     *         Uses compressed row subscripts storage for supernodes, i.e.,
     *         L has types: Stype = SCP, Dtype = _D, Mtype = TRLU.
     *
     * U       (output) SuperMatrix*
     *	   The factor U from the factorization
     *             Pr*A*Pc=L*U              (if A->Stype = NC) or
     *             Pr*transpose(A)*Pc=L*U   (if A->Stype = NR).
     *         Uses column-wise storage scheme, i.e., U has types:
     *         Stype = NCP, Dtype = _D, Mtype = TRU.
     *
     * B       (input/output) SuperMatrix*
     *         B has types: Stype = DN, Dtype = _D, Mtype = GE.
     *         On entry, the right hand side matrix.
     *         On exit,
     *            if equed = NOEQUIL, B is not modified; otherwise
     *            if A->Stype = NC:
     *               if trans = NOTRANS and equed = ROW or BOTH, B is overwritten
     *                  by diag(R)*B;
     *               if trans = TRANS or CONJ and equed = COL of BOTH, B is
     *                  overwritten by diag(C)*B;
     *            if A->Stype = NR:
     *               if trans = NOTRANS and equed = COL or BOTH, B is overwritten
     *                  by diag(C)*B;
     *               if trans = TRANS or CONJ and equed = ROW of BOTH, B is
     *                  overwritten by diag(R)*B.
     *
     * X       (output) SuperMatrix*
     *         X has types: Stype = DN, Dtype = _D, Mtype = GE.
     *         If info = 0 or info = A->ncol+1, X contains the solution matrix
     *         to the original system of equations. Note that A and B are modified
     *         on exit if equed is not NOEQUIL, and the solution to the
     *         equilibrated system is inv(diag(C))*X if trans = NOTRANS and
     *         equed = COL or BOTH, or inv(diag(R))*X if trans = TRANS or CONJ
     *         and equed = ROW or BOTH.
     *
     * recip_pivot_growth (output) float*
     *         The reciprocal pivot growth factor computed as
     *             max_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ).
     *         If recip_pivot_growth is much less than 1, the stability of the
     *         LU factorization could be poor.
     *
     * rcond   (output) float*
     *         The estimate of the reciprocal condition number of the matrix A
     *         after equilibration (if done). If rcond is less than the machine
     *         precision (in particular, if rcond = 0), the matrix is singular
     *         to working precision. This condition is indicated by a return
     *         code of info > 0.
     *
     * ferr    (output) float*, dimension (B->ncol)
     *         The estimated forward error bound for each solution vector
     *         X(j) (the j-th column of the solution matrix X).
     *         If XTRUE is the true solution corresponding to X(j), FERR(j)
     *         is an estimated upper bound for the magnitude of the largest
     *         element in (X(j) - XTRUE) divided by the magnitude of the
     *         largest element in X(j).  The estimate is as reliable as
     *         the estimate for RCOND, and is almost always a slight
     *         overestimate of the true error.
     *
     * berr    (output) float*, dimension (B->ncol)
     *         The componentwise relative backward error of each solution
     *         vector X(j) (i.e., the smallest relative change in
     *         any element of A or B that makes X(j) an exact solution).
     *
     * superlu_memusage (output) superlu_memusage_t*
     *         Record the memory usage statistics, consisting of following fields:
     *         - for_lu (float)
     *           The amount of space used in bytes for L\U data structures.
     *         - total_needed (float)
     *           The amount of space needed in bytes to perform factorization.
     *         - expansions (int)
     *           The number of memory expansions during the LU factorization.
     *
     * info    (output) int*
     *         = 0: successful exit
     *         < 0: if info = -i, the i-th argument had an illegal value
     *         > 0: if info = i, and i is
     *              <= A->ncol: U(i,i) is exactly zero. The factorization has
     *                    been completed, but the factor U is exactly
     *                    singular, so the solution and error bounds
     *                    could not be computed.
     *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
     *                    precision, meaning that the matrix is singular to
     *                    working precision. Nevertheless, the solution and
     *                    error bounds are computed because there are a number
     *                    of situations where the computed solution can be more
     *                    accurate than the value of RCOND would suggest.
     *              > A->ncol+1: number of bytes allocated when memory allocation
     *                    failure occurred, plus A->ncol.
     *
     */

    NCformat  *Astore;
    DNformat  *Bstore, *Xstore;
    float    *Bmat, *Xmat;
    int       ldb, ldx, nrhs;
    SuperMatrix *AA; /* A in NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int       colequ, equil, dofact, notran, rowequ;
    char      norm[1];
    trans_t   trant;
    int       i, j, info1;
    float amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
    int       n, relax, panel_size;
    Gstat_t   Gstat;
    double    t0;      /* temporary time */
    double    *utime;
    flops_t   *ops, flopcnt;

    /* External functions */
    extern float slangs(char *, SuperMatrix *);
    extern double slamch_(char *);

    Astore = A->Store;
    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    n      = A->ncol;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;
    superlumt_options->perm_c = perm_c;
    superlumt_options->perm_r = perm_r;

    *info = 0;
    dofact = (superlumt_options->fact == DOFACT);
    equil = (superlumt_options->fact == EQUILIBRATE);
    notran = (superlumt_options->trans == NOTRANS);
    if (dofact || equil) {
        *equed = NOEQUIL;
        rowequ = FALSE;
        colequ = FALSE;
    } else {
        rowequ = (*equed == ROW) || (*equed == BOTH);
        colequ = (*equed == COL) || (*equed == BOTH);
        smlnum = slamch_("Safe minimum");
        bignum = 1. / smlnum;
    }

    /* ------------------------------------------------------------
       Test the input parameters.
       ------------------------------------------------------------*/
    if ( nprocs <= 0 ) *info = -1;
    else if ( (!dofact && !equil && (superlumt_options->fact != FACTORED))
              || (!notran && (superlumt_options->trans != TRANS) &&
                  (superlumt_options->trans != CONJ))
              || (superlumt_options->refact != YES &&
                  superlumt_options->refact != NO)
              || (superlumt_options->usepr != YES &&
                  superlumt_options->usepr != NO)
              || superlumt_options->lwork < -1 )
        *info = -2;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
              (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
              A->Dtype != SLU_S || A->Mtype != SLU_GE )
        *info = -3;
    else if ((superlumt_options->fact == FACTORED) &&
             !(rowequ || colequ || (*equed == NOEQUIL))) *info = -6;
    else {
        if (rowequ) {
            rcmin = bignum;
            rcmax = 0.;
            for (j = 0; j < A->nrow; ++j) {
                rcmin = SUPERLU_MIN(rcmin, R[j]);
                rcmax = SUPERLU_MAX(rcmax, R[j]);
            }
            if (rcmin <= 0.) *info = -7;
            else if ( A->nrow > 0)
                rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
            else rowcnd = 1.;
        }
        if (colequ && *info == 0) {
            rcmin = bignum;
            rcmax = 0.;
            for (j = 0; j < A->nrow; ++j) {
                rcmin = SUPERLU_MIN(rcmin, C[j]);
                rcmax = SUPERLU_MAX(rcmax, C[j]);
            }
            if (rcmin <= 0.) *info = -8;
            else if (A->nrow > 0)
                colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
            else colcnd = 1.;
        }
        if (*info == 0) {
            if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
                    B->Stype != SLU_DN || B->Dtype != SLU_S ||
                    B->Mtype != SLU_GE )
                *info = -11;
            else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
                      B->ncol != X->ncol || X->Stype != SLU_DN ||
                      X->Dtype != SLU_S || X->Mtype != SLU_GE )
                *info = -12;
        }
    }
    if (*info != 0) {
        i = -(*info);
        xerbla_("psgssvx", &i);
        return;
    }


    /* ------------------------------------------------------------
       Allocate storage and initialize statistics variables.
       ------------------------------------------------------------*/
    panel_size = superlumt_options->panel_size;
    relax = superlumt_options->relax;
    StatAlloc(n, nprocs, panel_size, relax, &Gstat);
    StatInit(n, nprocs, &Gstat);
    utime = Gstat.utime;
    ops = Gstat.ops;

    /* ------------------------------------------------------------
       Convert A to NC format when necessary.
       ------------------------------------------------------------*/
    if ( A->Stype == SLU_NR ) {
        NRformat *Astore = A->Store;
        AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
        sCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz,
                               Astore->nzval, Astore->colind, Astore->rowptr,
                               SLU_NC, A->Dtype, A->Mtype);
        if ( notran ) { /* Reverse the transpose argument. */
            trant = TRANS;
            notran = 0;
        } else {
            trant = NOTRANS;
            notran = 1;
        }
    } else { /* A->Stype == NC */
        trant = superlumt_options->trans;
        AA = A;
    }

    /* ------------------------------------------------------------
       Diagonal scaling to equilibrate the matrix.
       ------------------------------------------------------------*/
    if ( equil ) {
        t0 = SuperLU_timer_();
        /* Compute row and column scalings to equilibrate the matrix A. */
        sgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);

        if ( info1 == 0 ) {
            /* Equilibrate matrix A. */
            slaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
            rowequ = (*equed == ROW) || (*equed == BOTH);
            colequ = (*equed == COL) || (*equed == BOTH);
        }
        utime[EQUIL] = SuperLU_timer_() - t0;
    }

    /* ------------------------------------------------------------
       Scale the right hand side.
       ------------------------------------------------------------*/
    if ( notran ) {
        if ( rowequ ) {
            for (j = 0; j < nrhs; ++j)
                for (i = 0; i < A->nrow; ++i) {
                    Bmat[i + j*ldb] *= R[i];
                }
        }
    } else if ( colequ ) {
        for (j = 0; j < nrhs; ++j)
            for (i = 0; i < A->nrow; ++i) {
                Bmat[i + j*ldb] *= C[i];
            }
    }


    /* ------------------------------------------------------------
       Perform the LU factorization.
       ------------------------------------------------------------*/
    if ( dofact || equil ) {

        /* Obtain column etree, the column count (colcnt_h) and supernode
        partition (part_super_h) for the Householder matrix. */
        t0 = SuperLU_timer_();
        sp_colorder(AA, perm_c, superlumt_options, &AC);
        utime[ETREE] = SuperLU_timer_() - t0;

#if ( PRNTlevel >= 2 )
        printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n",
               relax, panel_size, sp_ienv(3), sp_ienv(4));
        fflush(stdout);
#endif

        /* Compute the LU factorization of A*Pc. */
        t0 = SuperLU_timer_();
        psgstrf(superlumt_options, &AC, perm_r, L, U, &Gstat, info);
        utime[FACT] = SuperLU_timer_() - t0;

        flopcnt = 0;
        for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops;
        ops[FACT] = flopcnt;

        if ( superlumt_options->lwork == -1 ) {
            superlu_memusage->total_needed = *info - A->ncol;
            return;
        }
    }

    if ( *info > 0 ) {
        if ( *info <= A->ncol ) {
            /* Compute the reciprocal pivot growth factor of the leading
               rank-deficient *info columns of A. */
            *recip_pivot_growth = sPivotGrowth(*info, AA, perm_c, L, U);
        }
    } else {

        /* ------------------------------------------------------------
           Compute the reciprocal pivot growth factor *recip_pivot_growth.
           ------------------------------------------------------------*/
        *recip_pivot_growth = sPivotGrowth(A->ncol, AA, perm_c, L, U);

        /* ------------------------------------------------------------
           Estimate the reciprocal of the condition number of A.
           ------------------------------------------------------------*/
        t0 = SuperLU_timer_();
        if ( notran ) {
            *(unsigned char *)norm = '1';
        } else {
            *(unsigned char *)norm = 'I';
        }
        anorm = slangs(norm, AA);
        sgscon(norm, L, U, anorm, rcond, info);
        utime[RCOND] = SuperLU_timer_() - t0;

        /* ------------------------------------------------------------
           Compute the solution matrix X.
           ------------------------------------------------------------*/
        for (j = 0; j < nrhs; j++)    /* Save a copy of the right hand sides */
            for (i = 0; i < B->nrow; i++)
                Xmat[i + j*ldx] = Bmat[i + j*ldb];

        t0 = SuperLU_timer_();
        sgstrs(trant, L, U, perm_r, perm_c, X, &Gstat, info);
        utime[SOLVE] = SuperLU_timer_() - t0;
        ops[SOLVE] = ops[TRISOLVE];

        /* ------------------------------------------------------------
           Use iterative refinement to improve the computed solution and
           compute error bounds and backward error estimates for it.
           ------------------------------------------------------------*/
        t0 = SuperLU_timer_();
        sgsrfs(trant, AA, L, U, perm_r, perm_c, *equed,
               R, C, B, X, ferr, berr, &Gstat, info);
        utime[REFINE] = SuperLU_timer_() - t0;

        /* ------------------------------------------------------------
           Transform the solution matrix X to a solution of the original
           system.
           ------------------------------------------------------------*/
        if ( notran ) {
            if ( colequ ) {
                for (j = 0; j < nrhs; ++j)
                    for (i = 0; i < A->nrow; ++i) {
                        Xmat[i + j*ldx] *= C[i];
                    }
            }
        } else if ( rowequ ) {
            for (j = 0; j < nrhs; ++j)
                for (i = 0; i < A->nrow; ++i) {
                    Xmat[i + j*ldx] *= R[i];
                }
        }

        /* Set INFO = A->ncol+1 if the matrix is singular to
           working precision.*/
        if ( *rcond < slamch_("E") ) *info = A->ncol + 1;

    }

    superlu_sQuerySpace(nprocs, L, U, panel_size, superlu_memusage);

    /* ------------------------------------------------------------
       Deallocate storage after factorization.
       ------------------------------------------------------------*/
    if ( superlumt_options->refact == NO ) {
        SUPERLU_FREE(superlumt_options->etree);
        SUPERLU_FREE(superlumt_options->colcnt_h);
        SUPERLU_FREE(superlumt_options->part_super_h);
    }
    if ( dofact || equil ) {
        Destroy_CompCol_Permuted(&AC);
    }
    if ( A->Stype == SLU_NR ) {
        Destroy_SuperMatrix_Store(AA);
        SUPERLU_FREE(AA);
    }

    /* ------------------------------------------------------------
       Print timings, then deallocate statistic variables.
       ------------------------------------------------------------*/
#ifdef PROFILE
    {
        SCPformat *Lstore = (SCPformat *) L->Store;
        ParallelProfile(n, Lstore->nsuper+1, Gstat.num_panels, nprocs, &Gstat);
    }
#endif
    PrintStat(&Gstat);
    StatFree(&Gstat);
}
Ejemplo n.º 14
0
/*! \brief
 *
 * <pre>
 * Purpose   
 *   =======   
 *
 *   DGSEQU computes row and column scalings intended to equilibrate an   
 *   M-by-N sparse matrix A and reduce its condition number. R returns the row
 *   scale factors and C the column scale factors, chosen to try to make   
 *   the largest element in each row and column of the matrix B with   
 *   elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
 *
 *   R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
 *   number and BIGNUM = largest safe number.  Use of these scaling   
 *   factors is not guaranteed to reduce the condition number of A but   
 *   works well in practice.   
 *
 *   See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 *   Arguments   
 *   =========   
 *
 *   A       (input) SuperMatrix*
 *           The matrix of dimension (A->nrow, A->ncol) whose equilibration
 *           factors are to be computed. The type of A can be:
 *           Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
 *	    
 *   R       (output) double*, size A->nrow
 *           If INFO = 0 or INFO > M, R contains the row scale factors   
 *           for A.
 *	    
 *   C       (output) double*, size A->ncol
 *           If INFO = 0,  C contains the column scale factors for A.
 *	    
 *   ROWCND  (output) double*
 *           If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
 *           smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
 *           AMAX is neither too large nor too small, it is not worth   
 *           scaling by R.
 *	    
 *   COLCND  (output) double*
 *           If INFO = 0, COLCND contains the ratio of the smallest   
 *           C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
 *           worth scaling by C.
 *	    
 *   AMAX    (output) double*
 *           Absolute value of largest matrix element.  If AMAX is very   
 *           close to overflow or very close to underflow, the matrix   
 *           should be scaled.
 *	    
 *   INFO    (output) int*
 *           = 0:  successful exit   
 *           < 0:  if INFO = -i, the i-th argument had an illegal value   
 *           > 0:  if INFO = i,  and i is   
 *                 <= A->nrow:  the i-th row of A is exactly zero   
 *                 >  A->ncol:  the (i-M)-th column of A is exactly zero   
 *
 *   ===================================================================== 
 * </pre>
 */
void
dgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
	double *colcnd, double *amax, int *info)
{


    /* Local variables */
    NCformat *Astore;
    double   *Aval;
    int i, j, irow;
    double rcmin, rcmax;
    double bignum, smlnum;
    extern double dlamch_(char *);
    
    /* Test the input parameters. */
    *info = 0;
    if ( A->nrow < 0 || A->ncol < 0 ||
	 A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -1;
    if (*info != 0) {
	i = -(*info);
	xerbla_("dgsequ", &i);
	return;
    }

    /* Quick return if possible */
    if ( A->nrow == 0 || A->ncol == 0 ) {
	*rowcnd = 1.;
	*colcnd = 1.;
	*amax = 0.;
	return;
    }

    Astore = A->Store;
    Aval = Astore->nzval;
    
    /* Get machine constants. */
    smlnum = dlamch_("S");
    bignum = 1. / smlnum;

    /* Compute row scale factors. */
    for (i = 0; i < A->nrow; ++i) r[i] = 0.;

    /* Find the maximum element in each row. */
    for (j = 0; j < A->ncol; ++j)
	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
	    irow = Astore->rowind[i];
	    r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) );
	}

    /* Find the maximum and minimum scale factors. */
    rcmin = bignum;
    rcmax = 0.;
    for (i = 0; i < A->nrow; ++i) {
	rcmax = SUPERLU_MAX(rcmax, r[i]);
	rcmin = SUPERLU_MIN(rcmin, r[i]);
    }
    *amax = rcmax;

    if (rcmin == 0.) {
	/* Find the first zero scale factor and return an error code. */
	for (i = 0; i < A->nrow; ++i)
	    if (r[i] == 0.) {
		*info = i + 1;
		return;
	    }
    } else {
	/* Invert the scale factors. */
	for (i = 0; i < A->nrow; ++i)
	    r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
	/* Compute ROWCND = min(R(I)) / max(R(I)) */
	*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
    }

    /* Compute column scale factors */
    for (j = 0; j < A->ncol; ++j) c[j] = 0.;

    /* Find the maximum element in each column, assuming the row
       scalings computed above. */
    for (j = 0; j < A->ncol; ++j)
	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
	    irow = Astore->rowind[i];
	    c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] );
	}

    /* Find the maximum and minimum scale factors. */
    rcmin = bignum;
    rcmax = 0.;
    for (j = 0; j < A->ncol; ++j) {
	rcmax = SUPERLU_MAX(rcmax, c[j]);
	rcmin = SUPERLU_MIN(rcmin, c[j]);
    }

    if (rcmin == 0.) {
	/* Find the first zero scale factor and return an error code. */
	for (j = 0; j < A->ncol; ++j)
	    if ( c[j] == 0. ) {
		*info = A->nrow + j + 1;
		return;
	    }
    } else {
	/* Invert the scale factors. */
	for (j = 0; j < A->ncol; ++j)
	    c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
	/* Compute COLCND = min(C(J)) / max(C(J)) */
	*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
    }

    return;

} /* dgsequ */
Ejemplo n.º 15
0
void
pdgstrf_thread_finalize(pdgstrf_threadarg_t *pdgstrf_threadarg, 
			pxgstrf_shared_t *pxgstrf_shared,
			SuperMatrix *A, int *perm_r,
			SuperMatrix *L, SuperMatrix *U
			)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======
 * 
 * pdgstrf_thread_finalize() performs cleanups after the multithreaded 
 * factorization pdgstrf_thread(). It sets up the L and U data
 * structures, and deallocats the storage associated with the structures
 * pxgstrf_shared and pdgstrf_threadarg.
 *
 * Arguments
 * =========
 *
 * pdgstrf_threadarg (input) pdgstrf_threadarg_t*
 *          The structure contains the parameters to each thread.
 *
 * pxgstrf_shared (input) pxgstrf_shared_t*
 *          The structure contains the shared task queue, the 
 *          synchronization variables, and the L and U data structures.
 *
 * A        (input) SuperMatrix*
 *	    Original matrix A, permutated by columns, of dimension
 *          (A->nrow, A->ncol). The type of A can be:
 *          Stype = NCP; Dtype = _D; Mtype = GE.
 *
 * perm_r   (input) int*, dimension A->nrow
 *          Row permutation vector which defines the permutation matrix Pr,
 *          perm_r[i] = j means row i of A is in position j in Pr*A.
 *
 * L        (output) SuperMatrix*
 *          The factor L from the factorization Pr*A=L*U; use compressed row 
 *          subscripts storage for supernodes, i.e., L has type: 
 *          Stype = SCP, Dtype = _D, Mtype = TRLU.
 *
 * U        (output) SuperMatrix*
 *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
 *          storage scheme, i.e., U has type:
 *          Stype = NCP, Dtype = _D, Mtype = TRU.
 *
 *
 */
    register int nprocs, n, i, iinfo;
    int       nnzL, nnzU;
    superlumt_options_t *superlumt_options;
    GlobalLU_t *Glu;
    extern ExpHeader *dexpanders;

    n = A->ncol;
    superlumt_options = pdgstrf_threadarg->superlumt_options;
    Glu = pxgstrf_shared->Glu;
    Glu->supno[n] = Glu->nsuper;

    countnz(n, pxgstrf_shared->xprune, &nnzL, &nnzU, Glu);
    fixupL(n, perm_r, Glu);
    
#ifdef COMPRESS_LUSUP
    compressSUP(n, pxgstrf_shared->Glu);
#endif

    if ( superlumt_options->refact == YES ) {
        /* L and U structures may have changed due to possibly different
	   pivoting, although the storage is available. */
        ((SCPformat *)L->Store)->nnz = nnzL;
	((SCPformat *)L->Store)->nsuper = Glu->supno[n];
	((NCPformat *)U->Store)->nnz = nnzU;
    } else {
	dCreate_SuperNode_Permuted(L, A->nrow, A->ncol, nnzL, Glu->lusup,
				   Glu->xlusup, Glu->xlusup_end, 
				   Glu->lsub, Glu->xlsub, Glu->xlsub_end,
				   Glu->supno, Glu->xsup, Glu->xsup_end,
				   SLU_SCP, SLU_D, SLU_TRLU);
	dCreate_CompCol_Permuted(U, A->nrow, A->ncol, nnzU, Glu->ucol,
				 Glu->usub, Glu->xusub, Glu->xusub_end,
				 SLU_NCP, SLU_D, SLU_TRU);
    }

    /* Combine the INFO returned from individual threads. */
    iinfo = 0;
    nprocs = superlumt_options->nprocs;
    for (i = 0; i < nprocs; ++i) {
        if ( pdgstrf_threadarg[i].info ) {
	    if (iinfo) iinfo=SUPERLU_MIN(iinfo, pdgstrf_threadarg[i].info);
	    else iinfo = pdgstrf_threadarg[i].info;
	}
    }
    *pxgstrf_shared->info = iinfo;

#if ( DEBUGlevel>=2 )
    printf("Last nsuper %d\n", Glu->nsuper);
    QueryQueue(&pxgstrf_shared->taskq);
    PrintGLGU(n, pxgstrf_shared->xprune, Glu);
    PrintInt10("perm_r", n, perm_r);
    PrintInt10("inv_perm_r", n, pxgstrf_shared->inv_perm_r);
#endif

    /* Deallocate the storage used by the parallel scheduling algorithm. */
    ParallelFinalize(pxgstrf_shared);
    SUPERLU_FREE(pdgstrf_threadarg);
    SUPERLU_FREE(pxgstrf_shared->inv_perm_r);
    SUPERLU_FREE(pxgstrf_shared->inv_perm_c);
    SUPERLU_FREE(pxgstrf_shared->xprune);
    SUPERLU_FREE(pxgstrf_shared->ispruned);
    SUPERLU_FREE(dexpanders);
    dexpanders = 0;

#if ( DEBUGlevel>=1 )
    printf("** pdgstrf_thread_finalize() called\n");
#endif
}
Ejemplo n.º 16
0
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *   symbfact() performs a symbolic factorization on matrix A and sets up 
 *   the nonzero data structures which are suitable for supernodal Gaussian
 *   elimination with no pivoting (GENP). This routine features:
 *        o depth-first search (DFS)
 *        o supernodes
 *        o symmetric structure pruning
 *
 * Return value
 * ============
 *   < 0, number of bytes needed for LSUB.
 *   = 0, matrix dimension is 1.
 *   > 0, number of bytes allocated when out of memory.
 * </pre>
 */
int_t symbfact
/************************************************************************/
(
 superlu_options_t *options, /* input options */
 int         pnum,     /* process number */
 SuperMatrix *A,       /* original matrix A permuted by columns (input) */
 int_t       *perm_c,  /* column permutation vector (input) */
 int_t       *etree,   /* column elimination tree (input) */
 Glu_persist_t *Glu_persist,  /* output */
 Glu_freeable_t *Glu_freeable /* output */
 )
{

    int_t m, n, min_mn, j, i, k, irep, nseg, pivrow, info;
    int_t *iwork, *perm_r, *segrep, *repfnz;
    int_t *xprune, *marker, *parent, *xplore;
    int_t relax, *desc, *relax_end;
    int_t nnzL, nnzU;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(pnum, "Enter symbfact()");
#endif

    m = A->nrow;
    n = A->ncol;
    min_mn = SUPERLU_MIN(m, n);

    /* Allocate storage common to the symbolic factor routines */
    info = symbfact_SubInit(DOFACT, NULL, 0, m, n, ((NCPformat*)A->Store)->nnz,
			    Glu_persist, Glu_freeable);

    iwork = (int_t *) intMalloc_dist(6*m+2*n);
    perm_r = iwork;
    segrep = iwork + m;
    repfnz = segrep + m;
    marker = repfnz + m;
    parent = marker + m;
    xplore = parent + m;
    xprune = xplore + m;
    relax_end = xprune + n;
    relax = sp_ienv_dist(2);
    ifill_dist(perm_r, m, EMPTY);
    ifill_dist(repfnz, m, EMPTY);
    ifill_dist(marker, m, EMPTY);
    Glu_persist->supno[0] = -1;
    Glu_persist->xsup[0] = 0;
    Glu_freeable->xlsub[0] = 0;
    Glu_freeable->xusub[0] = 0;

    /*for (j = 0; j < n; ++j) iperm_c[perm_c[j]] = j;*/

    /* Identify relaxed supernodes. */
    if ( !(desc = intMalloc_dist(n+1)) )
	ABORT("Malloc fails for desc[]");;
    relax_snode(n, etree, relax, desc, relax_end);
    SUPERLU_FREE(desc);
    
    for (j = 0; j < min_mn; ) {
	if ( relax_end[j] != EMPTY ) { /* beginning of a relaxed snode */
   	    k = relax_end[j];          /* end of the relaxed snode */
	 
	    /* Determine union of the row structure of supernode (j:k). */
	    if ( (info = snode_dfs(A, j, k, xprune, marker,
				   Glu_persist, Glu_freeable)) != 0 )
		return info;

	    for (i = j; i <= k; ++i)
		pivotL(i, perm_r, &pivrow, Glu_persist, Glu_freeable); 

	    j = k+1;
	} else {
	    /* Perform a symbolic factorization on column j, and detects
	       whether column j starts a new supernode. */
	    if ((info = column_dfs(A, j, perm_r, &nseg, segrep, repfnz,
				   xprune, marker, parent, xplore,
				   Glu_persist, Glu_freeable)) != 0)
		return info;
	    
	    /* Copy the U-segments to usub[*]. */
	    if ((info = set_usub(min_mn, j, nseg, segrep, repfnz,
				 Glu_persist, Glu_freeable)) != 0)
		return info;

	    pivotL(j, perm_r, &pivrow, Glu_persist, Glu_freeable); 

	    /* Prune columns [0:j-1] using column j. */
	    pruneL(j, perm_r, pivrow, nseg, segrep, repfnz, xprune,
		   Glu_persist, Glu_freeable);

	    /* Reset repfnz[*] to prepare for the next column. */
	    for (i = 0; i < nseg; i++) {
		irep = segrep[i];
		repfnz[irep] = EMPTY;
	    }

	    ++j;
	} /* else */
    } /* for j ... */

    countnz_dist(min_mn, xprune, &nnzL, &nnzU, Glu_persist, Glu_freeable);

    /* Apply perm_r to L; Compress LSUB array. */
    i = fixupL_dist(min_mn, perm_r, Glu_persist, Glu_freeable);

    if ( !pnum && (options->PrintStat == YES)) {
	printf("\tNonzeros in L       %ld\n", nnzL);
	printf("\tNonzeros in U       %ld\n", nnzU);
	printf("\tnonzeros in L+U     %ld\n", nnzL + nnzU - min_mn);
	printf("\tnonzeros in LSUB    %ld\n", i);
    }
    SUPERLU_FREE(iwork);

#if ( PRNTlevel>=3 )
    PrintInt10("lsub", Glu_freeable->xlsub[n], Glu_freeable->lsub);
    PrintInt10("xlsub", n+1, Glu_freeable->xlsub);
    PrintInt10("xprune", n, xprune);
    PrintInt10("usub", Glu_freeable->xusub[n], Glu_freeable->usub);
    PrintInt10("xusub", n+1, Glu_freeable->xusub);
    PrintInt10("supno", n, Glu_persist->supno);
    PrintInt10("xsup", (Glu_persist->supno[n])+2, Glu_persist->xsup);
#endif

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(pnum, "Exit symbfact()");
#endif

    return (-i);

} /* SYMBFACT */
Ejemplo n.º 17
0
int
ParallelInit(int n, pxgstrf_relax_t *pxgstrf_relax, 
	     superlumt_options_t *superlumt_options, 
	     pxgstrf_shared_t *pxgstrf_shared)
{
    int      *etree = superlumt_options->etree;
    register int w, dad, ukids, i, j, k, rs, panel_size, relax;
    register int P, w_top, do_split = 0;
    panel_t panel_type;
    int      *panel_histo = pxgstrf_shared->Gstat->panel_histo;
    register int nthr, concurrency, info;
    Gstat_t  *Gstat = pxgstrf_shared->Gstat;

#if ( MACH==SUN )
    register int sync_type = USYNC_THREAD;
    
    /* Set concurrency level. */
    nthr = sysconf(_SC_NPROCESSORS_ONLN);
    thr_setconcurrency(nthr);            /* number of LWPs */
    concurrency = thr_getconcurrency();

#if ( PRNTlevel==1 )    
    printf(".. CPUs %d, concurrency (#LWP) %d, P %d\n",
	   nthr, concurrency, P);
#endif

    /* Initialize mutex variables. */
    pxgstrf_shared->lu_locks = (mutex_t *) 
        SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(mutex_t));
    for (i = 0; i < NO_GLU_LOCKS; ++i)
	mutex_init(&pxgstrf_shared->lu_locks[i], sync_type, 0);

#elif ( MACH==DEC || MACH==PTHREAD )
    pxgstrf_shared->lu_locks = (pthread_mutex_t *) 
        SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(pthread_mutex_t));
    for (i = 0; i < NO_GLU_LOCKS; ++i)
	pthread_mutex_init(&pxgstrf_shared->lu_locks[i], NULL);
#else

    pxgstrf_shared->lu_locks = (mutex_t *) SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(mutex_t));

#endif    
    
#if ( PRNTlevel==1 )
    printf(".. ParallelInit() ... nprocs %2d\n", superlumt_options->nprocs);
#endif

    pxgstrf_shared->spin_locks = intCalloc(n);
    pxgstrf_shared->pan_status = 
        (pan_status_t *) SUPERLU_MALLOC((n+1)*sizeof(pan_status_t));
    pxgstrf_shared->fb_cols    = intMalloc(n+1);

    panel_size = superlumt_options->panel_size;
    relax = superlumt_options->relax;
    w = SUPERLU_MAX(panel_size, relax) + 1;
    for (i = 0; i < w; ++i) panel_histo[i] = 0;
    pxgstrf_shared->num_splits = 0;
    
    if ( (info = queue_init(&pxgstrf_shared->taskq, n)) ) {
	fprintf(stderr, "ParallelInit(): %d\n", info);
	SUPERLU_ABORT("queue_init fails.");
    }

    /* Count children of each node in the etree. */
    for (i = 0; i <= n; ++i) pxgstrf_shared->pan_status[i].ukids = 0;
    for (i = 0; i < n; ++i) {
	dad = etree[i];
	++pxgstrf_shared->pan_status[dad].ukids;
    }

    
    /* Find the panel partitions and initialize each panel's status */

#ifdef PROFILE
    Gstat->num_panels = 0;
#endif

    pxgstrf_shared->tasks_remain = 0;
    rs = 1;   /* index for the next relaxed s-node */
    w_top = panel_size/2;
    if ( w_top == 0 ) w_top = 1;
    P = 12;

    for (i = 0; i < n; ) {
	if ( pxgstrf_relax[rs].fcol == i ) {
	    w = pxgstrf_relax[rs++].size;
	    panel_type = RELAXED_SNODE;
	    pxgstrf_shared->pan_status[i].state = CANGO;
	} else {
	    /* Adjust panel_size so that a panel won't overlap with
	       the next relaxed snode.     */
#if 0
	    /* Only works when etree is postordered. */
	    w = SUPERLU_MIN(panel_size, pxgstrf_relax[rs].fcol - i);
#else
	    w = panel_size;
	    for (k = i + 1; k < SUPERLU_MIN(i + panel_size, n); ++k)
		if ( k == pxgstrf_relax[rs].fcol ) {
		    w = k - i;  /* panel stops at column k-1 */
		    break;
		}
	    if ( k == n ) w = n - i;
#endif

#ifdef SPLIT_TOP
	    if ( !do_split ) {
	  	if ( (n-i) < panel_size * P ) do_split = 1;
	    }
	    if ( do_split && w > w_top ) { /* split large panel */
	    	w = w_top;
	    	++pxgstrf_shared->num_splits;
	    }
#endif
	    for (j = i+1; j < i + w; ++j) 
		/* Do not allow panel to cross a branch point in the etree. */
		if ( pxgstrf_shared->pan_status[j].ukids > 1 ) break;
	    w = j - i;    /* j should start a new panel */
	    panel_type = REGULAR_PANEL;
	    pxgstrf_shared->pan_status[i].state = UNREADY;
#ifdef DOMAINS
	    if ( in_domain[i] == TREE_DOMAIN ) panel_type = TREE_DOMAIN;
#endif
	}

	if ( panel_type == REGULAR_PANEL ) {
	    ++pxgstrf_shared->tasks_remain;
	    /*printf("nondomain panel %6d -- %6d\n", i, i+w-1);
	    fflush(stdout);*/
	}

	ukids = k = 0;
	for (j = i; j < i + w; ++j) {
	    pxgstrf_shared->pan_status[j].size = k--;
	    pxgstrf_shared->pan_status[j].type = panel_type;
	    ukids += pxgstrf_shared->pan_status[j].ukids;
	}
	pxgstrf_shared->pan_status[i].size = w; /* leading column */
	/* only count those kids outside the panel */
	pxgstrf_shared->pan_status[i].ukids = ukids - (w-1);
	panel_histo[w]++;
	
#ifdef PROFILE
	Gstat->panstat[i].size = w;
	++Gstat->num_panels;
#endif
	
	pxgstrf_shared->fb_cols[i] = i;
	i += w;    /* move to the next panel */

    } /* for i ... */
    
    /* Dummy root */
    pxgstrf_shared->pan_status[n].size = 1;
    pxgstrf_shared->pan_status[n].state = UNREADY;

#if ( PRNTlevel==1 )
    printf(".. Split: P %d, #nondomain panels %d\n", P, pxgstrf_shared->tasks_remain);
#endif
#ifdef DOMAINS
    EnqueueDomains(&pxgstrf_shared->taskq, list_head, pxgstrf_shared);
#else
    EnqueueRelaxSnode(&pxgstrf_shared->taskq, n, pxgstrf_relax, pxgstrf_shared);
#endif
#if ( PRNTlevel==1 )
    printf(".. # tasks %d\n", pxgstrf_shared->tasks_remain);
    fflush(stdout);
#endif

#ifdef PREDICT_OPT
    /* Set up structure describing children */
    for (i = 0; i <= n; cp_firstkid[i++] = EMPTY);
    for (i = n-1; i >= 0; i--) {
	dad = etree[i];
	cp_nextkid[i] = cp_firstkid[dad];
	cp_firstkid[dad] = i;
    }
#endif

    return 0;
} /* ParallelInit */
Ejemplo n.º 18
0
void
pdgssvx_ABglobal(superlu_options_t_Distributed *options, SuperMatrix *A, 
		 ScalePermstruct_t *ScalePermstruct,
		 double B[], int ldb, int nrhs, gridinfo_t *grid,
		 LUstruct_t *LUstruct, double *berr,
		 SuperLUStat_t *stat, int *info)
{
/* 
 * -- Distributed SuperLU routine (version 1.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * September 1, 1999
 *
 *
 * Purpose
 * =======
 *
 * pdgssvx_ABglobal solves a system of linear equations A*X=B,
 * by using Gaussian elimination with "static pivoting" to
 * compute the LU factorization of A.
 *
 * Static pivoting is a technique that combines the numerical stability
 * of partial pivoting with the scalability of Cholesky (no pivoting),
 * to run accurately and efficiently on large numbers of processors.
 *
 * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
 * description of the parallel algorithms.
 *
 * Here are the options for using this code:
 *
 *   1. Independent of all the other options specified below, the
 *      user must supply
 *
 *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
 *      -  grid, a structure describing the 2D processor mesh
 *      -  options->IterRefine, which determines whether or not to
 *            improve the accuracy of the computed solution using 
 *            iterative refinement
 *
 *      On output, B is overwritten with the solution X.
 *
 *   2. Depending on options->Fact, the user has several options
 *      for solving A*X=B. The standard option is for factoring
 *      A "from scratch". (The other options, described below,
 *      are used when A is sufficiently similar to a previously 
 *      solved problem to save time by reusing part or all of 
 *      the previous factorization.)
 *
 *      -  options->Fact = DOFACT: A is factored "from scratch"
 *
 *      In this case the user must also supply
 *
 *      -  A, the input matrix
 *
 *      as well as the following options, which are described in more 
 *      detail below:
 *
 *      -  options->Equil,   to specify how to scale the rows and columns
 *                           of A to "equilibrate" it (to try to reduce its
 *                           condition number and so improve the
 *                           accuracy of the computed solution)
 *
 *      -  options->RowPerm, to specify how to permute the rows of A
 *                           (typically to control numerical stability)
 *
 *      -  options->ColPerm, to specify how to permute the columns of A
 *                           (typically to control fill-in and enhance
 *                           parallelism during factorization)
 *
 *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
 *                           pivots encountered during factorization
 *                           (to control numerical stability)
 *
 *      The outputs returned include
 *         
 *      -  ScalePermstruct,  modified to describe how the input matrix A
 *                           was equilibrated and permuted:
 *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
 *                                        columns of A were scaled
 *         -  ScalePermstruct->R, array of row scale factors
 *         -  ScalePermstruct->C, array of column scale factors
 *         -  ScalePermstruct->perm_r, row permutation vector
 *         -  ScalePermstruct->perm_c, column permutation vector
 *
 *            (part of ScalePermstruct may also need to be supplied on input,
 *             depending on options->RowPerm and options->ColPerm as described 
 *             later).
 *
 *      -  A, the input matrix A overwritten by the scaled and permuted matrix
 *                Pc*Pr*diag(R)*A*diag(C)
 *             where 
 *                Pr and Pc are row and columns permutation matrices determined
 *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c, 
 *                  respectively, and 
 *                diag(R) and diag(C) are diagonal scaling matrices determined
 *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
 *                  ScalePermstruct->C
 *
 *      -  LUstruct, which contains the L and U factorization of A1 where
 *
 *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
 *
 *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
 *               in A on output.)
 *
 *   3. The second value of options->Fact assumes that a matrix with the same
 *      sparsity pattern as A has already been factored:
 *     
 *      -  options->Fact = SamePattern: A is factored, assuming that it has
 *            the same nonzero pattern as a previously factored matrix. In this
 *            case the algorithm saves time by reusing the previously computed
 *            column permutation vector stored in ScalePermstruct->perm_c
 *            and the "elimination tree" of A stored in LUstruct->etree.
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *      -  options->Equil
 *      -  options->RowPerm
 *      -  options->ReplaceTinyPivot
 *
 *      but not options->ColPerm, whose value is ignored. This is because the
 *      previous column permutation from ScalePermstruct->perm_c is used as
 *      input. The user must also supply 
 *
 *      -  A, the input matrix
 *      -  ScalePermstruct->perm_c, the column permutation
 *      -  LUstruct->etree, the elimination tree
 *
 *      The outputs returned include
 *         
 *      -  A, the input matrix A overwritten by the scaled and permuted matrix
 *            as described above
 *      -  ScalePermstruct,  modified to describe how the input matrix A was
 *                           equilibrated and row permuted
 *      -  LUstruct, modified to contain the new L and U factors
 *
 *   4. The third value of options->Fact assumes that a matrix B with the same
 *      sparsity pattern as A has already been factored, and where the
 *      row permutation of B can be reused for A. This is useful when A and B
 *      have similar numerical values, so that the same row permutation
 *      will make both factorizations numerically stable. This lets us reuse
 *      all of the previously computed structure of L and U.
 *
 *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
 *            assuming not only the same nonzero pattern as the previously
 *            factored matrix B, but reusing B's row permutation.
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *      -  options->Equil
 *      -  options->ReplaceTinyPivot
 *
 *      but not options->RowPerm or options->ColPerm, whose values are ignored.
 *      This is because the permutations from ScalePermstruct->perm_r and
 *      ScalePermstruct->perm_c are used as input.
 *
 *      The user must also supply 
 *
 *      -  A, the input matrix
 *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
 *                                     column scaled
 *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
 *      -  ScalePermstruct->C, the columns scalings of the previous matrix, 
 *                             if any
 *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
 *      -  ScalePermstruct->perm_c, the column permutation of the previous 
 *                                  matrix
 *      -  all of LUstruct, the previously computed information about L and U
 *                (the actual numerical values of L and U stored in
 *                 LUstruct->Llu are ignored)
 *
 *      The outputs returned include
 *         
 *      -  A, the input matrix A overwritten by the scaled and permuted matrix
 *            as described above
 *      -  ScalePermstruct,  modified to describe how the input matrix A was
 *                           equilibrated 
 *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
 *      -  LUstruct, modified to contain the new L and U factors
 *
 *   5. The fourth and last value of options->Fact assumes that A is
 *      identical to a matrix that has already been factored on a previous 
 *      call, and reuses its entire LU factorization
 *
 *      -  options->Fact = Factored: A is identical to a previously
 *            factorized matrix, so the entire previous factorization
 *            can be reused.
 *
 *      In this case all the other options mentioned above are ignored
 *      (options->Equil, options->RowPerm, options->ColPerm, 
 *       options->ReplaceTinyPivot)
 *
 *      The user must also supply 
 *
 *      -  A, the unfactored matrix, only in the case that iterative refinment
 *            is to be done (specifically A must be the output A from 
 *            the previous call, so that it has been scaled and permuted)
 *      -  all of ScalePermstruct
 *      -  all of LUstruct, including the actual numerical values of L and U
 *
 *      all of which are unmodified on output.
 *         
 * Arguments
 * =========
 *
 * options (input) superlu_options_t_Distributed*
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *         The following fields should be defined for this structure:
 *         
 *         o Fact (fact_t)
 *           Specifies whether or not the factored form of the matrix
 *           A is supplied on entry, and if not, how the matrix A should
 *           be factorized based on the previous history.
 *
 *           = DOFACT: The matrix A will be factorized from scratch.
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *
 *           = SamePattern: the matrix A will be factorized assuming
 *             that a factorization of a matrix with the same sparsity
 *             pattern was performed prior to this one. Therefore, this
 *             factorization will reuse column permutation vector 
 *             ScalePermstruct->perm_c and the elimination tree
 *             LUstruct->etree
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ReplaceTinyPivot
 *                          ScalePermstruct->perm_c
 *                          LUstruct->etree
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
 *                          rest of LUstruct (GLU_persist, Llu)
 *
 *           = SamePattern_SameRowPerm: the matrix A will be factorized
 *             assuming that a factorization of a matrix with the same
 *             sparsity	pattern and similar numerical values was performed
 *             prior to this one. Therefore, this factorization will reuse
 *             both row and column scaling factors R and C, and the
 *             both row and column permutation vectors perm_r and perm_c,
 *             distributed data structure set up from the previous symbolic
 *             factorization.
 *                 Inputs:  A
 *                          options->Equil, ReplaceTinyPivot
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          modified LUstruct->Llu
 *           = FACTORED: the matrix A is already factored.
 *                 Inputs:  all of ScalePermstruct
 *                          all of LUstruct
 *
 *         o Equil (yes_no_t)
 *           Specifies whether to equilibrate the system.
 *           = NO:  no equilibration.
 *           = YES: scaling factors are computed to equilibrate the system:
 *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
 *                  Whether or not the system will be equilibrated depends
 *                  on the scaling of the matrix A, but if equilibration is
 *                  used, A is overwritten by diag(R)*A*diag(C) and B by
 *                  diag(R)*B.
 *
 *         o RowPerm (rowperm_t)
 *           Specifies how to permute rows of the matrix A.
 *           = NATURAL:   use the natural ordering.
 *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
 *                        the original matrix to make the diagonal large
 *                        relative to the off-diagonal.
 *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
 *                        input by the user.
 *           
 *         o ColPerm (colperm_t)
 *           Specifies what type of column permutation to use to reduce fill.
 *           = NATURAL:       natural ordering.
 *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
 *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
 *           = COLAMD:        approximate minimum degree column ordering.
 *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
 *         
 *         o ReplaceTinyPivot (yes_no_t)
 *           = NO:  do not modify pivots
 *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
 *                  LU factorization.
 *
 *         o IterRefine (IterRefine_t)
 *           Specifies how to perform iterative refinement.
 *           = NO:     no iterative refinement.
 *           = DOUBLE: accumulate residual in double precision.
 *           = EXTRA:  accumulate residual in extra precision.
 *
 *         NOTE: all options must be indentical on all processes when
 *               calling this routine.
 *
 * A (input/output) SuperMatrix*
 *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
 *         The number of linear equations is A->nrow. The type of A must be:
 *         Stype = NC; Dtype = D; Mtype = GE. That is, A is stored in
 *         compressed column format (also known as Harwell-Boeing format).
 *         See supermatrix.h for the definition of 'SuperMatrix'.
 *         This routine only handles square A, however, the LU factorization
 *         routine pdgstrf can factorize rectangular matrices.
 *         On exit, A may be overwtirren by Pc*Pr*diag(R)*A*diag(C),
 *         depending on ScalePermstruct->DiagScale, options->RowPerm and
 *         options->colpem:
 *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
 *                diag(R)*A*diag(C).
 *             if options->RowPerm != NATURAL, A is further overwritten by
 *                Pr*diag(R)*A*diag(C).
 *             if options->ColPerm != NATURAL, A is further overwritten by
 *                Pc*Pr*diag(R)*A*diag(C).
 *         If all the above condition are true, the LU decomposition is
 *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
 *
 *         NOTE: Currently, A must reside in all processes when calling
 *               this routine.
 *
 * ScalePermstruct (input/output) ScalePermstruct_t*
 *         The data structure to store the scaling and permutation vectors
 *         describing the transformations performed to the matrix A.
 *         It contains the following fields:
 *
 *         o DiagScale (DiagScale_t)
 *           Specifies the form of equilibration that was done.
 *           = NOEQUIL: no equilibration.
 *           = ROW:     row equilibration, i.e., A was premultiplied by
 *                      diag(R).
 *           = COL:     Column equilibration, i.e., A was postmultiplied
 *                      by diag(C).
 *           = BOTH:    both row and column equilibration, i.e., A was 
 *                      replaced by diag(R)*A*diag(C).
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
 *           DiagScale is an input argument; otherwise it is an output
 *           argument.
 *
 *         o perm_r (int*)
 *           Row permutation vector, which defines the permutation matrix Pr;
 *           perm_r[i] = j means row i of A is in position j in Pr*A.
 *           If options->RowPerm = MY_PERMR, or
 *           options->Fact = SamePattern_SameRowPerm, perm_r is an
 *           input argument; otherwise it is an output argument.
 *
 *         o perm_c (int*)
 *           Column permutation vector, which defines the 
 *           permutation matrix Pc; perm_c[i] = j means column i of A is 
 *           in position j in A*Pc.
 *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
 *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
 *           input argument; otherwise, it is an output argument.
 *           On exit, perm_c may be overwritten by the product of the input
 *           perm_c and a permutation that postorders the elimination tree
 *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
 *           is already in postorder.
 *
 *         o R (double*) dimension (A->nrow)
 *           The row scale factors for A.
 *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
 *                          diag(R).
 *           If DiagScale = NOEQUIL or COL, R is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
 *           an input argument; otherwise, R is an output argument.
 *
 *         o C (double*) dimension (A->ncol)
 *           The column scale factors for A.
 *           If DiagScale = COL or BOTH, A is multiplied on the right by 
 *                          diag(C).
 *           If DiagScale = NOEQUIL or ROW, C is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
 *           an input argument; otherwise, C is an output argument.
 *         
 * B       (input/output) double*
 *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
 *         On exit, the solution matrix if info = 0;
 *
 *         NOTE: Currently, B must reside in all processes when calling
 *               this routine.
 *
 * ldb     (input) int (global)
 *         The leading dimension of matrix B.
 *
 * nrhs    (input) int (global)
 *         The number of right-hand sides.
 *         If nrhs = 0, only LU decomposition is performed, the forward
 *         and back substitutions are skipped.
 *
 * grid    (input) gridinfo_t*
 *         The 2D process mesh. It contains the MPI communicator, the number
 *         of process rows (NPROW), the number of process columns (NPCOL),
 *         and my process rank. It is an input argument to all the
 *         parallel routines.
 *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
 *
 * LUstruct (input/output) LUstruct_t*
 *         The data structures to store the distributed L and U factors.
 *         It contains the following fields:
 *
 *         o etree (int*) dimension (A->ncol)
 *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol.
 *           It is computed in sp_colorder() during the first factorization,
 *           and is reused in the subsequent factorizations of the matrices
 *           with the same nonzero pattern.
 *           On exit of sp_colorder(), the columns of A are permuted so that
 *           the etree is in a certain postorder. This postorder is reflected
 *           in ScalePermstruct->perm_c.
 *           NOTE:
 *           Etree is a vector of parent pointers for a forest whose vertices
 *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *
 *         o Glu_persist (Glu_persist_t*)
 *           Global data structure (xsup, supno) replicated on all processes,
 *           describing the supernode partition in the factored matrices
 *           L and U:
 *	       xsup[s] is the leading column of the s-th supernode,
 *             supno[i] is the supernode number to which column i belongs.
 *
 *         o Llu (LocalLU_t*)
 *           The distributed data structures to store L and U factors.
 *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
 *
 * berr    (output) double*, dimension (nrhs)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics on runtime and floating-point operation count.
 *        See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 *         = 0: successful exit
 *         > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                so the solution could not be computed.
 *             > A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol.
 *
 *
 * See superlu_ddefs.h for the definitions of various data types.
 *
 */
    SuperMatrix AC;
    NCformat *Astore;
    NCPformat *ACstore;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    Glu_freeable_t *Glu_freeable;
            /* The nonzero structures of L and U factors, which are
	       replicated on all processrs.
	           (lsub, xlsub) contains the compressed subscript of
		                 supernodes in L.
          	   (usub, xusub) contains the compressed subscript of
		                 nonzero segments in U.
	      If options->Fact != SamePattern_SameRowPerm, they are 
	      computed by SYMBFACT routine, and then used by DDISTRIBUTE
	      routine. They will be freed after DDISTRIBUTE routine.
	      If options->Fact == SamePattern_SameRowPerm, these
	      structures are not used.                                  */
    fact_t   Fact;
    double   *a;
    int_t    *perm_r; /* row permutations from partial pivoting */
    int_t    *perm_c; /* column permutation vector */
    int_t    *etree;  /* elimination tree */
    int_t    *colptr, *rowind;
    int_t    colequ, Equil, factored, job, notran, rowequ;
    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
    int      iam;
    int      ldx;  /* LDA for matrix X (global). */
    char     equed[1], norm[1];
    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
    double   *X, *b_col, *b_work, *x_col;
    double   t;
    static mem_usage_t_Distributed num_mem_usage, symb_mem_usage;
#if ( PRNTlevel>= 2 )
    double   dmin, dsum, dprod;
#endif

    /* Test input parameters. */
    *info = 0;
    Fact = options->Fact;
    if ( Fact < 0 || Fact > FACTORED )
	*info = -1;
    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
	*info = -1;
    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
	*info = -1;
    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
	*info = -1;
    else if ( options->IterRefine == EXTRA ) {
	*info = -1;
	fprintf(stderr, "Extra precise iterative refinement yet to support.");
    } else if ( A->nrow != A->ncol || A->nrow < 0 ||
         A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -2;
    else if ( ldb < A->nrow )
	*info = -5;
    else if ( nrhs < 0 )
	*info = -6;
    if ( *info ) {
	i = -(*info);
	pxerbla("pdgssvx_ABglobal", grid, -*info);
	return;
    }

    /* Initialization */
    factored = (Fact == FACTORED);
    Equil = (!factored && options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    iam = grid->iam;
    job = 5;
    m = A->nrow;
    n = A->ncol;
    Astore = A->Store;
    nnz = Astore->nnz;
    a = Astore->nzval;
    colptr = Astore->colptr;
    rowind = Astore->rowind;
    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
	rowequ = (ScalePermstruct->DiagScale == ROW) ||
	         (ScalePermstruct->DiagScale == BOTH);
	colequ = (ScalePermstruct->DiagScale == COL) ||
	         (ScalePermstruct->DiagScale == BOTH);
    } else rowequ = colequ = FALSE;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgssvx_ABglobal()");
#endif

    perm_r = ScalePermstruct->perm_r;
    perm_c = ScalePermstruct->perm_c;
    etree = LUstruct->etree;
    R = ScalePermstruct->R;
    C = ScalePermstruct->C;
    if ( Equil ) {
	/* Allocate storage if not done so before. */
	switch ( ScalePermstruct->DiagScale ) {
	    case NOEQUIL:
		if ( !(R = (double *) doubleMalloc_dist(m)) )
		    ABORT("Malloc fails for R[].");
	        if ( !(C = (double *) doubleMalloc_dist(n)) )
		    ABORT("Malloc fails for C[].");
		ScalePermstruct->R = R;
		ScalePermstruct->C = C;
		break;
	    case ROW: 
	        if ( !(C = (double *) doubleMalloc_dist(n)) )
		    ABORT("Malloc fails for C[].");
		ScalePermstruct->C = C;
		break;
	    case COL: 
		if ( !(R = (double *) doubleMalloc_dist(m)) )
		    ABORT("Malloc fails for R[].");
		ScalePermstruct->R = R;
		break;
	}
    }

    /* ------------------------------------------------------------
       Diagonal scaling to equilibrate the matrix.
       ------------------------------------------------------------*/
    if ( Equil ) {
#if ( DEBUGlevel>=1 )
	CHECK_MALLOC(iam, "Enter equil");
#endif
	t = SuperLU_timer_();

	if ( Fact == SamePattern_SameRowPerm ) {
	    /* Reuse R and C. */
	    switch ( ScalePermstruct->DiagScale ) {
	      case NOEQUIL:
		break;
	      case ROW:
		for (j = 0; j < n; ++j) {
		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
			irow = rowind[i];
			a[i] *= R[irow];       /* Scale rows. */
		    }
		}
		break;
	      case COL:
		for (j = 0; j < n; ++j)
		    for (i = colptr[j]; i < colptr[j+1]; ++i)
			a[i] *= C[j];          /* Scale columns. */
		break;
	      case BOTH: 
		for (j = 0; j < n; ++j) {
		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
			irow = rowind[i];
			a[i] *= R[irow] * C[j]; /* Scale rows and columns. */
		    }
		}
	        break;
	    }
	} else {
	    if ( !iam ) {
		/* Compute row and column scalings to equilibrate matrix A. */
		dgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo);
	    
		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
		if ( iinfo == 0 ) {
		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
		} else {
		    if ( iinfo > 0 ) {
			if ( iinfo <= m )
			    fprintf(stderr, "The %d-th row of A is exactly zero\n", 
				    iinfo);
			else fprintf(stderr, "The %d-th column of A is exactly zero\n", 
				     iinfo-n);
			exit(-1);
		    }
		}
	    } else {
		MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
		if ( iinfo == 0 ) {
		    MPI_Bcast( R,       m, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( C,       n, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( &amax,   1, MPI_DOUBLE, 0, grid->comm );
		} else {
		    ABORT("DGSEQU failed\n");
		}
	    }
	
	    /* Equilibrate matrix A. */
	    dlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed);
	    if ( lsame_(equed, "R") ) {
		ScalePermstruct->DiagScale = rowequ = ROW;
	    } else if ( lsame_(equed, "C") ) {
		ScalePermstruct->DiagScale = colequ = COL;
	    } else if ( lsame_(equed, "B") ) {
		ScalePermstruct->DiagScale = BOTH;
		rowequ = ROW;
		colequ = COL;
	    } else ScalePermstruct->DiagScale = NOEQUIL;

#if ( PRNTlevel>=1 )
	    if ( !iam ) {
		printf(".. equilibrated? *equed = %c\n", *equed);
		/*fflush(stdout);*/
	    }
#endif
	} /* if Fact ... */

	stat->utime[EQUIL] = SuperLU_timer_() - t;
#if ( DEBUGlevel>=1 )
	CHECK_MALLOC(iam, "Exit equil");
#endif
    } /* if Equil ... */
    
    /* ------------------------------------------------------------
       Permute rows of A. 
       ------------------------------------------------------------*/
    if ( options->RowPerm != NO ) {
	t = SuperLU_timer_();

	if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */
	    || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */
/*	    for (j = 0; j < n; ++j) {
		for (i = colptr[j]; i < colptr[j+1]; ++i) {*/
	    for (i = 0; i < colptr[n]; ++i) {
		    irow = rowind[i]; 
		    rowind[i] = perm_r[irow];
/*		}*/
	    }
	} else if ( !factored ) {
	    if ( job == 5 ) {
		/* Allocate storage for scaling factors. */
		if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) 
		    ABORT("SUPERLU_MALLOC fails for R1[]");
		if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
		    ABORT("SUPERLU_MALLOC fails for C1[]");
	    }

	    if ( !iam ) {
		/* Process 0 finds a row permutation for large diagonal. */
		dldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1);
		
		MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
		if ( job == 5 && Equil ) {
		    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
		}
	    } else {
		MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
		if ( job == 5 && Equil ) {
		    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
		    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
		}
	    }

#if ( PRNTlevel>=2 )
	    dmin = dlamch_("Overflow");
	    dsum = 0.0;
	    dprod = 1.0;
#endif
	    if ( job == 5 ) {
		if ( Equil ) {
		    for (i = 0; i < n; ++i) {
			R1[i] = exp(R1[i]);
			C1[i] = exp(C1[i]);
		    }
		    for (j = 0; j < n; ++j) {
			for (i = colptr[j]; i < colptr[j+1]; ++i) {
			    irow = rowind[i];
			    a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */
			    rowind[i] = perm_r[irow];
#if ( PRNTlevel>=2 )
			    if ( rowind[i] == j ) /* New diagonal */
				dprod *= fabs(a[i]);
#endif
			}
		    }

		    /* Multiply together the scaling factors. */
		    if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
		    else for (i = 0; i < m; ++i) R[i] = R1[i];
		    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
		    else for (i = 0; i < n; ++i) C[i] = C1[i];
		    
		    ScalePermstruct->DiagScale = BOTH;
		    rowequ = colequ = 1;
		} else { /* No equilibration. */
/*		    for (j = 0; j < n; ++j) {
			for (i = colptr[j]; i < colptr[j+1]; ++i) {*/
		    for (i = colptr[0]; i < colptr[n]; ++i) {
			    irow = rowind[i];
			    rowind[i] = perm_r[irow];
			}
/*		    }*/
		}
		SUPERLU_FREE (R1);
		SUPERLU_FREE (C1);
	    } else { /* job = 2,3,4 */
		for (j = 0; j < n; ++j) {
		    for (i = colptr[j]; i < colptr[j+1]; ++i) {
			irow = rowind[i];
			rowind[i] = perm_r[irow];
#if ( PRNTlevel>=2 )
			if ( rowind[i] == j ) { /* New diagonal */
			    if ( job == 2 || job == 3 )
				dmin = SUPERLU_MIN(dmin, fabs(a[i]));
			    else if ( job == 4 )
				dsum += fabs(a[i]);
			    else if ( job == 5 )
				dprod *= fabs(a[i]);
			}
#endif
		    }
		}
	    }

#if ( PRNTlevel>=2 )
	    if ( job == 2 || job == 3 ) {
		if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
	    } else if ( job == 4 ) {
		if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
	    } else if ( job == 5 ) {
		if ( !iam ) printf("\t product of diagonal %e\n", dprod);
	    }
#endif
	    
        } /* else !factored */

	t = SuperLU_timer_() - t;
	stat->utime[ROWPERM] = t;
#if ( PRNTlevel>=1 )
	if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
#endif
    
    } /* if options->RowPerm ... */

    if ( !factored || options->IterRefine ) {
	/* Compute norm(A), which will be used to adjust small diagonal. */
	if ( notran ) *(unsigned char *)norm = '1';
	else *(unsigned char *)norm = 'I';
	anorm = dlangs_dist(norm, A);
#if ( PRNTlevel>=1 )
	if ( !iam ) printf(".. anorm %e\n", anorm);
#endif
    }

    /* ------------------------------------------------------------
       Perform the LU factorization.
       ------------------------------------------------------------*/
    if ( !factored ) {
	t = SuperLU_timer_();
	/*
	 * Get column permutation vector perm_c[], according to permc_spec:
	 *   permc_spec = NATURAL:  natural ordering 
	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
	 */
	permc_spec = options->ColPerm;
	if ( permc_spec != MY_PERMC && Fact == DOFACT )
	    /* Use an ordering provided by SuperLU */
	    get_perm_c_dist(iam, permc_spec, A, perm_c);

	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
	   (a.k.a. column etree), depending on the choice of ColPerm.
	   Adjust perm_c[] to be consistent with a postorder of etree.
	   Permute columns of A to form A*Pc'. */
	sp_colorder(options, A, perm_c, etree, &AC);

	/* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */
	ACstore = AC.Store;
	for (j = 0; j < n; ++j) 
	    for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) {
		irow = ACstore->rowind[i];
		ACstore->rowind[i] = perm_c[irow];
	    }
	stat->utime[COLPERM] = SuperLU_timer_() - t;

	/* Perform a symbolic factorization on matrix A and set up the
	   nonzero data structures which are suitable for supernodal GENP. */
	if ( Fact != SamePattern_SameRowPerm ) {
#if ( PRNTlevel>=1 ) 
	    if ( !iam ) 
		printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
		       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
#endif
	    t = SuperLU_timer_();
	    if ( !(Glu_freeable = (Glu_freeable_t *)
		   SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
		ABORT("Malloc fails for Glu_freeable.");

	    iinfo = symbfact(iam, &AC, perm_c, etree, 
			     Glu_persist, Glu_freeable);

	    stat->utime[SYMBFAC] = SuperLU_timer_() - t;

	    if ( iinfo < 0 ) {
		QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
#if ( PRNTlevel>=1 )
		if ( !iam ) {
		    printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
		    printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
		    printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
		    printf("\tint %d, short %d, float %d, double %d\n", 
			   sizeof(int_t), sizeof(short), sizeof(float),
			   sizeof(double));
		    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
			   symb_mem_usage.for_lu*1e-6, 
			   symb_mem_usage.total*1e-6,
			   symb_mem_usage.expansions);
		}
#endif
	    } else {
		if ( !iam ) {
		    fprintf(stderr, "symbfact() error returns %d\n", iinfo);
		    exit(-1);
		}
	    }
	}

	/* Distribute the L and U factors onto the process grid. */
	t = SuperLU_timer_();
	dist_mem_use = ddistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid);
	stat->utime[DIST] = SuperLU_timer_() - t;

	/* Deallocate storage used in symbolic factor. */
	if ( Fact != SamePattern_SameRowPerm ) {
	    iinfo = symbfact_SubFree(Glu_freeable);
	    SUPERLU_FREE(Glu_freeable);
	}

	/* Perform numerical factorization in parallel. */
	t = SuperLU_timer_();
	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
	stat->utime[FACT] = SuperLU_timer_() - t;

#if ( PRNTlevel>=1 )
	{
	    int_t TinyPivots;
	    float for_lu, total, max, avg, temp;
	    dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    MPI_Reduce( &num_mem_usage.total, &total,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    temp = SUPERLU_MAX(symb_mem_usage.total,
			       symb_mem_usage.for_lu +
			       (float)dist_mem_use + num_mem_usage.for_lu);
	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
	    MPI_Reduce( &temp, &max,
		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
	    MPI_Reduce( &temp, &avg,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
			  MPI_SUM, grid->comm );
	    stat->TinyPivots = TinyPivots;
	    if ( !iam ) {
		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
		       for_lu*1e-6, total*1e-6);
		printf("\tAll space (MB):"
		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
	    }
	}
#endif
    
#if ( PRNTlevel>=2 )
	if ( !iam ) printf(".. pdgstrf INFO = %d\n", *info);
#endif

    } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
	/* Permute columns of A to form A*Pc' using the existing perm_c.
	 * NOTE: rows of A were previously permuted to Pc*A.
	 */
	sp_colorder(options, A, perm_c, NULL, &AC);
    } /* if !factored ... */
	
    /* ------------------------------------------------------------
       Compute the solution matrix X.
       ------------------------------------------------------------*/
    if ( nrhs ) {

	if ( !(b_work = doubleMalloc_dist(n)) )
	    ABORT("Malloc fails for b_work[]");

	/* ------------------------------------------------------------
	   Scale the right-hand side if equilibration was performed. 
	   ------------------------------------------------------------*/
	if ( notran ) {
	    if ( rowequ ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    for (i = 0; i < m; ++i) b_col[i] *= R[i];
		    b_col += ldb;
		}
	    }
	} else if ( colequ ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
		for (i = 0; i < m; ++i) b_col[i] *= C[i];
		b_col += ldb;
	    }
	}

	/* ------------------------------------------------------------
	   Permute the right-hand side to form Pr*B.
	   ------------------------------------------------------------*/
	if ( options->RowPerm != NO ) {
	    if ( notran ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i];
		    for (i = 0; i < m; ++i) b_col[i] = b_work[i];
		    b_col += ldb;
		}
	    }
	}


	/* ------------------------------------------------------------
	   Permute the right-hand side to form Pc*B.
	   ------------------------------------------------------------*/
	if ( notran ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
		for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i];
		for (i = 0; i < m; ++i) b_col[i] = b_work[i];
		b_col += ldb;
	    }
	}


	/* Save a copy of the right-hand side. */
	ldx = ldb;
	if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
	    ABORT("Malloc fails for X[]");
	x_col = X;  b_col = B;
	for (j = 0; j < nrhs; ++j) {
	    for (i = 0; i < ldb; ++i) x_col[i] = b_col[i];
	    x_col += ldx;  b_col += ldb;
	}

	/* ------------------------------------------------------------
	   Solve the linear system.
	   ------------------------------------------------------------*/
	pdgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info);

	/* ------------------------------------------------------------
	   Use iterative refinement to improve the computed solution and
	   compute error bounds and backward error estimates for it.
	   ------------------------------------------------------------*/
	if ( options->IterRefine ) {
	    /* Improve the solution by iterative refinement. */
	    t = SuperLU_timer_();
	    pdgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb,
			      X, ldx, nrhs, berr, stat, info);
	    stat->utime[REFINE] = SuperLU_timer_() - t;
	}

	/* Permute the solution matrix X <= Pc'*X. */
	for (j = 0; j < nrhs; j++) {
	    b_col = &B[j*ldb];
	    x_col = &X[j*ldx];
	    for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]];
	}
	
	/* Transform the solution matrix X to a solution of the original system
	   before the equilibration. */
	if ( notran ) {
	    if ( colequ ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    for (i = 0; i < n; ++i) b_col[i] *= C[i];
		    b_col += ldb;
		}
	    }
	} else if ( rowequ ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
		for (i = 0; i < n; ++i) b_col[i] *= R[i];
		b_col += ldb;
	    }
	}

	SUPERLU_FREE(b_work);
	SUPERLU_FREE(X);

    } /* end if nrhs != 0 */

#if ( PRNTlevel>=1 )
    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
#endif

    /* Deallocate storage. */
    if ( Equil && Fact != SamePattern_SameRowPerm ) {
	switch ( ScalePermstruct->DiagScale ) {
	    case NOEQUIL:
	        SUPERLU_FREE(R);
		SUPERLU_FREE(C);
		break;
	    case ROW: 
		SUPERLU_FREE(C);
		break;
	    case COL: 
		SUPERLU_FREE(R);
		break;
	}
    }
    if ( !factored || (factored && options->IterRefine) )
	Destroy_CompCol_Permuted_dist(&AC);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgssvx_ABglobal()");
#endif
}
Ejemplo n.º 19
0
/*! \brief
 * <pre>
 * Purpose
 * =======
 *    ilu_cdrop_row() - Drop some small rows from the previous 
 *    supernode (L-part only).
 * </pre>
 */
int ilu_cdrop_row(
	superlu_options_t *options, /* options */
	int    first,	    /* index of the first column in the supernode */
	int    last,	    /* index of the last column in the supernode */
	double drop_tol,    /* dropping parameter */
	int    quota,	    /* maximum nonzero entries allowed */
	int    *nnzLj,	    /* in/out number of nonzeros in L(:, 1:last) */
	double *fill_tol,   /* in/out - on exit, fill_tol=-num_zero_pivots,
			     * does not change if options->ILU_MILU != SMILU1 */
	GlobalLU_t *Glu,    /* modified */
	float swork[],   /* working space
	                     * the length of swork[] should be no less than
			     * the number of rows in the supernode */
	float swork2[], /* working space with the same size as swork[],
			     * used only by the second dropping rule */
	int    lastc	    /* if lastc == 0, there is nothing after the
			     * working supernode [first:last];
			     * if lastc == 1, there is one more column after
			     * the working supernode. */ )
{
    register int i, j, k, m1;
    register int nzlc; /* number of nonzeros in column last+1 */
    register int xlusup_first, xlsub_first;
    int m, n; /* m x n is the size of the supernode */
    int r = 0; /* number of dropped rows */
    register float *temp;
    register complex *lusup = (complex *) Glu->lusup;
    register int *lsub = Glu->lsub;
    register int *xlsub = Glu->xlsub;
    register int *xlusup = Glu->xlusup;
    register float d_max = 0.0, d_min = 1.0;
    int    drop_rule = options->ILU_DropRule;
    milu_t milu = options->ILU_MILU;
    norm_t nrm = options->ILU_Norm;
    complex zero = {0.0, 0.0};
    complex one = {1.0, 0.0};
    complex none = {-1.0, 0.0};
    int i_1 = 1;
    int inc_diag; /* inc_diag = m + 1 */
    int nzp = 0;  /* number of zero pivots */
    float alpha = pow((double)(Glu->n), -1.0 / options->ILU_MILU_Dim);

    xlusup_first = xlusup[first];
    xlsub_first = xlsub[first];
    m = xlusup[first + 1] - xlusup_first;
    n = last - first + 1;
    m1 = m - 1;
    inc_diag = m + 1;
    nzlc = lastc ? (xlusup[last + 2] - xlusup[last + 1]) : 0;
    temp = swork - n;

    /* Quick return if nothing to do. */
    if (m == 0 || m == n || drop_rule == NODROP)
    {
	*nnzLj += m * n;
	return 0;
    }

    /* basic dropping: ILU(tau) */
    for (i = n; i <= m1; )
    {
	/* the average abs value of ith row */
	switch (nrm)
	{
	    case ONE_NORM:
		temp[i] = scasum_(&n, &lusup[xlusup_first + i], &m) / (double)n;
		break;
	    case TWO_NORM:
		temp[i] = scnrm2_(&n, &lusup[xlusup_first + i], &m)
		    / sqrt((double)n);
		break;
	    case INF_NORM:
	    default:
		k = icamax_(&n, &lusup[xlusup_first + i], &m) - 1;
		temp[i] = c_abs1(&lusup[xlusup_first + i + m * k]);
		break;
	}

	/* drop small entries due to drop_tol */
	if (drop_rule & DROP_BASIC && temp[i] < drop_tol)
	{
	    r++;
	    /* drop the current row and move the last undropped row here */
	    if (r > 1) /* add to last row */
	    {
		/* accumulate the sum (for MILU) */
		switch (milu)
		{
		    case SMILU_1:
		    case SMILU_2:
			caxpy_(&n, &one, &lusup[xlusup_first + i], &m,
				&lusup[xlusup_first + m - 1], &m);
			break;
		    case SMILU_3:
			for (j = 0; j < n; j++)
			    lusup[xlusup_first + (m - 1) + j * m].r +=
				    c_abs1(&lusup[xlusup_first + i + j * m]);
			break;
		    case SILU:
		    default:
			break;
		}
		ccopy_(&n, &lusup[xlusup_first + m1], &m,
                       &lusup[xlusup_first + i], &m);
	    } /* if (r > 1) */
	    else /* move to last row */
	    {
		cswap_(&n, &lusup[xlusup_first + m1], &m,
			&lusup[xlusup_first + i], &m);
		if (milu == SMILU_3)
		    for (j = 0; j < n; j++) {
			lusup[xlusup_first + m1 + j * m].r =
				c_abs1(&lusup[xlusup_first + m1 + j * m]);
			lusup[xlusup_first + m1 + j * m].i = 0.0;
                    }
	    }
	    lsub[xlsub_first + i] = lsub[xlsub_first + m1];
	    m1--;
	    continue;
	} /* if dropping */
	else
	{
	    if (temp[i] > d_max) d_max = temp[i];
	    if (temp[i] < d_min) d_min = temp[i];
	}
	i++;
    } /* for */

    /* Secondary dropping: drop more rows according to the quota. */
    quota = ceil((double)quota / (double)n);
    if (drop_rule & DROP_SECONDARY && m - r > quota)
    {
	register double tol = d_max;

	/* Calculate the second dropping tolerance */
	if (quota > n)
	{
	    if (drop_rule & DROP_INTERP) /* by interpolation */
	    {
		d_max = 1.0 / d_max; d_min = 1.0 / d_min;
		tol = 1.0 / (d_max + (d_min - d_max) * quota / (m - n - r));
	    }
	    else /* by quick select */
	    {
		int len = m1 - n + 1;
		scopy_(&len, swork, &i_1, swork2, &i_1);
		tol = sqselect(len, swork2, quota - n);
#if 0
		register int *itemp = iwork - n;
		A = temp;
		for (i = n; i <= m1; i++) itemp[i] = i;
		qsort(iwork, m1 - n + 1, sizeof(int), _compare_);
		tol = temp[itemp[quota]];
#endif
	    }
	}

	for (i = n; i <= m1; )
	{
	    if (temp[i] <= tol)
	    {
		register int j;
		r++;
		/* drop the current row and move the last undropped row here */
		if (r > 1) /* add to last row */
		{
		    /* accumulate the sum (for MILU) */
		    switch (milu)
		    {
			case SMILU_1:
			case SMILU_2:
			    caxpy_(&n, &one, &lusup[xlusup_first + i], &m,
				    &lusup[xlusup_first + m - 1], &m);
			    break;
			case SMILU_3:
			    for (j = 0; j < n; j++)
				lusup[xlusup_first + (m - 1) + j * m].r +=
   				  c_abs1(&lusup[xlusup_first + i + j * m]);
			    break;
			case SILU:
			default:
			    break;
		    }
		    ccopy_(&n, &lusup[xlusup_first + m1], &m,
			    &lusup[xlusup_first + i], &m);
		} /* if (r > 1) */
		else /* move to last row */
		{
		    cswap_(&n, &lusup[xlusup_first + m1], &m,
			    &lusup[xlusup_first + i], &m);
		    if (milu == SMILU_3)
			for (j = 0; j < n; j++) {
			    lusup[xlusup_first + m1 + j * m].r =
				    c_abs1(&lusup[xlusup_first + m1 + j * m]);
			    lusup[xlusup_first + m1 + j * m].i = 0.0;
                        }
		}
		lsub[xlsub_first + i] = lsub[xlsub_first + m1];
		m1--;
		temp[i] = temp[m1];

		continue;
	    }
	    i++;

	} /* for */

    } /* if secondary dropping */

    for (i = n; i < m; i++) temp[i] = 0.0;

    if (r == 0)
    {
	*nnzLj += m * n;
	return 0;
    }

    /* add dropped entries to the diagnal */
    if (milu != SILU)
    {
	register int j;
	complex t;
	float omega;
	for (j = 0; j < n; j++)
	{
	    t = lusup[xlusup_first + (m - 1) + j * m];
            if (t.r == 0.0 && t.i == 0.0) continue;
            omega = SUPERLU_MIN(2.0 * (1.0 - alpha) / c_abs1(&t), 1.0);
	    cs_mult(&t, &t, omega);

 	    switch (milu)
	    {
		case SMILU_1:
		    if ( !(c_eq(&t, &none)) ) {
                        c_add(&t, &t, &one);
                        cc_mult(&lusup[xlusup_first + j * inc_diag],
			                  &lusup[xlusup_first + j * inc_diag],
                                          &t);
                    }
		    else
		    {
                        cs_mult(
                                &lusup[xlusup_first + j * inc_diag],
			        &lusup[xlusup_first + j * inc_diag],
                                *fill_tol);
#ifdef DEBUG
			printf("[1] ZERO PIVOT: FILL col %d.\n", first + j);
			fflush(stdout);
#endif
			nzp++;
		    }
		    break;
		case SMILU_2:
                    cs_mult(&lusup[xlusup_first + j * inc_diag],
                                          &lusup[xlusup_first + j * inc_diag],
                                          1.0 + c_abs1(&t));
		    break;
		case SMILU_3:
                    c_add(&t, &t, &one);
                    cc_mult(&lusup[xlusup_first + j * inc_diag],
	                              &lusup[xlusup_first + j * inc_diag],
                                      &t);
		    break;
		case SILU:
		default:
		    break;
	    }
	}
	if (nzp > 0) *fill_tol = -nzp;
    }

    /* Remove dropped entries from the memory and fix the pointers. */
    m1 = m - r;
    for (j = 1; j < n; j++)
    {
	register int tmp1, tmp2;
	tmp1 = xlusup_first + j * m1;
	tmp2 = xlusup_first + j * m;
	for (i = 0; i < m1; i++)
	    lusup[i + tmp1] = lusup[i + tmp2];
    }
    for (i = 0; i < nzlc; i++)
	lusup[xlusup_first + i + n * m1] = lusup[xlusup_first + i + n * m];
    for (i = 0; i < nzlc; i++)
	lsub[xlsub[last + 1] - r + i] = lsub[xlsub[last + 1] + i];
    for (i = first + 1; i <= last + 1; i++)
    {
	xlusup[i] -= r * (i - first);
	xlsub[i] -= r;
    }
    if (lastc)
    {
	xlusup[last + 2] -= r * n;
	xlsub[last + 2] -= r;
    }

    *nnzLj += (m - r) * n;
    return r;
}
Ejemplo n.º 20
0
void
dgssvx(char *fact, char *trans, char *refact,
       SuperMatrix *A, factor_param_t *factor_params, int *perm_c,
       int *perm_r, int *etree, char *equed, double *R, double *C,
       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
       SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, 
       double *rcond, double *ferr, double *berr, 
       mem_usage_t *mem_usage, int *info )
{
/*
 * Purpose
 * =======
 *
 * DGSSVX solves the system of linear equations A*X=B or A'*X=B, using
 * the LU factorization from dgstrf(). Error bounds on the solution and
 * a condition estimate are also provided. It performs the following steps:
 *
 *   1. If A is stored column-wise (A->Stype = NC):
 *  
 *      1.1. If fact = 'E', scaling factors are computed to equilibrate the
 *           system:
 *             trans = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
 *             trans = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
 *             trans = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
 *           Whether or not the system will be equilibrated depends on the
 *           scaling of the matrix A, but if equilibration is used, A is
 *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if trans='N')
 *           or diag(C)*B (if trans = 'T' or 'C').
 *
 *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
 *           matrix that usually preserves sparsity.
 *           For more details of this step, see sp_preorder.c.
 *
 *      1.3. If fact = 'N' or 'E', the LU decomposition is used to factor the
 *           matrix A (after equilibration if fact = 'E') as Pr*A*Pc = L*U,
 *           with Pr determined by partial pivoting.
 *
 *      1.4. Compute the reciprocal pivot growth factor.
 *
 *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
 *           routine returns with info = i. Otherwise, the factored form of 
 *           A is used to estimate the condition number of the matrix A. If
 *           the reciprocal of the condition number is less than machine
 *           precision, info = A->ncol+1 is returned as a warning, but the
 *           routine still goes on to solve for X and computes error bounds
 *           as described below.
 *
 *      1.6. The system of equations is solved for X using the factored form
 *           of A.
 *
 *      1.7. Iterative refinement is applied to improve the computed solution
 *           matrix and calculate error bounds and backward error estimates
 *           for it.
 *
 *      1.8. If equilibration was used, the matrix X is premultiplied by
 *           diag(C) (if trans = 'N') or diag(R) (if trans = 'T' or 'C') so
 *           that it solves the original system before equilibration.
 *
 *   2. If A is stored row-wise (A->Stype = NR), apply the above algorithm
 *      to the transpose of A:
 *
 *      2.1. If fact = 'E', scaling factors are computed to equilibrate the
 *           system:
 *             trans = 'N':  diag(R)*A'*diag(C)     *inv(diag(C))*X = diag(R)*B
 *             trans = 'T': (diag(R)*A'*diag(C))**T *inv(diag(R))*X = diag(C)*B
 *             trans = 'C': (diag(R)*A'*diag(C))**H *inv(diag(R))*X = diag(C)*B
 *           Whether or not the system will be equilibrated depends on the
 *           scaling of the matrix A, but if equilibration is used, A' is
 *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
 *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
 *
 *      2.2. Permute columns of transpose(A) (rows of A), 
 *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
 *           usually preserves sparsity.
 *           For more details of this step, see sp_preorder.c.
 *
 *      2.3. If fact = 'N' or 'E', the LU decomposition is used to factor the
 *           transpose(A) (after equilibration if fact = 'E') as 
 *           Pr*transpose(A)*Pc = L*U with the permutation Pr determined by
 *           partial pivoting.
 *
 *      2.4. Compute the reciprocal pivot growth factor.
 *
 *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
 *           routine returns with info = i. Otherwise, the factored form 
 *           of transpose(A) is used to estimate the condition number of the
 *           matrix A. If the reciprocal of the condition number
 *           is less than machine precision, info = A->nrow+1 is returned as
 *           a warning, but the routine still goes on to solve for X and
 *           computes error bounds as described below.
 *
 *      2.6. The system of equations is solved for X using the factored form
 *           of transpose(A).
 *
 *      2.7. Iterative refinement is applied to improve the computed solution
 *           matrix and calculate error bounds and backward error estimates
 *           for it.
 *
 *      2.8. If equilibration was used, the matrix X is premultiplied by
 *           diag(C) (if trans = 'N') or diag(R) (if trans = 'T' or 'C') so
 *           that it solves the original system before equilibration.
 *
 *   See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * fact    (input) char*
 *         Specifies whether or not the factored form of the matrix
 *         A is supplied on entry, and if not, whether the matrix A should
 *         be equilibrated before it is factored.
 *         = 'F': On entry, L, U, perm_r and perm_c contain the factored
 *                form of A. If equed is not 'N', the matrix A has been
 *                equilibrated with scaling factors R and C.
 *                A, L, U, perm_r are not modified.
 *         = 'N': The matrix A will be factored, and the factors will be
 *                stored in L and U.
 *         = 'E': The matrix A will be equilibrated if necessary, then
 *                factored into L and U.
 *
 * trans   (input) char*
 *         Specifies the form of the system of equations:
 *         = 'N': A * X = B        (No transpose)
 *         = 'T': A**T * X = B     (Transpose)
 *         = 'C': A**H * X = B     (Transpose)
 *
 * refact  (input) char*
 *         Specifies whether we want to re-factor the matrix.
 *         = 'N': Factor the matrix A.
 *         = 'Y': Matrix A was factored before, now we want to re-factor
 *                matrix A with perm_r and etree as inputs. Use
 *                the same storage for the L\U factors previously allocated,
 *                expand it if necessary. User should insure to use the same
 *                memory model.  In this case, perm_r may be modified due to
 *                different pivoting determined by diagonal threshold.
 *         If fact = 'F', then refact is not accessed.
 *
 * A       (input/output) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
 *         of the linear equations is A->nrow. Currently, the type of A can be:
 *         Stype = NC or NR, Dtype = D_, Mtype = GE. In the future,
 *         more general A can be handled.
 *
 *         On entry, If fact = 'F' and equed is not 'N', then A must have
 *         been equilibrated by the scaling factors in R and/or C.  
 *         A is not modified if fact = 'F' or 'N', or if fact = 'E' and 
 *         equed = 'N' on exit.
 *
 *         On exit, if fact = 'E' and equed is not 'N', A is scaled as follows:
 *         If A->Stype = NC:
 *           equed = 'R':  A := diag(R) * A
 *           equed = 'C':  A := A * diag(C)
 *           equed = 'B':  A := diag(R) * A * diag(C).
 *         If A->Stype = NR:
 *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
 *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
 *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
 *
 * factor_params (input) factor_param_t*
 *         The structure defines the input scalar parameters, consisting of
 *         the following fields. If factor_params = NULL, the default
 *         values are used for all the fields; otherwise, the values
 *         are given by the user.
 *         - panel_size (int): Panel size. A panel consists of at most
 *             panel_size consecutive columns. If panel_size = -1, use 
 *             default value 8.
 *         - relax (int): To control degree of relaxing supernodes. If the
 *             number of nodes (columns) in a subtree of the elimination
 *             tree is less than relax, this subtree is considered as one
 *             supernode, regardless of the row structures of those columns.
 *             If relax = -1, use default value 8.
 *         - diag_pivot_thresh (double): Diagonal pivoting threshold.
 *             At step j of the Gaussian elimination, if
 *                 abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)),
 *             then use A_jj as pivot. 0 <= diag_pivot_thresh <= 1.
 *             If diag_pivot_thresh = -1, use default value 1.0,
 *             which corresponds to standard partial pivoting.
 *         - drop_tol (double): Drop tolerance threshold. (NOT IMPLEMENTED)
 *             At step j of the Gaussian elimination, if
 *                 abs(A_ij)/(max_i abs(A_ij)) < drop_tol,
 *             then drop entry A_ij. 0 <= drop_tol <= 1.
 *             If drop_tol = -1, use default value 0.0, which corresponds to
 *             standard Gaussian elimination.
 *
 * perm_c  (input/output) int*
 *	   If A->Stype = NC, Column permutation vector of size A->ncol,
 *         which defines the permutation matrix Pc; perm_c[i] = j means
 *         column i of A is in position j in A*Pc.
 *         On exit, perm_c may be overwritten by the product of the input
 *         perm_c and a permutation that postorders the elimination tree
 *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
 *         is already in postorder.
 *
 *         If A->Stype = NR, column permutation vector of size A->nrow,
 *         which describes permutation of columns of transpose(A) 
 *         (rows of A) as described above.
 * 
 * perm_r  (input/output) int*
 *         If A->Stype = NC, row permutation vector of size A->nrow, 
 *         which defines the permutation matrix Pr, and is determined
 *         by partial pivoting.  perm_r[i] = j means row i of A is in 
 *         position j in Pr*A.
 *
 *         If A->Stype = NR, permutation vector of size A->ncol, which
 *         determines permutation of rows of transpose(A)
 *         (columns of A) as described above.
 *
 *         If refact is not 'Y', perm_r is output argument;
 *         If refact = 'Y', the pivoting routine will try to use the input
 *         perm_r, unless a certain threshold criterion is violated.
 *         In that case, perm_r is overwritten by a new permutation
 *         determined by partial pivoting or diagonal threshold pivoting.
 * 
 * etree   (input/output) int*,  dimension (A->ncol)
 *         Elimination tree of Pc'*A'*A*Pc.
 *         If fact is not 'F' and refact = 'Y', etree is an input argument,
 *         otherwise it is an output argument.
 *         Note: etree is a vector of parent pointers for a forest whose
 *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *
 * equed   (input/output) char*
 *         Specifies the form of equilibration that was done.
 *         = 'N': No equilibration.
 *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
 *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
 *         = 'B': Both row and column equilibration, i.e., A was replaced 
 *                by diag(R)*A*diag(C).
 *         If fact = 'F', equed is an input argument, otherwise it is
 *         an output argument.
 *
 * R       (input/output) double*, dimension (A->nrow)
 *         The row scale factors for A or transpose(A).
 *         If equed = 'R' or 'B', A (if A->Stype = NC) or transpose(A) (if
 *             A->Stype = NR) is multiplied on the left by diag(R).
 *         If equed = 'N' or 'C', R is not accessed.
 *         If fact = 'F', R is an input argument; otherwise, R is output.
 *         If fact = 'F' and equed = 'R' or 'B', each element of R must
 *            be positive.
 * 
 * C       (input/output) double*, dimension (A->ncol)
 *         The column scale factors for A or transpose(A).
 *         If equed = 'C' or 'B', A (if A->Stype = NC) or transpose(A) (if 
 *             A->Stype = NR) is multiplied on the right by diag(C).
 *         If equed = 'N' or 'R', C is not accessed.
 *         If fact = 'F', C is an input argument; otherwise, C is output.
 *         If fact = 'F' and equed = 'C' or 'B', each element of C must
 *            be positive.
 *         
 * L       (output) SuperMatrix*
 *	   The factor L from the factorization
 *             Pr*A*Pc=L*U              (if A->Stype = NC) or
 *             Pr*transpose(A)*Pc=L*U   (if A->Stype = NR).
 *         Uses compressed row subscripts storage for supernodes, i.e.,
 *         L has types: Stype = SC, Dtype = D_, Mtype = TRLU.
 *
 * U       (output) SuperMatrix*
 *	   The factor U from the factorization
 *             Pr*A*Pc=L*U              (if A->Stype = NC) or
 *             Pr*transpose(A)*Pc=L*U   (if A->Stype = NR).
 *         Uses column-wise storage scheme, i.e., U has types:
 *         Stype = NC, Dtype = D_, Mtype = TRU.
 *
 * work    (workspace/output) void*, size (lwork) (in bytes)
 *         User supplied workspace, should be large enough
 *         to hold data structures for factors L and U.
 *         On exit, if fact is not 'F', L and U point to this array.
 *
 * lwork   (input) int
 *         Specifies the size of work array in bytes.
 *         = 0:  allocate space internally by system malloc;
 *         > 0:  use user-supplied work array of length lwork in bytes,
 *               returns error if space runs out.
 *         = -1: the routine guesses the amount of space needed without
 *               performing the factorization, and returns it in
 *               mem_usage->total_needed; no other side effects.
 *
 *         See argument 'mem_usage' for memory usage statistics.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = DN, Dtype = D_, Mtype = GE.
 *         On entry, the right hand side matrix.
 *         On exit,
 *            if equed = 'N', B is not modified; otherwise
 *            if A->Stype = NC:
 *               if trans = 'N' and equed = 'R' or 'B', B is overwritten by
 *                  diag(R)*B;
 *               if trans = 'T' or 'C' and equed = 'C' of 'B', B is
 *                  overwritten by diag(C)*B;
 *            if A->Stype = NR:
 *               if trans = 'N' and equed = 'C' or 'B', B is overwritten by
 *                  diag(C)*B;
 *               if trans = 'T' or 'C' and equed = 'R' of 'B', B is
 *                  overwritten by diag(R)*B.
 *
 * X       (output) SuperMatrix*
 *         X has types: Stype = DN, Dtype = D_, Mtype = GE. 
 *         If info = 0 or info = A->ncol+1, X contains the solution matrix
 *         to the original system of equations. Note that A and B are modified
 *         on exit if equed is not 'N', and the solution to the equilibrated
 *         system is inv(diag(C))*X if trans = 'N' and equed = 'C' or 'B',
 *         or inv(diag(R))*X if trans = 'T' or 'C' and equed = 'R' or 'B'.
 *
 * recip_pivot_growth (output) double*
 *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
 *         The infinity norm is used. If recip_pivot_growth is much less
 *         than 1, the stability of the LU factorization could be poor.
 *
 * rcond   (output) double*
 *         The estimate of the reciprocal condition number of the matrix A
 *         after equilibration (if done). If rcond is less than the machine
 *         precision (in particular, if rcond = 0), the matrix is singular
 *         to working precision. This condition is indicated by a return
 *         code of info > 0.
 *
 * FERR    (output) double*, dimension (B->ncol)   
 *         The estimated forward error bound for each solution vector   
 *         X(j) (the j-th column of the solution matrix X).   
 *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
 *         is an estimated upper bound for the magnitude of the largest 
 *         element in (X(j) - XTRUE) divided by the magnitude of the   
 *         largest element in X(j).  The estimate is as reliable as   
 *         the estimate for RCOND, and is almost always a slight   
 *         overestimate of the true error.
 *
 * BERR    (output) double*, dimension (B->ncol)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *
 * mem_usage (output) mem_usage_t*
 *         Record the memory usage statistics, consisting of following fields:
 *         - for_lu (float)
 *           The amount of space used in bytes for L\U data structures.
 *         - total_needed (float)
 *           The amount of space needed in bytes to perform factorization.
 *         - expansions (int)
 *           The number of memory expansions during the LU factorization.
 *
 * info    (output) int*
 *         = 0: successful exit   
 *         < 0: if info = -i, the i-th argument had an illegal value   
 *         > 0: if info = i, and i is   
 *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
 *                    been completed, but the factor U is exactly   
 *                    singular, so the solution and error bounds   
 *                    could not be computed.   
 *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
 *                    precision, meaning that the matrix is singular to
 *                    working precision. Nevertheless, the solution and
 *                    error bounds are computed because there are a number
 *                    of situations where the computed solution can be more
 *                    accurate than the value of RCOND would suggest.   
 *              > A->ncol+1: number of bytes allocated when memory allocation
 *                    failure occurred, plus A->ncol.
 *
 */

    DNformat  *Bstore, *Xstore;
    double    *Bmat, *Xmat;
    int       ldb, ldx, nrhs;
    SuperMatrix *AA; /* A in NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int       colequ, equil, nofact, notran, rowequ;
    char      trant[1], norm[1];
    int       i, j, info1;
    double    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
    int       relax, panel_size;
    double    diag_pivot_thresh, drop_tol;
    double    t0;      /* temporary time */
    double    *utime;
    extern SuperLUStat_t SuperLUStat;

    /* External functions */
    extern double dlangs(char *, SuperMatrix *);
    extern double dlamch_(char *);

    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;

#if 0
printf("dgssvx: fact=%c, trans=%c, refact=%c, equed=%c\n",
       *fact, *trans, *refact, *equed);
#endif
    
    *info = 0;
    nofact = lsame_(fact, "N");
    equil = lsame_(fact, "E");
    notran = lsame_(trans, "N");
    if (nofact || equil) {
	*(unsigned char *)equed = 'N';
	rowequ = FALSE;
	colequ = FALSE;
    } else {
	rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	colequ = lsame_(equed, "C") || lsame_(equed, "B");
	smlnum = dlamch_("Safe minimum");
	bignum = 1. / smlnum;
    }

    /* Test the input parameters */
    if (!nofact && !equil && !lsame_(fact, "F")) *info = -1;
    else if (!notran && !lsame_(trans, "T") && !lsame_(trans, "C")) *info = -2;
    else if ( !(lsame_(refact,"Y") || lsame_(refact, "N")) ) *info = -3;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      (A->Stype != NC && A->Stype != NR) ||
	      A->Dtype != D_ || A->Mtype != GE )
	*info = -4;
    else if (lsame_(fact, "F") && !(rowequ || colequ || lsame_(equed, "N")))
	*info = -9;
    else {
	if (rowequ) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, R[j]);
		rcmax = SUPERLU_MAX(rcmax, R[j]);
	    }
	    if (rcmin <= 0.) *info = -10;
	    else if ( A->nrow > 0)
		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else rowcnd = 1.;
	}
	if (colequ && *info == 0) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, C[j]);
		rcmax = SUPERLU_MAX(rcmax, C[j]);
	    }
	    if (rcmin <= 0.) *info = -11;
	    else if (A->nrow > 0)
		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else colcnd = 1.;
	}
	if (*info == 0) {
	    if ( lwork < -1 ) *info = -15;
	    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      B->Stype != DN || B->Dtype != D_ || 
		      B->Mtype != GE )
		*info = -16;
	    else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      B->ncol != X->ncol || X->Stype != DN ||
		      X->Dtype != D_ || X->Mtype != GE )
		*info = -17;
	}
    }
    if (*info != 0) {
	i = -(*info);
	xerbla_("dgssvx", &i);
	return;
    }
    
    /* Default values for factor_params */
    panel_size = sp_ienv(1);
    relax      = sp_ienv(2);
    diag_pivot_thresh = 1.0;
    drop_tol   = 0.0;
    if ( factor_params != NULL ) {
	if ( factor_params->panel_size != -1 )
	    panel_size = factor_params->panel_size;
	if ( factor_params->relax != -1 ) relax = factor_params->relax;
	if ( factor_params->diag_pivot_thresh != -1 )
	    diag_pivot_thresh = factor_params->diag_pivot_thresh;
	if ( factor_params->drop_tol != -1 )
	    drop_tol = factor_params->drop_tol;
    }

    StatInit(panel_size, relax);
    utime = SuperLUStat.utime;
    
    /* Convert A to NC format when necessary. */
    if ( A->Stype == NR ) {
	NRformat *Astore = A->Store;
	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
	dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
			       Astore->nzval, Astore->colind, Astore->rowptr,
			       NC, A->Dtype, A->Mtype);
	if ( notran ) { /* Reverse the transpose argument. */
	    *trant = 'T';
	    notran = 0;
	} else {
	    *trant = 'N';
	    notran = 1;
	}
    } else { /* A->Stype == NC */
	*trant = *trans;
	AA = A;
    }

    if ( equil ) {
	t0 = SuperLU_timer_();
	/* Compute row and column scalings to equilibrate the matrix A. */
	dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
	
	if ( info1 == 0 ) {
	    /* Equilibrate matrix A. */
	    dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
	    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	    colequ = lsame_(equed, "C") || lsame_(equed, "B");
	}
	utime[EQUIL] = SuperLU_timer_() - t0;
    }

    /* Scale the right hand side if equilibration was performed. */
    if ( notran ) {
	if ( rowequ ) {
	    for (j = 0; j < nrhs; ++j)
		for (i = 0; i < A->nrow; ++i) {
		  Bmat[i + j*ldb] *= R[i];
	        }
	}
    } else if ( colequ ) {
	for (j = 0; j < nrhs; ++j)
	    for (i = 0; i < A->nrow; ++i) {
	      Bmat[i + j*ldb] *= C[i];
	    }
    }

    if ( nofact || equil ) {
	
	t0 = SuperLU_timer_();
	sp_preorder(refact, AA, perm_c, etree, &AC);
	utime[ETREE] = SuperLU_timer_() - t0;
    
/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
	       relax, panel_size, sp_ienv(3), sp_ienv(4));
	fflush(stdout); */
	
	/* Compute the LU factorization of A*Pc. */
	t0 = SuperLU_timer_();
	dgstrf(refact, &AC, diag_pivot_thresh, drop_tol, relax, panel_size,
	       etree, work, lwork, perm_r, perm_c, L, U, info);
	utime[FACT] = SuperLU_timer_() - t0;
	
	if ( lwork == -1 ) {
	    mem_usage->total_needed = *info - A->ncol;
	    return;
	}
    }

    if ( *info > 0 ) {
	if ( *info <= A->ncol ) {
	    /* Compute the reciprocal pivot growth factor of the leading
	       rank-deficient *info columns of A. */
	    *recip_pivot_growth = dPivotGrowth(*info, AA, perm_c, L, U);
	}
	return;
    }

    /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
    *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U);

    /* Estimate the reciprocal of the condition number of A. */
    t0 = SuperLU_timer_();
    if ( notran ) {
	*(unsigned char *)norm = '1';
    } else {
	*(unsigned char *)norm = 'I';
    }
    anorm = dlangs(norm, AA);
    dgscon(norm, L, U, anorm, rcond, info);
    utime[RCOND] = SuperLU_timer_() - t0;
    
    /* Compute the solution matrix X. */
    for (j = 0; j < nrhs; j++)    /* Save a copy of the right hand sides */
	for (i = 0; i < B->nrow; i++)
	    Xmat[i + j*ldx] = Bmat[i + j*ldb];
    
    t0 = SuperLU_timer_();
    dgstrs (trant, L, U, perm_r, perm_c, X, info);
    utime[SOLVE] = SuperLU_timer_() - t0;
    
    /* Use iterative refinement to improve the computed solution and compute
       error bounds and backward error estimates for it. */
    t0 = SuperLU_timer_();
    dgsrfs(trant, AA, L, U, perm_r, perm_c, equed, R, C, B,
	      X, ferr, berr, info);
    utime[REFINE] = SuperLU_timer_() - t0;

    /* Transform the solution matrix X to a solution of the original system. */
    if ( notran ) {
	if ( colequ ) {
	    for (j = 0; j < nrhs; ++j)
		for (i = 0; i < A->nrow; ++i) {
                  Xmat[i + j*ldx] *= C[i];
	        }
	}
    } else if ( rowequ ) {
	for (j = 0; j < nrhs; ++j)
	    for (i = 0; i < A->nrow; ++i) {
	      Xmat[i + j*ldx] *= R[i];
            }
    }

    /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
    if ( *rcond < dlamch_("E") ) *info = A->ncol + 1;

    dQuerySpace(L, U, panel_size, mem_usage);

    if ( nofact || equil ) Destroy_CompCol_Permuted(&AC);
    if ( A->Stype == NR ) {
	Destroy_SuperMatrix_Store(AA);
	SUPERLU_FREE(AA);
    }

/*     PrintStat( &SuperLUStat ); */
    StatFree();
}
Ejemplo n.º 21
0
void
cgsitrf(superlu_options_t *options, SuperMatrix *A, int relax, int panel_size,
        int *etree, void *work, int lwork, int *perm_c, int *perm_r,
        SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info)
{
    /* Local working arrays */
    NCPformat *Astore;
    int       *iperm_r = NULL; /* inverse of perm_r; used when
                                  options->Fact == SamePattern_SameRowPerm */
    int       *iperm_c; /* inverse of perm_c */
    int       *swap, *iswap; /* swap is used to store the row permutation
                                during the factorization. Initially, it is set
                                to iperm_c (row indeces of Pc*A*Pc').
                                iswap is the inverse of swap. After the
                                factorization, it is equal to perm_r. */
    int       *iwork;
    complex   *cwork;
    int       *segrep, *repfnz, *parent, *xplore;
    int       *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
    int       *marker, *marker_relax;
    complex    *dense, *tempv;
    float *stempv;
    int       *relax_end, *relax_fsupc;
    complex    *a;
    int       *asub;
    int       *xa_begin, *xa_end;
    int       *xsup, *supno;
    int       *xlsub, *xlusup, *xusub;
    int       nzlumax;
    float    *amax;
    complex    drop_sum;
    float alpha, omega;  /* used in MILU, mimicing DRIC */
    static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */
    float    *swork2;      /* used by the second dropping rule */

    /* Local scalars */
    fact_t    fact = options->Fact;
    double    diag_pivot_thresh = options->DiagPivotThresh;
    double    drop_tol = options->ILU_DropTol; /* tau */
    double    fill_ini = options->ILU_FillTol; /* tau^hat */
    double    gamma = options->ILU_FillFactor;
    int       drop_rule = options->ILU_DropRule;
    milu_t    milu = options->ILU_MILU;
    double    fill_tol;
    int       pivrow;   /* pivotal row number in the original matrix A */
    int       nseg1;    /* no of segments in U-column above panel row jcol */
    int       nseg;     /* no of segments in each U-column */
    register int jcol;
    register int kcol;  /* end column of a relaxed snode */
    register int icol;
    register int i, k, jj, new_next, iinfo;
    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
    int       w_def;    /* upper bound on panel width */
    int       usepr, iperm_r_allocated = 0;
    int       nnzL, nnzU;
    int       *panel_histo = stat->panel_histo;
    flops_t   *ops = stat->ops;

    int       last_drop;/* the last column which the dropping rules applied */
    int       quota;
    int       nnzAj;    /* number of nonzeros in A(:,1:j) */
    int       nnzLj, nnzUj;
    double    tol_L = drop_tol, tol_U = drop_tol;
    complex zero = {0.0, 0.0};
    float one = 1.0;

    /* Executable */
    iinfo    = 0;
    m        = A->nrow;
    n        = A->ncol;
    min_mn   = SUPERLU_MIN(m, n);
    Astore   = A->Store;
    a        = Astore->nzval;
    asub     = Astore->rowind;
    xa_begin = Astore->colbeg;
    xa_end   = Astore->colend;

    /* Allocate storage common to the factor routines */
    *info = cLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size,
                       gamma, L, U, &Glu, &iwork, &cwork);
    if ( *info ) return;

    xsup    = Glu.xsup;
    supno   = Glu.supno;
    xlsub   = Glu.xlsub;
    xlusup  = Glu.xlusup;
    xusub   = Glu.xusub;

    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
             &repfnz, &panel_lsub, &marker_relax, &marker);
    cSetRWork(m, panel_size, cwork, &dense, &tempv);

    usepr = (fact == SamePattern_SameRowPerm);
    if ( usepr ) {
        /* Compute the inverse of perm_r */
        iperm_r = (int *) intMalloc(m);
        for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
        iperm_r_allocated = 1;
    }

    iperm_c = (int *) intMalloc(n);
    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;
    swap = (int *)intMalloc(n);
    for (k = 0; k < n; k++) swap[k] = iperm_c[k];
    iswap = (int *)intMalloc(n);
    for (k = 0; k < n; k++) iswap[k] = perm_c[k];
    amax = (float *) floatMalloc(panel_size);
    if (drop_rule & DROP_SECONDARY)
        swork2 = (float *)floatMalloc(n);
    else
        swork2 = NULL;

    nnzAj = 0;
    nnzLj = 0;
    nnzUj = 0;
    last_drop = SUPERLU_MAX(min_mn - 2 * sp_ienv(7), (int)(min_mn * 0.95));
    alpha = pow((double)n, -1.0 / options->ILU_MILU_Dim);

    /* Identify relaxed snodes */
    relax_end = (int *) intMalloc(n);
    relax_fsupc = (int *) intMalloc(n);
    if ( options->SymmetricMode == YES )
        ilu_heap_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc);
    else
        ilu_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc);

    ifill (perm_r, m, EMPTY);
    ifill (marker, m * NO_MARKER, EMPTY);
    supno[0] = -1;
    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
    w_def    = panel_size;

    /* Mark the rows used by relaxed supernodes */
    ifill (marker_relax, m, EMPTY);
    i = mark_relax(m, relax_end, relax_fsupc, xa_begin, xa_end,
                 asub, marker_relax);
#if ( PRNTlevel >= 1)
    printf("%d relaxed supernodes.\n", i);
#endif

    /*
     * Work on one "panel" at a time. A panel is one of the following:
     *     (a) a relaxed supernode at the bottom of the etree, or
     *     (b) panel_size contiguous columns, defined by the user
     */
    for (jcol = 0; jcol < min_mn; ) {

        if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
            kcol = relax_end[jcol];       /* end of the relaxed snode */
            panel_histo[kcol-jcol+1]++;

            /* Drop small rows in the previous supernode. */
            if (jcol > 0 && jcol < last_drop) {
                int first = xsup[supno[jcol - 1]];
                int last = jcol - 1;
                int quota;

                /* Compute the quota */
                if (drop_rule & DROP_PROWS)
                    quota = gamma * Astore->nnz / m * (m - first) / m
                            * (last - first + 1);
                else if (drop_rule & DROP_COLUMN) {
                    int i;
                    quota = 0;
                    for (i = first; i <= last; i++)
                        quota += xa_end[i] - xa_begin[i];
                    quota = gamma * quota * (m - first) / m;
                } else if (drop_rule & DROP_AREA)
                    quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m)
                            - nnzLj;
                else
                    quota = m * n;
                fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / min_mn);

                /* Drop small rows */
                stempv = (float *) tempv;
                i = ilu_cdrop_row(options, first, last, tol_L, quota, &nnzLj,
                                  &fill_tol, &Glu, stempv, swork2, 0);
                /* Reset the parameters */
                if (drop_rule & DROP_DYNAMIC) {
                    if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m)
                             < nnzLj)
                        tol_L = SUPERLU_MIN(1.0, tol_L * 2.0);
                    else
                        tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5);
                }
                if (fill_tol < 0) iinfo -= (int)fill_tol;
#ifdef DEBUG
                num_drop_L += i * (last - first + 1);
#endif
            }

            /* --------------------------------------
             * Factorize the relaxed supernode(jcol:kcol)
             * -------------------------------------- */
            /* Determine the union of the row structure of the snode */
            if ( (*info = ilu_csnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
                                         marker, &Glu)) != 0 )
                return;

            nextu    = xusub[jcol];
            nextlu   = xlusup[jcol];
            jsupno   = supno[jcol];
            fsupc    = xsup[jsupno];
            new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
            nzlumax = Glu.nzlumax;
            while ( new_next > nzlumax ) {
                if ((*info = cLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)))
                    return;
            }

            for (icol = jcol; icol <= kcol; icol++) {
                xusub[icol+1] = nextu;

                amax[0] = 0.0;
                /* Scatter into SPA dense[*] */
                for (k = xa_begin[icol]; k < xa_end[icol]; k++) {
                    register float tmp = c_abs1 (&a[k]);
                    if (tmp > amax[0]) amax[0] = tmp;
                    dense[asub[k]] = a[k];
                }
                nnzAj += xa_end[icol] - xa_begin[icol];
                if (amax[0] == 0.0) {
                    amax[0] = fill_ini;
#if ( PRNTlevel >= 1)
                    printf("Column %d is entirely zero!\n", icol);
                    fflush(stdout);
#endif
                }

                /* Numeric update within the snode */
                csnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat);

                if (usepr) pivrow = iperm_r[icol];
                fill_tol = pow(fill_ini, 1.0 - (double)icol / (double)min_mn);
                if ( (*info = ilu_cpivotL(icol, diag_pivot_thresh, &usepr,
                                          perm_r, iperm_c[icol], swap, iswap,
                                          marker_relax, &pivrow,
                                          amax[0] * fill_tol, milu, zero,
                                          &Glu, stat)) ) {
                    iinfo++;
                    marker[pivrow] = kcol;
                }

            }

            jcol = kcol + 1;

        } else { /* Work on one panel of panel_size columns */

            /* Adjust panel_size so that a panel won't overlap with the next
             * relaxed snode.
             */
            panel_size = w_def;
            for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++)
                if ( relax_end[k] != EMPTY ) {
                    panel_size = k - jcol;
                    break;
                }
            if ( k == min_mn ) panel_size = min_mn - jcol;
            panel_histo[panel_size]++;

            /* symbolic factor on a panel of columns */
            ilu_cpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
                          dense, amax, panel_lsub, segrep, repfnz,
                          marker, parent, xplore, &Glu);

            /* numeric sup-panel updates in topological order */
            cpanel_bmod(m, panel_size, jcol, nseg1, dense,
                        tempv, segrep, repfnz, &Glu, stat);

            /* Sparse LU within the panel, and below panel diagonal */
            for (jj = jcol; jj < jcol + panel_size; jj++) {

                k = (jj - jcol) * m; /* column index for w-wide arrays */

                nseg = nseg1;   /* Begin after all the panel segments */

                nnzAj += xa_end[jj] - xa_begin[jj];

                if ((*info = ilu_ccolumn_dfs(m, jj, perm_r, &nseg,
                                             &panel_lsub[k], segrep, &repfnz[k],
                                             marker, parent, xplore, &Glu)))
                    return;

                /* Numeric updates */
                if ((*info = ccolumn_bmod(jj, (nseg - nseg1), &dense[k],
                                          tempv, &segrep[nseg1], &repfnz[k],
                                          jcol, &Glu, stat)) != 0) return;

                /* Make a fill-in position if the column is entirely zero */
                if (xlsub[jj + 1] == xlsub[jj]) {
                    register int i, row;
                    int nextl;
                    int nzlmax = Glu.nzlmax;
                    int *lsub = Glu.lsub;
                    int *marker2 = marker + 2 * m;

                    /* Allocate memory */
                    nextl = xlsub[jj] + 1;
                    if (nextl >= nzlmax) {
                        int error = cLUMemXpand(jj, nextl, LSUB, &nzlmax, &Glu);
                        if (error) { *info = error; return; }
                        lsub = Glu.lsub;
                    }
                    xlsub[jj + 1]++;
                    assert(xlusup[jj]==xlusup[jj+1]);
                    xlusup[jj + 1]++;
                    Glu.lusup[xlusup[jj]] = zero;

                    /* Choose a row index (pivrow) for fill-in */
                    for (i = jj; i < n; i++)
                        if (marker_relax[swap[i]] <= jj) break;
                    row = swap[i];
                    marker2[row] = jj;
                    lsub[xlsub[jj]] = row;
#ifdef DEBUG
                    printf("Fill col %d.\n", jj);
                    fflush(stdout);
#endif
                }

                /* Computer the quota */
                if (drop_rule & DROP_PROWS)
                    quota = gamma * Astore->nnz / m * jj / m;
                else if (drop_rule & DROP_COLUMN)
                    quota = gamma * (xa_end[jj] - xa_begin[jj]) *
                            (jj + 1) / m;
                else if (drop_rule & DROP_AREA)
                    quota = gamma * 0.9 * nnzAj * 0.5 - nnzUj;
                else
                    quota = m;

                /* Copy the U-segments to ucol[*] and drop small entries */
                if ((*info = ilu_ccopy_to_ucol(jj, nseg, segrep, &repfnz[k],
                                               perm_r, &dense[k], drop_rule,
                                               milu, amax[jj - jcol] * tol_U,
                                               quota, &drop_sum, &nnzUj, &Glu,
                                               swork2)) != 0)
                    return;

                /* Reset the dropping threshold if required */
                if (drop_rule & DROP_DYNAMIC) {
                    if (gamma * 0.9 * nnzAj * 0.5 < nnzLj)
                        tol_U = SUPERLU_MIN(1.0, tol_U * 2.0);
                    else
                        tol_U = SUPERLU_MAX(drop_tol, tol_U * 0.5);
                }

                if (drop_sum.r != 0.0 && drop_sum.i != 0.0)
                {
                    omega = SUPERLU_MIN(2.0*(1.0-alpha)/c_abs1(&drop_sum), 1.0);
                    cs_mult(&drop_sum, &drop_sum, omega);
                }
                if (usepr) pivrow = iperm_r[jj];
                fill_tol = pow(fill_ini, 1.0 - (double)jj / (double)min_mn);
                if ( (*info = ilu_cpivotL(jj, diag_pivot_thresh, &usepr, perm_r,
                                          iperm_c[jj], swap, iswap,
                                          marker_relax, &pivrow,
                                          amax[jj - jcol] * fill_tol, milu,
                                          drop_sum, &Glu, stat)) ) {
                    iinfo++;
                    marker[m + pivrow] = jj;
                    marker[2 * m + pivrow] = jj;
                }

                /* Reset repfnz[] for this column */
                resetrep_col (nseg, segrep, &repfnz[k]);

                /* Start a new supernode, drop the previous one */
                if (jj > 0 && supno[jj] > supno[jj - 1] && jj < last_drop) {
                    int first = xsup[supno[jj - 1]];
                    int last = jj - 1;
                    int quota;

                    /* Compute the quota */
                    if (drop_rule & DROP_PROWS)
                        quota = gamma * Astore->nnz / m * (m - first) / m
                                * (last - first + 1);
                    else if (drop_rule & DROP_COLUMN) {
                        int i;
                        quota = 0;
                        for (i = first; i <= last; i++)
                            quota += xa_end[i] - xa_begin[i];
                        quota = gamma * quota * (m - first) / m;
                    } else if (drop_rule & DROP_AREA)
                        quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0)
                                / m) - nnzLj;
                    else
                        quota = m * n;
                    fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) /
                            (double)min_mn);

                    /* Drop small rows */
                    stempv = (float *) tempv;
                    i = ilu_cdrop_row(options, first, last, tol_L, quota,
                                      &nnzLj, &fill_tol, &Glu, stempv, swork2,
                                      1);

                    /* Reset the parameters */
                    if (drop_rule & DROP_DYNAMIC) {
                        if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m)
                                < nnzLj)
                            tol_L = SUPERLU_MIN(1.0, tol_L * 2.0);
                        else
                            tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5);
                    }
                    if (fill_tol < 0) iinfo -= (int)fill_tol;
#ifdef DEBUG
                    num_drop_L += i * (last - first + 1);
#endif
                } /* if start a new supernode */

            } /* for */

            jcol += panel_size; /* Move to the next panel */

        } /* else */

    } /* for */

    *info = iinfo;

    if ( m > n ) {
        k = 0;
        for (i = 0; i < m; ++i)
            if ( perm_r[i] == EMPTY ) {
                perm_r[i] = n + k;
                ++k;
            }
    }

    ilu_countnz(min_mn, &nnzL, &nnzU, &Glu);
    fixupL(min_mn, perm_r, &Glu);

    cLUWorkFree(iwork, cwork, &Glu); /* Free work space and compress storage */

    if ( fact == SamePattern_SameRowPerm ) {
        /* L and U structures may have changed due to possibly different
           pivoting, even though the storage is available.
           There could also be memory expansions, so the array locations
           may have changed, */
        ((SCformat *)L->Store)->nnz = nnzL;
        ((SCformat *)L->Store)->nsuper = Glu.supno[n];
        ((SCformat *)L->Store)->nzval = Glu.lusup;
        ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
        ((SCformat *)L->Store)->rowind = Glu.lsub;
        ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
        ((NCformat *)U->Store)->nnz = nnzU;
        ((NCformat *)U->Store)->nzval = Glu.ucol;
        ((NCformat *)U->Store)->rowind = Glu.usub;
        ((NCformat *)U->Store)->colptr = Glu.xusub;
    } else {
        cCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup,
                                 Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
                                 Glu.xsup, SLU_SC, SLU_C, SLU_TRLU);
        cCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol,
                               Glu.usub, Glu.xusub, SLU_NC, SLU_C, SLU_TRU);
    }

    ops[FACT] += ops[TRSV] + ops[GEMV];
    stat->expansions = --(Glu.num_expansions);

    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
    SUPERLU_FREE (iperm_c);
    SUPERLU_FREE (relax_end);
    SUPERLU_FREE (swap);
    SUPERLU_FREE (iswap);
    SUPERLU_FREE (relax_fsupc);
    SUPERLU_FREE (amax);
    if ( swork2 ) SUPERLU_FREE (swork2);

}
Ejemplo n.º 22
0
void
sgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
       int *etree, char *equed, float *R, float *C,
       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
       SuperMatrix *B, SuperMatrix *X, float *recip_pivot_growth, 
       float *rcond, float *ferr, float *berr, 
       mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info )
{


    DNformat  *Bstore, *Xstore;
    float    *Bmat, *Xmat;
    int       ldb, ldx, nrhs;
    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int       colequ, equil, nofact, notran, rowequ, permc_spec;
    trans_t   trant;
    char      norm[1];
    int       i, j, info1;
    float    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
    int       relax, panel_size;
    float    diag_pivot_thresh;
    double    t0;      /* temporary time */
    double    *utime;

    /* External functions */
    extern float slangs(char *, SuperMatrix *);

    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;

    *info = 0;
    nofact = (options->Fact != FACTORED);
    equil = (options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    if ( nofact ) {
	*(unsigned char *)equed = 'N';
	rowequ = FALSE;
	colequ = FALSE;
    } else {
	rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	colequ = lsame_(equed, "C") || lsame_(equed, "B");
	smlnum = slamch_("Safe minimum");
	bignum = 1. / smlnum;
    }

#if 0
printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n",
       options->Fact, options->Trans, *equed);
#endif

    /* Test the input parameters */
    if (options->Fact != DOFACT && options->Fact != SamePattern &&
	options->Fact != SamePattern_SameRowPerm &&
	options->Fact != FACTORED &&
	options->Trans != NOTRANS && options->Trans != TRANS && 
	options->Trans != CONJ &&
	options->Equil != NO && options->Equil != YES)
	*info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
	      A->Dtype != SLU_S || A->Mtype != SLU_GE )
	*info = -2;
    else if (options->Fact == FACTORED &&
	     !(rowequ || colequ || lsame_(equed, "N")))
	*info = -6;
    else {
	if (rowequ) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, R[j]);
		rcmax = SUPERLU_MAX(rcmax, R[j]);
	    }
	    if (rcmin <= 0.) *info = -7;
	    else if ( A->nrow > 0)
		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else rowcnd = 1.;
	}
	if (colequ && *info == 0) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, C[j]);
		rcmax = SUPERLU_MAX(rcmax, C[j]);
	    }
	    if (rcmin <= 0.) *info = -8;
	    else if (A->nrow > 0)
		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else colcnd = 1.;
	}
	if (*info == 0) {
	    if ( lwork < -1 ) *info = -12;
	    else if ( B->ncol < 0 ) *info = -13;
	    else if ( B->ncol > 0 ) { /* no checking if B->ncol=0 */
	         if ( Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      B->Stype != SLU_DN || B->Dtype != SLU_S || 
		      B->Mtype != SLU_GE )
		*info = -13;
            }
	    if ( X->ncol < 0 ) *info = -14;
            else if ( X->ncol > 0 ) { /* no checking if X->ncol=0 */
                 if ( Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      (B->ncol != 0 && B->ncol != X->ncol) ||
                      X->Stype != SLU_DN ||
		      X->Dtype != SLU_S || X->Mtype != SLU_GE )
		*info = -14;
            }
	}
    }
    if (*info != 0) {
	i = -(*info);
	xerbla_("sgssvx", &i);
	return;
    }
    
    /* Initialization for factor parameters */
    panel_size = sp_ienv(1);
    relax      = sp_ienv(2);
    diag_pivot_thresh = options->DiagPivotThresh;

    utime = stat->utime;
    
    /* Convert A to SLU_NC format when necessary. */
    if ( A->Stype == SLU_NR ) {
	NRformat *Astore = A->Store;
	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
	sCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
			       Astore->nzval, Astore->colind, Astore->rowptr,
			       SLU_NC, A->Dtype, A->Mtype);
	if ( notran ) { /* Reverse the transpose argument. */
	    trant = TRANS;
	    notran = 0;
	} else {
	    trant = NOTRANS;
	    notran = 1;
	}
    } else { /* A->Stype == SLU_NC */
	trant = options->Trans;
	AA = A;
    }

    if ( nofact && equil ) {
	t0 = SuperLU_timer_();
	/* Compute row and column scalings to equilibrate the matrix A. */
	sgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
	
	if ( info1 == 0 ) {
	    /* Equilibrate matrix A. */
	    slaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
	    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	    colequ = lsame_(equed, "C") || lsame_(equed, "B");
	}
	utime[EQUIL] = SuperLU_timer_() - t0;
    }


    if ( nofact ) {
	
        t0 = SuperLU_timer_();
	/*
	 * Gnet column permutation vector perm_c[], according to permc_spec:
	 *   permc_spec = NATURAL:  natural ordering 
	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
	 */
	permc_spec = options->ColPerm;
	if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
            get_perm_c(permc_spec, AA, perm_c);
	utime[COLPERM] = SuperLU_timer_() - t0;

	t0 = SuperLU_timer_();
	sp_preorder(options, AA, perm_c, etree, &AC);
	utime[ETREE] = SuperLU_timer_() - t0;
    
/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
	       relax, panel_size, sp_ienv(3), sp_ienv(4));
	fflush(stdout); */
	
	/* Compute the LU factorization of A*Pc. */
	t0 = SuperLU_timer_();
	sgstrf(options, &AC, relax, panel_size, etree,
                work, lwork, perm_c, perm_r, L, U, stat, info);
	utime[FACT] = SuperLU_timer_() - t0;
	
	if ( lwork == -1 ) {
	    mem_usage->total_needed = *info - A->ncol;
	    return;
	}
    }

    if ( options->PivotGrowth ) {
        if ( *info > 0 ) {
	    if ( *info <= A->ncol ) {
	        /* Compute the reciprocal pivot growth factor of the leading
	           rank-deficient *info columns of A. */
	        *recip_pivot_growth = sPivotGrowth(*info, AA, perm_c, L, U);
	    }
	    return;
        }

        /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
        *recip_pivot_growth = sPivotGrowth(A->ncol, AA, perm_c, L, U);
    }

    if ( options->ConditionNumber ) {
        /* Estimate the reciprocal of the condition number of A. */
        t0 = SuperLU_timer_();
        if ( notran ) {
	    *(unsigned char *)norm = '1';
        } else {
	    *(unsigned char *)norm = 'I';
        }
        anorm = slangs(norm, AA);
        sgscon(norm, L, U, anorm, rcond, stat, info);
        utime[RCOND] = SuperLU_timer_() - t0;
    }
    
    if ( nrhs > 0 ) {
        /* Scale the right hand side if equilibration was performed. */
        if ( notran ) {
	    if ( rowequ ) {
	        for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i)
		        Bmat[i + j*ldb] *= R[i];
	    }
        } else if ( colequ ) {
	    for (j = 0; j < nrhs; ++j)
	        for (i = 0; i < A->nrow; ++i)
	            Bmat[i + j*ldb] *= C[i];
        }

        /* Compute the solution matrix X. */
        for (j = 0; j < nrhs; j++)  /* Save a copy of the right hand sides */
            for (i = 0; i < B->nrow; i++)
	        Xmat[i + j*ldx] = Bmat[i + j*ldb];
    
        t0 = SuperLU_timer_();
        sgstrs (trant, L, U, perm_c, perm_r, X, stat, info);
        utime[SOLVE] = SuperLU_timer_() - t0;
    
        /* Use iterative refinement to improve the computed solution and compute
           error bounds and backward error estimates for it. */
        t0 = SuperLU_timer_();
        if ( options->IterRefine != NOREFINE ) {
            sgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B,
                   X, ferr, berr, stat, info);
        } else {
            for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0;
        }
        utime[REFINE] = SuperLU_timer_() - t0;

        /* Transform the solution matrix X to a solution of the original system. */
        if ( notran ) {
	    if ( colequ ) {
	        for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i)
                        Xmat[i + j*ldx] *= C[i];
	    }
        } else if ( rowequ ) {
	    for (j = 0; j < nrhs; ++j)
	        for (i = 0; i < A->nrow; ++i)
	            Xmat[i + j*ldx] *= R[i];
        }
    } /* end if nrhs > 0 */

    if ( options->ConditionNumber ) {
        /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
        if ( *rcond < slamch_("E") ) *info = A->ncol + 1;
    }

    if ( nofact ) {
        sQuerySpace(L, U, mem_usage);
        Destroy_CompCol_Permuted(&AC);
    }
    if ( A->Stype == SLU_NR ) {
	Destroy_SuperMatrix_Store(AA);
	SUPERLU_FREE(AA);
    }

}
Ejemplo n.º 23
0
float clangs(char *norm, SuperMatrix *A)
{
/* 
    Purpose   
    =======   

    CLANGS returns the value of the one norm, or the Frobenius norm, or 
    the infinity norm, or the element of largest absolute value of a 
    real matrix A.   

    Description   
    ===========   

    CLANGE returns the value   

       CLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
                (   
                ( norm1(A),         NORM = '1', 'O' or 'o'   
                (   
                ( normI(A),         NORM = 'I' or 'i'   
                (   
                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   

    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   

    Arguments   
    =========   

    NORM    (input) CHARACTER*1   
            Specifies the value to be returned in CLANGE as described above.   
    A       (input) SuperMatrix*
            The M by N sparse matrix A. 

   ===================================================================== 
*/
    
    /* Local variables */
    NCformat *Astore;
    complex   *Aval;
    int      i, j, irow;
    float   value, sum;
    float   *rwork;

    Astore = A->Store;
    Aval   = Astore->nzval;
    
    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
	value = 0.;
	
    } else if (lsame_(norm, "M")) {
	/* Find max(abs(A(i,j))). */
	value = 0.;
	for (j = 0; j < A->ncol; ++j)
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
		value = SUPERLU_MAX( value, slu_c_abs( &Aval[i]) );
	
    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
	/* Find norm1(A). */
	value = 0.;
	for (j = 0; j < A->ncol; ++j) {
	    sum = 0.;
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
		sum += slu_c_abs( &Aval[i] );
	    value = SUPERLU_MAX(value,sum);
	}
	
    } else if (lsame_(norm, "I")) {
	/* Find normI(A). */
	if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) )
	    ABORT("SUPERLU_MALLOC fails for rwork.");
	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
	for (j = 0; j < A->ncol; ++j)
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
		irow = Astore->rowind[i];
		rwork[irow] += slu_c_abs( &Aval[i] );
	    }
	value = 0.;
	for (i = 0; i < A->nrow; ++i)
	    value = SUPERLU_MAX(value, rwork[i]);
	
	SUPERLU_FREE (rwork);
	
    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
	/* Find normF(A). */
	ABORT("Not implemented.");
    } else
	ABORT("Illegal norm specified.");

    return (value);

} /* clangs */
Ejemplo n.º 24
0
int sgst07(trans_t trans, int n, int nrhs, SuperMatrix *A, float *b, 
	      int ldb, float *x, int ldx, float *xact, 
              int ldxact, float *ferr, float *berr, float *reslts)
{
/*
    Purpose   
    =======   

    SGST07 tests the error bounds from iterative refinement for the   
    computed solution to a system of equations op(A)*X = B, where A is a 
    general n by n matrix and op(A) = A or A**T, depending on TRANS.
    
    RESLTS(1) = test of the error bound   
              = norm(X - XACT) / ( norm(X) * FERR )   
    A large value is returned if this ratio is not less than one.   

    RESLTS(2) = residual from the iterative refinement routine   
              = the maximum of BERR / ( (n+1)*EPS + (*) ), where   
                (*) = (n+1)*UNFL / (min_i (abs(op(A))*abs(X) +abs(b))_i ) 

    Arguments   
    =========   

    TRANS   (input) trans_t
            Specifies the form of the system of equations.   
            = NOTRANS:  A *x = b   
            = TRANS  :  A'*x = b, where A' is the transpose of A   
            = CONJ   :  A'*x = b, where A' is the transpose of A   

    N       (input) INT
            The number of rows of the matrices X and XACT.  N >= 0.   

    NRHS    (input) INT   
            The number of columns of the matrices X and XACT.  NRHS >= 0. 
  

    A       (input) SuperMatrix *, dimension (A->nrow, A->ncol)
            The original n by n matrix A.   

    B       (input) FLOAT PRECISION array, dimension (LDB,NRHS)   
            The right hand side vectors for the system of linear   
            equations.   

    LDB     (input) INT   
            The leading dimension of the array B.  LDB >= max(1,N).   

    X       (input) FLOAT PRECISION array, dimension (LDX,NRHS)   
            The computed solution vectors.  Each vector is stored as a   
            column of the matrix X.   

    LDX     (input) INT   
            The leading dimension of the array X.  LDX >= max(1,N).   

    XACT    (input) FLOAT PRECISION array, dimension (LDX,NRHS)   
            The exact solution vectors.  Each vector is stored as a   
            column of the matrix XACT.   

    LDXACT  (input) INT   
            The leading dimension of the array XACT.  LDXACT >= max(1,N). 
  

    FERR    (input) FLOAT PRECISION array, dimension (NRHS)   
            The estimated forward error bounds for each solution vector   
            X.  If XTRUE is the true solution, FERR bounds the magnitude 
            of the largest entry in (X - XTRUE) divided by the magnitude 
            of the largest entry in X.   

    BERR    (input) FLOAT PRECISION array, dimension (NRHS)   
            The componentwise relative backward error of each solution   
            vector (i.e., the smallest relative change in any entry of A 
  
            or B that makes X an exact solution).   

    RESLTS  (output) FLOAT PRECISION array, dimension (2)   
            The maximum over the NRHS solution vectors of the ratios:   
            RESLTS(1) = norm(X - XACT) / ( norm(X) * FERR )   
            RESLTS(2) = BERR / ( (n+1)*EPS + (*) )   

    ===================================================================== 
*/
    
    /* Table of constant values */
    int c__1 = 1;

    /* System generated locals */
    float d__1, d__2;

    /* Local variables */
    float diff, axbi;
    int    imax, irow, n__1;
    int    i, j, k;
    float unfl, ovfl;
    float xnorm;
    float errbnd;
    int    notran;
    float eps, tmp;
    float *rwork;
    float *Aval;
    NCformat *Astore;

    /* Function prototypes */
    extern int    lsame_(char *, char *);
    extern int    isamax_(int *, float *, int *);


    /* Quick exit if N = 0 or NRHS = 0. */
    if ( n <= 0 || nrhs <= 0 ) {
	reslts[0] = 0.;
	reslts[1] = 0.;
	return 0;
    }

    eps = slamch_("Epsilon");
    unfl = slamch_("Safe minimum");
    ovfl   = 1. / unfl;
    notran = (trans == NOTRANS);

    rwork  = (float *) SUPERLU_MALLOC(n*sizeof(float));
    if ( !rwork ) ABORT("SUPERLU_MALLOC fails for rwork");
    Astore = A->Store;
    Aval   = (float *) Astore->nzval;
    
    /* Test 1:  Compute the maximum of   
       norm(X - XACT) / ( norm(X) * FERR )   
       over all the vectors X and XACT using the infinity-norm. */

    errbnd = 0.;
    for (j = 0; j < nrhs; ++j) {
	n__1 = n;
	imax = isamax_(&n__1, &x[j*ldx], &c__1);
	d__1 = fabs(x[imax-1 + j*ldx]);
	xnorm = SUPERLU_MAX(d__1,unfl);
	diff = 0.;
	for (i = 0; i < n; ++i) {
	    d__1 = fabs(x[i+j*ldx] - xact[i+j*ldxact]);
	    diff = SUPERLU_MAX(diff, d__1);
	}

	if (xnorm > 1.) {
	    goto L20;
	} else if (diff <= ovfl * xnorm) {
	    goto L20;
	} else {
	    errbnd = 1. / eps;
	    goto L30;
	}

L20:
#if 0	
	if (diff / xnorm <= ferr[j]) {
	    d__1 = diff / xnorm / ferr[j];
	    errbnd = SUPERLU_MAX(errbnd,d__1);
	} else {
	    errbnd = 1. / eps;
	}
#endif
	d__1 = diff / xnorm / ferr[j];
	errbnd = SUPERLU_MAX(errbnd,d__1);
	/*printf("Ferr: %f\n", errbnd);*/
L30:
	;
    }
    reslts[0] = errbnd;

    /* Test 2: Compute the maximum of BERR / ( (n+1)*EPS + (*) ), where 
       (*) = (n+1)*UNFL / (min_i (abs(op(A))*abs(X) + abs(b))_i ) */

    for (k = 0; k < nrhs; ++k) {
	for (i = 0; i < n; ++i) 
            rwork[i] = fabs( b[i + k*ldb] );
	if ( notran ) {
	    for (j = 0; j < n; ++j) {
		tmp = fabs( x[j + k*ldx] );
		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
		    rwork[Astore->rowind[i]] += fabs(Aval[i]) * tmp;
                }
	    }
	} else {
	    for (j = 0; j < n; ++j) {
		tmp = 0.;
		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
		    irow = Astore->rowind[i];
		    d__1 = fabs( x[irow + k*ldx] );
		    tmp += fabs(Aval[i]) * d__1;
		}
		rwork[j] += tmp;
	    }
	}

	axbi = rwork[0];
	for (i = 1; i < n; ++i) axbi = SUPERLU_MIN(axbi, rwork[i]);
	
	/* Computing MAX */
	d__1 = axbi, d__2 = (n + 1) * unfl;
	tmp = berr[k] / ((n + 1) * eps + (n + 1) * unfl / SUPERLU_MAX(d__1,d__2));
	
	if (k == 0) {
	    reslts[1] = tmp;
	} else {
	    reslts[1] = SUPERLU_MAX(reslts[1],tmp);
	}
    }

    SUPERLU_FREE(rwork);
    return 0;

} /* sgst07 */
Ejemplo n.º 25
0
void
cpanel_bmod (
            const int  m,          /* in - number of rows in the matrix */
            const int  w,          /* in */
            const int  jcol,       /* in */
            const int  nseg,       /* in */
            complex     *dense,     /* out, of size n by w */
            complex     *tempv,     /* working array */
            int        *segrep,    /* in */
            int        *repfnz,    /* in, of size n by w */
            GlobalLU_t *Glu,       /* modified */
            SuperLUStat_t *stat    /* output */
            )
{


#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    int          incx = 1, incy = 1;
    complex       alpha, beta;
#endif

    register int k, ksub;
    int          fsupc, nsupc, nsupr, nrow;
    int          krep, krep_ind;
    complex       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          segsze;
    int          block_nrow;  /* no of rows in a block row */
    register int lptr;        /* Points to the row subscripts of a supernode */
    int          kfnz, irow, no_zeros;
    register int isub, isub1, i;
    register int jj;          /* Index through each column in the panel */
    int          *xsup, *supno;
    int          *lsub, *xlsub;
    complex       *lusup;
    int          *xlusup;
    int          *repfnz_col; /* repfnz[] for a column in the panel */
    complex       *dense_col;  /* dense[] for a column in the panel */
    complex       *tempv1;             /* Used in 1-D update */
    complex       *TriTmp, *MatvecTmp; /* used in 2-D update */
    complex      zero = {0.0, 0.0};
    complex      one = {1.0, 0.0};
    complex      comp_temp, comp_temp1;
    register int ldaTmp;
    register int r_ind, r_hi;
    static   int first = 1, maxsuper, rowblk, colblk;
    flops_t  *ops = stat->ops;

    xsup    = Glu->xsup;
    supno   = Glu->supno;
    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    lusup   = Glu->lusup;
    xlusup  = Glu->xlusup;

    if ( first ) {
        maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) );
        rowblk   = sp_ienv(4);
        colblk   = sp_ienv(5);
        first = 0;
    }
    ldaTmp = maxsuper + rowblk;

    /*
     * For each nonz supernode segment of U[*,j] in topological order
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */

        /* krep = representative of current k-th supernode
         * fsupc = first supernodal column
         * nsupc = no of columns in a supernode
         * nsupr = no of rows in a supernode
         */
        krep = segrep[k--];
        fsupc = xsup[supno[krep]];
        nsupc = krep - fsupc + 1;
        nsupr = xlsub[fsupc+1] - xlsub[fsupc];
        nrow = nsupr - nsupc;
        lptr = xlsub[fsupc];
        krep_ind = lptr + nsupc - 1;

        repfnz_col = repfnz;
        dense_col = dense;

        if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */

            TriTmp = tempv;

            /* Sequence through each column in panel -- triangular solves */
            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {

                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue;  /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                luptr = xlusup[fsupc];

                ops[TRSV] += 4 * segsze * (segsze - 1);
                ops[GEMV] += 8 * nrow * segsze;

                /* Case 1: Update U-segment of size 1 -- col-col update */
                if ( segsze == 1 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc;

                    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
                        irow = lsub[i];
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        ++luptr;
                    }

                } else if ( segsze <= 3 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    ukj1 = dense_col[lsub[krep_ind - 1]];
                    luptr += nsupr*(nsupc-1) + nsupc-1;
                    luptr1 = luptr - nsupr;

                    if ( segsze == 2 ) {
                        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        c_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            luptr++; luptr1++;
                            cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                            cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    } else {
                        ukj2 = dense_col[lsub[krep_ind - 2]];
                        luptr2 = luptr1 - nsupr;
                        cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                        c_sub(&ukj1, &ukj1, &comp_temp);

                        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        dense_col[lsub[krep_ind-1]] = ukj1;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            luptr++; luptr1++; luptr2++;
                            cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                            cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    }

                } else  {       /* segsze >= 4 */

                    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
                       holds the result of triangular solves.    */
                    no_zeros = kfnz - fsupc;
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; ++i) {
                        irow = lsub[isub];
                        TriTmp[i] = dense_col[irow]; /* Gather */
                        ++isub;
                    }

                    /* start effective triangle */
                    luptr += nsupr * no_zeros + no_zeros;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
                           &nsupr, TriTmp, &incx );
#else
                    ctrsv_( "L", "N", "U", &segsze, &lusup[luptr],
                           &nsupr, TriTmp, &incx );
#endif
#else
                    clsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
#endif


                } /* else ... */

            }  /* for jj ... end tri-solves */

            /* Block row updates; push all the way into dense[*] block */
            for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {

                r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
                block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
                luptr = xlusup[fsupc] + nsupc + r_ind;
                isub1 = lptr + nsupc + r_ind;

                repfnz_col = repfnz;
                TriTmp = tempv;
                dense_col = dense;

                /* Sequence through each column in panel -- matrix-vector */
                for (jj = jcol; jj < jcol + w; jj++,
                     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {

                    kfnz = repfnz_col[krep];
                    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */

                    segsze = krep - kfnz + 1;
                    if ( segsze <= 3 ) continue;   /* skip unrolled cases */

                    /* Perform a block update, and scatter the result of
                       matrix-vector to dense[].                 */
                    no_zeros = kfnz - fsupc;
                    luptr1 = luptr + nsupr * no_zeros;
                    MatvecTmp = &TriTmp[maxsuper];

#ifdef USE_VENDOR_BLAS
                    alpha = one;
                    beta = zero;
#ifdef _CRAY
                    CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
                           &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
#else
                    cgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
                           &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
#endif
#else
                    cmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
                           TriTmp, MatvecTmp);
#endif

                    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
                     * such that MatvecTmp[*] can be re-used for the
                     * the next blok row update. dense[] will be copied into
                     * global store after the whole panel has been finished.
                     */
                    isub = isub1;
                    for (i = 0; i < block_nrow; i++) {
                        irow = lsub[isub];
                        c_sub(&dense_col[irow], &dense_col[irow],
                              &MatvecTmp[i]);
                        MatvecTmp[i] = zero;
                        ++isub;
                    }

                } /* for jj ... */

            } /* for each block row ... */

            /* Scatter the triangular solves into SPA dense[*] */
            repfnz_col = repfnz;
            TriTmp = tempv;
            dense_col = dense;

            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue; /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                if ( segsze <= 3 ) continue; /* skip unrolled cases */

                no_zeros = kfnz - fsupc;
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense_col[irow] = TriTmp[i];
                    TriTmp[i] = zero;
                    ++isub;
                }

            } /* for jj ... */

        } else { /* 1-D block modification */


            /* Sequence through each column in the panel */
            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m) {

                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue;  /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                luptr = xlusup[fsupc];

                ops[TRSV] += 4 * segsze * (segsze - 1);
                ops[GEMV] += 8 * nrow * segsze;

                /* Case 1: Update U-segment of size 1 -- col-col update */
                if ( segsze == 1 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc;

                    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
                        irow = lsub[i];
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        ++luptr;
                    }

                } else if ( segsze <= 3 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc-1;
                    ukj1 = dense_col[lsub[krep_ind - 1]];
                    luptr1 = luptr - nsupr;

                    if ( segsze == 2 ) {
                        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        c_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            ++luptr;  ++luptr1;
                            cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                            cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    } else {
                        ukj2 = dense_col[lsub[krep_ind - 2]];
                        luptr2 = luptr1 - nsupr;
                        cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                        c_sub(&ukj1, &ukj1, &comp_temp);

                        cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        dense_col[lsub[krep_ind-1]] = ukj1;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            ++luptr; ++luptr1; ++luptr2;
                            cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                            cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                            c_add(&comp_temp, &comp_temp, &comp_temp1);
                            c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    }

                } else  { /* segsze >= 4 */
                    /*
                     * Perform a triangular solve and block update,
                     * then scatter the result of sup-col update to dense[].
                     */
                    no_zeros = kfnz - fsupc;

                    /* Copy U[*,j] segment from dense[*] to tempv[*]:
                     *    The result of triangular solve is in tempv[*];
                     *    The result of matrix vector update is in dense_col[*]
                     */
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; ++i) {
                        irow = lsub[isub];
                        tempv[i] = dense_col[irow]; /* Gather */
                        ++isub;
                    }

                    /* start effective triangle */
                    luptr += nsupr * no_zeros + no_zeros;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
                           &nsupr, tempv, &incx );
#else
                    ctrsv_( "L", "N", "U", &segsze, &lusup[luptr],
                           &nsupr, tempv, &incx );
#endif

                    luptr += segsze;    /* Dense matrix-vector */
                    tempv1 = &tempv[segsze];
                    alpha = one;
                    beta = zero;
#ifdef _CRAY
                    CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
                           &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
                    cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
                           &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
                    clsolve ( nsupr, segsze, &lusup[luptr], tempv );

                    luptr += segsze;        /* Dense matrix-vector */
                    tempv1 = &tempv[segsze];
                    cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
#endif

                    /* Scatter tempv[*] into SPA dense[*] temporarily, such
                     * that tempv[*] can be used for the triangular solve of
                     * the next column of the panel. They will be copied into
                     * ucol[*] after the whole panel has been finished.
                     */
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; i++) {
                        irow = lsub[isub];
                        dense_col[irow] = tempv[i];
                        tempv[i] = zero;
                        isub++;
                    }

                    /* Scatter the update from tempv1[*] into SPA dense[*] */
                    /* Start dense rectangular L */
                    for (i = 0; i < nrow; i++) {
                        irow = lsub[isub];
                        c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
                        tempv1[i] = zero;
                        ++isub;
                    }

                } /* else segsze>=4 ... */

            } /* for each column in the panel... */

        } /* else 1-D update ... */

    } /* for each updating supernode ... */

}
Ejemplo n.º 26
0
void
dgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
       int *etree, char *equed, double *R, double *C,
       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
       SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, 
       double *rcond, double *ferr, double *berr, 
       mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info )
{
/*
 * Purpose
 * =======
 *
 * DGSSVX solves the system of linear equations A*X=B or A'*X=B, using
 * the LU factorization from dgstrf(). Error bounds on the solution and
 * a condition estimate are also provided. It performs the following steps:
 *
 *   1. If A is stored column-wise (A->Stype = SLU_NC):
 *  
 *      1.1. If options->Equil = YES, scaling factors are computed to
 *           equilibrate the system:
 *           options->Trans = NOTRANS:
 *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
 *           options->Trans = TRANS:
 *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
 *           options->Trans = CONJ:
 *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
 *           Whether or not the system will be equilibrated depends on the
 *           scaling of the matrix A, but if equilibration is used, A is
 *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B
 *           (if options->Trans=NOTRANS) or diag(C)*B (if options->Trans
 *           = TRANS or CONJ).
 *
 *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
 *           matrix that usually preserves sparsity.
 *           For more details of this step, see sp_preorder.c.
 *
 *      1.3. If options->Fact != FACTORED, the LU decomposition is used to
 *           factor the matrix A (after equilibration if options->Equil = YES)
 *           as Pr*A*Pc = L*U, with Pr determined by partial pivoting.
 *
 *      1.4. Compute the reciprocal pivot growth factor.
 *
 *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
 *           routine returns with info = i. Otherwise, the factored form of 
 *           A is used to estimate the condition number of the matrix A. If
 *           the reciprocal of the condition number is less than machine
 *           precision, info = A->ncol+1 is returned as a warning, but the
 *           routine still goes on to solve for X and computes error bounds
 *           as described below.
 *
 *      1.6. The system of equations is solved for X using the factored form
 *           of A.
 *
 *      1.7. If options->IterRefine != NOREFINE, iterative refinement is
 *           applied to improve the computed solution matrix and calculate
 *           error bounds and backward error estimates for it.
 *
 *      1.8. If equilibration was used, the matrix X is premultiplied by
 *           diag(C) (if options->Trans = NOTRANS) or diag(R)
 *           (if options->Trans = TRANS or CONJ) so that it solves the
 *           original system before equilibration.
 *
 *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
 *      to the transpose of A:
 *
 *      2.1. If options->Equil = YES, scaling factors are computed to
 *           equilibrate the system:
 *           options->Trans = NOTRANS:
 *               diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B
 *           options->Trans = TRANS:
 *               (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
 *           options->Trans = CONJ:
 *               (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
 *           Whether or not the system will be equilibrated depends on the
 *           scaling of the matrix A, but if equilibration is used, A' is
 *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
 *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
 *
 *      2.2. Permute columns of transpose(A) (rows of A), 
 *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
 *           usually preserves sparsity.
 *           For more details of this step, see sp_preorder.c.
 *
 *      2.3. If options->Fact != FACTORED, the LU decomposition is used to
 *           factor the transpose(A) (after equilibration if 
 *           options->Fact = YES) as Pr*transpose(A)*Pc = L*U with the
 *           permutation Pr determined by partial pivoting.
 *
 *      2.4. Compute the reciprocal pivot growth factor.
 *
 *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
 *           routine returns with info = i. Otherwise, the factored form 
 *           of transpose(A) is used to estimate the condition number of the
 *           matrix A. If the reciprocal of the condition number
 *           is less than machine precision, info = A->nrow+1 is returned as
 *           a warning, but the routine still goes on to solve for X and
 *           computes error bounds as described below.
 *
 *      2.6. The system of equations is solved for X using the factored form
 *           of transpose(A).
 *
 *      2.7. If options->IterRefine != NOREFINE, iterative refinement is
 *           applied to improve the computed solution matrix and calculate
 *           error bounds and backward error estimates for it.
 *
 *      2.8. If equilibration was used, the matrix X is premultiplied by
 *           diag(C) (if options->Trans = NOTRANS) or diag(R) 
 *           (if options->Trans = TRANS or CONJ) so that it solves the
 *           original system before equilibration.
 *
 *   See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * options (input) superlu_options_t*
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed and how the
 *         system will be solved.
 *
 * A       (input/output) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
 *         of the linear equations is A->nrow. Currently, the type of A can be:
 *         Stype = SLU_NC or SLU_NR, Dtype = SLU_D, Mtype = SLU_GE.
 *         In the future, more general A may be handled.
 *
 *         On entry, If options->Fact = FACTORED and equed is not 'N', 
 *         then A must have been equilibrated by the scaling factors in
 *         R and/or C.  
 *         On exit, A is not modified if options->Equil = NO, or if 
 *         options->Equil = YES but equed = 'N' on exit.
 *         Otherwise, if options->Equil = YES and equed is not 'N',
 *         A is scaled as follows:
 *         If A->Stype = SLU_NC:
 *           equed = 'R':  A := diag(R) * A
 *           equed = 'C':  A := A * diag(C)
 *           equed = 'B':  A := diag(R) * A * diag(C).
 *         If A->Stype = SLU_NR:
 *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
 *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
 *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
 *
 * perm_c  (input/output) int*
 *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
 *         which defines the permutation matrix Pc; perm_c[i] = j means
 *         column i of A is in position j in A*Pc.
 *         On exit, perm_c may be overwritten by the product of the input
 *         perm_c and a permutation that postorders the elimination tree
 *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
 *         is already in postorder.
 *
 *         If A->Stype = SLU_NR, column permutation vector of size A->nrow,
 *         which describes permutation of columns of transpose(A) 
 *         (rows of A) as described above.
 * 
 * perm_r  (input/output) int*
 *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
 *         which defines the permutation matrix Pr, and is determined
 *         by partial pivoting.  perm_r[i] = j means row i of A is in 
 *         position j in Pr*A.
 *
 *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
 *         determines permutation of rows of transpose(A)
 *         (columns of A) as described above.
 *
 *         If options->Fact = SamePattern_SameRowPerm, the pivoting routine
 *         will try to use the input perm_r, unless a certain threshold
 *         criterion is violated. In that case, perm_r is overwritten by a
 *         new permutation determined by partial pivoting or diagonal
 *         threshold pivoting.
 *         Otherwise, perm_r is output argument.
 * 
 * etree   (input/output) int*,  dimension (A->ncol)
 *         Elimination tree of Pc'*A'*A*Pc.
 *         If options->Fact != FACTORED and options->Fact != DOFACT,
 *         etree is an input argument, otherwise it is an output argument.
 *         Note: etree is a vector of parent pointers for a forest whose
 *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *
 * equed   (input/output) char*
 *         Specifies the form of equilibration that was done.
 *         = 'N': No equilibration.
 *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
 *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
 *         = 'B': Both row and column equilibration, i.e., A was replaced 
 *                by diag(R)*A*diag(C).
 *         If options->Fact = FACTORED, equed is an input argument,
 *         otherwise it is an output argument.
 *
 * R       (input/output) double*, dimension (A->nrow)
 *         The row scale factors for A or transpose(A).
 *         If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
 *             (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
 *         If equed = 'N' or 'C', R is not accessed.
 *         If options->Fact = FACTORED, R is an input argument,
 *             otherwise, R is output.
 *         If options->zFact = FACTORED and equed = 'R' or 'B', each element
 *             of R must be positive.
 * 
 * C       (input/output) double*, dimension (A->ncol)
 *         The column scale factors for A or transpose(A).
 *         If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
 *             (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
 *         If equed = 'N' or 'R', C is not accessed.
 *         If options->Fact = FACTORED, C is an input argument,
 *             otherwise, C is output.
 *         If options->Fact = FACTORED and equed = 'C' or 'B', each element
 *             of C must be positive.
 *         
 * L       (output) SuperMatrix*
 *	   The factor L from the factorization
 *             Pr*A*Pc=L*U              (if A->Stype SLU_= NC) or
 *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
 *         Uses compressed row subscripts storage for supernodes, i.e.,
 *         L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
 *
 * U       (output) SuperMatrix*
 *	   The factor U from the factorization
 *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
 *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
 *         Uses column-wise storage scheme, i.e., U has types:
 *         Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU.
 *
 * work    (workspace/output) void*, size (lwork) (in bytes)
 *         User supplied workspace, should be large enough
 *         to hold data structures for factors L and U.
 *         On exit, if fact is not 'F', L and U point to this array.
 *
 * lwork   (input) int
 *         Specifies the size of work array in bytes.
 *         = 0:  allocate space internally by system malloc;
 *         > 0:  use user-supplied work array of length lwork in bytes,
 *               returns error if space runs out.
 *         = -1: the routine guesses the amount of space needed without
 *               performing the factorization, and returns it in
 *               mem_usage->total_needed; no other side effects.
 *
 *         See argument 'mem_usage' for memory usage statistics.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE.
 *         On entry, the right hand side matrix.
 *         If B->ncol = 0, only LU decomposition is performed, the triangular
 *                         solve is skipped.
 *         On exit,
 *            if equed = 'N', B is not modified; otherwise
 *            if A->Stype = SLU_NC:
 *               if options->Trans = NOTRANS and equed = 'R' or 'B',
 *                  B is overwritten by diag(R)*B;
 *               if options->Trans = TRANS or CONJ and equed = 'C' of 'B',
 *                  B is overwritten by diag(C)*B;
 *            if A->Stype = SLU_NR:
 *               if options->Trans = NOTRANS and equed = 'C' or 'B',
 *                  B is overwritten by diag(C)*B;
 *               if options->Trans = TRANS or CONJ and equed = 'R' of 'B',
 *                  B is overwritten by diag(R)*B.
 *
 * X       (output) SuperMatrix*
 *         X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. 
 *         If info = 0 or info = A->ncol+1, X contains the solution matrix
 *         to the original system of equations. Note that A and B are modified
 *         on exit if equed is not 'N', and the solution to the equilibrated
 *         system is inv(diag(C))*X if options->Trans = NOTRANS and
 *         equed = 'C' or 'B', or inv(diag(R))*X if options->Trans = 'T' or 'C'
 *         and equed = 'R' or 'B'.
 *
 * recip_pivot_growth (output) double*
 *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
 *         The infinity norm is used. If recip_pivot_growth is much less
 *         than 1, the stability of the LU factorization could be poor.
 *
 * rcond   (output) double*
 *         The estimate of the reciprocal condition number of the matrix A
 *         after equilibration (if done). If rcond is less than the machine
 *         precision (in particular, if rcond = 0), the matrix is singular
 *         to working precision. This condition is indicated by a return
 *         code of info > 0.
 *
 * FERR    (output) double*, dimension (B->ncol)   
 *         The estimated forward error bound for each solution vector   
 *         X(j) (the j-th column of the solution matrix X).   
 *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
 *         is an estimated upper bound for the magnitude of the largest 
 *         element in (X(j) - XTRUE) divided by the magnitude of the   
 *         largest element in X(j).  The estimate is as reliable as   
 *         the estimate for RCOND, and is almost always a slight   
 *         overestimate of the true error.
 *         If options->IterRefine = NOREFINE, ferr = 1.0.
 *
 * BERR    (output) double*, dimension (B->ncol)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *         If options->IterRefine = NOREFINE, berr = 1.0.
 *
 * mem_usage (output) mem_usage_t*
 *         Record the memory usage statistics, consisting of following fields:
 *         - for_lu (float)
 *           The amount of space used in bytes for L\U data structures.
 *         - total_needed (float)
 *           The amount of space needed in bytes to perform factorization.
 *         - expansions (int)
 *           The number of memory expansions during the LU factorization.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics on runtime and floating-point operation count.
 *        See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 *         = 0: successful exit   
 *         < 0: if info = -i, the i-th argument had an illegal value   
 *         > 0: if info = i, and i is   
 *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
 *                    been completed, but the factor U is exactly   
 *                    singular, so the solution and error bounds   
 *                    could not be computed.   
 *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
 *                    precision, meaning that the matrix is singular to
 *                    working precision. Nevertheless, the solution and
 *                    error bounds are computed because there are a number
 *                    of situations where the computed solution can be more
 *                    accurate than the value of RCOND would suggest.   
 *              > A->ncol+1: number of bytes allocated when memory allocation
 *                    failure occurred, plus A->ncol.
 *
 */

    DNformat  *Bstore, *Xstore;
    double    *Bmat, *Xmat;
    int       ldb, ldx, nrhs;
    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int       colequ, equil, nofact, notran, rowequ, permc_spec;
    trans_t   trant;
    char      norm[1];
    int       i, j, info1;
    double    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
    int       relax, panel_size;
    double    drop_tol;
    double    t0;      /* temporary time */
    double    *utime;

    /* External functions */
    extern double dlangs(char *, SuperMatrix *);
    extern double hypre_F90_NAME_LAPACK(dlamch,DLAMCH)(const char *);

    Bstore = (DNformat*) B->Store;
    Xstore = (DNformat*) X->Store;
    Bmat   = (  double*) Bstore->nzval;
    Xmat   = (  double*) Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;

    *info = 0;
    nofact = (options->Fact != FACTORED);
    equil = (options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    if ( nofact ) {
	*(unsigned char *)equed = 'N';
	rowequ = FALSE;
	colequ = FALSE;
    } else {
	rowequ = superlu_lsame(equed, "R") || superlu_lsame(equed, "B");
	colequ = superlu_lsame(equed, "C") || superlu_lsame(equed, "B");
        smlnum = hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("Safe minimum");
	bignum = 1. / smlnum;
    }

#if 0
printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n",
       options->Fact, options->Trans, *equed);
#endif

    /* Test the input parameters */
    if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern &&
	options->Fact != SamePattern_SameRowPerm &&
	!notran && options->Trans != TRANS && options->Trans != CONJ &&
	!equil && options->Equil != NO)
	*info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
	      A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -2;
    else if (options->Fact == FACTORED &&
	     !(rowequ || colequ || superlu_lsame(equed, "N")))
	*info = -6;
    else {
	if (rowequ) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, R[j]);
		rcmax = SUPERLU_MAX(rcmax, R[j]);
	    }
	    if (rcmin <= 0.) *info = -7;
	    else if ( A->nrow > 0)
		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else rowcnd = 1.;
	}
	if (colequ && *info == 0) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, C[j]);
		rcmax = SUPERLU_MAX(rcmax, C[j]);
	    }
	    if (rcmin <= 0.) *info = -8;
	    else if (A->nrow > 0)
		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else colcnd = 1.;
	}
	if (*info == 0) {
	    if ( lwork < -1 ) *info = -12;
	    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      B->Stype != SLU_DN || B->Dtype != SLU_D || 
		      B->Mtype != SLU_GE )
		*info = -13;
	    else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      (B->ncol != 0 && B->ncol != X->ncol) ||
                      X->Stype != SLU_DN ||
		      X->Dtype != SLU_D || X->Mtype != SLU_GE )
		*info = -14;
	}
    }
    if (*info != 0) {
	i = -(*info);
	superlu_xerbla("dgssvx", &i);
	return;
    }
    
    /* Initialization for factor parameters */
    panel_size = sp_ienv(1);
    relax      = sp_ienv(2);
    drop_tol   = 0.0;

    utime = stat->utime;
    
    /* Convert A to SLU_NC format when necessary. */
    if ( A->Stype == SLU_NR ) {
	NRformat *Astore = (NRformat*) A->Store;
	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
	dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
			       (double*) Astore->nzval, Astore->colind, Astore->rowptr,
			       SLU_NC, A->Dtype, A->Mtype);
	if ( notran ) { /* Reverse the transpose argument. */
	    trant = TRANS;
	    notran = 0;
	} else {
	    trant = NOTRANS;
	    notran = 1;
	}
    } else { /* A->Stype == SLU_NC */
	trant = options->Trans;
	AA = A;
    }

    if ( nofact && equil ) {
	t0 = SuperLU_timer_();
	/* Compute row and column scalings to equilibrate the matrix A. */
	dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
	
	if ( info1 == 0 ) {
	    /* Equilibrate matrix A. */
	    dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
	    rowequ = superlu_lsame(equed, "R") || superlu_lsame(equed, "B");
	    colequ = superlu_lsame(equed, "C") || superlu_lsame(equed, "B");
	}
	utime[EQUIL] = SuperLU_timer_() - t0;
    }

    if ( nrhs > 0 ) {
        /* Scale the right hand side if equilibration was performed. */
        if ( notran ) {
	    if ( rowequ ) {
	        for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i) {
		        Bmat[i + j*ldb] *= R[i];
	            }
	    }
        } else if ( colequ ) {
	    for (j = 0; j < nrhs; ++j)
	        for (i = 0; i < A->nrow; ++i) {
	            Bmat[i + j*ldb] *= C[i];
	        }
        }
    }

    if ( nofact ) {
	
        t0 = SuperLU_timer_();
	/*
	 * Gnet column permutation vector perm_c[], according to permc_spec:
	 *   permc_spec = NATURAL:  natural ordering 
	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
	 */
	permc_spec = options->ColPerm;
	if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
            get_perm_c(permc_spec, AA, perm_c);
	utime[COLPERM] = SuperLU_timer_() - t0;

	t0 = SuperLU_timer_();
	sp_preorder(options, AA, perm_c, etree, &AC);
	utime[ETREE] = SuperLU_timer_() - t0;
    
/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
	       relax, panel_size, sp_ienv(3), sp_ienv(4));
	fflush(stdout); */
	
	/* Compute the LU factorization of A*Pc. */
	t0 = SuperLU_timer_();
	dgstrf(options, &AC, drop_tol, relax, panel_size,
	       etree, work, lwork, perm_c, perm_r, L, U, stat, info);
	utime[FACT] = SuperLU_timer_() - t0;
	
	if ( lwork == -1 ) {
	    mem_usage->total_needed = *info - A->ncol;
	    return;
	}
    }

    if ( options->PivotGrowth ) {
        if ( *info > 0 ) {
	    if ( *info <= A->ncol ) {
	        /* Compute the reciprocal pivot growth factor of the leading
	           rank-deficient *info columns of A. */
	        *recip_pivot_growth = dPivotGrowth(*info, AA, perm_c, L, U);
	    }
	    return;
        }

        /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
        *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U);
    }

    if ( options->ConditionNumber ) {
        /* Estimate the reciprocal of the condition number of A. */
        t0 = SuperLU_timer_();
        if ( notran ) {
	    *(unsigned char *)norm = '1';
        } else {
	    *(unsigned char *)norm = 'I';
        }
        anorm = dlangs(norm, AA);
        dgscon(norm, L, U, anorm, rcond, stat, info);
        utime[RCOND] = SuperLU_timer_() - t0;
    }
    
    if ( nrhs > 0 ) {
        /* Compute the solution matrix X. */
        for (j = 0; j < nrhs; j++)  /* Save a copy of the right hand sides */
            for (i = 0; i < B->nrow; i++)
	        Xmat[i + j*ldx] = Bmat[i + j*ldb];
    
        t0 = SuperLU_timer_();
        dgstrs (trant, L, U, perm_c, perm_r, X, stat, info);
        utime[SOLVE] = SuperLU_timer_() - t0;
    
        /* Use iterative refinement to improve the computed solution and compute
           error bounds and backward error estimates for it. */
        t0 = SuperLU_timer_();
        if ( options->IterRefine != NOREFINE ) {
            dgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B,
                   X, ferr, berr, stat, info);
        } else {
            for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0;
        }
        utime[REFINE] = SuperLU_timer_() - t0;

        /* Transform the solution matrix X to a solution of the original system. */
        if ( notran ) {
	    if ( colequ ) {
	        for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i) {
                        Xmat[i + j*ldx] *= C[i];
	            }
	    }
        } else if ( rowequ ) {
	    for (j = 0; j < nrhs; ++j)
	        for (i = 0; i < A->nrow; ++i) {
	            Xmat[i + j*ldx] *= R[i];
                }
        }
    } /* end if nrhs > 0 */

    if ( options->ConditionNumber ) {
        /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
       if (*rcond < hypre_F90_NAME_LAPACK(dlamch,DLAMCH)("E")) *info=A->ncol+1;
    }

    if ( nofact ) {
        dQuerySpace(L, U, mem_usage);
        Destroy_CompCol_Permuted(&AC);
    }
    if ( A->Stype == SLU_NR ) {
	Destroy_SuperMatrix_Store(AA);
	SUPERLU_FREE(AA);
    }

}
Ejemplo n.º 27
0
float
cPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, 
             SuperMatrix *L, SuperMatrix *U)
{
/*
 * Purpose
 * =======
 *
 * Compute the reciprocal pivot growth factor of the leading ncols columns
 * of the matrix, using the formula:
 *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
 *
 * Arguments
 * =========
 *
 * ncols    (input) int
 *          The number of columns of matrices A, L and U.
 *
 * A        (input) SuperMatrix*
 *	    Original matrix A, permuted by columns, of dimension
 *          (A->nrow, A->ncol). The type of A can be:
 *          Stype = NC; Dtype = SLU_C; Mtype = GE.
 *
 * L        (output) SuperMatrix*
 *          The factor L from the factorization Pr*A=L*U; use compressed row 
 *          subscripts storage for supernodes, i.e., L has type: 
 *          Stype = SC; Dtype = SLU_C; Mtype = TRLU.
 *
 * U        (output) SuperMatrix*
 *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
 *          storage scheme, i.e., U has types: Stype = NC;
 *          Dtype = SLU_C; Mtype = TRU.
 *
 */
    NCformat *Astore;
    SCformat *Lstore;
    NCformat *Ustore;
    complex  *Aval, *Lval, *Uval;
    int      fsupc, nsupr, luptr, nz_in_U;
    int      i, j, k, oldcol;
    int      *inv_perm_c;
    float   rpg, maxaj, maxuj;
    extern   double slamch_(char *);
    float   smlnum;
    complex   *luval;
    complex   temp_comp;
   
    /* Get machine constants. */
    smlnum = slamch_("S");
    rpg = 1. / smlnum;

    Astore = A->Store;
    Lstore = L->Store;
    Ustore = U->Store;
    Aval = Astore->nzval;
    Lval = Lstore->nzval;
    Uval = Ustore->nzval;
    
    inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int));
    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;

    for (k = 0; k <= Lstore->nsuper; ++k) {
	fsupc = L_FST_SUPC(k);
	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
	luptr = L_NZ_START(fsupc);
	luval = &Lval[luptr];
	nz_in_U = 1;
	
	for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) {
	    maxaj = 0.;
            oldcol = inv_perm_c[j];
	    for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i)
		maxaj = SUPERLU_MAX( maxaj, slu_c_abs1( &Aval[i]) );
	
	    maxuj = 0.;
	    for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++)
		maxuj = SUPERLU_MAX( maxuj, slu_c_abs1( &Uval[i]) );
	    
	    /* Supernode */
	    for (i = 0; i < nz_in_U; ++i)
		maxuj = SUPERLU_MAX( maxuj, slu_c_abs1( &luval[i]) );

	    ++nz_in_U;
	    luval += nsupr;

	    if ( maxuj == 0. )
		rpg = SUPERLU_MIN( rpg, 1.);
	    else
		rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
	}
	
	if ( j >= ncols ) break;
    }

    SUPERLU_FREE(inv_perm_c);
    return (rpg);
}
int
static_schedule(superlu_options_t * options, int m, int n, 
		LUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat,
		int_t *perm_c_supno, int_t *iperm_c_supno, int *info)
{
    int_t *xsup;
    int_t  i, ib, jb, lb, nlb, il, iu;
    int_t Pc, Pr;
    int iam, krow, yourcol, mycol, myrow; 
    int j, k, nsupers;  /* k - current panel to work on */
    int_t *index;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    int ncb, nrb, p, pr, pc, nblocks;
    int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows,
        *Lblock, *Lrows, *sf_block, *sf_block_l, *nnodes_l,
        *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno;
    float edag_supno_l_bytes;
    int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows;
    etree_node *head, *tail, *ptr;
    int *num_child;

    int iword = sizeof (int_t);

    /* Test the input parameters. */
    *info = 0;
    if (m < 0) *info = -2;
    else if (n < 0) *info = -3;
    if (*info) {
        pxerbla ("pdgstrf", grid, -*info);
        return (-1);
    }

    /* Quick return if possible. */
    if (m == 0 || n == 0) return 0;
 
    /* 
     * Initialization.  
     */
    iam = grid->iam;
    Pc = grid->npcol; 
    Pr = grid->nprow;
    myrow = MYROW (iam, grid);
    mycol = MYCOL (iam, grid);
    nsupers = Glu_persist->supno[n - 1] + 1;
    xsup = Glu_persist->xsup;
    nblocks = 0;
    ncb = nsupers / Pc;
    nrb = nsupers / Pr;

#if ( DEBUGlevel >= 1 ) 
    print_memorylog(stat, "before static schedule");
#endif

    /* ================================================== *
     * static scheduling of j-th step of LU-factorization *
     * ================================================== */
    if (options->lookahead_etree == YES &&  /* use e-tree of symmetrized matrix and */
        (options->ParSymbFact == NO ||  /* 1) symmetric fact with serial symbolic, or */
         (options->SymPattern == YES && /* 2) symmetric pattern, and                  */
          options->RowPerm == NOROWPERM))) { /* no rowperm to destroy symmetry */

        /* if symmetric pattern or using e-tree of |A^T|+|A|,
           then we can use a simple tree structure for static schduling */

        if (options->ParSymbFact == NO) {
            /* Use the etree computed from serial symb. fact., and turn it
               into supernodal tree.  */
            int_t *etree = LUstruct->etree;
#if ( PRNTlevel>=1 )
            if (grid->iam == 0)
                printf (" === using column e-tree ===\n");
#endif

            /* look for the first off-diagonal blocks */
            etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t));
	    log_memory(nsupers * iword, stat);

            for (i = 0; i < nsupers; i++) etree_supno[i] = nsupers;

            for (j = 0, lb = 0; lb < nsupers; lb++) {
                for (k = 0; k < SuperSize (lb); k++) {
                    jb = Glu_persist->supno[etree[j + k]];
                    if (jb != lb)
                        etree_supno[lb] = SUPERLU_MIN (etree_supno[lb], jb);
                }
                j += SuperSize (lb);
            }
        } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */
            /* Compute an "etree" based on struct(L),
               assuming struct(U) = struct(L').   */
#if ( PRNTlevel>=1 )
            if (grid->iam == 0)
                printf (" === using supernodal e-tree ===\n");
#endif

            /* find the first block in each supernodal-column of local L-factor */
            etree_supno_l = SUPERLU_MALLOC (nsupers * sizeof (int_t));
	    log_memory(nsupers * iword, stat);

            for (i = 0; i < nsupers; i++) etree_supno_l[i] = nsupers;
            for (lb = 0; lb < ncb; lb++) {
                jb = lb * grid->npcol + mycol;
                index = Llu->Lrowind_bc_ptr[lb];
                if (index) {   /* Not an empty column */
                    i = index[0];
                    k = BC_HEADER;
                    krow = PROW (jb, grid);
                    if (krow == myrow) {  /* skip the diagonal block */
                        k += LB_DESCRIPTOR + index[k + 1];
                        i--;
                    }
                    if (i > 0)
                    {
                        etree_supno_l[jb] = index[k];
                        k += LB_DESCRIPTOR + index[k + 1];
                        i--;
                    }

                    for (j = 0; j < i; j++)
                    {
                        etree_supno_l[jb] =
                            SUPERLU_MIN (etree_supno_l[jb], index[k]);
                        k += LB_DESCRIPTOR + index[k + 1];
                    }
                }
            }
            if (mycol < nsupers % grid->npcol) {
                jb = ncb * grid->npcol + mycol;
                index = Llu->Lrowind_bc_ptr[ncb];
                if (index) {     /* Not an empty column */
                    i = index[0];
                    k = BC_HEADER;
                    krow = PROW (jb, grid);
                    if (krow == myrow) { /* skip the diagonal block */
                        k += LB_DESCRIPTOR + index[k + 1];
                        i--;
                    }
                    if (i > 0) {
                        etree_supno_l[jb] = index[k];
                        k += LB_DESCRIPTOR + index[k + 1];
                        i--;
                    }
                    for (j = 0; j < i; j++) {
                        etree_supno_l[jb] =
                            SUPERLU_MIN (etree_supno_l[jb], index[k]);
                        k += LB_DESCRIPTOR + index[k + 1];
                    }
                }
            }

            /* form global e-tree */
            etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t));

            MPI_Allreduce (etree_supno_l, etree_supno, nsupers, mpi_int_t,
                           MPI_MIN, grid->comm);

            SUPERLU_FREE (etree_supno_l);
        }

        /* initialize number of children for each node */
        num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t));
        for (i = 0; i < nsupers; i++) num_child[i] = 0;
        for (i = 0; i < nsupers; i++)
            if (etree_supno[i] != nsupers)  num_child[etree_supno[i]]++;

        /* push initial leaves to the fifo queue */
        nnodes = 0;
        for (i = 0; i < nsupers; i++) {
            if (num_child[i] == 0) {
                ptr = SUPERLU_MALLOC (sizeof (etree_node));
                ptr->id = i;
                ptr->next = NULL;
                /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */
                nnodes++;

                if (nnodes == 1) {
                    head = ptr;
                    tail = ptr;
                } else {
                    tail->next = ptr;
                    tail = ptr;
                }
            }
        }

        /* process fifo queue, and compute the ordering */
        i = 0;

        while (nnodes > 0) {
            ptr = head;
            j = ptr->id;
            head = ptr->next;
            perm_c_supno[i] = j;
            SUPERLU_FREE (ptr);
            i++;
            nnodes--;

            if (etree_supno[j] != nsupers) {
                num_child[etree_supno[j]]--;
                if (num_child[etree_supno[j]] == 0) {
                    nnodes++;

                    ptr = SUPERLU_MALLOC (sizeof (etree_node));
                    ptr->id = etree_supno[j];
                    ptr->next = NULL;

                    /*printf( "=== push %d ===\n",ptr->id ); */
                    if (nnodes == 1) {
                        head = ptr;
                        tail = ptr;
                    } else {
                        tail->next = ptr;
                        tail = ptr;
                    }
                }
            }
            /*printf( "\n" ); */
        }
        SUPERLU_FREE (num_child);
        SUPERLU_FREE (etree_supno);
	log_memory(-2 * nsupers * iword, stat);

    } else {         /* Unsymmetric pattern */

        /* Need to process both L- and U-factors, use the symmetrically
           pruned graph of L & U instead of tree (very naive implementation) */
        int nrbp1 = nrb + 1;
	float Ublock_bytes, Urows_bytes, Lblock_bytes, Lrows_bytes;

        /* allocate some workspace */
        if (! (sendcnts = SUPERLU_MALLOC ((4 + 2 * nrbp1) * Pr * Pc * sizeof (int))))
            ABORT ("Malloc fails for sendcnts[].");
	log_memory((4 + 2 * nrbp1) * Pr * Pc * sizeof (int), stat);

        sdispls = &sendcnts[Pr * Pc];
        recvcnts = &sdispls[Pr * Pc];
        rdispls = &recvcnts[Pr * Pc];
        srows = &rdispls[Pr * Pc];
        rrows = &srows[Pr * Pc * nrbp1];

        myrow = MYROW (iam, grid);
#if ( PRNTlevel>=1 )
        if (grid->iam == 0)
            printf (" === using DAG ===\n");
#endif

        /* send supno block of local U-factor to a processor *
         * who owns the corresponding block of L-factor      */

        /* srows   : # of block to send to a processor from each supno row */
        /* sendcnts: total # of blocks to send to a processor              */
        for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0;
        for (p = 0; p < Pr * Pc; p++) sendcnts[p] = 0;

        /* sending blocks of U-factors corresponding to L-factors */
        /* count the number of blocks to send */
        for (lb = 0; lb < nrb; ++lb) {
            jb = lb * Pr + myrow;
            pc = jb % Pc;
            index = Llu->Ufstnz_br_ptr[lb];

            if (index) {         /* Not an empty row */
                k = BR_HEADER;
                nblocks += index[0];
                for (j = 0; j < index[0]; ++j) {
                    ib = index[k];
                    pr = ib % Pr;
                    p = pr * Pc + pc;
                    sendcnts[p]++;
                    srows[p * nrbp1 + lb]++;

                    k += UB_DESCRIPTOR + SuperSize (index[k]);
                }
            }
        }

        if (myrow < nsupers % grid->nprow) {
            jb = nrb * Pr + myrow;
            pc = jb % Pc;
            index = Llu->Ufstnz_br_ptr[nrb];

            if (index) {         /* Not an empty row */
                k = BR_HEADER;
                nblocks += index[0];
                for (j = 0; j < index[0]; ++j) {
                    ib = index[k];
                    pr = ib % Pr;
                    p = pr * Pc + pc;
                    sendcnts[p]++;
                    srows[p * nrbp1 + nrb]++;
                    k += UB_DESCRIPTOR + SuperSize (index[k]);
                }
            }
        }

        /* insert blocks to send */
        sdispls[0] = 0;
        for (p = 1; p < Pr * Pc; p++) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1];
        if (!(blocks = intMalloc_dist (nblocks)))
            ABORT ("Malloc fails for blocks[].");
	log_memory( nblocks * iword, stat );

        for (lb = 0; lb < nrb; ++lb) {
            jb = lb * Pr + myrow;
            pc = jb % Pc;
            index = Llu->Ufstnz_br_ptr[lb];

            if (index) {       /* Not an empty row */
                k = BR_HEADER;
                for (j = 0; j < index[0]; ++j) {
                    ib = index[k];
                    pr = ib % Pr;
                    p = pr * Pc + pc;
                    blocks[sdispls[p]] = ib;
                    sdispls[p]++;

                    k += UB_DESCRIPTOR + SuperSize (index[k]);
                }
            }
        }

        if (myrow < nsupers % grid->nprow) {
            jb = nrb * Pr + myrow;
            pc = jb % Pc;
            index = Llu->Ufstnz_br_ptr[nrb];

            if (index) {       /* Not an empty row */
                k = BR_HEADER;
                for (j = 0; j < index[0]; ++j) {
                    ib = index[k];
                    pr = ib % Pr;
                    p = pr * Pc + pc;
                    blocks[sdispls[p]] = ib;
                    sdispls[p]++;

                    k += UB_DESCRIPTOR + SuperSize (index[k]);
                }
            }
        }

        /* communication */
        MPI_Alltoall (sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm);
        MPI_Alltoall (srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm);

	log_memory( -(nblocks * iword), stat );  /* blocks[] to be freed soon */

        nblocks = recvcnts[0];
        rdispls[0] = sdispls[0] = 0;
        for (p = 1; p < Pr * Pc; p++) {
            rdispls[p] = rdispls[p - 1] + recvcnts[p - 1];
            sdispls[p] = sdispls[p - 1] + sendcnts[p - 1];
            nblocks += recvcnts[p];
        }

        if (!(blockr = intMalloc_dist (nblocks))) ABORT ("Malloc fails for blockr[].");
	log_memory( nblocks * iword, stat );

        MPI_Alltoallv (blocks, sendcnts, sdispls, mpi_int_t, blockr, recvcnts,
                       rdispls, mpi_int_t, grid->comm);

        SUPERLU_FREE (blocks); /* memory logged before */

	
        /* store the received U-blocks by rows */
        nlb = nsupers / Pc;
        if (!(Ublock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Ublock[].");
        if (!(Urows = intMalloc_dist (1 + nlb))) ABORT ("Malloc fails for Urows[].");

	Ublock_bytes = nblocks * iword;
	Urows_bytes = (1 + nlb) * iword;
	log_memory( Ublock_bytes + Urows_bytes, stat );

        k = 0;
        for (jb = 0; jb < nlb; jb++) {
            j = jb * Pc + mycol;
            pr = j % Pr;
            lb = j / Pr;
            Urows[jb] = 0;

            for (pc = 0; pc < Pc; pc++) {
                p = pr * Pc + pc; /* the processor owning this block of U-factor */

                for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb];
                     i++) {
                    Ublock[k] = blockr[i];
                    k++;
                    Urows[jb]++;
                }
                rdispls[p] += rrows[p * nrbp1 + lb];
            }
            /* sort by the column indices to make things easier for later on */

#ifdef ISORT
            isort1 (Urows[jb], &(Ublock[k - Urows[jb]]));
#else
            qsort (&(Ublock[k - Urows[jb]]), (size_t) (Urows[jb]),
                   sizeof (int_t), &superlu_sort_perm);
#endif
        }
        if (mycol < nsupers % grid->npcol) {
            j = nlb * Pc + mycol;
            pr = j % Pr;
            lb = j / Pr;
            Urows[nlb] = 0;

            for (pc = 0; pc < Pc; pc++) {
                p = pr * Pc + pc;
                for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb];
                     i++) {
                    Ublock[k] = blockr[i];
                    k++;
                    Urows[nlb]++;
                }
                rdispls[p] += rrows[p * nrb + lb];
            }
#ifdef ISORT
            isort1 (Urows[nlb], &(Ublock[k - Urows[nlb]]));
#else
            qsort (&(Ublock[k - Urows[nlb]]), (size_t) (Urows[nlb]),
                   sizeof (int_t), &superlu_sort_perm);
#endif
        }
        SUPERLU_FREE (blockr);
	log_memory( -nblocks * iword, stat );

        /* sort the block in L-factor */
        nblocks = 0;
        for (lb = 0; lb < ncb; lb++) {
            jb = lb * Pc + mycol;
            index = Llu->Lrowind_bc_ptr[lb];
            if (index) {        /* Not an empty column */
                nblocks += index[0];
            }
        }
        if (mycol < nsupers % grid->npcol) {
            jb = ncb * Pc + mycol;
            index = Llu->Lrowind_bc_ptr[ncb];
            if (index) {       /* Not an empty column */
                nblocks += index[0];
            }
        }

        if (!(Lblock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Lblock[].");
        if (!(Lrows = intMalloc_dist (1 + ncb))) ABORT ("Malloc fails for Lrows[].");

	Lblock_bytes = nblocks * iword;
	Lrows_bytes = (1 + ncb) * iword;
	log_memory(Lblock_bytes + Lrows_bytes, stat);

        for (lb = 0; lb <= ncb; lb++) Lrows[lb] = 0;
        nblocks = 0;
        for (lb = 0; lb < ncb; lb++) {
            Lrows[lb] = 0;

            jb = lb * Pc + mycol;
            index = Llu->Lrowind_bc_ptr[lb];
            if (index) {      /* Not an empty column */
                i = index[0];
                k = BC_HEADER;
                krow = PROW (jb, grid);
                if (krow == myrow)  /* skip the diagonal block */
                {
                    k += LB_DESCRIPTOR + index[k + 1];
                    i--;
                }

                for (j = 0; j < i; j++) {
                    Lblock[nblocks] = index[k];
                    Lrows[lb]++;
                    nblocks++;

                    k += LB_DESCRIPTOR + index[k + 1];
                }
            }
#ifdef ISORT
            isort1 (Lrows[lb], &(Lblock[nblocks - Lrows[lb]]));
#else
            qsort (&(Lblock[nblocks - Lrows[lb]]), (size_t) (Lrows[lb]),
                   sizeof (int_t), &superlu_sort_perm);
#endif
        }
        if (mycol < nsupers % grid->npcol) {
            Lrows[ncb] = 0;
            jb = ncb * Pc + mycol;
            index = Llu->Lrowind_bc_ptr[ncb];
            if (index) {       /* Not an empty column */
                i = index[0];
                k = BC_HEADER;
                krow = PROW (jb, grid);
                if (krow == myrow) { /* skip the diagonal block */
                    k += LB_DESCRIPTOR + index[k + 1];
                    i--;
                }
                for (j = 0; j < i; j++) {
                    Lblock[nblocks] = index[k];
                    Lrows[ncb]++;
                    nblocks++;
                    k += LB_DESCRIPTOR + index[k + 1];
                }
#ifdef ISORT
                isort1 (Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]]));
#else
                qsort (&(Lblock[nblocks - Lrows[ncb]]), (size_t) (Lrows[ncb]),
                       sizeof (int_t), &superlu_sort_perm);
#endif
            }
        }

        /* look for the first local symmetric nonzero block match */
        if (!(sf_block = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block[].");
        if (!(sf_block_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block_l[].");

	log_memory( 2 * nsupers * iword, stat );

        for (lb = 0; lb < nsupers; lb++)
            sf_block_l[lb] = nsupers;
        i = 0;
        j = 0;
        for (jb = 0; jb < nlb; jb++) {
            if (Urows[jb] > 0) {
                ib = i + Urows[jb];
                lb = jb * Pc + mycol;
                for (k = 0; k < Lrows[jb]; k++) {
                    while (Ublock[i] < Lblock[j] && i + 1 < ib)
                        i++;

                    if (Ublock[i] == Lblock[j]) {
                        sf_block_l[lb] = Lblock[j];
                        j += (Lrows[jb] - k);
                        k = Lrows[jb];
                    } else {
                        j++;
                    }
                }
                i = ib;
            } else {
                j += Lrows[jb];
            }
        }
        if (mycol < nsupers % grid->npcol) {
            if (Urows[nlb] > 0) {
                ib = i + Urows[nlb];
                lb = nlb * Pc + mycol;
                for (k = 0; k < Lrows[nlb]; k++) {
                    while (Ublock[i] < Lblock[j] && i + 1 < ib)
                        i++;

                    if (Ublock[i] == Lblock[j])
                    {
                        sf_block_l[lb] = Lblock[j];
                        j += (Lrows[nlb] - k);
                        k = Lrows[nlb];
                    }
                    else
                    {
                        j++;
                    }
                }
                i = ib;
            } else {
                j += Lrows[nlb];
            }
        }

        /* compute the first global symmetric matchs */
        MPI_Allreduce (sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN,
                       grid->comm);
        SUPERLU_FREE (sf_block_l);
	log_memory( -nsupers * iword, stat );

        /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */
        if (!(nnodes_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_l[].");
        if (!(nnodes_u = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_u[].");
	log_memory( 2 * nsupers * iword, stat );

        for (lb = 0; lb < nsupers; lb++)  nnodes_l[lb] = 0;
        for (lb = 0; lb < nsupers; lb++)  nnodes_u[lb] = 0;

        nblocks = 0;
        /* from U-factor */
        for (i = 0, jb = 0; jb < nlb; jb++) {
            lb = jb * Pc + mycol;
            ib = i + Urows[jb];
            while (i < ib) {
                if (Ublock[i] <= sf_block[lb]) {
                    nnodes_u[lb]++;
                    i++;
                    nblocks++;
                } else {     /* get out */
                    i = ib;
                }
            }
            i = ib;
        }
        if (mycol < nsupers % grid->npcol) {
            lb = nlb * Pc + mycol;
            ib = i + Urows[nlb];
            while (i < ib) {
                if (Ublock[i] <= sf_block[lb]) {
                    nnodes_u[lb]++;
                    i++;
                    nblocks++;
                } else {         /* get out */
                    i = ib;
                }
            }
            i = ib;
        }

        /* from L-factor */
        for (i = 0, jb = 0; jb < nlb; jb++) {
            lb = jb * Pc + mycol;
            ib = i + Lrows[jb];
            while (i < ib) {
                if (Lblock[i] < sf_block[lb]) {
                    nnodes_l[lb]++;
                    i++;
                    nblocks++;
                } else {
                    i = ib;
                }
            }
            i = ib;
        }
        if (mycol < nsupers % grid->npcol) {
            lb = nlb * Pc + mycol;
            ib = i + Lrows[nlb];
            while (i < ib) {
                if (Lblock[i] < sf_block[lb]) {
                    nnodes_l[lb]++;
                    i++;
                    nblocks++;
                } else {
                    i = ib;
                }
            }
            i = ib;
        }

#ifdef USE_ALLGATHER
        /* insert local nodes in DAG */
        if (!(edag_supno_l = intMalloc_dist (nsupers + nblocks)))
            ABORT ("Malloc fails for edag_supno_l[].");
	edag_supno_l_bytes = (nsupers + nblocks) * iword;
	log_memory(edag_supno_l_bytes, stat);

        iu = il = nblocks = 0;
        for (lb = 0; lb < nsupers; lb++) {
            j = lb / Pc;
            pc = lb % Pc;

            edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb];
            nblocks++;
            if (mycol == pc) {
                /* from U-factor */
                ib = iu + Urows[j];
                for (jb = 0; jb < nnodes_u[lb]; jb++) {
                    edag_supno_l[nblocks] = Ublock[iu];
                    iu++;
                    nblocks++;
                }
                iu = ib;

                /* from L-factor */
                ib = il + Lrows[j];
                for (jb = 0; jb < nnodes_l[lb]; jb++) {
                    edag_supno_l[nblocks] = Lblock[il];
                    il++;
                    nblocks++;
                }
                il = ib;
            }
        }
        SUPERLU_FREE (nnodes_u);
	log_memory(-nsupers * iword, stat);

        /* form global DAG on each processor */
        MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT,
                       grid->comm);
        nblocks = recvcnts[0];
        rdispls[0] = 0;
        for (lb = 1; lb < Pc * Pr; lb++) {
            rdispls[lb] = nblocks;
            nblocks += recvcnts[lb];
        }
        if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[].");
	log_memory(nblocks * iword, stat);

        MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t,
                        recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm);
        SUPERLU_FREE (edag_supno_l);
	log_memory(-edag_supno_l_bytes, stat);

        if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *))))
            ABORT ("Malloc fails for edag_supno[].");
	log_memory(nsupers * iword, stat);

        k = 0;
        for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0;
        for (p = 0; p < Pc * Pr; p++) {
            for (lb = 0; lb < nsupers; lb++) {
                nnodes_l[lb] += recvbuf[k];
                k += (1 + recvbuf[k]);
            }
        }
        for (lb = 0; lb < nsupers; lb++) {
            if (nnodes_l[lb] > 0)
                if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb])))
                    ABORT ("Malloc fails for edag_supno[lb].");
            nnodes_l[lb] = 0;
        }
        k = 0;
        for (p = 0; p < Pc * Pr; p++) {
            for (lb = 0; lb < nsupers; lb++) {
                jb = k + recvbuf[k] + 1;
                k++;
                for (; k < jb; k++) {
                    edag_supno[lb][nnodes_l[lb]] = recvbuf[k];
                    nnodes_l[lb]++;
                }
            }
        }
        SUPERLU_FREE (recvbuf);
	log_memory(-nblocks * iword, stat);

#else   /* not USE_ALLGATHER */
        int nlsupers = nsupers / Pc;
        if (mycol < nsupers % Pc) nlsupers++;

        /* insert local nodes in DAG */
        if (!(edag_supno_l = intMalloc_dist (nlsupers + nblocks)))
            ABORT ("Malloc fails for edag_supno_l[].");
	edag_supno_l_bytes = (nlsupers + nblocks) * iword;
	log_memory(edag_supno_l_bytes, stat);

        iu = il = nblocks = 0;
        for (lb = 0; lb < nsupers; lb++) {
            j = lb / Pc;
            pc = lb % Pc;
            if (mycol == pc) {
                edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb];
                nblocks++;
                /* from U-factor */
                ib = iu + Urows[j];
                for (jb = 0; jb < nnodes_u[lb]; jb++) {
                    edag_supno_l[nblocks] = Ublock[iu];
                    iu++;
                    nblocks++;
                }
                iu = ib;

                /* from L-factor */
                ib = il + Lrows[j];
                for (jb = 0; jb < nnodes_l[lb]; jb++) {
                    edag_supno_l[nblocks] = Lblock[il];
                    il++;
                    nblocks++;
                }
                il = ib;
            } else if (nnodes_l[lb] + nnodes_u[lb] != 0)
                printf (" # %d: nnodes[%d]=%d+%d\n", grid->iam, lb,
                        nnodes_l[lb], nnodes_u[lb]);
        }
        SUPERLU_FREE (nnodes_u);
	log_memory(-nsupers * iword, stat);

        /* form global DAG on each processor */  
        MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm);
        nblocks = recvcnts[0];
        rdispls[0] = 0;
        for (lb = 1; lb < Pc * Pr; lb++) {
            rdispls[lb] = nblocks;
            nblocks += recvcnts[lb];
        }
        if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[].");
	log_memory(nblocks * iword, stat);

        MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t,
                        recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm);

        SUPERLU_FREE (edag_supno_l);
	log_memory(-edag_supno_l_bytes, stat);

        if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *))))
            ABORT ("Malloc fails for edag_supno[].");
	log_memory(nsupers * sizeof(int_t *), stat);

        k = 0;
        for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0;
        for (p = 0; p < Pc * Pr; p++) {
            yourcol = MYCOL (p, grid);

            for (lb = 0; lb < nsupers; lb++) {
                j = lb / Pc;
                pc = lb % Pc;
                if (yourcol == pc) {
                    nnodes_l[lb] += recvbuf[k];
                    k += (1 + recvbuf[k]);
                }
            }
        }
        for (lb = 0; lb < nsupers; lb++) {
            if (nnodes_l[lb] > 0)
                if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb])))
                    ABORT ("Malloc fails for edag_supno[lb].");
            nnodes_l[lb] = 0;
        }
        k = 0;
        for (p = 0; p < Pc * Pr; p++) {
            yourcol = MYCOL (p, grid);

            for (lb = 0; lb < nsupers; lb++) {
                j = lb / Pc;
                pc = lb % Pc;
                if (yourcol == pc)
                {
                    jb = k + recvbuf[k] + 1;
                    k++;
                    for (; k < jb; k++)
                    {
                        edag_supno[lb][nnodes_l[lb]] = recvbuf[k];
                        nnodes_l[lb]++;
                    }
                }
            }
        }
        SUPERLU_FREE (recvbuf);
	log_memory( -nblocks * iword , stat);

#endif  /* end USE_ALL_GATHER */

        /* initialize the num of child for each node */
        num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t));
        for (i = 0; i < nsupers; i++) num_child[i] = 0;
        for (i = 0; i < nsupers; i++) {
            for (jb = 0; jb < nnodes_l[i]; jb++) {
                num_child[edag_supno[i][jb]]++;
            }
        }

        /* push initial leaves to the fifo queue */
        nnodes = 0;
        for (i = 0; i < nsupers; i++) {
            if (num_child[i] == 0) {
                ptr = SUPERLU_MALLOC (sizeof (etree_node));
                ptr->id = i;
                ptr->next = NULL;
                /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */
                nnodes++;

                if (nnodes == 1) {
                    head = ptr;
                    tail = ptr;
                } else {
                    tail->next = ptr;
                    tail = ptr;
                }
            }
        }

        /* process fifo queue, and compute the ordering */
        i = 0;

        while (nnodes > 0) {
            /*printf( "=== pop %d (%d) ===\n",head->id,i ); */
            ptr = head;
            j = ptr->id;
            head = ptr->next;

            perm_c_supno[i] = j;
            SUPERLU_FREE (ptr);
            i++;
            nnodes--;

            for (jb = 0; jb < nnodes_l[j]; jb++) {
                num_child[edag_supno[j][jb]]--;
                if (num_child[edag_supno[j][jb]] == 0) {
                    nnodes++;

                    ptr = SUPERLU_MALLOC (sizeof (etree_node));
                    ptr->id = edag_supno[j][jb];
                    ptr->next = NULL;

                    /*printf( "=== push %d ===\n",ptr->id ); */
                    if (nnodes == 1) {
                        head = ptr;
                        tail = ptr;
                    } else {
                        tail->next = ptr;
                        tail = ptr;
                    }
                }
            }
            /*printf( "\n" ); */
        }
        for (lb = 0; lb < nsupers; lb++)
            if (nnodes_l[lb] > 0)  SUPERLU_FREE (edag_supno[lb]);

        SUPERLU_FREE (num_child);
        SUPERLU_FREE (edag_supno);
        SUPERLU_FREE (nnodes_l);
        SUPERLU_FREE (sf_block);
        SUPERLU_FREE (sendcnts);

	log_memory(-(4 * nsupers + (4 + 2 * nrbp1)*Pr*Pc) * iword, stat);

        SUPERLU_FREE (Ublock);
        SUPERLU_FREE (Urows);
        SUPERLU_FREE (Lblock);
        SUPERLU_FREE (Lrows);
	log_memory(-(Ublock_bytes + Urows_bytes + Lblock_bytes + Lrows_bytes), stat);
    }
    /* ======================== *
     * end of static scheduling *
     * ======================== */

    for (lb = 0; lb < nsupers; lb++) iperm_c_supno[perm_c_supno[lb]] = lb;

#if ( DEBUGlevel >= 1 )
    print_memorylog(stat, "after static schedule");
#endif

    return 0;
} /* STATIC_SCHEDULE */
Ejemplo n.º 29
0
int main(int argc, char *argv[])
{
    void smatvec_mult(float alpha, float x[], float beta, float y[]);
    void spsolve(int n, float x[], float y[]);
    extern int sfgmr( int n,
	void (*matvec_mult)(float, float [], float, float []),
	void (*psolve)(int n, float [], float[]),
	float *rhs, float *sol, double tol, int restrt, int *itmax,
	FILE *fits);
    extern int sfill_diag(int n, NCformat *Astore);

    char     equed[1] = {'B'};
    yes_no_t equil;
    trans_t  trans;
    SuperMatrix A, L, U;
    SuperMatrix B, X;
    NCformat *Astore;
    NCformat *Ustore;
    SCformat *Lstore;
    GlobalLU_t	   Glu; /* facilitate multiple factorizations with 
                           SamePattern_SameRowPerm                  */
    float   *a;
    int      *asub, *xa;
    int      *etree;
    int      *perm_c; /* column permutation vector */
    int      *perm_r; /* row permutations from partial pivoting */
    int      nrhs, ldx, lwork, info, m, n, nnz;
    float   *rhsb, *rhsx, *xact;
    float   *work = NULL;
    float   *R, *C;
    float   u, rpg, rcond;
    float zero = 0.0;
    float one = 1.0;
    mem_usage_t   mem_usage;
    superlu_options_t options;
    SuperLUStat_t stat;
    FILE 	  *fp = stdin;

    int restrt, iter, maxit, i;
    double resid;
    float *x, *b;

#ifdef DEBUG
    extern int num_drop_L, num_drop_U;
#endif

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC("Enter main()");
#endif

    /* Defaults */
    lwork = 0;
    nrhs  = 1;
    trans = NOTRANS;

    /* Set the default input options:
	options.Fact = DOFACT;
	options.Equil = YES;
	options.ColPerm = COLAMD;
	options.DiagPivotThresh = 0.1; //different from complete LU
	options.Trans = NOTRANS;
	options.IterRefine = NOREFINE;
	options.SymmetricMode = NO;
	options.PivotGrowth = NO;
	options.ConditionNumber = NO;
	options.PrintStat = YES;
	options.RowPerm = LargeDiag;
	options.ILU_DropTol = 1e-4;
	options.ILU_FillTol = 1e-2;
	options.ILU_FillFactor = 10.0;
	options.ILU_DropRule = DROP_BASIC | DROP_AREA;
	options.ILU_Norm = INF_NORM;
	options.ILU_MILU = SILU;
     */
    ilu_set_default_options(&options);

    /* Modify the defaults. */
    options.PivotGrowth = YES;	  /* Compute reciprocal pivot growth */
    options.ConditionNumber = YES;/* Compute reciprocal condition number */

    if ( lwork > 0 ) {
	work = SUPERLU_MALLOC(lwork);
	if ( !work ) ABORT("Malloc fails for work[].");
    }

    /* Read matrix A from a file in Harwell-Boeing format.*/
    if (argc < 2)
    {
	printf("Usage:\n%s [OPTION] < [INPUT] > [OUTPUT]\nOPTION:\n"
		"-h -hb:\n\t[INPUT] is a Harwell-Boeing format matrix.\n"
		"-r -rb:\n\t[INPUT] is a Rutherford-Boeing format matrix.\n"
		"-t -triplet:\n\t[INPUT] is a triplet format matrix.\n",
		argv[0]);
	return 0;
    }
    else
    {
	switch (argv[1][1])
	{
	    case 'H':
	    case 'h':
		printf("Input a Harwell-Boeing format matrix:\n");
		sreadhb(fp, &m, &n, &nnz, &a, &asub, &xa);
		break;
	    case 'R':
	    case 'r':
		printf("Input a Rutherford-Boeing format matrix:\n");
		sreadrb(&m, &n, &nnz, &a, &asub, &xa);
		break;
	    case 'T':
	    case 't':
		printf("Input a triplet format matrix:\n");
		sreadtriple(&m, &n, &nnz, &a, &asub, &xa);
		break;
	    default:
		printf("Unrecognized format.\n");
		return 0;
	}
    }

    sCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa,
                                SLU_NC, SLU_S, SLU_GE);
    Astore = A.Store;
    sfill_diag(n, Astore);
    printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz);
    fflush(stdout);

    /* Generate the right-hand side */
    if ( !(rhsb = floatMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb[].");
    if ( !(rhsx = floatMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsx[].");
    sCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_S, SLU_GE);
    sCreate_Dense_Matrix(&X, m, nrhs, rhsx, m, SLU_DN, SLU_S, SLU_GE);
    xact = floatMalloc(n * nrhs);
    ldx = n;
    sGenXtrue(n, nrhs, xact, ldx);
    sFillRHS(trans, nrhs, xact, ldx, &A, &B);

    if ( !(etree = intMalloc(n)) ) ABORT("Malloc fails for etree[].");
    if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[].");
    if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[].");
    if ( !(R = (float *) SUPERLU_MALLOC(A.nrow * sizeof(float))) )
	ABORT("SUPERLU_MALLOC fails for R[].");
    if ( !(C = (float *) SUPERLU_MALLOC(A.ncol * sizeof(float))) )
	ABORT("SUPERLU_MALLOC fails for C[].");

    info = 0;
#ifdef DEBUG
    num_drop_L = 0;
    num_drop_U = 0;
#endif

    /* Initialize the statistics variables. */
    StatInit(&stat);

    /* Compute the incomplete factorization and compute the condition number
       and pivot growth using dgsisx. */
    B.ncol = 0;  /* not to perform triangular solution */
    sgsisx(&options, &A, perm_c, perm_r, etree, equed, R, C, &L, &U, work,
	   lwork, &B, &X, &rpg, &rcond, &Glu, &mem_usage, &stat, &info);

    /* Set RHS for GMRES. */
    if (!(b = floatMalloc(m))) ABORT("Malloc fails for b[].");
    if (*equed == 'R' || *equed == 'B') {
	for (i = 0; i < n; ++i) b[i] = rhsb[i] * R[i];
    } else {
	for (i = 0; i < m; i++) b[i] = rhsb[i];
    }

    printf("sgsisx(): info %d, equed %c\n", info, equed[0]);
    if (info > 0 || rcond < 1e-8 || rpg > 1e8)
	printf("WARNING: This preconditioner might be unstable.\n");

    if ( info == 0 || info == n+1 ) {
	if ( options.PivotGrowth == YES )
	    printf("Recip. pivot growth = %e\n", rpg);
	if ( options.ConditionNumber == YES )
	    printf("Recip. condition number = %e\n", rcond);
    } else if ( info > 0 && lwork == -1 ) {
	printf("** Estimated memory: %d bytes\n", info - n);
    }

    Lstore = (SCformat *) L.Store;
    Ustore = (NCformat *) U.Store;
    printf("n(A) = %d, nnz(A) = %d\n", n, Astore->nnz);
    printf("No of nonzeros in factor L = %d\n", Lstore->nnz);
    printf("No of nonzeros in factor U = %d\n", Ustore->nnz);
    printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n);
    printf("Fill ratio: nnz(F)/nnz(A) = %.3f\n",
	    ((double)(Lstore->nnz) + (double)(Ustore->nnz) - (double)n)
	    / (double)Astore->nnz);
    printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
	   mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
    fflush(stdout);

    /* Set the global variables. */
    GLOBAL_A = &A;
    GLOBAL_L = &L;
    GLOBAL_U = &U;
    GLOBAL_STAT = &stat;
    GLOBAL_PERM_C = perm_c;
    GLOBAL_PERM_R = perm_r;
    GLOBAL_OPTIONS = &options;
    GLOBAL_R = R;
    GLOBAL_C = C;
    GLOBAL_MEM_USAGE = &mem_usage;

    /* Set the options to do solve-only. */
    options.Fact = FACTORED;
    options.PivotGrowth = NO;
    options.ConditionNumber = NO;

    /* Set the variables used by GMRES. */
    restrt = SUPERLU_MIN(n / 3 + 1, 50);
    maxit = 1000;
    iter = maxit;
    resid = 1e-8;
    if (!(x = floatMalloc(n))) ABORT("Malloc fails for x[].");

    if (info <= n + 1)
    {
	int i_1 = 1;
	double maxferr = 0.0, nrmA, nrmB, res, t;
        float temp;
	extern float snrm2_(int *, float [], int *);
	extern void saxpy_(int *, float *, float [], int *, float [], int *);

	/* Initial guess */
	for (i = 0; i < n; i++) x[i] = zero;

	t = SuperLU_timer_();

	/* Call GMRES */
	sfgmr(n, smatvec_mult, spsolve, b, x, resid, restrt, &iter, stdout);

	t = SuperLU_timer_() - t;

	/* Output the result. */
	nrmA = snrm2_(&(Astore->nnz), (float *)((DNformat *)A.Store)->nzval,
		&i_1);
	nrmB = snrm2_(&m, b, &i_1);
	sp_sgemv("N", -1.0, &A, x, 1, 1.0, b, 1);
	res = snrm2_(&m, b, &i_1);
	resid = res / nrmB;
	printf("||A||_F = %.1e, ||B||_2 = %.1e, ||B-A*X||_2 = %.1e, "
		"relres = %.1e\n", nrmA, nrmB, res, resid);

	if (iter >= maxit)
	{
	    if (resid >= 1.0) iter = -180;
	    else if (resid > 1e-8) iter = -111;
	}
	printf("iteration: %d\nresidual: %.1e\nGMRES time: %.2f seconds.\n",
		iter, resid, t);

	/* Scale the solution back if equilibration was performed. */
	if (*equed == 'C' || *equed == 'B') 
	    for (i = 0; i < n; i++) x[i] *= C[i];

	for (i = 0; i < m; i++) {
	    maxferr = SUPERLU_MAX(maxferr, fabs(x[i] - xact[i]));
        }
	printf("||X-X_true||_oo = %.1e\n", maxferr);
    }
#ifdef DEBUG
    printf("%d entries in L and %d entries in U dropped.\n",
	    num_drop_L, num_drop_U);
#endif
    fflush(stdout);

    if ( options.PrintStat ) StatPrint(&stat);
    StatFree(&stat);

    SUPERLU_FREE (rhsb);
    SUPERLU_FREE (rhsx);
    SUPERLU_FREE (xact);
    SUPERLU_FREE (etree);
    SUPERLU_FREE (perm_r);
    SUPERLU_FREE (perm_c);
    SUPERLU_FREE (R);
    SUPERLU_FREE (C);
    Destroy_CompCol_Matrix(&A);
    Destroy_SuperMatrix_Store(&B);
    Destroy_SuperMatrix_Store(&X);
    if ( lwork >= 0 ) {
	Destroy_SuperNode_Matrix(&L);
	Destroy_CompCol_Matrix(&U);
    }
    SUPERLU_FREE(b);
    SUPERLU_FREE(x);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC("Exit main()");
#endif

    return 0;
}
Ejemplo n.º 30
0
void
zgsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
       int *etree, char *equed, double *R, double *C,
       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
       SuperMatrix *B, SuperMatrix *X,
       double *recip_pivot_growth, double *rcond,
       mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info)
{

    DNformat  *Bstore, *Xstore;
    doublecomplex    *Bmat, *Xmat;
    int       ldb, ldx, nrhs;
    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int       colequ, equil, nofact, notran, rowequ, permc_spec, mc64;
    trans_t   trant;
    char      norm[1];
    int       i, j, info1;
    double    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
    int       relax, panel_size;
    double    diag_pivot_thresh;
    double    t0;      /* temporary time */
    double    *utime;

    int *perm = NULL;

    /* External functions */
    extern double zlangs(char *, SuperMatrix *);

    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;

    *info = 0;
    nofact = (options->Fact != FACTORED);
    equil = (options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    mc64 = (options->RowPerm == LargeDiag);
    if ( nofact ) {
	*(unsigned char *)equed = 'N';
	rowequ = FALSE;
	colequ = FALSE;
    } else {
	rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	colequ = lsame_(equed, "C") || lsame_(equed, "B");
	smlnum = dlamch_("Safe minimum");
	bignum = 1. / smlnum;
    }

    /* Test the input parameters */
    if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern &&
	options->Fact != SamePattern_SameRowPerm &&
	!notran && options->Trans != TRANS && options->Trans != CONJ &&
	!equil && options->Equil != NO)
	*info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
	      A->Dtype != SLU_Z || A->Mtype != SLU_GE )
	*info = -2;
    else if (options->Fact == FACTORED &&
	     !(rowequ || colequ || lsame_(equed, "N")))
	*info = -6;
    else {
	if (rowequ) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, R[j]);
		rcmax = SUPERLU_MAX(rcmax, R[j]);
	    }
	    if (rcmin <= 0.) *info = -7;
	    else if ( A->nrow > 0)
		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else rowcnd = 1.;
	}
	if (colequ && *info == 0) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, C[j]);
		rcmax = SUPERLU_MAX(rcmax, C[j]);
	    }
	    if (rcmin <= 0.) *info = -8;
	    else if (A->nrow > 0)
		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else colcnd = 1.;
	}
	if (*info == 0) {
	    if ( lwork < -1 ) *info = -12;
	    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      B->Stype != SLU_DN || B->Dtype != SLU_Z || 
		      B->Mtype != SLU_GE )
		*info = -13;
	    else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      (B->ncol != 0 && B->ncol != X->ncol) ||
		      X->Stype != SLU_DN ||
		      X->Dtype != SLU_Z || X->Mtype != SLU_GE )
		*info = -14;
	}
    }
    if (*info != 0) {
	i = -(*info);
	xerbla_("zgsisx", &i);
	return;
    }

    /* Initialization for factor parameters */
    panel_size = sp_ienv(1);
    relax      = sp_ienv(2);
    diag_pivot_thresh = options->DiagPivotThresh;

    utime = stat->utime;

    /* Convert A to SLU_NC format when necessary. */
    if ( A->Stype == SLU_NR ) {
	NRformat *Astore = A->Store;
	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
	zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz,
			       Astore->nzval, Astore->colind, Astore->rowptr,
			       SLU_NC, A->Dtype, A->Mtype);
	if ( notran ) { /* Reverse the transpose argument. */
	    trant = TRANS;
	    notran = 0;
	} else {
	    trant = NOTRANS;
	    notran = 1;
	}
    } else { /* A->Stype == SLU_NC */
	trant = options->Trans;
	AA = A;
    }

    if ( nofact ) {
	register int i, j;
	NCformat *Astore = AA->Store;
	int nnz = Astore->nnz;
	int *colptr = Astore->colptr;
	int *rowind = Astore->rowind;
	doublecomplex *nzval = (doublecomplex *)Astore->nzval;
	int n = AA->nrow;

	if ( mc64 ) {
	    *equed = 'B';
	    rowequ = colequ = 1;
	    t0 = SuperLU_timer_();
	    if ((perm = intMalloc(n)) == NULL)
		ABORT("SUPERLU_MALLOC fails for perm[]");

	    info1 = zldperm(5, n, nnz, colptr, rowind, nzval, perm, R, C);

	    if (info1 > 0) { /* MC64 fails, call zgsequ() later */
		mc64 = 0;
		SUPERLU_FREE(perm);
		perm = NULL;
	    } else {
		for (i = 0; i < n; i++) {
		    R[i] = exp(R[i]);
		    C[i] = exp(C[i]);
		}
		/* permute and scale the matrix */
		for (j = 0; j < n; j++) {
		    for (i = colptr[j]; i < colptr[j + 1]; i++) {
                        zd_mult(&nzval[i], &nzval[i], R[rowind[i]] * C[j]);
			rowind[i] = perm[rowind[i]];
		    }
		}
	    }
	    utime[EQUIL] = SuperLU_timer_() - t0;
	}
	if ( !mc64 & equil ) {
	    t0 = SuperLU_timer_();
	    /* Compute row and column scalings to equilibrate the matrix A. */
	    zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);

	    if ( info1 == 0 ) {
		/* Equilibrate matrix A. */
		zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
		rowequ = lsame_(equed, "R") || lsame_(equed, "B");
		colequ = lsame_(equed, "C") || lsame_(equed, "B");
	    }
	    utime[EQUIL] = SuperLU_timer_() - t0;
	}
    }

    if ( nrhs > 0 ) {
	/* Scale the right hand side if equilibration was performed. */
	if ( notran ) {
	    if ( rowequ ) {
		for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i) {
                        zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]);
		    }
	    }
	} else if ( colequ ) {
	    for (j = 0; j < nrhs; ++j)
		for (i = 0; i < A->nrow; ++i) {
                    zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]);
		}
	}
    }

    if ( nofact ) {
	
	t0 = SuperLU_timer_();
	/*
	 * Gnet column permutation vector perm_c[], according to permc_spec:
	 *   permc_spec = NATURAL:  natural ordering 
	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
	 */
	permc_spec = options->ColPerm;
	if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
	    get_perm_c(permc_spec, AA, perm_c);
	utime[COLPERM] = SuperLU_timer_() - t0;

	t0 = SuperLU_timer_();
	sp_preorder(options, AA, perm_c, etree, &AC);
	utime[ETREE] = SuperLU_timer_() - t0;

	/* Compute the LU factorization of A*Pc. */
	t0 = SuperLU_timer_();
	zgsitrf(options, &AC, relax, panel_size, etree, work, lwork,
                perm_c, perm_r, L, U, stat, info);
	utime[FACT] = SuperLU_timer_() - t0;

	if ( lwork == -1 ) {
	    mem_usage->total_needed = *info - A->ncol;
	    return;
	}
    }

    if ( options->PivotGrowth ) {
	if ( *info > 0 ) return;

	/* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
	*recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U);
    }

    if ( options->ConditionNumber ) {
	/* Estimate the reciprocal of the condition number of A. */
	t0 = SuperLU_timer_();
	if ( notran ) {
	    *(unsigned char *)norm = '1';
	} else {
	    *(unsigned char *)norm = 'I';
	}
	anorm = zlangs(norm, AA);
	zgscon(norm, L, U, anorm, rcond, stat, &info1);
	utime[RCOND] = SuperLU_timer_() - t0;
    }

    if ( nrhs > 0 ) {
	/* Compute the solution matrix X. */
	for (j = 0; j < nrhs; j++)  /* Save a copy of the right hand sides */
	    for (i = 0; i < B->nrow; i++)
		Xmat[i + j*ldx] = Bmat[i + j*ldb];

	t0 = SuperLU_timer_();
	zgstrs (trant, L, U, perm_c, perm_r, X, stat, &info1);
	utime[SOLVE] = SuperLU_timer_() - t0;

	/* Transform the solution matrix X to a solution of the original
	   system. */
	if ( notran ) {
	    if ( colequ ) {
		for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i) {
                        zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]);
                    }
	    }
	} else {
	    if ( rowequ ) {
		if (perm) {
		    doublecomplex *tmp;
		    int n = A->nrow;

                    if ((tmp = doublecomplexMalloc(n)) == NULL)
			ABORT("SUPERLU_MALLOC fails for tmp[]");
		    for (j = 0; j < nrhs; j++) {
			for (i = 0; i < n; i++)
			    tmp[i] = Xmat[i + j * ldx]; /*dcopy*/
			for (i = 0; i < n; i++)
                           zd_mult(&Xmat[i+j*ldx], &tmp[perm[i]], R[i]);
		    }
		    SUPERLU_FREE(tmp);
		} else {
		    for (j = 0; j < nrhs; ++j)
			for (i = 0; i < A->nrow; ++i) {
                           zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]);
                        }
		}
	    }
	}
    } /* end if nrhs > 0 */

    if ( options->ConditionNumber ) {
	/* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
	if ( *rcond < dlamch_("E") && *info == 0) *info = A->ncol + 1;
    }

    if (perm) SUPERLU_FREE(perm);

    if ( nofact ) {
	ilu_zQuerySpace(L, U, mem_usage);
	Destroy_CompCol_Permuted(&AC);
    }
    if ( A->Stype == SLU_NR ) {
	Destroy_SuperMatrix_Store(AA);
	SUPERLU_FREE(AA);
    }

}