예제 #1
0
파일: superlu_dist.c 프로젝트: Kun-Qu/petsc
PetscErrorCode MatDestroy_SuperLU_DIST(Mat A)
{
  PetscErrorCode   ierr;
  Mat_SuperLU_DIST *lu = (Mat_SuperLU_DIST*)A->spptr;
  PetscBool        flg;

  PetscFunctionBegin;
  if (lu && lu->CleanUpSuperLU_Dist) {
    /* Deallocate SuperLU_DIST storage */
    if (lu->MatInputMode == GLOBAL) {
      Destroy_CompCol_Matrix_dist(&lu->A_sup);
    } else {
      Destroy_CompRowLoc_Matrix_dist(&lu->A_sup);
      if ( lu->options.SolveInitialized ) {
#if defined(PETSC_USE_COMPLEX)
        zSolveFinalize(&lu->options, &lu->SOLVEstruct);
#else
        dSolveFinalize(&lu->options, &lu->SOLVEstruct);
#endif
      }
    }
    Destroy_LU(A->cmap->N, &lu->grid, &lu->LUstruct);
    ScalePermstructFree(&lu->ScalePermstruct);
    LUstructFree(&lu->LUstruct);

    /* Release the SuperLU_DIST process grid. */
    superlu_gridexit(&lu->grid);

    ierr = MPI_Comm_free(&(lu->comm_superlu));CHKERRQ(ierr);
  }
  ierr = PetscFree(A->spptr);CHKERRQ(ierr);

  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
예제 #2
0
int main(int argc, char *argv[])
{
    superlu_dist_options_t options;
    SuperLUStat_t stat;
    SuperMatrix A;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    gridinfo_t grid1, grid2;
    double   *berr;
    double   *a, *b, *xtrue;
    int_t    *asub, *xa;
    int_t    i, j, m, n, nnz;
    int_t    nprow, npcol, ldumap, p;
    int_t    usermap[6];
    int      iam, info, ldb, ldx, nprocs;
    int      nrhs = 1;   /* Number of right-hand side. */
    char     trans[1];
    char     **cpp, c;
    FILE *fp, *fopen();

    /* prototypes */
    extern void LUstructInit(const int_t, LUstruct_t *);
    extern void LUstructFree(LUstruct_t *);
    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);

    /* ------------------------------------------------------------
       INITIALIZE MPI ENVIRONMENT. 
       ------------------------------------------------------------*/
    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
    if ( nprocs < 10 ) {
	fprintf(stderr, "Requires at least 10 processes\n");
	exit(-1);
    }

    /* Parse command line argv[]. */
    for (cpp = argv+1; *cpp; ++cpp) {
	if ( **cpp == '-' ) {
	    c = *(*cpp+1);
	    ++cpp;
	    switch (c) {
	      case 'h':
		  printf("Options:\n");
		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
		  printf("\t-c <int>: process columns (default %d)\n", npcol);
		  exit(0);
		  break;
	      case 'r': nprow = atoi(*cpp);
		        break;
	      case 'c': npcol = atoi(*cpp);
		        break;
	    }
	} else { /* Last arg is considered a filename */
	    if ( !(fp = fopen(*cpp, "r")) ) {
                ABORT("File does not exist");
            }
	    break;
	}
    }

    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID 1. 
       ------------------------------------------------------------*/
    nprow = 2;
    npcol = 3;
    ldumap = 2;
    p = 0;    /* Grid 1 starts from process 0. */
    for (i = 0; i < nprow; ++i)
	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1);

    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID 2. 
       ------------------------------------------------------------*/
    nprow = 2;
    npcol = 2;
    ldumap = 2;
    p = 6;   /* Grid 2 starts from process 6. */
    for (i = 0; i < nprow; ++i)
	for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2);

    /* Bail out if I do not belong in any of the 2 grids. */
    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
    if ( iam >= 10 ) goto out;
    
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter main()");
#endif

    if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */
	iam = grid1.iam;  /* Get the logical number in the new grid. */

	/* ------------------------------------------------------------
	   PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
	   THE OTHER PROCESSES.
	   ------------------------------------------------------------*/
	if ( !iam ) {
	    /* Read the matrix stored on disk in Harwell-Boeing format. */
	    dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
	
	    printf("\tDimension\t%dx%d\t # nonzeros %d\n", m, n, nnz);
	    printf("\tProcess grid\t%d X %d\n", (int) grid1.nprow, (int) grid1.npcol);

	    /* Broadcast matrix A to the other PEs. */
	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid1.comm );
	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid1.comm );
	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid1.comm );
	    MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid1.comm );
	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid1.comm );
	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid1.comm );
	} else {
	    /* Receive matrix A from PE 0. */
	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid1.comm );
	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid1.comm );
	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid1.comm );

	    /* Allocate storage for compressed column representation. */
	    dallocateA_dist(n, nnz, &a, &asub, &xa);
	    
	    MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid1.comm );
	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid1.comm );
	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid1.comm );
	}
	
	/* Create compressed column matrix for A. */
	dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
				    SLU_NC, SLU_D, SLU_GE);

	/* Generate the exact solution and compute the right-hand side. */
	if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
	if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
	*trans = 'N';
	ldx = n;
	ldb = m;
	dGenXtrue_dist(n, nrhs, xtrue, ldx);
	dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);

	if ( !(berr = doubleMalloc_dist(nrhs)) )
	    ABORT("Malloc fails for berr[].");

	/* ------------------------------------------------------------
	   NOW WE SOLVE THE LINEAR SYSTEM.
	   ------------------------------------------------------------*/
	
        /* Set the default input options:
            options.Fact = DOFACT;
            options.Equil = YES;
            options.ColPerm = METIS_AT_PLUS_A;
            options.RowPerm = LargeDiag;
            options.ReplaceTinyPivot = YES;
            options.Trans = NOTRANS;
            options.IterRefine = DOUBLE;
            options.SolveInitialized = NO;
            options.RefineInitialized = NO;
            options.PrintStat = YES;
         */
	set_default_options_dist(&options);

        if (!iam) {
	    print_sp_ienv_dist(&options);
	    print_options_dist(&options);
        }

	/* Initialize ScalePermstruct and LUstruct. */
	ScalePermstructInit(m, n, &ScalePermstruct);
	LUstructInit(n, &LUstruct);

	/* Initialize the statistics variables. */
	PStatInit(&stat);
	
	/* Call the linear equation solver: factorize and solve. */
	pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1,
			 &LUstruct, berr, &stat, &info);

	/* Check the accuracy of the solution. */
	if ( !iam ) {
	    dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid1);
	}
    
    
	/* Print the statistics. */
	PStatPrint(&options, &stat, &grid1);

	/* ------------------------------------------------------------
	   DEALLOCATE STORAGE.
	   ------------------------------------------------------------*/
	PStatFree(&stat);
	Destroy_CompCol_Matrix_dist(&A); 
	Destroy_LU(n, &grid1, &LUstruct);
	ScalePermstructFree(&ScalePermstruct);
	LUstructFree(&LUstruct);
	SUPERLU_FREE(b);
	SUPERLU_FREE(xtrue);
	SUPERLU_FREE(berr);

    } else { /* I am in grid 2. */
	iam = grid2.iam;  /* Get the logical number in the new grid. */

	/* ------------------------------------------------------------
	   PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
	   THE OTHER PROCESSES.
	   ------------------------------------------------------------*/
	if ( !iam ) {
	    /* Read the matrix stored on disk in Harwell-Boeing format. */
	    dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
	
	    printf("\tDimension\t%dx%d\t # nonzeros %d\n", m, n, nnz);
	    printf("\tProcess grid\t%d X %d\n", (int) grid2.nprow, (int) grid2.npcol);

	    /* Broadcast matrix A to the other PEs. */
	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid2.comm );
	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid2.comm );
	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid2.comm );
	    MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid2.comm );
	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid2.comm );
	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid2.comm );
	} else {
	    /* Receive matrix A from PE 0. */
	    MPI_Bcast( &m,   1,   mpi_int_t,  0, grid2.comm );
	    MPI_Bcast( &n,   1,   mpi_int_t,  0, grid2.comm );
	    MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid2.comm );

	    /* Allocate storage for compressed column representation. */
	    dallocateA_dist(n, nnz, &a, &asub, &xa);
	    
	    MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid2.comm );
	    MPI_Bcast( asub, nnz, mpi_int_t,  0, grid2.comm );
	    MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid2.comm );
	}
	
	/* Create compressed column matrix for A. */
	dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
				    SLU_NC, SLU_D, SLU_GE);

	/* Generate the exact solution and compute the right-hand side. */
	if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
	if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
	*trans = 'N';
	ldx = n;
	ldb = m;
	dGenXtrue_dist(n, nrhs, xtrue, ldx);
	dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);

	if ( !(berr = doubleMalloc_dist(nrhs)) )
	    ABORT("Malloc fails for berr[].");

	/* ------------------------------------------------------------
	   NOW WE SOLVE THE LINEAR SYSTEM.
	   ------------------------------------------------------------*/
	
        /* Set the default input options:
            options.Fact = DOFACT;
            options.Equil = YES;
            options.ColPerm = MMD_AT_PLUS_A;
            options.RowPerm = LargeDiag;
            options.ReplaceTinyPivot = YES;
            options.Trans = NOTRANS;
            options.IterRefine = DOUBLE;
            options.SolveInitialized = NO;
            options.RefineInitialized = NO;
            options.PrintStat = YES;
         */
	set_default_options_dist(&options);
	
	/* Initialize ScalePermstruct and LUstruct. */
	ScalePermstructInit(m, n, &ScalePermstruct);
	LUstructInit(n, &LUstruct);

	/* Initialize the statistics variables. */
	PStatInit(&stat);
	
	/* Call the linear equation solver: factorize and solve. */
	pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2,
			 &LUstruct, berr, &stat, &info);

	/* Check the accuracy of the solution. */
	if ( !iam ) {
	    dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid2);
	}
    
    
	/* Print the statistics. */
	PStatPrint(&options, &stat, &grid2);

	/* ------------------------------------------------------------
	   DEALLOCATE STORAGE.
	   ------------------------------------------------------------*/
	PStatFree(&stat);
	Destroy_CompCol_Matrix_dist(&A); 
	Destroy_LU(n, &grid2, &LUstruct);
	ScalePermstructFree(&ScalePermstruct);
	LUstructFree(&LUstruct);
	SUPERLU_FREE(b);
	SUPERLU_FREE(xtrue);
	SUPERLU_FREE(berr);
    }

    /* ------------------------------------------------------------
       RELEASE THE SUPERLU PROCESS GRIDS.
       ------------------------------------------------------------*/
    superlu_gridexit(&grid1);
    superlu_gridexit(&grid2);

out:
    /* ------------------------------------------------------------
       TERMINATES THE MPI EXECUTION ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Finalize();

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit main()");
#endif

}
예제 #3
0
void
pdgssvx(superlu_options_t *options, SuperMatrix *A, 
	ScalePermstruct_t *ScalePermstruct,
	double B[], int ldb, int nrhs, gridinfo_t *grid,
	LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
	SuperLUStat_t *stat, int *info)
{
/* 
 * -- Distributed SuperLU routine (version 2.2) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * November 1, 2007
 * Feburary 20, 2008
 *
 *
 * Purpose
 * =======
 *
 * PDGSSVX solves a system of linear equations A*X=B,
 * by using Gaussian elimination with "static pivoting" to
 * compute the LU factorization of A.
 *
 * Static pivoting is a technique that combines the numerical stability
 * of partial pivoting with the scalability of Cholesky (no pivoting),
 * to run accurately and efficiently on large numbers of processors.
 * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
 * description of the parallel algorithms.
 *
 * The input matrices A and B are distributed by block rows.
 * Here is a graphical illustration (0-based indexing):
 *
 *                        A                B
 *               0 ---------------       ------
 *                   |           |        |  |
 *                   |           |   P0   |  |
 *                   |           |        |  |
 *                 ---------------       ------
 *        - fst_row->|           |        |  |
 *        |          |           |        |  |
 *       m_loc       |           |   P1   |  |
 *        |          |           |        |  |
 *        -          |           |        |  |
 *                 ---------------       ------
 *                   |    .      |        |. |
 *                   |    .      |        |. |
 *                   |    .      |        |. |
 *                 ---------------       ------
 * 
 * where, fst_row is the row number of the first row,
 *        m_loc is the number of rows local to this processor
 * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
 *
 *
 * Here are the options for using this code:
 *
 *   1. Independent of all the other options specified below, the
 *      user must supply
 *
 *      -  B, the matrix of right-hand sides, distributed by block rows,
 *            and its dimensions ldb (local) and nrhs (global)
 *      -  grid, a structure describing the 2D processor mesh
 *      -  options->IterRefine, which determines whether or not to
 *            improve the accuracy of the computed solution using 
 *            iterative refinement
 *
 *      On output, B is overwritten with the solution X.
 *
 *   2. Depending on options->Fact, the user has four options
 *      for solving A*X=B. The standard option is for factoring
 *      A "from scratch". (The other options, described below,
 *      are used when A is sufficiently similar to a previously 
 *      solved problem to save time by reusing part or all of 
 *      the previous factorization.)
 *
 *      -  options->Fact = DOFACT: A is factored "from scratch"
 *
 *      In this case the user must also supply
 *
 *        o  A, the input matrix
 *
 *        as well as the following options to determine what matrix to
 *        factorize.
 *
 *        o  options->Equil,   to specify how to scale the rows and columns
 *                             of A to "equilibrate" it (to try to reduce its
 *                             condition number and so improve the
 *                             accuracy of the computed solution)
 *
 *        o  options->RowPerm, to specify how to permute the rows of A
 *                             (typically to control numerical stability)
 *
 *        o  options->ColPerm, to specify how to permute the columns of A
 *                             (typically to control fill-in and enhance
 *                             parallelism during factorization)
 *
 *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
 *                             pivots encountered during factorization
 *                             (to control numerical stability)
 *
 *      The outputs returned include
 *         
 *        o  ScalePermstruct,  modified to describe how the input matrix A
 *                             was equilibrated and permuted:
 *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
 *                                         columns of A were scaled
 *          .  ScalePermstruct->R, array of row scale factors
 *          .  ScalePermstruct->C, array of column scale factors
 *          .  ScalePermstruct->perm_r, row permutation vector
 *          .  ScalePermstruct->perm_c, column permutation vector
 *
 *          (part of ScalePermstruct may also need to be supplied on input,
 *           depending on options->RowPerm and options->ColPerm as described 
 *           later).
 *
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix diag(R)*A*diag(C)*Pc^T, where 
 *              Pc is the row permutation matrix determined by
 *                  ScalePermstruct->perm_c
 *              diag(R) and diag(C) are diagonal scaling matrices determined
 *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and 
 *                  ScalePermstruct->C
 *
 *        o  LUstruct, which contains the L and U factorization of A1 where
 *
 *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
 *
 *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
 *                in A on output.)
 *
 *   3. The second value of options->Fact assumes that a matrix with the same
 *      sparsity pattern as A has already been factored:
 *     
 *      -  options->Fact = SamePattern: A is factored, assuming that it has
 *            the same nonzero pattern as a previously factored matrix. In
 *            this case the algorithm saves time by reusing the previously
 *            computed column permutation vector stored in
 *            ScalePermstruct->perm_c and the "elimination tree" of A
 *            stored in LUstruct->etree
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *        o  options->Equil
 *        o  options->RowPerm
 *        o  options->ReplaceTinyPivot
 *
 *      but not options->ColPerm, whose value is ignored. This is because the
 *      previous column permutation from ScalePermstruct->perm_c is used as
 *      input. The user must also supply 
 *
 *        o  A, the input matrix
 *        o  ScalePermstruct->perm_c, the column permutation
 *        o  LUstruct->etree, the elimination tree
 *
 *      The outputs returned include
 *         
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix as described above
 *        o  ScalePermstruct, modified to describe how the input matrix A was
 *                            equilibrated and row permuted
 *        o  LUstruct, modified to contain the new L and U factors
 *
 *   4. The third value of options->Fact assumes that a matrix B with the same
 *      sparsity pattern as A has already been factored, and where the
 *      row permutation of B can be reused for A. This is useful when A and B
 *      have similar numerical values, so that the same row permutation
 *      will make both factorizations numerically stable. This lets us reuse
 *      all of the previously computed structure of L and U.
 *
 *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
 *            assuming not only the same nonzero pattern as the previously
 *            factored matrix B, but reusing B's row permutation.
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *        o  options->Equil
 *        o  options->ReplaceTinyPivot
 *
 *      but not options->RowPerm or options->ColPerm, whose values are
 *      ignored. This is because the permutations from ScalePermstruct->perm_r
 *      and ScalePermstruct->perm_c are used as input.
 *
 *      The user must also supply 
 *
 *        o  A, the input matrix
 *        o  ScalePermstruct->DiagScale, how the previous matrix was row
 *                                       and/or column scaled
 *        o  ScalePermstruct->R, the row scalings of the previous matrix,
 *                               if any
 *        o  ScalePermstruct->C, the columns scalings of the previous matrix, 
 *                               if any
 *        o  ScalePermstruct->perm_r, the row permutation of the previous
 *                                    matrix
 *        o  ScalePermstruct->perm_c, the column permutation of the previous 
 *                                    matrix
 *        o  all of LUstruct, the previously computed information about
 *                            L and U (the actual numerical values of L and U
 *                            stored in LUstruct->Llu are ignored)
 *
 *      The outputs returned include
 *         
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix as described above
 *        o  ScalePermstruct,  modified to describe how the input matrix A was
 *                             equilibrated (thus ScalePermstruct->DiagScale,
 *                             R and C may be modified)
 *        o  LUstruct, modified to contain the new L and U factors
 *
 *   5. The fourth and last value of options->Fact assumes that A is
 *      identical to a matrix that has already been factored on a previous 
 *      call, and reuses its entire LU factorization
 *
 *      -  options->Fact = Factored: A is identical to a previously
 *            factorized matrix, so the entire previous factorization
 *            can be reused.
 *
 *      In this case all the other options mentioned above are ignored
 *      (options->Equil, options->RowPerm, options->ColPerm, 
 *       options->ReplaceTinyPivot)
 *
 *      The user must also supply 
 *
 *        o  A, the unfactored matrix, only in the case that iterative
 *              refinment is to be done (specifically A must be the output
 *              A from the previous call, so that it has been scaled and permuted)
 *        o  all of ScalePermstruct
 *        o  all of LUstruct, including the actual numerical values of
 *           L and U
 *
 *      all of which are unmodified on output.
 *         
 * Arguments
 * =========
 *
 * options (input) superlu_options_t* (global)
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *         The following fields should be defined for this structure:
 *         
 *         o Fact (fact_t)
 *           Specifies whether or not the factored form of the matrix
 *           A is supplied on entry, and if not, how the matrix A should
 *           be factorized based on the previous history.
 *
 *           = DOFACT: The matrix A will be factorized from scratch.
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *
 *           = SamePattern: the matrix A will be factorized assuming
 *             that a factorization of a matrix with the same sparsity
 *             pattern was performed prior to this one. Therefore, this
 *             factorization will reuse column permutation vector 
 *             ScalePermstruct->perm_c and the elimination tree
 *             LUstruct->etree
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ReplaceTinyPivot
 *                          ScalePermstruct->perm_c
 *                          LUstruct->etree
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
 *                          rest of LUstruct (GLU_persist, Llu)
 *
 *           = SamePattern_SameRowPerm: the matrix A will be factorized
 *             assuming that a factorization of a matrix with the same
 *             sparsity	pattern and similar numerical values was performed
 *             prior to this one. Therefore, this factorization will reuse
 *             both row and column scaling factors R and C, and the
 *             both row and column permutation vectors perm_r and perm_c,
 *             distributed data structure set up from the previous symbolic
 *             factorization.
 *                 Inputs:  A
 *                          options->Equil, ReplaceTinyPivot
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or 
 *                              permuted)
 *                          modified LUstruct->Llu
 *           = FACTORED: the matrix A is already factored.
 *                 Inputs:  all of ScalePermstruct
 *                          all of LUstruct
 *
 *         o Equil (yes_no_t)
 *           Specifies whether to equilibrate the system.
 *           = NO:  no equilibration.
 *           = YES: scaling factors are computed to equilibrate the system:
 *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
 *                  Whether or not the system will be equilibrated depends
 *                  on the scaling of the matrix A, but if equilibration is
 *                  used, A is overwritten by diag(R)*A*diag(C) and B by
 *                  diag(R)*B.
 *
 *         o RowPerm (rowperm_t)
 *           Specifies how to permute rows of the matrix A.
 *           = NATURAL:   use the natural ordering.
 *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
 *                        the original matrix to make the diagonal large
 *                        relative to the off-diagonal.
 *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
 *                        input by the user.
 *           
 *         o ColPerm (colperm_t)
 *           Specifies what type of column permutation to use to reduce fill.
 *           = NATURAL:       natural ordering.
 *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
 *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
 *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
 *         
 *         o ReplaceTinyPivot (yes_no_t)
 *           = NO:  do not modify pivots
 *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during 
 *                  LU factorization.
 *
 *         o IterRefine (IterRefine_t)
 *           Specifies how to perform iterative refinement.
 *           = NO:     no iterative refinement.
 *           = DOUBLE: accumulate residual in double precision.
 *           = EXTRA:  accumulate residual in extra precision.
 *
 *         NOTE: all options must be indentical on all processes when
 *               calling this routine.
 *
 * A (input/output) SuperMatrix* (local)
 *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
 *           The number of linear equations is A->nrow. The type of A must be:
 *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
 *           That is, A is stored in distributed compressed row format.
 *           See supermatrix.h for the definition of 'SuperMatrix'.
 *           This routine only handles square A, however, the LU factorization
 *           routine PDGSTRF can factorize rectangular matrices.
 *         On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T,
 *           depending on ScalePermstruct->DiagScale and options->ColPerm:
 *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
 *                diag(R)*A*diag(C).
 *             if options->ColPerm != NATURAL, A is further overwritten by
 *                diag(R)*A*diag(C)*Pc^T.
 *           If all the above condition are true, the LU decomposition is
 *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
 *
 * ScalePermstruct (input/output) ScalePermstruct_t* (global)
 *         The data structure to store the scaling and permutation vectors
 *         describing the transformations performed to the matrix A.
 *         It contains the following fields:
 *
 *         o DiagScale (DiagScale_t)
 *           Specifies the form of equilibration that was done.
 *           = NOEQUIL: no equilibration.
 *           = ROW:     row equilibration, i.e., A was premultiplied by
 *                      diag(R).
 *           = COL:     Column equilibration, i.e., A was postmultiplied
 *                      by diag(C).
 *           = BOTH:    both row and column equilibration, i.e., A was 
 *                      replaced by diag(R)*A*diag(C).
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
 *           DiagScale is an input argument; otherwise it is an output
 *           argument.
 *
 *         o perm_r (int*)
 *           Row permutation vector, which defines the permutation matrix Pr;
 *           perm_r[i] = j means row i of A is in position j in Pr*A.
 *           If options->RowPerm = MY_PERMR, or
 *           options->Fact = SamePattern_SameRowPerm, perm_r is an
 *           input argument; otherwise it is an output argument.
 *
 *         o perm_c (int*)
 *           Column permutation vector, which defines the 
 *           permutation matrix Pc; perm_c[i] = j means column i of A is 
 *           in position j in A*Pc.
 *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
 *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
 *           input argument; otherwise, it is an output argument.
 *           On exit, perm_c may be overwritten by the product of the input
 *           perm_c and a permutation that postorders the elimination tree
 *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
 *           is already in postorder.
 *
 *         o R (double*) dimension (A->nrow)
 *           The row scale factors for A.
 *           If DiagScale = ROW or BOTH, A is multiplied on the left by 
 *                          diag(R).
 *           If DiagScale = NOEQUIL or COL, R is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
 *           an input argument; otherwise, R is an output argument.
 *
 *         o C (double*) dimension (A->ncol)
 *           The column scale factors for A.
 *           If DiagScale = COL or BOTH, A is multiplied on the right by 
 *                          diag(C).
 *           If DiagScale = NOEQUIL or ROW, C is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
 *           an input argument; otherwise, C is an output argument.
 *         
 * B       (input/output) double* (local)
 *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
 *           where, m_loc is the number of rows stored locally on my
 *           process and is defined in the data structure of matrix A.
 *         On exit, the solution matrix if info = 0;
 *
 * ldb     (input) int (local)
 *         The leading dimension of matrix B.
 *
 * nrhs    (input) int (global)
 *         The number of right-hand sides.
 *         If nrhs = 0, only LU decomposition is performed, the forward
 *         and back substitutions are skipped.
 *
 * grid    (input) gridinfo_t* (global)
 *         The 2D process mesh. It contains the MPI communicator, the number
 *         of process rows (NPROW), the number of process columns (NPCOL),
 *         and my process rank. It is an input argument to all the
 *         parallel routines.
 *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
 *
 * LUstruct (input/output) LUstruct_t*
 *         The data structures to store the distributed L and U factors.
 *         It contains the following fields:
 *
 *         o etree (int*) dimension (A->ncol) (global)
 *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
 *           It is computed in sp_colorder() during the first factorization,
 *           and is reused in the subsequent factorizations of the matrices
 *           with the same nonzero pattern.
 *           On exit of sp_colorder(), the columns of A are permuted so that
 *           the etree is in a certain postorder. This postorder is reflected
 *           in ScalePermstruct->perm_c.
 *           NOTE:
 *           Etree is a vector of parent pointers for a forest whose vertices
 *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *
 *         o Glu_persist (Glu_persist_t*) (global)
 *           Global data structure (xsup, supno) replicated on all processes,
 *           describing the supernode partition in the factored matrices
 *           L and U:
 *	       xsup[s] is the leading column of the s-th supernode,
 *             supno[i] is the supernode number to which column i belongs.
 *
 *         o Llu (LocalLU_t*) (local)
 *           The distributed data structures to store L and U factors.
 *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
 *
 * SOLVEstruct (input/output) SOLVEstruct_t*
 *         The data structure to hold the communication pattern used
 *         in the phases of triangular solution and iterative refinement.
 *         This pattern should be intialized only once for repeated solutions.
 *         If options->SolveInitialized = YES, it is an input argument.
 *         If options->SolveInitialized = NO and nrhs != 0, it is an output
 *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
 *
 * berr    (output) double*, dimension (nrhs) (global)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics on runtime and floating-point operation count.
 *        See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 *         = 0: successful exit
 *         > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                so the solution could not be computed.
 *             > A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol.
 *
 * See superlu_ddefs.h for the definitions of varioous data types.
 *
 */
    NRformat_loc *Astore;
    SuperMatrix GA;      /* Global A in NC format */
    NCformat *GAstore;
    double   *a_GA;
    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
    NCPformat *GACstore;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    Glu_freeable_t *Glu_freeable;
            /* The nonzero structures of L and U factors, which are
	       replicated on all processrs.
	           (lsub, xlsub) contains the compressed subscript of
		                 supernodes in L.
          	   (usub, xusub) contains the compressed subscript of
		                 nonzero segments in U.
	      If options->Fact != SamePattern_SameRowPerm, they are 
	      computed by SYMBFACT routine, and then used by PDDISTRIBUTE
	      routine. They will be freed after PDDISTRIBUTE routine.
	      If options->Fact == SamePattern_SameRowPerm, these
	      structures are not used.                                  */
    fact_t   Fact;
    double   *a;
    int_t    *colptr, *rowind;
    int_t    *perm_r; /* row permutations from partial pivoting */
    int_t    *perm_c; /* column permutation vector */
    int_t    *etree;  /* elimination tree */
    int_t    *rowptr, *colind;  /* Local A in NR*/
    int_t    *rowind_loc, *colptr_loc;
    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
    int_t    nnz_loc, m_loc, fst_row, icol;
    int      iam;
    int      ldx;  /* LDA for matrix X (local). */
    char     equed[1], norm[1];
    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
    double   *X, *b_col, *b_work, *x_col;
    double   t;
    static mem_usage_t num_mem_usage, symb_mem_usage;
#if ( PRNTlevel>= 2 )
    double   dmin, dsum, dprod;
#endif
    int_t procs;

    /* Structures needed for parallel symbolic factorization */
    int_t *sizes, *fstVtxSep, parSymbFact;
    int   noDomains, nprocs_num;
    MPI_Comm symb_comm; /* communicator for symbolic factorization */
    int   col, key; /* parameters for creating a new communicator */
    Pslu_freeable_t Pslu_freeable;
    float  flinfo;

    /* Initialization. */
    m       = A->nrow;
    n       = A->ncol;
    Astore  = (NRformat_loc *) A->Store;
    nnz_loc = Astore->nnz_loc;
    m_loc   = Astore->m_loc;
    fst_row = Astore->fst_row;
    a       = (double *) Astore->nzval;
    rowptr  = Astore->rowptr;
    colind  = Astore->colind;
    sizes   = NULL;
    fstVtxSep = NULL;
    symb_comm = MPI_COMM_NULL;

    /* Test the input parameters. */
    *info = 0;
    Fact = options->Fact;
    if ( Fact < 0 || Fact > FACTORED )
	*info = -1;
    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
	*info = -1;
    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
	*info = -1;
    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
	*info = -1;
    else if ( options->IterRefine == EXTRA ) {
	*info = -1;
	fprintf(stderr, "Extra precise iterative refinement yet to support.");
    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
		|| A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -2;
    else if ( ldb < m_loc )
	*info = -5;
    else if ( nrhs < 0 )
	*info = -6;
    if ( *info ) {
	i = -(*info);
	pxerbla("pdgssvx", grid, -*info);
	return;
    }

    factored = (Fact == FACTORED);
    Equil = (!factored && options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    iam = grid->iam;
    job = 5;
    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
	rowequ = (ScalePermstruct->DiagScale == ROW) ||
	         (ScalePermstruct->DiagScale == BOTH);
	colequ = (ScalePermstruct->DiagScale == COL) ||
	         (ScalePermstruct->DiagScale == BOTH);
    } else rowequ = colequ = FALSE;

    /* The following arrays are replicated on all processes. */
    perm_r = ScalePermstruct->perm_r;
    perm_c = ScalePermstruct->perm_c;
    etree = LUstruct->etree;
    R = ScalePermstruct->R;
    C = ScalePermstruct->C;
    /********/

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgssvx()");
#endif

    /* Not factored & ask for equilibration */
    if ( Equil && Fact != SamePattern_SameRowPerm ) { 
	/* Allocate storage if not done so before. */
	switch ( ScalePermstruct->DiagScale ) {
	    case NOEQUIL:
		if ( !(R = (double *) doubleMalloc_dist(m)) )
		    ABORT("Malloc fails for R[].");
	        if ( !(C = (double *) doubleMalloc_dist(n)) )
		    ABORT("Malloc fails for C[].");
		ScalePermstruct->R = R;
		ScalePermstruct->C = C;
		break;
	    case ROW: 
	        if ( !(C = (double *) doubleMalloc_dist(n)) )
		    ABORT("Malloc fails for C[].");
		ScalePermstruct->C = C;
		break;
	    case COL: 
		if ( !(R = (double *) doubleMalloc_dist(m)) )
		    ABORT("Malloc fails for R[].");
		ScalePermstruct->R = R;
		break;
	}
    }

    /* ------------------------------------------------------------
       Diagonal scaling to equilibrate the matrix.
       ------------------------------------------------------------*/
    if ( Equil ) {
#if ( DEBUGlevel>=1 )
	CHECK_MALLOC(iam, "Enter equil");
#endif
	t = SuperLU_timer_();

	if ( Fact == SamePattern_SameRowPerm ) {
	    /* Reuse R and C. */
	    switch ( ScalePermstruct->DiagScale ) {
	      case NOEQUIL:
		break;
	      case ROW:
		irow = fst_row;
		for (j = 0; j < m_loc; ++j) {
		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
			a[i] *= R[irow];       /* Scale rows. */
		    }
		    ++irow;
		}
		break;
	      case COL:
		for (j = 0; j < m_loc; ++j)
		    for (i = rowptr[j]; i < rowptr[j+1]; ++i){
		        icol = colind[i];
			a[i] *= C[icol];          /* Scale columns. */
		    }
		break;
	      case BOTH:
		irow = fst_row;
		for (j = 0; j < m_loc; ++j) {
		    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
			icol = colind[i];
			a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */
		    }
		    ++irow;
		}
	        break;
	    }
	} else { /* Compute R & C from scratch */
            /* Compute the row and column scalings. */
	    pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);

	    /* Equilibrate matrix A if it is badly-scaled. */
	    pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed);

	    if ( lsame_(equed, "R") ) {
		ScalePermstruct->DiagScale = rowequ = ROW;
	    } else if ( lsame_(equed, "C") ) {
		ScalePermstruct->DiagScale = colequ = COL;
	    } else if ( lsame_(equed, "B") ) {
		ScalePermstruct->DiagScale = BOTH;
		rowequ = ROW;
		colequ = COL;
	    } else ScalePermstruct->DiagScale = NOEQUIL;

#if ( PRNTlevel>=1 )
	    if ( !iam ) {
		printf(".. equilibrated? *equed = %c\n", *equed);
		/*fflush(stdout);*/
	    }
#endif
	} /* if Fact ... */

	stat->utime[EQUIL] = SuperLU_timer_() - t;
#if ( DEBUGlevel>=1 )
	CHECK_MALLOC(iam, "Exit equil");
#endif
    } /* if Equil ... */

    if ( !factored ) { /* Skip this if already factored. */
        /*
         * Gather A from the distributed compressed row format to
         * global A in compressed column format.
         * Numerical values are gathered only when a row permutation
         * for large diagonal is sought after.
         */
	if ( Fact != SamePattern_SameRowPerm ) {
            need_value = (options->RowPerm == LargeDiag);
            pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
            GAstore = (NCformat *) GA.Store;
            colptr = GAstore->colptr;
            rowind = GAstore->rowind;
            nnz = GAstore->nnz;
            if ( need_value ) a_GA = (double *) GAstore->nzval;
            else assert(GAstore->nzval == NULL);
	}

        /* ------------------------------------------------------------
           Find the row permutation for A.
           ------------------------------------------------------------*/
        if ( options->RowPerm != NO ) {
	    t = SuperLU_timer_();
	    if ( Fact != SamePattern_SameRowPerm ) {
	        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
	            /* Permute the global matrix GA for symbfact() */
	            for (i = 0; i < colptr[n]; ++i) {
	            	irow = rowind[i]; 
		    	rowind[i] = perm_r[irow];
	            }
	        } else { /* options->RowPerm == LargeDiag */
	            /* Get a new perm_r[] */
	            if ( job == 5 ) {
		        /* Allocate storage for scaling factors. */
		        if ( !(R1 = doubleMalloc_dist(m)) )
		            ABORT("SUPERLU_MALLOC fails for R1[]");
		    	if ( !(C1 = doubleMalloc_dist(n)) )
		            ABORT("SUPERLU_MALLOC fails for C1[]");
	            }

	            if ( !iam ) {
		        /* Process 0 finds a row permutation */
		        dldperm(job, m, nnz, colptr, rowind, a_GA,
		                perm_r, R1, C1);
		
		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
		        if ( job == 5 && Equil ) {
		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
		        }
	            } else {
		        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
		        if ( job == 5 && Equil ) {
		            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
		            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
		        }
	            }

#if ( PRNTlevel>=2 )
	            dmin = dlamch_("Overflow");
	            dsum = 0.0;
	            dprod = 1.0;
#endif
	            if ( job == 5 ) {
		        if ( Equil ) {
		            for (i = 0; i < n; ++i) {
			        R1[i] = exp(R1[i]);
			        C1[i] = exp(C1[i]);
		            }

		            /* Scale the distributed matrix */
		            irow = fst_row;
		            for (j = 0; j < m_loc; ++j) {
			        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
			            icol = colind[i];
			            a[i] *= R1[irow] * C1[icol];
#if ( PRNTlevel>=2 )
			            if ( perm_r[irow] == icol ) { /* New diagonal */
			              if ( job == 2 || job == 3 )
				        dmin = SUPERLU_MIN(dmin, fabs(a[i]));
			              else if ( job == 4 )
				        dsum += fabs(a[i]);
			              else if ( job == 5 )
				        dprod *= fabs(a[i]);
			            }
#endif
			        }
			        ++irow;
		            }

		            /* Multiply together the scaling factors. */
		            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
		            else for (i = 0; i < m; ++i) R[i] = R1[i];
		            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
		            else for (i = 0; i < n; ++i) C[i] = C1[i];
		    
		            ScalePermstruct->DiagScale = BOTH;
		            rowequ = colequ = 1;

		        } /* end Equil */

                        /* Now permute global A to prepare for symbfact() */
                        for (j = 0; j < n; ++j) {
		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
	                        irow = rowind[i];
		                rowind[i] = perm_r[irow];
		            }
		        }
		        SUPERLU_FREE (R1);
		        SUPERLU_FREE (C1);
	            } else { /* job = 2,3,4 */
		        for (j = 0; j < n; ++j) {
		            for (i = colptr[j]; i < colptr[j+1]; ++i) {
			        irow = rowind[i];
			        rowind[i] = perm_r[irow];
		            } /* end for i ... */
		        } /* end for j ... */
	            } /* end else job ... */

#if ( PRNTlevel>=2 )
	            if ( job == 2 || job == 3 ) {
		        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
	            } else if ( job == 4 ) {
		        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
	            } else if ( job == 5 ) {
		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
	            }
#endif
	    
                } /* end if options->RowPerm ... */

	        t = SuperLU_timer_() - t;
	        stat->utime[ROWPERM] = t;
#if ( PRNTlevel>=1 )
	        if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
#endif
            } /* end if Fact ... */
        } else { /* options->RowPerm == NOROWPERM */
            for (i = 0; i < m; ++i) perm_r[i] = i;
        }

#if ( DEBUGlevel>=2 )
        if ( !iam ) PrintInt10("perm_r",  m, perm_r);
#endif
    } /* end if (!factored) */

    if ( !factored || options->IterRefine ) {
	/* Compute norm(A), which will be used to adjust small diagonal. */
	if ( notran ) *(unsigned char *)norm = '1';
	else *(unsigned char *)norm = 'I';
	anorm = pdlangs(norm, A, grid);
#if ( PRNTlevel>=1 )
	if ( !iam ) printf(".. anorm %e\n", anorm);
#endif
    }

    /* ------------------------------------------------------------
       Perform the LU factorization.
       ------------------------------------------------------------*/
    if ( !factored ) {
	t = SuperLU_timer_();
	/*
	 * Get column permutation vector perm_c[], according to permc_spec:
	 *   permc_spec = NATURAL:  natural ordering 
	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
	 *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
	 *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
	 */
	permc_spec = options->ColPerm;
	parSymbFact = options->ParSymbFact;

#if ( PRNTlevel>=1 )
	if ( parSymbFact && permc_spec != PARMETIS )
	    if ( !iam ) printf(".. Parallel symbolic factorization"
			       " only works wth ParMetis!\n");
#endif

	if ( parSymbFact == YES || permc_spec == PARMETIS ) {	
	    nprocs_num = grid->nprow * grid->npcol;
  	    noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num ))));

	    /* create a new communicator for the first noDomains processors in
	       grid->comm */
	    key = iam;
    	    if (iam < noDomains) col = 0;
	    else col = MPI_UNDEFINED;
	    MPI_Comm_split (grid->comm, col, key, &symb_comm );

	    permc_spec = PARMETIS; /* only works with PARMETIS */
        }

	if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
	  if ( permc_spec == PARMETIS ) {
	      /* Get column permutation vector in perm_c.                    *
	       * This routine takes as input the distributed input matrix A  *
	       * and does not modify it.  It also allocates memory for       *
	       * sizes[] and fstVtxSep[] arrays, that contain information    *
	       * on the separator tree computed by ParMETIS.                 */
	      flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
                                  	   noDomains, &sizes, &fstVtxSep,
                                           grid, &symb_comm);
	      if (flinfo > 0)
	          ABORT("ERROR in get perm_c parmetis.");
	  } else {
	      get_perm_c_dist(iam, permc_spec, &GA, perm_c);
          }
        }

	stat->utime[COLPERM] = SuperLU_timer_() - t;

	/* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
	   (a.k.a. column etree), depending on the choice of ColPerm.
	   Adjust perm_c[] to be consistent with a postorder of etree.
	   Permute columns of A to form A*Pc'. */
	if ( Fact != SamePattern_SameRowPerm ) {
	    if ( parSymbFact == NO ) {
	        int_t *GACcolbeg, *GACcolend, *GACrowind;

	        sp_colorder(options, &GA, perm_c, etree, &GAC); 

	        /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
	        GACstore = (NCPformat *) GAC.Store;
	        GACcolbeg = GACstore->colbeg;
	        GACcolend = GACstore->colend;
	        GACrowind = GACstore->rowind;
	        for (j = 0; j < n; ++j) {
	            for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
		        irow = GACrowind[i];
		        GACrowind[i] = perm_c[irow];
	            }
	        }

	        /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
                   the nonzero data structures for L & U. */
#if ( PRNTlevel>=1 ) 
                if ( !iam )
		  printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
		          sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
#endif
  	        t = SuperLU_timer_();
	        if ( !(Glu_freeable = (Glu_freeable_t *)
		      SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
		    ABORT("Malloc fails for Glu_freeable.");

	    	/* Every process does this. */
	    	iinfo = symbfact(options, iam, &GAC, perm_c, etree, 
			     	 Glu_persist, Glu_freeable);

	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
	    	if ( iinfo < 0 ) { /* Successful return */
		    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
#if ( PRNTlevel>=1 )
		    if ( !iam ) {
		    	printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
		    	printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
		    	printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
		    	printf("\tint %d, short %d, float %d, double %d\n", 
			       sizeof(int_t), sizeof(short), sizeof(float),
			       sizeof(double));
		    	printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
			   	symb_mem_usage.for_lu*1e-6, 
			   	symb_mem_usage.total*1e-6,
			   	symb_mem_usage.expansions);
		    }
#endif
	    	} else {
		    if ( !iam ) {
		        fprintf(stderr,"symbfact() error returns %d\n",iinfo);
		    	exit(-1);
		    }
	        }
	    } /* end if serial symbolic factorization */
	    else {  /* parallel symbolic factorization */
	    	t = SuperLU_timer_();
	    	flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
				       sizes, fstVtxSep, &Pslu_freeable, 
				       &(grid->comm), &symb_comm,
				       &symb_mem_usage); 
	    	stat->utime[SYMBFAC] = SuperLU_timer_() - t;
	    	if (flinfo > 0) 
	      	    ABORT("Insufficient memory for parallel symbolic factorization.");
	    }
	} /* end if Fact ... */

#if ( PRNTlevel>=1 )
	if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]);
#endif
        if (sizes) SUPERLU_FREE (sizes);
        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
	if (symb_comm != MPI_COMM_NULL)
	  MPI_Comm_free (&symb_comm); 

	if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
  	    /* Apply column permutation to the original distributed A */
	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];

	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
	       NOTE: the row permutation Pc*Pr is applied internally in the
  	       distribution routine. */
	    t = SuperLU_timer_();
	    dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct,
                                      Glu_freeable, LUstruct, grid);
	    stat->utime[DIST] = SuperLU_timer_() - t;

  	    /* Deallocate storage used in symbolic factorization. */
	    if ( Fact != SamePattern_SameRowPerm ) {
	        iinfo = symbfact_SubFree(Glu_freeable);
	        SUPERLU_FREE(Glu_freeable);
	    }
	} else {
	    /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
	       NOTE: the row permutation Pc*Pr is applied internally in the
	       distribution routine. */
	    /* Apply column permutation to the original distributed A */
	    for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];

    	    t = SuperLU_timer_();
	    dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct,
		  			   &Pslu_freeable, LUstruct, grid);
	    if (dist_mem_use > 0)
	        ABORT ("Not enough memory available for dist_psymbtonum\n");
	    stat->utime[DIST] = SuperLU_timer_() - t;
	}

#if ( PRNTlevel>=1 )
	if (!iam) printf ("\tDISTRIBUTE time    %8.2f\n", stat->utime[DIST]);
#endif

	/* Perform numerical factorization in parallel. */
	t = SuperLU_timer_();
	pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
	stat->utime[FACT] = SuperLU_timer_() - t;

#if ( PRNTlevel>=1 )
	{
	    int_t TinyPivots;
	    float for_lu, total, max, avg, temp;
	    dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
	    MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    MPI_Reduce( &num_mem_usage.total, &total,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    temp = SUPERLU_MAX(symb_mem_usage.total,
			       symb_mem_usage.for_lu +
			       (float)dist_mem_use + num_mem_usage.for_lu);
	    if (parSymbFact == TRUE)
	      /* The memory used in the redistribution routine
		 includes the memory used for storing the symbolic
		 structure and the memory allocated for numerical
		 factorization */
	      temp = SUPERLU_MAX(symb_mem_usage.total,
				 (float)dist_mem_use);
	    temp = SUPERLU_MAX(temp, num_mem_usage.total);
	    MPI_Reduce( &temp, &max,
		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
	    MPI_Reduce( &temp, &avg,
		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	    MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
			  MPI_SUM, grid->comm );
	    stat->TinyPivots = TinyPivots;
	    if ( !iam ) {
		printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
		       for_lu*1e-6, total*1e-6);
		printf("\tAll space (MB):"
		       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
		       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
		printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
	    }
	}
#endif
    
        /* Destroy GA */
        if ( Fact != SamePattern_SameRowPerm )
            Destroy_CompCol_Matrix_dist(&GA);
    } /* end if (!factored) */
	
    /* ------------------------------------------------------------
       Compute the solution matrix X.
       ------------------------------------------------------------*/
    if ( nrhs ) {

	if ( !(b_work = doubleMalloc_dist(n)) )
	    ABORT("Malloc fails for b_work[]");

	/* ------------------------------------------------------------
	   Scale the right-hand side if equilibration was performed. 
	   ------------------------------------------------------------*/
	if ( notran ) {
	    if ( rowequ ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    irow = fst_row;
		    for (i = 0; i < m_loc; ++i) {
		        b_col[i] *= R[irow];
		        ++irow;
		    }
		    b_col += ldb;
		}
	    }
	} else if ( colequ ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
	        irow = fst_row;
		for (i = 0; i < m_loc; ++i) {
		    b_col[i] *= C[irow];
		    ++irow;
		}
		b_col += ldb;
	    }
	}

	/* Save a copy of the right-hand side. */
	ldx = ldb;
	if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
	    ABORT("Malloc fails for X[]");
	x_col = X;  b_col = B;
	for (j = 0; j < nrhs; ++j) {
	    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
	    x_col += ldx;  b_col += ldb;
	}

	/* ------------------------------------------------------------
	   Solve the linear system.
	   ------------------------------------------------------------*/
	if ( options->SolveInitialized == NO ) {
	    dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
		       SOLVEstruct);
	}

	pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, 
		fst_row, ldb, nrhs, SOLVEstruct, stat, info);

	/* ------------------------------------------------------------
	   Use iterative refinement to improve the computed solution and
	   compute error bounds and backward error estimates for it.
	   ------------------------------------------------------------*/
	if ( options->IterRefine ) {
	    /* Improve the solution by iterative refinement. */
	    int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
	    SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */

	    t = SuperLU_timer_();
	    if ( options->RefineInitialized == NO || Fact == DOFACT ) {
	        /* All these cases need to re-initialize gsmv structure */
	        if ( options->RefineInitialized )
		    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
	        pdgsmv_init(A, SOLVEstruct->row_to_proc, grid,
			    SOLVEstruct->gsmv_comm);
	       
                /* Save a copy of the transformed local col indices
		   in colind_gsmv[]. */
	        if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
	        if ( !(it = intMalloc_dist(nnz_loc)) )
		    ABORT("Malloc fails for colind_gsmv[]");
	        colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
	        for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
	        options->RefineInitialized = YES;
	    } else if ( Fact == SamePattern ||
			Fact == SamePattern_SameRowPerm ) {
	        double at;
	        int_t k, jcol, p;
	        /* Swap to beginning the part of A corresponding to the
		   local part of X, as was done in pdgsmv_init() */
	        for (i = 0; i < m_loc; ++i) { /* Loop through each row */
		    k = rowptr[i];
		    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
		        jcol = colind[j];
		        p = SOLVEstruct->row_to_proc[jcol];
		        if ( p == iam ) { /* Local */
		            at = a[k]; a[k] = a[j]; a[j] = at;
		            ++k;
		        }
		    }
	        }
	      
	        /* Re-use the local col indices of A obtained from the
		   previous call to pdgsmv_init() */
	        for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
	    }

	    if ( nrhs == 1 ) { /* Use the existing solve structure */
	        SOLVEstruct1 = SOLVEstruct;
	    } else { /* For nrhs > 1, since refinement is performed for RHS
			one at a time, the communication structure for pdgstrs
			is different than the solve with nrhs RHS. 
			So we use SOLVEstruct1 for the refinement step.
		     */
	        if ( !(SOLVEstruct1 = (SOLVEstruct_t *) 
		                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
		    ABORT("Malloc fails for SOLVEstruct1");
	        /* Copy the same stuff */
	        SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
	        SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
	        SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
	        SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
	        SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
	        SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
	        SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
		
		/* Initialize the *gstrs_comm for 1 RHS. */
		if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
		       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
		    ABORT("Malloc fails for gstrs_comm[]");
		pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, 
			     Glu_persist, SOLVEstruct1);
	    }

	    pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
		    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);

            /* Deallocate the storage associated with SOLVEstruct1 */
	    if ( nrhs > 1 ) {
	        pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
	        SUPERLU_FREE(SOLVEstruct1);
	    }

	    stat->utime[REFINE] = SuperLU_timer_() - t;
	}

	/* Permute the solution matrix B <= Pc'*X. */
	pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
			       SOLVEstruct->inv_perm_c,
			       X, ldx, B, ldb, nrhs, grid);
#if ( DEBUGlevel>=2 )
	printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
	for (i = 0; i < m_loc; ++i)
	  printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
#endif
	
	/* Transform the solution matrix X to a solution of the original
	   system before the equilibration. */
	if ( notran ) {
	    if ( colequ ) {
		b_col = B;
		for (j = 0; j < nrhs; ++j) {
		    irow = fst_row;
		    for (i = 0; i < m_loc; ++i) {
		        b_col[i] *= C[irow];
		        ++irow;
		    }
		    b_col += ldb;
		}
	    }
	} else if ( rowequ ) {
	    b_col = B;
	    for (j = 0; j < nrhs; ++j) {
	        irow = fst_row;
		for (i = 0; i < m_loc; ++i) {
		    b_col[i] *= R[irow];
		    ++irow;
		}
		b_col += ldb;
	    }
	}

	SUPERLU_FREE(b_work);
	SUPERLU_FREE(X);

    } /* end if nrhs != 0 */

#if ( PRNTlevel>=1 )
    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
#endif

    /* Deallocate R and/or C if it was not used. */
    if ( Equil && Fact != SamePattern_SameRowPerm ) {
	switch ( ScalePermstruct->DiagScale ) {
	    case NOEQUIL:
	        SUPERLU_FREE(R);
		SUPERLU_FREE(C);
		break;
	    case ROW: 
		SUPERLU_FREE(C);
		break;
	    case COL: 
		SUPERLU_FREE(R);
		break;
	}
    }
    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
 	Destroy_CompCol_Permuted_dist(&GAC);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgssvx()");
#endif

}
예제 #4
0
int main(int argc, char *argv[])
{
    superlu_dist_options_t options;
    SuperLUStat_t stat;
    SuperMatrix A;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    gridinfo_t grid;
    double   *berr;
    doublecomplex   *a, *b, *xtrue;
    int_t    *asub, *xa;
    int_t    m, n, nnz;
    int_t    nprow, npcol;
    int      iam, info, ldb, ldx, nrhs;
    char     trans[1];
    char     **cpp, c;
    FILE *fp, *fopen();
    extern int cpp_defs();

    /* prototypes */
    extern void LUstructInit(const int_t, LUstruct_t *);
    extern void LUstructFree(LUstruct_t *);
    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);

    nprow = 1;  /* Default process rows.      */
    npcol = 1;  /* Default process columns.   */
    nrhs = 1;   /* Number of right-hand side. */

    /* ------------------------------------------------------------
       INITIALIZE MPI ENVIRONMENT. 
       ------------------------------------------------------------*/
    MPI_Init( &argc, &argv );

    /* Parse command line argv[]. */
    for (cpp = argv+1; *cpp; ++cpp) {
	if ( **cpp == '-' ) {
	    c = *(*cpp+1);
	    ++cpp;
	    switch (c) {
	      case 'h':
		  printf("Options:\n");
		  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
		  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
		  exit(0);
		  break;
	      case 'r': nprow = atoi(*cpp);
		        break;
	      case 'c': npcol = atoi(*cpp);
		        break;
	    }
	} else { /* Last arg is considered a filename */
	    if ( !(fp = fopen(*cpp, "r")) ) {
                ABORT("File does not exist");
            }
	    break;
	}
    }

    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID. 
       ------------------------------------------------------------*/
    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);

    /* Bail out if I do not belong in the grid. */
    iam = grid.iam;
    if ( iam >= nprow * npcol )
	goto out;


#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter main()");
#endif
    
    /* ------------------------------------------------------------
       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
       THE OTHER PROCESSES.
       ------------------------------------------------------------*/
    if ( !iam ) {
	/* Print the CPP definitions. */
	cpp_defs();
	
	/* Read the matrix stored on disk in Harwell-Boeing format. */
	zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
	
	printf("Input matrix file: %s\n", *cpp);
	printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);

	/* Broadcast matrix A to the other PEs. */
	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
    } else {
	/* Receive matrix A from PE 0. */
	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );

	/* Allocate storage for compressed column representation. */
	zallocateA_dist(n, nnz, &a, &asub, &xa);

	MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
    }
	
    /* Create compressed column matrix for A. */
    zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
				SLU_NC, SLU_Z, SLU_GE);

    /* Generate the exact solution and compute the right-hand side. */
    if (!(b=doublecomplexMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
    if (!(xtrue=doublecomplexMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
    *trans = 'N';
    ldx = n;
    ldb = m;
    zGenXtrue_dist(n, nrhs, xtrue, ldx);
    zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);

    if ( !(berr = doubleMalloc_dist(nrhs)) )
	ABORT("Malloc fails for berr[].");

    /* ------------------------------------------------------------
       NOW WE SOLVE THE LINEAR SYSTEM.
       ------------------------------------------------------------*/

    /* Set the default input options:
        options.Fact = DOFACT;
        options.Equil = YES;
        options.ColPerm = METIS_AT_PLUS_A;
        options.RowPerm = LargeDiag_MC64;
        options.ReplaceTinyPivot = YES;
        options.Trans = NOTRANS;
        options.IterRefine = DOUBLE;
        options.SolveInitialized = NO;
        options.RefineInitialized = NO;
        options.PrintStat = YES;
     */
    set_default_options_dist(&options);

    if (!iam) {
	print_sp_ienv_dist(&options);
	print_options_dist(&options);
    }

    /* Initialize ScalePermstruct and LUstruct. */
    ScalePermstructInit(m, n, &ScalePermstruct);
    LUstructInit(n, &LUstruct);

    /* Initialize the statistics variables. */
    PStatInit(&stat);

    /* Call the linear equation solver. */
    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
		     &LUstruct, berr, &stat, &info);

    /* Check the accuracy of the solution. */
    if ( !iam ) {
	zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
    }
    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */

    /* ------------------------------------------------------------
       DEALLOCATE STORAGE.
       ------------------------------------------------------------*/
    PStatFree(&stat);
    Destroy_CompCol_Matrix_dist(&A);
    Destroy_LU(n, &grid, &LUstruct);
    ScalePermstructFree(&ScalePermstruct);
    LUstructFree(&LUstruct);
    SUPERLU_FREE(b);
    SUPERLU_FREE(xtrue);
    SUPERLU_FREE(berr);

    /* ------------------------------------------------------------
       RELEASE THE SUPERLU PROCESS GRID.
       ------------------------------------------------------------*/
out:
    superlu_gridexit(&grid);

    /* ------------------------------------------------------------
       TERMINATES THE MPI EXECUTION ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Finalize();

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit main()");
#endif

}
예제 #5
0
int main(int argc, char *argv[])
{
    superlu_dist_options_t options;
    SuperLUStat_t stat;
    SuperMatrix A;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    gridinfo_t grid;
    double   *berr;
    double   *a, *a1, *b, *b1, *xtrue;
    int_t    *asub, *asub1, *xa, *xa1;
    int_t    i, j, m, n, nnz;
    int_t    nprow, npcol;
    int      iam, info, ldb, ldx, nrhs;
    char     trans[1];
    char     **cpp, c;
    FILE *fp, *fopen();
    extern int cpp_defs();

    /* prototypes */
    extern void LUstructInit(const int_t, LUstruct_t *);
    extern void LUstructFree(LUstruct_t *);
    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);

    nprow = 1;  /* Default process rows.      */
    npcol = 1;  /* Default process columns.   */
    nrhs = 1;   /* Number of right-hand side. */

    /* ------------------------------------------------------------
       INITIALIZE MPI ENVIRONMENT. 
       ------------------------------------------------------------*/
    MPI_Init( &argc, &argv );

    /* Parse command line argv[]. */
    for (cpp = argv+1; *cpp; ++cpp) {
	if ( **cpp == '-' ) {
	    c = *(*cpp+1);
	    ++cpp;
	    switch (c) {
	      case 'h':
		  printf("Options:\n");
		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
		  printf("\t-c <int>: process columns (default %d)\n", npcol);
		  exit(0);
		  break;
	      case 'r': nprow = atoi(*cpp);
		        break;
	      case 'c': npcol = atoi(*cpp);
		        break;
	    }
	} else { /* Last arg is considered a filename */
	    if ( !(fp = fopen(*cpp, "r")) ) {
                ABORT("File does not exist");
            }
	    break;
	}
    }

    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID. 
       ------------------------------------------------------------*/
    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);

    /* Bail out if I do not belong in the grid. */
    iam = grid.iam;
    if ( iam >= nprow * npcol )
	goto out;
    
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter main()");
#endif

    /* ------------------------------------------------------------
       Process 0 reads the matrix A, and then broadcasts it to all
       the other processes.
       ------------------------------------------------------------*/
    if ( !iam ) {
	/* Print the CPP definitions. */
	cpp_defs();
	
	/* Read the matrix stored on disk in Harwell-Boeing format. */
	dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
	
	printf("Input matrix file: %s\n", *cpp);
	printf("\tDimension\t%dx%d\t # nonzeros %d\n", m, n, nnz);
	printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);

	/* Broadcast matrix A to the other PEs. */
	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
    } else {
	/* Receive matrix A from PE 0. */
	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );

	/* Allocate storage for compressed column representation. */
	dallocateA_dist(n, nnz, &a, &asub, &xa);

	MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
	MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
	MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
    }
	
    /* Create compressed column matrix for A. */
    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
				SLU_NC, SLU_D, SLU_GE);

    /* Generate the exact solution and compute the right-hand side. */
    if (!(b=doubleMalloc_dist(m * nrhs))) ABORT("Malloc fails for b[]");
    if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
    *trans = 'N';
    ldx = n;
    ldb = m;
    dGenXtrue_dist(n, nrhs, xtrue, ldx);
    dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);

    /* Save a copy of the right-hand side. */  
    if ( !(b1 = doubleMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
    for (j = 0; j < nrhs; ++j)
	for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];
    
    if ( !(berr = doubleMalloc_dist(nrhs)) )
	ABORT("Malloc fails for berr[].");

    /* Save a copy of the matrix A. */
    dallocateA_dist(n, nnz, &a1, &asub1, &xa1);
    for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; }
    for (i = 0; i < n+1; ++i) xa1[i] = xa[i];


    /* ------------------------------------------------------------
       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
       ------------------------------------------------------------*/

    /* Set the default input options:
        options.Fact = DOFACT;
        options.Equil = YES;
        options.ColPerm = METIS_AT_PLUS_A;
        options.RowPerm = LargeDiag;
        options.ReplaceTinyPivot = YES;
        options.Trans = NOTRANS;
        options.IterRefine = DOUBLE;
        options.SolveInitialized = NO;
        options.RefineInitialized = NO;
        options.PrintStat = YES;
     */
    set_default_options_dist(&options);

    if (!iam) {
	print_sp_ienv_dist(&options);
	print_options_dist(&options);
    }

    /* Initialize ScalePermstruct and LUstruct. */
    ScalePermstructInit(m, n, &ScalePermstruct);
    LUstructInit(n, &LUstruct);

    /* Initialize the statistics variables. */
    PStatInit(&stat);

    /* Call the linear equation solver: factorize and solve. */
    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
		     &LUstruct, berr, &stat, &info);

    /* Check the accuracy of the solution. */
    if ( !iam ) {
	dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
    }
    
    PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
    PStatFree(&stat);
    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with 
					the L and U matrices.               */
    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */


    /* ------------------------------------------------------------
       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
       ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
       ------------------------------------------------------------*/
    options.Fact = SamePattern;
    PStatInit(&stat); /* Initialize the statistics variables. */

    /* Create compressed column matrix for A. */
    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1,
				SLU_NC, SLU_D, SLU_GE);

    /* Solve the linear system. */
    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
		     &LUstruct, berr, &stat, &info);

    /* Check the accuracy of the solution. */
    if ( !iam ) {
	printf("Solve the system with the same sparsity pattern.\n");
	dinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
    }

    /* Print the statistics. */
    PStatPrint(&options, &stat, &grid);

    /* ------------------------------------------------------------
       DEALLOCATE STORAGE.
       ------------------------------------------------------------*/
    PStatFree(&stat);
    Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A.     */
    Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with    
					the L and U matrices.               */
    ScalePermstructFree(&ScalePermstruct);
    LUstructFree(&LUstruct);         /* Deallocate the structure of L and U.*/
    SUPERLU_FREE(b1);	             /* Free storage of right-hand side.    */
    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
    SUPERLU_FREE(berr);


    /* ------------------------------------------------------------
       RELEASE THE SUPERLU PROCESS GRID.
       ------------------------------------------------------------*/
out:
    superlu_gridexit(&grid);

    /* ------------------------------------------------------------
       TERMINATES THE MPI EXECUTION ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Finalize();

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit main()");
#endif

}
예제 #6
0
PetscErrorCode MatLUFactorNumeric_SuperLU_DIST(Mat F,Mat A,const MatFactorInfo *info)
{
  Mat              *tseq,A_seq = NULL;
  Mat_SeqAIJ       *aa,*bb;
  Mat_SuperLU_DIST *lu = (Mat_SuperLU_DIST*)(F)->spptr;
  PetscErrorCode   ierr;
  PetscInt         M=A->rmap->N,N=A->cmap->N,i,*ai,*aj,*bi,*bj,nz,rstart,*garray,
                   m=A->rmap->n, colA_start,j,jcol,jB,countA,countB,*bjj,*ajj;
  int              sinfo;   /* SuperLU_Dist info flag is always an int even with long long indices */
  PetscMPIInt      size;
  SuperLUStat_t    stat;
  double           *berr=0;
  IS               isrow;
  Mat              F_diag=NULL;
#if defined(PETSC_USE_COMPLEX)
  doublecomplex    *av, *bv;
#else
  double           *av, *bv;
#endif

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);

  if (lu->MatInputMode == GLOBAL) { /* global mat input */
    if (size > 1) { /* convert mpi A to seq mat A */
      ierr = ISCreateStride(PETSC_COMM_SELF,M,0,1,&isrow);CHKERRQ(ierr);
      ierr = MatGetSubMatrices(A,1,&isrow,&isrow,MAT_INITIAL_MATRIX,&tseq);CHKERRQ(ierr);
      ierr = ISDestroy(&isrow);CHKERRQ(ierr);

      A_seq = *tseq;
      ierr  = PetscFree(tseq);CHKERRQ(ierr);
      aa    = (Mat_SeqAIJ*)A_seq->data;
    } else {
      PetscBool flg;
      ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
      if (flg) {
        Mat_MPIAIJ *At = (Mat_MPIAIJ*)A->data;
        A = At->A;
      }
      aa =  (Mat_SeqAIJ*)A->data;
    }

    /* Convert Petsc NR matrix to SuperLU_DIST NC.
       Note: memories of lu->val, col and row are allocated by CompRow_to_CompCol_dist()! */
    if (lu->options.Fact != DOFACT) {/* successive numeric factorization, sparsity pattern is reused. */
      PetscStackCall("SuperLU_DIST:Destroy_CompCol_Matrix_dist",Destroy_CompCol_Matrix_dist(&lu->A_sup));
      if (lu->FactPattern == SamePattern_SameRowPerm) {
        lu->options.Fact = SamePattern_SameRowPerm; /* matrix has similar numerical values */
      } else { /* lu->FactPattern == SamePattern */
        PetscStackCall("SuperLU_DIST:Destroy_LU",Destroy_LU(N, &lu->grid, &lu->LUstruct));
        lu->options.Fact = SamePattern;
      }
    }
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:zCompRow_to_CompCol_dist",zCompRow_to_CompCol_dist(M,N,aa->nz,(doublecomplex*)aa->a,(int_t*)aa->j,(int_t*)aa->i,&lu->val,&lu->col, &lu->row));
#else
    PetscStackCall("SuperLU_DIST:dCompRow_to_CompCol_dist",dCompRow_to_CompCol_dist(M,N,aa->nz,aa->a,(int_t*)aa->j,(int_t*)aa->i,&lu->val, &lu->col, &lu->row));
#endif

    /* Create compressed column matrix A_sup. */
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:zCreate_CompCol_Matrix_dist",zCreate_CompCol_Matrix_dist(&lu->A_sup, M, N, aa->nz, lu->val, lu->col, lu->row, SLU_NC, SLU_Z, SLU_GE));
#else
    PetscStackCall("SuperLU_DIST:dCreate_CompCol_Matrix_dist",dCreate_CompCol_Matrix_dist(&lu->A_sup, M, N, aa->nz, lu->val, lu->col, lu->row, SLU_NC, SLU_D, SLU_GE));
#endif
  } else { /* distributed mat input */
    Mat_MPIAIJ *mat = (Mat_MPIAIJ*)A->data;
    aa=(Mat_SeqAIJ*)(mat->A)->data;
    bb=(Mat_SeqAIJ*)(mat->B)->data;
    ai=aa->i; aj=aa->j;
    bi=bb->i; bj=bb->j;
#if defined(PETSC_USE_COMPLEX)
    av=(doublecomplex*)aa->a;
    bv=(doublecomplex*)bb->a;
#else
    av=aa->a;
    bv=bb->a;
#endif
    rstart = A->rmap->rstart;
    nz     = aa->nz + bb->nz;
    garray = mat->garray;

    if (lu->options.Fact == DOFACT) { /* first numeric factorization */
#if defined(PETSC_USE_COMPLEX)
      PetscStackCall("SuperLU_DIST:zallocateA_dist",zallocateA_dist(m, nz, &lu->val, &lu->col, &lu->row));
#else
      PetscStackCall("SuperLU_DIST:dallocateA_dist",dallocateA_dist(m, nz, &lu->val, &lu->col, &lu->row));
#endif
    } else { /* successive numeric factorization, sparsity pattern and perm_c are reused. */
      /* Destroy_CompRowLoc_Matrix_dist(&lu->A_sup); */ /* this leads to crash! However, see SuperLU_DIST_2.5/EXAMPLE/pzdrive2.c */
      if (lu->FactPattern == SamePattern_SameRowPerm) {
        lu->options.Fact = SamePattern_SameRowPerm; /* matrix has similar numerical values */
      } else {
        PetscStackCall("SuperLU_DIST:Destroy_LU",Destroy_LU(N, &lu->grid, &lu->LUstruct)); /* Deallocate storage associated with the L and U matrices. */
        lu->options.Fact = SamePattern;
      }
    }
    nz = 0;
    for (i=0; i<m; i++) {
      lu->row[i] = nz;
      countA     = ai[i+1] - ai[i];
      countB     = bi[i+1] - bi[i];
      ajj        = aj + ai[i]; /* ptr to the beginning of this row */
      bjj        = bj + bi[i];

      /* B part, smaller col index */
      colA_start = rstart + ajj[0]; /* the smallest global col index of A */
      jB         = 0;
      for (j=0; j<countB; j++) {
        jcol = garray[bjj[j]];
        if (jcol > colA_start) {
          jB = j;
          break;
        }
        lu->col[nz]   = jcol;
        lu->val[nz++] = *bv++;
        if (j==countB-1) jB = countB;
      }

      /* A part */
      for (j=0; j<countA; j++) {
        lu->col[nz]   = rstart + ajj[j];
        lu->val[nz++] = *av++;
      }

      /* B part, larger col index */
      for (j=jB; j<countB; j++) {
        lu->col[nz]   = garray[bjj[j]];
        lu->val[nz++] = *bv++;
      }
    }
    lu->row[m] = nz;
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:zCreate_CompRowLoc_Matrix_dist",zCreate_CompRowLoc_Matrix_dist(&lu->A_sup, M, N, nz, m, rstart,lu->val, lu->col, lu->row, SLU_NR_loc, SLU_Z, SLU_GE));
#else
    PetscStackCall("SuperLU_DIST:dCreate_CompRowLoc_Matrix_dist",dCreate_CompRowLoc_Matrix_dist(&lu->A_sup, M, N, nz, m, rstart,lu->val, lu->col, lu->row, SLU_NR_loc, SLU_D, SLU_GE));
#endif
  }

  /* Factor the matrix. */
  PetscStackCall("SuperLU_DIST:PStatInit",PStatInit(&stat));   /* Initialize the statistics variables. */
  if (lu->MatInputMode == GLOBAL) { /* global mat input */
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:pzgssvx_ABglobal",pzgssvx_ABglobal(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, M, 0,&lu->grid, &lu->LUstruct, berr, &stat, &sinfo));
#else
    PetscStackCall("SuperLU_DIST:pdgssvx_ABglobal",pdgssvx_ABglobal(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, M, 0,&lu->grid, &lu->LUstruct, berr, &stat, &sinfo));
#endif
  } else { /* distributed mat input */
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:pzgssvx",pzgssvx(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, m, 0, &lu->grid,&lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &sinfo));
    if (sinfo) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pzgssvx fails, info: %d\n",sinfo);
#else
    PetscStackCall("SuperLU_DIST:pdgssvx",pdgssvx(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, m, 0, &lu->grid,&lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &sinfo));
    if (sinfo) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pdgssvx fails, info: %d\n",sinfo);
#endif
  }

  if (lu->MatInputMode == GLOBAL && size > 1) {
    ierr = MatDestroy(&A_seq);CHKERRQ(ierr);
  }

  if (lu->options.PrintStat) {
    PStatPrint(&lu->options, &stat, &lu->grid);  /* Print the statistics. */
  }
  PStatFree(&stat);
  if (size > 1) {
    F_diag            = ((Mat_MPIAIJ*)(F)->data)->A;
    F_diag->assembled = PETSC_TRUE;
  }
  (F)->assembled    = PETSC_TRUE;
  (F)->preallocated = PETSC_TRUE;
  lu->options.Fact  = FACTORED; /* The factored form of A is supplied. Local option used by this func. only */
  PetscFunctionReturn(0);
}
예제 #7
0
int zcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz,
			doublecomplex *nzval_g, int_t *rowind_g, int_t *colptr_g,
			gridinfo_t *grid)
{
    SuperMatrix GA;              /* global A */
    int_t    *rowind, *colptr;	 /* global */
    doublecomplex   *nzval;             /* global */
    doublecomplex   *nzval_loc;         /* local */
    int_t    *colind, *rowptr;	 /* local */
    int_t    m_loc, fst_row, nnz_loc;
    int_t    m_loc_fst; /* Record m_loc of the first p-1 processors,
			   when mod(m, p) is not zero. */ 
    int_t    iam, row, col, i, j, relpos;
    char     trans[1];
    int_t    *marker;

    iam = grid->iam;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter zcreate_dist_matrix()");
#endif
 
    if ( !iam ) {

        /* Allocate storage for compressed column representation. */
        zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);

	/* Copy the global matrix. */
#if 0
	/* and ADJUST to 0-based indexing 
           which is required by the C routines.*/
#endif
        for(i=0; i<nnz; i++){
	  nzval[i]=nzval_g[i];
	  rowind[i]=rowind_g[i]; /* - 1;*/
        }
        for(i=0; i<n+1; i++)
	  colptr[i]=colptr_g[i]; /* - 1;*/


	/* Broadcast matrix A to the other PEs. */
	MPI_Bcast( &m,     1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( &n,     1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( nzval,  nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
	MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid->comm );
	MPI_Bcast( colptr, n+1, mpi_int_t,  0, grid->comm );
    } else {
	/* Receive matrix A from PE 0. */
	MPI_Bcast( &m,   1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( &n,   1,   mpi_int_t,  0, grid->comm );
	MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid->comm );

	/* Allocate storage for compressed column representation. */
	zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);

	MPI_Bcast( nzval,   nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
	MPI_Bcast( rowind,  nnz, mpi_int_t,  0, grid->comm );
	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
    }

#if 0
    nzval[0]=0.1;
#endif

    /* Compute the number of rows to be distributed to local process */
    m_loc = m / (grid->nprow * grid->npcol); 
    m_loc_fst = m_loc;
    /* When m / procs is not an integer */
    if ((m_loc * grid->nprow * grid->npcol) != m) {
      m_loc = m_loc+1;
      m_loc_fst = m_loc;
      if (iam == (grid->nprow * grid->npcol - 1)) 
	m_loc = m - m_loc_fst * (grid->nprow * grid->npcol - 1);
    }

    /* Create compressed column matrix for GA. */
    zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
				SLU_NC, SLU_Z, SLU_GE);


    /*************************************************
     * Change GA to a local A with NR_loc format     *
     *************************************************/

    rowptr = (int_t *) intMalloc_dist(m_loc+1);
    marker = (int_t *) intCalloc_dist(n);

    /* Get counts of each row of GA */
    for (i = 0; i < n; ++i)
      for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];
    /* Set up row pointers */
    rowptr[0] = 0;
    fst_row = iam * m_loc_fst;
    nnz_loc = 0;
    for (j = 0; j < m_loc; ++j) {
      row = fst_row + j;
      rowptr[j+1] = rowptr[j] + marker[row];
      marker[j] = rowptr[j];
    }
    nnz_loc = rowptr[m_loc];

    nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc);
    colind = (int_t *) intMalloc_dist(nnz_loc);

    /* Transfer the matrix into the compressed row storage */
    for (i = 0; i < n; ++i) {
      for (j = colptr[i]; j < colptr[i+1]; ++j) {
	row = rowind[j];
	if ( (row>=fst_row) && (row<fst_row+m_loc) ) {
	  row = row - fst_row;
	  relpos = marker[row];
	  colind[relpos] = i;
	  nzval_loc[relpos] = nzval[j];
	  ++marker[row];
	}
      }
    }

#if ( DEBUGlevel>=1 )
    if ( !iam ) dPrint_CompCol_Matrix_dist(&GA);
#endif   


    /* Destroy GA */
    Destroy_CompCol_Matrix_dist(&GA);


    /******************************************************/
    /* Change GA to a local A with NR_loc format */
    /******************************************************/

    /* Set up the local A in NR_loc format */
    zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
				   nzval_loc, colind, rowptr,
				   SLU_NR_loc, SLU_Z, SLU_GE);
    
    SUPERLU_FREE(marker);

#if ( DEBUGlevel>=1 )
    printf("sizeof(NRforamt_loc) %d\n", sizeof(NRformat_loc));
    CHECK_MALLOC(iam, "Exit dcreate_dist_matrix()");
#endif
    return 0;
}