PetscErrorCode MatDestroy_SuperLU_DIST(Mat A)
{
  PetscErrorCode   ierr;
  Mat_SuperLU_DIST *lu = (Mat_SuperLU_DIST*)A->spptr;
  PetscBool        flg;

  PetscFunctionBegin;
  if (lu && lu->CleanUpSuperLU_Dist) {
    /* Deallocate SuperLU_DIST storage */
    if (lu->MatInputMode == GLOBAL) {
      Destroy_CompCol_Matrix_dist(&lu->A_sup);
    } else {
      Destroy_CompRowLoc_Matrix_dist(&lu->A_sup);
      if ( lu->options.SolveInitialized ) {
#if defined(PETSC_USE_COMPLEX)
        zSolveFinalize(&lu->options, &lu->SOLVEstruct);
#else
        dSolveFinalize(&lu->options, &lu->SOLVEstruct);
#endif
      }
    }
    Destroy_LU(A->cmap->N, &lu->grid, &lu->LUstruct);
    ScalePermstructFree(&lu->ScalePermstruct);
    LUstructFree(&lu->LUstruct);

    /* Release the SuperLU_DIST process grid. */
    superlu_gridexit(&lu->grid);
    ierr = MPI_Comm_free(&(lu->comm_superlu));CHKERRQ(ierr);
  }
  ierr = PetscFree(A->spptr);CHKERRQ(ierr);

  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
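For context, PETSc reaches this destructor through the matrix's function table: when the SuperLU_DIST interface takes over an AIJ matrix, it stores its context in A->spptr and repoints ops->destroy, so a later MatDestroy(A) lands in the routine above. A minimal sketch of that wiring (the wrapper name here is hypothetical, and the allocation of the Mat_SuperLU_DIST context is elided):

/* Sketch only: install the SuperLU_DIST destructor on a PETSc Mat.
   The wrapper name is hypothetical; 'lu' is assumed to be an already
   allocated Mat_SuperLU_DIST context. */
static PetscErrorCode MatWrapWithSuperLU_DIST(Mat A, Mat_SuperLU_DIST *lu)
{
  PetscFunctionBegin;
  A->spptr        = (void*)lu;               /* stash the solver context   */
  A->ops->destroy = MatDestroy_SuperLU_DIST; /* MatDestroy(A) now cleans up
                                                the SuperLU_DIST state too */
  PetscFunctionReturn(0);
}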
#include <math.h>
#include "superlu_ddefs.h"

int main(int argc, char *argv[])
{
    superlu_dist_options_t options;
    SuperLUStat_t stat;
    SuperMatrix A;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    gridinfo_t grid1, grid2;
    double *berr;
    double *a, *b, *xtrue;
    int_t  *asub, *xa;
    int_t  i, j, m, n, nnz;
    int_t  nprow, npcol, ldumap, p;
    int_t  usermap[6];
    int    iam, info, ldb, ldx, nprocs;
    int    nrhs = 1;   /* Number of right-hand sides. */
    char   trans[1];
    char   **cpp, c;
    FILE *fp, *fopen();

    /* prototypes */
    extern void LUstructInit(const int_t, LUstruct_t *);
    extern void LUstructFree(LUstruct_t *);
    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);

    /* ------------------------------------------------------------
       INITIALIZE MPI ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
    if ( nprocs < 10 ) {
        fprintf(stderr, "Requires at least 10 processes\n");
        exit(-1);
    }

    nprow = 2;   /* Defaults reported by -h; the two grids below are
                    hard-wired to 2 X 3 and 2 X 2 regardless. */
    npcol = 3;

    /* Parse command line argv[]. */
    for (cpp = argv+1; *cpp; ++cpp) {
        if ( **cpp == '-' ) {
            c = *(*cpp+1);
            ++cpp;
            switch (c) {
              case 'h':
                  printf("Options:\n");
                  printf("\t-r <int>: process rows    (default %d)\n", nprow);
                  printf("\t-c <int>: process columns (default %d)\n", npcol);
                  exit(0);
                  break;
              case 'r': nprow = atoi(*cpp);
                        break;
              case 'c': npcol = atoi(*cpp);
                        break;
            }
        } else { /* Last arg is considered a filename */
            if ( !(fp = fopen(*cpp, "r")) ) {
                ABORT("File does not exist");
            }
            break;
        }
    }

    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID 1.
       ------------------------------------------------------------*/
    nprow = 2;
    npcol = 3;
    ldumap = 2;
    p = 0;   /* Grid 1 starts from process 0. */
    for (i = 0; i < nprow; ++i)
        for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1);

    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID 2.
       ------------------------------------------------------------*/
    nprow = 2;
    npcol = 2;
    ldumap = 2;
    p = 6;   /* Grid 2 starts from process 6. */
    for (i = 0; i < nprow; ++i)
        for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
    superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2);

    /* Bail out if I do not belong in any of the 2 grids. */
    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
    if ( iam >= 10 ) goto out;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter main()");
#endif

    if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */
        iam = grid1.iam;  /* Get the logical number in the new grid. */

        /* ------------------------------------------------------------
           PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
           THE OTHER PROCESSES.
           ------------------------------------------------------------*/
        if ( !iam ) {
            /* Read the matrix stored on disk in Harwell-Boeing format. */
            dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);

            printf("\tDimension\t%dx%d\t # nonzeros %d\n", m, n, nnz);
            printf("\tProcess grid\t%d X %d\n",
                   (int) grid1.nprow, (int) grid1.npcol);

            /* Broadcast matrix A to the other PEs. */
            MPI_Bcast( &m,   1,   mpi_int_t,  0, grid1.comm );
            MPI_Bcast( &n,   1,   mpi_int_t,  0, grid1.comm );
            MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid1.comm );
            MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid1.comm );
            MPI_Bcast( asub, nnz, mpi_int_t,  0, grid1.comm );
            MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid1.comm );
        } else {
            /* Receive matrix A from PE 0. */
            MPI_Bcast( &m,   1,   mpi_int_t,  0, grid1.comm );
            MPI_Bcast( &n,   1,   mpi_int_t,  0, grid1.comm );
            MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid1.comm );

            /* Allocate storage for compressed column representation. */
            dallocateA_dist(n, nnz, &a, &asub, &xa);

            MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid1.comm );
            MPI_Bcast( asub, nnz, mpi_int_t,  0, grid1.comm );
            MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid1.comm );
        }

        /* Create compressed column matrix for A. */
        dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
                                    SLU_NC, SLU_D, SLU_GE);

        /* Generate the exact solution and compute the right-hand side. */
        if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
        if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
        *trans = 'N';
        ldx = n;
        ldb = m;
        dGenXtrue_dist(n, nrhs, xtrue, ldx);
        dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);

        if ( !(berr = doubleMalloc_dist(nrhs)) )
            ABORT("Malloc fails for berr[].");

        /* ------------------------------------------------------------
           NOW WE SOLVE THE LINEAR SYSTEM.
           ------------------------------------------------------------*/
        /* Set the default input options:
             options.Fact              = DOFACT;
             options.Equil             = YES;
             options.ColPerm           = METIS_AT_PLUS_A;
             options.RowPerm           = LargeDiag;
             options.ReplaceTinyPivot  = YES;
             options.Trans             = NOTRANS;
             options.IterRefine        = DOUBLE;
             options.SolveInitialized  = NO;
             options.RefineInitialized = NO;
             options.PrintStat         = YES;
         */
        set_default_options_dist(&options);

        if (!iam) {
            print_sp_ienv_dist(&options);
            print_options_dist(&options);
        }

        /* Initialize ScalePermstruct and LUstruct. */
        ScalePermstructInit(m, n, &ScalePermstruct);
        LUstructInit(n, &LUstruct);

        /* Initialize the statistics variables. */
        PStatInit(&stat);

        /* Call the linear equation solver: factorize and solve. */
        pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs,
                         &grid1, &LUstruct, berr, &stat, &info);

        /* Check the accuracy of the solution. */
        if ( !iam ) {
            dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid1);
        }

        /* Print the statistics. */
        PStatPrint(&options, &stat, &grid1);

        /* ------------------------------------------------------------
           DEALLOCATE STORAGE.
           ------------------------------------------------------------*/
        PStatFree(&stat);
        Destroy_CompCol_Matrix_dist(&A);
        Destroy_LU(n, &grid1, &LUstruct);
        ScalePermstructFree(&ScalePermstruct);
        LUstructFree(&LUstruct);
        SUPERLU_FREE(b);
        SUPERLU_FREE(xtrue);
        SUPERLU_FREE(berr);

    } else { /* I am in grid 2. */
        iam = grid2.iam;  /* Get the logical number in the new grid. */

        /* ------------------------------------------------------------
           PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
           THE OTHER PROCESSES.
           ------------------------------------------------------------*/
        if ( !iam ) {
            /* Read the matrix stored on disk in Harwell-Boeing format. */
            dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);

            printf("\tDimension\t%dx%d\t # nonzeros %d\n", m, n, nnz);
            printf("\tProcess grid\t%d X %d\n",
                   (int) grid2.nprow, (int) grid2.npcol);

            /* Broadcast matrix A to the other PEs. */
            MPI_Bcast( &m,   1,   mpi_int_t,  0, grid2.comm );
            MPI_Bcast( &n,   1,   mpi_int_t,  0, grid2.comm );
            MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid2.comm );
            MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid2.comm );
            MPI_Bcast( asub, nnz, mpi_int_t,  0, grid2.comm );
            MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid2.comm );
        } else {
            /* Receive matrix A from PE 0. */
            MPI_Bcast( &m,   1,   mpi_int_t,  0, grid2.comm );
            MPI_Bcast( &n,   1,   mpi_int_t,  0, grid2.comm );
            MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid2.comm );

            /* Allocate storage for compressed column representation. */
            dallocateA_dist(n, nnz, &a, &asub, &xa);

            MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid2.comm );
            MPI_Bcast( asub, nnz, mpi_int_t,  0, grid2.comm );
            MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid2.comm );
        }

        /* Create compressed column matrix for A. */
        dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
                                    SLU_NC, SLU_D, SLU_GE);

        /* Generate the exact solution and compute the right-hand side. */
        if (!(b=doubleMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
        if (!(xtrue=doubleMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
        *trans = 'N';
        ldx = n;
        ldb = m;
        dGenXtrue_dist(n, nrhs, xtrue, ldx);
        dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);

        if ( !(berr = doubleMalloc_dist(nrhs)) )
            ABORT("Malloc fails for berr[].");

        /* ------------------------------------------------------------
           NOW WE SOLVE THE LINEAR SYSTEM.
           ------------------------------------------------------------*/
        /* Set the default input options:
             options.Fact              = DOFACT;
             options.Equil             = YES;
             options.ColPerm           = MMD_AT_PLUS_A;
             options.RowPerm           = LargeDiag;
             options.ReplaceTinyPivot  = YES;
             options.Trans             = NOTRANS;
             options.IterRefine        = DOUBLE;
             options.SolveInitialized  = NO;
             options.RefineInitialized = NO;
             options.PrintStat         = YES;
         */
        set_default_options_dist(&options);

        /* Initialize ScalePermstruct and LUstruct. */
        ScalePermstructInit(m, n, &ScalePermstruct);
        LUstructInit(n, &LUstruct);

        /* Initialize the statistics variables. */
        PStatInit(&stat);

        /* Call the linear equation solver: factorize and solve. */
        pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs,
                         &grid2, &LUstruct, berr, &stat, &info);

        /* Check the accuracy of the solution. */
        if ( !iam ) {
            dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid2);
        }

        /* Print the statistics. */
        PStatPrint(&options, &stat, &grid2);

        /* ------------------------------------------------------------
           DEALLOCATE STORAGE.
           ------------------------------------------------------------*/
        PStatFree(&stat);
        Destroy_CompCol_Matrix_dist(&A);
        Destroy_LU(n, &grid2, &LUstruct);
        ScalePermstructFree(&ScalePermstruct);
        LUstructFree(&LUstruct);
        SUPERLU_FREE(b);
        SUPERLU_FREE(xtrue);
        SUPERLU_FREE(berr);
    }

    /* ------------------------------------------------------------
       RELEASE THE SUPERLU PROCESS GRIDS.
       ------------------------------------------------------------*/
    superlu_gridexit(&grid1);
    superlu_gridexit(&grid2);

out:
    /* ------------------------------------------------------------
       TERMINATE THE MPI EXECUTION ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Finalize();

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit main()");
#endif
}
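Note how the usermap arrays above are laid out: superlu_gridmap reads usermap column by column with leading dimension ldumap, so usermap[i + j*ldumap] is the MPI rank placed at row i, column j of the process grid. Written out explicitly for grid 1 (2 X 3, ranks 0-5), the nested loops above are equivalent to:

/* Equivalent explicit initialization of grid 1's usermap (2 X 3, ldumap = 2).
 * Grid position (i,j) gets rank usermap[i + j*ldumap]:
 *
 *          col 0  col 1  col 2
 *  row 0:    0      1      2
 *  row 1:    3      4      5
 */
int_t ldumap = 2;
int_t usermap[6] = { 0, 3,    /* column 0: rows 0 and 1 */
                     1, 4,    /* column 1 */
                     2, 5 };  /* column 2 */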
void
pdgssvx(superlu_options_t *options, SuperMatrix *A,
        ScalePermstruct_t *ScalePermstruct,
        double B[], int ldb, int nrhs, gridinfo_t *grid,
        LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
        SuperLUStat_t *stat, int *info)
{
/*
 * -- Distributed SuperLU routine (version 2.2) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * November 1, 2007
 * February 20, 2008
 *
 * Purpose
 * =======
 *
 * PDGSSVX solves a system of linear equations A*X=B,
 * by using Gaussian elimination with "static pivoting" to
 * compute the LU factorization of A.
 *
 * Static pivoting is a technique that combines the numerical stability
 * of partial pivoting with the scalability of Cholesky (no pivoting),
 * to run accurately and efficiently on large numbers of processors.
 * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
 * description of the parallel algorithms.
 *
 * The input matrices A and B are distributed by block rows.
 * Here is a graphical illustration (0-based indexing):
 *
 *                       A                 B
 *              0 ---------------       ------
 *                  |           |        |  |
 *                  |           |   P0   |  |
 *                  |           |        |  |
 *                ---------------       ------
 *       - fst_row->|           |        |  |
 *       |          |           |        |  |
 *      m_loc       |           |   P1   |  |
 *       |          |           |        |  |
 *       -          |           |        |  |
 *                ---------------       ------
 *                  |    .      |        |. |
 *                  |    .      |        |. |
 *                  |    .      |        |. |
 *                ---------------       ------
 *
 * where fst_row is the row number of the first row, and m_loc is the
 * number of rows local to this processor. These are defined in the
 * 'SuperMatrix' structure, see supermatrix.h.
 *
 * Here are the options for using this code:
 *
 *   1. Independent of all the other options specified below, the
 *      user must supply
 *
 *      -  B, the matrix of right-hand sides, distributed by block rows,
 *            and its dimensions ldb (local) and nrhs (global)
 *      -  grid, a structure describing the 2D processor mesh
 *      -  options->IterRefine, which determines whether or not to
 *            improve the accuracy of the computed solution using
 *            iterative refinement
 *
 *      On output, B is overwritten with the solution X.
 *
 *   2. Depending on options->Fact, the user has four options
 *      for solving A*X=B. The standard option is for factoring
 *      A "from scratch". (The other options, described below,
 *      are used when A is sufficiently similar to a previously
 *      solved problem to save time by reusing part or all of
 *      the previous factorization.)
 *
 *      -  options->Fact = DOFACT: A is factored "from scratch"
 *
 *      In this case the user must also supply
 *
 *        o  A, the input matrix
 *
 *        as well as the following options to determine what matrix to
 *        factorize.
 *
 *        o  options->Equil,   to specify how to scale the rows and columns
 *                             of A to "equilibrate" it (to try to reduce its
 *                             condition number and so improve the
 *                             accuracy of the computed solution)
 *
 *        o  options->RowPerm, to specify how to permute the rows of A
 *                             (typically to control numerical stability)
 *
 *        o  options->ColPerm, to specify how to permute the columns of A
 *                             (typically to control fill-in and enhance
 *                             parallelism during factorization)
 *
 *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
 *                             pivots encountered during factorization
 *                             (to control numerical stability)
 *
 *      The outputs returned include
 *
 *        o  ScalePermstruct, modified to describe how the input matrix A
 *                            was equilibrated and permuted:
 *           .  ScalePermstruct->DiagScale, indicates whether the rows and/or
 *                                          columns of A were scaled
 *           .  ScalePermstruct->R, array of row scale factors
 *           .  ScalePermstruct->C, array of column scale factors
 *           .  ScalePermstruct->perm_r, row permutation vector
 *           .  ScalePermstruct->perm_c, column permutation vector
 *
 *           (part of ScalePermstruct may also need to be supplied on input,
 *           depending on options->RowPerm and options->ColPerm as described
 *           later).
 *
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix diag(R)*A*diag(C)*Pc^T, where
 *                 Pc is the column permutation matrix determined by
 *                    ScalePermstruct->perm_c
 *                 diag(R) and diag(C) are diagonal scaling matrices
 *                    determined by ScalePermstruct->DiagScale,
 *                    ScalePermstruct->R and ScalePermstruct->C
 *
 *        o  LUstruct, which contains the L and U factorization of A1 where
 *
 *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
 *
 *           (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
 *            in A on output.)
 *
 *   3. The second value of options->Fact assumes that a matrix with the same
 *      sparsity pattern as A has already been factored:
 *
 *      -  options->Fact = SamePattern: A is factored, assuming that it has
 *            the same nonzero pattern as a previously factored matrix. In
 *            this case the algorithm saves time by reusing the previously
 *            computed column permutation vector stored in
 *            ScalePermstruct->perm_c and the "elimination tree" of A
 *            stored in LUstruct->etree
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *        o  options->Equil
 *        o  options->RowPerm
 *        o  options->ReplaceTinyPivot
 *
 *      but not options->ColPerm, whose value is ignored. This is because the
 *      previous column permutation from ScalePermstruct->perm_c is used as
 *      input. The user must also supply
 *
 *        o  A, the input matrix
 *        o  ScalePermstruct->perm_c, the column permutation
 *        o  LUstruct->etree, the elimination tree
 *
 *      The outputs returned include
 *
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix as described above
 *        o  ScalePermstruct, modified to describe how the input matrix A was
 *                            equilibrated and row permuted
 *        o  LUstruct, modified to contain the new L and U factors
 *
 *   4. The third value of options->Fact assumes that a matrix B with the same
 *      sparsity pattern as A has already been factored, and where the
 *      row permutation of B can be reused for A. This is useful when A and B
 *      have similar numerical values, so that the same row permutation
 *      will make both factorizations numerically stable. This lets us reuse
 *      all of the previously computed structure of L and U.
 *
 *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
 *            assuming not only the same nonzero pattern as the previously
 *            factored matrix B, but reusing B's row permutation.
 *
 *      In this case the user must still specify the following options
 *      as before:
 *
 *        o  options->Equil
 *        o  options->ReplaceTinyPivot
 *
 *      but not options->RowPerm or options->ColPerm, whose values are
 *      ignored. This is because the permutations from ScalePermstruct->perm_r
 *      and ScalePermstruct->perm_c are used as input.
 *
 *      The user must also supply
 *
 *        o  A, the input matrix
 *        o  ScalePermstruct->DiagScale, how the previous matrix was row
 *                                       and/or column scaled
 *        o  ScalePermstruct->R, the row scalings of the previous matrix,
 *                               if any
 *        o  ScalePermstruct->C, the column scalings of the previous matrix,
 *                               if any
 *        o  ScalePermstruct->perm_r, the row permutation of the previous
 *                                    matrix
 *        o  ScalePermstruct->perm_c, the column permutation of the previous
 *                                    matrix
 *        o  all of LUstruct, the previously computed information about
 *                            L and U (the actual numerical values of L and U
 *                            stored in LUstruct->Llu are ignored)
 *
 *      The outputs returned include
 *
 *        o  A, the input matrix A overwritten by the scaled and permuted
 *              matrix as described above
 *        o  ScalePermstruct, modified to describe how the input matrix A was
 *                            equilibrated (thus ScalePermstruct->DiagScale,
 *                            R and C may be modified)
 *        o  LUstruct, modified to contain the new L and U factors
 *
 *   5. The fourth and last value of options->Fact assumes that A is
 *      identical to a matrix that has already been factored on a previous
 *      call, and reuses its entire LU factorization
 *
 *      -  options->Fact = FACTORED: A is identical to a previously
 *            factorized matrix, so the entire previous factorization
 *            can be reused.
 *
 *      In this case all the other options mentioned above are ignored
 *      (options->Equil, options->RowPerm, options->ColPerm,
 *       options->ReplaceTinyPivot)
 *
 *      The user must also supply
 *
 *        o  A, the unfactored matrix, only in the case that iterative
 *              refinement is to be done (specifically A must be the output
 *              A from the previous call, so that it has been scaled and
 *              permuted)
 *        o  all of ScalePermstruct
 *        o  all of LUstruct, including the actual numerical values of
 *           L and U
 *
 *      all of which are unmodified on output.
 *
 * Arguments
 * =========
 *
 * options (input) superlu_options_t* (global)
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *         The following fields should be defined for this structure:
 *
 *         o Fact (fact_t)
 *           Specifies whether or not the factored form of the matrix
 *           A is supplied on entry, and if not, how the matrix A should
 *           be factorized based on the previous history.
 *
 *           = DOFACT: The matrix A will be factorized from scratch.
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or
 *                              permuted)
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *
 *           = SamePattern: the matrix A will be factorized assuming
 *             that a factorization of a matrix with the same sparsity
 *             pattern was performed prior to this one. Therefore, this
 *             factorization will reuse column permutation vector
 *             ScalePermstruct->perm_c and the elimination tree
 *             LUstruct->etree.
 *                 Inputs:  A
 *                          options->Equil, RowPerm, ReplaceTinyPivot
 *                          ScalePermstruct->perm_c
 *                          LUstruct->etree
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or
 *                              permuted)
 *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
 *                          rest of LUstruct (GLU_persist, Llu)
 *
 *           = SamePattern_SameRowPerm: the matrix A will be factorized
 *             assuming that a factorization of a matrix with the same
 *             sparsity pattern and similar numerical values was performed
 *             prior to this one. Therefore, this factorization will reuse
 *             both row and column scaling factors R and C, both row and
 *             column permutation vectors perm_r and perm_c, and the
 *             distributed data structure set up from the previous symbolic
 *             factorization.
 *                 Inputs:  A
 *                          options->Equil, ReplaceTinyPivot
 *                          all of ScalePermstruct
 *                          all of LUstruct
 *                 Outputs: modified A
 *                             (possibly row and/or column scaled and/or
 *                              permuted)
 *                          modified LUstruct->Llu
 *
 *           = FACTORED: the matrix A is already factored.
 *                 Inputs:  all of ScalePermstruct
 *                          all of LUstruct
 *
 *         o Equil (yes_no_t)
 *           Specifies whether to equilibrate the system.
 *           = NO:  no equilibration.
 *           = YES: scaling factors are computed to equilibrate the system:
 *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
 *                  Whether or not the system will be equilibrated depends
 *                  on the scaling of the matrix A, but if equilibration is
 *                  used, A is overwritten by diag(R)*A*diag(C) and B by
 *                  diag(R)*B.
 *
 *         o RowPerm (rowperm_t)
 *           Specifies how to permute rows of the matrix A.
 *           = NATURAL:   use the natural ordering.
 *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
 *                        the original matrix to make the diagonal large
 *                        relative to the off-diagonal.
 *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
 *                        input by the user.
 *
 *         o ColPerm (colperm_t)
 *           Specifies what type of column permutation to use to reduce fill.
 *           = NATURAL:       natural ordering.
 *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
 *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
 *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
 *
 *         o ReplaceTinyPivot (yes_no_t)
 *           = NO:  do not modify pivots.
 *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
 *                  LU factorization.
 *
 *         o IterRefine (IterRefine_t)
 *           Specifies how to perform iterative refinement.
 *           = NO:     no iterative refinement.
 *           = DOUBLE: accumulate residual in double precision.
 *           = EXTRA:  accumulate residual in extra precision.
 *
 *         NOTE: all options must be identical on all processes when
 *               calling this routine.
 *
 * A (input/output) SuperMatrix* (local)
 *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
 *         The number of linear equations is A->nrow. The type of A must be:
 *         Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
 *         That is, A is stored in distributed compressed row format.
 *         See supermatrix.h for the definition of 'SuperMatrix'.
 *         This routine only handles square A; however, the LU factorization
 *         routine PDGSTRF can factorize rectangular matrices.
 *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
 *         depending on ScalePermstruct->DiagScale and options->ColPerm:
 *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
 *                diag(R)*A*diag(C).
 *             if options->ColPerm != NATURAL, A is further overwritten by
 *                diag(R)*A*diag(C)*Pc^T.
 *         If all the above conditions are true, the LU decomposition is
 *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
 *
 * ScalePermstruct (input/output) ScalePermstruct_t* (global)
 *         The data structure to store the scaling and permutation vectors
 *         describing the transformations performed to the matrix A.
 *         It contains the following fields:
 *
 *         o DiagScale (DiagScale_t)
 *           Specifies the form of equilibration that was done.
 *           = NOEQUIL: no equilibration.
 *           = ROW:     row equilibration, i.e., A was premultiplied by
 *                      diag(R).
 *           = COL:     column equilibration, i.e., A was postmultiplied
 *                      by diag(C).
 *           = BOTH:    both row and column equilibration, i.e., A was
 *                      replaced by diag(R)*A*diag(C).
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
 *           DiagScale is an input argument; otherwise it is an output
 *           argument.
 *
 *         o perm_r (int*)
 *           Row permutation vector, which defines the permutation matrix Pr;
 *           perm_r[i] = j means row i of A is in position j in Pr*A.
 *           If options->RowPerm = MY_PERMR, or
 *           options->Fact = SamePattern_SameRowPerm, perm_r is an
 *           input argument; otherwise it is an output argument.
 *
 *         o perm_c (int*)
 *           Column permutation vector, which defines the
 *           permutation matrix Pc; perm_c[i] = j means column i of A is
 *           in position j in A*Pc.
 *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
 *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
 *           input argument; otherwise, it is an output argument.
 *           On exit, perm_c may be overwritten by the product of the input
 *           perm_c and a permutation that postorders the elimination tree
 *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
 *           is already in postorder.
 *
 *         o R (double*) dimension (A->nrow)
 *           The row scale factors for A.
 *           If DiagScale = ROW or BOTH, A is multiplied on the left by
 *                          diag(R).
 *           If DiagScale = NOEQUIL or COL, R is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
 *           an input argument; otherwise, R is an output argument.
 *
 *         o C (double*) dimension (A->ncol)
 *           The column scale factors for A.
 *           If DiagScale = COL or BOTH, A is multiplied on the right by
 *                          diag(C).
 *           If DiagScale = NOEQUIL or ROW, C is not defined.
 *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
 *           an input argument; otherwise, C is an output argument.
 *
 * B       (input/output) double* (local)
 *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
 *         where m_loc is the number of rows stored locally on my
 *         process and is defined in the data structure of matrix A.
 *         On exit, the solution matrix if info = 0.
 *
 * ldb     (input) int (local)
 *         The leading dimension of matrix B.
 *
 * nrhs    (input) int (global)
 *         The number of right-hand sides.
 *         If nrhs = 0, only LU decomposition is performed, and the forward
 *         and back substitutions are skipped.
 *
 * grid    (input) gridinfo_t* (global)
 *         The 2D process mesh. It contains the MPI communicator, the number
 *         of process rows (NPROW), the number of process columns (NPCOL),
 *         and my process rank. It is an input argument to all the
 *         parallel routines.
 *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
 *
 * LUstruct (input/output) LUstruct_t*
 *         The data structures to store the distributed L and U factors.
 *         It contains the following fields:
 *
 *         o etree (int*) dimension (A->ncol) (global)
 *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
 *           It is computed in sp_colorder() during the first factorization,
 *           and is reused in the subsequent factorizations of the matrices
 *           with the same nonzero pattern.
 *           On exit of sp_colorder(), the columns of A are permuted so that
 *           the etree is in a certain postorder. This postorder is reflected
 *           in ScalePermstruct->perm_c.
 *           NOTE:
 *           Etree is a vector of parent pointers for a forest whose vertices
 *           are the integers 0 to A->ncol-1; etree[root] == A->ncol.
 *
 *         o Glu_persist (Glu_persist_t*) (global)
 *           Global data structure (xsup, supno) replicated on all processes,
 *           describing the supernode partition in the factored matrices
 *           L and U:
 *               xsup[s] is the leading column of the s-th supernode,
 *               supno[i] is the supernode number to which column i belongs.
 *
 *         o Llu (LocalLU_t*) (local)
 *           The distributed data structures to store L and U factors.
 *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
 *
 * SOLVEstruct (input/output) SOLVEstruct_t*
 *         The data structure to hold the communication pattern used
 *         in the phases of triangular solution and iterative refinement.
 *         This pattern should be initialized only once for repeated
 *         solutions.
 *         If options->SolveInitialized = YES, it is an input argument.
 *         If options->SolveInitialized = NO and nrhs != 0, it is an output
 *         argument. See superlu_ddefs.h for the definition of
 *         'SOLVEstruct_t'.
 *
 * berr    (output) double*, dimension (nrhs) (global)
 *         The componentwise relative backward error of each solution
 *         vector X(j) (i.e., the smallest relative change in
 *         any element of A or B that makes X(j) an exact solution).
 *
 * stat    (output) SuperLUStat_t*
 *         Records the statistics on runtime and floating-point operation
 *         count. See util.h for the definition of 'SuperLUStat_t'.
 *
 * info    (output) int*
 *         = 0: successful exit
 *         > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                so the solution could not be computed.
 *             >  A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol.
 *
 * See superlu_ddefs.h for the definitions of various data types.
 *
 */
    NRformat_loc *Astore;
    SuperMatrix GA;        /* Global A in NC format */
    NCformat *GAstore;
    double   *a_GA;
    SuperMatrix GAC;       /* Global A in NCP format (add n end pointers) */
    NCPformat *GACstore;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    Glu_freeable_t *Glu_freeable;
    /* The nonzero structures of L and U factors, which are
       replicated on all processors.
           (lsub, xlsub) contains the compressed subscript of
                         supernodes in L.
           (usub, xusub) contains the compressed subscript of
                         nonzero segments in U.
       If options->Fact != SamePattern_SameRowPerm, they are
       computed by SYMBFACT routine, and then used by PDDISTRIBUTE
       routine. They will be freed after PDDISTRIBUTE routine.
       If options->Fact == SamePattern_SameRowPerm, these structures
       are not used. */
    fact_t   Fact;
    double   *a;
    int_t    *colptr, *rowind;
    int_t    *perm_r;          /* row permutations from partial pivoting */
    int_t    *perm_c;          /* column permutation vector */
    int_t    *etree;           /* elimination tree */
    int_t    *rowptr, *colind; /* Local A in NR format */
    int_t    *rowind_loc, *colptr_loc;
    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
    int_t    nnz_loc, m_loc, fst_row, icol;
    int      iam;
    int      ldx;              /* LDA for matrix X (local). */
    char     equed[1], norm[1];
    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
    double   *X, *b_col, *b_work, *x_col;
    double   t;
    static mem_usage_t num_mem_usage, symb_mem_usage;
#if ( PRNTlevel>= 2 )
    double   dmin, dsum, dprod;
#endif
    int_t    procs;

    /* Structures needed for parallel symbolic factorization */
    int_t    *sizes, *fstVtxSep, parSymbFact;
    int      noDomains, nprocs_num;
    MPI_Comm symb_comm;  /* communicator for symbolic factorization */
    int      col, key;   /* parameters for creating a new communicator */
    Pslu_freeable_t Pslu_freeable;
    float    flinfo;

    /* Initialization. */
    m = A->nrow;
    n = A->ncol;
    Astore = (NRformat_loc *) A->Store;
    nnz_loc = Astore->nnz_loc;
    m_loc = Astore->m_loc;
    fst_row = Astore->fst_row;
    a = (double *) Astore->nzval;
    rowptr = Astore->rowptr;
    colind = Astore->colind;
    sizes = NULL;
    fstVtxSep = NULL;
    symb_comm = MPI_COMM_NULL;

    /* Test the input parameters. */
    *info = 0;
    Fact = options->Fact;
    if ( Fact < 0 || Fact > FACTORED )
        *info = -1;
    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
        *info = -1;
    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
        *info = -1;
    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
        *info = -1;
    else if ( options->IterRefine == EXTRA ) {
        *info = -1;
        fprintf(stderr, "Extra precise iterative refinement is not yet supported.");
    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
                || A->Dtype != SLU_D || A->Mtype != SLU_GE )
        *info = -2;
    else if ( ldb < m_loc )
        *info = -5;
    else if ( nrhs < 0 )
        *info = -6;
    if ( *info ) {
        i = -(*info);
        pxerbla("pdgssvx", grid, -*info);
        return;
    }

    factored = (Fact == FACTORED);
    Equil = (!factored && options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    iam = grid->iam;
    job = 5;
    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
        rowequ = (ScalePermstruct->DiagScale == ROW) ||
                 (ScalePermstruct->DiagScale == BOTH);
        colequ = (ScalePermstruct->DiagScale == COL) ||
                 (ScalePermstruct->DiagScale == BOTH);
    } else rowequ = colequ = FALSE;

    /* The following arrays are replicated on all processes. */
    perm_r = ScalePermstruct->perm_r;
    perm_c = ScalePermstruct->perm_c;
    etree = LUstruct->etree;
    R = ScalePermstruct->R;
    C = ScalePermstruct->C;
    /********/

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgssvx()");
#endif

    /* Not factored & ask for equilibration */
    if ( Equil && Fact != SamePattern_SameRowPerm ) {
        /* Allocate storage if not done so before. */
        switch ( ScalePermstruct->DiagScale ) {
          case NOEQUIL:
            if ( !(R = (double *) doubleMalloc_dist(m)) )
                ABORT("Malloc fails for R[].");
            if ( !(C = (double *) doubleMalloc_dist(n)) )
                ABORT("Malloc fails for C[].");
            ScalePermstruct->R = R;
            ScalePermstruct->C = C;
            break;
          case ROW:
            if ( !(C = (double *) doubleMalloc_dist(n)) )
                ABORT("Malloc fails for C[].");
            ScalePermstruct->C = C;
            break;
          case COL:
            if ( !(R = (double *) doubleMalloc_dist(m)) )
                ABORT("Malloc fails for R[].");
            ScalePermstruct->R = R;
            break;
        }
    }

    /* ------------------------------------------------------------
       Diagonal scaling to equilibrate the matrix.
       ------------------------------------------------------------*/
    if ( Equil ) {
#if ( DEBUGlevel>=1 )
        CHECK_MALLOC(iam, "Enter equil");
#endif
        t = SuperLU_timer_();

        if ( Fact == SamePattern_SameRowPerm ) {
            /* Reuse R and C. */
            switch ( ScalePermstruct->DiagScale ) {
              case NOEQUIL:
                break;
              case ROW:
                irow = fst_row;
                for (j = 0; j < m_loc; ++j) {
                    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
                        a[i] *= R[irow];           /* Scale rows. */
                    }
                    ++irow;
                }
                break;
              case COL:
                for (j = 0; j < m_loc; ++j)
                    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
                        icol = colind[i];
                        a[i] *= C[icol];           /* Scale columns. */
                    }
                break;
              case BOTH:
                irow = fst_row;
                for (j = 0; j < m_loc; ++j) {
                    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
                        icol = colind[i];
                        a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */
                    }
                    ++irow;
                }
                break;
            }
        } else { /* Compute R and C from scratch. */
            /* Compute the row and column scalings. */
            pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);

            /* Equilibrate matrix A if it is badly-scaled. */
            pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed);

            if ( lsame_(equed, "R") ) {
                ScalePermstruct->DiagScale = rowequ = ROW;
            } else if ( lsame_(equed, "C") ) {
                ScalePermstruct->DiagScale = colequ = COL;
            } else if ( lsame_(equed, "B") ) {
                ScalePermstruct->DiagScale = BOTH;
                rowequ = ROW;
                colequ = COL;
            } else ScalePermstruct->DiagScale = NOEQUIL;

#if ( PRNTlevel>=1 )
            if ( !iam ) {
                printf(".. equilibrated? *equed = %c\n", *equed);
                /*fflush(stdout);*/
            }
#endif
        } /* end if Fact ... */

        stat->utime[EQUIL] = SuperLU_timer_() - t;
#if ( DEBUGlevel>=1 )
        CHECK_MALLOC(iam, "Exit equil");
#endif
    } /* end if Equil ... */

    if ( !factored ) { /* Skip this if already factored. */
        /*
         * Gather A from the distributed compressed row format to
         * global A in compressed column format.
         * Numerical values are gathered only when a row permutation
         * for large diagonal is sought after.
         */
        if ( Fact != SamePattern_SameRowPerm ) {
            need_value = (options->RowPerm == LargeDiag);
            pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
            GAstore = (NCformat *) GA.Store;
            colptr = GAstore->colptr;
            rowind = GAstore->rowind;
            nnz = GAstore->nnz;
            if ( need_value ) a_GA = (double *) GAstore->nzval;
            else assert(GAstore->nzval == NULL);
        }

        /* ------------------------------------------------------------
           Find the row permutation for A.
           ------------------------------------------------------------*/
        if ( options->RowPerm != NO ) {
            t = SuperLU_timer_();
            if ( Fact != SamePattern_SameRowPerm ) {
                if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
                    /* Permute the global matrix GA for symbfact() */
                    for (i = 0; i < colptr[n]; ++i) {
                        irow = rowind[i];
                        rowind[i] = perm_r[irow];
                    }
                } else { /* options->RowPerm == LargeDiag */
                    /* Get a new perm_r[] */
                    if ( job == 5 ) {
                        /* Allocate storage for scaling factors. */
                        if ( !(R1 = doubleMalloc_dist(m)) )
                            ABORT("SUPERLU_MALLOC fails for R1[]");
                        if ( !(C1 = doubleMalloc_dist(n)) )
                            ABORT("SUPERLU_MALLOC fails for C1[]");
                    }

                    if ( !iam ) {
                        /* Process 0 finds a row permutation */
                        dldperm(job, m, nnz, colptr, rowind, a_GA,
                                perm_r, R1, C1);

                        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
                        if ( job == 5 && Equil ) {
                            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
                            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
                        }
                    } else {
                        MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
                        if ( job == 5 && Equil ) {
                            MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
                            MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
                        }
                    }

#if ( PRNTlevel>=2 )
                    dmin = dlamch_("Overflow");
                    dsum = 0.0;
                    dprod = 1.0;
#endif
                    if ( job == 5 ) {
                        if ( Equil ) {
                            for (i = 0; i < n; ++i) {
                                R1[i] = exp(R1[i]);
                                C1[i] = exp(C1[i]);
                            }
                            /* Scale the distributed matrix */
                            irow = fst_row;
                            for (j = 0; j < m_loc; ++j) {
                                for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
                                    icol = colind[i];
                                    a[i] *= R1[irow] * C1[icol];
#if ( PRNTlevel>=2 )
                                    if ( perm_r[irow] == icol ) {
                                        /* New diagonal */
                                        if ( job == 2 || job == 3 )
                                            dmin = SUPERLU_MIN(dmin, fabs(a[i]));
                                        else if ( job == 4 )
                                            dsum += fabs(a[i]);
                                        else if ( job == 5 )
                                            dprod *= fabs(a[i]);
                                    }
#endif
                                }
                                ++irow;
                            }

                            /* Multiply together the scaling factors. */
                            if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
                            else for (i = 0; i < m; ++i) R[i] = R1[i];
                            if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
                            else for (i = 0; i < n; ++i) C[i] = C1[i];

                            ScalePermstruct->DiagScale = BOTH;
                            rowequ = colequ = 1;
                        } /* end Equil */

                        /* Now permute global A to prepare for symbfact() */
                        for (j = 0; j < n; ++j) {
                            for (i = colptr[j]; i < colptr[j+1]; ++i) {
                                irow = rowind[i];
                                rowind[i] = perm_r[irow];
                            }
                        }
                        SUPERLU_FREE (R1);
                        SUPERLU_FREE (C1);
                    } else { /* job = 2,3,4 */
                        for (j = 0; j < n; ++j) {
                            for (i = colptr[j]; i < colptr[j+1]; ++i) {
                                irow = rowind[i];
                                rowind[i] = perm_r[irow];
                            } /* end for i ... */
                        } /* end for j ... */
                    } /* end else job ... */

#if ( PRNTlevel>=2 )
                    if ( job == 2 || job == 3 ) {
                        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
                    } else if ( job == 4 ) {
                        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
                    } else if ( job == 5 ) {
                        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
                    }
#endif
                } /* end if options->RowPerm ... */

                t = SuperLU_timer_() - t;
                stat->utime[ROWPERM] = t;
#if ( PRNTlevel>=1 )
                if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
#endif
            } /* end if Fact ... */
        } else { /* options->RowPerm == NOROWPERM */
            for (i = 0; i < m; ++i) perm_r[i] = i;
        }

#if ( DEBUGlevel>=2 )
        if ( !iam ) PrintInt10("perm_r", m, perm_r);
#endif
    } /* end if (!factored) */

    if ( !factored || options->IterRefine ) {
        /* Compute norm(A), which will be used to adjust small diagonal. */
        if ( notran ) *(unsigned char *)norm = '1';
        else *(unsigned char *)norm = 'I';
        anorm = pdlangs(norm, A, grid);
#if ( PRNTlevel>=1 )
        if ( !iam ) printf(".. anorm %e\n", anorm);
#endif
    }

    /* ------------------------------------------------------------
       Perform the LU factorization.
       ------------------------------------------------------------*/
    if ( !factored ) {
        t = SuperLU_timer_();
        /*
         * Get column permutation vector perm_c[], according to permc_spec:
         *   permc_spec = NATURAL:  natural ordering
         *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
         *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
         *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
         *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
         *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
         */
        permc_spec = options->ColPerm;
        parSymbFact = options->ParSymbFact;

#if ( PRNTlevel>=1 )
        if ( parSymbFact && permc_spec != PARMETIS )
            if ( !iam ) printf(".. Parallel symbolic factorization"
                               " only works with ParMETIS!\n");
#endif
        if ( parSymbFact == YES || permc_spec == PARMETIS ) {
            nprocs_num = grid->nprow * grid->npcol;
            noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num ))));

            /* Create a new communicator for the first noDomains
               processes in grid->comm. */
            key = iam;
            if (iam < noDomains) col = 0;
            else col = MPI_UNDEFINED;
            MPI_Comm_split (grid->comm, col, key, &symb_comm );

            permc_spec = PARMETIS; /* only works with PARMETIS */
        }

        if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
            if ( permc_spec == PARMETIS ) {
                /* Get column permutation vector in perm_c.
                   This routine takes as input the distributed input matrix A
                   and does not modify it. It also allocates memory for the
                   sizes[] and fstVtxSep[] arrays, which contain information
                   on the separator tree computed by ParMETIS. */
                flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
                                             noDomains, &sizes, &fstVtxSep,
                                             grid, &symb_comm);
                if (flinfo > 0) ABORT("ERROR in get perm_c parmetis.");
            } else {
                get_perm_c_dist(iam, permc_spec, &GA, perm_c);
            }
        }

        stat->utime[COLPERM] = SuperLU_timer_() - t;

        /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
           (a.k.a. column etree), depending on the choice of ColPerm.
           Adjust perm_c[] to be consistent with a postorder of etree.
           Permute columns of A to form A*Pc'. */
        if ( Fact != SamePattern_SameRowPerm ) {
            if ( parSymbFact == NO ) {
                int_t *GACcolbeg, *GACcolend, *GACrowind;

                sp_colorder(options, &GA, perm_c, etree, &GAC);

                /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
                GACstore = (NCPformat *) GAC.Store;
                GACcolbeg = GACstore->colbeg;
                GACcolend = GACstore->colend;
                GACrowind = GACstore->rowind;
                for (j = 0; j < n; ++j) {
                    for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
                        irow = GACrowind[i];
                        GACrowind[i] = perm_c[irow];
                    }
                }

                /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
                   the nonzero data structures for L & U. */
#if ( PRNTlevel>=1 )
                if ( !iam )
                    printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
                           sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
#endif
                t = SuperLU_timer_();
                if ( !(Glu_freeable = (Glu_freeable_t *)
                       SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
                    ABORT("Malloc fails for Glu_freeable.");

                /* Every process does this. */
                iinfo = symbfact(options, iam, &GAC, perm_c, etree,
                                 Glu_persist, Glu_freeable);

                stat->utime[SYMBFAC] = SuperLU_timer_() - t;
                if ( iinfo < 0 ) { /* Successful return */
                    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
#if ( PRNTlevel>=1 )
                    if ( !iam ) {
                        printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
                        printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
                        printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
                        printf("\tint %d, short %d, float %d, double %d\n",
                               sizeof(int_t), sizeof(short), sizeof(float),
                               sizeof(double));
                        printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
                               symb_mem_usage.for_lu*1e-6,
                               symb_mem_usage.total*1e-6,
                               symb_mem_usage.expansions);
                    }
#endif
                } else {
                    if ( !iam ) {
                        fprintf(stderr,"symbfact() error returns %d\n",iinfo);
                        exit(-1);
                    }
                }
            } /* end if serial symbolic factorization */
            else { /* parallel symbolic factorization */
                t = SuperLU_timer_();
                flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c,
                                       perm_r, sizes, fstVtxSep,
                                       &Pslu_freeable, &(grid->comm),
                                       &symb_comm, &symb_mem_usage);
                stat->utime[SYMBFAC] = SuperLU_timer_() - t;
                if (flinfo > 0)
                    ABORT("Insufficient memory for parallel symbolic factorization.");
            }
        } /* end if Fact ... */

#if ( PRNTlevel>=1 )
        if (!iam) printf("\tSYMBfact time: %.2f\n", stat->utime[SYMBFAC]);
#endif

        if (sizes) SUPERLU_FREE (sizes);
        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
        if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm);

        if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) {
            /* Apply column permutation to the original distributed A */
            for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];

            /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage.
               NOTE: the row permutation Pc*Pr is applied internally in the
               distribution routine. */
            t = SuperLU_timer_();
            dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct,
                                        Glu_freeable, LUstruct, grid);
            stat->utime[DIST] = SuperLU_timer_() - t;

            /* Deallocate storage used in symbolic factorization. */
            if ( Fact != SamePattern_SameRowPerm ) {
                iinfo = symbfact_SubFree(Glu_freeable);
                SUPERLU_FREE(Glu_freeable);
            }
        } else {
            /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage.
               NOTE: the row permutation Pc*Pr is applied internally in the
               distribution routine. */
            /* Apply column permutation to the original distributed A */
            for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];

            t = SuperLU_timer_();
            dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct,
                                            &Pslu_freeable, LUstruct, grid);
            if (dist_mem_use > 0)
                ABORT ("Not enough memory available for dist_psymbtonum\n");
            stat->utime[DIST] = SuperLU_timer_() - t;
        }

#if ( PRNTlevel>=1 )
        if (!iam) printf ("\tDISTRIBUTE time  %8.2f\n", stat->utime[DIST]);
#endif

        /* Perform numerical factorization in parallel. */
        t = SuperLU_timer_();
        pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
        stat->utime[FACT] = SuperLU_timer_() - t;

#if ( PRNTlevel>=1 )
        {
            int_t TinyPivots;
            float for_lu, total, max, avg, temp;

            dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
            MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
                        1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
            MPI_Reduce( &num_mem_usage.total, &total,
                        1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
            temp = SUPERLU_MAX(symb_mem_usage.total,
                               symb_mem_usage.for_lu +
                               (float)dist_mem_use + num_mem_usage.for_lu);
            if (parSymbFact == TRUE)
                /* The memory used in the redistribution routine
                   includes the memory used for storing the symbolic
                   structure and the memory allocated for numerical
                   factorization */
                temp = SUPERLU_MAX(symb_mem_usage.total,
                                   (float)dist_mem_use);
            temp = SUPERLU_MAX(temp, num_mem_usage.total);
            MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
            MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
            MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
                           MPI_SUM, grid->comm );
            stat->TinyPivots = TinyPivots;
            if ( !iam ) {
                printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
                       for_lu*1e-6, total*1e-6);
                printf("\tAll space (MB):"
                       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
                       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
                printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
            }
        }
#endif

        /* Destroy GA */
        if ( Fact != SamePattern_SameRowPerm )
            Destroy_CompCol_Matrix_dist(&GA);
    } /* end if (!factored) */

    /* ------------------------------------------------------------
       Compute the solution matrix X.
       ------------------------------------------------------------*/
    if ( nrhs ) {
        if ( !(b_work = doubleMalloc_dist(n)) )
            ABORT("Malloc fails for b_work[]");

        /* ------------------------------------------------------------
           Scale the right-hand side if equilibration was performed.
           ------------------------------------------------------------*/
        if ( notran ) {
            if ( rowequ ) {
                b_col = B;
                for (j = 0; j < nrhs; ++j) {
                    irow = fst_row;
                    for (i = 0; i < m_loc; ++i) {
                        b_col[i] *= R[irow];
                        ++irow;
                    }
                    b_col += ldb;
                }
            }
        } else if ( colequ ) {
            b_col = B;
            for (j = 0; j < nrhs; ++j) {
                irow = fst_row;
                for (i = 0; i < m_loc; ++i) {
                    b_col[i] *= C[irow];
                    ++irow;
                }
                b_col += ldb;
            }
        }

        /* Save a copy of the right-hand side. */
        ldx = ldb;
        if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
            ABORT("Malloc fails for X[]");
        x_col = X;
        b_col = B;
        for (j = 0; j < nrhs; ++j) {
            for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
            x_col += ldx;
            b_col += ldb;
        }

        /* ------------------------------------------------------------
           Solve the linear system.
           ------------------------------------------------------------*/
        if ( options->SolveInitialized == NO ) {
            dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
                       SOLVEstruct);
        }

        pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
                fst_row, ldb, nrhs, SOLVEstruct, stat, info);

        /* ------------------------------------------------------------
           Use iterative refinement to improve the computed solution and
           compute error bounds and backward error estimates for it.
           ------------------------------------------------------------*/
        if ( options->IterRefine ) {
            /* Improve the solution by iterative refinement. */
            int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
            SOLVEstruct_t *SOLVEstruct1;  /* Used by refinement. */

            t = SuperLU_timer_();
            if ( options->RefineInitialized == NO || Fact == DOFACT ) {
                /* All these cases need to re-initialize gsmv structure */
                if ( options->RefineInitialized )
                    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
                pdgsmv_init(A, SOLVEstruct->row_to_proc, grid,
                            SOLVEstruct->gsmv_comm);

                /* Save a copy of the transformed local col indices
                   in colind_gsmv[]. */
                if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv);
                if ( !(it = intMalloc_dist(nnz_loc)) )
                    ABORT("Malloc fails for colind_gsmv[]");
                colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
                for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
                options->RefineInitialized = YES;
            } else if ( Fact == SamePattern ||
                        Fact == SamePattern_SameRowPerm ) {
                double at;
                int_t k, jcol, p;
                /* Swap to beginning the part of A corresponding to the
                   local part of X, as was done in pdgsmv_init() */
                for (i = 0; i < m_loc; ++i) { /* Loop through each row */
                    k = rowptr[i];
                    for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
                        jcol = colind[j];
                        p = SOLVEstruct->row_to_proc[jcol];
                        if ( p == iam ) { /* Local */
                            at = a[k]; a[k] = a[j]; a[j] = at;
                            ++k;
                        }
                    }
                }

                /* Re-use the local col indices of A obtained from the
                   previous call to pdgsmv_init() */
                for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i];
            }

            if ( nrhs == 1 ) { /* Use the existing solve structure */
                SOLVEstruct1 = SOLVEstruct;
            } else { /* For nrhs > 1, since refinement is performed for RHS
                        one at a time, the communication structure for pdgstrs
                        is different than the solve with nrhs RHS.
                        So we use SOLVEstruct1 for the refinement step. */
                if ( !(SOLVEstruct1 = (SOLVEstruct_t *)
                       SUPERLU_MALLOC(sizeof(SOLVEstruct_t))) )
                    ABORT("Malloc fails for SOLVEstruct1");
                /* Copy the same stuff */
                SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
                SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
                SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
                SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
                SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
                SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
                SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;

                /* Initialize the *gstrs_comm for 1 RHS. */
                if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
                       SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
                    ABORT("Malloc fails for gstrs_comm[]");
                pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid,
                             Glu_persist, SOLVEstruct1);
            }

            pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
                    B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);

            /* Deallocate the storage associated with SOLVEstruct1 */
            if ( nrhs > 1 ) {
                pxgstrs_finalize(SOLVEstruct1->gstrs_comm);
                SUPERLU_FREE(SOLVEstruct1);
            }

            stat->utime[REFINE] = SuperLU_timer_() - t;
        }

        /* Permute the solution matrix B <= Pc'*X. */
        pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
                               SOLVEstruct->inv_perm_c,
                               X, ldx, B, ldb, nrhs, grid);
#if ( DEBUGlevel>=2 )
        printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
        for (i = 0; i < m_loc; ++i)
            printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
#endif

        /* Transform the solution matrix X to a solution of the original
           system before the equilibration. */
        if ( notran ) {
            if ( colequ ) {
                b_col = B;
                for (j = 0; j < nrhs; ++j) {
                    irow = fst_row;
                    for (i = 0; i < m_loc; ++i) {
                        b_col[i] *= C[irow];
                        ++irow;
                    }
                    b_col += ldb;
                }
            }
        } else if ( rowequ ) {
            b_col = B;
            for (j = 0; j < nrhs; ++j) {
                irow = fst_row;
                for (i = 0; i < m_loc; ++i) {
                    b_col[i] *= R[irow];
                    ++irow;
                }
                b_col += ldb;
            }
        }

        SUPERLU_FREE(b_work);
        SUPERLU_FREE(X);
    } /* end if nrhs != 0 */

#if ( PRNTlevel>=1 )
    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
#endif

    /* Deallocate R and/or C if it was not used. */
    if ( Equil && Fact != SamePattern_SameRowPerm ) {
        switch ( ScalePermstruct->DiagScale ) {
          case NOEQUIL:
            SUPERLU_FREE(R);
            SUPERLU_FREE(C);
            break;
          case ROW:
            SUPERLU_FREE(C);
            break;
          case COL:
            SUPERLU_FREE(R);
            break;
        }
    }
    if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
        Destroy_CompCol_Permuted_dist(&GAC);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgssvx()");
#endif
}
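Condensed from the argument descriptions above, the common DOFACT calling sequence looks as follows. This is a sketch only: it assumes A has already been created in SLU_NR_loc form on an initialized grid, with b holding the local rows of the right-hand side (overwritten by the solution), and m, n, ldb, nrhs set up as in the examples elsewhere in this section.

/* Sketch: one factor-and-solve with pdgssvx (default DOFACT mode). */
superlu_options_t options;
ScalePermstruct_t ScalePermstruct;
LUstruct_t        LUstruct;
SOLVEstruct_t     SOLVEstruct;
SuperLUStat_t     stat;
double *berr;
int    info;

set_default_options_dist(&options);   /* Fact = DOFACT, Equil = YES, ... */
ScalePermstructInit(m, n, &ScalePermstruct);
LUstructInit(n, &LUstruct);
PStatInit(&stat);
if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[].");

pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
        &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);

/* Tear down, including the solve structure pdgssvx initialized. */
PStatFree(&stat);
if ( options.SolveInitialized ) dSolveFinalize(&options, &SOLVEstruct);
Destroy_LU(n, &grid, &LUstruct);
ScalePermstructFree(&ScalePermstruct);
LUstructFree(&LUstruct);
SUPERLU_FREE(berr);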
#include <math.h>
#include "superlu_zdefs.h"

int main(int argc, char *argv[])
{
    superlu_dist_options_t options;
    SuperLUStat_t stat;
    SuperMatrix A;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    gridinfo_t grid;
    double *berr;
    doublecomplex *a, *b, *xtrue;
    int_t  *asub, *xa;
    int_t  m, n, nnz;
    int_t  nprow, npcol;
    int    iam, info, ldb, ldx, nrhs;
    char   trans[1];
    char   **cpp, c;
    FILE *fp, *fopen();
    extern int cpp_defs();

    /* prototypes */
    extern void LUstructInit(const int_t, LUstruct_t *);
    extern void LUstructFree(LUstruct_t *);
    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);

    nprow = 1;  /* Default process rows.       */
    npcol = 1;  /* Default process columns.    */
    nrhs  = 1;  /* Number of right-hand sides. */

    /* ------------------------------------------------------------
       INITIALIZE MPI ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Init( &argc, &argv );

    /* Parse command line argv[]. */
    for (cpp = argv+1; *cpp; ++cpp) {
        if ( **cpp == '-' ) {
            c = *(*cpp+1);
            ++cpp;
            switch (c) {
              case 'h':
                  printf("Options:\n");
                  printf("\t-r <int>: process rows    (default " IFMT ")\n", nprow);
                  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
                  exit(0);
                  break;
              case 'r': nprow = atoi(*cpp);
                        break;
              case 'c': npcol = atoi(*cpp);
                        break;
            }
        } else { /* Last arg is considered a filename */
            if ( !(fp = fopen(*cpp, "r")) ) {
                ABORT("File does not exist");
            }
            break;
        }
    }

    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID.
       ------------------------------------------------------------*/
    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);

    /* Bail out if I do not belong in the grid. */
    iam = grid.iam;
    if ( iam >= nprow * npcol ) goto out;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter main()");
#endif

    /* ------------------------------------------------------------
       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
       THE OTHER PROCESSES.
       ------------------------------------------------------------*/
    if ( !iam ) {
        /* Print the CPP definitions. */
        cpp_defs();

        /* Read the matrix stored on disk in Harwell-Boeing format. */
        zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);

        printf("Input matrix file: %s\n", *cpp);
        printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n",
               m, n, nnz);
        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);

        /* Broadcast matrix A to the other PEs. */
        MPI_Bcast( &m,   1,   mpi_int_t, 0, grid.comm );
        MPI_Bcast( &n,   1,   mpi_int_t, 0, grid.comm );
        MPI_Bcast( &nnz, 1,   mpi_int_t, 0, grid.comm );
        MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
        MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm );
        MPI_Bcast( xa,   n+1, mpi_int_t, 0, grid.comm );
    } else {
        /* Receive matrix A from PE 0. */
        MPI_Bcast( &m,   1,   mpi_int_t, 0, grid.comm );
        MPI_Bcast( &n,   1,   mpi_int_t, 0, grid.comm );
        MPI_Bcast( &nnz, 1,   mpi_int_t, 0, grid.comm );

        /* Allocate storage for compressed column representation. */
        zallocateA_dist(n, nnz, &a, &asub, &xa);

        MPI_Bcast( a,    nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm );
        MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm );
        MPI_Bcast( xa,   n+1, mpi_int_t, 0, grid.comm );
    }

    /* Create compressed column matrix for A. */
    zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
                                SLU_NC, SLU_Z, SLU_GE);

    /* Generate the exact solution and compute the right-hand side. */
    if (!(b=doublecomplexMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]");
    if (!(xtrue=doublecomplexMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]");
    *trans = 'N';
    ldx = n;
    ldb = m;
    zGenXtrue_dist(n, nrhs, xtrue, ldx);
    zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);

    if ( !(berr = doubleMalloc_dist(nrhs)) )
        ABORT("Malloc fails for berr[].");

    /* ------------------------------------------------------------
       NOW WE SOLVE THE LINEAR SYSTEM.
       ------------------------------------------------------------*/
    /* Set the default input options:
         options.Fact              = DOFACT;
         options.Equil             = YES;
         options.ColPerm           = METIS_AT_PLUS_A;
         options.RowPerm           = LargeDiag_MC64;
         options.ReplaceTinyPivot  = YES;
         options.Trans             = NOTRANS;
         options.IterRefine        = DOUBLE;
         options.SolveInitialized  = NO;
         options.RefineInitialized = NO;
         options.PrintStat         = YES;
     */
    set_default_options_dist(&options);

    if (!iam) {
        print_sp_ienv_dist(&options);
        print_options_dist(&options);
    }

    /* Initialize ScalePermstruct and LUstruct. */
    ScalePermstructInit(m, n, &ScalePermstruct);
    LUstructInit(n, &LUstruct);

    /* Initialize the statistics variables. */
    PStatInit(&stat);

    /* Call the linear equation solver. */
    pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs,
                     &grid, &LUstruct, berr, &stat, &info);

    /* Check the accuracy of the solution. */
    if ( !iam ) {
        zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
    }
    PStatPrint(&options, &stat, &grid);  /* Print the statistics. */

    /* ------------------------------------------------------------
       DEALLOCATE STORAGE.
       ------------------------------------------------------------*/
    PStatFree(&stat);
    Destroy_CompCol_Matrix_dist(&A);
    Destroy_LU(n, &grid, &LUstruct);
    ScalePermstructFree(&ScalePermstruct);
    LUstructFree(&LUstruct);
    SUPERLU_FREE(b);
    SUPERLU_FREE(xtrue);
    SUPERLU_FREE(berr);

    /* ------------------------------------------------------------
       RELEASE THE SUPERLU PROCESS GRID.
       ------------------------------------------------------------*/
out:
    superlu_gridexit(&grid);

    /* ------------------------------------------------------------
       TERMINATE THE MPI EXECUTION ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Finalize();

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit main()");
#endif
}
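The driver above does not examine info after the solve; per the pdgssvx documentation earlier in this section, a nonzero value distinguishes a singular U from memory exhaustion. A hedged sketch of such a check, assuming the same convention holds for the ABglobal drivers:

/* Sketch: interpret the info result of p?gssvx / p?gssvx_ABglobal. */
if ( info ) {
    if ( info <= n )      /* U(info,info) is exactly zero */
        fprintf(stderr, "Factor U is singular at diagonal %d.\n", (int) info);
    else                  /* memory allocation failure */
        fprintf(stderr, "Out of memory; %d bytes allocated at failure.\n",
                (int) (info - n));
}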
int main(int argc, char *argv[])
{
    superlu_dist_options_t options;
    SuperLUStat_t stat;
    SuperMatrix A;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    gridinfo_t grid;
    double *berr;
    double *a, *a1, *b, *b1, *xtrue;
    int_t *asub, *asub1, *xa, *xa1;
    int_t i, j, m, n, nnz;
    int_t nprow, npcol;
    int iam, info, ldb, ldx, nrhs;
    char trans[1];
    char **cpp, c;
    FILE *fp, *fopen();
    extern int cpp_defs();

    /* prototypes */
    extern void LUstructInit(const int_t, LUstruct_t *);
    extern void LUstructFree(LUstruct_t *);
    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);

    nprow = 1;  /* Default process rows.       */
    npcol = 1;  /* Default process columns.    */
    nrhs  = 1;  /* Number of right-hand sides. */

    /* ------------------------------------------------------------
       INITIALIZE MPI ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Init( &argc, &argv );

    /* Parse command line argv[]. */
    for (cpp = argv+1; *cpp; ++cpp) {
        if ( **cpp == '-' ) {
            c = *(*cpp+1);
            ++cpp;
            switch (c) {
              case 'h':
                  printf("Options:\n");
                  printf("\t-r <int>: process rows    (default %d)\n", (int) nprow);
                  printf("\t-c <int>: process columns (default %d)\n", (int) npcol);
                  exit(0);
                  break;
              case 'r': nprow = atoi(*cpp);
                        break;
              case 'c': npcol = atoi(*cpp);
                        break;
            }
        } else { /* Last arg is considered a filename. */
            if ( !(fp = fopen(*cpp, "r")) ) {
                ABORT("File does not exist");
            }
            break;
        }
    }

    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID.
       ------------------------------------------------------------*/
    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);

    /* Bail out if I do not belong in the grid. */
    iam = grid.iam;
    if ( iam >= nprow * npcol ) goto out;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter main()");
#endif

    /* ------------------------------------------------------------
       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
       THE OTHER PROCESSES.
       ------------------------------------------------------------*/
    if ( !iam ) {
        /* Print the CPP definitions. */
        cpp_defs();

        /* Read the matrix stored on disk in Harwell-Boeing format. */
        dreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);

        printf("Input matrix file: %s\n", *cpp);
        printf("\tDimension\t%dx%d\t # nonzeros %d\n", (int) m, (int) n, (int) nnz);
        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);

        /* Broadcast matrix A to the other PEs. */
        MPI_Bcast( &m,   1,   mpi_int_t,  0, grid.comm );
        MPI_Bcast( &n,   1,   mpi_int_t,  0, grid.comm );
        MPI_Bcast( &nnz, 1,   mpi_int_t,  0, grid.comm );
        MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
        MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
        MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
    } else {
        /* Receive matrix A from PE 0. */
        MPI_Bcast( &m,   1, mpi_int_t, 0, grid.comm );
        MPI_Bcast( &n,   1, mpi_int_t, 0, grid.comm );
        MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm );

        /* Allocate storage for compressed column representation. */
        dallocateA_dist(n, nnz, &a, &asub, &xa);

        MPI_Bcast( a,    nnz, MPI_DOUBLE, 0, grid.comm );
        MPI_Bcast( asub, nnz, mpi_int_t,  0, grid.comm );
        MPI_Bcast( xa,   n+1, mpi_int_t,  0, grid.comm );
    }

    /* Create compressed column matrix for A. */
    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
                                SLU_NC, SLU_D, SLU_GE);

    /* Generate the exact solution and compute the right-hand side. */
    if ( !(b = doubleMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b[]");
    if ( !(xtrue = doubleMalloc_dist(n * nrhs)) ) ABORT("Malloc fails for xtrue[]");
    *trans = 'N';
    ldx = n;
    ldb = m;
    dGenXtrue_dist(n, nrhs, xtrue, ldx);
    dFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);

    /* Save a copy of the right-hand side. */
    if ( !(b1 = doubleMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
    for (j = 0; j < nrhs; ++j)
        for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];

    if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[].");

    /* Save a copy of the matrix A. */
    dallocateA_dist(n, nnz, &a1, &asub1, &xa1);
    for (i = 0; i < nnz; ++i) {
        a1[i]    = a[i];
        asub1[i] = asub[i];
    }
    for (i = 0; i < n+1; ++i) xa1[i] = xa[i];

    /* ------------------------------------------------------------
       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
       ------------------------------------------------------------*/
    /* Set the default input options:
       options.Fact = DOFACT;
       options.Equil = YES;
       options.ColPerm = METIS_AT_PLUS_A;
       options.RowPerm = LargeDiag;
       options.ReplaceTinyPivot = YES;
       options.Trans = NOTRANS;
       options.IterRefine = DOUBLE;
       options.SolveInitialized = NO;
       options.RefineInitialized = NO;
       options.PrintStat = YES;
     */
    set_default_options_dist(&options);

    if ( !iam ) {
        print_sp_ienv_dist(&options);
        print_options_dist(&options);
    }

    /* Initialize ScalePermstruct and LUstruct. */
    ScalePermstructInit(m, n, &ScalePermstruct);
    LUstructInit(n, &LUstruct);

    /* Initialize the statistics variables. */
    PStatInit(&stat);

    /* Call the linear equation solver: factorize and solve. */
    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                     &LUstruct, berr, &stat, &info);

    /* Check the accuracy of the solution. */
    if ( !iam ) {
        dinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
    }

    PStatPrint(&options, &stat, &grid);  /* Print the statistics. */
    PStatFree(&stat);
    Destroy_CompCol_Matrix_dist(&A);     /* Deallocate storage of matrix A.  */
    Destroy_LU(n, &grid, &LUstruct);     /* Deallocate storage associated
                                            with the L and U matrices.       */
    SUPERLU_FREE(b);                     /* Free storage of right-hand side. */

    /* ------------------------------------------------------------
       NOW WE SOLVE ANOTHER LINEAR SYSTEM. ONLY THE SPARSITY PATTERN
       OF MATRIX A IS THE SAME.
       ------------------------------------------------------------*/
    options.Fact = SamePattern;
    PStatInit(&stat);  /* Initialize the statistics variables. */

    /* Create compressed column matrix for A. */
    dCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1,
                                SLU_NC, SLU_D, SLU_GE);

    /* Solve the linear system. */
    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
                     &LUstruct, berr, &stat, &info);

    /* Check the accuracy of the solution. */
    if ( !iam ) {
        printf("Solve the system with the same sparsity pattern.\n");
        dinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
    }

    /* Print the statistics. */
    PStatPrint(&options, &stat, &grid);

    /* ------------------------------------------------------------
       DEALLOCATE STORAGE.
       ------------------------------------------------------------*/
    PStatFree(&stat);
    Destroy_CompCol_Matrix_dist(&A);        /* Deallocate storage of matrix A.  */
    Destroy_LU(n, &grid, &LUstruct);        /* Deallocate storage associated
                                               with the L and U matrices.       */
    ScalePermstructFree(&ScalePermstruct);
    LUstructFree(&LUstruct);                /* Deallocate the structure of L and U. */
    SUPERLU_FREE(b1);                       /* Free storage of right-hand side.    */
    SUPERLU_FREE(xtrue);                    /* Free storage of the exact solution. */
    SUPERLU_FREE(berr);

    /* ------------------------------------------------------------
       RELEASE THE SUPERLU PROCESS GRID.
       ------------------------------------------------------------*/
out:
    superlu_gridexit(&grid);

    /* ------------------------------------------------------------
       TERMINATE THE MPI EXECUTION ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Finalize();

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit main()");
#endif
}
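/*
 * Hedged sketch, not part of the original driver: the example above
 * ignores the `info` flag that pdgssvx_ABglobal() returns through its
 * last argument. A check such as the helper below (a hypothetical
 * name) could be called after each solve, e.g. as
 * check_solve_info(iam, info, (int) n, nrhs, berr). The meaning of
 * the codes follows the pdgssvx_ABglobal() documentation: info = i
 * with 0 < i <= n means U(i,i) is exactly zero, and info > n
 * indicates a memory allocation failure of info - n bytes.
 */
static void check_solve_info(int iam, int info, int n, int nrhs,
                             const double *berr)
{
    int j;
    if ( info ) {
        if ( !iam ) {
            if ( info <= n ) printf("ERROR: U(%d,%d) is exactly zero\n", info, info);
            else printf("ERROR: memory allocation failed, %d bytes\n", info - n);
        }
    } else if ( berr ) {
        /* With iterative refinement enabled (the default), berr[j] is
           the componentwise backward error of the j-th right-hand side. */
        if ( !iam )
            for (j = 0; j < nrhs; ++j) printf("berr[%d] = %e\n", j, berr[j]);
    }
}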
PetscErrorCode MatLUFactorNumeric_SuperLU_DIST(Mat F,Mat A,const MatFactorInfo *info)
{
  Mat              *tseq,A_seq = NULL;
  Mat_SeqAIJ       *aa,*bb;
  Mat_SuperLU_DIST *lu = (Mat_SuperLU_DIST*)(F)->spptr;
  PetscErrorCode   ierr;
  PetscInt         M=A->rmap->N,N=A->cmap->N,i,*ai,*aj,*bi,*bj,nz,rstart,*garray,
                   m=A->rmap->n,colA_start,j,jcol,jB,countA,countB,*bjj,*ajj;
  int              sinfo;   /* SuperLU_Dist info flag is always an int even with long long indices */
  PetscMPIInt      size;
  SuperLUStat_t    stat;
  double           *berr=0;
  IS               isrow;
  Mat              F_diag=NULL;
#if defined(PETSC_USE_COMPLEX)
  doublecomplex    *av, *bv;
#else
  double           *av, *bv;
#endif

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);

  if (lu->MatInputMode == GLOBAL) { /* global mat input */
    if (size > 1) { /* convert mpi A to seq mat A */
      ierr  = ISCreateStride(PETSC_COMM_SELF,M,0,1,&isrow);CHKERRQ(ierr);
      ierr  = MatGetSubMatrices(A,1,&isrow,&isrow,MAT_INITIAL_MATRIX,&tseq);CHKERRQ(ierr);
      ierr  = ISDestroy(&isrow);CHKERRQ(ierr);
      A_seq = *tseq;
      ierr  = PetscFree(tseq);CHKERRQ(ierr);
      aa    = (Mat_SeqAIJ*)A_seq->data;
    } else {
      PetscBool flg;
      ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
      if (flg) {
        Mat_MPIAIJ *At = (Mat_MPIAIJ*)A->data;
        A = At->A;
      }
      aa = (Mat_SeqAIJ*)A->data;
    }

    /* Convert the PETSc NR (compressed row) matrix to SuperLU_DIST NC (compressed column) format.
       Note: memory for lu->val, lu->col and lu->row is allocated by CompRow_to_CompCol_dist()! */
    if (lu->options.Fact != DOFACT) { /* successive numeric factorization, sparsity pattern is reused. */
      PetscStackCall("SuperLU_DIST:Destroy_CompCol_Matrix_dist",Destroy_CompCol_Matrix_dist(&lu->A_sup));
      if (lu->FactPattern == SamePattern_SameRowPerm) {
        lu->options.Fact = SamePattern_SameRowPerm; /* matrix has similar numerical values */
      } else { /* lu->FactPattern == SamePattern */
        PetscStackCall("SuperLU_DIST:Destroy_LU",Destroy_LU(N, &lu->grid, &lu->LUstruct));
        lu->options.Fact = SamePattern;
      }
    }
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:zCompRow_to_CompCol_dist",zCompRow_to_CompCol_dist(M,N,aa->nz,(doublecomplex*)aa->a,(int_t*)aa->j,(int_t*)aa->i,&lu->val,&lu->col,&lu->row));
#else
    PetscStackCall("SuperLU_DIST:dCompRow_to_CompCol_dist",dCompRow_to_CompCol_dist(M,N,aa->nz,aa->a,(int_t*)aa->j,(int_t*)aa->i,&lu->val,&lu->col,&lu->row));
#endif

    /* Create compressed column matrix A_sup. */
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:zCreate_CompCol_Matrix_dist",zCreate_CompCol_Matrix_dist(&lu->A_sup, M, N, aa->nz, lu->val, lu->col, lu->row, SLU_NC, SLU_Z, SLU_GE));
#else
    PetscStackCall("SuperLU_DIST:dCreate_CompCol_Matrix_dist",dCreate_CompCol_Matrix_dist(&lu->A_sup, M, N, aa->nz, lu->val, lu->col, lu->row, SLU_NC, SLU_D, SLU_GE));
#endif
  } else { /* distributed mat input */
    Mat_MPIAIJ *mat = (Mat_MPIAIJ*)A->data;
    aa = (Mat_SeqAIJ*)(mat->A)->data;
    bb = (Mat_SeqAIJ*)(mat->B)->data;
    ai = aa->i; aj = aa->j;
    bi = bb->i; bj = bb->j;
#if defined(PETSC_USE_COMPLEX)
    av = (doublecomplex*)aa->a;
    bv = (doublecomplex*)bb->a;
#else
    av = aa->a;
    bv = bb->a;
#endif
    rstart = A->rmap->rstart;
    nz     = aa->nz + bb->nz;
    garray = mat->garray;

    if (lu->options.Fact == DOFACT) { /* first numeric factorization */
#if defined(PETSC_USE_COMPLEX)
      PetscStackCall("SuperLU_DIST:zallocateA_dist",zallocateA_dist(m, nz, &lu->val, &lu->col, &lu->row));
#else
      PetscStackCall("SuperLU_DIST:dallocateA_dist",dallocateA_dist(m, nz, &lu->val, &lu->col, &lu->row));
#endif
    } else { /* successive numeric factorization, sparsity pattern and perm_c are reused. */
      /* Destroy_CompRowLoc_Matrix_dist(&lu->A_sup); */ /* this leads to crash! However, see SuperLU_DIST_2.5/EXAMPLE/pzdrive2.c */
      if (lu->FactPattern == SamePattern_SameRowPerm) {
        lu->options.Fact = SamePattern_SameRowPerm; /* matrix has similar numerical values */
      } else {
        PetscStackCall("SuperLU_DIST:Destroy_LU",Destroy_LU(N, &lu->grid, &lu->LUstruct)); /* Deallocate storage associated with the L and U matrices. */
        lu->options.Fact = SamePattern;
      }
    }

    /* Merge the diagonal (A) and off-diagonal (B) parts of the MPIAIJ matrix
       row by row into a local compressed-row store with global column indices. */
    nz = 0;
    for (i=0; i<m; i++) {
      lu->row[i] = nz;
      countA     = ai[i+1] - ai[i];
      countB     = bi[i+1] - bi[i];
      ajj        = aj + ai[i];  /* ptr to the beginning of this row */
      bjj        = bj + bi[i];

      /* B part, smaller col index */
      colA_start = rstart + ajj[0]; /* the smallest global col index of A */
      jB         = 0;
      for (j=0; j<countB; j++) {
        jcol = garray[bjj[j]];
        if (jcol > colA_start) {
          jB = j;
          break;
        }
        lu->col[nz]   = jcol;
        lu->val[nz++] = *bv++;
        if (j==countB-1) jB = countB;
      }

      /* A part */
      for (j=0; j<countA; j++) {
        lu->col[nz]   = rstart + ajj[j];
        lu->val[nz++] = *av++;
      }

      /* B part, larger col index */
      for (j=jB; j<countB; j++) {
        lu->col[nz]   = garray[bjj[j]];
        lu->val[nz++] = *bv++;
      }
    }
    lu->row[m] = nz;

#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:zCreate_CompRowLoc_Matrix_dist",zCreate_CompRowLoc_Matrix_dist(&lu->A_sup, M, N, nz, m, rstart, lu->val, lu->col, lu->row, SLU_NR_loc, SLU_Z, SLU_GE));
#else
    PetscStackCall("SuperLU_DIST:dCreate_CompRowLoc_Matrix_dist",dCreate_CompRowLoc_Matrix_dist(&lu->A_sup, M, N, nz, m, rstart, lu->val, lu->col, lu->row, SLU_NR_loc, SLU_D, SLU_GE));
#endif
  }

  /* Factor the matrix. */
  PetscStackCall("SuperLU_DIST:PStatInit",PStatInit(&stat)); /* Initialize the statistics variables. */
  if (lu->MatInputMode == GLOBAL) { /* global mat input */
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:pzgssvx_ABglobal",pzgssvx_ABglobal(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, M, 0, &lu->grid, &lu->LUstruct, berr, &stat, &sinfo));
#else
    PetscStackCall("SuperLU_DIST:pdgssvx_ABglobal",pdgssvx_ABglobal(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, M, 0, &lu->grid, &lu->LUstruct, berr, &stat, &sinfo));
#endif
  } else { /* distributed mat input */
#if defined(PETSC_USE_COMPLEX)
    PetscStackCall("SuperLU_DIST:pzgssvx",pzgssvx(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, m, 0, &lu->grid, &lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &sinfo));
    if (sinfo) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pzgssvx fails, info: %d\n",sinfo);
#else
    PetscStackCall("SuperLU_DIST:pdgssvx",pdgssvx(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, m, 0, &lu->grid, &lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &sinfo));
    if (sinfo) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pdgssvx fails, info: %d\n",sinfo);
#endif
  }
  if (lu->MatInputMode == GLOBAL && size > 1) {
    ierr = MatDestroy(&A_seq);CHKERRQ(ierr);
  }

  if (lu->options.PrintStat) {
    PStatPrint(&lu->options, &stat, &lu->grid); /* Print the statistics. */
  }
  PStatFree(&stat);
  if (size > 1) {
    F_diag            = ((Mat_MPIAIJ*)(F)->data)->A;
    F_diag->assembled = PETSC_TRUE;
  }
  (F)->assembled    = PETSC_TRUE;
  (F)->preallocated = PETSC_TRUE;

  lu->options.Fact = FACTORED; /* The factored form of A is supplied. Local option used by this function only. */
  PetscFunctionReturn(0);
}
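/*
 * For context, a hedged sketch of how the factorization above is
 * reached from user code: MatLUFactorNumeric_SuperLU_DIST() is not
 * called directly; it is dispatched when SuperLU_DIST is selected as
 * the LU solver package. The driver below is hypothetical and assumes
 * the PETSc API of the same vintage as the code above (in later
 * releases PCFactorSetMatSolverPackage() was renamed
 * PCFactorSetMatSolverType()).
 */
#include <petscksp.h>

static PetscErrorCode SolveWithSuperLUDist(Mat A, Vec b, Vec x)
{
  KSP            ksp;
  PC             pc;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = KSPCreate(PetscObjectComm((PetscObject)A),&ksp);CHKERRQ(ierr);
  ierr = KSPSetOperators(ksp,A,A);CHKERRQ(ierr);
  ierr = KSPSetType(ksp,KSPPREONLY);CHKERRQ(ierr);   /* one direct solve, no Krylov iteration */
  ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
  ierr = PCSetType(pc,PCLU);CHKERRQ(ierr);
  ierr = PCFactorSetMatSolverPackage(pc,MATSOLVERSUPERLU_DIST);CHKERRQ(ierr);
  ierr = KSPSetFromOptions(ksp);CHKERRQ(ierr);
  ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);            /* triggers the symbolic and numeric factorizations */
  ierr = KSPDestroy(&ksp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Equivalently, the package can be chosen at run time with
   -pc_type lu -pc_factor_mat_solver_package superlu_dist. */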
int zcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz,
                        doublecomplex *nzval_g, int_t *rowind_g, int_t *colptr_g,
                        gridinfo_t *grid)
{
    SuperMatrix GA;               /* global A */
    int_t    *rowind, *colptr;    /* global */
    doublecomplex *nzval;         /* global */
    doublecomplex *nzval_loc;     /* local */
    int_t    *colind, *rowptr;    /* local */
    int_t    m_loc, fst_row, nnz_loc;
    int_t    m_loc_fst;  /* Record m_loc of the first p-1 processors,
                            when mod(m, p) is not zero. */
    int_t    iam, row, col, i, j, relpos;
    char     trans[1];
    int_t    *marker;

    iam = grid->iam;
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter zcreate_dist_matrix()");
#endif

    if ( !iam ) {
        /* Allocate storage for compressed column representation. */
        zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);

        /* Copy the global matrix; the input is assumed to be already in
           the 0-based indexing required by the C routines, so no index
           adjustment is applied. */
        for (i = 0; i < nnz; ++i) {
            nzval[i]  = nzval_g[i];
            rowind[i] = rowind_g[i];
        }
        for (i = 0; i < n+1; ++i) colptr[i] = colptr_g[i];

        /* Broadcast matrix A to the other PEs. */
        MPI_Bcast( &m,     1,   mpi_int_t, 0, grid->comm );
        MPI_Bcast( &n,     1,   mpi_int_t, 0, grid->comm );
        MPI_Bcast( &nnz,   1,   mpi_int_t, 0, grid->comm );
        MPI_Bcast( nzval,  nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
        MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm );
        MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm );
    } else {
        /* Receive matrix A from PE 0. */
        MPI_Bcast( &m,   1, mpi_int_t, 0, grid->comm );
        MPI_Bcast( &n,   1, mpi_int_t, 0, grid->comm );
        MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm );

        /* Allocate storage for compressed column representation. */
        zallocateA_dist(n, nnz, &nzval, &rowind, &colptr);

        MPI_Bcast( nzval,  nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm );
        MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm );
        MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm );
    }

    /* Compute the number of rows to be distributed to the local process. */
    m_loc     = m / (grid->nprow * grid->npcol);
    m_loc_fst = m_loc;
    /* When m / procs is not an integer */
    if ((m_loc * grid->nprow * grid->npcol) != m) {
        m_loc = m_loc + 1;
        m_loc_fst = m_loc;
        /* The last process gets whatever rows remain. */
        if (iam == (grid->nprow * grid->npcol - 1))
            m_loc = m - m_loc_fst * (grid->nprow * grid->npcol - 1);
    }

    /* Create compressed column matrix for GA. */
    zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
                                SLU_NC, SLU_Z, SLU_GE);

    /*************************************************
     * Change GA to a local A with NR_loc format     *
     *************************************************/
    rowptr = (int_t *) intMalloc_dist(m_loc+1);
    marker = (int_t *) intCalloc_dist(n);

    /* Get counts of each row of GA */
    for (i = 0; i < n; ++i)
        for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]];

    /* Set up row pointers */
    rowptr[0] = 0;
    fst_row   = iam * m_loc_fst;
    nnz_loc   = 0;
    for (j = 0; j < m_loc; ++j) {
        row = fst_row + j;
        rowptr[j+1] = rowptr[j] + marker[row];
        marker[j] = rowptr[j];
    }
    nnz_loc = rowptr[m_loc];

    nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc);
    colind    = (int_t *) intMalloc_dist(nnz_loc);

    /* Transfer the matrix into the compressed row storage */
    for (i = 0; i < n; ++i) {
        for (j = colptr[i]; j < colptr[i+1]; ++j) {
            row = rowind[j];
            if ( (row >= fst_row) && (row < fst_row + m_loc) ) {
                row = row - fst_row;
                relpos = marker[row];
                colind[relpos] = i;
                nzval_loc[relpos] = nzval[j];
                ++marker[row];
            }
        }
    }

#if ( DEBUGlevel>=1 )
    if ( !iam ) zPrint_CompCol_Matrix_dist(&GA);
#endif

    /* Destroy GA */
    Destroy_CompCol_Matrix_dist(&GA);

    /* Set up the local A in NR_loc format */
    zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
                                   nzval_loc, colind, rowptr,
                                   SLU_NR_loc, SLU_Z, SLU_GE);

    SUPERLU_FREE(marker);

#if ( DEBUGlevel>=1 )
    printf("sizeof(NRformat_loc) %d\n", (int) sizeof(NRformat_loc));
    CHECK_MALLOC(iam, "Exit zcreate_dist_matrix()");
#endif
    return 0;
}
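/*
 * Hedged usage sketch, not part of the original file: every process
 * in the grid calls zcreate_dist_matrix() collectively, and only
 * process 0's nzval_g/rowind_g/colptr_g are read (filled beforehand
 * by, e.g., zreadhb_dist()). The wrapper name below is hypothetical;
 * it assumes superlu_zdefs.h is included, as in the rest of this file.
 */
static void zdistribute_and_report(int_t m, int_t n, int_t nnz,
                                   doublecomplex *nzval_g, int_t *rowind_g,
                                   int_t *colptr_g, gridinfo_t *grid)
{
    SuperMatrix A;
    NRformat_loc *Astore;

    zcreate_dist_matrix(&A, m, n, nnz, nzval_g, rowind_g, colptr_g, grid);

    /* The local row block is now stored in SLU_NR_loc format, ready to
       be passed to pzgssvx(); its extent can be read back from the store. */
    Astore = (NRformat_loc *) A.Store;
    printf("Process %d owns rows [%d, %d)\n", (int) grid->iam,
           (int) Astore->fst_row, (int) (Astore->fst_row + Astore->m_loc));

    Destroy_CompRowLoc_Matrix_dist(&A);
}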