Exemplo n.º 1
0
int main(int argc, char *argv[])
{
/*
 * Purpose
 * =======
 *
 * The driver program ZLINSOLX2.
 *
 * This example illustrates how to use ZGSSVX to solve systems repeatedly
 * with the same sparsity pattern of matrix A.
 * In this case, the column permutation vector perm_c is computed once.
 * The following data structures will be reused in the subsequent call to
 * ZGSSVX: perm_c, etree
 * 
 */
    char           equed[1];
    yes_no_t       equil;
    trans_t        trans;
    SuperMatrix    A, A1, L, U;
    SuperMatrix    B, B1, X;
    NCformat       *Astore;
    NCformat       *Ustore;
    SCformat       *Lstore;
    doublecomplex         *a, *a1;
    int            *asub, *xa, *asub1, *xa1;
    int            *perm_r; /* row permutations from partial pivoting */
    int            *perm_c; /* column permutation vector */
    int            *etree;
    void           *work;
    int            info, lwork, nrhs, ldx;
    int            i, j, m, n, nnz;
    doublecomplex         *rhsb, *rhsb1, *rhsx, *xact;
    double         *R, *C;
    double         *ferr, *berr;
    double         u, rpg, rcond;
    mem_usage_t    mem_usage;
    superlu_options_t options;
    SuperLUStat_t stat;
    extern void    parse_command_line();

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC("Enter main()");
#endif

    /* Defaults */
    lwork = 0;
    nrhs  = 1;
    equil = YES;	
    u     = 1.0;
    trans = NOTRANS;

    /* Set the default input options:
	options.Fact = DOFACT;
        options.Equil = YES;
    	options.ColPerm = COLAMD;
	options.DiagPivotThresh = 1.0;
    	options.Trans = NOTRANS;
    	options.IterRefine = NOREFINE;
    	options.SymmetricMode = NO;
    	options.PivotGrowth = NO;
    	options.ConditionNumber = NO;
    	options.PrintStat = YES;
     */
    set_default_options(&options);

    /* Can use command line input to modify the defaults. */
    parse_command_line(argc, argv, &lwork, &u, &equil, &trans);
    options.Equil = equil;
    options.DiagPivotThresh = u;
    options.Trans = trans;

    if ( lwork > 0 ) {
	work = SUPERLU_MALLOC(lwork);
	if ( !work ) {
	    ABORT("DLINSOLX: cannot allocate work[]");
	}
    }

    /* Read matrix A from a file in Harwell-Boeing format.*/
    zreadhb(&m, &n, &nnz, &a, &asub, &xa);
    if ( !(a1 = doublecomplexMalloc(nnz)) ) ABORT("Malloc fails for a1[].");
    if ( !(asub1 = intMalloc(nnz)) ) ABORT("Malloc fails for asub1[].");
    if ( !(xa1 = intMalloc(n+1)) ) ABORT("Malloc fails for xa1[].");
    for (i = 0; i < nnz; ++i) {
        a1[i] = a[i];
	asub1[i] = asub[i];
    }
    for (i = 0; i < n+1; ++i) xa1[i] = xa[i];
    
    zCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_Z, SLU_GE);
    Astore = A.Store;
    printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz);
    
    if ( !(rhsb = doublecomplexMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb[].");
    if ( !(rhsb1 = doublecomplexMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb1[].");
    if ( !(rhsx = doublecomplexMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsx[].");
    zCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_Z, SLU_GE);
    zCreate_Dense_Matrix(&X, m, nrhs, rhsx, m, SLU_DN, SLU_Z, SLU_GE);
    xact = doublecomplexMalloc(n * nrhs);
    ldx = n;
    zGenXtrue(n, nrhs, xact, ldx);
    zFillRHS(trans, nrhs, xact, ldx, &A, &B);
    for (j = 0; j < nrhs; ++j)
        for (i = 0; i < m; ++i) rhsb1[i+j*m] = rhsb[i+j*m];
    
    if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[].");
    if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[].");
    if ( !(etree = intMalloc(n)) ) ABORT("Malloc fails for etree[].");
    if ( !(R = (double *) SUPERLU_MALLOC(A.nrow * sizeof(double))) ) 
        ABORT("SUPERLU_MALLOC fails for R[].");
    if ( !(C = (double *) SUPERLU_MALLOC(A.ncol * sizeof(double))) )
        ABORT("SUPERLU_MALLOC fails for C[].");
    if ( !(ferr = (double *) SUPERLU_MALLOC(nrhs * sizeof(double))) )
        ABORT("SUPERLU_MALLOC fails for ferr[].");
    if ( !(berr = (double *) SUPERLU_MALLOC(nrhs * sizeof(double))) ) 
        ABORT("SUPERLU_MALLOC fails for berr[].");

    /* Initialize the statistics variables. */
    StatInit(&stat);
    
    /* ------------------------------------------------------------
       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME: AX = B
       ------------------------------------------------------------*/
    zgssvx(&options, &A, perm_c, perm_r, etree, equed, R, C,
           &L, &U, work, lwork, &B, &X, &rpg, &rcond, ferr, berr,
           &mem_usage, &stat, &info);

    printf("First system: zgssvx() returns info %d\n", info);

    if ( info == 0 || info == n+1 ) {

        /* This is how you could access the solution matrix. */
        doublecomplex *sol = (doublecomplex*) ((DNformat*) X.Store)->nzval; 

	if ( options.PivotGrowth ) printf("Recip. pivot growth = %e\n", rpg);
	if ( options.ConditionNumber )
	    printf("Recip. condition number = %e\n", rcond);
        Lstore = (SCformat *) L.Store;
        Ustore = (NCformat *) U.Store;
	printf("No of nonzeros in factor L = %d\n", Lstore->nnz);
    	printf("No of nonzeros in factor U = %d\n", Ustore->nnz);
    	printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n);
    	printf("FILL ratio = %.1f\n", (float)(Lstore->nnz + Ustore->nnz - n)/nnz);

	printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
	       mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
	if ( options.IterRefine ) {
            printf("Iterative Refinement:\n");
	    printf("%8s%8s%16s%16s\n", "rhs", "Steps", "FERR", "BERR");
	    for (i = 0; i < nrhs; ++i)
	      printf("%8d%8d%16e%16e\n", i+1, stat.RefineSteps, ferr[i], berr[i]);
	}
	fflush(stdout);

    } else if ( info > 0 && lwork == -1 ) {
        printf("** Estimated memory: %d bytes\n", info - n);
    }

    if ( options.PrintStat ) StatPrint(&stat);
    StatFree(&stat);
    Destroy_CompCol_Matrix(&A);
    Destroy_Dense_Matrix(&B);
    if ( lwork >= 0 ) { /* Deallocate storage associated with L and U. */
        Destroy_SuperNode_Matrix(&L);
        Destroy_CompCol_Matrix(&U);
    }

    /* ------------------------------------------------------------
       NOW WE SOLVE ANOTHER LINEAR SYSTEM: A1*X = B1
       ONLY THE SPARSITY PATTERN OF A1 IS THE SAME AS THAT OF A.
       ------------------------------------------------------------*/
    options.Fact = SamePattern;
    StatInit(&stat); /* Initialize the statistics variables. */

    zCreate_CompCol_Matrix(&A1, m, n, nnz, a1, asub1, xa1,
                           SLU_NC, SLU_Z, SLU_GE);
    zCreate_Dense_Matrix(&B1, m, nrhs, rhsb1, m, SLU_DN, SLU_Z, SLU_GE);

    zgssvx(&options, &A1, perm_c, perm_r, etree, equed, R, C,
           &L, &U, work, lwork, &B1, &X, &rpg, &rcond, ferr, berr,
           &mem_usage, &stat, &info);

    printf("\nSecond system: zgssvx() returns info %d\n", info);

    if ( info == 0 || info == n+1 ) {

        /* This is how you could access the solution matrix. */
        doublecomplex *sol = (doublecomplex*) ((DNformat*) X.Store)->nzval; 

	if ( options.PivotGrowth ) printf("Recip. pivot growth = %e\n", rpg);
	if ( options.ConditionNumber )
	    printf("Recip. condition number = %e\n", rcond);
        Lstore = (SCformat *) L.Store;
        Ustore = (NCformat *) U.Store;
	printf("No of nonzeros in factor L = %d\n", Lstore->nnz);
    	printf("No of nonzeros in factor U = %d\n", Ustore->nnz);
    	printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n);
	printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
	       mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
	if ( options.IterRefine ) {
            printf("Iterative Refinement:\n");
	    printf("%8s%8s%16s%16s\n", "rhs", "Steps", "FERR", "BERR");
	    for (i = 0; i < nrhs; ++i)
	      printf("%8d%8d%16e%16e\n", i+1, stat.RefineSteps, ferr[i], berr[i]);
	}
	fflush(stdout);
    } else if ( info > 0 && lwork == -1 ) {
        printf("** Estimated memory: %d bytes\n", info - n);
    }

    if ( options.PrintStat ) StatPrint(&stat);
    StatFree(&stat);

    SUPERLU_FREE (xact);
    SUPERLU_FREE (etree);
    SUPERLU_FREE (perm_r);
    SUPERLU_FREE (perm_c);
    SUPERLU_FREE (R);
    SUPERLU_FREE (C);
    SUPERLU_FREE (ferr);
    SUPERLU_FREE (berr);
    Destroy_CompCol_Matrix(&A1);
    Destroy_Dense_Matrix(&B1);
    Destroy_Dense_Matrix(&X);
    if ( lwork == 0 ) {
        Destroy_SuperNode_Matrix(&L);
        Destroy_CompCol_Matrix(&U);
    } else if ( lwork > 0 ) {
        SUPERLU_FREE(work);
    }

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC("Exit main()");
#endif
}
void f_create_SuperMatrix_handle(fptr *handle)
{
    *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix));
}
Exemplo n.º 3
0
float
sPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, 
             SuperMatrix *L, SuperMatrix *U)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======
 *
 * Compute the reciprocal pivot growth factor of the leading ncols columns
 * of the matrix, using the formula:
 *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
 *
 * Arguments
 * =========
 *
 * ncols    (input) int
 *          The number of columns of matrices A, L and U.
 *
 * A        (input) SuperMatrix*
 *          Original matrix A, permuted by columns, of dimension
 *          (A->nrow, A->ncol). The type of A can be:
 *          Stype = NC; Dtype = _D; Mtype = GE.
 *
 * L        (output) SuperMatrix*
 *          The factor L from the factorization Pr*A=L*U; use compressed row
 *          subscripts storage for supernodes, i.e., L has type:
 *          Stype = SC; Dtype = _D; Mtype = TRLU.
 *
 * U        (output) SuperMatrix*
 *          The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
 *          storage scheme, i.e., U has types: Stype = NC;
 *          Dtype = _D; Mtype = TRU.
 *
 */
    NCformat *Astore;
    SCPformat *Lstore;
    NCPformat *Ustore;
    float  *Aval, *Lval, *Uval;
    int      fsupc, nsupr, luptr, nz_in_U;
    int      i, j, k, oldcol;
    int      *inv_perm_c;
    float   rpg, maxaj, maxuj;
    extern   double slamch_(char *);
    float   smlnum;
    float   *luval;
   
    /* Get machine constants. */
    smlnum = slamch_("S");
    rpg = 1. / smlnum;

    Astore = A->Store;
    Lstore = L->Store;
    Ustore = U->Store;
    Aval = Astore->nzval;
    Lval = Lstore->nzval;
    Uval = Ustore->nzval;
    
    inv_perm_c = (int *) SUPERLU_MALLOC( (size_t) A->ncol*sizeof(int) );
    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;

    for (k = 0; k <= Lstore->nsuper; ++k) {
	fsupc = L_FST_SUPC(k);
	nsupr = L_SUB_END(fsupc) - L_SUB_START(fsupc);
	luptr = L_NZ_START(fsupc);
	luval = &Lval[luptr];
	nz_in_U = 1;
	
	for (j = fsupc; j < L_LAST_SUPC(k) && j < ncols; ++j) {
	    maxaj = 0.;
            oldcol = inv_perm_c[j];
	    for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; i++)
		maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) );
	
	    maxuj = 0.;
	    for (i = Ustore->colbeg[j]; i < Ustore->colend[j]; i++)
		maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) );
	    
	    /* Supernode */
	    for (i = 0; i < nz_in_U; ++i)
		maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) );

	    ++nz_in_U;
	    luval += nsupr;

	    if ( maxuj == 0. )
		rpg = SUPERLU_MIN( rpg, 1.);
	    else
		rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
	}
	
	if ( j >= ncols ) break;
    }

    SUPERLU_FREE(inv_perm_c);
    return (rpg);
}
void f_create_gridinfo_handle(fptr *handle)
{
    *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
}
void f_create_ScalePerm_handle(fptr *handle)
{
    *handle = (fptr) SUPERLU_MALLOC(sizeof(ScalePermstruct_t));
}
Exemplo n.º 6
0
void tlin::allocD(SuperMatrix *&A, int rows, int cols) {
  A = (SuperMatrix *)SUPERLU_MALLOC(sizeof(SuperMatrix));

  double *values = doubleMalloc(rows * cols * sizeof(double));
  dCreate_Dense_Matrix(A, rows, cols, values, rows, SLU_DN, SLU_D, SLU_GE);
}
Exemplo n.º 7
0
void tlin::allocD(SuperMatrix *&A, int rows, int cols, int lda,
                  double *values) {
  A = (SuperMatrix *)SUPERLU_MALLOC(sizeof(SuperMatrix));
  dCreate_Dense_Matrix(A, rows, cols, values, lda, SLU_DN, SLU_D, SLU_GE);
}
Exemplo n.º 8
0
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *   Set up the communication pattern for the triangular solution.
 * 
 * Arguments
 * =========
 *
 * n      (input) int (global)
 *        The dimension of the linear system.
 *
 * m_loc  (input) int (local)
 *        The local row dimension of the distributed input matrix.
 *
 * nrhs   (input) int (global)
 *        Number of right-hand sides.
 *
 * fst_row (input) int (global)
 *        The row number of matrix B's first row in the global matrix.
 *
 * perm_r (input) int* (global)
 *        The row permutation vector.
 *
 * perm_c (input) int* (global)
 *        The column permutation vector.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 * </pre>
 */
int_t
pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row,
	     int_t perm_r[], int_t perm_c[], gridinfo_t *grid,
	     Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct)
{

    int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
    int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
    int *itemp, *ptr_to_ibuf, *ptr_to_dbuf;
    int_t *row_to_proc;
    int_t i, gbi, k, l, num_diag_procs, *diag_procs;
    int_t irow, lk, q, knsupc, nsupers, *xsup, *supno;
    int   iam, p, pkk, procs;
    pxgstrs_comm_t *gstrs_comm;

    procs = grid->nprow * grid->npcol;
    iam = grid->iam;
    gstrs_comm = SOLVEstruct->gstrs_comm;
    xsup = Glu_persist->xsup;
    supno = Glu_persist->supno;
    nsupers = Glu_persist->supno[n-1] + 1;
    row_to_proc = SOLVEstruct->row_to_proc;

    /* ------------------------------------------------------------
       SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X.
       ------------------------------------------------------------*/
    if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) )
        ABORT("Malloc fails for B_to_X_itemp[].");
    SendCnt      = itemp;
    SendCnt_nrhs = itemp +   procs;
    RecvCnt      = itemp + 2*procs;
    RecvCnt_nrhs = itemp + 3*procs;
    sdispls      = itemp + 4*procs;
    sdispls_nrhs = itemp + 5*procs;
    rdispls      = itemp + 6*procs;
    rdispls_nrhs = itemp + 7*procs;

    /* Count the number of elements to be sent to each diagonal process.*/
    for (p = 0; p < procs; ++p) SendCnt[p] = 0;
    for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
        irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
	gbi = BlockNum( irow );
	p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
	++SendCnt[p];
    }
  
    /* Set up the displacements for alltoall. */
    MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
    sdispls[0] = rdispls[0] = 0;
    for (p = 1; p < procs; ++p) {
        sdispls[p] = sdispls[p-1] + SendCnt[p-1];
        rdispls[p] = rdispls[p-1] + RecvCnt[p-1];
    }
    for (p = 0; p < procs; ++p) {
        SendCnt_nrhs[p] = SendCnt[p] * nrhs;
	sdispls_nrhs[p] = sdispls[p] * nrhs;
        RecvCnt_nrhs[p] = RecvCnt[p] * nrhs;
	rdispls_nrhs[p] = rdispls[p] * nrhs;
    }

    /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/
    gstrs_comm->B_to_X_SendCnt = SendCnt;

    /* ------------------------------------------------------------
       SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B.
       ------------------------------------------------------------*/
    /* This is freed in pxgstrs_finalize(). */
    if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) )
        ABORT("Malloc fails for X_to_B_itemp[].");
    SendCnt      = itemp;
    SendCnt_nrhs = itemp +   procs;
    RecvCnt      = itemp + 2*procs;
    RecvCnt_nrhs = itemp + 3*procs;
    sdispls      = itemp + 4*procs;
    sdispls_nrhs = itemp + 5*procs;
    rdispls      = itemp + 6*procs;
    rdispls_nrhs = itemp + 7*procs;

    /* Count the number of X entries to be sent to each process.*/
    for (p = 0; p < procs; ++p) SendCnt[p] = 0;
    num_diag_procs = SOLVEstruct->num_diag_procs;
    diag_procs = SOLVEstruct->diag_procs;

    for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */
	pkk = diag_procs[p];
	if ( iam == pkk ) {
	    for (k = p; k < nsupers; k += num_diag_procs) {
		knsupc = SuperSize( k );
		lk = LBi( k, grid ); /* Local block number */
		irow = FstBlockC( k );
		for (i = 0; i < knsupc; ++i) {
#if 0
		    q = row_to_proc[inv_perm_c[irow]];
#else
		    q = row_to_proc[irow];
#endif
		    ++SendCnt[q];
		    ++irow;
		}
	    }
	}
    }

    MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
    sdispls[0] = rdispls[0] = 0;
    sdispls_nrhs[0] = rdispls_nrhs[0] = 0;
    SendCnt_nrhs[0] = SendCnt[0] * nrhs;
    RecvCnt_nrhs[0] = RecvCnt[0] * nrhs;
    for (p = 1; p < procs; ++p) {
        sdispls[p] = sdispls[p-1] + SendCnt[p-1];
        rdispls[p] = rdispls[p-1] + RecvCnt[p-1];
        sdispls_nrhs[p] = sdispls[p] * nrhs;
        rdispls_nrhs[p] = rdispls[p] * nrhs;
	SendCnt_nrhs[p] = SendCnt[p] * nrhs;
	RecvCnt_nrhs[p] = RecvCnt[p] * nrhs;
    }

    /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/
    gstrs_comm->X_to_B_SendCnt = SendCnt;

    if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) )
        ABORT("Malloc fails for ptr_to_ibuf[].");
    gstrs_comm->ptr_to_ibuf = ptr_to_ibuf;
    gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs;

} /* PXGSTRS_INIT */
Exemplo n.º 9
0
int_t
pddistribute(fact_t fact, int_t n, SuperMatrix *A,
	     ScalePermstruct_t *ScalePermstruct,
	     Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct,
	     gridinfo_t *grid)
/*
 * -- Distributed SuperLU routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * March 15, 2003
 *
 *
 * Purpose
 * =======
 *   Distribute the matrix onto the 2D process mesh.
 * 
 * Arguments
 * =========
 * 
 * fact (input) fact_t
 *        Specifies whether or not the L and U structures will be re-used.
 *        = SamePattern_SameRowPerm: L and U structures are input, and
 *                                   unchanged on exit.
 *        = DOFACT or SamePattern: L and U structures are computed and output.
 *
 * n      (input) int
 *        Dimension of the matrix.
 *
 * A      (input) SuperMatrix*
 *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
 *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
 *        The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE.
 *
 * ScalePermstruct (input) ScalePermstruct_t*
 *        The data structure to store the scaling and permutation vectors
 *        describing the transformations performed to the original matrix A.
 *
 * Glu_freeable (input) *Glu_freeable_t
 *        The global structure describing the graph of L and U.
 * 
 * LUstruct (input) LUstruct_t*
 *        Data structures for L and U factors.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 *
 * Return value
 * ============
 *   > 0, working storage required (in bytes).
 *
 */
{
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, len, len1, nsupc;
    int_t ljb;  /* local block column number */
    int_t nrbl; /* number of L blocks in current block column */
    int_t nrbu; /* number of U blocks in current block column */
    int_t gb;   /* global block number; 0 < gb <= nsuper */
    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
    int iam, jbrow, kcol, mycol, myrow, pc, pr;
    int_t mybufmax[NBUFFERS];
#if 0
    NCPformat *Astore;
#else /* XSL ==> */
    NRformat_loc *Astore;
#endif
    double *a;
    int_t *asub, *xa;
#if 0
    int_t *xa_begin, *xa_end;
#endif
    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
    int_t *supno = Glu_persist->supno;   
    int_t *lsub, *xlsub, *usub, *xusub;
    int_t nsupers;
    int_t next_lind;      /* next available position in index[*] */
    int_t next_lval;      /* next available position in nzval[*] */
    int_t *index;         /* indices consist of headers and row subscripts */
    double *lusup, *uval; /* nonzero values in L and U */
    double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
    int_t  **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
    double **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
    int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */

    /*-- Counts to be used in factorization. --*/
    int_t  *ToRecv, *ToSendD, **ToSendR;

    /*-- Counts to be used in lower triangular solve. --*/
    int_t  *fmod;          /* Modification count for L-solve.        */
    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
    int_t  kseen;

    /*-- Counts to be used in upper triangular solve. --*/
    int_t  *bmod;          /* Modification count for U-solve.        */
    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
    int_t  *ilsum;         /* starting position of each supernode in 
			      the full array (local)                 */

    /*-- Auxiliary arrays; freed on return --*/
    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr)             */
    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr)      */
    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr)  */
    int_t *Ucbs;       /* number of column blocks in a block row            */
    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr)             */
    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr)        */
    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr)      */
    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr)      */
    double *dense, *dense_col; /* SPA */
    double zero = 0.0;
    int_t ldaspa;     /* LDA of SPA */
    int_t mem_use = 0, iword, dword;

#if ( PRNTlevel>=1 )
    int_t nLblocks = 0, nUblocks = 0;
#endif
#if ( PROFlevel>=1 ) 
    double t, t_u, t_l;
    int_t u_blks;
#endif

    /* Initialization. */
    iam = grid->iam;
    myrow = MYROW( iam, grid );
    mycol = MYCOL( iam, grid );
    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
    nsupers  = supno[n-1] + 1;
    Astore   = (NRformat_loc *) A->Store;

#if ( PRNTlevel>=1 )
    iword = sizeof(int_t);
    dword = sizeof(double);
#endif

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pddistribute()");
#endif

    dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno,
		      grid, &xa, &asub, &a);

    if ( fact == SamePattern_SameRowPerm ) {
#if ( PROFlevel>=1 )
	t_l = t_u = 0; u_blks = 0;
#endif
	/* We can propagate the new values of A into the existing
	   L and U data structures.            */
	ilsum = Llu->ilsum;
	ldaspa = Llu->ldalsum;
	if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) )
	    ABORT("Calloc fails for SPA dense[].");
	nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
	if ( !(Urb_length = intCalloc_dist(nrbu)) )
	    ABORT("Calloc fails for Urb_length[].");
	if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
	    ABORT("Malloc fails for Urb_indptr[].");
	for (lb = 0; lb < nrbu; ++lb) 
	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
	Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
	Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
	Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
	Unzval_br_ptr = Llu->Unzval_br_ptr;
#if ( PRNTlevel>=1 )
	mem_use += 2*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
#endif
	for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
	    pc = PCOL( jb, grid );
	    if ( mycol == pc ) { /* Block column jb in my process column */
		fsupc = FstBlockC( jb );
		nsupc = SuperSize( jb );

		/* Scatter A into SPA. */
		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
		    for (i = xa[j]; i < xa[j+1]; ++i) {
			irow = asub[i];
			gb = BlockNum( irow );
			if ( myrow == PROW( gb, grid ) ) {
			    lb = LBi( gb, grid );
			    irow = ilsum[lb] + irow - FstBlockC( gb );
			    dense_col[irow] = a[i];
			}
		    }
		    dense_col += ldaspa;
		}

#if ( PROFlevel>=1 )
		t = SuperLU_timer_();
#endif
		/* Gather the values of A from SPA into Unzval[]. */
		for (lb = 0; lb < nrbu; ++lb) {
		    index = Ufstnz_br_ptr[lb];
		    if ( index && index[Urb_indptr[lb]] == jb ) {
			uval = Unzval_br_ptr[lb];
			len = Urb_indptr[lb] + UB_DESCRIPTOR;
			gb = lb * grid->nprow + myrow;/* Global block number */
			k = FstBlockC( gb+1 );
			irow = ilsum[lb] - FstBlockC( gb );
			for (jj = 0, dense_col = dense; jj < nsupc; ++jj) {
			    j = index[len+jj];
			    for (i = j; i < k; ++i) {
				uval[Urb_length[lb]++] = dense_col[irow+i];
				dense_col[irow+i] = zero;
			    }
			    dense_col += ldaspa;
			}
			Urb_indptr[lb] += UB_DESCRIPTOR + nsupc;
		    } /* if index != NULL */
		} /* for lb ... */
#if ( PROFlevel>=1 )
		t_u += SuperLU_timer_() - t;
		t = SuperLU_timer_();
#endif
		/* Gather the values of A from SPA into Lnzval[]. */
		ljb = LBj( jb, grid ); /* Local block number */
		index = Lrowind_bc_ptr[ljb];
		if ( index ) {
		    nrbl = index[0];   /* Number of row blocks. */
		    len = index[1];    /* LDA of lusup[]. */
		    lusup = Lnzval_bc_ptr[ljb];
		    next_lind = BC_HEADER;
		    next_lval = 0;
		    for (jj = 0; jj < nrbl; ++jj) {
			gb = index[next_lind++];
			len1 = index[next_lind++]; /* Rows in the block. */
			lb = LBi( gb, grid );
			for (bnnz = 0; bnnz < len1; ++bnnz) {
			    irow = index[next_lind++]; /* Global index. */
			    irow = ilsum[lb] + irow - FstBlockC( gb );
			    k = next_lval++;
			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
				lusup[k] = dense_col[irow];
				dense_col[irow] = zero;
				k += len;
				dense_col += ldaspa;
			    }
			} /* for bnnz ... */
		    } /* for jj ... */
		} /* if index ... */
#if ( PROFlevel>=1 )
		t_l += SuperLU_timer_() - t;
#endif
	    } /* if mycol == pc */
	} /* for jb ... */

	SUPERLU_FREE(dense);
	SUPERLU_FREE(Urb_length);
	SUPERLU_FREE(Urb_indptr);
#if ( PROFlevel>=1 )
	if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
			   t_l, t_u, u_blks, nrbu);
#endif

    } else {
        /* ------------------------------------------------------------
	   FIRST TIME CREATING THE L AND U DATA STRUCTURES.
	   ------------------------------------------------------------*/

#if ( PROFlevel>=1 )
	t_l = t_u = 0; u_blks = 0;
#endif
	/* We first need to set up the L and U data structures and then
	 * propagate the values of A into them.
	 */
	lsub = Glu_freeable->lsub;    /* compressed L subscripts */
	xlsub = Glu_freeable->xlsub;
	usub = Glu_freeable->usub;    /* compressed U subscripts */
	xusub = Glu_freeable->xusub;
    
	if ( !(ToRecv = intCalloc_dist(nsupers)) )
	    ABORT("Calloc fails for ToRecv[].");

	k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
	if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
	    ABORT("Malloc fails for ToSendR[].");
	j = k * grid->npcol;
	if ( !(index = intMalloc_dist(j)) )
	    ABORT("Malloc fails for index[].");
#if ( PRNTlevel>=1 )
	mem_use = k*sizeof(int_t*) + (j + nsupers)*iword;
#endif
	for (i = 0; i < j; ++i) index[i] = EMPTY;
	for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
	k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */

	/* Pointers to the beginning of each block row of U. */
	if ( !(Unzval_br_ptr = 
              (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
	    ABORT("Malloc fails for Unzval_br_ptr[].");
	if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
	    ABORT("Malloc fails for Ufstnz_br_ptr[].");
	
	if ( !(ToSendD = intCalloc_dist(k)) )
	    ABORT("Malloc fails for ToSendD[].");
	if ( !(ilsum = intMalloc_dist(k+1)) )
	    ABORT("Malloc fails for ilsum[].");

	/* Auxiliary arrays used to set up U block data structures.
	   They are freed on return. */
	if ( !(rb_marker = intCalloc_dist(k)) )
	    ABORT("Calloc fails for rb_marker[].");
	if ( !(Urb_length = intCalloc_dist(k)) )
	    ABORT("Calloc fails for Urb_length[].");
	if ( !(Urb_indptr = intMalloc_dist(k)) )
	    ABORT("Malloc fails for Urb_indptr[].");
	if ( !(Urb_fstnz = intCalloc_dist(k)) )
	    ABORT("Calloc fails for Urb_fstnz[].");
	if ( !(Ucbs = intCalloc_dist(k)) )
	    ABORT("Calloc fails for Ucbs[].");
#if ( PRNTlevel>=1 )	
	mem_use = 2*k*sizeof(int_t*) + (7*k+1)*iword;
#endif
	/* Compute ldaspa and ilsum[]. */
	ldaspa = 0;
	ilsum[0] = 0;
	for (gb = 0; gb < nsupers; ++gb) {
	    if ( myrow == PROW( gb, grid ) ) {
		i = SuperSize( gb );
		ldaspa += i;
		lb = LBi( gb, grid );
		ilsum[lb + 1] = ilsum[lb] + i;
	    }
	}
	
            
	/* ------------------------------------------------------------
	   COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
	   THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
	   ------------------------------------------------------------*/
	
	/* Loop through each supernode column. */
	for (jb = 0; jb < nsupers; ++jb) {
	    pc = PCOL( jb, grid );
	    fsupc = FstBlockC( jb );
	    nsupc = SuperSize( jb );
	    /* Loop through each column in the block. */
	    for (j = fsupc; j < fsupc + nsupc; ++j) {
		/* usub[*] contains only "first nonzero" in each segment. */
		for (i = xusub[j]; i < xusub[j+1]; ++i) {
		    irow = usub[i]; /* First nonzero of the segment. */
		    gb = BlockNum( irow );
		    kcol = PCOL( gb, grid );
		    ljb = LBj( gb, grid );
		    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
		    pr = PROW( gb, grid );
		    lb = LBi( gb, grid );
		    if ( mycol == pc ) {
			if  ( myrow == pr ) {
			    ToSendD[lb] = YES;
			    /* Count nonzeros in entire block row. */
			    Urb_length[lb] += FstBlockC( gb+1 ) - irow;
			    if (rb_marker[lb] <= jb) {/* First see the block */
				rb_marker[lb] = jb + 1;
				Urb_fstnz[lb] += nsupc;
				++Ucbs[lb]; /* Number of column blocks
					       in block row lb. */
#if ( PRNTlevel>=1 )
				++nUblocks;
#endif
			    }
			    ToRecv[gb] = 1;
			} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
		    }
		} /* for i ... */
	    } /* for j ... */
	} /* for jb ... */
	
	/* Set up the initial pointers for each block row in U. */
	nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
	for (lb = 0; lb < nrbu; ++lb) {
	    len = Urb_length[lb];
	    rb_marker[lb] = 0; /* Reset block marker. */
	    if ( len ) {
		/* Add room for descriptors */
		len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
		if ( !(index = intMalloc_dist(len1+1)) )
		    ABORT("Malloc fails for Uindex[].");
		Ufstnz_br_ptr[lb] = index;
		if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) )
		    ABORT("Malloc fails for Unzval_br_ptr[*][].");
		mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
		mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
		index[0] = Ucbs[lb]; /* Number of column blocks */
		index[1] = len;      /* Total length of nzval[] */
		index[2] = len1;     /* Total length of index[] */
		index[len1] = -1;    /* End marker */
	    } else {
		Ufstnz_br_ptr[lb] = NULL;
		Unzval_br_ptr[lb] = NULL;
	    }
	    Urb_length[lb] = 0; /* Reset block length. */
	    Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
	} /* for lb ... */

	SUPERLU_FREE(Urb_fstnz);
	SUPERLU_FREE(Ucbs);
#if ( PRNTlevel>=1 )
        mem_use -= 2*k * iword;
#endif
	/* Auxiliary arrays used to set up L block data structures.
	   They are freed on return.
	   k is the number of local row blocks.   */
	if ( !(Lrb_length = intCalloc_dist(k)) )
	    ABORT("Calloc fails for Lrb_length[].");
	if ( !(Lrb_number = intMalloc_dist(k)) )
	    ABORT("Malloc fails for Lrb_number[].");
	if ( !(Lrb_indptr = intMalloc_dist(k)) )
	    ABORT("Malloc fails for Lrb_indptr[].");
	if ( !(Lrb_valptr = intMalloc_dist(k)) )
	    ABORT("Malloc fails for Lrb_valptr[].");
	if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) )
	    ABORT("Calloc fails for SPA dense[].");

	/* These counts will be used for triangular solves. */
	if ( !(fmod = intCalloc_dist(k)) )
	    ABORT("Calloc fails for fmod[].");
	if ( !(bmod = intCalloc_dist(k)) )
	    ABORT("Calloc fails for bmod[].");
	/* ------------------------------------------------ */
#if ( PRNTlevel>=1 )	
	mem_use += 6*k*iword + ldaspa*sp_ienv_dist(3)*dword;
#endif
	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */

	/* Pointers to the beginning of each block column of L. */
	if ( !(Lnzval_bc_ptr = 
              (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
	Lrowind_bc_ptr[k-1] = NULL;

	/* These lists of processes will be used for triangular solves. */
	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
	    ABORT("Malloc fails for fsendx_plist[].");
	len = k * grid->nprow;
	if ( !(index = intMalloc_dist(len)) )
	    ABORT("Malloc fails for fsendx_plist[0]");
	for (i = 0; i < len; ++i) index[i] = EMPTY;
	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
	    fsendx_plist[i] = &index[j];
	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
	    ABORT("Malloc fails for bsendx_plist[].");
	if ( !(index = intMalloc_dist(len)) )
	    ABORT("Malloc fails for bsendx_plist[0]");
	for (i = 0; i < len; ++i) index[i] = EMPTY;
	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
	    bsendx_plist[i] = &index[j];
	/* -------------------------------------------------------------- */
#if ( PRNTlevel>=1 )
	mem_use += 4*k*sizeof(int_t*) + 2*len*iword;
#endif

	/*------------------------------------------------------------
	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
	  THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
	  ------------------------------------------------------------*/

	for (jb = 0; jb < nsupers; ++jb) {
	    pc = PCOL( jb, grid );
	    if ( mycol == pc ) { /* Block column jb in my process column */
		fsupc = FstBlockC( jb );
		nsupc = SuperSize( jb );
		ljb = LBj( jb, grid ); /* Local block number */
		
		/* Scatter A into SPA. */
		for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
		    for (i = xa[j]; i < xa[j+1]; ++i) {
			irow = asub[i];
			gb = BlockNum( irow );
			if ( myrow == PROW( gb, grid ) ) {
			    lb = LBi( gb, grid );
			    irow = ilsum[lb] + irow - FstBlockC( gb );
			    dense_col[irow] = a[i];
			}
		    }
		    dense_col += ldaspa;
		}

		jbrow = PROW( jb, grid );

#if ( PROFlevel>=1 )
		t = SuperLU_timer_();
#endif
		/*------------------------------------------------
		 * SET UP U BLOCKS.
		 *------------------------------------------------*/
		kseen = 0;
		/* Loop through each column in the block column. */
		for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
		    istart = xusub[j];
		    for (i = istart; i < xusub[j+1]; ++i) {
			irow = usub[i]; /* First nonzero in the segment. */
			gb = BlockNum( irow );
			pr = PROW( gb, grid );
			if ( pr != jbrow ) 
			    bsendx_plist[ljb][pr] = YES;
			if ( myrow == pr ) {
			    lb = LBi( gb, grid ); /* Local block number */
			    index = Ufstnz_br_ptr[lb];
			    if (rb_marker[lb] <= jb) {/* First see the block */
				rb_marker[lb] = jb + 1;
				index[Urb_indptr[lb]] = jb; /* Descriptor */
				Urb_indptr[lb] += UB_DESCRIPTOR;
				len = Urb_indptr[lb];
				for (k = 0; k < nsupc; ++k)
				    index[len+k] = FstBlockC( gb+1 );
				if ( gb != jb )/* Exclude diagonal block. */
				    ++bmod[lb];/* Mod. count for back solve */
				if ( kseen == 0 && myrow != jbrow ) {
				    ++nbrecvx;
				    kseen = 1;
				}
			    } else {
				len = Urb_indptr[lb];/* Start fstnz in index */
			    }
			    jj = j - fsupc;
			    index[len+jj] = irow;
			} /* if myrow == pr ... */
		    } /* for i ... */
		} /* for j ... */

		/* Figure out how many nonzeros in each block, and gather
		   the initial values of A from SPA into Uval. */
		for (lb = 0; lb < nrbu; ++lb) {
		    if ( rb_marker[lb] == jb + 1 ) { /* Not an empty block. */
			index = Ufstnz_br_ptr[lb];
			uval = Unzval_br_ptr[lb];
			len = Urb_indptr[lb];
			gb = lb * grid->nprow + myrow;/* Global block number */
			k = FstBlockC( gb+1 );
			irow = ilsum[lb] - FstBlockC( gb );
			for (jj=0, bnnz=0, dense_col=dense; jj < nsupc; ++jj) {
			    j = index[len+jj];  /* First nonzero in segment. */
			    bnnz += k - j;
			    for (i = j; i < k; ++i) {
				uval[Urb_length[lb]++] = dense_col[irow + i];
				dense_col[irow + i] = zero;
			    }
			    dense_col += ldaspa;
			}
			index[len-1] = bnnz; /* Set block length in Descriptor */
			Urb_indptr[lb] += nsupc;
		    }
		} /* for lb ... */
#if ( PROFlevel>=1 )
		t_u += SuperLU_timer_() - t;
		t = SuperLU_timer_();
#endif		
		/*------------------------------------------------
		 * SET UP L BLOCKS.
		 *------------------------------------------------*/

		/* Count number of blocks and length of each block. */
		nrbl = 0;
		len = 0; /* Number of row subscripts I own. */
		kseen = 0;
		istart = xlsub[fsupc];
		for (i = istart; i < xlsub[fsupc+1]; ++i) {
		    irow = lsub[i];
		    gb = BlockNum( irow ); /* Global block number */
		    pr = PROW( gb, grid ); /* Process row owning this block */
		    if ( pr != jbrow )
			fsendx_plist[ljb][pr] = YES;
		    if ( myrow == pr ) {
			lb = LBi( gb, grid );  /* Local block number */
			if (rb_marker[lb] <= jb) { /* First see this block */
			    rb_marker[lb] = jb + 1;
			    Lrb_length[lb] = 1;
			    Lrb_number[nrbl++] = gb;
			    if ( gb != jb ) /* Exclude diagonal block. */
				++fmod[lb]; /* Mod. count for forward solve */
			    if ( kseen == 0 && myrow != jbrow ) {
				++nfrecvx;
				kseen = 1;
			    }
#if ( PRNTlevel>=1 )
			    ++nLblocks;
#endif
			} else {
			    ++Lrb_length[lb];
			}
			++len;
		    }
		} /* for i ... */

		if ( nrbl ) { /* Do not ensure the blocks are sorted! */
		    /* Set up the initial pointers for each block in 
		       index[] and nzval[]. */
		    /* Add room for descriptors */
		    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
		    if ( !(index = intMalloc_dist(len1)) ) 
			ABORT("Malloc fails for index[]");
		    Lrowind_bc_ptr[ljb] = index;
		    if (!(Lnzval_bc_ptr[ljb] = 
                         doubleMalloc_dist(len*nsupc))) {
			fprintf(stderr, "col block %d ", jb);
			ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
		    }
		    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
		    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
		    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
		    index[0] = nrbl;  /* Number of row blocks */
		    index[1] = len;   /* LDA of the nzval[] */
		    next_lind = BC_HEADER;
		    next_lval = 0;
		    for (k = 0; k < nrbl; ++k) {
			gb = Lrb_number[k];
			lb = LBi( gb, grid );
			len = Lrb_length[lb];
			Lrb_length[lb] = 0;  /* Reset vector of block length */
			index[next_lind++] = gb; /* Descriptor */
			index[next_lind++] = len; 
			Lrb_indptr[lb] = next_lind;
			Lrb_valptr[lb] = next_lval;
			next_lind += len;
			next_lval += len;
		    }
		    /* Propagate the compressed row subscripts to Lindex[],
                       and the initial values of A from SPA into Lnzval[]. */
		    lusup = Lnzval_bc_ptr[ljb];
		    len = index[1];  /* LDA of lusup[] */
		    for (i = istart; i < xlsub[fsupc+1]; ++i) {
			irow = lsub[i];
			gb = BlockNum( irow );
			if ( myrow == PROW( gb, grid ) ) {
			    lb = LBi( gb, grid );
			    k = Lrb_indptr[lb]++; /* Random access a block */
			    index[k] = irow;
			    k = Lrb_valptr[lb]++;
			    irow = ilsum[lb] + irow - FstBlockC( gb );
			    for (j = 0, dense_col = dense; j < nsupc; ++j) {
				lusup[k] = dense_col[irow];
				dense_col[irow] = zero;
				k += len;
				dense_col += ldaspa;
			    }
			}
		    } /* for i ... */
		} else {
		    Lrowind_bc_ptr[ljb] = NULL;
		    Lnzval_bc_ptr[ljb] = NULL;
		} /* if nrbl ... */
#if ( PROFlevel>=1 )
		t_l += SuperLU_timer_() - t;
#endif
	    } /* if mycol == pc */

	} /* for jb ... */

	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
	Llu->Unzval_br_ptr = Unzval_br_ptr;
	Llu->ToRecv = ToRecv;
	Llu->ToSendD = ToSendD;
	Llu->ToSendR = ToSendR;
	Llu->fmod = fmod;
	Llu->fsendx_plist = fsendx_plist;
	Llu->nfrecvx = nfrecvx;
	Llu->bmod = bmod;
	Llu->bsendx_plist = bsendx_plist;
	Llu->nbrecvx = nbrecvx;
	Llu->ilsum = ilsum;
	Llu->ldalsum = ldaspa;
	
#if ( PRNTlevel>=1 )
	if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
			   nLblocks, nUblocks);
#endif

	SUPERLU_FREE(rb_marker);
	SUPERLU_FREE(Urb_length);
	SUPERLU_FREE(Urb_indptr);
	SUPERLU_FREE(Lrb_length);
	SUPERLU_FREE(Lrb_number);
	SUPERLU_FREE(Lrb_indptr);
	SUPERLU_FREE(Lrb_valptr);
	SUPERLU_FREE(dense);

	/* Find the maximum buffer size. */
	MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, 
		      MPI_MAX, grid->comm);
#if ( PROFlevel>=1 )
	if ( !iam ) printf(".. 1st distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
			   t_l, t_u, u_blks, nrbu);
#endif

    } /* else fact != SamePattern_SameRowPerm */

    SUPERLU_FREE(xa);
    SUPERLU_FREE(asub);
    SUPERLU_FREE(a);

#if ( DEBUGlevel>=1 )
    /* Memory allocated but not freed:
       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
    CHECK_MALLOC(iam, "Exit pddistribute()");
#endif
    
    return (mem_use);
} /* PDDISTRIBUTE */
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *
 * GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a
 * graph partitioning algorithm to the symmetrized graph A+A'.  The
 * multilevel graph partitioning algorithm used is the
 * ParMETIS_V3_NodeND routine available in the parallel graph
 * partitioning package parMETIS.  
 *
 * The number of independent sub-domains noDomains computed by this
 * algorithm has to be a power of 2.  Hence noDomains is the larger
 * number power of 2 that is smaller than nprocs_i, where nprocs_i = nprow
 * * npcol is the number of processors used in SuperLU_DIST.
 *
 * Arguments
 * =========
 *
 * A       (input) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
 *         of the linear equations is A->nrow.  Matrix A is distributed
 *         in NRformat_loc format.
 *
 * perm_r  (input) int_t*
 *         Row permutation vector of size A->nrow, which defines the 
 *         permutation matrix Pr; perm_r[i] = j means row i of A is in 
 *         position j in Pr*A.
 *
 * perm_c  (output) int_t*
 *	   Column permutation vector of size A->ncol, which defines the 
 *         permutation matrix Pc; perm_c[i] = j means column i of A is 
 *         in position j in A*Pc.
 *
 * nprocs_i (input) int*
 *         Number of processors the input matrix is distributed on in a block
 *         row format.  It corresponds to number of processors used in
 *         SuperLU_DIST.
 *
 * noDomains (input) int*, must be power of 2
 *         Number of independent domains to be computed by the graph
 *         partitioning algorithm.  ( noDomains <= nprocs_i )
 *
 * sizes   (output) int_t**, of size 2 * noDomains
 *         Returns pointer to an array containing the number of nodes
 *         for each sub-domain and each separator.  Separators are stored 
 *         from left to right.
 *         Memory for the array is allocated in this routine.
 *
 * fstVtxSep (output) int_t**, of size 2 * noDomains
 *         Returns pointer to an array containing first node for each
 *         sub-domain and each separator.
 *         Memory for the array is allocated in this routine.
 *
 * Return value
 * ============
 *   < 0, number of bytes allocated on return from the symbolic factorization.
 *   > 0, number of bytes allocated when out of memory.
 * </pre>
 */
float
get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
		     int nprocs_i, int noDomains, 
		     int_t **sizes, int_t **fstVtxSep,
		     gridinfo_t *grid, MPI_Comm *metis_comm)

{
  NRformat_loc *Astore;
  int   iam, p;
#if 0
  int   *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int;
  int   *options, numflag;
#else /* 64-bit integers */
  int_t options[4]={0,0,0,1}, numflag;
#endif
  int_t m_loc, fst_row;
  int_t m, n, bnz, i, j;
  int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
  int_t *b_rowptr, *b_colind;
  int_t *dist_order;
  int  *recvcnts, *displs;
  /* first row index on each processor when the matrix is distributed
     on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
  int_t  *vtxdist_i, *vtxdist_o; 
  int_t szSep, k, noNodes;
  float apat_mem_l; /* memory used during the computation of the graph of A+A' */
  float mem;  /* Memory used during this routine */
  MPI_Status status;

  /* Initialization. */
  MPI_Comm_rank (grid->comm, &iam);
  n = A->ncol;
  m = A->nrow;
  if ( m != n ) ABORT("Matrix is not square");
  mem = 0.;

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()");
#endif

  Astore = (NRformat_loc *) A->Store;
  m_loc = Astore->m_loc;     /* number of rows local to this processor */
  fst_row = Astore->fst_row; /* global index of the first row */
  rowptr = Astore->rowptr;   /* pointer to rows and column indices */
  colind = Astore->colind;
  
#if ( PRNTlevel>=1 )
  if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n",
		     noDomains);
#endif

  numflag = 0;
  /* determine first row on each processor */
  vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
  if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i.");
  vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
  if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o.");

  MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t,
		 grid->comm);
  vtxdist_i[nprocs_i] = m;

  if (noDomains == nprocs_i) {
    /* keep the same distribution of A */
    for (p = 0; p <= nprocs_i; p++)
      vtxdist_o[p] = vtxdist_i[p];
  }
  else {
    i = n / noDomains;
    j = n % noDomains;
    for (k = 0, p = 0; p < noDomains; p++) {
      vtxdist_o[p] = k;
      k += i;
      if (p < j)  k++;
    }
    /* The remaining non-participating processors get the same 
       first-row-number as the last processor.   */
    for (p = noDomains; p <= nprocs_i; p++)
      vtxdist_o[p] = k;
  }

#if ( DEBUGlevel>=2 )
  if (!iam)
    PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o);
#endif  

  /* Compute distributed A + A' */
  if ((apat_mem_l = 
       a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i,
			     n, rowptr, colind, noDomains, vtxdist_o,
			     &bnz, &b_rowptr, &b_colind, grid)) > 0)
    return (apat_mem_l);
  mem += -apat_mem_l;
  
  /* Initialize and allocate storage for parMetis. */    
  (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
  if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes.");
  l_sizes = *sizes;
  (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
  if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
  l_fstVtxSep = *fstVtxSep;
  m_loc = vtxdist_o[iam+1] - vtxdist_o[iam];
  
  if ( iam < noDomains) 
    /* dist_order is the perm returned by parMetis, distributed */
    if (! (dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
      ABORT("SUPERLU_MALLOC fails for dist_order.");

#if 0

  /* ParMETIS represents the column pointers and row indices of *
   * the input matrix using integers. When SuperLU_DIST uses    *
   * long int for the int_t type, then several supplementary    *
   * copies need to be performed in order to call ParMETIS.     */
#if defined (_LONGINT)
  l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int));
  if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int.");
  
  /* Allocate storage */
  if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int))))
    ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]");
  for (i = 0; i <= m_loc; i++)
    b_rowptr_int[i] = b_rowptr[i];
  SUPERLU_FREE (b_rowptr);
  
  if ( bnz ) {
    if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int))))
      ABORT("SUPERLU_MALLOC fails for b_colind_int[]");
    for (i = 0; i < bnz; i++)
      b_colind_int[i] = b_colind[i];
    SUPERLU_FREE (b_colind);
  }
  
  if ( !(vtxdist_o_int = 
	 (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int))))
    ABORT("SUPERLU_MALLOC fails for vtxdist_o_int.");
  for (i = 0; i <= nprocs_i; i++)
    vtxdist_o_int[i] = vtxdist_o[i];
  SUPERLU_FREE (vtxdist_o);

#else  /* Default */

  vtxdist_o_int = vtxdist_o;
  b_rowptr_int = b_rowptr; b_colind_int = b_colind;
  l_sizes_int = l_sizes;

#endif
#endif
    
  if ( iam < noDomains) {

    ParMETIS_V3_NodeND(vtxdist_o, b_rowptr, b_colind, 
		       &numflag, options,
		       dist_order, l_sizes, metis_comm);
  }

  if (bnz) SUPERLU_FREE (b_colind);
  SUPERLU_FREE (b_rowptr);

#if 0  
  if ( iam < noDomains) {
    SUPERLU_FREE (options);
  }

#if defined (_LONGINT)
  /* Copy data from dist_order_int to dist_order */
  if ( iam < noDomains) {
    /* dist_order is the perm returned by parMetis, distributed */
    if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
      ABORT("SUPERLU_MALLOC fails for dist_order.");
    for (i = 0; i < m_loc; i++)
      dist_order[i] = dist_order_int[i];
    SUPERLU_FREE(dist_order_int);
    
    for (i = 0; i < 2*noDomains; i++)
      l_sizes[i] = l_sizes_int[i];
    SUPERLU_FREE(l_sizes_int);
  }
#else 
  dist_order = dist_order_int;
#endif

#endif
  
  /* Allgatherv dist_order to get perm_c */
  if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
    ABORT ("SUPERLU_MALLOC fails for displs.");
  if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
    ABORT ("SUPERLU_MALLOC fails for recvcnts.");
  for (i = 0; i < nprocs_i; i++)
    recvcnts[i] = vtxdist_o[i+1] - vtxdist_o[i];
  displs[0]=0;
  for(i=1; i < nprocs_i; i++) 
    displs[i] = displs[i-1] + recvcnts[i-1];
  
  MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs, 
		  mpi_int_t, grid->comm);

  if ( iam < noDomains) {
    SUPERLU_FREE (dist_order);
  }
  SUPERLU_FREE (vtxdist_i);
  SUPERLU_FREE (vtxdist_o);
  SUPERLU_FREE (recvcnts);
  SUPERLU_FREE (displs);
  
  /* send l_sizes to every processor p >= noDomains */
  if (!iam)
    for (p = noDomains; p < nprocs_i; p++)
      MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm);
  if (noDomains <= iam && iam < nprocs_i)
    MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm,
	      &status);
  
  /* Determine the first node in each separator, store it in l_fstVtxSep */  
  for (j = 0; j < 2 * noDomains; j++)
    l_fstVtxSep[j] = 0;
  l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2];
  szSep = noDomains;
  i = 0;
  while (szSep != 1) {
    for (j = i; j < i + szSep; j++) {
      l_fstVtxSep[j] += l_sizes[j]; 	      
    }
    for (j = i; j < i + szSep; j++) {
      k = i + szSep + (j-i) / 2;
      l_fstVtxSep[k] += l_fstVtxSep[j]; 
    }
    i += szSep;
    szSep = szSep / 2;
  }
  
  l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2];
  i = 2 * noDomains - 2;
  szSep = 1;
  while (i > 0) {
    for (j = i; j < i + szSep; j++) {
      k = (i - 2 * szSep) + (j-i) * 2 + 1;
      noNodes = l_fstVtxSep[k];
      l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k];
      l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] - 
	noNodes - l_sizes[k-1];
    }
    szSep *= 2;
    i -= szSep;
  }

#if ( PRNTlevel>=2 )
  if (!iam ) {
    PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes);
    PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep);
  }
#endif

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()");
#endif
  
  return (-mem);

} /* get_perm_c_parmetis */
Exemplo n.º 11
0
double
zPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, 
             SuperMatrix *L, SuperMatrix *U)
{

    NCformat *Astore;
    SCformat *Lstore;
    NCformat *Ustore;
    doublecomplex  *Aval, *Lval, *Uval;
    int      fsupc, nsupr, luptr, nz_in_U;
    int      i, j, k, oldcol;
    int      *inv_perm_c;
    double   rpg, maxaj, maxuj;
    double   smlnum;
    doublecomplex   *luval;
    doublecomplex   temp_comp;
   
    /* Get machine constants. */
    smlnum = dmach("S");
    rpg = 1. / smlnum;

    Astore = A->Store;
    Lstore = L->Store;
    Ustore = U->Store;
    Aval = Astore->nzval;
    Lval = Lstore->nzval;
    Uval = Ustore->nzval;
    
    inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int));
    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;

    for (k = 0; k <= Lstore->nsuper; ++k) {
	fsupc = L_FST_SUPC(k);
	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
	luptr = L_NZ_START(fsupc);
	luval = &Lval[luptr];
	nz_in_U = 1;
	
	for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) {
	    maxaj = 0.;
            oldcol = inv_perm_c[j];
	    for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i)
		maxaj = SUPERLU_MAX( maxaj, z_abs1( &Aval[i]) );
	
	    maxuj = 0.;
	    for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++)
		maxuj = SUPERLU_MAX( maxuj, z_abs1( &Uval[i]) );
	    
	    /* Supernode */
	    for (i = 0; i < nz_in_U; ++i)
		maxuj = SUPERLU_MAX( maxuj, z_abs1( &luval[i]) );

	    ++nz_in_U;
	    luval += nsupr;

	    if ( maxuj == 0. )
		rpg = SUPERLU_MIN( rpg, 1.);
	    else
		rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
	}
	
	if ( j >= ncols ) break;
    }

    SUPERLU_FREE(inv_perm_c);
    return (rpg);
}
static float
a_plus_at_CompRow_loc
(
 int   iam,         /* Input - my processor number */
 int_t *perm_r,     /* Input - row permutation vector Pr */
 int   nprocs_i,    /* Input - number of processors the input matrix
		       is distributed on */
 int_t *vtxdist_i,  /* Input - index of first row on each processor of the input matrix */
 int_t n,           /* Input - number of columns in matrix A. */
 int_t *rowptr,     /* Input - row pointers of size m_loc+1 for matrix A. */
 int_t *colind,     /* Input - column indices of size nnz_loc for matrix A. */
 int   nprocs_o,    /* Input - number of processors the output matrix
		       is distributed on */
 int_t *vtxdist_o,  /* Input - index of first row on each processor of the output matrix */
 int_t *p_bnz,      /* Output - on exit, returns the actual number of
		       local nonzeros in matrix A'+A. */
 int_t **p_b_rowptr, /* Output - output matrix, row pointers of size m_loc+1 */
 int_t **p_b_colind, /* Output - output matrix, column indices of size *p_bnz */
 gridinfo_t *grid    /* Input - grid of processors information */
 )
{

  int_t i, j, k, col, num_nz, nprocs;
  int_t *tcolind_recv; /* temporary receive buffer */
  int_t *tcolind_send; /* temporary send buffer */
  int_t sz_tcolind_send, sz_tcolind_recv;
  int_t ind, ind_rcv;
  int redist_pra; /* TRUE if Pr != I or nprocs_i != nprocs_o */
  int_t *marker, *iperm_r;
  int_t *sendCnts, *recvCnts;
  int_t *sdispls, *rdispls;
  int_t *b_rowptr, *b_colind, bnz_t, *b_rowptr_t, *b_colind_t;
  int_t p, t_ind, nelts, ipcol;
  int_t m_loc, m_loc_o;      /* number of local rows */ 
  int_t fst_row, fst_row_o;  /* index of first local row */
  int_t nnz_loc;    /* number of local nonzeros in matrix A */
  float apat_mem, apat_mem_max;
  int   *intBuf1, *intBuf2, *intBuf3, *intBuf4;  

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Enter a_plus_at_CompRow_loc()");
#endif
  
  fst_row    = vtxdist_i[iam];
  m_loc      = vtxdist_i[iam+1] - vtxdist_i[iam];
  nnz_loc    = rowptr[m_loc];
  redist_pra = FALSE;  
  nprocs     = SUPERLU_MAX(nprocs_i, nprocs_o);
  apat_mem_max = 0.;
  
  if (!(marker = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for marker[]");
  if (!(iperm_r = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for iperm_r[]");
  if (!(sendCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for sendCnts[]");
  if (!(recvCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for recvCnts[]");
  if (!(sdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for sdispls[]");
  if (!(rdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for rdispls[]");
  apat_mem = 2 * n + 4 * nprocs + 3;

#if defined (_LONGINT)
  intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs * sizeof(int));
  intBuf2 = intBuf1 + nprocs;
  intBuf3 = intBuf1 + 2 * nprocs;
  intBuf4 = intBuf1 + 3 * nprocs;
  apat_mem += 4*nprocs*sizeof(int) / sizeof(int_t);
#endif  

  /* compute the inverse row permutation vector */
  for (i = 0; i < n; i++) {
    marker[i] = 1;
    if (perm_r[i] != i)
      redist_pra = TRUE;
    iperm_r[perm_r[i]] = i;
  }

  /* TRANSPOSE LOCAL ROWS ON MY PROCESSOR iam.         */
  /* THE RESULT IS STORED IN TCOLIND_SEND.             */
  /* THIS COUNTS FOR TWO PASSES OF THE LOCAL MATRIX.   */

  /* First pass to get counts of each row of T, and set up column pointers */
  for (j = 0; j < m_loc; j++) {
    for (i = rowptr[j]; i < rowptr[j+1]; i++){
      marker[iperm_r[colind[i]]]++;
    }
  }
  /* determine number of elements to be sent to each processor */
  for (p = 0; p < nprocs_i; p++) {
    sendCnts[p] = 0;
    for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) 
      sendCnts[p] += marker[i];
  }
  /* exchange send/receive counts information in between all processors */
  MPI_Alltoall (sendCnts, 1, mpi_int_t,
		recvCnts, 1, mpi_int_t, grid->comm);
  sendCnts[iam] = 0;
  
  for (i = 0, j = 0, p = 0; p < nprocs_i; p++) {
    rdispls[p] = j;
    j += recvCnts[p];
    sdispls[p] = i;  
    i += sendCnts[p];
  }
  recvCnts[iam] = 0;
  sz_tcolind_recv = j;
  sz_tcolind_send = i;
  
  /* allocate memory to receive necessary blocks of transpose matrix T */
  if (sz_tcolind_recv) {
    if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv 
						   * sizeof(int_t) )))
      ABORT("SUPERLU_MALLOC fails tcolind_recv[]");
    apat_mem += sz_tcolind_recv;
  }
  /* allocate memory to send blocks of local transpose matrix T to other processors */
  if (sz_tcolind_send) {
    if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) 
						  * sizeof(int_t))))
      ABORT("SUPERLU_MALLOC fails for tcolind_send[]");
    apat_mem += sz_tcolind_send;
  }

  /* Set up marker[] to point at the beginning of each row in the
     send/receive buffer.  For each row, we store first its number of
     elements, and then the elements. */  
  ind_rcv = rdispls[iam];
  for (p = 0; p < nprocs_i; p++) {
    for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) {
      nelts = marker[i] - 1;
      if (p == iam) {
	tcolind_recv[ind_rcv] = nelts;
	marker[i] = ind_rcv + 1;
	ind_rcv += nelts + 1;
      }
      else {
	tcolind_send[sdispls[p]] = nelts;
	marker[i] = sdispls[p] + 1;
	sdispls[p] += nelts + 1;
      }
    }
  }
  /* reset sdispls vector */
  for (i = 0, p = 0; p < nprocs_i; p++) {
    sdispls[p] = i;  
    i += sendCnts[p];
  }
  /* Second pass of the local matrix A to copy data to be sent */
  for (j = 0; j < m_loc; j++)
    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
      col = colind[i];
      ipcol = iperm_r[col];      
      if (ipcol >= fst_row && ipcol < fst_row + m_loc)  /* local data */
	tcolind_recv[marker[ipcol]] = perm_r[j + fst_row];      
      else /* remote */ 
	tcolind_send[marker[ipcol]] = perm_r[j + fst_row];
      marker[ipcol] ++;
    }
  sendCnts[iam] = 0;
  recvCnts[iam] = 0;

#if defined (_LONGINT)
  for (p=0; p<nprocs; p++) {
    if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX ||
	recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX)
      ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
    intBuf1[p] = (int) sendCnts[p];
    intBuf2[p] = (int) sdispls[p];
    intBuf3[p] = (int) recvCnts[p];
    intBuf4[p] = (int) rdispls[p];
  }
#else  /* Default */
  intBuf1 = sendCnts;  intBuf2 = sdispls;
  intBuf3 = recvCnts;  intBuf4 = rdispls;
#endif
  
  /* send/receive transpose matrix T */
  MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t,
		 tcolind_recv, intBuf3, intBuf4, mpi_int_t,
		 grid->comm);
  /* ------------------------------------------------------------
     DEALLOCATE SEND COMMUNICATION STORAGE
     ------------------------------------------------------------*/
  if (sz_tcolind_send) {
    SUPERLU_FREE( tcolind_send );
    apat_mem_max = apat_mem;
    apat_mem -= sz_tcolind_send;
  }

  /* ----------------------------------------------------------------
     FOR LOCAL ROWS:
       compute B = A + T, where row j of B is:
       Struct (B(j,:)) = Struct (A(j,:)) UNION Struct (T(j,:))
       do not include the diagonal entry
     THIS COUNTS FOR TWO PASSES OF THE LOCAL ROWS OF A AND T.   
     ------------------------------------------------------------------ */
  
  /* Reset marker to EMPTY */
  for (i = 0; i < n; ++i) marker[i] = EMPTY;
  /* save rdispls information */
  for (p = 0; p < nprocs_i; p++)
    sdispls[p] = rdispls[p];

  /* First pass determines number of nonzeros in B */
  num_nz = 0;
  for (j = 0; j < m_loc; j++) {
    /* Flag the diagonal so it's not included in the B matrix */
    marker[perm_r[j + fst_row]] = j;
    
    /* Add pattern of row A(j,:) to B(j,:) */
    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
      k = colind[i];
      if ( marker[k] != j ) {
	marker[k] = j;
	++num_nz;
      }
    }
    
    /* Add pattern of row T(j,:) to B(j,:) */
    for (p = 0; p < nprocs_i; p++) {
      t_ind = rdispls[p];
      nelts = tcolind_recv[t_ind]; t_ind ++;
      for (i = t_ind; i < t_ind + nelts; i++) {
	k = tcolind_recv[i];
	if ( marker[k] != j ) {
	  marker[k] = j;
	  ++num_nz;
	}
      }
      t_ind += nelts;
      rdispls[p] = t_ind;
    }
  }
  bnz_t = num_nz;

  /* Allocate storage for B=Pr*A+A'*Pr' */
  if ( !(b_rowptr_t = (int_t*) SUPERLU_MALLOC((m_loc+1) * sizeof(int_t))))
    ABORT("SUPERLU_MALLOC fails for b_rowptr_t[]");
  if ( bnz_t ) {
    if ( !(b_colind_t = (int_t*) SUPERLU_MALLOC( bnz_t * sizeof(int_t))))
      ABORT("SUPERLU_MALLOC fails for b_colind_t[]");
  }
  apat_mem += m_loc + 1 + bnz_t;
  if (apat_mem > apat_mem_max)
    apat_mem_max = apat_mem;
  
  /* Reset marker to EMPTY */
  for (i = 0; i < n; i++) marker[i] = EMPTY;
  /* restore rdispls information */
  for (p = 0; p < nprocs_i; p++)
    rdispls[p] = sdispls[p];
  
  /* Second pass, compute each row of B, one at a time */
  num_nz = 0;
  t_ind = 0;
  for (j = 0; j < m_loc; j++) {
    b_rowptr_t[j] = num_nz;
    
    /* Flag the diagonal so it's not included in the B matrix */
    marker[perm_r[j + fst_row]] = j;

    /* Add pattern of row A(j,:) to B(j,:) */
    for (i = rowptr[j]; i < rowptr[j+1]; i++) {
      k = colind[i];
      if ( marker[k] != j ) {
	marker[k] = j;
	b_colind_t[num_nz] = k; num_nz ++;
      }
    }

    /* Add pattern of row T(j,:) to B(j,:) */
    for (p = 0; p < nprocs_i; p++) {
      t_ind = rdispls[p];
      nelts = tcolind_recv[t_ind]; t_ind++;
      for (i = t_ind; i < t_ind + nelts; i++) {
	k = tcolind_recv[i];
	if ( marker[k] != j ) {
	  marker[k] = j;
	  b_colind_t[num_nz] = k; num_nz++;
	}
      }
      t_ind += nelts;
      rdispls[p] = t_ind;
    }
  }
  b_rowptr_t[m_loc] = num_nz;

  for (p = 0; p <= SUPERLU_MIN(nprocs_i, nprocs_o); p++) 
    if (vtxdist_i[p] != vtxdist_o[p])
      redist_pra = TRUE;
  
  if (sz_tcolind_recv) {
    SUPERLU_FREE (tcolind_recv);
    apat_mem -= sz_tcolind_recv;
  }
  SUPERLU_FREE (marker);
  SUPERLU_FREE (iperm_r);
  apat_mem -= 2 * n + 1;
  
  /* redistribute permuted matrix (by rows) from nproc_i processors
     to nproc_o processors */
  if (redist_pra) {
    m_loc_o = vtxdist_o[iam+1] - vtxdist_o[iam];
    fst_row_o = vtxdist_o[iam];
    nnz_loc = 0;
    
    if ( !(b_rowptr = intMalloc_dist(m_loc_o + 1)) )
      ABORT("Malloc fails for *b_rowptr[].");
    apat_mem += m_loc_o + 1;
    if (apat_mem > apat_mem_max)
      apat_mem_max = apat_mem;

    for (p = 0; p < nprocs_i; p++) {
      sendCnts[p] = 0;
      recvCnts[p] = 0;
    }

    for (i = 0; i < m_loc; i++) {
      k = perm_r[i+fst_row];
      /* find the processor to which row k belongs */
      j = FALSE; p = 0;
      while (!j) {
	if (vtxdist_o[p] <= k && k < vtxdist_o[p+1])
	  j = TRUE;
	else 
	  p ++;
      }
      if (p == iam) {
	b_rowptr[k-fst_row_o] = b_rowptr_t[i + 1] - b_rowptr_t[i];
	nnz_loc += b_rowptr[k-fst_row_o];
      }
      else
	sendCnts[p] += b_rowptr_t[i + 1] - b_rowptr_t[i] + 2;
    }
    /* exchange send/receive counts information in between all processors */
    MPI_Alltoall (sendCnts, 1, mpi_int_t,
		  recvCnts, 1, mpi_int_t, grid->comm);
    
    for (i = 0, j = 0, p = 0; p < nprocs_i; p++) {
      rdispls[p] = j;
      j += recvCnts[p];
      sdispls[p] = i;  
      i += sendCnts[p];
    }
    rdispls[p] = j;
    sdispls[p] = i;
    sz_tcolind_recv = j;
    sz_tcolind_send = i;

    /* allocate memory for local data */
    tcolind_recv = NULL;
    tcolind_send = NULL;
    if (sz_tcolind_recv) {
      if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv 
						     * sizeof(int_t) )))
	ABORT("SUPERLU_MALLOC fails tcolind_recv[]");
      apat_mem += sz_tcolind_recv;
    }
    /* allocate memory to receive necessary data */
    if (sz_tcolind_send) {
      if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) 
						    * sizeof(int_t))))
	ABORT("SUPERLU_MALLOC fails for tcolind_send[]");
      apat_mem += sz_tcolind_send;
    }
    if (apat_mem > apat_mem_max)
      apat_mem_max = apat_mem;

    /* Copy data to be sent */
    ind_rcv = rdispls[iam];
    for (i = 0; i < m_loc; i++) {
      k = perm_r[i+fst_row];
      /* find the processor to which row k belongs */
      j = FALSE; p = 0;
      while (!j) {
	if (vtxdist_o[p] <= k && k < vtxdist_o[p+1])
	  j = TRUE;
	else 
	  p ++;
      }
      if (p != iam) { /* remote */ 
	tcolind_send[sdispls[p]] = k;
	tcolind_send[sdispls[p]+1] = b_rowptr_t[i+1] - b_rowptr_t[i];
	sdispls[p] += 2;
	for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++) {
	  tcolind_send[sdispls[p]] = b_colind_t[j]; sdispls[p] ++;
	}
      }
    }
  
    /* reset sdispls vector */
    for (i = 0, p = 0; p < nprocs_i; p++) {
      sdispls[p] = i;  
      i += sendCnts[p];
    }
    sendCnts[iam] = 0;
    recvCnts[iam] = 0;
    
#if defined (_LONGINT)
    for (p=0; p<nprocs; p++) {
      if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX ||
	  recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX)
	ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
      intBuf1[p] = (int) sendCnts[p];
      intBuf2[p] = (int) sdispls[p];
      intBuf3[p] = (int) recvCnts[p];
      intBuf4[p] = (int) rdispls[p];
    }
#else  /* Default */
    intBuf1 = sendCnts;  intBuf2 = sdispls;
    intBuf3 = recvCnts;  intBuf4 = rdispls;
#endif

    /* send/receive permuted matrix T by rows */
    MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t,
		   tcolind_recv, intBuf3, intBuf4, mpi_int_t,
		   grid->comm);
    /* ------------------------------------------------------------
       DEALLOCATE COMMUNICATION STORAGE
       ------------------------------------------------------------*/
    if (sz_tcolind_send) {
      SUPERLU_FREE( tcolind_send );
      apat_mem -= sz_tcolind_send;
    }
    
    /* ------------------------------------------------------------
       STORE ROWS IN ASCENDING ORDER OF THEIR NUMBER
       ------------------------------------------------------------*/
    for (p = 0; p < nprocs; p++) {
      if (p != iam) {
	i = rdispls[p];
	while (i < rdispls[p+1]) {
	  j = tcolind_recv[i];
	  nelts = tcolind_recv[i+1];
	  i += 2 + nelts;
	  b_rowptr[j-fst_row_o] = nelts;
	  nnz_loc += nelts;
	}
      }
    }

    if (nnz_loc) {
      if ( !(b_colind = intMalloc_dist(nnz_loc)) ) {
	ABORT("Malloc fails for bcolind[].");
	apat_mem += nnz_loc;
	if (apat_mem > apat_mem_max)
	  apat_mem_max = apat_mem;
      }
    }

    /* Initialize the array of row pointers */
    k = 0;
    for (j = 0; j < m_loc_o; j++) {
      i = b_rowptr[j];
      b_rowptr[j] = k;
      k += i;
    }
    if (m_loc_o) b_rowptr[j] = k;
    
    /* Copy the data into the row oriented storage */
    for (p = 0; p < nprocs; p++) {
      if (p != iam) {
	i = rdispls[p];
	while (i < rdispls[p+1]) {
	  j = tcolind_recv[i];
	  nelts = tcolind_recv[i+1];
	  for (i += 2, k = b_rowptr[j-fst_row_o]; 
	       k < b_rowptr[j-fst_row_o+1]; i++, k++) 
	    b_colind[k] = tcolind_recv[i];
	}
      }
    }
    for (i = 0; i < m_loc; i++) {
      k = perm_r[i+fst_row];
      if (k >= vtxdist_o[iam] && k < vtxdist_o[iam+1]) {
	ind = b_rowptr[k-fst_row_o];
	for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++, ind++)
	  b_colind[ind] = b_colind_t[j];
      }
    }
    
    SUPERLU_FREE(b_rowptr_t);
    if ( bnz_t )
      SUPERLU_FREE(b_colind_t);
    if (sz_tcolind_recv)
      SUPERLU_FREE(tcolind_recv);
    apat_mem -= bnz_t + m_loc + sz_tcolind_recv;
    
    *p_bnz = nnz_loc;
    *p_b_rowptr = b_rowptr;
    *p_b_colind = b_colind;
  }
  else { /* no need for redistribution */
    *p_bnz = bnz_t;
    *p_b_rowptr = b_rowptr_t;
    *p_b_colind = b_colind_t;
  }
  
  SUPERLU_FREE (rdispls);
  SUPERLU_FREE (sdispls);
  SUPERLU_FREE (sendCnts);
  SUPERLU_FREE (recvCnts);
  apat_mem -= 4 * nprocs + 2;
#if defined (_LONGINT)
  SUPERLU_FREE (intBuf1);
  apat_mem -= 4*nprocs*sizeof(int) / sizeof(int_t);
#endif
  
#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Exit a_plus_at_CompRow_loc()");
#endif
  
  return (- apat_mem_max * sizeof(int_t));
} /* a_plus_at_CompRow_loc */
Exemplo n.º 13
0
int zgst07(trans_t trans, int n, int nrhs, SuperMatrix *A, doublecomplex *b, 
	      int ldb, doublecomplex *x, int ldx, doublecomplex *xact, 
              int ldxact, double *ferr, double *berr, double *reslts)
{
/*
    Purpose   
    =======   

    ZGST07 tests the error bounds from iterative refinement for the   
    computed solution to a system of equations op(A)*X = B, where A is a 
    general n by n matrix and op(A) = A or A**T, depending on TRANS.
    
    RESLTS(1) = test of the error bound   
              = norm(X - XACT) / ( norm(X) * FERR )   
    A large value is returned if this ratio is not less than one.   

    RESLTS(2) = residual from the iterative refinement routine   
              = the maximum of BERR / ( (n+1)*EPS + (*) ), where   
                (*) = (n+1)*UNFL / (min_i (abs(op(A))*abs(X) +abs(b))_i ) 

    Arguments   
    =========   

    TRANS   (input) trans_t
            Specifies the form of the system of equations.   
            = NOTRANS:  A *x = b   
            = TRANS  :  A'*x = b, where A' is the transpose of A   
            = CONJ   :  A'*x = b, where A' is the transpose of A   

    N       (input) INT
            The number of rows of the matrices X and XACT.  N >= 0.   

    NRHS    (input) INT   
            The number of columns of the matrices X and XACT.  NRHS >= 0. 
  

    A       (input) SuperMatrix *, dimension (A->nrow, A->ncol)
            The original n by n matrix A.   

    B       (input) DOUBLE COMPLEX PRECISION array, dimension (LDB,NRHS)   
            The right hand side vectors for the system of linear   
            equations.   

    LDB     (input) INT   
            The leading dimension of the array B.  LDB >= max(1,N).   

    X       (input) DOUBLE COMPLEX PRECISION array, dimension (LDX,NRHS)   
            The computed solution vectors.  Each vector is stored as a   
            column of the matrix X.   

    LDX     (input) INT   
            The leading dimension of the array X.  LDX >= max(1,N).   

    XACT    (input) DOUBLE COMPLEX PRECISION array, dimension (LDX,NRHS)   
            The exact solution vectors.  Each vector is stored as a   
            column of the matrix XACT.   

    LDXACT  (input) INT   
            The leading dimension of the array XACT.  LDXACT >= max(1,N). 
  

    FERR    (input) DOUBLE COMPLEX PRECISION array, dimension (NRHS)   
            The estimated forward error bounds for each solution vector   
            X.  If XTRUE is the true solution, FERR bounds the magnitude 
            of the largest entry in (X - XTRUE) divided by the magnitude 
            of the largest entry in X.   

    BERR    (input) DOUBLE COMPLEX PRECISION array, dimension (NRHS)   
            The componentwise relative backward error of each solution   
            vector (i.e., the smallest relative change in any entry of A 
  
            or B that makes X an exact solution).   

    RESLTS  (output) DOUBLE PRECISION array, dimension (2)   
            The maximum over the NRHS solution vectors of the ratios:   
            RESLTS(1) = norm(X - XACT) / ( norm(X) * FERR )   
            RESLTS(2) = BERR / ( (n+1)*EPS + (*) )   

    ===================================================================== 
*/
    
    /* Table of constant values */
    int c__1 = 1;

    /* System generated locals */
    double d__1, d__2;
    double d__3, d__4;

    /* Local variables */
    double diff, axbi;
    int    imax, irow, n__1;
    int    i, j, k;
    double unfl, ovfl;
    double xnorm;
    double errbnd;
    int    notran;
    double eps, tmp;
    double *rwork;
    doublecomplex *Aval;
    NCformat *Astore;

    /* Function prototypes */
    extern int    lsame_(char *, char *);
    extern int    izamax_(int *, doublecomplex *, int *);


    /* Quick exit if N = 0 or NRHS = 0. */
    if ( n <= 0 || nrhs <= 0 ) {
	reslts[0] = 0.;
	reslts[1] = 0.;
	return 0;
    }

    eps = dmach("Epsilon");
    unfl = dmach("Safe minimum");
    ovfl   = 1. / unfl;
    notran = (trans == NOTRANS);

    rwork  = (double *) SUPERLU_MALLOC(n*sizeof(double));
    if ( !rwork ) ABORT("SUPERLU_MALLOC fails for rwork");
    Astore = A->Store;
    Aval   = (doublecomplex *) Astore->nzval;
    
    /* Test 1:  Compute the maximum of   
       norm(X - XACT) / ( norm(X) * FERR )   
       over all the vectors X and XACT using the infinity-norm. */

    errbnd = 0.;
    for (j = 0; j < nrhs; ++j) {
	n__1 = n;
	imax = izamax_(&n__1, &x[j*ldx], &c__1);
	d__1 = (d__2 = x[imax-1 + j*ldx].r, fabs(d__2)) + 
               (d__3 = x[imax-1 + j*ldx].i, fabs(d__3));
	xnorm = SUPERLU_MAX(d__1,unfl);
	diff = 0.;
	for (i = 0; i < n; ++i) {
	    d__1 = (d__2 = x[i+j*ldx].r - xact[i+j*ldxact].r, fabs(d__2)) +
                   (d__3 = x[i+j*ldx].i - xact[i+j*ldxact].i, fabs(d__3));
	    diff = SUPERLU_MAX(diff, d__1);
	}

	if (xnorm > 1.) {
	    goto L20;
	} else if (diff <= ovfl * xnorm) {
	    goto L20;
	} else {
	    errbnd = 1. / eps;
	    goto L30;
	}

L20:
#if 0	
	if (diff / xnorm <= ferr[j]) {
	    d__1 = diff / xnorm / ferr[j];
	    errbnd = SUPERLU_MAX(errbnd,d__1);
	} else {
	    errbnd = 1. / eps;
	}
#endif
	d__1 = diff / xnorm / ferr[j];
	errbnd = SUPERLU_MAX(errbnd,d__1);
	/*printf("Ferr: %f\n", errbnd);*/
L30:
	;
    }
    reslts[0] = errbnd;

    /* Test 2: Compute the maximum of BERR / ( (n+1)*EPS + (*) ), where 
       (*) = (n+1)*UNFL / (min_i (abs(op(A))*abs(X) + abs(b))_i ) */

    for (k = 0; k < nrhs; ++k) {
	for (i = 0; i < n; ++i) 
            rwork[i] = (d__1 = b[i + k*ldb].r, fabs(d__1)) +
                       (d__2 = b[i + k*ldb].i, fabs(d__2));
	if ( notran ) {
	    for (j = 0; j < n; ++j) {
		tmp = (d__1 = x[j + k*ldx].r, fabs(d__1)) +
                      (d__2 = x[j + k*ldx].i, fabs(d__2));
		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
		    d__1 = (d__2 = Aval[i].r, fabs(d__2)) +
                           (d__3 = Aval[i].i, fabs(d__3));
		    rwork[Astore->rowind[i]] += d__1 * tmp;
                }
	    }
	} else {
	    for (j = 0; j < n; ++j) {
		tmp = 0.;
		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
		    irow = Astore->rowind[i];
		    d__1 = (d__2 = x[irow + k*ldx].r, fabs(d__2)) +
                           (d__3 = x[irow + k*ldx].i, fabs(d__3));
                    d__2 = (d__3 = Aval[i].r, fabs(d__3)) +
                           (d__4 = Aval[i].i, fabs(d__4));
		    tmp += d__2 * d__1;
		}
		rwork[j] += tmp;
	    }
	}

	axbi = rwork[0];
	for (i = 1; i < n; ++i) axbi = SUPERLU_MIN(axbi, rwork[i]);
	
	/* Computing MAX */
	d__1 = axbi, d__2 = (n + 1) * unfl;
	tmp = berr[k] / ((n + 1) * eps + (n + 1) * unfl / SUPERLU_MAX(d__1,d__2));
	
	if (k == 0) {
	    reslts[1] = tmp;
	} else {
	    reslts[1] = SUPERLU_MAX(reslts[1],tmp);
	}
    }

    SUPERLU_FREE(rwork);
    return 0;

} /* zgst07 */
Exemplo n.º 14
0
psgstrf_threadarg_t *
psgstrf_thread_init(SuperMatrix *A, SuperMatrix *L, SuperMatrix *U,
		    superlumt_options_t *options, 
		    pxgstrf_shared_t *pxgstrf_shared,
		    Gstat_t *Gstat, int *info)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======
 *
 * psgstrf_thread_init() initializes the parallel data structures
 * for the multithreaded routine psgstrf_thread().
 *
 * Arguments
 * =========
 *
 * A        (input) SuperMatrix*
 *	    Original matrix A, permutated by columns, of dimension
 *          (A->nrow, A->ncol). The type of A can be:
 *          Stype = NCP; Dtype = _D; Mtype = GE.
 *
 * L        (input) SuperMatrix*
 *          If options->refact = YES, then use the existing
 *          storage in L to perform LU factorization;
 *          Otherwise, L is not accessed. L has types: 
 *          Stype = SCP, Dtype = _D, Mtype = TRLU.
 *
 * U        (input) SuperMatrix*
 *          If options->refact = YES, then use the existing
 *          storage in U to perform LU factorization;
 *          Otherwise, U is not accessed. U has types:
 *          Stype = NCP, Dtype = _D, Mtype = TRU.
 *
 * options (input) superlumt_options_t*
 *          The structure contains the parameters to control how the
 *          factorization is performed;
 *          See superlumt_options_t structure defined in slu_mt_util.h.
 *
 * pxgstrf_shared (output) pxgstrf_shared_t*
 *          The structure contains the shared task queue and the 
 *          synchronization variables for parallel factorization.
 *          See pxgstrf_shared_t structure defined in pdsp_defs.h.
 *
 * Gstat    (output) Gstat_t*
 *          Record all the statistics about the factorization; 
 *          See Gstat_t structure defined in slu_mt_util.h.
 *
 * info     (output) int*
 *          = 0: successful exit
 *          > 0: if options->lwork = -1, info returns the estimated
 *               amount of memory (in bytes) required;
 *               Otherwise, it returns the number of bytes allocated when
 *               memory allocation failure occurred, plus A->ncol.
 *
 */
    static GlobalLU_t Glu; /* persistent to support repeated factors. */
    psgstrf_threadarg_t *psgstrf_threadarg;
    register int n, i, nprocs;
    NCPformat *Astore;
    int  *perm_c;
    int  *perm_r;
    int  *inv_perm_c; /* inverse of perm_c */
    int  *inv_perm_r; /* inverse of perm_r */
    int	 *xprune;  /* points to locations in subscript vector lsub[*].
			For column i, xprune[i] denotes the point where 
			structural pruning begins.
			I.e. only xlsub[i],..,xprune[i]-1 need to be
			traversed for symbolic factorization.     */
    int  *ispruned;/* flag to indicate whether column j is pruned */
    int   nzlumax;
    pxgstrf_relax_t *pxgstrf_relax;
    
    nprocs     = options->nprocs;
    perm_c     = options->perm_c;
    perm_r     = options->perm_r;
    n          = A->ncol;
    Astore     = A->Store;
    inv_perm_r = (int *) intMalloc(n);
    inv_perm_c = (int *) intMalloc(n);
    xprune     = (int *) intMalloc(n);
    ispruned   = (int *) intCalloc(n);
    
    /* Pack shared data objects to each process. */
    pxgstrf_shared->inv_perm_r   = inv_perm_r;
    pxgstrf_shared->inv_perm_c   = inv_perm_c;
    pxgstrf_shared->xprune       = xprune;
    pxgstrf_shared->ispruned     = ispruned;
    pxgstrf_shared->A            = A;
    pxgstrf_shared->Glu          = &Glu;
    pxgstrf_shared->Gstat        = Gstat;
    pxgstrf_shared->info         = info;

    if ( options->usepr ) {
	/* Compute the inverse of perm_r */
	for (i = 0; i < n; ++i) inv_perm_r[perm_r[i]] = i;
    }
    for (i = 0; i < n; ++i) inv_perm_c[perm_c[i]] = i;

    /* Initialization. */
    Glu.nsuper = -1;
    Glu.nextl  = 0;
    Glu.nextu  = 0;
    Glu.nextlu = 0;
    ifill(perm_r, n, EMPTY);

    /* Identify relaxed supernodes at the bottom of the etree. */
    pxgstrf_relax = (pxgstrf_relax_t *)
        SUPERLU_MALLOC( (size_t) (n+2) * sizeof(pxgstrf_relax_t) );
    if ( options->SymmetricMode == YES ) {
        heap_relax_snode(n, options, pxgstrf_relax);
    } else {
        pxgstrf_relax_snode(n, options, pxgstrf_relax);
    }        
    
    /* Initialize mutex variables, task queue, determine panels. */
    ParallelInit(n, pxgstrf_relax, options, pxgstrf_shared);
    
    /* Set up memory image in lusup[*]. */
    nzlumax = sPresetMap(n, A, pxgstrf_relax, options, &Glu);
    if ( options->refact == NO ) Glu.nzlumax = nzlumax;
    
    SUPERLU_FREE (pxgstrf_relax);

    /* Allocate global storage common to all the factor routines */
    *info = psgstrf_MemInit(n, Astore->nnz, options, L, U, &Glu);
    if ( *info ) return NULL;

    /* Prepare arguments to all threads. */
    psgstrf_threadarg = (psgstrf_threadarg_t *) 
        SUPERLU_MALLOC(nprocs * sizeof(psgstrf_threadarg_t));
    for (i = 0; i < nprocs; ++i) {
        psgstrf_threadarg[i].pnum = i;
        psgstrf_threadarg[i].info = 0;
	psgstrf_threadarg[i].superlumt_options = options;
	psgstrf_threadarg[i].pxgstrf_shared = pxgstrf_shared;
    }

#if ( DEBUGlevel==1 )
    printf("** psgstrf_thread_init() called\n");
#endif

    return (psgstrf_threadarg);
}
Exemplo n.º 15
0
void
cgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
       int *etree, char *equed, float *R, float *C,
       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
       SuperMatrix *B, SuperMatrix *X, float *recip_pivot_growth, 
       float *rcond, float *ferr, float *berr, 
       mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info )
{


    DNformat  *Bstore, *Xstore;
    complex    *Bmat, *Xmat;
    int       ldb, ldx, nrhs;
    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int       colequ, equil, nofact, notran, rowequ, permc_spec;
    trans_t   trant;
    char      norm[1];
    int       i, j, info1;
    float    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
    int       relax, panel_size;
    float    diag_pivot_thresh;
    double    t0;      /* temporary time */
    double    *utime;

    /* External functions */
    extern float clangs(char *, SuperMatrix *);

    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;

    *info = 0;
    nofact = (options->Fact != FACTORED);
    equil = (options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    if ( nofact ) {
	*(unsigned char *)equed = 'N';
	rowequ = FALSE;
	colequ = FALSE;
    } else {
	rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	colequ = lsame_(equed, "C") || lsame_(equed, "B");
	smlnum = slamch_("Safe minimum");
	bignum = 1. / smlnum;
    }

#if 0
printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n",
       options->Fact, options->Trans, *equed);
#endif

    /* Test the input parameters */
    if (options->Fact != DOFACT && options->Fact != SamePattern &&
	options->Fact != SamePattern_SameRowPerm &&
	options->Fact != FACTORED &&
	options->Trans != NOTRANS && options->Trans != TRANS && 
	options->Trans != CONJ &&
	options->Equil != NO && options->Equil != YES)
	*info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
	      A->Dtype != SLU_C || A->Mtype != SLU_GE )
	*info = -2;
    else if (options->Fact == FACTORED &&
	     !(rowequ || colequ || lsame_(equed, "N")))
	*info = -6;
    else {
	if (rowequ) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, R[j]);
		rcmax = SUPERLU_MAX(rcmax, R[j]);
	    }
	    if (rcmin <= 0.) *info = -7;
	    else if ( A->nrow > 0)
		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else rowcnd = 1.;
	}
	if (colequ && *info == 0) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, C[j]);
		rcmax = SUPERLU_MAX(rcmax, C[j]);
	    }
	    if (rcmin <= 0.) *info = -8;
	    else if (A->nrow > 0)
		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else colcnd = 1.;
	}
	if (*info == 0) {
	    if ( lwork < -1 ) *info = -12;
	    else if ( B->ncol < 0 ) *info = -13;
	    else if ( B->ncol > 0 ) { /* no checking if B->ncol=0 */
	         if ( Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      B->Stype != SLU_DN || B->Dtype != SLU_C || 
		      B->Mtype != SLU_GE )
		*info = -13;
            }
	    if ( X->ncol < 0 ) *info = -14;
            else if ( X->ncol > 0 ) { /* no checking if X->ncol=0 */
                 if ( Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      (B->ncol != 0 && B->ncol != X->ncol) ||
                      X->Stype != SLU_DN ||
		      X->Dtype != SLU_C || X->Mtype != SLU_GE )
		*info = -14;
            }
	}
    }
    if (*info != 0) {
	i = -(*info);
	xerbla_("cgssvx", &i);
	return;
    }
    
    /* Initialization for factor parameters */
    panel_size = sp_ienv(1);
    relax      = sp_ienv(2);
    diag_pivot_thresh = options->DiagPivotThresh;

    utime = stat->utime;
    
    /* Convert A to SLU_NC format when necessary. */
    if ( A->Stype == SLU_NR ) {
	NRformat *Astore = A->Store;
	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
	cCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
			       Astore->nzval, Astore->colind, Astore->rowptr,
			       SLU_NC, A->Dtype, A->Mtype);
	if ( notran ) { /* Reverse the transpose argument. */
	    trant = TRANS;
	    notran = 0;
	} else {
	    trant = NOTRANS;
	    notran = 1;
	}
    } else { /* A->Stype == SLU_NC */
	trant = options->Trans;
	AA = A;
    }

    if ( nofact && equil ) {
	t0 = SuperLU_timer_();
	/* Compute row and column scalings to equilibrate the matrix A. */
	cgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
	
	if ( info1 == 0 ) {
	    /* Equilibrate matrix A. */
	    claqgs(AA, R, C, rowcnd, colcnd, amax, equed);
	    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	    colequ = lsame_(equed, "C") || lsame_(equed, "B");
	}
	utime[EQUIL] = SuperLU_timer_() - t0;
    }


    if ( nofact ) {
	
        t0 = SuperLU_timer_();
	/*
	 * Gnet column permutation vector perm_c[], according to permc_spec:
	 *   permc_spec = NATURAL:  natural ordering 
	 *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
	 *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
	 *   permc_spec = COLAMD:   approximate minimum degree column ordering
	 *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
	 */
	permc_spec = options->ColPerm;
	if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
            get_perm_c(permc_spec, AA, perm_c);
	utime[COLPERM] = SuperLU_timer_() - t0;

	t0 = SuperLU_timer_();
	sp_preorder(options, AA, perm_c, etree, &AC);
	utime[ETREE] = SuperLU_timer_() - t0;
    
/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
	       relax, panel_size, sp_ienv(3), sp_ienv(4));
	fflush(stdout); */
	
	/* Compute the LU factorization of A*Pc. */
	t0 = SuperLU_timer_();
	cgstrf(options, &AC, relax, panel_size, etree,
                work, lwork, perm_c, perm_r, L, U, stat, info);
	utime[FACT] = SuperLU_timer_() - t0;
	
	if ( lwork == -1 ) {
	    mem_usage->total_needed = *info - A->ncol;
	    return;
	}
    }

    if ( options->PivotGrowth ) {
        if ( *info > 0 ) {
	    if ( *info <= A->ncol ) {
	        /* Compute the reciprocal pivot growth factor of the leading
	           rank-deficient *info columns of A. */
	        *recip_pivot_growth = cPivotGrowth(*info, AA, perm_c, L, U);
	    }
	    return;
        }

        /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
        *recip_pivot_growth = cPivotGrowth(A->ncol, AA, perm_c, L, U);
    }

    if ( options->ConditionNumber ) {
        /* Estimate the reciprocal of the condition number of A. */
        t0 = SuperLU_timer_();
        if ( notran ) {
	    *(unsigned char *)norm = '1';
        } else {
	    *(unsigned char *)norm = 'I';
        }
        anorm = clangs(norm, AA);
        cgscon(norm, L, U, anorm, rcond, stat, info);
        utime[RCOND] = SuperLU_timer_() - t0;
    }
    
    if ( nrhs > 0 ) {
        /* Scale the right hand side if equilibration was performed. */
        if ( notran ) {
	    if ( rowequ ) {
	        for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i)
                        cs_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]);
	    }
        } else if ( colequ ) {
	    for (j = 0; j < nrhs; ++j)
	        for (i = 0; i < A->nrow; ++i)
                    cs_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]);
        }

        /* Compute the solution matrix X. */
        for (j = 0; j < nrhs; j++)  /* Save a copy of the right hand sides */
            for (i = 0; i < B->nrow; i++)
	        Xmat[i + j*ldx] = Bmat[i + j*ldb];
    
        t0 = SuperLU_timer_();
        cgstrs (trant, L, U, perm_c, perm_r, X, stat, info);
        utime[SOLVE] = SuperLU_timer_() - t0;
    
        /* Use iterative refinement to improve the computed solution and compute
           error bounds and backward error estimates for it. */
        t0 = SuperLU_timer_();
        if ( options->IterRefine != NOREFINE ) {
            cgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B,
                   X, ferr, berr, stat, info);
        } else {
            for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0;
        }
        utime[REFINE] = SuperLU_timer_() - t0;

        /* Transform the solution matrix X to a solution of the original system. */
        if ( notran ) {
	    if ( colequ ) {
	        for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i)
                        cs_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]);
	    }
        } else if ( rowequ ) {
	    for (j = 0; j < nrhs; ++j)
	        for (i = 0; i < A->nrow; ++i)
                    cs_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]);
        }
    } /* end if nrhs > 0 */

    if ( options->ConditionNumber ) {
        /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
        if ( *rcond < slamch_("E") ) *info = A->ncol + 1;
    }

    if ( nofact ) {
        cQuerySpace(L, U, mem_usage);
        Destroy_CompCol_Permuted(&AC);
    }
    if ( A->Stype == SLU_NR ) {
	Destroy_SuperMatrix_Store(AA);
	SUPERLU_FREE(AA);
    }

}
Exemplo n.º 16
0
int_t
dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
                Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
                gridinfo_t *grid, int_t *colptr[], int_t *rowind[],
                double *a[])
{
/*
 * -- Distributed SuperLU routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * March 15, 2003
 *
 * Purpose
 * =======
 *   Re-distribute A on the 2D process mesh.
 * 
 * Arguments
 * =========
 * 
 * A      (input) SuperMatrix*
 *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
 *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
 *        The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE.
 *
 * ScalePermstruct (input) ScalePermstruct_t*
 *        The data structure to store the scaling and permutation vectors
 *        describing the transformations performed to the original matrix A.
 *
 * Glu_freeable (input) *Glu_freeable_t
 *        The global structure describing the graph of L and U.
 * 
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 *
 * colptr (output) int*
 *
 * rowind (output) int*
 *
 * a      (output) double*
 *
 * Return value
 * ============
 *
 */
    NRformat_loc *Astore;
    int_t  *perm_r; /* row permutation vector */
    int_t  *perm_c; /* column permutation vector */
    int_t  i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize;
    int_t  nnz_loc;    /* number of local nonzeros */
    int_t  nnz_remote; /* number of remote nonzeros to be sent */
    int_t  SendCnt; /* number of remote nonzeros to be sent */
    int_t  RecvCnt; /* number of remote nonzeros to be sent */
    int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
    int_t  *ia, *ja, **ia_send, *index, *itemp;
    int_t  *ptr_to_send;
    double *aij, **aij_send, *nzval, *dtemp;
    double *nzval_a;
    int    iam, it, p, procs;
    MPI_Request *send_req;
    MPI_Status  status;
    

    /* ------------------------------------------------------------
       INITIALIZATION.
       ------------------------------------------------------------*/
    iam = grid->iam;
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter dReDistribute_A()");
#endif
    perm_r = ScalePermstruct->perm_r;
    perm_c = ScalePermstruct->perm_c;
    procs = grid->nprow * grid->npcol;
    Astore = (NRformat_loc *) A->Store;
    n = A->ncol;
    m_loc = Astore->m_loc;
    fst_row = Astore->fst_row;
    nnzToRecv = intCalloc_dist(2*procs);
    nnzToSend = nnzToRecv + procs;


    /* ------------------------------------------------------------
       COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
       THEN ALLOCATE SPACE.
       THIS ACCOUNTS FOR THE FIRST PASS OF A.
       ------------------------------------------------------------*/
    for (i = 0; i < m_loc; ++i) {
        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
	    jcol = Astore->colind[j];
	    gbi = BlockNum( irow );
	    gbj = BlockNum( jcol );
	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
	    ++nnzToSend[p]; 
	}
    }

    /* All-to-all communication */
    MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
		  grid->comm);

    maxnnzToRecv = 0;
    nnz_loc = SendCnt = RecvCnt = 0;

    for (p = 0; p < procs; ++p) {
	if ( p != iam ) {
	    SendCnt += nnzToSend[p];
	    RecvCnt += nnzToRecv[p];
	    maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
	} else {
	    nnz_loc += nnzToRecv[p];
	    /*assert(nnzToSend[p] == nnzToRecv[p]);*/
	}
    }
    k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */

    /* Allocate space for storing the triplets after redistribution. */
    if ( !(ia = intMalloc_dist(2*k)) )
        ABORT("Malloc fails for ia[].");
    ja = ia + k;
    if ( !(aij = doubleMalloc_dist(k)) )
        ABORT("Malloc fails for aij[].");

    /* Allocate temporary storage for sending/receiving the A triplets. */
    if ( procs > 1 ) {
      if ( !(send_req = (MPI_Request *)
	     SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) )
	ABORT("Malloc fails for send_req[].");
      if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) )
        ABORT("Malloc fails for ia_send[].");
      if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) )
        ABORT("Malloc fails for aij_send[].");
      if ( !(index = intMalloc_dist(2*SendCnt)) )
        ABORT("Malloc fails for index[].");
      if ( !(nzval = doubleMalloc_dist(SendCnt)) )
        ABORT("Malloc fails for nzval[].");
      if ( !(ptr_to_send = intCalloc_dist(procs)) )
        ABORT("Malloc fails for ptr_to_send[].");
      if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) )
        ABORT("Malloc fails for itemp[].");
      if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) )
        ABORT("Malloc fails for dtemp[].");

      for (i = 0, j = 0, p = 0; p < procs; ++p) {
          if ( p != iam ) {
	      ia_send[p] = &index[i];
	      i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
	      aij_send[p] = &nzval[j];
	      j += nnzToSend[p];
	  }
      }
    } /* if procs > 1 */
      
    if ( !(*colptr = intCalloc_dist(n+1)) )
        ABORT("Malloc fails for *colptr[].");

    /* ------------------------------------------------------------
       LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
       THIS ACCOUNTS FOR THE SECOND PASS OF A.
       ------------------------------------------------------------*/
    nnz_loc = 0; /* Reset the local nonzero count. */
    nzval_a = Astore->nzval;
    for (i = 0; i < m_loc; ++i) {
        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
  	    irow = perm_c[perm_r[i+fst_row]];  /* Row number in Pc*Pr*A */
	    jcol = Astore->colind[j];
	    gbi = BlockNum( irow );
	    gbj = BlockNum( jcol );
	    p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );

	    if ( p != iam ) { /* remote */
	        k = ptr_to_send[p];
	        ia_send[p][k] = irow;
	        ia_send[p][k + nnzToSend[p]] = jcol;
		aij_send[p][k] = nzval_a[j];
		++ptr_to_send[p]; 
	    } else {          /* local */
	        ia[nnz_loc] = irow;
	        ja[nnz_loc] = jcol;
		aij[nnz_loc] = nzval_a[j];
		++nnz_loc;
		++(*colptr)[jcol]; /* Count nonzeros in each column */
	    }
	}
    }

    /* ------------------------------------------------------------
       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
       NOTE: Can possibly use MPI_Alltoallv.
       ------------------------------------------------------------*/
    for (p = 0; p < procs; ++p) {
        if ( p != iam ) {
	    it = 2*nnzToSend[p];
	    MPI_Isend( ia_send[p], it, mpi_int_t,
		       p, iam, grid->comm, &send_req[p] );
	    it = nnzToSend[p];
	    MPI_Isend( aij_send[p], it, MPI_DOUBLE,
	               p, iam+procs, grid->comm, &send_req[procs+p] ); 
	}
    }

    for (p = 0; p < procs; ++p) {
        if ( p != iam ) {
	    it = 2*nnzToRecv[p];
	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); 
	    it = nnzToRecv[p];
            MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
		      grid->comm, &status );
	    for (i = 0; i < nnzToRecv[p]; ++i) {
	        ia[nnz_loc] = itemp[i];
		jcol = itemp[i + nnzToRecv[p]];
		/*assert(jcol<n);*/
	        ja[nnz_loc] = jcol;
		aij[nnz_loc] = dtemp[i];
		++nnz_loc;
		++(*colptr)[jcol]; /* Count nonzeros in each column */ 
	    }
	}
    }

    for (p = 0; p < procs; ++p) {
        if ( p != iam ) {
	    MPI_Wait( &send_req[p], &status);
	    MPI_Wait( &send_req[procs+p], &status);
	}
    }

    /* ------------------------------------------------------------
       DEALLOCATE TEMPORARY STORAGE
       ------------------------------------------------------------*/

    SUPERLU_FREE(nnzToRecv);

    if ( procs > 1 ) {
	SUPERLU_FREE(send_req);
	SUPERLU_FREE(ia_send);
	SUPERLU_FREE(aij_send);
	SUPERLU_FREE(index);
	SUPERLU_FREE(nzval);
	SUPERLU_FREE(ptr_to_send);
	SUPERLU_FREE(itemp);
	SUPERLU_FREE(dtemp);
    }

    /* ------------------------------------------------------------
       CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT.
       ------------------------------------------------------------*/
    if ( !(*rowind = intMalloc_dist(nnz_loc)) )
        ABORT("Malloc fails for *rowind[].");
    if ( !(*a = doubleMalloc_dist(nnz_loc)) )
        ABORT("Malloc fails for *a[].");

    /* Initialize the array of column pointers */
    k = 0;
    jsize = (*colptr)[0];
    (*colptr)[0] = 0;
    for (j = 1; j < n; ++j) {
	k += jsize;
	jsize = (*colptr)[j];
	(*colptr)[j] = k;
    }
    
    /* Copy the triplets into the column oriented storage */
    for (i = 0; i < nnz_loc; ++i) {
	j = ja[i];
	k = (*colptr)[j];
	(*rowind)[k] = ia[i];
	(*a)[k] = aij[i];
	++(*colptr)[j];
    }

    /* Reset the column pointers to the beginning of each column */
    for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1];
    (*colptr)[0] = 0;

    SUPERLU_FREE(ia);
    SUPERLU_FREE(aij);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit dReDistribute_A()");
#endif
 
} /* dReDistribute_A */
Exemplo n.º 17
0
void
zgssvx(char *fact, char *trans, char *refact,
       SuperMatrix *A, factor_param_t *factor_params, int *perm_c,
       int *perm_r, int *etree, char *equed, double *R, double *C,
       SuperMatrix *L, SuperMatrix *U, void *work, int lwork,
       SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, 
       double *rcond, double *ferr, double *berr, 
       mem_usage_t *mem_usage, int *info )
{
/*
 * Purpose
 * =======
 *
 * ZGSSVX solves the system of linear equations A*X=B or A'*X=B, using
 * the LU factorization from zgstrf(). Error bounds on the solution and
 * a condition estimate are also provided. It performs the following steps:
 *
 *   1. If A is stored column-wise (A->Stype = SLU_NC):
 *  
 *      1.1. If fact = 'E', scaling factors are computed to equilibrate the
 *           system:
 *             trans = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
 *             trans = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
 *             trans = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
 *           Whether or not the system will be equilibrated depends on the
 *           scaling of the matrix A, but if equilibration is used, A is
 *           overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if trans='N')
 *           or diag(C)*B (if trans = 'T' or 'C').
 *
 *      1.2. Permute columns of A, forming A*Pc, where Pc is a permutation
 *           matrix that usually preserves sparsity.
 *           For more details of this step, see sp_preorder.c.
 *
 *      1.3. If fact = 'N' or 'E', the LU decomposition is used to factor the
 *           matrix A (after equilibration if fact = 'E') as Pr*A*Pc = L*U,
 *           with Pr determined by partial pivoting.
 *
 *      1.4. Compute the reciprocal pivot growth factor.
 *
 *      1.5. If some U(i,i) = 0, so that U is exactly singular, then the
 *           routine returns with info = i. Otherwise, the factored form of 
 *           A is used to estimate the condition number of the matrix A. If
 *           the reciprocal of the condition number is less than machine
 *           precision, info = A->ncol+1 is returned as a warning, but the
 *           routine still goes on to solve for X and computes error bounds
 *           as described below.
 *
 *      1.6. The system of equations is solved for X using the factored form
 *           of A.
 *
 *      1.7. Iterative refinement is applied to improve the computed solution
 *           matrix and calculate error bounds and backward error estimates
 *           for it.
 *
 *      1.8. If equilibration was used, the matrix X is premultiplied by
 *           diag(C) (if trans = 'N') or diag(R) (if trans = 'T' or 'C') so
 *           that it solves the original system before equilibration.
 *
 *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm
 *      to the transpose of A:
 *
 *      2.1. If fact = 'E', scaling factors are computed to equilibrate the
 *           system:
 *             trans = 'N':  diag(R)*A'*diag(C)     *inv(diag(C))*X = diag(R)*B
 *             trans = 'T': (diag(R)*A'*diag(C))**T *inv(diag(R))*X = diag(C)*B
 *             trans = 'C': (diag(R)*A'*diag(C))**H *inv(diag(R))*X = diag(C)*B
 *           Whether or not the system will be equilibrated depends on the
 *           scaling of the matrix A, but if equilibration is used, A' is
 *           overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
 *           (if trans='N') or diag(C)*B (if trans = 'T' or 'C').
 *
 *      2.2. Permute columns of transpose(A) (rows of A), 
 *           forming transpose(A)*Pc, where Pc is a permutation matrix that 
 *           usually preserves sparsity.
 *           For more details of this step, see sp_preorder.c.
 *
 *      2.3. If fact = 'N' or 'E', the LU decomposition is used to factor the
 *           transpose(A) (after equilibration if fact = 'E') as 
 *           Pr*transpose(A)*Pc = L*U with the permutation Pr determined by
 *           partial pivoting.
 *
 *      2.4. Compute the reciprocal pivot growth factor.
 *
 *      2.5. If some U(i,i) = 0, so that U is exactly singular, then the
 *           routine returns with info = i. Otherwise, the factored form 
 *           of transpose(A) is used to estimate the condition number of the
 *           matrix A. If the reciprocal of the condition number
 *           is less than machine precision, info = A->nrow+1 is returned as
 *           a warning, but the routine still goes on to solve for X and
 *           computes error bounds as described below.
 *
 *      2.6. The system of equations is solved for X using the factored form
 *           of transpose(A).
 *
 *      2.7. Iterative refinement is applied to improve the computed solution
 *           matrix and calculate error bounds and backward error estimates
 *           for it.
 *
 *      2.8. If equilibration was used, the matrix X is premultiplied by
 *           diag(C) (if trans = 'N') or diag(R) (if trans = 'T' or 'C') so
 *           that it solves the original system before equilibration.
 *
 *   See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * fact    (input) char*
 *         Specifies whether or not the factored form of the matrix
 *         A is supplied on entry, and if not, whether the matrix A should
 *         be equilibrated before it is factored.
 *         = 'F': On entry, L, U, perm_r and perm_c contain the factored
 *                form of A. If equed is not 'N', the matrix A has been
 *                equilibrated with scaling factors R and C.
 *                A, L, U, perm_r are not modified.
 *         = 'N': The matrix A will be factored, and the factors will be
 *                stored in L and U.
 *         = 'E': The matrix A will be equilibrated if necessary, then
 *                factored into L and U.
 *
 * trans   (input) char*
 *         Specifies the form of the system of equations:
 *         = 'N': A * X = B        (No transpose)
 *         = 'T': A**T * X = B     (Transpose)
 *         = 'C': A**H * X = B     (Transpose)
 *
 * refact  (input) char*
 *         Specifies whether we want to re-factor the matrix.
 *         = 'N': Factor the matrix A.
 *         = 'Y': Matrix A was factored before, now we want to re-factor
 *                matrix A with perm_r and etree as inputs. Use
 *                the same storage for the L\U factors previously allocated,
 *                expand it if necessary. User should insure to use the same
 *                memory model.  In this case, perm_r may be modified due to
 *                different pivoting determined by diagonal threshold.
 *         If fact = 'F', then refact is not accessed.
 *
 * A       (input/output) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
 *         of the linear equations is A->nrow. Currently, the type of A can be:
 *         Stype = SLU_NC or SLU_NR, Dtype = SLU_Z, Mtype = SLU_GE.
 *         In the future, more general A may be handled.
 *
 *         On entry, If fact = 'F' and equed is not 'N', then A must have
 *         been equilibrated by the scaling factors in R and/or C.  
 *         A is not modified if fact = 'F' or 'N', or if fact = 'E' and 
 *         equed = 'N' on exit.
 *
 *         On exit, if fact = 'E' and equed is not 'N', A is scaled as follows:
 *         If A->Stype = SLU_NC:
 *           equed = 'R':  A := diag(R) * A
 *           equed = 'C':  A := A * diag(C)
 *           equed = 'B':  A := diag(R) * A * diag(C).
 *         If A->Stype = SLU_NR:
 *           equed = 'R':  transpose(A) := diag(R) * transpose(A)
 *           equed = 'C':  transpose(A) := transpose(A) * diag(C)
 *           equed = 'B':  transpose(A) := diag(R) * transpose(A) * diag(C).
 *
 * factor_params (input) factor_param_t*
 *         The structure defines the input scalar parameters, consisting of
 *         the following fields. If factor_params = NULL, the default
 *         values are used for all the fields; otherwise, the values
 *         are given by the user.
 *         - panel_size (int): Panel size. A panel consists of at most
 *             panel_size consecutive columns. If panel_size = -1, use 
 *             default value 8.
 *         - relax (int): To control degree of relaxing supernodes. If the
 *             number of nodes (columns) in a subtree of the elimination
 *             tree is less than relax, this subtree is considered as one
 *             supernode, regardless of the row structures of those columns.
 *             If relax = -1, use default value 8.
 *         - diag_pivot_thresh (double): Diagonal pivoting threshold.
 *             At step j of the Gaussian elimination, if
 *                 abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)),
 *             then use A_jj as pivot. 0 <= diag_pivot_thresh <= 1.
 *             If diag_pivot_thresh = -1, use default value 1.0,
 *             which corresponds to standard partial pivoting.
 *         - drop_tol (double): Drop tolerance threshold. (NOT IMPLEMENTED)
 *             At step j of the Gaussian elimination, if
 *                 abs(A_ij)/(max_i abs(A_ij)) < drop_tol,
 *             then drop entry A_ij. 0 <= drop_tol <= 1.
 *             If drop_tol = -1, use default value 0.0, which corresponds to
 *             standard Gaussian elimination.
 *
 * perm_c  (input/output) int*
 *	   If A->Stype = SLU_NC, Column permutation vector of size A->ncol,
 *         which defines the permutation matrix Pc; perm_c[i] = j means
 *         column i of A is in position j in A*Pc.
 *         On exit, perm_c may be overwritten by the product of the input
 *         perm_c and a permutation that postorders the elimination tree
 *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
 *         is already in postorder.
 *
 *         If A->Stype = SLU_NR, column permutation vector of size A->nrow,
 *         which describes permutation of columns of transpose(A) 
 *         (rows of A) as described above.
 * 
 * perm_r  (input/output) int*
 *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
 *         which defines the permutation matrix Pr, and is determined
 *         by partial pivoting.  perm_r[i] = j means row i of A is in 
 *         position j in Pr*A.
 *
 *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
 *         determines permutation of rows of transpose(A)
 *         (columns of A) as described above.
 *
 *         If refact is not 'Y', perm_r is output argument;
 *         If refact = 'Y', the pivoting routine will try to use the input
 *         perm_r, unless a certain threshold criterion is violated.
 *         In that case, perm_r is overwritten by a new permutation
 *         determined by partial pivoting or diagonal threshold pivoting.
 * 
 * etree   (input/output) int*,  dimension (A->ncol)
 *         Elimination tree of Pc'*A'*A*Pc.
 *         If fact is not 'F' and refact = 'Y', etree is an input argument,
 *         otherwise it is an output argument.
 *         Note: etree is a vector of parent pointers for a forest whose
 *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *
 * equed   (input/output) char*
 *         Specifies the form of equilibration that was done.
 *         = 'N': No equilibration.
 *         = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
 *         = 'C': Column equilibration, i.e., A was postmultiplied by diag(C).
 *         = 'B': Both row and column equilibration, i.e., A was replaced 
 *                by diag(R)*A*diag(C).
 *         If fact = 'F', equed is an input argument, otherwise it is
 *         an output argument.
 *
 * R       (input/output) double*, dimension (A->nrow)
 *         The row scale factors for A or transpose(A).
 *         If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
 *            (if A->Stype = SLU_NR) is multiplied on the left by diag(R).
 *         If equed = 'N' or 'C', R is not accessed.
 *         If fact = 'F', R is an input argument; otherwise, R is output.
 *         If fact = 'F' and equed = 'R' or 'B', each element of R must
 *            be positive.
 * 
 * C       (input/output) double*, dimension (A->ncol)
 *         The column scale factors for A or transpose(A).
 *         If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A)
 *            (if A->Stype = SLU_NR) is multiplied on the right by diag(C).
 *         If equed = 'N' or 'R', C is not accessed.
 *         If fact = 'F', C is an input argument; otherwise, C is output.
 *         If fact = 'F' and equed = 'C' or 'B', each element of C must
 *            be positive.
 *         
 * L       (output) SuperMatrix*
 *	   The factor L from the factorization
 *             Pr*A*Pc=L*U              (if A->Stype SLU_= NC) or
 *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
 *         Uses compressed row subscripts storage for supernodes, i.e.,
 *         L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU.
 *
 * U       (output) SuperMatrix*
 *	   The factor U from the factorization
 *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
 *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
 *         Uses column-wise storage scheme, i.e., U has types:
 *         Stype = SLU_NC, Dtype = SLU_Z, Mtype = TRU.
 *
 * work    (workspace/output) void*, size (lwork) (in bytes)
 *         User supplied workspace, should be large enough
 *         to hold data structures for factors L and U.
 *         On exit, if fact is not 'F', L and U point to this array.
 *
 * lwork   (input) int
 *         Specifies the size of work array in bytes.
 *         = 0:  allocate space internally by system malloc;
 *         > 0:  use user-supplied work array of length lwork in bytes,
 *               returns error if space runs out.
 *         = -1: the routine guesses the amount of space needed without
 *               performing the factorization, and returns it in
 *               mem_usage->total_needed; no other side effects.
 *
 *         See argument 'mem_usage' for memory usage statistics.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
 *         On entry, the right hand side matrix.
 *         On exit,
 *            if equed = 'N', B is not modified; otherwise
 *            if A->Stype = SLU_NC:
 *               if trans = 'N' and equed = 'R' or 'B', B is overwritten by
 *                  diag(R)*B;
 *               if trans = 'T' or 'C' and equed = 'C' of 'B', B is
 *                  overwritten by diag(C)*B;
 *            if A->Stype = SLU_NR:
 *               if trans = 'N' and equed = 'C' or 'B', B is overwritten by
 *                  diag(C)*B;
 *               if trans = 'T' or 'C' and equed = 'R' of 'B', B is
 *                  overwritten by diag(R)*B.
 *
 * X       (output) SuperMatrix*
 *         X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. 
 *         If info = 0 or info = A->ncol+1, X contains the solution matrix
 *         to the original system of equations. Note that A and B are modified
 *         on exit if equed is not 'N', and the solution to the equilibrated
 *         system is inv(diag(C))*X if trans = 'N' and equed = 'C' or 'B',
 *         or inv(diag(R))*X if trans = 'T' or 'C' and equed = 'R' or 'B'.
 *
 * recip_pivot_growth (output) double*
 *         The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ).
 *         The infinity norm is used. If recip_pivot_growth is much less
 *         than 1, the stability of the LU factorization could be poor.
 *
 * rcond   (output) double*
 *         The estimate of the reciprocal condition number of the matrix A
 *         after equilibration (if done). If rcond is less than the machine
 *         precision (in particular, if rcond = 0), the matrix is singular
 *         to working precision. This condition is indicated by a return
 *         code of info > 0.
 *
 * FERR    (output) double*, dimension (B->ncol)   
 *         The estimated forward error bound for each solution vector   
 *         X(j) (the j-th column of the solution matrix X).   
 *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
 *         is an estimated upper bound for the magnitude of the largest 
 *         element in (X(j) - XTRUE) divided by the magnitude of the   
 *         largest element in X(j).  The estimate is as reliable as   
 *         the estimate for RCOND, and is almost always a slight   
 *         overestimate of the true error.
 *
 * BERR    (output) double*, dimension (B->ncol)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *
 * mem_usage (output) mem_usage_t*
 *         Record the memory usage statistics, consisting of following fields:
 *         - for_lu (float)
 *           The amount of space used in bytes for L\U data structures.
 *         - total_needed (float)
 *           The amount of space needed in bytes to perform factorization.
 *         - expansions (int)
 *           The number of memory expansions during the LU factorization.
 *
 * info    (output) int*
 *         = 0: successful exit   
 *         < 0: if info = -i, the i-th argument had an illegal value   
 *         > 0: if info = i, and i is   
 *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
 *                    been completed, but the factor U is exactly   
 *                    singular, so the solution and error bounds   
 *                    could not be computed.   
 *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
 *                    precision, meaning that the matrix is singular to
 *                    working precision. Nevertheless, the solution and
 *                    error bounds are computed because there are a number
 *                    of situations where the computed solution can be more
 *                    accurate than the value of RCOND would suggest.   
 *              > A->ncol+1: number of bytes allocated when memory allocation
 *                    failure occurred, plus A->ncol.
 *
 */

    DNformat  *Bstore, *Xstore;
    doublecomplex    *Bmat, *Xmat;
    int       ldb, ldx, nrhs;
    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int       colequ, equil, nofact, notran, rowequ;
    char      trant[1], norm[1];
    int       i, j, info1;
    double    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
    int       relax, panel_size;
    double    diag_pivot_thresh, drop_tol;
    double    t0;      /* temporary time */
    double    *utime;
    extern SuperLUStat_t SuperLUStat;

    /* External functions */
    extern double zlangs(char *, SuperMatrix *);
    extern double dlamch_(char *);

    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;

#if 0
printf("zgssvx: fact=%c, trans=%c, refact=%c, equed=%c\n",
       *fact, *trans, *refact, *equed);
#endif
    
    *info = 0;
    nofact = lsame_(fact, "N");
    equil = lsame_(fact, "E");
    notran = lsame_(trans, "N");
    if (nofact || equil) {
	*(unsigned char *)equed = 'N';
	rowequ = FALSE;
	colequ = FALSE;
    } else {
	rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	colequ = lsame_(equed, "C") || lsame_(equed, "B");
	smlnum = dlamch_("Safe minimum");
	bignum = 1. / smlnum;
    }

    /* Test the input parameters */
    if (!nofact && !equil && !lsame_(fact, "F")) *info = -1;
    else if (!notran && !lsame_(trans, "T") && !lsame_(trans, "C")) *info = -2;
    else if ( !(lsame_(refact,"Y") || lsame_(refact, "N")) ) *info = -3;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
	      A->Dtype != SLU_Z || A->Mtype != SLU_GE )
	*info = -4;
    else if (lsame_(fact, "F") && !(rowequ || colequ || lsame_(equed, "N")))
	*info = -9;
    else {
	if (rowequ) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, R[j]);
		rcmax = SUPERLU_MAX(rcmax, R[j]);
	    }
	    if (rcmin <= 0.) *info = -10;
	    else if ( A->nrow > 0)
		rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else rowcnd = 1.;
	}
	if (colequ && *info == 0) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = SUPERLU_MIN(rcmin, C[j]);
		rcmax = SUPERLU_MAX(rcmax, C[j]);
	    }
	    if (rcmin <= 0.) *info = -11;
	    else if (A->nrow > 0)
		colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum);
	    else colcnd = 1.;
	}
	if (*info == 0) {
	    if ( lwork < -1 ) *info = -15;
	    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      B->Stype != SLU_DN || B->Dtype != SLU_Z || 
		      B->Mtype != SLU_GE )
		*info = -16;
	    else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) ||
		      B->ncol != X->ncol || X->Stype != SLU_DN ||
		      X->Dtype != SLU_Z || X->Mtype != SLU_GE )
		*info = -17;
	}
    }
    if (*info != 0) {
	i = -(*info);
	xerbla_("zgssvx", &i);
	return;
    }
    
    /* Default values for factor_params */
    panel_size = sp_ienv(1);
    relax      = sp_ienv(2);
    diag_pivot_thresh = 1.0;
    drop_tol   = 0.0;
    if ( factor_params != NULL ) {
	if ( factor_params->panel_size != -1 )
	    panel_size = factor_params->panel_size;
	if ( factor_params->relax != -1 ) relax = factor_params->relax;
	if ( factor_params->diag_pivot_thresh != -1 )
	    diag_pivot_thresh = factor_params->diag_pivot_thresh;
	if ( factor_params->drop_tol != -1 )
	    drop_tol = factor_params->drop_tol;
    }

    StatInit(panel_size, relax);
    utime = SuperLUStat.utime;
    
    /* Convert A to SLU_NC format when necessary. */
    if ( A->Stype == SLU_NR ) {
	NRformat *Astore = A->Store;
	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
	zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
			       Astore->nzval, Astore->colind, Astore->rowptr,
			       SLU_NC, A->Dtype, A->Mtype);
	if ( notran ) { /* Reverse the transpose argument. */
	    *trant = 'T';
	    notran = 0;
	} else {
	    *trant = 'N';
	    notran = 1;
	}
    } else { /* A->Stype == SLU_NC */
	*trant = *trans;
	AA = A;
    }

    if ( equil ) {
	t0 = SuperLU_timer_();
	/* Compute row and column scalings to equilibrate the matrix A. */
	zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
	
	if ( info1 == 0 ) {
	    /* Equilibrate matrix A. */
	    zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
	    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
	    colequ = lsame_(equed, "C") || lsame_(equed, "B");
	}
	utime[EQUIL] = SuperLU_timer_() - t0;
    }

    /* Scale the right hand side if equilibration was performed. */
    if ( notran ) {
	if ( rowequ ) {
	    for (j = 0; j < nrhs; ++j)
		for (i = 0; i < A->nrow; ++i) {
                  zd_mult(&Bmat[i + j*ldb], &Bmat[i + j*ldb], R[i]);
	        }
	}
    } else if ( colequ ) {
	for (j = 0; j < nrhs; ++j)
	    for (i = 0; i < A->nrow; ++i) {
              zd_mult(&Bmat[i + j*ldb], &Bmat[i + j*ldb], C[i]);
	    }
    }

    if ( nofact || equil ) {
	
	t0 = SuperLU_timer_();
	sp_preorder(refact, AA, perm_c, etree, &AC);
	utime[ETREE] = SuperLU_timer_() - t0;
    
/*	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
	       relax, panel_size, sp_ienv(3), sp_ienv(4));
	fflush(stdout); */
	
	/* Compute the LU factorization of A*Pc. */
	t0 = SuperLU_timer_();
	zgstrf(refact, &AC, diag_pivot_thresh, drop_tol, relax, panel_size,
	       etree, work, lwork, perm_r, perm_c, L, U, info);
	utime[FACT] = SuperLU_timer_() - t0;
	
	if ( lwork == -1 ) {
	    mem_usage->total_needed = *info - A->ncol;
	    return;
	}
    }

    if ( *info > 0 ) {
	if ( *info <= A->ncol ) {
	    /* Compute the reciprocal pivot growth factor of the leading
	       rank-deficient *info columns of A. */
	    *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U);
	}
	return;
    }

    /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */
    *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U);

    /* Estimate the reciprocal of the condition number of A. */
    t0 = SuperLU_timer_();
    if ( notran ) {
	*(unsigned char *)norm = '1';
    } else {
	*(unsigned char *)norm = 'I';
    }
    anorm = zlangs(norm, AA);
    zgscon(norm, L, U, anorm, rcond, info);
    utime[RCOND] = SuperLU_timer_() - t0;
    
    /* Compute the solution matrix X. */
    for (j = 0; j < nrhs; j++)    /* Save a copy of the right hand sides */
	for (i = 0; i < B->nrow; i++)
	    Xmat[i + j*ldx] = Bmat[i + j*ldb];
    
    t0 = SuperLU_timer_();
    zgstrs (trant, L, U, perm_r, perm_c, X, info);
    utime[SOLVE] = SuperLU_timer_() - t0;
    
    /* Use iterative refinement to improve the computed solution and compute
       error bounds and backward error estimates for it. */
    t0 = SuperLU_timer_();
    zgsrfs(trant, AA, L, U, perm_r, perm_c, equed, R, C, B,
	      X, ferr, berr, info);
    utime[REFINE] = SuperLU_timer_() - t0;

    /* Transform the solution matrix X to a solution of the original system. */
    if ( notran ) {
	if ( colequ ) {
	    for (j = 0; j < nrhs; ++j)
		for (i = 0; i < A->nrow; ++i) {
                  zd_mult(&Xmat[i + j*ldx], &Xmat[i + j*ldx], C[i]);
	        }
	}
    } else if ( rowequ ) {
	for (j = 0; j < nrhs; ++j)
	    for (i = 0; i < A->nrow; ++i) {
              zd_mult(&Xmat[i+ j*ldx], &Xmat[i+ j*ldx], R[i]);
            }
    }

    /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */
    if ( *rcond < dlamch_("E") ) *info = A->ncol + 1;

    zQuerySpace(L, U, panel_size, mem_usage);

    if ( nofact || equil ) Destroy_CompCol_Permuted(&AC);
    if ( A->Stype == SLU_NR ) {
	Destroy_SuperMatrix_Store(AA);
	SUPERLU_FREE(AA);
    }

    PrintStat( &SuperLUStat );
    StatFree();
}
void
pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
		  gridinfo_t *grid, doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx,
		  int nrhs, double *berr, SuperLUStat_t *stat, int *info)
{


#define ITMAX 20
    
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    /* 
     * Data structures used by matrix-vector multiply routine.
     */
    int_t  N_update; /* Number of variables updated on this process */
    int_t  *update;  /* vector elements (global index) updated 
			on this processor.                     */
    int_t  *bindx;
    doublecomplex *val;
    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
			       matrix-vector multiply.  */
    /*-- end data structures for matrix-vector multiply --*/
    doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col,
           *x_trs, *dx_trs;
    double *rwork;
    int_t notran;
    int_t count, ii, j, jj, k, knsupc, lk, lwork,
          nprow, nsupers, nz, p;
    int   i, iam, pkk;
    int_t *ilsum, *xsup;
    double eps, lstres;
    double s, safmin, safe1, safe2;

    /* NEW STUFF */
    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
    int_t *diag_len; /* Length of the X vector on diagonal processes. */

    /*-- Function prototypes --*/
    extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *,
			 doublecomplex *, int, SuperLUStat_t *, int *);
    /*extern double dlamch_(char *);*/
    
    /* Test the input parameters. */
    *info = 0;
    if ( n < 0 ) *info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
	*info = -2;
    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10;
    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -12;
    else if ( nrhs < 0 ) *info = -13;
    if (*info != 0) {
	i = -(*info);
	xerbla_("pzgsrfs_ABXglobal", &i);
	return;
    }

    /* Quick return if possible. */
    if ( n == 0 || nrhs == 0 ) {
	return;
    }

    /* Initialization. */
    iam = grid->iam;
    nprow = grid->nprow;
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    ilsum = Llu->ilsum;
    notran = 1;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()");
#endif

    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
		   &diag_procs, &diag_len);
#if ( PRNTlevel>=1 )
    if ( !iam ) {
	printf(".. number of diag processes = %d\n", num_diag_procs);
	PrintInt10("diag_procs", num_diag_procs, diag_procs);
	PrintInt10("diag_len", num_diag_procs, diag_len);
    }
#endif

    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
	ABORT("Calloc fails for mv_sup_to_proc[]");

    pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
		          &val, &bindx, mv_sup_to_proc);

    i = CEILING( nsupers, nprow ); /* Number of local block rows */
    ii = Llu->ldalsum + i * XK_H;
    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
    jj = diag_len[0];
    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
    jj = SUPERLU_MAX( jj, N_update );
    lwork = N_update         /* For ax and R */
	  + ii               /* For dx_trs */
	  + ii               /* For x_trs */
          + k                /* For b */
	  + jj;              /* for temp */
    if ( !(work = doublecomplexMalloc_dist(lwork)) )
	ABORT("Malloc fails for work[]");
    ax = R = work;
    dx_trs = work + N_update;
    x_trs  = dx_trs + ii;
    b      = x_trs + ii;
    temp   = b + k;
    if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) )
	ABORT("Malloc fails for rwork[]");

#if ( DEBUGlevel>=2 )
    {
	doublecomplex *dwork = doublecomplexMalloc_dist(n);
	for (i = 0; i < n; ++i) {
	    if ( i & 1 ) dwork[i].r = 1.;
	    else dwork[i].r = 2.;
	    dwork[i].i = 0.;
        }
	/* Check correctness of matrix-vector multiply. */
	pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
	PrintDouble5("Mult A*x", N_update, ax);
	SUPERLU_FREE(dwork);
    }
#endif


    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
    nz     = A->ncol + 1;
    eps    = dlamch_("Epsilon");
    safmin = dlamch_("Safe minimum");

    /* Set SAFE1 essentially to be the underflow threshold times the
       number of additions in each row. */
    safe1  = nz * safmin;
    safe2  = safe1 / eps;

#if ( DEBUGlevel>=1 )
    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
		       eps, anorm, safe1, safe2);
#endif

    /* Do for each right-hand side ... */
    for (j = 0; j < nrhs; ++j) {
	count = 0;
	lstres = 3.;

	/* Copy X into x on the diagonal processes. */
	B_col = &B[j*ldb];
	X_col = &X[j*ldx];
	for (p = 0; p < num_diag_procs; ++p) {
	    pkk = diag_procs[p];
	    if ( iam == pkk ) {
		for (k = p; k < nsupers; k += num_diag_procs) {
		    knsupc = SuperSize( k );
		    lk = LBi( k, grid );
		    ii = ilsum[lk] + (lk+1)*XK_H;
		    jj = FstBlockC( k );
		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
		    dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */
		}
	    }
	}
	/* Copy B into b distributed the same way as matrix-vector product. */
        if ( N_update ) ii = update[0];
	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];

	while (1) { /* Loop until stopping criterion is satisfied. */

	    /* Compute residual R = B - op(A) * X,   
	       where op(A) = A, A**T, or A**H, depending on TRANS. */

	    /* Matrix-vector multiply. */
	    pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
	    
	    /* Compute residual. */
	    for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]);

	    /* Compute abs(op(A))*abs(X) + abs(B). */
	    pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork);
	    for (i = 0; i < N_update; ++i) rwork[i] += slud_z_abs1(&b[i]);
	    
	    s = 0.0;
	    for (i = 0; i < N_update; ++i) {
		if ( rwork[i] > safe2 ) {
		    s = SUPERLU_MAX(s, slud_z_abs1(&R[i]) / rwork[i]);
		} else if ( rwork[i] != 0.0 ) {
		    s = SUPERLU_MAX(s, (safe1 + slud_z_abs1(&R[i])) / rwork[i]);
                }
                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
                   we know the true residual also must be exactly 0.0. */
	    }
	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
		
#if ( PRNTlevel>= 1 )
	    if ( !iam )
		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
#endif
	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
		/* Compute new dx. */
		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
				   mv_sup_to_proc, dx_trs);
		pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);

		/* Update solution. */
		for (p = 0; p < num_diag_procs; ++p) 
		    if ( iam == diag_procs[p] )
			for (k = p; k < nsupers; k += num_diag_procs) {
			    lk = LBi( k, grid );
			    ii = ilsum[lk] + (lk+1)*XK_H;
			    knsupc = SuperSize( k );
			    for (i = 0; i < knsupc; ++i)
				z_add(&x_trs[i + ii], &x_trs[i + ii], 
				      &dx_trs[i + ii]);
			}
		lstres = berr[j];
		++count;
		/* Transfer x_trs (on diagonal processes) into X
		   (on all processes). */
		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
					num_diag_procs, diag_procs, diag_len,
					X_col, temp);
	    } else {
		break;
	    }
	} /* end while */

	stat->RefineSteps = count;

    } /* for j ... */


    /* Deallocate storage used by matrix-vector multiplication. */
    SUPERLU_FREE(diag_procs);
    SUPERLU_FREE(diag_len);
    if ( N_update ) {
	SUPERLU_FREE(update);
	SUPERLU_FREE(bindx);
	SUPERLU_FREE(val);
    }
    SUPERLU_FREE(mv_sup_to_proc);
    SUPERLU_FREE(work);
    SUPERLU_FREE(rwork);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()");
#endif

} /* PZGSRFS_ABXGLOBAL */
Exemplo n.º 19
0
void tlin::allocS(SuperMatrix *&A, int rows, int cols, int nnz, int *colptr,
                  int *rowind, double *values) {
  A = (SuperMatrix *)SUPERLU_MALLOC(sizeof(SuperMatrix));
  dCreate_CompCol_Matrix(A, rows, cols, nnz, values, rowind, colptr, SLU_NC,
                         SLU_D, SLU_GE);
}
Exemplo n.º 20
0
void
dgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
      SuperMatrix *L, SuperMatrix *U, SuperMatrix *B,
      SuperLUStat_t *stat, int *info )
{

    DNformat *Bstore;
    SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int      lwork = 0, *etree, i;
    GlobalLU_t Glu; /* Not needed on return. */
    
    /* Set default values for some parameters */
    int      panel_size;     /* panel size */
    int      relax;          /* no of columns in a relaxed snodes */
    int      permc_spec;
    trans_t  trans = NOTRANS;
    double   *utime;
    double   t;	/* Temporary time */

    /* Test the input parameters ... */
    *info = 0;
    Bstore = B->Store;
    if ( options->Fact != DOFACT ) *info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	 (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
	 A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -2;
    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
	B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE )
	*info = -7;
    if ( *info != 0 ) {
	i = -(*info);
	input_error("dgssv", &i);
	return;
    }

    utime = stat->utime;

    /* Convert A to SLU_NC format when necessary. */
    if ( A->Stype == SLU_NR ) {
	NRformat *Astore = A->Store;
	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
	dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
			       Astore->nzval, Astore->colind, Astore->rowptr,
			       SLU_NC, A->Dtype, A->Mtype);
	trans = TRANS;
    } else {
        if ( A->Stype == SLU_NC ) AA = A;
    }

    t = SuperLU_timer_();
    /*
     * Get column permutation vector perm_c[], according to permc_spec:
     *   permc_spec = NATURAL:  natural ordering 
     *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
     *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
     *   permc_spec = COLAMD:   approximate minimum degree column ordering
     *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
     */
    permc_spec = options->ColPerm;
    if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
      get_perm_c(permc_spec, AA, perm_c);
    utime[COLPERM] = SuperLU_timer_() - t;

    etree = intMalloc(A->ncol);

    t = SuperLU_timer_();
    sp_preorder(options, AA, perm_c, etree, &AC);
    utime[ETREE] = SuperLU_timer_() - t;

    panel_size = sp_ienv(1);
    relax = sp_ienv(2);

    /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
	  relax, panel_size, sp_ienv(3), sp_ienv(4));*/
    t = SuperLU_timer_(); 
    /* Compute the LU factorization of A. */
    dgstrf(options, &AC, relax, panel_size, etree,
            NULL, lwork, perm_c, perm_r, L, U, &Glu, stat, info);
    utime[FACT] = SuperLU_timer_() - t;

    t = SuperLU_timer_();
    if ( *info == 0 ) {
        /* Solve the system A*X=B, overwriting B with X. */
        dgstrs (trans, L, U, perm_c, perm_r, B, stat, info);
    }
    utime[SOLVE] = SuperLU_timer_() - t;

    SUPERLU_FREE (etree);
    Destroy_CompCol_Permuted(&AC);
    if ( A->Stype == SLU_NR ) {
	Destroy_SuperMatrix_Store(AA);
	SUPERLU_FREE(AA);
    }

}
Exemplo n.º 21
0
main(int argc, char *argv[])
{
/*
 * Purpose
 * =======
 *
 * ZDRIVE is the main test program for the DOUBLE COMPLEX linear
 * equation driver routines ZGSSV and ZGSSVX.
 *
 * The program is invoked by a shell script file -- ztest.csh.
 * The output from the tests are written into a file -- ztest.out.
 *
 * =====================================================================
 */
    doublecomplex         *a, *a_save;
    int            *asub, *asub_save;
    int            *xa, *xa_save;
    SuperMatrix  A, B, X, L, U;
    SuperMatrix  ASAV, AC;
    mem_usage_t    mem_usage;
    int            *perm_r; /* row permutation from partial pivoting */
    int            *perm_c, *pc_save; /* column permutation */
    int            *etree;
    doublecomplex  zero = {0.0, 0.0};
    double         *R, *C;
    double         *ferr, *berr;
    double         *rwork;
    doublecomplex          *wwork;
    void           *work;
    int            info, lwork, nrhs, panel_size, relax;
    int            m, n, nnz;
    doublecomplex         *xact;
    doublecomplex         *rhsb, *solx, *bsav;
    int            ldb, ldx;
    double         rpg, rcond;
    int            i, j, k1;
    double         rowcnd, colcnd, amax;
    int            maxsuper, rowblk, colblk;
    int            prefact, nofact, equil, iequed;
    int            nt, nrun, nfail, nerrs, imat, fimat, nimat;
    int            nfact, ifact, itran;
    int            kl, ku, mode, lda;
    int            zerot, izero, ioff;
    double         u;
    double         anorm, cndnum;
    doublecomplex         *Afull;
    double         result[NTESTS];
    superlu_options_t options;
    fact_t         fact;
    trans_t        trans;
    SuperLUStat_t  stat;
    static char    matrix_type[8];
    static char    equed[1], path[4], sym[1], dist[1];

    /* Fixed set of parameters */
    int            iseed[]  = {1988, 1989, 1990, 1991};
    static char    equeds[]  = {'N', 'R', 'C', 'B'};
    static fact_t  facts[] = {FACTORED, DOFACT, SamePattern,
                              SamePattern_SameRowPerm};
    static trans_t transs[]  = {NOTRANS, TRANS, CONJ};

    /* Some function prototypes */
    extern int zgst01(int, int, SuperMatrix *, SuperMatrix *,
                      SuperMatrix *, int *, int *, double *);
    extern int zgst02(trans_t, int, int, int, SuperMatrix *, doublecomplex *,
                      int, doublecomplex *, int, double *resid);
    extern int zgst04(int, int, doublecomplex *, int,
                      doublecomplex *, int, double rcond, double *resid);
    extern int zgst07(trans_t, int, int, SuperMatrix *, doublecomplex *, int,
                         doublecomplex *, int, doublecomplex *, int,
                         double *, double *, double *);
    extern int zlatb4_(char *, int *, int *, int *, char *, int *, int *,
                       double *, int *, double *, char *);
    extern int zlatms_(int *, int *, char *, int *, char *, double *d,
                       int *, double *, double *, int *, int *,
                       char *, doublecomplex *, int *, doublecomplex *, int *);
    extern int sp_zconvert(int, int, doublecomplex *, int, int, int,
                           doublecomplex *a, int *, int *, int *);


    /* Executable statements */

    strcpy(path, "ZGE");
    nrun  = 0;
    nfail = 0;
    nerrs = 0;

    /* Defaults */
    lwork      = 0;
    n          = 1;
    nrhs       = 1;
    panel_size = sp_ienv(1);
    relax      = sp_ienv(2);
    u          = 1.0;
    strcpy(matrix_type, "LA");
    parse_command_line(argc, argv, matrix_type, &n,
                       &panel_size, &relax, &nrhs, &maxsuper,
                       &rowblk, &colblk, &lwork, &u);
    if ( lwork > 0 ) {
        work = SUPERLU_MALLOC(lwork);
        if ( !work ) {
            fprintf(stderr, "expert: cannot allocate %d bytes\n", lwork);
            exit (-1);
        }
    }

    /* Set the default input options. */
    set_default_options(&options);
    options.DiagPivotThresh = u;
    options.PrintStat = NO;
    options.PivotGrowth = YES;
    options.ConditionNumber = YES;
    options.IterRefine = DOUBLE;

    if ( strcmp(matrix_type, "LA") == 0 ) {
        /* Test LAPACK matrix suite. */
        m = n;
        lda = SUPERLU_MAX(n, 1);
        nnz = n * n;        /* upper bound */
        fimat = 1;
        nimat = NTYPES;
        Afull = doublecomplexCalloc(lda * n);
        zallocateA(n, nnz, &a, &asub, &xa);
    } else {
        /* Read a sparse matrix */
        fimat = nimat = 0;
        zreadhb(&m, &n, &nnz, &a, &asub, &xa);
    }

    zallocateA(n, nnz, &a_save, &asub_save, &xa_save);
    rhsb = doublecomplexMalloc(m * nrhs);
    bsav = doublecomplexMalloc(m * nrhs);
    solx = doublecomplexMalloc(n * nrhs);
    ldb  = m;
    ldx  = n;
    zCreate_Dense_Matrix(&B, m, nrhs, rhsb, ldb, SLU_DN, SLU_Z, SLU_GE);
    zCreate_Dense_Matrix(&X, n, nrhs, solx, ldx, SLU_DN, SLU_Z, SLU_GE);
    xact = doublecomplexMalloc(n * nrhs);
    etree   = intMalloc(n);
    perm_r  = intMalloc(n);
    perm_c  = intMalloc(n);
    pc_save = intMalloc(n);
    R       = (double *) SUPERLU_MALLOC(m*sizeof(double));
    C       = (double *) SUPERLU_MALLOC(n*sizeof(double));
    ferr    = (double *) SUPERLU_MALLOC(nrhs*sizeof(double));
    berr    = (double *) SUPERLU_MALLOC(nrhs*sizeof(double));
    j = SUPERLU_MAX(m,n) * SUPERLU_MAX(4,nrhs);
    rwork   = (double *) SUPERLU_MALLOC(j*sizeof(double));
    for (i = 0; i < j; ++i) rwork[i] = 0.;
    if ( !R ) ABORT("SUPERLU_MALLOC fails for R");
    if ( !C ) ABORT("SUPERLU_MALLOC fails for C");
    if ( !ferr ) ABORT("SUPERLU_MALLOC fails for ferr");
    if ( !berr ) ABORT("SUPERLU_MALLOC fails for berr");
    if ( !rwork ) ABORT("SUPERLU_MALLOC fails for rwork");
    wwork   = doublecomplexCalloc( SUPERLU_MAX(m,n) * SUPERLU_MAX(4,nrhs) );

    for (i = 0; i < n; ++i) perm_c[i] = pc_save[i] = i;
    options.ColPerm = MY_PERMC;

    for (imat = fimat; imat <= nimat; ++imat) { /* All matrix types */

        if ( imat ) {

            /* Skip types 5, 6, or 7 if the matrix size is too small. */
            zerot = (imat >= 5 && imat <= 7);
            if ( zerot && n < imat-4 )
                continue;

            /* Set up parameters with ZLATB4 and generate a test matrix
               with ZLATMS.  */
            zlatb4_(path, &imat, &n, &n, sym, &kl, &ku, &anorm, &mode,
                    &cndnum, dist);

            zlatms_(&n, &n, dist, iseed, sym, &rwork[0], &mode, &cndnum,
                    &anorm, &kl, &ku, "No packing", Afull, &lda,
                    &wwork[0], &info);

            if ( info ) {
                printf(FMT3, "ZLATMS", info, izero, n, nrhs, imat, nfail);
                continue;
            }

            /* For types 5-7, zero one or more columns of the matrix
               to test that INFO is returned correctly.   */
            if ( zerot ) {
                if ( imat == 5 ) izero = 1;
                else if ( imat == 6 ) izero = n;
                else izero = n / 2 + 1;
                ioff = (izero - 1) * lda;
                if ( imat < 7 ) {
                    for (i = 0; i < n; ++i) Afull[ioff + i] = zero;
                } else {
                    for (j = 0; j < n - izero + 1; ++j)
                        for (i = 0; i < n; ++i)
                            Afull[ioff + i + j*lda] = zero;
                }
            } else {
                izero = 0;
            }

            /* Convert to sparse representation. */
            sp_zconvert(n, n, Afull, lda, kl, ku, a, asub, xa, &nnz);

        } else {
            izero = 0;
            zerot = 0;
        }

        zCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_Z, SLU_GE);

        /* Save a copy of matrix A in ASAV */
        zCreate_CompCol_Matrix(&ASAV, m, n, nnz, a_save, asub_save, xa_save,
                              SLU_NC, SLU_Z, SLU_GE);
        zCopy_CompCol_Matrix(&A, &ASAV);

        /* Form exact solution. */
        zGenXtrue(n, nrhs, xact, ldx);

        StatInit(&stat);

        for (iequed = 0; iequed < 4; ++iequed) {
            *equed = equeds[iequed];
            if (iequed == 0) nfact = 4;
            else nfact = 1; /* Only test factored, pre-equilibrated matrix */

            for (ifact = 0; ifact < nfact; ++ifact) {
                fact = facts[ifact];
                options.Fact = fact;

                for (equil = 0; equil < 2; ++equil) {
                    options.Equil = equil;
                    prefact   = ( options.Fact == FACTORED ||
                                  options.Fact == SamePattern_SameRowPerm );
                                /* Need a first factor */
                    nofact    = (options.Fact != FACTORED);  /* Not factored */

                    /* Restore the matrix A. */
                    zCopy_CompCol_Matrix(&ASAV, &A);

                    if ( zerot ) {
                        if ( prefact ) continue;
                    } else if ( options.Fact == FACTORED ) {
                        if ( equil || iequed ) {
                            /* Compute row and column scale factors to
                               equilibrate matrix A.    */
                            zgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &info);

                            /* Force equilibration. */
                            if ( !info && n > 0 ) {
                                if ( lsame_(equed, "R") ) {
                                    rowcnd = 0.;
                                    colcnd = 1.;
                                } else if ( lsame_(equed, "C") ) {
                                    rowcnd = 1.;
                                    colcnd = 0.;
                                } else if ( lsame_(equed, "B") ) {
                                    rowcnd = 0.;
                                    colcnd = 0.;
                                }
                            }

                            /* Equilibrate the matrix. */
                            zlaqgs(&A, R, C, rowcnd, colcnd, amax, equed);
                        }
                    }

                    if ( prefact ) { /* Need a factor for the first time */

                        /* Save Fact option. */
                        fact = options.Fact;
                        options.Fact = DOFACT;

                        /* Preorder the matrix, obtain the column etree. */
                        sp_preorder(&options, &A, perm_c, etree, &AC);

                        /* Factor the matrix AC. */
                        zgstrf(&options, &AC, relax, panel_size,
                               etree, work, lwork, perm_c, perm_r, &L, &U,
                               &stat, &info);

                        if ( info ) {
                            printf("** First factor: info %d, equed %c\n",
                                   info, *equed);
                            if ( lwork == -1 ) {
                                printf("** Estimated memory: %d bytes\n",
                                        info - n);
                                exit(0);
                            }
                        }

                        Destroy_CompCol_Permuted(&AC);

                        /* Restore Fact option. */
                        options.Fact = fact;
                    } /* if .. first time factor */

                    for (itran = 0; itran < NTRAN; ++itran) {
                        trans = transs[itran];
                        options.Trans = trans;

                        /* Restore the matrix A. */
                        zCopy_CompCol_Matrix(&ASAV, &A);

                        /* Set the right hand side. */
                        zFillRHS(trans, nrhs, xact, ldx, &A, &B);
                        zCopy_Dense_Matrix(m, nrhs, rhsb, ldb, bsav, ldb);

                        /*----------------
                         * Test zgssv
                         *----------------*/
                        if ( options.Fact == DOFACT && itran == 0) {
                            /* Not yet factored, and untransposed */

                            zCopy_Dense_Matrix(m, nrhs, rhsb, ldb, solx, ldx);
                            zgssv(&options, &A, perm_c, perm_r, &L, &U, &X,
                                  &stat, &info);

                            if ( info && info != izero ) {
                                printf(FMT3, "zgssv",
                                       info, izero, n, nrhs, imat, nfail);
                            } else {
                                /* Reconstruct matrix from factors and
                                   compute residual. */
                                zgst01(m, n, &A, &L, &U, perm_c, perm_r,
                                         &result[0]);
                                nt = 1;
                                if ( izero == 0 ) {
                                    /* Compute residual of the computed
                                       solution. */
                                    zCopy_Dense_Matrix(m, nrhs, rhsb, ldb,
                                                       wwork, ldb);
                                    zgst02(trans, m, n, nrhs, &A, solx,
                                              ldx, wwork,ldb, &result[1]);
                                    nt = 2;
                                }

                                /* Print information about the tests that
                                   did not pass the threshold.      */
                                for (i = 0; i < nt; ++i) {
                                    if ( result[i] >= THRESH ) {
                                        printf(FMT1, "zgssv", n, i,
                                               result[i]);
                                        ++nfail;
                                    }
                                }
                                nrun += nt;
                            } /* else .. info == 0 */

                            /* Restore perm_c. */
                            for (i = 0; i < n; ++i) perm_c[i] = pc_save[i];

                            if (lwork == 0) {
                                Destroy_SuperNode_Matrix(&L);
                                Destroy_CompCol_Matrix(&U);
                            }
                        } /* if .. end of testing zgssv */

                        /*----------------
                         * Test zgssvx
                         *----------------*/

                        /* Equilibrate the matrix if fact = FACTORED and
                           equed = 'R', 'C', or 'B'.   */
                        if ( options.Fact == FACTORED &&
                             (equil || iequed) && n > 0 ) {
                            zlaqgs(&A, R, C, rowcnd, colcnd, amax, equed);
                        }

                        /* Solve the system and compute the condition number
                           and error bounds using zgssvx.      */
                        zgssvx(&options, &A, perm_c, perm_r, etree,
                               equed, R, C, &L, &U, work, lwork, &B, &X, &rpg,
                               &rcond, ferr, berr, &mem_usage, &stat, &info);

                        if ( info && info != izero ) {
                            printf(FMT3, "zgssvx",
                                   info, izero, n, nrhs, imat, nfail);
                            if ( lwork == -1 ) {
                                printf("** Estimated memory: %.0f bytes\n",
                                        mem_usage.total_needed);
                                exit(0);
                            }
                        } else {
                            if ( !prefact ) {
                                /* Reconstruct matrix from factors and
                                   compute residual. */
                                zgst01(m, n, &A, &L, &U, perm_c, perm_r,
                                         &result[0]);
                                k1 = 0;
                            } else {
                                k1 = 1;
                            }

                            if ( !info ) {
                                /* Compute residual of the computed solution.*/
                                zCopy_Dense_Matrix(m, nrhs, bsav, ldb,
                                                  wwork, ldb);
                                zgst02(trans, m, n, nrhs, &ASAV, solx, ldx,
                                          wwork, ldb, &result[1]);

                                /* Check solution from generated exact
                                   solution. */
                                zgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
                                          &result[2]);

                                /* Check the error bounds from iterative
                                   refinement. */
                                zgst07(trans, n, nrhs, &ASAV, bsav, ldb,
                                          solx, ldx, xact, ldx, ferr, berr,
                                          &result[3]);

                                /* Print information about the tests that did
                                   not pass the threshold.    */
                                for (i = k1; i < NTESTS; ++i) {
                                    if ( result[i] >= THRESH ) {
                                        printf(FMT2, "zgssvx",
                                               options.Fact, trans, *equed,
                                               n, imat, i, result[i]);
                                        ++nfail;
                                    }
                                }
                                nrun += NTESTS;
                            } /* if .. info == 0 */
                        } /* else .. end of testing zgssvx */

                    } /* for itran ... */

                    if ( lwork == 0 ) {
                        Destroy_SuperNode_Matrix(&L);
                        Destroy_CompCol_Matrix(&U);
                    }

                } /* for equil ... */
            } /* for ifact ... */
        } /* for iequed ... */
#if 0
    if ( !info ) {
        PrintPerf(&L, &U, &mem_usage, rpg, rcond, ferr, berr, equed);
    }
#endif

    } /* for imat ... */

    /* Print a summary of the results. */
    PrintSumm("ZGE", nfail, nrun, nerrs);

    SUPERLU_FREE (rhsb);
    SUPERLU_FREE (bsav);
    SUPERLU_FREE (solx);
    SUPERLU_FREE (xact);
    SUPERLU_FREE (etree);
    SUPERLU_FREE (perm_r);
    SUPERLU_FREE (perm_c);
    SUPERLU_FREE (pc_save);
    SUPERLU_FREE (R);
    SUPERLU_FREE (C);
    SUPERLU_FREE (ferr);
    SUPERLU_FREE (berr);
    SUPERLU_FREE (rwork);
    SUPERLU_FREE (wwork);
    Destroy_SuperMatrix_Store(&B);
    Destroy_SuperMatrix_Store(&X);
    Destroy_CompCol_Matrix(&A);
    Destroy_CompCol_Matrix(&ASAV);
    if ( lwork > 0 ) {
        SUPERLU_FREE (work);
        Destroy_SuperMatrix_Store(&L);
        Destroy_SuperMatrix_Store(&U);
    }
    StatFree(&stat);

    return 0;
}
Exemplo n.º 22
0
void
cgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U,
       int *perm_c, int *perm_r, char *equed, float *R, float *C,
       SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr,
       SuperLUStat_t *stat, int *info)
{
/*
 *   Purpose   
 *   =======   
 *
 *   CGSRFS improves the computed solution to a system of linear   
 *   equations and provides error bounds and backward error estimates for 
 *   the solution.   
 *
 *   If equilibration was performed, the system becomes:
 *           (diag(R)*A_original*diag(C)) * X = diag(R)*B_original.
 *
 *   See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 *   Arguments   
 *   =========   
 *
 * trans   (input) trans_t
 *          Specifies the form of the system of equations:
 *          = NOTRANS: A * X = B  (No transpose)
 *          = TRANS:   A'* X = B  (Transpose)
 *          = CONJ:    A**H * X = B  (Conjugate transpose)
 *   
 *   A       (input) SuperMatrix*
 *           The original matrix A in the system, or the scaled A if
 *           equilibration was done. The type of A can be:
 *           Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_GE.
 *    
 *   L       (input) SuperMatrix*
 *	     The factor L from the factorization Pr*A*Pc=L*U. Use
 *           compressed row subscripts storage for supernodes, 
 *           i.e., L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU.
 * 
 *   U       (input) SuperMatrix*
 *           The factor U from the factorization Pr*A*Pc=L*U as computed by
 *           cgstrf(). Use column-wise storage scheme, 
 *           i.e., U has types: Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU.
 *
 *   perm_c  (input) int*, dimension (A->ncol)
 *	     Column permutation vector, which defines the 
 *           permutation matrix Pc; perm_c[i] = j means column i of A is 
 *           in position j in A*Pc.
 *
 *   perm_r  (input) int*, dimension (A->nrow)
 *           Row permutation vector, which defines the permutation matrix Pr;
 *           perm_r[i] = j means row i of A is in position j in Pr*A.
 *
 *   equed   (input) Specifies the form of equilibration that was done.
 *           = 'N': No equilibration.
 *           = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
 *           = 'C': Column equilibration, i.e., A was postmultiplied by
 *                  diag(C).
 *           = 'B': Both row and column equilibration, i.e., A was replaced 
 *                  by diag(R)*A*diag(C).
 *
 *   R       (input) float*, dimension (A->nrow)
 *           The row scale factors for A.
 *           If equed = 'R' or 'B', A is premultiplied by diag(R).
 *           If equed = 'N' or 'C', R is not accessed.
 * 
 *   C       (input) float*, dimension (A->ncol)
 *           The column scale factors for A.
 *           If equed = 'C' or 'B', A is postmultiplied by diag(C).
 *           If equed = 'N' or 'R', C is not accessed.
 *
 *   B       (input) SuperMatrix*
 *           B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE.
 *           The right hand side matrix B.
 *           if equed = 'R' or 'B', B is premultiplied by diag(R).
 *
 *   X       (input/output) SuperMatrix*
 *           X has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE.
 *           On entry, the solution matrix X, as computed by cgstrs().
 *           On exit, the improved solution matrix X.
 *           if *equed = 'C' or 'B', X should be premultiplied by diag(C)
 *               in order to obtain the solution to the original system.
 *
 *   FERR    (output) float*, dimension (B->ncol)   
 *           The estimated forward error bound for each solution vector   
 *           X(j) (the j-th column of the solution matrix X).   
 *           If XTRUE is the true solution corresponding to X(j), FERR(j) 
 *           is an estimated upper bound for the magnitude of the largest 
 *           element in (X(j) - XTRUE) divided by the magnitude of the   
 *           largest element in X(j).  The estimate is as reliable as   
 *           the estimate for RCOND, and is almost always a slight   
 *           overestimate of the true error.
 *
 *   BERR    (output) float*, dimension (B->ncol)   
 *           The componentwise relative backward error of each solution   
 *           vector X(j) (i.e., the smallest relative change in   
 *           any element of A or B that makes X(j) an exact solution).
 *
 *   stat     (output) SuperLUStat_t*
 *            Record the statistics on runtime and floating-point operation count.
 *            See util.h for the definition of 'SuperLUStat_t'.
 *
 *   info    (output) int*   
 *           = 0:  successful exit   
 *            < 0:  if INFO = -i, the i-th argument had an illegal value   
 *
 *    Internal Parameters   
 *    ===================   
 *
 *    ITMAX is the maximum number of steps of iterative refinement.   
 *
 */  

#define ITMAX 5
    
    /* Table of constant values */
    int    ione = 1;
    complex ndone = {-1., 0.};
    complex done = {1., 0.};
    
    /* Local variables */
    NCformat *Astore;
    complex   *Aval;
    SuperMatrix Bjcol;
    DNformat *Bstore, *Xstore, *Bjcol_store;
    complex   *Bmat, *Xmat, *Bptr, *Xptr;
    int      kase;
    float   safe1, safe2;
    int      i, j, k, irow, nz, count, notran, rowequ, colequ;
    int      ldb, ldx, nrhs;
    float   s, xk, lstres, eps, safmin;
    char     transc[1];
    trans_t  transt;
    complex   *work;
    float   *rwork;
    int      *iwork;
    extern double slamch_(char *);
    extern int clacon_(int *, complex *, complex *, float *, int *);
#ifdef _CRAY
    extern int CCOPY(int *, complex *, int *, complex *, int *);
    extern int CSAXPY(int *, complex *, complex *, int *, complex *, int *);
#else
    extern int ccopy_(int *, complex *, int *, complex *, int *);
    extern int caxpy_(int *, complex *, complex *, int *, complex *, int *);
#endif

    Astore = A->Store;
    Aval   = Astore->nzval;
    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;
    
    /* Test the input parameters */
    *info = 0;
    notran = (trans == NOTRANS);
    if ( !notran && trans != TRANS && trans != CONJ ) *info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE )
	*info = -2;
    else if ( L->nrow != L->ncol || L->nrow < 0 ||
 	      L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU )
	*info = -3;
    else if ( U->nrow != U->ncol || U->nrow < 0 ||
 	      U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU )
	*info = -4;
    else if ( ldb < SUPERLU_MAX(0, A->nrow) ||
 	      B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE )
        *info = -10;
    else if ( ldx < SUPERLU_MAX(0, A->nrow) ||
 	      X->Stype != SLU_DN || X->Dtype != SLU_C || X->Mtype != SLU_GE )
	*info = -11;
    if (*info != 0) {
	i = -(*info);
	xerbla_("cgsrfs", &i);
	return;
    }

    /* Quick return if possible */
    if ( A->nrow == 0 || nrhs == 0) {
	for (j = 0; j < nrhs; ++j) {
	    ferr[j] = 0.;
	    berr[j] = 0.;
	}
	return;
    }

    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
    colequ = lsame_(equed, "C") || lsame_(equed, "B");
    
    /* Allocate working space */
    work = complexMalloc(2*A->nrow);
    rwork = (float *) SUPERLU_MALLOC( A->nrow * sizeof(float) );
    iwork = intMalloc(A->nrow);
    if ( !work || !rwork || !iwork ) 
        ABORT("Malloc fails for work/rwork/iwork.");
    
    if ( notran ) {
	*(unsigned char *)transc = 'N';
        transt = TRANS;
    } else {
	*(unsigned char *)transc = 'T';
	transt = NOTRANS;
    }

    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
    nz     = A->ncol + 1;
    eps    = slamch_("Epsilon");
    safmin = slamch_("Safe minimum");
    safe1  = nz * safmin;
    safe2  = safe1 / eps;

    /* Compute the number of nonzeros in each row (or column) of A */
    for (i = 0; i < A->nrow; ++i) iwork[i] = 0;
    if ( notran ) {
	for (k = 0; k < A->ncol; ++k)
	    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) 
		++iwork[Astore->rowind[i]];
    } else {
	for (k = 0; k < A->ncol; ++k)
	    iwork[k] = Astore->colptr[k+1] - Astore->colptr[k];
    }	

    /* Copy one column of RHS B into Bjcol. */
    Bjcol.Stype = B->Stype;
    Bjcol.Dtype = B->Dtype;
    Bjcol.Mtype = B->Mtype;
    Bjcol.nrow  = B->nrow;
    Bjcol.ncol  = 1;
    Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
    if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store");
    Bjcol_store = Bjcol.Store;
    Bjcol_store->lda = ldb;
    Bjcol_store->nzval = work; /* address aliasing */
	
    /* Do for each right hand side ... */
    for (j = 0; j < nrhs; ++j) {
	count = 0;
	lstres = 3.;
	Bptr = &Bmat[j*ldb];
	Xptr = &Xmat[j*ldx];

	while (1) { /* Loop until stopping criterion is satisfied. */

	    /* Compute residual R = B - op(A) * X,   
	       where op(A) = A, A**T, or A**H, depending on TRANS. */
	    
#ifdef _CRAY
	    CCOPY(&A->nrow, Bptr, &ione, work, &ione);
#else
	    ccopy_(&A->nrow, Bptr, &ione, work, &ione);
#endif
	    sp_cgemv(transc, ndone, A, Xptr, ione, done, work, ione);

	    /* Compute componentwise relative backward error from formula 
	       max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) )   
	       where abs(Z) is the componentwise absolute value of the matrix
	       or vector Z.  If the i-th component of the denominator is less
	       than SAFE2, then SAFE1 is added to the i-th component of the   
	       numerator and denominator before dividing. */

	    for (i = 0; i < A->nrow; ++i) rwork[i] = slu_c_abs1( &Bptr[i] );
	    
	    /* Compute abs(op(A))*abs(X) + abs(B). */
	    if (notran) {
		for (k = 0; k < A->ncol; ++k) {
		    xk = slu_c_abs1( &Xptr[k] );
		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
			rwork[Astore->rowind[i]] += slu_c_abs1(&Aval[i]) * xk;
		}
	    } else {
		for (k = 0; k < A->ncol; ++k) {
		    s = 0.;
		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
			irow = Astore->rowind[i];
			s += slu_c_abs1(&Aval[i]) * slu_c_abs1(&Xptr[irow]);
		    }
		    rwork[k] += s;
		}
	    }
	    s = 0.;
	    for (i = 0; i < A->nrow; ++i) {
		if (rwork[i] > safe2)
		    s = SUPERLU_MAX( s, slu_c_abs1(&work[i]) / rwork[i] );
		else
		    s = SUPERLU_MAX( s, (slu_c_abs1(&work[i]) + safe1) / 
				(rwork[i] + safe1) );
	    }
	    berr[j] = s;

	    /* Test stopping criterion. Continue iterating if   
	       1) The residual BERR(J) is larger than machine epsilon, and   
	       2) BERR(J) decreased by at least a factor of 2 during the   
	          last iteration, and   
	       3) At most ITMAX iterations tried. */

	    if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) {
		/* Update solution and try again. */
		cgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
		
#ifdef _CRAY
		CAXPY(&A->nrow, &done, work, &ione,
		       &Xmat[j*ldx], &ione);
#else
		caxpy_(&A->nrow, &done, work, &ione,
		       &Xmat[j*ldx], &ione);
#endif
		lstres = berr[j];
		++count;
	    } else {
		break;
	    }
        
	} /* end while */

	stat->RefineSteps = count;

	/* Bound error from formula:
	   norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))*   
	   ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X)   
          where   
            norm(Z) is the magnitude of the largest component of Z   
            inv(op(A)) is the inverse of op(A)   
            abs(Z) is the componentwise absolute value of the matrix or
	       vector Z   
            NZ is the maximum number of nonzeros in any row of A, plus 1   
            EPS is machine epsilon   

          The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B))   
          is incremented by SAFE1 if the i-th component of   
          abs(op(A))*abs(X) + abs(B) is less than SAFE2.   

          Use CLACON to estimate the infinity-norm of the matrix   
             inv(op(A)) * diag(W),   
          where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */
	
	for (i = 0; i < A->nrow; ++i) rwork[i] = slu_c_abs1( &Bptr[i] );
	
	/* Compute abs(op(A))*abs(X) + abs(B). */
	if ( notran ) {
	    for (k = 0; k < A->ncol; ++k) {
		xk = slu_c_abs1( &Xptr[k] );
		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
		    rwork[Astore->rowind[i]] += slu_c_abs1(&Aval[i]) * xk;
	    }
	} else {
	    for (k = 0; k < A->ncol; ++k) {
		s = 0.;
		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
		    irow = Astore->rowind[i];
		    xk = slu_c_abs1( &Xptr[irow] );
		    s += slu_c_abs1(&Aval[i]) * xk;
		}
		rwork[k] += s;
	    }
	}
	
	for (i = 0; i < A->nrow; ++i)
	    if (rwork[i] > safe2)
		rwork[i] = slu_c_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i];
	    else
		rwork[i] = slu_c_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1;
	kase = 0;

	do {
	    clacon_(&A->nrow, &work[A->nrow], work,
		    &ferr[j], &kase);
	    if (kase == 0) break;

	    if (kase == 1) {
		/* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */
		if ( notran && colequ )
		    for (i = 0; i < A->ncol; ++i) {
		        cs_mult(&work[i], &work[i], C[i]);
	            }
		else if ( !notran && rowequ )
		    for (i = 0; i < A->nrow; ++i) {
		        cs_mult(&work[i], &work[i], R[i]);
                    }

		cgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info);
		
		for (i = 0; i < A->nrow; ++i) {
		    cs_mult(&work[i], &work[i], rwork[i]);
	 	}
	    } else {
		/* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */
		for (i = 0; i < A->nrow; ++i) {
		    cs_mult(&work[i], &work[i], rwork[i]);
		}
		
		cgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
		
		if ( notran && colequ )
		    for (i = 0; i < A->ncol; ++i) {
		        cs_mult(&work[i], &work[i], C[i]);
		    }
		else if ( !notran && rowequ )
		    for (i = 0; i < A->ncol; ++i) {
		        cs_mult(&work[i], &work[i], R[i]);  
		    }
	    }
	    
	} while ( kase != 0 );

	/* Normalize error. */
	lstres = 0.;
 	if ( notran && colequ ) {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, C[i] * slu_c_abs1( &Xptr[i]) );
  	} else if ( !notran && rowequ ) {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, R[i] * slu_c_abs1( &Xptr[i]) );
	} else {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, slu_c_abs1( &Xptr[i]) );
	}
	if ( lstres != 0. )
	    ferr[j] /= lstres;

    } /* for each RHS j ... */
    
    SUPERLU_FREE(work);
    SUPERLU_FREE(rwork);
    SUPERLU_FREE(iwork);
    SUPERLU_FREE(Bjcol.Store);

    return;

} /* cgsrfs */
Exemplo n.º 23
0
void f_create_options_handle(fptr *handle)
{
    *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_options_t));
}
Exemplo n.º 24
0
/*! \brief
 *
 * <pre>
 *   Purpose   
 *   =======   
 *
 *   SGSRFS improves the computed solution to a system of linear   
 *   equations and provides error bounds and backward error estimates for 
 *   the solution.   
 *
 *   If equilibration was performed, the system becomes:
 *           (diag(R)*A_original*diag(C)) * X = diag(R)*B_original.
 *
 *   See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 *   Arguments   
 *   =========   
 *
 * trans   (input) trans_t
 *          Specifies the form of the system of equations:
 *          = NOTRANS: A * X = B  (No transpose)
 *          = TRANS:   A'* X = B  (Transpose)
 *          = CONJ:    A**H * X = B  (Conjugate transpose)
 *   
 *   A       (input) SuperMatrix*
 *           The original matrix A in the system, or the scaled A if
 *           equilibration was done. The type of A can be:
 *           Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_GE.
 *    
 *   L       (input) SuperMatrix*
 *	     The factor L from the factorization Pr*A*Pc=L*U. Use
 *           compressed row subscripts storage for supernodes, 
 *           i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
 * 
 *   U       (input) SuperMatrix*
 *           The factor U from the factorization Pr*A*Pc=L*U as computed by
 *           sgstrf(). Use column-wise storage scheme, 
 *           i.e., U has types: Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
 *
 *   perm_c  (input) int*, dimension (A->ncol)
 *	     Column permutation vector, which defines the 
 *           permutation matrix Pc; perm_c[i] = j means column i of A is 
 *           in position j in A*Pc.
 *
 *   perm_r  (input) int*, dimension (A->nrow)
 *           Row permutation vector, which defines the permutation matrix Pr;
 *           perm_r[i] = j means row i of A is in position j in Pr*A.
 *
 *   equed   (input) Specifies the form of equilibration that was done.
 *           = 'N': No equilibration.
 *           = 'R': Row equilibration, i.e., A was premultiplied by diag(R).
 *           = 'C': Column equilibration, i.e., A was postmultiplied by
 *                  diag(C).
 *           = 'B': Both row and column equilibration, i.e., A was replaced 
 *                  by diag(R)*A*diag(C).
 *
 *   R       (input) float*, dimension (A->nrow)
 *           The row scale factors for A.
 *           If equed = 'R' or 'B', A is premultiplied by diag(R).
 *           If equed = 'N' or 'C', R is not accessed.
 * 
 *   C       (input) float*, dimension (A->ncol)
 *           The column scale factors for A.
 *           If equed = 'C' or 'B', A is postmultiplied by diag(C).
 *           If equed = 'N' or 'R', C is not accessed.
 *
 *   B       (input) SuperMatrix*
 *           B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
 *           The right hand side matrix B.
 *           if equed = 'R' or 'B', B is premultiplied by diag(R).
 *
 *   X       (input/output) SuperMatrix*
 *           X has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
 *           On entry, the solution matrix X, as computed by sgstrs().
 *           On exit, the improved solution matrix X.
 *           if *equed = 'C' or 'B', X should be premultiplied by diag(C)
 *               in order to obtain the solution to the original system.
 *
 *   FERR    (output) float*, dimension (B->ncol)   
 *           The estimated forward error bound for each solution vector   
 *           X(j) (the j-th column of the solution matrix X).   
 *           If XTRUE is the true solution corresponding to X(j), FERR(j) 
 *           is an estimated upper bound for the magnitude of the largest 
 *           element in (X(j) - XTRUE) divided by the magnitude of the   
 *           largest element in X(j).  The estimate is as reliable as   
 *           the estimate for RCOND, and is almost always a slight   
 *           overestimate of the true error.
 *
 *   BERR    (output) float*, dimension (B->ncol)   
 *           The componentwise relative backward error of each solution   
 *           vector X(j) (i.e., the smallest relative change in   
 *           any element of A or B that makes X(j) an exact solution).
 *
 *   stat     (output) SuperLUStat_t*
 *            Record the statistics on runtime and floating-point operation count.
 *            See util.h for the definition of 'SuperLUStat_t'.
 *
 *   info    (output) int*   
 *           = 0:  successful exit   
 *            < 0:  if INFO = -i, the i-th argument had an illegal value   
 *
 *    Internal Parameters   
 *    ===================   
 *
 *    ITMAX is the maximum number of steps of iterative refinement.   
 *
 * </pre>
 */
void
sgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U,
       int *perm_c, int *perm_r, char *equed, float *R, float *C,
       SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr,
       SuperLUStat_t *stat, int *info)
{


#define ITMAX 5
    
    /* Table of constant values */
    int    ione = 1;
    float ndone = -1.;
    float done = 1.;
    
    /* Local variables */
    NCformat *Astore;
    float   *Aval;
    SuperMatrix Bjcol;
    DNformat *Bstore, *Xstore, *Bjcol_store;
    float   *Bmat, *Xmat, *Bptr, *Xptr;
    int      kase;
    float   safe1, safe2;
    int      i, j, k, irow, nz, count, notran, rowequ, colequ;
    int      ldb, ldx, nrhs;
    float   s, xk, lstres, eps, safmin;
    char     transc[1];
    trans_t  transt;
    float   *work;
    float   *rwork;
    int      *iwork;

    extern int slacon_(int *, float *, float *, int *, float *, int *);
#ifdef _CRAY
    extern int SCOPY(int *, float *, int *, float *, int *);
    extern int SSAXPY(int *, float *, float *, int *, float *, int *);
#else
    extern int scopy_(int *, float *, int *, float *, int *);
    extern int saxpy_(int *, float *, float *, int *, float *, int *);
#endif

    Astore = A->Store;
    Aval   = Astore->nzval;
    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;
    
    /* Test the input parameters */
    *info = 0;
    notran = (trans == NOTRANS);
    if ( !notran && trans != TRANS && trans != CONJ ) *info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE )
	*info = -2;
    else if ( L->nrow != L->ncol || L->nrow < 0 ||
 	      L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU )
	*info = -3;
    else if ( U->nrow != U->ncol || U->nrow < 0 ||
 	      U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU )
	*info = -4;
    else if ( ldb < SUPERLU_MAX(0, A->nrow) ||
 	      B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE )
        *info = -10;
    else if ( ldx < SUPERLU_MAX(0, A->nrow) ||
 	      X->Stype != SLU_DN || X->Dtype != SLU_S || X->Mtype != SLU_GE )
	*info = -11;
    if (*info != 0) {
	i = -(*info);
	xerbla_("sgsrfs", &i);
	return;
    }

    /* Quick return if possible */
    if ( A->nrow == 0 || nrhs == 0) {
	for (j = 0; j < nrhs; ++j) {
	    ferr[j] = 0.;
	    berr[j] = 0.;
	}
	return;
    }

    rowequ = lsame_(equed, "R") || lsame_(equed, "B");
    colequ = lsame_(equed, "C") || lsame_(equed, "B");
    
    /* Allocate working space */
    work = floatMalloc(2*A->nrow);
    rwork = (float *) SUPERLU_MALLOC( A->nrow * sizeof(float) );
    iwork = intMalloc(2*A->nrow);
    if ( !work || !rwork || !iwork ) 
        ABORT("Malloc fails for work/rwork/iwork.");
    
    if ( notran ) {
	*(unsigned char *)transc = 'N';
        transt = TRANS;
    } else {
	*(unsigned char *)transc = 'T';
	transt = NOTRANS;
    }

    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
    nz     = A->ncol + 1;
    eps    = slamch_("Epsilon");
    safmin = slamch_("Safe minimum");
    /* Set SAFE1 essentially to be the underflow threshold times the
       number of additions in each row. */
    safe1  = nz * safmin;
    safe2  = safe1 / eps;

    /* Compute the number of nonzeros in each row (or column) of A */
    for (i = 0; i < A->nrow; ++i) iwork[i] = 0;
    if ( notran ) {
	for (k = 0; k < A->ncol; ++k)
	    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) 
		++iwork[Astore->rowind[i]];
    } else {
	for (k = 0; k < A->ncol; ++k)
	    iwork[k] = Astore->colptr[k+1] - Astore->colptr[k];
    }	

    /* Copy one column of RHS B into Bjcol. */
    Bjcol.Stype = B->Stype;
    Bjcol.Dtype = B->Dtype;
    Bjcol.Mtype = B->Mtype;
    Bjcol.nrow  = B->nrow;
    Bjcol.ncol  = 1;
    Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
    if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store");
    Bjcol_store = Bjcol.Store;
    Bjcol_store->lda = ldb;
    Bjcol_store->nzval = work; /* address aliasing */
	
    /* Do for each right hand side ... */
    for (j = 0; j < nrhs; ++j) {
	count = 0;
	lstres = 3.;
	Bptr = &Bmat[j*ldb];
	Xptr = &Xmat[j*ldx];

	while (1) { /* Loop until stopping criterion is satisfied. */

	    /* Compute residual R = B - op(A) * X,   
	       where op(A) = A, A**T, or A**H, depending on TRANS. */
	    
#ifdef _CRAY
	    SCOPY(&A->nrow, Bptr, &ione, work, &ione);
#else
	    scopy_(&A->nrow, Bptr, &ione, work, &ione);
#endif
	    sp_sgemv(transc, ndone, A, Xptr, ione, done, work, ione);

	    /* Compute componentwise relative backward error from formula 
	       max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) )   
	       where abs(Z) is the componentwise absolute value of the matrix
	       or vector Z.  If the i-th component of the denominator is less
	       than SAFE2, then SAFE1 is added to the i-th component of the   
	       numerator before dividing. */

	    for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] );
	    
	    /* Compute abs(op(A))*abs(X) + abs(B). */
	    if (notran) {
		for (k = 0; k < A->ncol; ++k) {
		    xk = fabs( Xptr[k] );
		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
			rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk;
		}
	    } else {
		for (k = 0; k < A->ncol; ++k) {
		    s = 0.;
		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
			irow = Astore->rowind[i];
			s += fabs(Aval[i]) * fabs(Xptr[irow]);
		    }
		    rwork[k] += s;
		}
	    }
	    s = 0.;
	    for (i = 0; i < A->nrow; ++i) {
		if (rwork[i] > safe2) {
		    s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] );
		} else if ( rwork[i] != 0.0 ) {
                    /* Adding SAFE1 to the numerator guards against
                       spuriously zero residuals (underflow). */
		    s = SUPERLU_MAX( s, (safe1 + fabs(work[i])) / rwork[i] );
                }
                /* If rwork[i] is exactly 0.0, then we know the true 
                   residual also must be exactly 0.0. */
	    }
	    berr[j] = s;

	    /* Test stopping criterion. Continue iterating if   
	       1) The residual BERR(J) is larger than machine epsilon, and   
	       2) BERR(J) decreased by at least a factor of 2 during the   
	          last iteration, and   
	       3) At most ITMAX iterations tried. */

	    if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) {
		/* Update solution and try again. */
		sgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
		
#ifdef _CRAY
		SAXPY(&A->nrow, &done, work, &ione,
		       &Xmat[j*ldx], &ione);
#else
		saxpy_(&A->nrow, &done, work, &ione,
		       &Xmat[j*ldx], &ione);
#endif
		lstres = berr[j];
		++count;
	    } else {
		break;
	    }
        
	} /* end while */

	stat->RefineSteps = count;

	/* Bound error from formula:
	   norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))*   
	   ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X)   
          where   
            norm(Z) is the magnitude of the largest component of Z   
            inv(op(A)) is the inverse of op(A)   
            abs(Z) is the componentwise absolute value of the matrix or
	       vector Z   
            NZ is the maximum number of nonzeros in any row of A, plus 1   
            EPS is machine epsilon   

          The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B))   
          is incremented by SAFE1 if the i-th component of   
          abs(op(A))*abs(X) + abs(B) is less than SAFE2.   

          Use SLACON to estimate the infinity-norm of the matrix   
             inv(op(A)) * diag(W),   
          where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */
	
	for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] );
	
	/* Compute abs(op(A))*abs(X) + abs(B). */
	if ( notran ) {
	    for (k = 0; k < A->ncol; ++k) {
		xk = fabs( Xptr[k] );
		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
		    rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk;
	    }
	} else {
	    for (k = 0; k < A->ncol; ++k) {
		s = 0.;
		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
		    irow = Astore->rowind[i];
		    xk = fabs( Xptr[irow] );
		    s += fabs(Aval[i]) * xk;
		}
		rwork[k] += s;
	    }
	}
	
	for (i = 0; i < A->nrow; ++i)
	    if (rwork[i] > safe2)
		rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i];
	    else
		rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1;

	kase = 0;

	do {
	    slacon_(&A->nrow, &work[A->nrow], work,
		    &iwork[A->nrow], &ferr[j], &kase);
	    if (kase == 0) break;

	    if (kase == 1) {
		/* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */
		if ( notran && colequ )
		    for (i = 0; i < A->ncol; ++i) work[i] *= C[i];
		else if ( !notran && rowequ )
		    for (i = 0; i < A->nrow; ++i) work[i] *= R[i];
		
		sgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info);
		
		for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i];
	    } else {
		/* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */
		for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i];
		
		sgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info);
		
		if ( notran && colequ )
		    for (i = 0; i < A->ncol; ++i) work[i] *= C[i];
		else if ( !notran && rowequ )
		    for (i = 0; i < A->ncol; ++i) work[i] *= R[i];  
	    }
	    
	} while ( kase != 0 );


	/* Normalize error. */
	lstres = 0.;
 	if ( notran && colequ ) {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) );
  	} else if ( !notran && rowequ ) {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) );
	} else {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) );
	}
	if ( lstres != 0. )
	    ferr[j] /= lstres;

    } /* for each RHS j ... */
    
    SUPERLU_FREE(work);
    SUPERLU_FREE(rwork);
    SUPERLU_FREE(iwork);
    SUPERLU_FREE(Bjcol.Store);

    return;

} /* sgsrfs */
Exemplo n.º 25
0
void f_create_SOLVEstruct_handle(fptr *handle)
{
    *handle = (fptr) SUPERLU_MALLOC(sizeof(SOLVEstruct_t));
}
Exemplo n.º 26
0
void
c_fortran_zgssv_(int *iopt, int *n, int *nnz, int *nrhs,
                 doublecomplex *values, int *rowind, int *colptr,
                 doublecomplex *b, int *ldb,
                 fptr *f_factors, /* a handle containing the address
                                     pointing to the factored matrices */
                 int *info)

{
/*
 * This routine can be called from Fortran.
 *
 * iopt (input) int
 *      Specifies the operation:
 *      = 1, performs LU decomposition for the first time
 *      = 2, performs triangular solve
 *      = 3, free all the storage in the end
 *
 * f_factors (input/output) fptr*
 *      If iopt == 1, it is an output and contains the pointer pointing to
 *                    the structure of the factored matrices.
 *      Otherwise, it it an input.
 *
 */

    SuperMatrix A, AC, B;
    SuperMatrix *L, *U;
    int *perm_r; /* row permutations from partial pivoting */
    int *perm_c; /* column permutation vector */
    int *etree;  /* column elimination tree */
    SCformat *Lstore;
    NCformat *Ustore;
    int      i, panel_size, permc_spec, relax;
    trans_t  trans;
    mem_usage_t   mem_usage;
    superlu_options_t options;
    SuperLUStat_t stat;
    factors_t *LUfactors;

    trans = NOTRANS;

    if ( *iopt == 1 ) { /* LU decomposition */

        /* Set the default input options. */
        set_default_options(&options);

        /* Initialize the statistics variables. */
        StatInit(&stat);

        /* Adjust to 0-based indexing */
        for (i = 0; i < *nnz; ++i) --rowind[i];
        for (i = 0; i <= *n; ++i) --colptr[i];

        zCreate_CompCol_Matrix(&A, *n, *n, *nnz, values, rowind, colptr,
                               SLU_NC, SLU_Z, SLU_GE);
        L = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
        U = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
        if ( !(perm_r = intMalloc(*n)) ) ABORT("Malloc fails for perm_r[].");
        if ( !(perm_c = intMalloc(*n)) ) ABORT("Malloc fails for perm_c[].");
        if ( !(etree = intMalloc(*n)) ) ABORT("Malloc fails for etree[].");

        /*
         * Get column permutation vector perm_c[], according to permc_spec:
         *   permc_spec = 0: natural ordering
         *   permc_spec = 1: minimum degree on structure of A'*A
         *   permc_spec = 2: minimum degree on structure of A'+A
         *   permc_spec = 3: approximate minimum degree for unsymmetric matrices
         */
        permc_spec = options.ColPerm;
        get_perm_c(permc_spec, &A, perm_c);

        sp_preorder(&options, &A, perm_c, etree, &AC);

        panel_size = sp_ienv(1);
        relax = sp_ienv(2);

        zgstrf(&options, &AC, relax, panel_size, etree,
                NULL, 0, perm_c, perm_r, L, U, &stat, info);

        if ( *info == 0 ) {
            Lstore = (SCformat *) L->Store;
            Ustore = (NCformat *) U->Store;
            printf("No of nonzeros in factor L = %d\n", Lstore->nnz);
            printf("No of nonzeros in factor U = %d\n", Ustore->nnz);
            printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz);
            zQuerySpace(L, U, &mem_usage);
            printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
                   mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
        } else {
            printf("zgstrf() error returns INFO= %d\n", *info);
            if ( *info <= *n ) { /* factorization completes */
                zQuerySpace(L, U, &mem_usage);
                printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
                       mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
            }
        }

        /* Restore to 1-based indexing */
        for (i = 0; i < *nnz; ++i) ++rowind[i];
        for (i = 0; i <= *n; ++i) ++colptr[i];

        /* Save the LU factors in the factors handle */
        LUfactors = (factors_t*) SUPERLU_MALLOC(sizeof(factors_t));
        LUfactors->L = L;
        LUfactors->U = U;
        LUfactors->perm_c = perm_c;
        LUfactors->perm_r = perm_r;
        *f_factors = (fptr) LUfactors;

        /* Free un-wanted storage */
        SUPERLU_FREE(etree);
        Destroy_SuperMatrix_Store(&A);
        Destroy_CompCol_Permuted(&AC);
        StatFree(&stat);

    } else if ( *iopt == 2 ) { /* Triangular solve */
        /* Initialize the statistics variables. */
        StatInit(&stat);

        /* Extract the LU factors in the factors handle */
        LUfactors = (factors_t*) *f_factors;
        L = LUfactors->L;
        U = LUfactors->U;
        perm_c = LUfactors->perm_c;
        perm_r = LUfactors->perm_r;

        zCreate_Dense_Matrix(&B, *n, *nrhs, b, *ldb, SLU_DN, SLU_Z, SLU_GE);

        /* Solve the system A*X=B, overwriting B with X. */
        zgstrs (trans, L, U, perm_c, perm_r, &B, &stat, info);

        Destroy_SuperMatrix_Store(&B);
        StatFree(&stat);

    } else if ( *iopt == 3 ) { /* Free storage */
        /* Free the LU factors in the factors handle */
        LUfactors = (factors_t*) *f_factors;
        SUPERLU_FREE (LUfactors->perm_r);
        SUPERLU_FREE (LUfactors->perm_c);
        Destroy_SuperNode_Matrix(LUfactors->L);
        Destroy_CompCol_Matrix(LUfactors->U);
        SUPERLU_FREE (LUfactors->L);
        SUPERLU_FREE (LUfactors->U);
        SUPERLU_FREE (LUfactors);
    } else {
        fprintf(stderr,"Invalid iopt=%d passed to c_fortran_zgssv()\n",*iopt);
        exit(-1);
    }
}
Exemplo n.º 27
0
void f_create_SuperLUStat_handle(fptr *handle)
{
    *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
}
Exemplo n.º 28
0
void
pdgssvx(int nprocs, pdgstrf_options_t *pdgstrf_options, SuperMatrix *A, 
	int *perm_c, int *perm_r, equed_t *equed, double *R, double *C,
	SuperMatrix *L, SuperMatrix *U,
	SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, 
	double *rcond, double *ferr, double *berr, 
	superlu_memusage_t *superlu_memusage, int *info)
{
/*
 * -- SuperLU MT routine (version 1.0) --
 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
 * and Lawrence Berkeley National Lab.
 * August 15, 1997
 *
 * Purpose
 * =======
 *
 * pdgssvx() solves the system of linear equations A*X=B or A'*X=B, using
 * the LU factorization from dgstrf(). Error bounds on the solution and
 * a condition estimate are also provided. It performs the following steps:
 *
 * 1. If A is stored column-wise (A->Stype = NC):
 *  
 *    1.1. If fact = EQUILIBRATE, scaling factors are computed to equilibrate
 *         the system:
 *           trans = NOTRANS: diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B
 *           trans = TRANS:  (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
 *           trans = CONJ:   (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
 *         Whether or not the system will be equilibrated depends on the
 *         scaling of the matrix A, but if equilibration is used, A is
 *         overwritten by diag(R)*A*diag(C) and B by diag(R)*B 
 *         (if trans = NOTRANS) or diag(C)*B (if trans = TRANS or CONJ).
 *
 *    1.2. Permute columns of A, forming A*Pc, where Pc is a permutation matrix
 *         that usually preserves sparsity.
 *         For more details of this step, see sp_colorder.c.
 *
 *    1.3. If fact = DOFACT or EQUILIBRATE, the LU decomposition is used to 
 *         factor the matrix A (after equilibration if fact = EQUILIBRATE) as
 *         Pr*A*Pc = L*U, with Pr determined by partial pivoting.
 *
 *    1.4. Compute the reciprocal pivot growth factor.
 *
 *    1.5. If some U(i,i) = 0, so that U is exactly singular, then the routine
 *         returns with info = i. Otherwise, the factored form of A is used to
 *         estimate the condition number of the matrix A. If the reciprocal of
 *         the condition number is less than machine precision, 
 *         info = A->ncol+1 is returned as a warning, but the routine still
 *         goes on to solve for X and computes error bounds as described below.
 *
 *    1.6. The system of equations is solved for X using the factored form
 *         of A.
 *
 *    1.7. Iterative refinement is applied to improve the computed solution
 *         matrix and calculate error bounds and backward error estimates
 *         for it.
 *
 *    1.8. If equilibration was used, the matrix X is premultiplied by
 *         diag(C) (if trans = NOTRANS) or diag(R) (if trans = TRANS or CONJ)
 *         so that it solves the original system before equilibration.
 *
 * 2. If A is stored row-wise (A->Stype = NR), apply the above algorithm
 *    to the tranpose of A:
 *
 *    2.1. If fact = EQUILIBRATE, scaling factors are computed to equilibrate
 *         the system:
 *           trans = NOTRANS:diag(R)*A'*diag(C)*inv(diag(C))*X = diag(R)*B
 *           trans = TRANS: (diag(R)*A'*diag(C))**T *inv(diag(R))*X = diag(C)*B
 *           trans = CONJ:  (diag(R)*A'*diag(C))**H *inv(diag(R))*X = diag(C)*B
 *         Whether or not the system will be equilibrated depends on the
 *         scaling of the matrix A, but if equilibration is used, A' is
 *         overwritten by diag(R)*A'*diag(C) and B by diag(R)*B 
 *         (if trans = NOTRANS) or diag(C)*B (if trans = TRANS or CONJ).
 *
 *    2.2. Permute columns of transpose(A) (rows of A), 
 *         forming transpose(A)*Pc, where Pc is a permutation matrix that
 *         usually preserves sparsity.
 *         For more details of this step, see sp_colorder.c.
 *
 *    2.3. If fact = DOFACT or EQUILIBRATE, the LU decomposition is used to 
 *         factor the matrix A (after equilibration if fact = EQUILIBRATE) as
 *         Pr*transpose(A)*Pc = L*U, with the permutation Pr determined by
 *         partial pivoting.
 *
 *    2.4. Compute the reciprocal pivot growth factor.
 *
 *    2.5. If some U(i,i) = 0, so that U is exactly singular, then the routine
 *         returns with info = i. Otherwise, the factored form of transpose(A)
 *         is used to estimate the condition number of the matrix A.
 *         If the reciprocal of the condition number is less than machine
 *         precision, info = A->nrow+1 is returned as a warning, but the
 *         routine still goes on to solve for X and computes error bounds
 *         as described below.
 *
 *    2.6. The system of equations is solved for X using the factored form
 *         of transpose(A).
 *
 *    2.7. Iterative refinement is applied to improve the computed solution
 *         matrix and calculate error bounds and backward error estimates
 *         for it.
 *
 *    2.8. If equilibration was used, the matrix X is premultiplied by
 *         diag(C) (if trans = NOTRANS) or diag(R) (if trans = TRANS or CONJ)
 *         so that it solves the original system before equilibration.
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * nprocs (input) int
 *         Number of processes (or threads) to be spawned and used to perform
 *         the LU factorization by pdgstrf(). There is a single thread of
 *         control to call pdgstrf(), and all threads spawned by pdgstrf() 
 *         are terminated before returning from pdgstrf().
 *
 * pdgstrf_options (input) pdgstrf_options_t*
 *         The structure defines the input parameters and data structure
 *         to control how the LU factorization will be performed.
 *         The following fields should be defined for this structure:
 *
 *         o fact (fact_t)
 *           Specifies whether or not the factored form of the matrix
 *           A is supplied on entry, and if not, whether the matrix A should
 *           be equilibrated before it is factored.
 *           = FACTORED: On entry, L, U, perm_r and perm_c contain the 
 *             factored form of A. If equed is not NOEQUIL, the matrix A has
 *             been equilibrated with scaling factors R and C.
 *             A, L, U, perm_r are not modified.
 *           = DOFACT: The matrix A will be factored, and the factors will be
 *             stored in L and U.
 *           = EQUILIBRATE: The matrix A will be equilibrated if necessary,
 *             then factored into L and U.
 *
 *         o trans (trans_t)
 *           Specifies the form of the system of equations:
 *           = NOTRANS: A * X = B        (No transpose)
 *           = TRANS:   A**T * X = B     (Transpose)
 *           = CONJ:    A**H * X = B     (Transpose)
 *
 *         o refact (yes_no_t)
 *           Specifies whether this is first time or subsequent factorization.
 *           = NO:  this factorization is treated as the first one;
 *           = YES: it means that a factorization was performed prior to this
 *               one. Therefore, this factorization will re-use some
 *               existing data structures, such as L and U storage, column
 *               elimination tree, and the symbolic information of the
 *               Householder matrix.
 *
 *         o panel_size (int)
 *           A panel consists of at most panel_size consecutive columns.
 *
 *         o relax (int)
 *           To control degree of relaxing supernodes. If the number
 *           of nodes (columns) in a subtree of the elimination tree is less
 *           than relax, this subtree is considered as one supernode,
 *           regardless of the row structures of those columns.
 *
 *         o diag_pivot_thresh (double)
 *           Diagonal pivoting threshold. At step j of the Gaussian 
 *           elimination, if 
 *               abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)),
 *           use A_jj as pivot, else use A_ij with maximum magnitude. 
 *           0 <= diag_pivot_thresh <= 1. The default value is 1, 
 *           corresponding to partial pivoting.
 *
 *         o usepr (yes_no_t)
 *           Whether the pivoting will use perm_r specified by the user.
 *           = YES: use perm_r; perm_r is input, unchanged on exit.
 *           = NO:  perm_r is determined by partial pivoting, and is output.
 *
 *         o drop_tol (double) (NOT IMPLEMENTED)
 *	     Drop tolerance parameter. At step j of the Gaussian elimination,
 *           if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
 *           0 <= drop_tol <= 1. The default value of drop_tol is 0,
 *           corresponding to not dropping any entry.
 *
 *         o work (void*) of size lwork
 *           User-supplied work space and space for the output data structures.
 *           Not referenced if lwork = 0;
 *
 *         o lwork (int)
 *           Specifies the length of work array.
 *           = 0:  allocate space internally by system malloc;
 *           > 0:  use user-supplied work array of length lwork in bytes,
 *                 returns error if space runs out.
 *           = -1: the routine guesses the amount of space needed without
 *                 performing the factorization, and returns it in
 *                 superlu_memusage->total_needed; no other side effects.
 *
 * A       (input/output) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol), where
 *         A->nrow = A->ncol. Currently, the type of A can be:
 *         Stype = NC or NR, Dtype = _D, Mtype = GE. In the future,
 *         more general A will be handled.
 *
 *         On entry, If pdgstrf_options->fact = FACTORED and equed is not 
 *         NOEQUIL, then A must have been equilibrated by the scaling factors
 *         in R and/or C.  On exit, A is not modified 
 *         if pdgstrf_options->fact = FACTORED or DOFACT, or 
 *         if pdgstrf_options->fact = EQUILIBRATE and equed = NOEQUIL.
 *
 *         On exit, if pdgstrf_options->fact = EQUILIBRATE and equed is not
 *         NOEQUIL, A is scaled as follows:
 *         If A->Stype = NC:
 *           equed = ROW:  A := diag(R) * A
 *           equed = COL:  A := A * diag(C)
 *           equed = BOTH: A := diag(R) * A * diag(C).
 *         If A->Stype = NR:
 *           equed = ROW:  transpose(A) := diag(R) * transpose(A)
 *           equed = COL:  transpose(A) := transpose(A) * diag(C)
 *           equed = BOTH: transpose(A) := diag(R) * transpose(A) * diag(C).
 *
 * perm_c  (input/output) int*
 *	   If A->Stype = NC, Column permutation vector of size A->ncol,
 *         which defines the permutation matrix Pc; perm_c[i] = j means
 *         column i of A is in position j in A*Pc.
 *         On exit, perm_c may be overwritten by the product of the input
 *         perm_c and a permutation that postorders the elimination tree
 *         of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
 *         is already in postorder.
 *
 *         If A->Stype = NR, column permutation vector of size A->nrow,
 *         which describes permutation of columns of tranpose(A) 
 *         (rows of A) as described above.
 * 
 * perm_r  (input/output) int*
 *         If A->Stype = NC, row permutation vector of size A->nrow, 
 *         which defines the permutation matrix Pr, and is determined
 *         by partial pivoting.  perm_r[i] = j means row i of A is in 
 *         position j in Pr*A.
 *
 *         If A->Stype = NR, permutation vector of size A->ncol, which
 *         determines permutation of rows of transpose(A)
 *         (columns of A) as described above.
 *
 *         If pdgstrf_options->usepr = NO, perm_r is output argument;
 *         If pdgstrf_options->usepr = YES, the pivoting routine will try 
 *            to use the input perm_r, unless a certain threshold criterion
 *            is violated. In that case, perm_r is overwritten by a new
 *            permutation determined by partial pivoting or diagonal 
 *            threshold pivoting.
 * 
 * equed   (input/output) equed_t*
 *         Specifies the form of equilibration that was done.
 *         = NOEQUIL: No equilibration.
 *         = ROW:  Row equilibration, i.e., A was premultiplied by diag(R).
 *         = COL:  Column equilibration, i.e., A was postmultiplied by diag(C).
 *         = BOTH: Both row and column equilibration, i.e., A was replaced 
 *                 by diag(R)*A*diag(C).
 *         If pdgstrf_options->fact = FACTORED, equed is an input argument, 
 *         otherwise it is an output argument.
 *
 * R       (input/output) double*, dimension (A->nrow)
 *         The row scale factors for A or transpose(A).
 *         If equed = ROW or BOTH, A (if A->Stype = NC) or transpose(A)
 *            (if A->Stype = NR) is multiplied on the left by diag(R).
 *         If equed = NOEQUIL or COL, R is not accessed.
 *         If fact = FACTORED, R is an input argument; otherwise, R is output.
 *         If fact = FACTORED and equed = ROW or BOTH, each element of R must
 *            be positive.
 * 
 * C       (input/output) double*, dimension (A->ncol)
 *         The column scale factors for A or transpose(A).
 *         If equed = COL or BOTH, A (if A->Stype = NC) or trnspose(A)
 *            (if A->Stype = NR) is multiplied on the right by diag(C).
 *         If equed = NOEQUIL or ROW, C is not accessed.
 *         If fact = FACTORED, C is an input argument; otherwise, C is output.
 *         If fact = FACTORED and equed = COL or BOTH, each element of C must
 *            be positive.
 *         
 * L       (output) SuperMatrix*
 *	   The factor L from the factorization
 *             Pr*A*Pc=L*U              (if A->Stype = NC) or
 *             Pr*transpose(A)*Pc=L*U   (if A->Stype = NR).
 *         Uses compressed row subscripts storage for supernodes, i.e.,
 *         L has types: Stype = SCP, Dtype = _D, Mtype = TRLU.
 *
 * U       (output) SuperMatrix*
 *	   The factor U from the factorization
 *             Pr*A*Pc=L*U              (if A->Stype = NC) or
 *             Pr*transpose(A)*Pc=L*U   (if A->Stype = NR).
 *         Uses column-wise storage scheme, i.e., U has types:
 *         Stype = NCP, Dtype = _D, Mtype = TRU.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = DN, Dtype = _D, Mtype = GE.
 *         On entry, the right hand side matrix.
 *         On exit,
 *            if equed = NOEQUIL, B is not modified; otherwise
 *            if A->Stype = NC:
 *               if trans = NOTRANS and equed = ROW or BOTH, B is overwritten
 *                  by diag(R)*B;
 *               if trans = TRANS or CONJ and equed = COL of BOTH, B is
 *                  overwritten by diag(C)*B;
 *            if A->Stype = NR:
 *               if trans = NOTRANS and equed = COL or BOTH, B is overwritten
 *                  by diag(C)*B;
 *               if trans = TRANS or CONJ and equed = ROW of BOTH, B is
 *                  overwritten by diag(R)*B.
 *
 * X       (output) SuperMatrix*
 *         X has types: Stype = DN, Dtype = _D, Mtype = GE. 
 *         If info = 0 or info = A->ncol+1, X contains the solution matrix
 *         to the original system of equations. Note that A and B are modified
 *         on exit if equed is not NOEQUIL, and the solution to the 
 *         equilibrated system is inv(diag(C))*X if trans = NOTRANS and
 *         equed = COL or BOTH, or inv(diag(R))*X if trans = TRANS or CONJ
 *         and equed = ROW or BOTH.
 *
 * recip_pivot_growth (output) double*
 *         The reciprocal pivot growth factor computed as
 *             max_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ).
 *         If recip_pivot_growth is much less than 1, the stability of the
 *         LU factorization could be poor.
 *
 * rcond   (output) double*
 *         The estimate of the reciprocal condition number of the matrix A
 *         after equilibration (if done). If rcond is less than the machine
 *         precision (in particular, if rcond = 0), the matrix is singular
 *         to working precision. This condition is indicated by a return
 *         code of info > 0.
 *
 * ferr    (output) double*, dimension (B->ncol)   
 *         The estimated forward error bound for each solution vector   
 *         X(j) (the j-th column of the solution matrix X).   
 *         If XTRUE is the true solution corresponding to X(j), FERR(j) 
 *         is an estimated upper bound for the magnitude of the largest 
 *         element in (X(j) - XTRUE) divided by the magnitude of the   
 *         largest element in X(j).  The estimate is as reliable as   
 *         the estimate for RCOND, and is almost always a slight   
 *         overestimate of the true error.
 *
 * berr    (output) double*, dimension (B->ncol)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *
 * superlu_memusage (output) superlu_memusage_t*
 *         Record the memory usage statistics, consisting of following fields:
 *         - for_lu (float)
 *           The amount of space used in bytes for L\U data structures.
 *         - total_needed (float)
 *           The amount of space needed in bytes to perform factorization.
 *         - expansions (int)
 *           The number of memory expansions during the LU factorization.
 *
 * info    (output) int*
 *         = 0: successful exit   
 *         < 0: if info = -i, the i-th argument had an illegal value   
 *         > 0: if info = i, and i is   
 *              <= A->ncol: U(i,i) is exactly zero. The factorization has   
 *                    been completed, but the factor U is exactly   
 *                    singular, so the solution and error bounds   
 *                    could not be computed.   
 *              = A->ncol+1: U is nonsingular, but RCOND is less than machine
 *                    precision, meaning that the matrix is singular to
 *                    working precision. Nevertheless, the solution and
 *                    error bounds are computed because there are a number
 *                    of situations where the computed solution can be more
 *                    accurate than the value of RCOND would suggest.   
 *              > A->ncol+1: number of bytes allocated when memory allocation
 *                    failure occurred, plus A->ncol.
 *
 */

    NCformat  *Astore;
    DNformat  *Bstore, *Xstore;
    double    *Bmat, *Xmat;
    int       ldb, ldx, nrhs;
    SuperMatrix *AA; /* A in NC format used by the factorization routine.*/
    SuperMatrix AC; /* Matrix postmultiplied by Pc */
    int       colequ, equil, dofact, notran, rowequ;
    char      norm[1];
    trans_t   trant;
    int       i, j, info1;
    double    amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin;
    int       n, relax, panel_size;
    Gstat_t   Gstat;
    double    t0;      /* temporary time */
    double    *utime;
    flops_t   *ops, flopcnt;
   
    /* External functions */
    extern double dlangs(char *, SuperMatrix *);
    extern double dlamch_(char *);

    Astore = A->Store;
    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    n      = A->ncol;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;
    pdgstrf_options->perm_c = perm_c;
    pdgstrf_options->perm_r = perm_r;

    *info = 0;
    dofact = (pdgstrf_options->fact == DOFACT);
    equil = (pdgstrf_options->fact == EQUILIBRATE);
    notran = (pdgstrf_options->trans == NOTRANS);
    if (dofact || equil) {
	*equed = NOEQUIL;
	rowequ = FALSE;
	colequ = FALSE;
    } else {
	rowequ = (*equed == ROW) || (*equed == BOTH);
	colequ = (*equed == COL) || (*equed == BOTH);
	smlnum = dlamch_("Safe minimum");
	bignum = 1. / smlnum;
    }

    /* ------------------------------------------------------------
       Test the input parameters.
       ------------------------------------------------------------*/
    if ( nprocs <= 0 ) *info = -1;
    else if ( (!dofact && !equil && (pdgstrf_options->fact != FACTORED))
	      || (!notran && (pdgstrf_options->trans != TRANS) && 
		 (pdgstrf_options->trans != CONJ))
	      || (pdgstrf_options->refact != YES && 
		  pdgstrf_options->refact != NO)
	      || (pdgstrf_options->usepr != YES &&
		  pdgstrf_options->usepr != NO)
	      || pdgstrf_options->lwork < -1 )
        *info = -2;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
	      A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -3;
    else if ((pdgstrf_options->fact == FACTORED) && 
	     !(rowequ || colequ || (*equed == NOEQUIL))) *info = -6;
    else {
	if (rowequ) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = MIN(rcmin, R[j]);
		rcmax = MAX(rcmax, R[j]);
	    }
	    if (rcmin <= 0.) *info = -7;
	    else if ( A->nrow > 0)
		rowcnd = MAX(rcmin,smlnum) / MIN(rcmax,bignum);
	    else rowcnd = 1.;
	}
	if (colequ && *info == 0) {
	    rcmin = bignum;
	    rcmax = 0.;
	    for (j = 0; j < A->nrow; ++j) {
		rcmin = MIN(rcmin, C[j]);
		rcmax = MAX(rcmax, C[j]);
	    }
	    if (rcmin <= 0.) *info = -8;
	    else if (A->nrow > 0)
		colcnd = MAX(rcmin,smlnum) / MIN(rcmax,bignum);
	    else colcnd = 1.;
	}
	if (*info == 0) {
	    if ( B->ncol < 0 || Bstore->lda < MAX(0, A->nrow) ||
		      B->Stype != SLU_DN || B->Dtype != SLU_D || 
		      B->Mtype != SLU_GE )
		*info = -11;
	    else if ( X->ncol < 0 || Xstore->lda < MAX(0, A->nrow) ||
		      B->ncol != X->ncol || X->Stype != SLU_DN ||
		      X->Dtype != SLU_D || X->Mtype != SLU_GE )
		*info = -12;
	}
    }
    if (*info != 0) {
	i = -(*info);
	xerbla_("dgssvx", &i);
	return;
    }
    
    
    /* ------------------------------------------------------------
       Allocate storage and initialize statistics variables. 
       ------------------------------------------------------------*/
    panel_size = pdgstrf_options->panel_size;
    relax = pdgstrf_options->relax;
    StatAlloc(n, nprocs, panel_size, relax, &Gstat);
    StatInit(n, nprocs, &Gstat);
    utime = Gstat.utime;
    ops = Gstat.ops;
    
    /* ------------------------------------------------------------
       Convert A to NC format when necessary.
       ------------------------------------------------------------*/
    if ( A->Stype == SLU_NR ) {
	NRformat *Astore = A->Store;
	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
	dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
			       Astore->nzval, Astore->colind, Astore->rowptr,
			       SLU_NC, A->Dtype, A->Mtype);
	if ( notran ) { /* Reverse the transpose argument. */
	    trant = TRANS;
	    notran = 0;
	} else {
	    trant = NOTRANS;
	    notran = 1;
	}
    } else { /* A->Stype == NC */
	trant = pdgstrf_options->trans;
	AA = A;
    }

    /* ------------------------------------------------------------
       Diagonal scaling to equilibrate the matrix.
       ------------------------------------------------------------*/
    if ( equil ) {
	t0 = SuperLU_timer_();
	/* Compute row and column scalings to equilibrate the matrix A. */
	dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1);
	
	if ( info1 == 0 ) {
	    /* Equilibrate matrix A. */
	    dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed);
	    rowequ = (*equed == ROW) || (*equed == BOTH);
	    colequ = (*equed == COL) || (*equed == BOTH);
	}
	utime[EQUIL] = SuperLU_timer_() - t0;
    }

    /* ------------------------------------------------------------
       Scale the right hand side.
       ------------------------------------------------------------*/
    if ( notran ) {
	if ( rowequ ) {
	    for (j = 0; j < nrhs; ++j)
		for (i = 0; i < A->nrow; ++i) {
		  Bmat[i + j*ldb] *= R[i];
	        }
	}
    } else if ( colequ ) {
	for (j = 0; j < nrhs; ++j)
	    for (i = 0; i < A->nrow; ++i) {
	      Bmat[i + j*ldb] *= C[i];
	    }
    }

    
    /* ------------------------------------------------------------
       Perform the LU factorization.
       ------------------------------------------------------------*/
    if ( dofact || equil ) {
	
        /* Obtain column etree, the column count (colcnt_h) and supernode
	   partition (part_super_h) for the Householder matrix. */
	t0 = SuperLU_timer_();
	sp_colorder(AA, perm_c, pdgstrf_options, &AC);
	utime[ETREE] = SuperLU_timer_() - t0;

#if ( PRNTlevel >= 2 )    
	printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
	       relax, panel_size, sp_ienv(3), sp_ienv(4));
	fflush(stdout);
#endif
	
	/* Compute the LU factorization of A*Pc. */
	t0 = SuperLU_timer_();
	pdgstrf(pdgstrf_options, &AC, perm_r, L, U, &Gstat, info);
	utime[FACT] = SuperLU_timer_() - t0;
	
	flopcnt = 0;
	for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops;
	ops[FACT] = flopcnt;

	if ( pdgstrf_options->lwork == -1 ) {
	    superlu_memusage->total_needed = *info - A->ncol;
	    return;
	}
    }

    if ( *info > 0 ) {
	if ( *info <= A->ncol ) {
	    /* Compute the reciprocal pivot growth factor of the leading
	       rank-deficient *info columns of A. */
	    *recip_pivot_growth = dPivotGrowth(*info, AA, perm_c, L, U);
	}
    } else {

	/* ------------------------------------------------------------
	   Compute the reciprocal pivot growth factor *recip_pivot_growth.
	   ------------------------------------------------------------*/
	*recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U);

	/* ------------------------------------------------------------
	   Estimate the reciprocal of the condition number of A.
	   ------------------------------------------------------------*/
	t0 = SuperLU_timer_();
	if ( notran ) {
	    *(unsigned char *)norm = '1';
	} else {
	    *(unsigned char *)norm = 'I';
	}
	anorm = dlangs(norm, AA);
	dgscon(norm, L, U, anorm, rcond, info);
	utime[RCOND] = SuperLU_timer_() - t0;
    
	/* ------------------------------------------------------------
	   Compute the solution matrix X.
	   ------------------------------------------------------------*/
	for (j = 0; j < nrhs; j++)    /* Save a copy of the right hand sides */
	    for (i = 0; i < B->nrow; i++)
		Xmat[i + j*ldx] = Bmat[i + j*ldb];
    
	t0 = SuperLU_timer_();
	dgstrs(trant, L, U, perm_r, perm_c, X, &Gstat, info);
	utime[SOLVE] = SuperLU_timer_() - t0;
	ops[SOLVE] = ops[TRISOLVE];
    
	/* ------------------------------------------------------------
	   Use iterative refinement to improve the computed solution and
	   compute error bounds and backward error estimates for it.
	   ------------------------------------------------------------*/
	t0 = SuperLU_timer_();
	dgsrfs(trant, AA, L, U, perm_r, perm_c, *equed,
	       R, C, B, X, ferr, berr, &Gstat, info);
	utime[REFINE] = SuperLU_timer_() - t0;

	/* ------------------------------------------------------------
	   Transform the solution matrix X to a solution of the original
	   system.
	   ------------------------------------------------------------*/
	if ( notran ) {
	    if ( colequ ) {
		for (j = 0; j < nrhs; ++j)
		    for (i = 0; i < A->nrow; ++i) {
			Xmat[i + j*ldx] *= C[i];
		    }
	    }
	} else if ( rowequ ) {
	    for (j = 0; j < nrhs; ++j)
		for (i = 0; i < A->nrow; ++i) {
		    Xmat[i + j*ldx] *= R[i];
		}
	}
	
	/* Set INFO = A->ncol+1 if the matrix is singular to 
	   working precision.*/
	if ( *rcond < dlamch_("E") ) *info = A->ncol + 1;
	
    }

    superlu_QuerySpace(nprocs, L, U, panel_size, superlu_memusage);

    /* ------------------------------------------------------------
       Deallocate storage after factorization.
       ------------------------------------------------------------*/
    if ( pdgstrf_options->refact == NO ) {
        SUPERLU_FREE(pdgstrf_options->etree);
        SUPERLU_FREE(pdgstrf_options->colcnt_h);
	SUPERLU_FREE(pdgstrf_options->part_super_h);
    }
    if ( dofact || equil ) {
        Destroy_CompCol_Permuted(&AC);
    }
    if ( A->Stype == SLU_NR ) {
	Destroy_SuperMatrix_Store(AA);
	SUPERLU_FREE(AA);
    }

    /* ------------------------------------------------------------
       Print timings, then deallocate statistic variables.
       ------------------------------------------------------------*/
    PrintStat(&Gstat);
    StatFree(&Gstat);
}
Exemplo n.º 29
0
void
pdgssvx(superlu_options_t_Distributed *options, SuperMatrix *A,
        ScalePermstruct_t *ScalePermstruct,
        double B[], int ldb, int nrhs, gridinfo_t *grid,
        LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr,
        SuperLUStat_t *stat, int *info)
{
    /*
     * -- Distributed SuperLU routine (version 2.0) --
     * Lawrence Berkeley National Lab, Univ. of California Berkeley.
     * March 15, 2003
     *
     *
     * Purpose
     * =======
     *
     * PDGSSVX solves a system of linear equations A*X=B,
     * by using Gaussian elimination with "static pivoting" to
     * compute the LU factorization of A.
     *
     * Static pivoting is a technique that combines the numerical stability
     * of partial pivoting with the scalability of Cholesky (no pivoting),
     * to run accurately and efficiently on large numbers of processors.
     *
     * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
     * description of the parallel algorithms.
     *
     * Here are the options for using this code:
     *
     *   1. Independent of all the other options specified below, the
     *      user must supply
     *
     *      -  B, the matrix of right-hand sides, distributed by block rows,
     *            and its dimensions ldb (local) and nrhs (global)
     *      -  grid, a structure describing the 2D processor mesh
     *      -  options->IterRefine, which determines whether or not to
     *            improve the accuracy of the computed solution using
     *            iterative refinement
     *
     *      On output, B is overwritten with the solution X.
     *
     *   2. Depending on options->Fact, the user has four options
     *      for solving A*X=B. The standard option is for factoring
     *      A "from scratch". (The other options, described below,
     *      are used when A is sufficiently similar to a previously
     *      solved problem to save time by reusing part or all of
     *      the previous factorization.)
     *
     *      -  options->Fact = DOFACT: A is factored "from scratch"
     *
     *      In this case the user must also supply
     *
     *        o  A, the input matrix
     *
     *        as well as the following options to determine what matrix to
     *        factorize.
     *
     *        o  options->Equil,   to specify how to scale the rows and columns
     *                             of A to "equilibrate" it (to try to reduce its
     *                             condition number and so improve the
     *                             accuracy of the computed solution)
     *
     *        o  options->RowPerm, to specify how to permute the rows of A
     *                             (typically to control numerical stability)
     *
     *        o  options->ColPerm, to specify how to permute the columns of A
     *                             (typically to control fill-in and enhance
     *                             parallelism during factorization)
     *
     *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
     *                             pivots encountered during factorization
     *                             (to control numerical stability)
     *
     *      The outputs returned include
     *
     *        o  ScalePermstruct,  modified to describe how the input matrix A
     *                             was equilibrated and permuted:
     *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
     *                                         columns of A were scaled
     *          .  ScalePermstruct->R, array of row scale factors
     *          .  ScalePermstruct->C, array of column scale factors
     *          .  ScalePermstruct->perm_r, row permutation vector
     *          .  ScalePermstruct->perm_c, column permutation vector
     *
     *          (part of ScalePermstruct may also need to be supplied on input,
     *           depending on options->RowPerm and options->ColPerm as described
     *           later).
     *
     *        o  A, the input matrix A overwritten by the scaled and permuted
     *              matrix Pc*Pr*diag(R)*A*diag(C), where
     *              Pr and Pc are row and columns permutation matrices determined
     *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c,
     *                  respectively, and
     *              diag(R) and diag(C) are diagonal scaling matrices determined
     *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
     *                  ScalePermstruct->C
     *
     *        o  LUstruct, which contains the L and U factorization of A1 where
     *
     *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
     *
     *               (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
     *                in A on output.)
     *
     *   3. The second value of options->Fact assumes that a matrix with the same
     *      sparsity pattern as A has already been factored:
     *
     *      -  options->Fact = SamePattern: A is factored, assuming that it has
     *            the same nonzero pattern as a previously factored matrix. In
     *            this case the algorithm saves time by reusing the previously
     *            computed column permutation vector stored in
     *            ScalePermstruct->perm_c and the "elimination tree" of A
     *            stored in LUstruct->etree
     *
     *      In this case the user must still specify the following options
     *      as before:
     *
     *        o  options->Equil
     *        o  options->RowPerm
     *        o  options->ReplaceTinyPivot
     *
     *      but not options->ColPerm, whose value is ignored. This is because the
     *      previous column permutation from ScalePermstruct->perm_c is used as
     *      input. The user must also supply
     *
     *        o  A, the input matrix
     *        o  ScalePermstruct->perm_c, the column permutation
     *        o  LUstruct->etree, the elimination tree
     *
     *      The outputs returned include
     *
     *        o  A, the input matrix A overwritten by the scaled and permuted
     *              matrix as described above
     *        o  ScalePermstruct, modified to describe how the input matrix A was
     *                            equilibrated and row permuted
     *        o  LUstruct, modified to contain the new L and U factors
     *
     *   4. The third value of options->Fact assumes that a matrix B with the same
     *      sparsity pattern as A has already been factored, and where the
     *      row permutation of B can be reused for A. This is useful when A and B
     *      have similar numerical values, so that the same row permutation
     *      will make both factorizations numerically stable. This lets us reuse
     *      all of the previously computed structure of L and U.
     *
     *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
     *            assuming not only the same nonzero pattern as the previously
     *            factored matrix B, but reusing B's row permutation.
     *
     *      In this case the user must still specify the following options
     *      as before:
     *
     *        o  options->Equil
     *        o  options->ReplaceTinyPivot
     *
     *      but not options->RowPerm or options->ColPerm, whose values are
     *      ignored. This is because the permutations from ScalePermstruct->perm_r
     *      and ScalePermstruct->perm_c are used as input.
     *
     *      The user must also supply
     *
     *        o  A, the input matrix
     *        o  ScalePermstruct->DiagScale, how the previous matrix was row
     *                                       and/or column scaled
     *        o  ScalePermstruct->R, the row scalings of the previous matrix,
     *                               if any
     *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
     *                               if any
     *        o  ScalePermstruct->perm_r, the row permutation of the previous
     *                                    matrix
     *        o  ScalePermstruct->perm_c, the column permutation of the previous
     *                                    matrix
     *        o  all of LUstruct, the previously computed information about
     *                            L and U (the actual numerical values of L and U
     *                            stored in LUstruct->Llu are ignored)
     *
     *      The outputs returned include
     *
     *        o  A, the input matrix A overwritten by the scaled and permuted
     *              matrix as described above
     *        o  ScalePermstruct,  modified to describe how the input matrix A was
     *                             equilibrated (thus ScalePermstruct->DiagScale,
     *                             R and C may be modified)
     *        o  LUstruct, modified to contain the new L and U factors
     *
     *   5. The fourth and last value of options->Fact assumes that A is
     *      identical to a matrix that has already been factored on a previous
     *      call, and reuses its entire LU factorization
     *
     *      -  options->Fact = Factored: A is identical to a previously
     *            factorized matrix, so the entire previous factorization
     *            can be reused.
     *
     *      In this case all the other options mentioned above are ignored
     *      (options->Equil, options->RowPerm, options->ColPerm,
     *       options->ReplaceTinyPivot)
     *
     *      The user must also supply
     *
     *        o  A, the unfactored matrix, only in the case that iterative
     *              refinment is to be done (specifically A must be the output
     *              A from the previous call, so that it has been scaled and
     *              permuted)
     *        o  all of ScalePermstruct
     *        o  all of LUstruct, including the actual numerical values of
     *           L and U
     *
     *      all of which are unmodified on output.
     *
     * Arguments
     * =========
     *
     * options (input) superlu_options_t_Distributed* (global)
     *         The structure defines the input parameters to control
     *         how the LU decomposition will be performed.
     *         The following fields should be defined for this structure:
     *
     *         o Fact (fact_t)
     *           Specifies whether or not the factored form of the matrix
     *           A is supplied on entry, and if not, how the matrix A should
     *           be factorized based on the previous history.
     *
     *           = DOFACT: The matrix A will be factorized from scratch.
     *                 Inputs:  A
     *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
     *                 Outputs: modified A
     *                             (possibly row and/or column scaled and/or
     *                              permuted)
     *                          all of ScalePermstruct
     *                          all of LUstruct
     *
     *           = SamePattern: the matrix A will be factorized assuming
     *             that a factorization of a matrix with the same sparsity
     *             pattern was performed prior to this one. Therefore, this
     *             factorization will reuse column permutation vector
     *             ScalePermstruct->perm_c and the elimination tree
     *             LUstruct->etree
     *                 Inputs:  A
     *                          options->Equil, RowPerm, ReplaceTinyPivot
     *                          ScalePermstruct->perm_c
     *                          LUstruct->etree
     *                 Outputs: modified A
     *                             (possibly row and/or column scaled and/or
     *                              permuted)
     *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
     *                          rest of LUstruct (GLU_persist, Llu)
     *
     *           = SamePattern_SameRowPerm: the matrix A will be factorized
     *             assuming that a factorization of a matrix with the same
     *             sparsity	pattern and similar numerical values was performed
     *             prior to this one. Therefore, this factorization will reuse
     *             both row and column scaling factors R and C, and the
     *             both row and column permutation vectors perm_r and perm_c,
     *             distributed data structure set up from the previous symbolic
     *             factorization.
     *                 Inputs:  A
     *                          options->Equil, ReplaceTinyPivot
     *                          all of ScalePermstruct
     *                          all of LUstruct
     *                 Outputs: modified A
     *                             (possibly row and/or column scaled and/or
     *                              permuted)
     *                          modified LUstruct->Llu
     *           = FACTORED: the matrix A is already factored.
     *                 Inputs:  all of ScalePermstruct
     *                          all of LUstruct
     *
     *         o Equil (yes_no_t)
     *           Specifies whether to equilibrate the system.
     *           = NO:  no equilibration.
     *           = YES: scaling factors are computed to equilibrate the system:
     *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
     *                  Whether or not the system will be equilibrated depends
     *                  on the scaling of the matrix A, but if equilibration is
     *                  used, A is overwritten by diag(R)*A*diag(C) and B by
     *                  diag(R)*B.
     *
     *         o RowPerm (rowperm_t)
     *           Specifies how to permute rows of the matrix A.
     *           = NATURAL:   use the natural ordering.
     *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
     *                        the original matrix to make the diagonal large
     *                        relative to the off-diagonal.
     *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
     *                        input by the user.
     *
     *         o ColPerm (colperm_t)
     *           Specifies what type of column permutation to use to reduce fill.
     *           = NATURAL:       natural ordering.
     *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
     *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
     *           = COLAMD:        approximate minimum degree column ordering.
     *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
     *
     *         o ReplaceTinyPivot (yes_no_t)
     *           = NO:  do not modify pivots
     *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
     *                  LU factorization.
     *
     *         o IterRefine (IterRefine_t)
     *           Specifies how to perform iterative refinement.
     *           = NO:     no iterative refinement.
     *           = DOUBLE: accumulate residual in double precision.
     *           = EXTRA:  accumulate residual in extra precision.
     *
     *         NOTE: all options must be indentical on all processes when
     *               calling this routine.
     *
     * A (input/output) SuperMatrix* (local)
     *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
     *           The number of linear equations is A->nrow. The type of A must be:
     *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
     *           That is, A is stored in distributed compressed row format.
     *           See supermatrix.h for the definition of 'SuperMatrix'.
     *           This routine only handles square A, however, the LU factorization
     *           routine PDGSTRF can factorize rectangular matrices.
     *         On exit, A may be overwtirren by Pc*Pr*diag(R)*A*diag(C),
     *           depending on ScalePermstruct->DiagScale, options->RowPerm and
     *           options->ColPerm:
     *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
     *                diag(R)*A*diag(C).
     *             if options->RowPerm != NATURAL, A is further overwritten by
     *                Pr*diag(R)*A*diag(C).
     *             if options->ColPerm != NATURAL, A is further overwritten by
     *                Pc*Pr*diag(R)*A*diag(C).
     *           If all the above condition are true, the LU decomposition is
     *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
     *
     * ScalePermstruct (input/output) ScalePermstruct_t* (global)
     *         The data structure to store the scaling and permutation vectors
     *         describing the transformations performed to the matrix A.
     *         It contains the following fields:
     *
     *         o DiagScale (DiagScale_t)
     *           Specifies the form of equilibration that was done.
     *           = NOEQUIL: no equilibration.
     *           = ROW:     row equilibration, i.e., A was premultiplied by
     *                      diag(R).
     *           = COL:     Column equilibration, i.e., A was postmultiplied
     *                      by diag(C).
     *           = BOTH:    both row and column equilibration, i.e., A was
     *                      replaced by diag(R)*A*diag(C).
     *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
     *           DiagScale is an input argument; otherwise it is an output
     *           argument.
     *
     *         o perm_r (int*)
     *           Row permutation vector, which defines the permutation matrix Pr;
     *           perm_r[i] = j means row i of A is in position j in Pr*A.
     *           If options->RowPerm = MY_PERMR, or
     *           options->Fact = SamePattern_SameRowPerm, perm_r is an
     *           input argument; otherwise it is an output argument.
     *
     *         o perm_c (int*)
     *           Column permutation vector, which defines the
     *           permutation matrix Pc; perm_c[i] = j means column i of A is
     *           in position j in A*Pc.
     *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
     *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
     *           input argument; otherwise, it is an output argument.
     *           On exit, perm_c may be overwritten by the product of the input
     *           perm_c and a permutation that postorders the elimination tree
     *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
     *           is already in postorder.
     *
     *         o R (double*) dimension (A->nrow)
     *           The row scale factors for A.
     *           If DiagScale = ROW or BOTH, A is multiplied on the left by
     *                          diag(R).
     *           If DiagScale = NOEQUIL or COL, R is not defined.
     *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
     *           an input argument; otherwise, R is an output argument.
     *
     *         o C (double*) dimension (A->ncol)
     *           The column scale factors for A.
     *           If DiagScale = COL or BOTH, A is multiplied on the right by
     *                          diag(C).
     *           If DiagScale = NOEQUIL or ROW, C is not defined.
     *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
     *           an input argument; otherwise, C is an output argument.
     *
     * B       (input/output) double* (local)
     *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
     *           where, m_loc is the number of rows stored locally on my
     *           process and is defined in the data structure of matrix A.
     *         On exit, the solution matrix if info = 0;
     *
     * ldb     (input) int (local)
     *         The leading dimension of matrix B.
     *
     * nrhs    (input) int (global)
     *         The number of right-hand sides.
     *         If nrhs = 0, only LU decomposition is performed, the forward
     *         and back substitutions are skipped.
     *
     * grid    (input) gridinfo_t* (global)
     *         The 2D process mesh. It contains the MPI communicator, the number
     *         of process rows (NPROW), the number of process columns (NPCOL),
     *         and my process rank. It is an input argument to all the
     *         parallel routines.
     *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
     *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
     *
     * LUstruct (input/output) LUstruct_t*
     *         The data structures to store the distributed L and U factors.
     *         It contains the following fields:
     *
     *         o etree (int*) dimension (A->ncol) (global)
     *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
     *           It is computed in sp_colorder() during the first factorization,
     *           and is reused in the subsequent factorizations of the matrices
     *           with the same nonzero pattern.
     *           On exit of sp_colorder(), the columns of A are permuted so that
     *           the etree is in a certain postorder. This postorder is reflected
     *           in ScalePermstruct->perm_c.
     *           NOTE:
     *           Etree is a vector of parent pointers for a forest whose vertices
     *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
     *
     *         o Glu_persist (Glu_persist_t*) (global)
     *           Global data structure (xsup, supno) replicated on all processes,
     *           describing the supernode partition in the factored matrices
     *           L and U:
     *	       xsup[s] is the leading column of the s-th supernode,
     *             supno[i] is the supernode number to which column i belongs.
     *
     *         o Llu (LocalLU_t*) (local)
     *           The distributed data structures to store L and U factors.
     *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
     *
     * SOLVEstruct (input/output) SOLVEstruct_t*
     *         The data structure to hold the communication pattern used
     *         in the phases of triangular solution and iterative refinement.
     *         This pattern should be intialized only once for repeated solutions.
     *         If options->SolveInitialized = YES, it is an input argument.
     *         If options->SolveInitialized = NO and nrhs != 0, it is an output
     *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
     *
     * berr    (output) double*, dimension (nrhs) (global)
     *         The componentwise relative backward error of each solution
     *         vector X(j) (i.e., the smallest relative change in
     *         any element of A or B that makes X(j) an exact solution).
     *
     * stat   (output) SuperLUStat_t*
     *        Record the statistics on runtime and floating-point operation count.
     *        See util.h for the definition of 'SuperLUStat_t'.
     *
     * info    (output) int*
     *         = 0: successful exit
     *         > 0: if info = i, and i is
     *             <= A->ncol: U(i,i) is exactly zero. The factorization has
     *                been completed, but the factor U is exactly singular,
     *                so the solution could not be computed.
     *             > A->ncol: number of bytes allocated when memory allocation
     *                failure occurred, plus A->ncol.
     *
     * See superlu_ddefs.h for the definitions of varioous data types.
     *
     */
    NRformat_loc *Astore;
    SuperMatrix GA;      /* Global A in NC format */
    NCformat *GAstore;
    double   *a_GA;
    SuperMatrix GAC;      /* Global A in NCP format (add n end pointers) */
    NCPformat *GACstore;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    Glu_freeable_t *Glu_freeable;
    /* The nonzero structures of L and U factors, which are
    replicated on all processrs.
       (lsub, xlsub) contains the compressed subscript of
                 supernodes in L.
       (usub, xusub) contains the compressed subscript of
                 nonzero segments in U.
    If options->Fact != SamePattern_SameRowPerm, they are
    computed by SYMBFACT routine, and then used by PDDISTRIBUTE
    routine. They will be freed after PDDISTRIBUTE routine.
    If options->Fact == SamePattern_SameRowPerm, these
    structures are not used.                                  */
    fact_t   Fact;
    double   *a;
    int_t    *colptr, *rowind;
    int_t    *perm_r; /* row permutations from partial pivoting */
    int_t    *perm_c; /* column permutation vector */
    int_t    *etree;  /* elimination tree */
    int_t    *rowptr, *colind;  /* Local A in NR*/
    int_t    *rowind_loc, *colptr_loc;
    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use;
    int_t    nnz_loc, m_loc, fst_row, icol;
    int      iam;
    int      ldx;  /* LDA for matrix X (local). */
    char     equed[1], norm[1];
    double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
    double   *X, *b_col, *b_work, *x_col;
    double   t;
    static mem_usage_t_Distributed num_mem_usage, symb_mem_usage;
#if ( PRNTlevel>= 2 )
    double   dmin, dsum, dprod;
#endif
    int_t procs;

    /* Initialization. */
    m = A->nrow;
    n = A->ncol;
    Astore = (NRformat_loc *) A->Store;
    nnz_loc = Astore->nnz_loc;
    m_loc = Astore->m_loc;
    fst_row = Astore->fst_row;
    a = Astore->nzval;
    rowptr = Astore->rowptr;
    colind = Astore->colind;

    /* Test the input parameters. */
    *info = 0;
    Fact = options->Fact;
    if ( Fact < 0 || Fact > FACTORED )
        *info = -1;
    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
        *info = -1;
    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
        *info = -1;
    else if ( options->IterRefine < 0 || options->IterRefine > EXTRA )
        *info = -1;
    else if ( options->IterRefine == EXTRA ) {
        *info = -1;
        fprintf(stderr, "Extra precise iterative refinement yet to support.");
    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
                || A->Dtype != SLU_D || A->Mtype != SLU_GE )
        *info = -2;
    else if ( ldb < m_loc )
        *info = -5;
    else if ( nrhs < 0 )
        *info = -6;
    if ( *info ) {
        i = -(*info);
        pxerbla("pdgssvx", grid, -*info);
        return;
    }

    factored = (Fact == FACTORED);
    Equil = (!factored && options->Equil == YES);
    notran = (options->Trans == NOTRANS);
    iam = grid->iam;
    job = 5;
    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
        rowequ = (ScalePermstruct->DiagScale == ROW) ||
                 (ScalePermstruct->DiagScale == BOTH);
        colequ = (ScalePermstruct->DiagScale == COL) ||
                 (ScalePermstruct->DiagScale == BOTH);
    } else rowequ = colequ = FALSE;

    /* The following arrays are replicated on all processes. */
    perm_r = ScalePermstruct->perm_r;
    perm_c = ScalePermstruct->perm_c;
    etree = LUstruct->etree;
    R = ScalePermstruct->R;
    C = ScalePermstruct->C;
    /********/

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgssvx()");
#endif

    if ( Equil ) {
        /* Allocate storage if not done so before. */
        switch ( ScalePermstruct->DiagScale ) {
        case NOEQUIL:
            if ( !(R = (double *) doubleMalloc_dist(m)) )
                ABORT("Malloc fails for R[].");
            if ( !(C = (double *) doubleMalloc_dist(n)) )
                ABORT("Malloc fails for C[].");
            ScalePermstruct->R = R;
            ScalePermstruct->C = C;
            break;
        case ROW:
            if ( !(C = (double *) doubleMalloc_dist(n)) )
                ABORT("Malloc fails for C[].");
            ScalePermstruct->C = C;
            break;
        case COL:
            if ( !(R = (double *) doubleMalloc_dist(m)) )
                ABORT("Malloc fails for R[].");
            ScalePermstruct->R = R;
            break;
        }
    }

    /* ------------------------------------------------------------
       Diagonal scaling to equilibrate the matrix.
       ------------------------------------------------------------*/
    if ( Equil ) {
#if ( DEBUGlevel>=1 )
        CHECK_MALLOC(iam, "Enter equil");
#endif
        t = SuperLU_timer_();

        if ( Fact == SamePattern_SameRowPerm ) {
            /* Reuse R and C. */
            switch ( ScalePermstruct->DiagScale ) {
            case NOEQUIL:
                break;
            case ROW:
                irow = fst_row;
                for (j = 0; j < m_loc; ++j) {
                    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
                        a[i] *= R[irow];       /* Scale rows. */
                    }
                    ++irow;
                }
                break;
            case COL:
                for (j = 0; j < m_loc; ++j)
                    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
                        icol = colind[i];
                        a[i] *= C[icol];          /* Scale columns. */
                    }
                break;
            case BOTH:
                irow = fst_row;
                for (j = 0; j < m_loc; ++j) {
                    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
                        icol = colind[i];
                        a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */
                    }
                    ++irow;
                }
                break;
            }
        } else {

            /* Compute the row and column scalings. */
            pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);

            /* Equilibrate matrix A if it is badly-scaled. */
            pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed);

            if ( lsame_(equed, "R") ) {
                ScalePermstruct->DiagScale = rowequ = ROW;
            } else if ( lsame_(equed, "C") ) {
                ScalePermstruct->DiagScale = colequ = COL;
            } else if ( lsame_(equed, "B") ) {
                ScalePermstruct->DiagScale = BOTH;
                rowequ = ROW;
                colequ = COL;
            } else ScalePermstruct->DiagScale = NOEQUIL;

#if ( PRNTlevel>=1 )
            if ( !iam ) {
                printf(".. equilibrated? *equed = %c\n", *equed);
                /*fflush(stdout);*/
            }
#endif
        } /* if Fact ... */

        stat->utime[EQUIL] = SuperLU_timer_() - t;
#if ( DEBUGlevel>=1 )
        CHECK_MALLOC(iam, "Exit equil");
#endif
    } /* if Equil ... */


    /*
     * Gather A from the distributed compressed row format to
     * global A in compressed column format.
     * Numerical values are gathered only when a row permutation
     * for large diagonal is sought after.
     */
    need_value = (options->RowPerm == LargeDiag &&
                  Fact != SamePattern_SameRowPerm && !factored);
    pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
    GAstore = (NCformat *) GA.Store;
    colptr = GAstore->colptr;
    rowind = GAstore->rowind;
    nnz = GAstore->nnz;
    if ( need_value ) a_GA = GAstore->nzval;
    else assert(GAstore->nzval == NULL);


    /* ------------------------------------------------------------
       Find the row permutation for A.
       ------------------------------------------------------------*/
    if ( options->RowPerm != NO ) {
        t = SuperLU_timer_();
        if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
            /* Permute the global matrix GA for symbfact() */
            for (i = 0; i < colptr[n]; ++i) {
                irow = rowind[i];
                rowind[i] = perm_r[irow];
            }
        } else if ( !factored && Fact != SamePattern_SameRowPerm ) {
            /* Get a new perm_r[] */
            if ( job == 5 ) {
                /* Allocate storage for scaling factors. */
                if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) )
                    ABORT("SUPERLU_MALLOC fails for R1[]");
                if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) )
                    ABORT("SUPERLU_MALLOC fails for C1[]");
            }

            if ( !iam ) {
                /* Process 0 finds a row permutation for large diagonal. */
                dldperm(job, m, nnz, colptr, rowind, a_GA, perm_r, R1, C1);

                MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
                if ( job == 5 && Equil ) {
                    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
                    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
                }
            } else {
                MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
                if ( job == 5 && Equil ) {
                    MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm );
                    MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm );
                }
            }

#if ( PRNTlevel>=2 )
            dmin = dlamch_("Overflow");
            dsum = 0.0;
            dprod = 1.0;
#endif
            if ( job == 5 ) {
                if ( Equil ) {
                    for (i = 0; i < n; ++i) {
                        R1[i] = exp(R1[i]);
                        C1[i] = exp(C1[i]);
                    }

                    /* Permute the global matrix GA for symbfact(). */
                    for (j = 0; j < n; ++j) {
                        for (i = colptr[j]; i < colptr[j+1]; ++i) {
                            irow = rowind[i];
                            rowind[i] = perm_r[irow];
#if ( PRNTlevel>=2 )
                            if ( rowind[i] == j ) /* New diagonal */
                                dprod *= fabs(a[i]);
#endif
                        }
                    }

                    /* Scale the distributed matrix */
                    irow = fst_row;
                    for (j = 0; j < m_loc; ++j) {
                        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
                            icol = colind[i];
                            a[i] *= R1[irow] * C1[icol];
                        }
                        ++irow;
                    }

                    /* Multiply together the scaling factors. */
                    if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
                    else for (i = 0; i < m; ++i) R[i] = R1[i];
                    if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
                    else for (i = 0; i < n; ++i) C[i] = C1[i];

                    ScalePermstruct->DiagScale = BOTH;
                    rowequ = colequ = 1;

                } else { /* No equilibration. Only permute the global A. */
                    for (i = colptr[0]; i < colptr[n]; ++i) {
                        irow = rowind[i];
                        rowind[i] = perm_r[irow];
                    }
                }
                SUPERLU_FREE (R1);
                SUPERLU_FREE (C1);
            } else { /* job = 2,3,4 */
                for (j = 0; j < n; ++j) {
                    for (i = colptr[j]; i < colptr[j+1]; ++i) {
                        irow = rowind[i];
                        rowind[i] = perm_r[irow];
#if ( PRNTlevel>=2 )
                        if ( rowind[i] == j ) { /* New diagonal */
                            if ( job == 2 || job == 3 )
                                dmin = SUPERLU_MIN(dmin, fabs(a[i]));
                            else if ( job == 4 )
                                dsum += fabs(a[i]);
                            else if ( job == 5 )
                                dprod *= fabs(a[i]);
                        }
#endif
                    }
                }
            }

#if ( PRNTlevel>=2 )
            if ( job == 2 || job == 3 ) {
                if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
            } else if ( job == 4 ) {
                if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
            } else if ( job == 5 ) {
                if ( !iam ) printf("\t product of diagonal %e\n", dprod);
            }
#endif

        } /* else !factored */

        t = SuperLU_timer_() - t;
        stat->utime[ROWPERM] = t;
#if ( PRNTlevel>=1 )
        if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t);
#endif

    } else { /* options->RowPerm == NOROWPERM */
        for (i = 0; i <m; ++i) perm_r[i] = i;
    }

#if ( DEBUGlevel>=1 )
    if ( !iam ) PrintInt10("perm_r",  m, perm_r);
#endif

    if ( !factored || options->IterRefine ) {
        /* Compute norm(A), which will be used to adjust small diagonal. */
        if ( notran ) *(unsigned char *)norm = '1';
        else *(unsigned char *)norm = 'I';
        anorm = pdlangs(norm, A, grid);
#if ( PRNTlevel>=1 )
        if ( !iam ) printf(".. anorm %e\n", anorm);
#endif
    }

    /* ------------------------------------------------------------
       Perform the LU factorization.
       ------------------------------------------------------------*/
    if ( !factored ) {
        t = SuperLU_timer_();
        /*
         * Get column permutation vector perm_c[], according to permc_spec:
         *   permc_spec = NATURAL:  natural ordering
         *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
         *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
         *   permc_spec = COLAMD:   approximate minimum degree column ordering
         *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
         */
        permc_spec = options->ColPerm;
        if ( permc_spec != MY_PERMC && Fact == DOFACT )
            get_perm_c_dist(iam, permc_spec, &GA, perm_c);

        /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
           (a.k.a. column etree), depending on the choice of ColPerm.
           Adjust perm_c[] to be consistent with a postorder of etree.
           Permute columns of A to form A*Pc'. */
        sp_colorder(options, &GA, perm_c, etree, &GAC);

        /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
        {
            int_t *GACcolbeg, *GACcolend, *GACrowind;
            GACstore = GAC.Store;
            GACcolbeg = GACstore->colbeg;
            GACcolend = GACstore->colend;
            GACrowind = GACstore->rowind;
            for (j = 0; j < n; ++j) {
                for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
                    irow = GACrowind[i];
                    GACrowind[i] = perm_c[irow];
                }
            }
        }

        stat->utime[COLPERM] = SuperLU_timer_() - t;

        /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up the
           nonzero data structures which are suitable for supernodal GENP. */
        if ( Fact != SamePattern_SameRowPerm ) {
#if ( PRNTlevel>=1 )
            if ( !iam )
                printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
                       sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
#endif
            t = SuperLU_timer_();
            if ( !(Glu_freeable = (Glu_freeable_t *)
                                  SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
                ABORT("Malloc fails for Glu_freeable.");

            /* Every process does this. */
            iinfo = symbfact(iam, &GAC, perm_c, etree,
                             Glu_persist, Glu_freeable);

            stat->utime[SYMBFAC] = SuperLU_timer_() - t;
            if ( iinfo < 0 ) { /* Successful return */
                QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
#if ( PRNTlevel>=1 )
                if ( !iam ) {
                    printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1);
                    printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]);
                    printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]);
                    printf("\tint %d, short %d, float %d, double %d\n",
                           sizeof(int_t), sizeof(short), sizeof(float),
                           sizeof(double));
                    printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
                           symb_mem_usage.for_lu*1e-6,
                           symb_mem_usage.total*1e-6,
                           symb_mem_usage.expansions);
                }
#endif
            } else {
                if ( !iam ) {
                    fprintf(stderr, "symbfact() error returns %d\n", iinfo);
                    exit(-1);
                }
            }
        }

        /* Apply column permutation to the original distributed A */
        for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];

        /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage.
           NOTE: the row permutation Pc*Pr is applied internally in the
           distribution routine. */
        t = SuperLU_timer_();
        dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct,
                                    Glu_freeable, LUstruct, grid);
        stat->utime[DIST] = SuperLU_timer_() - t;

        /* Deallocate storage used in symbolic factorization. */
        if ( Fact != SamePattern_SameRowPerm ) {
            iinfo = symbfact_SubFree(Glu_freeable);
            SUPERLU_FREE(Glu_freeable);
        }

        /* Perform numerical factorization in parallel. */
        t = SuperLU_timer_();
        pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info);
        stat->utime[FACT] = SuperLU_timer_() - t;

#if ( PRNTlevel>=1 )
        {
            int_t TinyPivots;
            float for_lu, total, max, avg, temp;
            dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage);
            MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
                        1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
            MPI_Reduce( &num_mem_usage.total, &total,
                        1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
            temp = SUPERLU_MAX(symb_mem_usage.total,
                               symb_mem_usage.for_lu +
                               (float)dist_mem_use + num_mem_usage.for_lu);
            temp = SUPERLU_MAX(temp, num_mem_usage.total);
            MPI_Reduce( &temp, &max,
                        1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
            MPI_Reduce( &temp, &avg,
                        1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
            MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
                           MPI_SUM, grid->comm );
            stat->TinyPivots = TinyPivots;
            if ( !iam ) {
                printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n",
                       for_lu*1e-6, total*1e-6);
                printf("\tAll space (MB):"
                       "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
                       avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6);
                printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots);
            }
        }
#endif

    } else if ( options->IterRefine ) { /* options->Fact==FACTORED */
        /* Permute columns of A to form A*Pc' using the existing perm_c.
         * NOTE: rows of A were previously permuted to Pc*A.
         *
         * XSL: NO; this is different now.
         */
        sp_colorder(options, &GA, perm_c, NULL, &GAC); /* ????? */
    } /* if !factored ... */

    /* Destroy GA */
    Destroy_CompCol_Matrix_dist(&GA);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Before solve");
#endif
    /* ------------------------------------------------------------
       Compute the solution matrix X.
       ------------------------------------------------------------*/
    if ( nrhs ) {

        if ( !(b_work = doubleMalloc_dist(n)) )
            ABORT("Malloc fails for b_work[]");

        /* ------------------------------------------------------------
           Scale the right-hand side if equilibration was performed.
           ------------------------------------------------------------*/
        if ( notran ) {
            if ( rowequ ) {
                b_col = B;
                for (j = 0; j < nrhs; ++j) {
                    irow = fst_row;
                    for (i = 0; i < m_loc; ++i) {
                        b_col[i] *= R[irow];
                        ++irow;
                    }
                    b_col += ldb;
                }
            }
        } else if ( colequ ) {
            b_col = B;
            for (j = 0; j < nrhs; ++j) {
                irow = fst_row;
                for (i = 0; i < m_loc; ++i) {
                    b_col[i] *= C[irow];
                    ++irow;
                }
                b_col += ldb;
            }
        }

        /* Save a copy of the right-hand side. */
        ldx = ldb;
        if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) )
            ABORT("Malloc fails for X[]");
        x_col = X;
        b_col = B;
        for (j = 0; j < nrhs; ++j) {
            for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
            x_col += ldx;
            b_col += ldb;
        }

        /* ------------------------------------------------------------
           Solve the linear system.
           ------------------------------------------------------------*/
        if ( options->SolveInitialized == NO ) {
            dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid,
                       SOLVEstruct);
        }

        pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
                fst_row, ldb, nrhs, SOLVEstruct, stat, info);

#if ( DEBUGlevel>=2 )
        printf("\n(%d) .. After pdgstrs(): x =\n", iam);
        for (i = 0; i < m_loc; ++i)
            printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, X[i]);
#endif
        /* ------------------------------------------------------------
           Use iterative refinement to improve the computed solution and
           compute error bounds and backward error estimates for it.
           ------------------------------------------------------------*/
        if ( options->IterRefine ) {
            /* Improve the solution by iterative refinement. */
            t = SuperLU_timer_();
            pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
                    B, ldb, X, ldx, nrhs, SOLVEstruct, berr, stat, info);
            stat->utime[REFINE] = SuperLU_timer_() - t;
        }

        /* Permute the solution matrix B <= Pc'*X. */
        pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc,
                               SOLVEstruct->inv_perm_c,
                               X, ldx, B, ldb, nrhs, grid);
#if ( DEBUGlevel>=2 )
        printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
        for (i = 0; i < m_loc; ++i)
            printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]);
#endif

        /* Transform the solution matrix X to a solution of the original
           system before the equilibration. */
        if ( notran ) {
            if ( colequ ) {
                b_col = B;
                for (j = 0; j < nrhs; ++j) {
                    irow = fst_row;
                    for (i = 0; i < m_loc; ++i) {
                        b_col[i] *= C[irow];
                        ++irow;
                    }
                    b_col += ldb;
                }
            }
        } else if ( rowequ ) {
            b_col = B;
            for (j = 0; j < nrhs; ++j) {
                irow = fst_row;
                for (i = 0; i < m_loc; ++i) {
                    b_col[i] *= R[irow];
                    ++irow;
                }
                b_col += ldb;
            }
        }

        SUPERLU_FREE(b_work);
        SUPERLU_FREE(X);

    } /* end if nrhs != 0 */

#if ( PRNTlevel>=1 )
    if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
#endif

    /* Deallocate storage. */
    if ( Equil && Fact != SamePattern_SameRowPerm ) {
        switch ( ScalePermstruct->DiagScale ) {
        case NOEQUIL:
            SUPERLU_FREE(R);
            SUPERLU_FREE(C);
            break;
        case ROW:
            SUPERLU_FREE(C);
            break;
        case COL:
            SUPERLU_FREE(R);
            break;
        }
    }
    if ( !factored || (factored && options->IterRefine) )
        Destroy_CompCol_Permuted_dist(&GAC);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgssvx()");
#endif

}
Exemplo n.º 30
0
void
pdgstrsL(int_t n, LUstruct_t *LUstruct, 
	 ScalePermstruct_t *ScalePermstruct,
	 gridinfo_t *grid, double *B,
	 int_t m_loc, int_t fst_row, int_t ldb, int nrhs,
	 SOLVEstruct_t *SOLVEstruct,
	 SuperLUStat_t *stat, int *info)
{
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    double alpha = 1.0;
    double zero = 0.0;
    double *lsum;  /* Local running sum of the updates to B-components */
    double *x;     /* X component at step k. */
		    /* NOTE: x and lsum are of same size. */
    double *lusup, *dest;
    double *recvbuf, *tempv;
    double *rtemp; /* Result of full matrix-vector multiply. */
    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
    int_t  iam, kcol, krow, mycol, myrow;
    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
    int_t  nb, nlb, nub, nsupers;
    int_t  *xsup, *supno, *lsub, *usub;
    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
    int_t  Pc, Pr;
    int    knsupc, nsupr;
    int    ldalsum;   /* Number of lsum entries locally owned. */
    int    maxrecvsz, p, pi;
    int_t  **Lrowind_bc_ptr;
    double **Lnzval_bc_ptr;
    MPI_Status status;
#ifdef ISEND_IRECV
    MPI_Request *send_req, recv_req;
#endif
    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;

    /*-- Counts used for L-solve --*/
    int_t  *fmod;         /* Modification count for L-solve --
                             Count the number of local block products to
                             be summed into lsum[lk]. */
    int_t  **fsendx_plist = Llu->fsendx_plist;
    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
                             from processes in this row. 
                             It is only valid on the diagonal processes. */
    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
    int_t  nleaf = 0, nroot = 0;

    double t;
#if ( DEBUGlevel>=2 )
    int_t Ublocks = 0;
#endif

    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
 
    t = SuperLU_timer_();

    /* Test input parameters. */
    *info = 0;
    if ( n < 0 ) *info = -1;
    else if ( nrhs < 0 ) *info = -9;
    if ( *info ) {
	pxerbla("PDGSTRS", grid, -*info);
	return;
    }
	
    /*
     * Initialization.
     */
    iam = grid->iam;
    Pc = grid->npcol;
    Pr = grid->nprow;
    myrow = MYROW( iam, grid );
    mycol = MYCOL( iam, grid );
    xsup = Glu_persist->xsup;
    supno = Glu_persist->supno;
    nsupers = supno[n-1] + 1;
    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgstrsL()");
#endif

    stat->ops[SOLVE] = 0.0;
    Llu->SolveMsgSent = 0;

    /* Save the count to be altered so it can be used by
       subsequent call to PDGSTRS. */
    if ( !(fmod = intMalloc_dist(nlb)) )
	ABORT("Calloc fails for fmod[].");
    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
    if ( !(frecv = intMalloc_dist(nlb)) )
	ABORT("Malloc fails for frecv[].");
    Llu->frecv = frecv;

#ifdef ISEND_IRECV
    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
	ABORT("Malloc fails for send_req[].");
#endif

#ifdef _CRAY
    ftcs1 = _cptofcd("L", strlen("L"));
    ftcs2 = _cptofcd("N", strlen("N"));
    ftcs3 = _cptofcd("U", strlen("U"));
#endif


    /* Obtain ilsum[] and ldalsum for process column 0. */
    ilsum = Llu->ilsum;
    ldalsum = Llu->ldalsum;

    /* Allocate working storage. */
    knsupc = sp_ienv_dist(3);
    maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
    if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) )
	ABORT("Calloc fails for lsum[].");
    if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) )
	ABORT("Malloc fails for x[].");
    if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
	ABORT("Malloc fails for recvbuf[].");
    if ( !(rtemp = doubleCalloc_dist(maxrecvsz)) )
	ABORT("Malloc fails for rtemp[].");

    
    /*---------------------------------------------------
     * Forward solve Ly = b.
     *---------------------------------------------------*/
    /* Redistribute B into X on the diagonal processes. */
    pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, 
			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);

    /* Set up the headers in lsum[]. */
    ii = 0;
    for (k = 0; k < nsupers; ++k) {
	knsupc = SuperSize( k );
	krow = PROW( k, grid );
	if ( myrow == krow ) {
	    lk = LBi( k, grid );   /* Local block number. */
	    il = LSUM_BLK( lk );
	    lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
	}
	ii += knsupc;
    }

    /*
     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
     */
    {
	superlu_scope_t *scp = &grid->rscp;

	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
	for (k = 0; k < nsupers; ++k) {
	    krow = PROW( k, grid );
	    if ( myrow == krow ) {
		lk = LBi( k, grid );    /* local block number */
		kcol = PCOL( k, grid );
		if ( mycol != kcol && fmod[lk] )
		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
	    }
	}
	/*PrintInt10("mod_bit", nlb, mod_bit);*/
	
#if ( PROFlevel>=2 )
	t_reduce_tmp = SuperLU_timer_();
#endif
	/* Every process receives the count, but it is only useful on the
	   diagonal processes.  */
	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );

#if ( PROFlevel>=2 )
	t_reduce += SuperLU_timer_() - t_reduce_tmp;
#endif

	for (k = 0; k < nsupers; ++k) {
	    krow = PROW( k, grid );
	    if ( myrow == krow ) {
		lk = LBi( k, grid );    /* local block number */
		kcol = PCOL( k, grid );
		if ( mycol == kcol ) { /* diagonal process */
		    nfrecvmod += frecv[lk];
		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
		}
	    }
	}

    }

    /* ---------------------------------------------------------
       Solve the leaf nodes first by all the diagonal processes.
       --------------------------------------------------------- */
#if ( DEBUGlevel>=2 )
    printf("(%2d) nleaf %4d\n", iam, nleaf);
#endif
    for (k = 0; k < nsupers && nleaf; ++k) {
	krow = PROW( k, grid );
	kcol = PCOL( k, grid );
	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
	    knsupc = SuperSize( k );
	    lk = LBi( k, grid );
	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
		ii = X_BLK( lk );
		lk = LBj( k, grid ); /* Local block number, column-wise. */
		lsub = Lrowind_bc_ptr[lk];
		lusup = Lnzval_bc_ptr[lk];
		nsupr = lsub[1];
#ifdef _CRAY
		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
		      lusup, &nsupr, &x[ii], &knsupc);
#elif defined (USE_VENDOR_BLAS)
		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
#else
		dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
		       lusup, &nsupr, &x[ii], &knsupc);
#endif
		stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
		--nleaf;
#if ( DEBUGlevel>=2 )
		printf("(%2d) Solve X[%2d]\n", iam, k);
#endif
		
		/*
		 * Send Xk to process column Pc[k].
		 */
		for (p = 0; p < Pr; ++p) {
		    if ( fsendx_plist[lk][p] != EMPTY ) {
			pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
				   MPI_DOUBLE, pi, Xk, grid->comm,
                                   &send_req[Llu->SolveMsgSent++]);
#else
			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
				 MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#if ( DEBUGlevel>=2 )
			printf("(%2d) Sent X[%2.0f] to P %2d\n",
			       iam, x[ii-XK_H], pi);
#endif
		    }
		}
		/*
		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
		 */
		nb = lsub[0] - 1;
		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
		luptr = knsupc; /* Skip diagonal block L(k,k). */
		
		dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
			   send_req, stat);
	    }
	} /* if diagonal process ... */
    } /* for k ... */

    /* -----------------------------------------------------------
       Compute the internal nodes asynchronously by all processes.
       ----------------------------------------------------------- */
#if ( DEBUGlevel>=2 )
    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
	   iam, nfrecvx, nfrecvmod, nleaf);
#endif

    while ( nfrecvx || nfrecvmod ) { /* While not finished. */

	/* Receive a message. */
#ifdef ISEND_IRECV
	/* -MPI- FATAL: Remote protocol queue full */
	MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE,
                 MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req );
	MPI_Wait( &recv_req, &status );
#else
	MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE,
                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
#endif

	k = *recvbuf;

#if ( DEBUGlevel>=2 )
	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
#endif
	
	switch ( status.MPI_TAG ) {
	  case Xk:
	      --nfrecvx;
	      lk = LBj( k, grid ); /* Local block number, column-wise. */
	      lsub = Lrowind_bc_ptr[lk];
	      lusup = Lnzval_bc_ptr[lk];
	      if ( lsub ) {
		  nb   = lsub[0];
		  lptr = BC_HEADER;
		  luptr = 0;
		  knsupc = SuperSize( k );

		  /*
		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
		   */
		  dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
			     send_req, stat);
	      } /* if lsub */

	      break;

	  case LSUM: /* Receiver must be a diagonal process */
	      --nfrecvmod;
	      lk = LBi( k, grid ); /* Local block number, row-wise. */
	      ii = X_BLK( lk );
	      knsupc = SuperSize( k );
	      tempv = &recvbuf[LSUM_H];
	      RHS_ITERATE(j) {
		  for (i = 0; i < knsupc; ++i)
		      x[i + ii + j*knsupc] += tempv[i + j*knsupc];
	      }

	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
		  lk = LBj( k, grid ); /* Local block number, column-wise. */
		  lsub = Lrowind_bc_ptr[lk];
		  lusup = Lnzval_bc_ptr[lk];
		  nsupr = lsub[1];
#ifdef _CRAY
		  STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
			lusup, &nsupr, &x[ii], &knsupc);
#elif defined (USE_VENDOR_BLAS)
		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
#else
		  dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
			 lusup, &nsupr, &x[ii], &knsupc);
#endif
		  stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
#if ( DEBUGlevel>=2 )
		  printf("(%2d) Solve X[%2d]\n", iam, k);
#endif
		
		  /*
		   * Send Xk to process column Pc[k].
		   */
		  kcol = PCOL( k, grid );
		  for (p = 0; p < Pr; ++p) {
		      if ( fsendx_plist[lk][p] != EMPTY ) {
			  pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
                                     MPI_DOUBLE, pi, Xk, grid->comm,
                                     &send_req[Llu->SolveMsgSent++]);
#else
			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
				    MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#if ( DEBUGlevel>=2 )
			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
				 iam, x[ii-XK_H], pi);
#endif
		      }
                  }
		  /*
		   * Perform local block modifications.
		   */
		  nb = lsub[0] - 1;
		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
		  luptr = knsupc; /* Skip diagonal block L(k,k). */

		  dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
			     fmod, nb, lptr, luptr, xsup, grid, Llu,
			     send_req, stat);
	      } /* if */

	      break;

#if ( DEBUGlevel>=2 )
	    default:
	      printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
	      break;
#endif
	  } /* switch */

    } /* while not finished ... */


#if ( PRNTlevel>=2 )
    t = SuperLU_timer_() - t;
    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
    t = SuperLU_timer_();
#endif

#if ( DEBUGlevel==2 )
    {
      printf("(%d) .. After L-solve: y =\n", iam);
      for (i = 0, k = 0; k < nsupers; ++k) {
	  krow = PROW( k, grid );
	  kcol = PCOL( k, grid );
	  if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
	      knsupc = SuperSize( k );
	      lk = LBi( k, grid );
	      ii = X_BLK( lk );
	      for (j = 0; j < knsupc; ++j)
		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
	      fflush(stdout);
	  }
	  MPI_Barrier( grid->comm );
      }
    }
#endif

    SUPERLU_FREE(fmod);
    SUPERLU_FREE(frecv);
    SUPERLU_FREE(rtemp);

#ifdef ISEND_IRECV
    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
    Llu->SolveMsgSent = 0;
#endif

    /* Re-distribute X on the diagonal processes to B distributed on all
       the processes.   */
    pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);


    /* Deallocate storage. */
    SUPERLU_FREE(lsum);
    SUPERLU_FREE(x);
    SUPERLU_FREE(recvbuf);
#ifdef ISEND_IRECV
    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);
    SUPERLU_FREE(send_req);
#endif

    stat->utime[SOLVE] = SuperLU_timer_() - t;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgstrsL()");
#endif

} /* PDGSTRS */