Example #1
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *
 * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
 *
 * Arguments
 * =========
 *
 * n        (input) int
 *          Dimension of the matrix.
 *
 * LUstruct (input) LUstruct_t*
 *          The data structures to store the distributed L and U factors.
 *          See superlu_zdefs.h for its definition.
 *
 * grid     (input) gridinfo_t*
 *          The 2D process mesh. It contains the MPI communicator, the number
 *          of process rows (NPROW), the number of process columns (NPCOL),
 *          and my process rank. It is an input argument to all the
 *          parallel routines.
 *
 * diagU    (output) doublecomplex*, dimension (n)
 *          The main diagonal of matrix U.
 *          On exit, it is available on all processes.
 *
 *
 * Note
 * ====
 *
 * The diagonal blocks of the L and U matrices are stored in the L
 * data structures, and are on the diagonal processes of the
 * 2D process grid.
 *
 * This routine is modified from gather_diag_to_all() in pzgstrs_Bglobal.c.
 * </pre>
 */
void pzGetDiagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
                  doublecomplex *diagU)
{

    int_t *xsup;
    int iam, knsupc, pkk;
    int nsupr; /* number of rows in the block L(:,k) (LDA) */
    int_t i, j, jj, k, lk, lwork, nsupers, p;
    int_t num_diag_procs, *diag_procs, *diag_len;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    doublecomplex *zblock, *zwork, *lusup;

    iam = grid->iam;
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;

    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
		   &diag_procs, &diag_len);
    jj = diag_len[0];
    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
    if ( !(zwork = doublecomplexMalloc_dist(jj)) ) ABORT("Malloc fails for zwork[]");

    for (p = 0; p < num_diag_procs; ++p) {
	pkk = diag_procs[p];
	if ( iam == pkk ) {
	    /* Copy diagonal into buffer zwork[]. */
	    lwork = 0;
	    for (k = p; k < nsupers; k += num_diag_procs) {
		knsupc = SuperSize( k );
		lk = LBj( k, grid );
		nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
		lusup = Llu->Lnzval_bc_ptr[lk];
		for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */
		    zwork[lwork+i] = lusup[i*(nsupr+1)];
		lwork += knsupc;
	    }
	    MPI_Bcast( zwork, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
	} else {
	    MPI_Bcast( zwork, diag_len[p], SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm );
	}

	/* Scatter zwork[] into global diagU vector. */
	lwork = 0;
	for (k = p; k < nsupers; k += num_diag_procs) {
	    knsupc = SuperSize( k );
	    zblock = &diagU[FstBlockC( k )];
	    for (i = 0; i < knsupc; ++i) zblock[i] = zwork[lwork+i];
	    lwork += knsupc;
	}
    } /* for p = ... */

    SUPERLU_FREE(diag_procs);
    SUPERLU_FREE(diag_len);
    SUPERLU_FREE(zwork);
}
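/*
 * Usage sketch (not from the original file): how a caller might obtain
 * diag(U) after a factorization. It assumes grid, LUstruct and n have
 * already been set up elsewhere (e.g., by superlu_gridinit and pzgssvx);
 * only the diagU handling below is specific to pzGetDiagU.
 */
void example_get_diagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid)
{
    doublecomplex *diagU;
    if ( !(diagU = doublecomplexMalloc_dist(n)) )
	ABORT("Malloc fails for diagU[]");
    pzGetDiagU(n, LUstruct, grid, diagU);
    /* On return, diagU[0..n-1] holds the diagonal of U on every process. */
    SUPERLU_FREE(diagU);
}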
Example #2
/*! \brief Initialize the data structure for the solution phase.
 */
int zSolveInit(superlu_options_t *options, SuperMatrix *A, 
	       int_t perm_r[], int_t perm_c[], int_t nrhs,
	       LUstruct_t *LUstruct, gridinfo_t *grid,
	       SOLVEstruct_t *SOLVEstruct)
{
    int_t *row_to_proc, *inv_perm_c, *itemp;
    NRformat_loc *Astore;
    int_t        i, fst_row, m_loc, p;
    int          procs;

    /* prototypes */
    extern int_t pxgstrs_init(int_t, int_t, int_t, int_t,
	                      int_t [], int_t [], gridinfo_t *grid,
	                      Glu_persist_t *, SOLVEstruct_t *);

    Astore = (NRformat_loc *) A->Store;
    fst_row = Astore->fst_row;
    m_loc = Astore->m_loc;
    procs = grid->nprow * grid->npcol;
    
    if ( !(row_to_proc = intMalloc_dist(A->nrow)) )
	ABORT("Malloc fails for row_to_proc[]");
    SOLVEstruct->row_to_proc = row_to_proc;
    if ( !(inv_perm_c = intMalloc_dist(A->ncol)) )
        ABORT("Malloc fails for inv_perm_c[].");
    for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i;
    SOLVEstruct->inv_perm_c = inv_perm_c;

    /* ------------------------------------------------------------
       EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION.
       SET UP THE MAPPING BETWEEN ROWS AND PROCESSES.
       
       NOTE: For those processes that do not own any row, it must
             be set so that fst_row == A->nrow.
       ------------------------------------------------------------*/
    if ( !(itemp = intMalloc_dist(procs+1)) )
        ABORT("Malloc fails for itemp[]");
    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
		  grid->comm);
    itemp[procs] = A->nrow;
    for (p = 0; p < procs; ++p) {
        for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p;
    }
#if ( DEBUGlevel>=2 )
    if ( !grid->iam ) {
      printf("fst_row = %d\n", fst_row);
      PrintInt10("row_to_proc", A->nrow, row_to_proc);
      PrintInt10("inv_perm_c", A->ncol, inv_perm_c);
    }
#endif
    SUPERLU_FREE(itemp);

#if 0
    /* Compute the mapping between rows and processes. */
    /* XSL NOTE: What happens if # of mapped processes is smaller
       than total Procs?  For the processes without any row, let
       fst_row be EMPTY (-1). Make sure this case works! */
    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
		  grid->comm);
    itemp[procs] = n;
    for (p = 0; p < procs; ++p) {
        j = itemp[p];
	if ( j != EMPTY ) {
	    k = itemp[p+1];
	    if ( k == EMPTY ) k = n;
	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
	}
    }
#endif    

    get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
		   &SOLVEstruct->num_diag_procs,
		   &SOLVEstruct->diag_procs,
		   &SOLVEstruct->diag_len);

    if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
        ABORT("Malloc fails for gstrs_comm[]");
    pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, 
		 LUstruct->Glu_persist, SOLVEstruct);

    if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *)
           SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) )
        ABORT("Malloc fails for gsmv_comm[]");
    SOLVEstruct->A_colind_gsmv = NULL;
    
    options->SolveInitialized = YES;
    return 0;
} /* zSolveInit */
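/*
 * Sketch (not from the original file) of the row-to-process mapping that
 * zSolveInit builds above: the gathered first-row offsets are hard-coded
 * here in place of MPI_Allgather, but the prefix fill is identical.
 * Process p owns rows [itemp[p], itemp[p+1]).
 */
#include <stdio.h>

int example_row_to_proc(void)
{
    int nrow = 10, procs = 3;
    int itemp[4] = {0, 4, 7, 10};  /* fst_row per process + sentinel nrow */
    int row_to_proc[10];
    int p, i;
    for (p = 0; p < procs; ++p)
	for (i = itemp[p]; i < itemp[p+1]; ++i) row_to_proc[i] = p;
    for (i = 0; i < nrow; ++i)
	printf("row %d -> process %d\n", i, row_to_proc[i]);
    return 0;
}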
void
pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
		  gridinfo_t *grid, doublecomplex *B, int_t ldb,
		  doublecomplex *X, int_t ldx, int nrhs, double *berr,
		  SuperLUStat_t *stat, int *info)
{
/* 
 * Purpose
 * =======
 *
 * pzgsrfs_ABXglobal improves the computed solution to a system of linear   
 * equations and provides error bounds and backward error estimates
 * for the solution. 
 *
 * Arguments
 * =========
 *
 * n      (input) int (global)
 *        The order of the system of linear equations.
 *
 * A      (input) SuperMatrix*
 *	  The original matrix A, or the scaled A if equilibration was done.
 *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
 *        are permutation matrices. The type of A can be:
 *        Stype = NCP; Dtype = Z; Mtype = GE.
 *
 *        NOTE: Currently, A must reside in all processes when calling
 *              this routine.
 *
 * anorm  (input) double
 *        The norm of the original matrix A, or the scaled A if
 *        equilibration was done.
 *
 * LUstruct (input) LUstruct_t*
 *        The distributed data structures storing L and U factors.
 *        The L and U factors are obtained from pzgstrf for
 *        the possibly scaled and permuted matrix A.
 *        See superlu_zdefs.h for the definition of 'LUstruct_t'.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh. It contains the MPI communicator, the number
 *        of process rows (NPROW), the number of process columns (NPCOL),
 *        and my process rank. It is an input argument to all the
 *        parallel routines.
 *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *        See superlu_zdefs.h for the definition of 'gridinfo_t'.
 *
 * B      (input) doublecomplex* (global)
 *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
 *        and row permuted system.
 *       
 *        NOTE: Currently, B must reside on all processes when calling
 *              this routine.
 *
 * ldb    (input) int (global)
 *        Leading dimension of matrix B.
 *
 * X      (input/output) doublecomplex* (global)
 *        On entry, the solution matrix X, as computed by pzgstrs.
 *        On exit, the improved solution matrix X.
 *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
 *        in order to obtain the solution to the original system.
 *
 *        NOTE: Currently, X must reside on all processes when calling
 *              this routine.
 *
 * ldx    (input) int (global)
 *        Leading dimension of matrix X.
 *
 * nrhs   (input) int
 *        Number of right-hand sides.
 *
 * berr   (output) double*, dimension (nrhs)
 *         The componentwise relative backward error of each solution   
 *         vector X(j) (i.e., the smallest relative change in   
 *         any element of A or B that makes X(j) an exact solution).
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics about the refinement steps.
 *        See util.h for the definition of SuperLUStat_t.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        
 * Internal Parameters   
 * ===================   
 *
 * ITMAX is the maximum number of steps of iterative refinement.   
 *
 */

#define ITMAX 20
    
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    /* 
     * Data structures used by matrix-vector multiply routine.
     */
    int_t  N_update; /* Number of variables updated on this process */
    int_t  *update;  /* vector elements (global index) updated 
			on this processor.                     */
    int_t  *bindx;
    doublecomplex *val;
    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
			       matrix-vector multiply.  */
    /*-- end data structures for matrix-vector multiply --*/
    doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col,
                  *x_trs, *dx_trs;
    double *rwork;
    int_t count, ii, j, jj, k, knsupc, lk, lwork,
          nprow, nsupers, notran, nz, p;
    int   i, iam, pkk;
    int_t *ilsum, *xsup;
    double eps, lstres;
    double s, safmin, safe1, safe2;

    /* Diagonal-process bookkeeping. */
    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
    int_t *diag_len; /* Length of the X vector on diagonal processes. */

    /*-- Function prototypes --*/
    extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *,
			 doublecomplex *, int, SuperLUStat_t *, int *);
    extern double dlamch_(char *);
    
    /* Test the input parameters. */
    *info = 0;
    if ( n < 0 ) *info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE )
	*info = -2;
    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10;
    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -12;
    else if ( nrhs < 0 ) *info = -13;
    if (*info != 0) {
	i = -(*info);
	xerbla_("pzgsrfs_ABXglobal", &i);
	return;
    }

    /* Quick return if possible. */
    if ( n == 0 || nrhs == 0 ) {
	return;
    }

    /* Initialization. */
    iam = grid->iam;
    nprow = grid->nprow;
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    ilsum = Llu->ilsum;
    notran = 1;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()");
#endif

    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
		   &diag_procs, &diag_len);
#if ( PRNTlevel>=1 )
    if ( !iam ) {
	printf(".. number of diag processes = %d\n", num_diag_procs);
	PrintInt10("diag_procs", num_diag_procs, diag_procs);
	PrintInt10("diag_len", num_diag_procs, diag_len);
    }
#endif

    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
	ABORT("Calloc fails for mv_sup_to_proc[]");

    pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
			  &val, &bindx, mv_sup_to_proc);

    i = CEILING( nsupers, nprow ); /* Number of local block rows */
    ii = Llu->ldalsum + i * XK_H;
    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
    jj = diag_len[0];
    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
    jj = SUPERLU_MAX( jj, N_update );
    lwork = N_update         /* For ax and R */
	  + ii               /* For dx_trs */
	  + ii               /* For x_trs */
          + k                /* For b */
	  + jj;              /* for temp */
    if ( !(work = doublecomplexMalloc_dist(lwork)) )
	ABORT("Malloc fails for work[]");
    ax = R = work;
    dx_trs = work + N_update;
    x_trs  = dx_trs + ii;
    b      = x_trs + ii;
    temp   = b + k;
    if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) )
	ABORT("Malloc fails for rwork[]");

#if ( DEBUGlevel>=2 )
    {
	doublecomplex *dwork = doublecomplexMalloc_dist(n);
	for (i = 0; i < n; ++i) {
	    if ( i & 1 ) dwork[i].r = 1.;
	    else dwork[i].r = 2.;
	    dwork[i].i = 0.;
	}
	/* Check correctness of matrix-vector multiply. */
	pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
	PrintDouble5("Mult A*x", N_update, ax);
	SUPERLU_FREE(dwork);
    }
#endif


    /* nz = an upper bound on the number of nonzeros in each row of A, plus 1 */
    nz     = A->ncol + 1;
    eps    = dlamch_("Epsilon");
    safmin = dlamch_("Safe minimum");
    safe1  = nz * safmin;
    safe2  = safe1 / eps;

#if ( DEBUGlevel>=1 )
    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
		       eps, anorm, safe1, safe2);
#endif

    /* Do for each right-hand side ... */
    for (j = 0; j < nrhs; ++j) {
	count = 0;
	lstres = 3.;

	/* Copy X into x on the diagonal processes. */
	B_col = &B[j*ldb];
	X_col = &X[j*ldx];
	for (p = 0; p < num_diag_procs; ++p) {
	    pkk = diag_procs[p];
	    if ( iam == pkk ) {
		for (k = p; k < nsupers; k += num_diag_procs) {
		    knsupc = SuperSize( k );
		    lk = LBi( k, grid );
		    ii = ilsum[lk] + (lk+1)*XK_H;
		    jj = FstBlockC( k );
		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
		    dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */
		}
	    }
	}
	/* Copy B into b distributed the same way as matrix-vector product. */
	if ( N_update ) ii = update[0];
	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];

	while (1) { /* Loop until stopping criterion is satisfied. */

	    /* Compute residual R = B - op(A) * X,   
	       where op(A) = A, A**T, or A**H, depending on TRANS. */

	    /* Matrix-vector multiply. */
	    pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
	    
	    /* Compute residual. */
	    for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]);

	    /* Compute abs(op(A))*abs(X) + abs(B). */
	    pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork);
	    for (i = 0; i < N_update; ++i) rwork[i] += z_abs1(&b[i]);
	    
	    s = 0.0;
	    for (i = 0; i < N_update; ++i) {
		if ( rwork[i] > safe2 )
		    s = SUPERLU_MAX(s, z_abs1(&R[i]) / rwork[i]);
		else
		    s = SUPERLU_MAX(s, (z_abs1(&R[i])+safe1)/(rwork[i]+safe1));
	    }
	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );
		
#if ( PRNTlevel>= 1 )
	    if ( !iam )
		printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]);
#endif
	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
		/* Compute new dx. */
		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
				   mv_sup_to_proc, dx_trs);
		pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);

		/* Update solution. */
		for (p = 0; p < num_diag_procs; ++p) 
		    if ( iam == diag_procs[p] )
			for (k = p; k < nsupers; k += num_diag_procs) {
			    lk = LBi( k, grid );
			    ii = ilsum[lk] + (lk+1)*XK_H;
			    knsupc = SuperSize( k );
			    for (i = 0; i < knsupc; ++i)
				z_add(&x_trs[i + ii], &x_trs[i + ii], 
				      &dx_trs[i + ii]);
			}
		lstres = berr[j];
		++count;
		/* Transfer x_trs (on diagonal processes) into X
		   (on all processes). */
		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, 
					num_diag_procs, diag_procs, diag_len,
					X_col, temp);
	    } else {
		break;
	    }
	} /* end while */

	stat->RefineSteps = count;

    } /* for j ... */


    /* Deallocate storage used by matrix-vector multiplication. */
    SUPERLU_FREE(diag_procs);
    SUPERLU_FREE(diag_len);
    if ( N_update ) {
	SUPERLU_FREE(update);
	SUPERLU_FREE(bindx);
	SUPERLU_FREE(val);
    }
    SUPERLU_FREE(mv_sup_to_proc);
    SUPERLU_FREE(work);
    SUPERLU_FREE(rwork);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()");
#endif

} /* PZGSRFS_ABXGLOBAL */
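/*
 * Sketch (not from the original file) of the componentwise backward error
 * used as the stopping criterion above:
 *     berr = max_i |r_i| / ( |A|*|x| + |b| )_i,
 * with the safe1/safe2 guards protecting tiny denominators. Serial and
 * dense; the 2x2 system below is hypothetical.
 */
#include <math.h>
#include <float.h>
#include <stdio.h>

int example_berr(void)
{
    double A[2][2] = { {4., 1.}, {2., 3.} };
    double b[2] = {1., 2.};
    double x[2] = {0.1, 0.59};           /* slightly perturbed solution */
    double eps = DBL_EPSILON, safmin = DBL_MIN;
    double safe1 = 3 * safmin;           /* nz = n + 1 = 3 */
    double safe2 = safe1 / eps;
    double berr = 0., r, denom, s;
    int i, j;
    for (i = 0; i < 2; ++i) {
	r = b[i]; denom = fabs(b[i]);
	for (j = 0; j < 2; ++j) {
	    r -= A[i][j] * x[j];
	    denom += fabs(A[i][j]) * fabs(x[j]);
	}
	s = (denom > safe2) ? fabs(r) / denom
	                    : (fabs(r) + safe1) / (denom + safe1);
	if (s > berr) berr = s;
    }
    printf("componentwise backward error = %e\n", berr);
    return 0;
}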
void
pdgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct,
		  gridinfo_t *grid, double *B, int_t ldb, double *X, int_t ldx,
		  int nrhs, double *berr, SuperLUStat_t *stat, int *info)
{
/*
 * Purpose
 * =======
 *
 * pdgsrfs_ABXglobal is the double precision real analogue of
 * pzgsrfs_ABXglobal above: it improves the computed solution to the
 * linear system and provides componentwise backward error estimates.
 * Its arguments mirror those documented for pzgsrfs_ABXglobal.
 */

#define ITMAX 20

    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    /*
     * Data structures used by matrix-vector multiply routine.
     */
    int_t  N_update; /* Number of variables updated on this process */
    int_t  *update;  /* vector elements (global index) updated
			on this processor.                     */
    int_t  *bindx;
    double *val;
    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
			       matrix-vector multiply.  */
    /*-- end data structures for matrix-vector multiply --*/
    double *b, *ax, *R, *B_col, *temp, *work, *X_col,
           *x_trs, *dx_trs;
    int_t count, ii, j, jj, k, knsupc, lk, lwork,
          nprow, nsupers, nz, p;
    int   i, iam, pkk;
    int_t *ilsum, *xsup;
    double eps, lstres;
    double s, safmin, safe1, safe2;

    /* Diagonal-process bookkeeping. */
    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
    int_t *diag_len; /* Length of the X vector on diagonal processes. */

    /*-- Function prototypes --*/
    extern void pdgstrs1(int_t, LUstruct_t *, gridinfo_t *,
			 double *, int, SuperLUStat_t *, int *);

    /* Test the input parameters. */
    *info = 0;
    if ( n < 0 ) *info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      A->Stype != SLU_NCP || A->Dtype != SLU_D || A->Mtype != SLU_GE )
	*info = -2;
    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10;
    else if ( ldx < SUPERLU_MAX(0, n) )	*info = -12;
    else if ( nrhs < 0 ) *info = -13;
    if (*info != 0) {
	i = -(*info);
	pxerr_dist("pdgsrfs_ABXglobal", grid, i);
	return;
    }

    /* Quick return if possible. */
    if ( n == 0 || nrhs == 0 ) {
	return;
    }

    /* Initialization. */
    iam = grid->iam;
    nprow = grid->nprow;
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    ilsum = Llu->ilsum;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgsrfs_ABXglobal()");
#endif

    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
		   &diag_procs, &diag_len);
#if ( PRNTlevel>=1 )
    if ( !iam ) {
	printf(".. number of diag processes = " IFMT "\n", num_diag_procs);
	PrintInt10("diag_procs", num_diag_procs, diag_procs);
	PrintInt10("diag_len", num_diag_procs, diag_len);
    }
#endif

    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
	ABORT("Calloc fails for mv_sup_to_proc[]");

    pdgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
		          &val, &bindx, mv_sup_to_proc);

    i = CEILING( nsupers, nprow ); /* Number of local block rows */
    ii = Llu->ldalsum + i * XK_H;
    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
    jj = diag_len[0];
    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
    jj = SUPERLU_MAX( jj, N_update );
    lwork = N_update         /* For ax and R */
	  + ii               /* For dx_trs */
	  + ii               /* For x_trs */
          + k                /* For b */
	  + jj;              /* for temp */
    if ( !(work = doubleMalloc_dist(lwork)) )
	ABORT("Malloc fails for work[]");
    ax = R = work;
    dx_trs = work + N_update;
    x_trs  = dx_trs + ii;
    b      = x_trs + ii;
    temp   = b + k;

#if ( DEBUGlevel>=2 )
    {
	double *dwork = doubleMalloc_dist(n);
	for (i = 0; i < n; ++i) {
	    if ( i & 1 ) dwork[i] = 1.;
	    else dwork[i] = 2.;
        }
	/* Check correctness of matrix-vector multiply. */
	pdgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
	PrintDouble5("Mult A*x", N_update, ax);
	SUPERLU_FREE(dwork);
    }
#endif


    /* nz = an upper bound on the number of nonzeros in each row of A, plus 1 */
    nz     = A->ncol + 1;
    eps    = dmach_dist("Epsilon");
    safmin = dmach_dist("Safe minimum");

    /* Set SAFE1 essentially to be the underflow threshold times the
       number of additions in each row. */
    safe1  = nz * safmin;
    safe2  = safe1 / eps;

#if ( DEBUGlevel>=1 )
    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
		       eps, anorm, safe1, safe2);
#endif

    /* Do for each right-hand side ... */
    for (j = 0; j < nrhs; ++j) {
	count = 0;
	lstres = 3.;

	/* Copy X into x on the diagonal processes. */
	B_col = &B[j*ldb];
	X_col = &X[j*ldx];
	for (p = 0; p < num_diag_procs; ++p) {
	    pkk = diag_procs[p];
	    if ( iam == pkk ) {
		for (k = p; k < nsupers; k += num_diag_procs) {
		    knsupc = SuperSize( k );
		    lk = LBi( k, grid );
		    ii = ilsum[lk] + (lk+1)*XK_H;
		    jj = FstBlockC( k );
		    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
		    dx_trs[ii-XK_H] = k;/* Block number prepended in header. */
		}
	    }
	}
	/* Copy B into b distributed the same way as matrix-vector product. */
        if ( N_update ) ii = update[0];
	for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];

	while (1) { /* Loop until stopping criterion is satisfied. */

	    /* Compute residual R = B - op(A) * X,
	       where op(A) = A, A**T, or A**H, depending on TRANS. */

	    /* Matrix-vector multiply. */
	    pdgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);

	    /* Compute residual. */
	    for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i];

	    /* Compute abs(op(A))*abs(X) + abs(B). */
	    pdgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp);
	    for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]);

	    s = 0.0;
	    for (i = 0; i < N_update; ++i) {
		if ( temp[i] > safe2 ) {
		    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
		} else if ( temp[i] != 0.0 ) {
                    /* Adding SAFE1 to the numerator guards against
                       spuriously zero residuals (underflow). */
		    s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]);
                }
                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
                   we know the true residual also must be exactly 0.0. */
	    }
	    MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm );

#if ( PRNTlevel>= 1 )
	    if ( !iam )
		printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
#endif
	    if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
		/* Compute new dx. */
		redist_all_to_diag(n, R, Glu_persist, Llu, grid,
				   mv_sup_to_proc, dx_trs);
		pdgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);

		/* Update solution. */
		for (p = 0; p < num_diag_procs; ++p)
		    if ( iam == diag_procs[p] )
			for (k = p; k < nsupers; k += num_diag_procs) {
			    lk = LBi( k, grid );
			    ii = ilsum[lk] + (lk+1)*XK_H;
			    knsupc = SuperSize( k );
			    for (i = 0; i < knsupc; ++i)
				x_trs[i + ii] += dx_trs[i + ii];
			}
		lstres = berr[j];
		++count;
		/* Transfer x_trs (on diagonal processes) into X
		   (on all processes). */
		gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid,
					num_diag_procs, diag_procs, diag_len,
					X_col, temp);
	    } else {
		break;
	    }
	} /* end while */

	stat->RefineSteps = count;

    } /* for j ... */


    /* Deallocate storage used by matrix-vector multiplication. */
    SUPERLU_FREE(diag_procs);
    SUPERLU_FREE(diag_len);
    if ( N_update ) {
	SUPERLU_FREE(update);
	SUPERLU_FREE(bindx);
	SUPERLU_FREE(val);
    }
    SUPERLU_FREE(mv_sup_to_proc);
    SUPERLU_FREE(work);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgsrfs_ABXglobal()");
#endif

} /* PDGSRFS_ABXGLOBAL */
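/*
 * Sketch (not from the original file) of the refinement control flow that
 * pzgsrfs_ABXglobal and pdgsrfs_ABXglobal share, stripped of the MPI
 * distribution. matvec(), backward_error() and lu_solve() are hypothetical
 * stand-ins for pdgsmv_AXglobal, the berr computation, and pdgstrs1.
 */
extern void   matvec(int n, const double x[], double ax[]);
extern double backward_error(int n, const double r[], const double b[],
                             const double x[]);
extern void   lu_solve(int n, const double r[], double dx[]);

void example_refine(int n, const double b[], double x[],
                    double r[], double dx[], double eps)
{
    double lstres = 3., berr;
    int i, count = 0;
    while (1) {
	matvec(n, x, r);                           /* r = A*x          */
	for (i = 0; i < n; ++i) r[i] = b[i] - r[i];
	berr = backward_error(n, r, b, x);
	if ( berr > eps && berr * 2 <= lstres && count < 20 /* ITMAX */ ) {
	    lu_solve(n, r, dx);                    /* solve A*dx = r   */
	    for (i = 0; i < n; ++i) x[i] += dx[i];
	    lstres = berr;
	    ++count;
	} else break;
    }
}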