Exemple #1
0
void
dgstrf (superlu_options_t *options, SuperMatrix *A, double drop_tol,
        int relax, int panel_size, int *etree, void *work, int lwork,
        int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
        SuperLUStat_t *stat, int *info)
{
/*
 * Purpose
 * =======
 *
 * DGSTRF computes an LU factorization of a general sparse m-by-n
 * matrix A using partial pivoting with row interchanges.
 * The factorization has the form
 *     Pr * A = L * U
 * where Pr is a row permutation matrix, L is lower triangular with unit
 * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper
 * triangular (upper trapezoidal if A->nrow < A->ncol).
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * options (input) superlu_options_t*
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *
 * A        (input) SuperMatrix*
 *	    Original matrix A, permuted by columns, of dimension
 *          (A->nrow, A->ncol). The type of A can be:
 *          Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
 *
 * drop_tol (input) double (NOT IMPLEMENTED)
 *	    Drop tolerance parameter. At step j of the Gaussian elimination,
 *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
 *          0 <= drop_tol <= 1. The default value of drop_tol is 0.
 *
 * relax    (input) int
 *          To control degree of relaxing supernodes. If the number
 *          of nodes (columns) in a subtree of the elimination tree is less
 *          than relax, this subtree is considered as one supernode,
 *          regardless of the row structures of those columns.
 *
 * panel_size (input) int
 *          A panel consists of at most panel_size consecutive columns.
 *
 * etree    (input) int*, dimension (A->ncol)
 *          Elimination tree of A'*A.
 *          Note: etree is a vector of parent pointers for a forest whose
 *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
 *          On input, the columns of A should be permuted so that the
 *          etree is in a certain postorder.
 *
 * work     (input/output) void*, size (lwork) (in bytes)
 *          User-supplied work space and space for the output data structures.
 *          Not referenced if lwork = 0;
 *
 * lwork   (input) int
 *         Specifies the size of work array in bytes.
 *         = 0:  allocate space internally by system malloc;
 *         > 0:  use user-supplied work array of length lwork in bytes,
 *               returns error if space runs out.
 *         = -1: the routine guesses the amount of space needed without
 *               performing the factorization, and returns it in
 *               *info; no other side effects.
 *
 * perm_c   (input) int*, dimension (A->ncol)
 *	    Column permutation vector, which defines the
 *          permutation matrix Pc; perm_c[i] = j means column i of A is
 *          in position j in A*Pc.
 *          When searching for diagonal, perm_c[*] is applied to the
 *          row subscripts of A, so that diagonal threshold pivoting
 *          can find the diagonal of A, rather than that of A*Pc.
 *
 * perm_r   (input/output) int*, dimension (A->nrow)
 *          Row permutation vector which defines the permutation matrix Pr,
 *          perm_r[i] = j means row i of A is in position j in Pr*A.
 *          If options->Fact = SamePattern_SameRowPerm, the pivoting routine
 *             will try to use the input perm_r, unless a certain threshold
 *             criterion is violated. In that case, perm_r is overwritten by
 *             a new permutation determined by partial pivoting or diagonal
 *             threshold pivoting.
 *          Otherwise, perm_r is output argument;
 *
 * L        (output) SuperMatrix*
 *          The factor L from the factorization Pr*A=L*U; use compressed row
 *          subscripts storage for supernodes, i.e., L has type:
 *          Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU.
 *
 * U        (output) SuperMatrix*
 *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
 *          storage scheme, i.e., U has types: Stype = SLU_NC,
 *          Dtype = SLU_D, Mtype = SLU_TRU.
 *
 * stat     (output) SuperLUStat_t*
 *          Record the statistics on runtime and floating-point operation count.
 *          See util.h for the definition of 'SuperLUStat_t'.
 *
 * info     (output) int*
 *          = 0: successful exit
 *          < 0: if info = -i, the i-th argument had an illegal value
 *          > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                and division by zero will occur if it is used to solve a
 *                system of equations.
 *             > A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol. If lwork = -1, it is
 *                the estimated amount of space needed, plus A->ncol.
 *
 * ======================================================================
 *
 * Local Working Arrays:
 * ======================
 *   m = number of rows in the matrix
 *   n = number of columns in the matrix
 *
 *   xprune[0:n-1]: xprune[*] points to locations in subscript
 *	vector lsub[*]. For column i, xprune[i] denotes the point where
 *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need
 *	to be traversed for symbolic factorization.
 *
 *   marker[0:3*m-1]: marker[i] = j means that node i has been
 *	reached when working on column j.
 *	Storage: relative to original row subscripts
 *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs,
 *	      see dpanel_dfs.c; marker2 is used for inner-factorization,
 *            see dcolumn_dfs.c.
 *
 *   parent[0:m-1]: parent vector used during dfs
 *      Storage: relative to new row subscripts
 *
 *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs)
 *	unexplored neighbor of i in lsub[*]
 *
 *   segrep[0:nseg-1]: contains the list of supernodal representatives
 *	in topological order of the dfs. A supernode representative is the
 *	last column of a supernode.
 *      The maximum size of segrep[] is n.
 *
 *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a
 *	supernodal representative r, repfnz[r] is the location of the first
 *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
 *	indicates the supernode r has been explored.
 *	NOTE: There are W of them, each used for one column of a panel.
 *
 *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below
 *      the panel diagonal. These are filled in during dpanel_dfs(), and are
 *      used later in the inner LU factorization within the panel.
 *	panel_lsub[]/dense[] pair forms the SPA data structure.
 *	NOTE: There are W of them.
 *
 *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
 *	    	   NOTE: there are W of them.
 *
 *   tempv[0:*]: real temporary used for dense numeric kernels;
 *	The size of this array is defined by NUM_TEMPV() in dsp_defs.h.
 *
 */
    /* Local working arrays */
    NCPformat *Astore;
    int       *iperm_r = NULL; /* inverse of perm_r; used when
                                  options->Fact == SamePattern_SameRowPerm */
    int       *iperm_c; /* inverse of perm_c */
    int       *iwork;
    double    *dwork;
    int	      *segrep, *repfnz, *parent, *xplore;
    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
    int	      *xprune;
    int	      *marker;
    double    *dense, *tempv;
    int       *relax_end;
    double    *a;
    int       *asub;
    int       *xa_begin, *xa_end;
    int       *xsup, *supno;
    int       *xlsub, *xlusup, *xusub;
    int       nzlumax;
    static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */

    /* Local scalars */
    fact_t    fact = options->Fact;
    double    diag_pivot_thresh = options->DiagPivotThresh;
    int       pivrow;   /* pivotal row number in the original matrix A */
    int       nseg1;	/* no of segments in U-column above panel row jcol */
    int       nseg;	/* no of segments in each U-column */
    register int jcol;
    register int kcol;	/* end column of a relaxed snode */
    register int icol;
    register int i, k, jj, new_next, iinfo;
    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
    int       w_def;	/* upper bound on panel width */
    int       usepr, iperm_r_allocated = 0;
    int       nnzL, nnzU;
    int       *panel_histo = stat->panel_histo;
    flops_t   *ops = stat->ops;

    iinfo    = 0;
    m        = A->nrow;
    n        = A->ncol;
    min_mn   = SUPERLU_MIN(m, n);
    Astore   = A->Store;
    a        = Astore->nzval;
    asub     = Astore->rowind;
    xa_begin = Astore->colbeg;
    xa_end   = Astore->colend;

    /* Allocate storage common to the factor routines */
    *info = dLUMemInit(fact, work, lwork, m, n, Astore->nnz,
                       panel_size, L, U, &Glu, &iwork, &dwork);
    if ( *info ) return;

    xsup    = Glu.xsup;
    supno   = Glu.supno;
    xlsub   = Glu.xlsub;
    xlusup  = Glu.xlusup;
    xusub   = Glu.xusub;

    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
	     &repfnz, &panel_lsub, &xprune, &marker);
    dSetRWork(m, panel_size, dwork, &dense, &tempv);

    usepr = (fact == SamePattern_SameRowPerm);
    if ( usepr ) {
	/* Compute the inverse of perm_r */
	iperm_r = (int *) intMalloc(m);
	for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
	iperm_r_allocated = 1;
    }
    iperm_c = (int *) intMalloc(n);
    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;

    /* Identify relaxed snodes */
    relax_end = (int *) intMalloc(n);
    if ( options->SymmetricMode == YES ) {
        heap_relax_snode(n, etree, relax, marker, relax_end);
    } else {
        relax_snode(n, etree, relax, marker, relax_end);
    }

    ifill (perm_r, m, EMPTY);
    ifill (marker, m * NO_MARKER, EMPTY);
    supno[0] = -1;
    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
    w_def    = panel_size;

    /*
     * Work on one "panel" at a time. A panel is one of the following:
     *	   (a) a relaxed supernode at the bottom of the etree, or
     *	   (b) panel_size contiguous columns, defined by the user
     */
    for (jcol = 0; jcol < min_mn; ) {

	if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
   	    kcol = relax_end[jcol];	  /* end of the relaxed snode */
	    panel_histo[kcol-jcol+1]++;

	    /* --------------------------------------
	     * Factorize the relaxed supernode(jcol:kcol)
	     * -------------------------------------- */
	    /* Determine the union of the row structure of the snode */
	    if ( (*info = dsnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
				    xprune, marker, &Glu)) != 0 )
		return;

            nextu    = xusub[jcol];
	    nextlu   = xlusup[jcol];
	    jsupno   = supno[jcol];
	    fsupc    = xsup[jsupno];
	    new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
	    nzlumax = Glu.nzlumax;
	    while ( new_next > nzlumax ) {
		if ( (*info = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) )
		    return;
	    }

	    for (icol = jcol; icol<= kcol; icol++) {
		xusub[icol+1] = nextu;

    		/* Scatter into SPA dense[*] */
    		for (k = xa_begin[icol]; k < xa_end[icol]; k++)
        	    dense[asub[k]] = a[k];

	       	/* Numeric update within the snode */
	        dsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat);

		if ( (*info = dpivotL(icol, diag_pivot_thresh, &usepr, perm_r,
				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
		    if ( iinfo == 0 ) iinfo = *info;

#ifdef DEBUG
		dprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu);
#endif

	    }

	    jcol = icol;

	} else { /* Work on one panel of panel_size columns */

	    /* Adjust panel_size so that a panel won't overlap with the next
	     * relaxed snode.
	     */
	    panel_size = w_def;
	    for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++)
		if ( relax_end[k] != EMPTY ) {
		    panel_size = k - jcol;
		    break;
		}
	    if ( k == min_mn ) panel_size = min_mn - jcol;
	    panel_histo[panel_size]++;

	    /* symbolic factor on a panel of columns */
	    dpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
		      dense, panel_lsub, segrep, repfnz, xprune,
		      marker, parent, xplore, &Glu);

	    /* numeric sup-panel updates in topological order */
	    dpanel_bmod(m, panel_size, jcol, nseg1, dense,
		        tempv, segrep, repfnz, &Glu, stat);

	    /* Sparse LU within the panel, and below panel diagonal */
    	    for ( jj = jcol; jj < jcol + panel_size; jj++) {
 		k = (jj - jcol) * m; /* column index for w-wide arrays */

		nseg = nseg1;	/* Begin after all the panel segments */

	    	if ((*info = dcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
					segrep, &repfnz[k], xprune, marker,
					parent, xplore, &Glu)) != 0) return;

	      	/* Numeric updates */
	    	if ((*info = dcolumn_bmod(jj, (nseg - nseg1), &dense[k],
					 tempv, &segrep[nseg1], &repfnz[k],
					 jcol, &Glu, stat)) != 0) return;

	        /* Copy the U-segments to ucol[*] */
		if ((*info = dcopy_to_ucol(jj, nseg, segrep, &repfnz[k],
					  perm_r, &dense[k], &Glu)) != 0)
		    return;

	    	if ( (*info = dpivotL(jj, diag_pivot_thresh, &usepr, perm_r,
				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
		    if ( iinfo == 0 ) iinfo = *info;

		/* Prune columns (0:jj-1) using column jj */
	    	dpruneL(jj, perm_r, pivrow, nseg, segrep,
                        &repfnz[k], xprune, &Glu);

		/* Reset repfnz[] for this column */
	    	resetrep_col (nseg, segrep, &repfnz[k]);

#ifdef DEBUG
		dprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu);
#endif

	    }

   	    jcol += panel_size;	/* Move to the next panel */

	} /* else */

    } /* for */

    *info = iinfo;

    if ( m > n ) {
	k = 0;
        for (i = 0; i < m; ++i)
            if ( perm_r[i] == EMPTY ) {
    		perm_r[i] = n + k;
		++k;
	    }
    }

    countnz(min_mn, xprune, &nnzL, &nnzU, &Glu);
    fixupL(min_mn, perm_r, &Glu);

    dLUWorkFree(iwork, dwork, &Glu); /* Free work space and compress storage */

    if ( fact == SamePattern_SameRowPerm ) {
        /* L and U structures may have changed due to possibly different
	   pivoting, even though the storage is available.
	   There could also be memory expansions, so the array locations
           may have changed, */
        ((SCformat *)L->Store)->nnz = nnzL;
	((SCformat *)L->Store)->nsuper = Glu.supno[n];
	((SCformat *)L->Store)->nzval = Glu.lusup;
	((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
	((SCformat *)L->Store)->rowind = Glu.lsub;
	((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
	((NCformat *)U->Store)->nnz = nnzU;
	((NCformat *)U->Store)->nzval = Glu.ucol;
	((NCformat *)U->Store)->rowind = Glu.usub;
	((NCformat *)U->Store)->colptr = Glu.xusub;
    } else {
        dCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup,
	                         Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
			         Glu.xsup, SLU_SC, SLU_D, SLU_TRLU);
    	dCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol,
			       Glu.usub, Glu.xusub, SLU_NC, SLU_D, SLU_TRU);
    }

    ops[FACT] += ops[TRSV] + ops[GEMV];

    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
    SUPERLU_FREE (iperm_c);
    SUPERLU_FREE (relax_end);

}
void
*pzgstrf_thread(void *arg)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======
 *
 * This is the slave process, representing the main scheduling loop to
 * perform the factorization. Each process executes a copy of the
 * following code ... (SPMD paradigm)
 *
 * Working arrays local to each process
 * ======================================
 *   marker[0:3*m-1]: marker[i] == j means node i has been reached when 
 *                                 working on column j.
 *	Storage: relative to original row subscripts
 *
 *	THERE ARE 3 OF THEM:
 *          marker[0 : m-1]:   used by pzgstrf_factor_snode() and 
 *                                     pzgstrf_panel_dfs();
 *          marker[m : 2m-1]:  used by pzgstrf_panel_dfs() and 
 *                                     pxgstrf_super_bnd_dfs();
 *                values in [0 : n-1]  when used by pzgstrf_panel_dfs()
 *                values in [n : 2n-1] when used by pxgstrf_super_bnd_dfs()
 *	    marker[2m : 3m-1]: used by pzgstrf_column_dfs() in inner-factor 
 *
 *   parent[0:n-1]: parent vector used during dfs
 *      Storage: relative to new row subscripts
 *
 *   xplore[0:2m-1]: xplore[i] gives the location of the next (dfs) 
 *	unexplored neighbor of i in lsub[*]; xplore[n+i] gives the
 *      location of the last unexplored neighbor of i in lsub[*].
 *
 *   segrep[0:nseg-1]: contains the list of supernodal representatives
 *	in topological order of the dfs. A supernode representative is the 
 *	last column of a supernode.
 *
 *   repfnz[0:m-1]: for a nonzero segment U[*,j] that ends at a 
 *	supernodal representative r, repfnz[r] is the location of the first 
 *	nonzero in this segment.  It is also used during the dfs:
 *      repfnz[r]>0 indicates that supernode r has been explored.
 *	NOTE: There are w of them, each used for one column of a panel. 
 *
 *   panel_lsub[0:w*m-1]: temporary for the nonzero row indices below 
 *      the panel diagonal. These are filled in during pzgstrf_panel_dfs(), 
 *      and are used later in the inner LU factorization.
 *	panel_lsub[]/dense[] pair forms the SPA data structure.
 *	NOTE: There are w of them.
 *
 *   dense[0:w*m-1]: sparse accumulator (SPA) for intermediate values;
 *	NOTE: there are w of them.
 *
 *   tempv[0:m-1]: real temporary used for dense numeric kernels;
 *
 * 
 * Scheduling algorithm (For each process ...)
 * ====================
 *     Shared task Q <-- { relaxed s-nodes (CANGO) };
 *
 *     WHILE (not finished)
 *
 *         panel = Scheduler(Q); (see pxgstrf_scheduler.c for policy)
 *
 *         IF (panel == RELAXED_SNODE)
 *             factor_relax_snode(panel);
 *         ELSE
 *             * pzgstrf_panel_dfs()
 *                 - skip all BUSY s-nodes (or panels)
 *
 *             * dpanel_bmod()
 *                 - updates from DONE s-nodes
 *                 - wait for BUSY s-nodes to become DONE
 *
 *             * inner-factor()
 *                 - identical as it is in the sequential algorithm,
 *                   except that pruning() will interact with the
 *                   pzgstrf_panel_dfs() of other panels.
 *         ENDIF
 *
 *     END WHILE;
 *
 */

#if ( MACH==SGI || MACH==ORIGIN )
#if ( MACH==SGI )
    int         pnum = mpc_my_threadnum();
#elif ( MACH==ORIGIN )
    int         pnum = mp_my_threadnum();
#endif
    pzgstrf_threadarg_t *thr_arg = &((pzgstrf_threadarg_t *)arg)[pnum];
#else
    pzgstrf_threadarg_t *thr_arg  = arg;
    int         pnum = thr_arg->pnum;
#endif

    /* Unpack the options argument */
    superlumt_options_t *superlumt_options = thr_arg->superlumt_options;
    pxgstrf_shared_t  *pxgstrf_shared= thr_arg->pxgstrf_shared;
    int         panel_size = superlumt_options->panel_size;
    double     diag_pivot_thresh = superlumt_options->diag_pivot_thresh;
    yes_no_t    *usepr     = &superlumt_options->usepr; /* may be modified */
    int         *etree     = superlumt_options->etree;
    int         *super_bnd = superlumt_options->part_super_h;
    int         *perm_r    = superlumt_options->perm_r;
    int         *inv_perm_c= pxgstrf_shared->inv_perm_c;
    int         *inv_perm_r= pxgstrf_shared->inv_perm_r;
    int	        *xprune    = pxgstrf_shared->xprune;
    int	        *ispruned  = pxgstrf_shared->ispruned;
    SuperMatrix *A         = pxgstrf_shared->A;
    GlobalLU_t  *Glu       = pxgstrf_shared->Glu;
    Gstat_t 	*Gstat     = pxgstrf_shared->Gstat;
    int         *info      = &thr_arg->info;

    /* Local working arrays */
    int       *iwork;
    doublecomplex    *dwork;
    int	      *segrep, *repfnz, *parent, *xplore;
    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
    int	      *marker, *marker1, *marker2;
    int       *lbusy; /* "Local busy" array, indicates which descendants
			 were busy when this panel's computation began.
			 Those columns (s-nodes) are treated specially
			 during pzgstrf_panel_dfs() and dpanel_bmod(). */

    int       *spa_marker; /* size n-by-w */
    int       *w_lsub_end; /* record the end of each column in panel_lsub */
    doublecomplex    *dense, *tempv;
    int       *lsub, *xlsub, *xlsub_end;

    /* Local scalars */
    register int m, n, k, jj, jcolm1, itemp, singular;
    int       pivrow;   /* pivotal row number in the original matrix A */
    int       nseg1;	/* no of segments in U-column above panel row jcol */
    int       nseg;	/* no of segments in each U-column */
    int       w, bcol, jcol;

#ifdef PROFILE
    double *utime = Gstat->utime;
    double t1, t2, t, stime;
    register float flopcnt;
#endif

#ifdef PREDICT_OPT
    flops_t  *ops = Gstat->ops;
    register float pdiv;
#endif
    
#if ( DEBUGlevel>=1 )
    printf("(%d) thr_arg-> pnum %d, info %d\n", pnum, thr_arg->pnum, thr_arg->info);
#endif

    singular   = 0;
    m          = A->nrow;
    n          = A->ncol;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;

    /* Allocate and initialize the per-process working storage. */
    if ( (*info = pzgstrf_WorkInit(m, panel_size, &iwork, &dwork)) ) {
	*info += pzgstrf_memory_use(Glu->nzlmax, Glu->nzumax, Glu->nzlumax);
	return 0;
    }
    pxgstrf_SetIWork(m, panel_size, iwork, &segrep, &parent, &xplore,
	     &repfnz, &panel_lsub, &marker, &lbusy);
    pzgstrf_SetRWork(m, panel_size, dwork, &dense, &tempv);
    
    /* New data structures to facilitate parallel algorithm */
    spa_marker = intMalloc(m * panel_size);
    w_lsub_end = intMalloc(panel_size);
    ifill (spa_marker, m * panel_size, EMPTY);
    ifill (marker, m * NO_MARKER, EMPTY);
    ifill (lbusy, m, EMPTY);
    jcol = EMPTY;
    marker1 = marker + m;
    marker2 = marker + 2*m;

#ifdef PROFILE    
    stime = SuperLU_timer_();
#endif

    /* -------------------------
       Main loop: repeatedly ...
       ------------------------- */
    while ( pxgstrf_shared->tasks_remain > 0 ) {
        
#ifdef PROFILE
	TIC(t);
#endif
	/* Get a panel from the scheduler. */
	pxgstrf_scheduler(pnum, n, etree, &jcol, &bcol, pxgstrf_shared);

#if ( DEBUGlevel>=1 )
    if ( jcol>=LOCOL && jcol<=HICOL ) {
	printf("(%d) Scheduler(): jcol %d, bcol %d, tasks_remain %d\n", 
	       pnum, jcol, bcol, pxgstrf_shared->tasks_remain);
	fflush(stdout);
    }
#endif

#ifdef PROFILE	    
	TOC(t2, t);
	Gstat->procstat[pnum].skedtime += t2;	    
#endif
	    
	if ( jcol != EMPTY ) {
	    w = pxgstrf_shared->pan_status[jcol].size;

#if ( DEBUGlevel>=3 )
	    printf("P%2d got panel %5d-%5d\ttime %.4f\tpanels_left %d\n",
		   pnum, jcol, jcol+w-1, SuperLU_timer_(), 
		   pxgstrf_shared->tasks_remain);
	    fflush(stdout); 
#endif
	    /* Nondomain panels */
#ifdef PROFILE
	    flopcnt = Gstat->procstat[pnum].fcops;
	    Gstat->panstat[jcol].pnum = pnum;
	    TIC(t1);
	    Gstat->panstat[jcol].starttime = t1;
#endif
	    if ( pxgstrf_shared->pan_status[jcol].type == RELAXED_SNODE ) {
		
#ifdef PREDICT_OPT
		pdiv = Gstat->procstat[pnum].fcops;
#endif
		/* A relaxed supernode at the bottom of the etree */
		pzgstrf_factor_snode
		    (pnum, jcol, A, diag_pivot_thresh, usepr,
		     perm_r, inv_perm_r, inv_perm_c, xprune, marker,
		     panel_lsub, dense, tempv, pxgstrf_shared, info);
		if ( *info ) {
		    if ( *info > n ) return 0;
		    else if ( singular == 0 || *info < singular ) 
		        singular = *info;
#if ( DEBUGlevel>=1 )
    printf("(%d) After pzgstrf_factor_snode(): singular=%d\n", pnum, singular);
#endif
		}

		/* Release the whole relaxed supernode */
		for (jj = jcol; jj < jcol + w; ++jj) 
		    pxgstrf_shared->spin_locks[jj] = 0;
#ifdef PREDICT_OPT
		pdiv = Gstat->procstat[pnum].fcops - pdiv;
		cp_panel[jcol].pdiv = pdiv;
#endif
	    } else { /* Regular panel */
#ifdef PROFILE
		TIC(t);
#endif
		pxgstrf_mark_busy_descends(pnum, jcol, etree, pxgstrf_shared, 
					   &bcol, lbusy);
		
		/* Symbolic factor on a panel of columns */
		pzgstrf_panel_dfs
		    (pnum, m, w, jcol, A, perm_r, xprune,ispruned,lbusy,
		     &nseg1, panel_lsub, w_lsub_end, segrep, repfnz,
		     marker, spa_marker, parent, xplore, dense, Glu);
#if ( DEBUGlevel>=2 )
  if ( jcol==BADPAN )
    printf("(%d) After pzgstrf_panel_dfs(): nseg1 %d, w_lsub_end %d\n",
	   pnum, nseg1, w_lsub_end[0]);
#endif
#ifdef PROFILE
		TOC(t2, t);
		utime[DFS] += t2;
#endif
		/* Numeric sup-panel updates in topological order.
		 * On return, the update values are temporarily stored in 
		 * the n-by-w SPA dense[m,w].
		 */
		pzgstrf_panel_bmod
		    (pnum, m, w, jcol, bcol, inv_perm_r, etree,
		     &nseg1, segrep, repfnz, panel_lsub, w_lsub_end,
		     spa_marker, dense, tempv, pxgstrf_shared);

		/*
		 * All "busy" descendants are "done" now --
		 * Find the set of row subscripts in the preceeding column
		 * "jcol-1" of the current panel. Column "jcol-1" is
		 * usually taken by a process other than myself.
		 * This row-subscripts information will be used by myself
		 * during column dfs to detect whether "jcol" belongs
		 * to the same supernode as "jcol-1".
		 * 
		 * ACCORDING TO PROFILE, THE AMOUNT OF TIME SPENT HERE 
		 * IS NEGLIGIBLE.
		 */
		jcolm1 = jcol - 1;
		itemp = xlsub_end[jcolm1];
		for (k = xlsub[jcolm1]; k < itemp; ++k)
		    marker2[lsub[k]] = jcolm1;
#ifdef PREDICT_OPT
		pdiv = Gstat->procstat[pnum].fcops;
#endif
		/* Inner-factorization, using sup-col algorithm */
		for ( jj = jcol; jj < jcol + w; jj++) {
		    k = (jj - jcol) * m; /* index into w-wide arrays */
		    nseg = nseg1; /* begin after all the panel segments */
#ifdef PROFILE
		    TIC(t);
#endif
		    /* Allocate storage for the current H-supernode. */
		    if ( Glu->dynamic_snode_bound && super_bnd[jj] ) {
		        /* jj starts a supernode in H */
			pxgstrf_super_bnd_dfs
			    (pnum, m, n, jj, super_bnd[jj], A, perm_r, 
			     inv_perm_r, xprune, ispruned, marker1, parent, 
			     xplore, pxgstrf_shared);
		    }
		    
		    if ( (*info = pzgstrf_column_dfs
			            (pnum, m, jj, jcol, perm_r, ispruned,
				     &panel_lsub[k],w_lsub_end[jj-jcol],
				     super_bnd, &nseg, segrep,
				     &repfnz[k], xprune, marker2,
				     parent, xplore, pxgstrf_shared)) )
			return 0;
#ifdef PROFILE
		    TOC(t2, t);
		    utime[DFS] += t2;
#endif
		    /* On return, the L supernode is gathered into the
		       global storage. */
		    if ( (*info = pzgstrf_column_bmod
			          (pnum, jj, jcol, (nseg - nseg1),
				   &segrep[nseg1], &repfnz[k],
				   &dense[k], tempv, pxgstrf_shared, Gstat)) )
			return 0;
		
		    if ( (*info = pzgstrf_pivotL
			            (pnum, jj, diag_pivot_thresh, usepr,
				     perm_r, inv_perm_r, inv_perm_c,
				     &pivrow, Glu, Gstat)) )
			if ( singular == 0 || *info < singular ) {
			    singular = *info;
#if ( DEBUGlevel>=1 )
    printf("(%d) After pzgstrf_pivotL(): singular=%d\n", pnum, singular);
#endif
			}

                    /* release column "jj", so that the other processes
                       waiting for this column can proceed */
		    pxgstrf_shared->spin_locks[jj] = 0;
		    
		    /* copy the U-segments to ucol[*] */
		    if ( (*info = pzgstrf_copy_to_ucol
			            (pnum,jj,nseg,segrep,&repfnz[k],
				     perm_r, &dense[k], pxgstrf_shared)) )
		      return 0;

		    /* Prune columns [0:jj-1] using column jj */
		    pxgstrf_pruneL(jj, perm_r, pivrow, nseg, segrep,
				   &repfnz[k], xprune, ispruned, Glu);

		    /* Reset repfnz[] for this column */
		    pxgstrf_resetrep_col (nseg, segrep, &repfnz[k]);

#if ( DEBUGlevel>=2 )
/*  if (jj >= LOCOL && jj <= HICOL) {*/
  if ( jj==BADCOL ) {
    dprint_lu_col(pnum, "panel:", jcol, jj, w, pivrow, xprune, Glu);
    dcheck_zero_vec(pnum, "after pzgstrf_copy_to_ucol() dense_col[]", n, &dense[k]);
  }
#endif
		} /* for jj ... */
		
#ifdef PREDICT_OPT
		pdiv = Gstat->procstat[pnum].fcops - pdiv;
		cp_panel[jcol].pdiv = pdiv;
#endif
		
	    } /* else regular panel ... */
	    
	    STATE( jcol ) = DONE; /* Release panel jcol. */
	    
#ifdef PROFILE
	    TOC(Gstat->panstat[jcol].fctime, t1);
	    Gstat->panstat[jcol].flopcnt += Gstat->procstat[pnum].fcops - flopcnt;
	    /*if ( Glu->tasks_remain < P ) {
		flops_last_P_panels += Gstat->panstat[jcol].flopcnt;
		printf("Panel %d, flops %e\n", jcol, Gstat->panstat[jcol].flopcnt);
		fflush(stdout);
	    } */
#endif

	}
#ifdef PROFILE
	else { /* No panel from the task queue - wait and try again */
	    Gstat->procstat[pnum].skedwaits++;
	}
#endif
	
    } /* while there are more panels */

    *info = singular;

    /* Free work space and compress storage */
    pzgstrf_WorkFree(iwork, dwork, Glu);
    SUPERLU_FREE (spa_marker);
    SUPERLU_FREE (w_lsub_end);

#ifdef PROFILE
    Gstat->procstat[pnum].fctime = SuperLU_timer_() - stime;
#endif

    return 0;
}
int
pcgstrf_factor_snode(
		     const int pnum,  /* process number */
		     const int jcol,
		     SuperMatrix *A,
		     const float diag_pivot_thresh,
		     yes_no_t *usepr,
		     int    *perm_r,
		     int    *inv_perm_r, /* modified */
		     int    *inv_perm_c, /* in - used to find diagonal of Pc*A*Pc' */
		     int    *xprune,
		     int    *marker,
		     int    *col_lsub, /* values are irrevelant on entry 
					  and on return */
		     complex *dense,
		     complex *tempv,
		     pxgstrf_shared_t *pxgstrf_shared,
		     int    *info
		     )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 *   Factorize the artificial supernodes grouped at the bottom
 *   of the etree.
 *
 */
    GlobalLU_t   *Glu = pxgstrf_shared->Glu;
    int          singular;
    NCPformat    *Astore;
    register int kcol, icol, k, jsupno, fsupc, nsupr;
    register int ifrom, ito;
    int          nextu, nextlu;
    int          pivrow;
    complex       *a;
    int          *asub, *xa_begin, *xa_end, *xusub, *xusub_end,
                 *xsup, *supno, *xlusup, *lsub, *xlsub, *xlsub_end;

    lsub      = Glu->lsub;
    xlsub     = Glu->xlsub;
    xlsub_end = Glu->xlsub_end;
    xusub     = Glu->xusub;
    xusub_end = Glu->xusub_end;
    xsup      = Glu->xsup;
    supno     = Glu->supno;
    xlusup    = Glu->xlusup;
    
    singular = 0;
    Astore   = A->Store;
    a        = Astore->nzval;
    asub     = Astore->rowind;
    xa_begin = Astore->colbeg;
    xa_end   = Astore->colend;
    
    kcol = jcol + pxgstrf_shared->pan_status[jcol].size;
	
    /* Determine the union of the row structure of the supernode */
    if ( (*info = pcgstrf_snode_dfs(pnum, jcol, kcol-1, asub, xa_begin, xa_end,
				   xprune, marker, col_lsub, pxgstrf_shared)) )
	return 0;
    
    /*
     * Factorize the relaxed supernode (jcol:kcol-1)
     */
    nextu        = Glu->nextu; /* xiaoye - race condition (no problem!) */
    jsupno       = supno[jcol];
    fsupc        = xsup[jsupno];
    nsupr        = xlsub_end[fsupc] - xlsub[fsupc];
    if ( (*info = Glu_alloc(pnum, jcol, nsupr*(kcol-jcol), LUSUP, &nextlu,
			  pxgstrf_shared)) )
	return 0;
    
    for (icol = jcol; icol < kcol; icol++) {
	xusub[icol] = xusub_end[icol] = nextu;
	xlusup[icol] = nextlu;
	
	/* Scatter into SPA dense[*] */
	for (k = xa_begin[icol]; k < xa_end[icol]; k++)
	    dense[asub[k]] = a[k];
	
	/* Numeric update within the supernode */
	pcgstrf_snode_bmod(pnum, icol, jsupno, fsupc, dense, tempv, 
			   Glu, pxgstrf_shared->Gstat);
	
	if ( (*info = pcgstrf_pivotL
	                 (pnum, icol, diag_pivot_thresh, usepr, perm_r,
			  inv_perm_r, inv_perm_c, &pivrow, 
			  Glu, pxgstrf_shared->Gstat)) )
	    if ( singular == 0 ) singular = *info;
	
	nextlu += nsupr;

#if ( DEBUGlevel>= 2 )
  if ( icol>=LOCOL && icol<=HICOL )
    dprint_lu_col(pnum,"relax:",jcol,icol,kcol-jcol,pivrow,xprune,Glu);
#endif
	
    }

    /* Store the row subscripts of kcol-1 for pruned graph */
    k = ito = xlsub_end[jcol];
    for (ifrom = xlsub[jcol]+kcol-jcol-1; ifrom < k; ++ifrom)
	lsub[ito++] = lsub[ifrom];
    k = ito;
    xprune[kcol-1] = k;
    if (jcol < kcol-1) {    /* not a singleton */
	for (icol = jcol+1; icol < kcol; ++icol) xlsub_end[icol] = k;
	k = xlsub_end[jcol];
	xprune[jcol] = k;
	for (icol = jcol+1; icol < kcol; ++icol) xlsub[icol] = k;
    }
    
    *info = singular;
    return 0;
}