int
pcgstrf_column_bmod(
		    const int  pnum,   /* process number */
		    const int  jcol,   /* current column in the panel */
		    const int  fpanelc,/* first column in the panel */
		    const int  nseg,   /* number of s-nodes to update jcol */
		    int        *segrep,/* in */
		    int        *repfnz,/* in */
		    complex     *dense, /* modified */
		    complex     *tempv, /* working array */
		    pxgstrf_shared_t *pxgstrf_shared, /* modified */
		    Gstat_t *Gstat     /* modified */
		    )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose:
 * ========
 *    Performs numeric block updates (sup-col) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Special processing on the supernodal portion of L\U[*,j].
 *
 * Return value:
 * =============
 *      0 - successful return
 *    > 0 - number of bytes allocated when run out of space
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    
#ifdef USE_VENDOR_BLAS    
    int         incx = 1, incy = 1;
    complex      alpha, beta;
#endif
    GlobalLU_t *Glu = pxgstrf_shared->Glu;   /* modified */
    
    /* krep = representative of current k-th supernode
     * fsupc = first supernodal column
     * nsupc = no of columns in supernode
     * nsupr = no of rows in supernode (used as leading dimension)
     * luptr = location of supernodal LU-block in storage
     * kfnz = first nonz in the k-th supernodal segment
     * no_zeros = no of leading zeros in a supernodal U-segment
     */
    complex	  ukj, ukj1, ukj2;
    register int lptr, kfnz, isub, irow, i, no_zeros;
    register int luptr, luptr1, luptr2;
    int          fsupc, nsupc, nsupr, segsze;
    int          nrow;	  /* No of rows in the matrix of matrix-vector */
    int          jsupno, k, ksub, krep, krep_ind, ksupno;
    int          ufirst, nextlu;
    int          fst_col; /* First column within small LU update */
    int          d_fsupc; /* Distance between the first column of the current
			     panel and the first column of the current snode.*/
    int          *xsup, *supno;
    int          *lsub, *xlsub, *xlsub_end;
    complex       *lusup;
    int          *xlusup, *xlusup_end;
    complex       *tempv1;
    int          mem_error;
    register float flopcnt;

    complex      zero = {0.0, 0.0};
    complex      one = {1.0, 0.0};
    complex      none = {-1.0, 0.0};
    complex      comp_temp, comp_temp1;

    xsup       = Glu->xsup;
    supno      = Glu->supno;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;
    lusup      = Glu->lusup;
    xlusup     = Glu->xlusup;
    xlusup_end = Glu->xlusup_end;
    jsupno     = supno[jcol];

    /* 
     * For each nonz supernode segment of U[*,j] in topological order 
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) {

	krep = segrep[k];
	k--;
	ksupno = supno[krep];
#if ( DEBUGlvel>=2 )
if (jcol==BADCOL)
printf("(%d) pcgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n",
       pnum, jcol, nseg, krep, jsupno, ksupno);
#endif    
	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */

	    fsupc = xsup[ksupno];
	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

  	    /* Distance from the current supernode to the current panel; 
	       d_fsupc=0 if fsupc >= fpanelc. */
  	    d_fsupc = fst_col - fsupc; 

	    luptr = xlusup[fst_col] + d_fsupc;
	    lptr = xlsub[fsupc] + d_fsupc;
	    kfnz = repfnz[krep];
	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
	    segsze = krep - kfnz + 1;
	    nsupc = krep - fst_col + 1;
	    nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
	    nrow = nsupr - d_fsupc - nsupc;
	    krep_ind = lptr + nsupc - 1;

	flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;//sj
		Gstat->procstat[pnum].fcops += flopcnt;

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)	    
printf("(%d) pcgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\
fsupc %d, nsupr %d, nsupc %d\n",
       pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc);

#endif



            /*
             * Case 1: Update U-segment of size 1 -- col-col update
             */
            if ( segsze == 1 ) {
                ukj = dense[lsub[krep_ind]];
                luptr += nsupr*(nsupc-1) + nsupc;

                for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                    irow = lsub[i];
                    cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                    c_sub(&dense[irow], &dense[irow], &comp_temp);
                    luptr++;
                }

            } else if ( segsze <= 3 ) {
                ukj = dense[lsub[krep_ind]];
                luptr += nsupr*(nsupc-1) + nsupc-1;
                ukj1 = dense[lsub[krep_ind - 1]];
                luptr1 = luptr - nsupr;

                if ( segsze == 2 ) { /* Case 2: 2cols-col update */
                    cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                    c_sub(&ukj, &ukj, &comp_temp);
                    dense[lsub[krep_ind]] = ukj;
                    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                        irow = lsub[i];
                        luptr++;
                        luptr1++;
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&dense[irow], &dense[irow], &comp_temp);
                    }
                } else { /* Case 3: 3cols-col update */
                    ukj2 = dense[lsub[krep_ind - 2]];
                    luptr2 = luptr1 - nsupr;
                    cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                    c_sub(&ukj1, &ukj1, &comp_temp);

                    cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                    cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                    c_add(&comp_temp, &comp_temp, &comp_temp1);
                    c_sub(&ukj, &ukj, &comp_temp);

                    dense[lsub[krep_ind]] = ukj;
                    dense[lsub[krep_ind-1]] = ukj1;
                    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                        irow = lsub[i];
                        luptr++;
                        luptr1++;
                        luptr2++;
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&dense[irow], &dense[irow], &comp_temp);
                    }
                }


	    } else {
	  	/*
		 * Case: sup-col update
		 * Perform a triangular solve and block update,
		 * then scatter the result of sup-col update to dense
		 */
		no_zeros = kfnz - fst_col;

	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
	        isub = lptr + no_zeros;
	        for (i = 0; i < segsze; i++) {
	  	    irow = lsub[isub];
		    tempv[i] = dense[irow];
		    ++isub; 
	        }

	        /* Dense triangular solve -- start effective triangle */
		luptr += nsupr * no_zeros + no_zeros; 
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
		CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#else
		ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#endif
		
 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		alpha = one;
		beta = zero;
#if ( MACH==CRAY_PVP )
		CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
		cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
		clsolve ( nsupr, segsze, &lusup[luptr], tempv );

 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
#endif
                /* Scatter tempv[] into SPA dense[*] */
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense[irow] = tempv[i]; /* Scatter */
                    tempv[i] = zero;
                    isub++;
                }

		/* Scatter tempv1[] into SPA dense[*] */
		for (i = 0; i < nrow; i++) {
		    irow = lsub[isub];
                    c_sub(&dense[irow], &dense[irow], &tempv1[i]);
		    tempv1[i] = zero;
		    ++isub;
		}
	    } /* else segsze >= 4 */
	    
	} /* if jsupno ... */

    } /* for each segment... */

    
    /* ------------------------------------------
       Process the supernodal portion of L\U[*,j]
       ------------------------------------------ */
    
    fsupc = SUPER_FSUPC (jsupno);
    nsupr = xlsub_end[fsupc] - xlsub[fsupc];
    if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, 
			       pxgstrf_shared)) )
	return mem_error;
    xlusup[jcol] = nextlu;
    lusup = Glu->lusup;
    
    /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */
    for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
	dense[irow] = zero;
#ifdef DEBUG
if (jcol == -1)
    printf("(%d) pcgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n",
	   pnum, jcol, irow, lusup[nextlu]);
#endif	
	++nextlu;
    }
    xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */

#if ( DEBUGlevel>=2 )
if (jcol == -1) {
    nrow = xlusup_end[jcol] - xlusup[jcol];
    print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]],
		     &lusup[xlusup[jcol]]);
}
#endif    
    
    /*
     * For more updates within the panel (also within the current supernode), 
     * should start from the first column of the panel, or the first column 
     * of the supernode, whichever is bigger. There are 2 cases:
     *    (1) fsupc < fpanelc,  then fst_col := fpanelc
     *    (2) fsupc >= fpanelc, then fst_col := fsupc
     */
    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

    if ( fst_col < jcol ) {

  	/* distance between the current supernode and the current panel;
	   d_fsupc=0 if fsupc >= fpanelc. */
  	d_fsupc = fst_col - fsupc;

	lptr = xlsub[fsupc] + d_fsupc;
	luptr = xlusup[fst_col] + d_fsupc;
	nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
	nsupc = jcol - fst_col;	/* Excluding jcol */
	nrow = nsupr - d_fsupc - nsupc;

	/* points to the beginning of jcol in supernode L\U[*,jsupno] */
	ufirst = xlusup[jcol] + d_fsupc;	

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)
printf("(%d) pcgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n",
       pnum, jcol, fsupc, nsupr, nsupc, nrow);
#endif    

	flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; //sj
	Gstat->procstat[pnum].fcops += flopcnt;

/*	ops[TRSV] += nsupc * (nsupc - 1);
	ops[GEMV] += 2 * nrow * nsupc;    */
	
#ifdef USE_VENDOR_BLAS
	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
#if ( MACH==CRAY_PVP )
	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
	ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
	cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );

	cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
		 &lusup[ufirst], tempv );
	
        /* Copy updates from tempv[*] into lusup[*] */
	isub = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
            c_sub(&lusup[isub], &lusup[isub], &tempv[i]);
            tempv[i] = zero;
	    ++isub;
	}
#endif
    } /* if fst_col < jcol ... */ 

    return 0;
}
int
pzgstrf_column_dfs(
		   const int  pnum,    /* process number */
		   const int  m,       /* number of rows in the matrix */
		   const int  jcol,    /* current column in the panel */
		   const int  fstcol,  /* first column in the panel */
		   int *perm_r,   /* row pivotings that are done so far */
		   int *ispruned, /* in */
		   int *col_lsub, /* the RHS vector to start the dfs */
		   int lsub_end,  /* size of col_lsub[] */
		   int *super_bnd,/* supernode partition by upper bound */
		   int *nseg,     /* modified - with new segments appended */
		   int *segrep,   /* modified - with new segments appended */
		   int *repfnz,   /* modified */
		   int *xprune,   /* modified */
		   int *marker2,  /* modified */
		   int *parent,   /* working array */
		   int *xplore,   /* working array */
		   pxgstrf_shared_t *pxgstrf_shared /* modified */
		   )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *   pzgstrf_column_dfs() performs a symbolic factorization on column jcol, 
 *   and detects whether column jcol belongs in the same supernode as jcol-1.
 *
 * Local parameters
 * ================
 *   A supernode representative is the last column of a supernode.
 *   The nonzeros in U[*,j] are segments that end at supernodal
 *   representatives. The routine returns a list of such supernodal 
 *   representatives in topological order of the dfs that generates them.
 *   The location of the first nonzero in each such supernodal segment
 *   (supernodal entry location) is also returned.
 *
 *   nseg: no of segments in current U[*,j]
 *   samesuper: samesuper=NO if column j does not belong in the same
 *	        supernode as j-1. Otherwise, samesuper=YES.
 *
 *   marker2: A-row --> A-row/col (0/1)
 *   repfnz: SuperA-col --> PA-row
 *   parent: SuperA-col --> SuperA-col
 *   xplore: SuperA-col --> index to L-structure
 *
 * Return value
 * ============
 *     0  success;
 *   > 0  number of bytes allocated when run out of space.
 *
 */
    GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */
    Gstat_t *Gstat = pxgstrf_shared->Gstat; /* modified */
    register int jcolm1, jcolm1size, nextl, ifrom;
    register int k, krep, krow, kperm, samesuper, nsuper;
    register int no_lsub;
    int	    fsupc;		/* first column in a supernode */
    int     myfnz;		/* first nonz column in a U-segment */
    int	    chperm, chmark, chrep, kchild;
    int     xdfs, maxdfs, kpar;
    int     ito;	        /* Used to compress row subscripts */
    int     mem_error;
    int     *xsup, *xsup_end, *supno, *lsub, *xlsub, *xlsub_end;
    static  int  first = 1, maxsuper;

    if ( first ) {
	maxsuper = sp_ienv(3);
	first = 0;
    }

    /* Initialize pointers */
    xsup      = Glu->xsup;
    xsup_end  = Glu->xsup_end;
    supno     = Glu->supno;
    lsub      = Glu->lsub;
    xlsub     = Glu->xlsub;
    xlsub_end = Glu->xlsub_end;
    jcolm1    = jcol - 1;
    nextl     = lsub_end;
    no_lsub   = 0;
    samesuper = YES;

    /* Test whether the row structure of column jcol is contained
       in that of column jcol-1. */
    for (k = 0; k < lsub_end; ++k) {
	krow = col_lsub[k];
	if ( perm_r[krow] == EMPTY ) { /* krow is in L */
	    ++no_lsub;
	    if (marker2[krow] != jcolm1) 
	        samesuper = NO; /* row subset test */
	    marker2[krow] = jcol;
	}
    }

#if ( DEBUGlevel>=2 )
  if (jcol == BADCOL)
    printf("(%d) pzgstrf_column_dfs[1] %d, fstcol %d, lsub_end %d, no_lsub %d, samesuper? %d\n",
	   pnum, jcol, fstcol, lsub_end, no_lsub, samesuper);
#endif
    
    /*
     * For each nonzero in A[fstcol:n,jcol] perform DFS ...
     */
    for (k = 0; k < lsub_end; ++k) {
	krow = col_lsub[k];
	
	/* if krow was visited before, go to the next nonzero */
	if ( marker2[krow] == jcol ) continue;
	marker2[krow] = jcol;
	kperm = perm_r[krow];
#if ( DEBUGlevel>=3 )
  if (jcol == BADCOL)
    printf("(%d) pzgstrf_column_dfs[inner]: perm_r[krow=%d] %d\n", pnum, krow, kperm);
#endif
	
	/* Ignore the nonzeros in U corresponding to the busy columns
	   during the panel DFS. */
	/*if ( lbusy[kperm] != fstcol ) {  xiaoye? */
	if ( kperm >= fstcol ) {
	    /*
	     * krow is in U: if its supernode representative krep
	     * has been explored, update repfnz[*].
	     */
	    krep = SUPER_REP(supno[kperm]);
	    myfnz = repfnz[krep];
	    
#if ( DEBUGlevel>=3 )
  if (jcol == BADCOL)
    printf("(%d) pzgstrf_column_dfs[inner-U]: krep %d, myfnz %d, kperm %d\n",
	   pnum, krep, myfnz, kperm);
#endif
	    if ( myfnz != EMPTY ) {	/* Visited before */
		if ( myfnz > kperm ) repfnz[krep] = kperm;
		/* continue; */
	    } else {
		/* Otherwise, perform dfs starting at krep */
		parent[krep] = EMPTY;
		repfnz[krep] = kperm;
		if ( ispruned[krep] ) {
		    if ( SINGLETON( supno[krep] ) )
			xdfs = xlsub_end[krep];
		    else xdfs = xlsub[krep];
		    maxdfs = xprune[krep];
#ifdef PROFILE
		    Gstat->procstat[pnum].pruned++;
#endif		    
		} else {
		    fsupc = SUPER_FSUPC( supno[krep] );
		    xdfs = xlsub[fsupc] + krep-fsupc+1;
		    maxdfs = xlsub_end[fsupc];
#ifdef PROFILE
		    Gstat->procstat[pnum].unpruned++;
#endif		    
		}
		
		do {
		    /* 
		     * For each unmarked kchild of krep ...
		     */
		    while ( xdfs < maxdfs ) {
			
			kchild = lsub[xdfs];
			xdfs++;
			chmark = marker2[kchild];
			
			if ( chmark != jcol ) { /* Not reached yet */
			    marker2[kchild] = jcol;
			    chperm = perm_r[kchild];
			    
			    if ( chperm == EMPTY ) {
				/* kchild is in L: place it in L[*,k]. */
				++no_lsub;
				col_lsub[nextl++] = kchild;
				if (chmark != jcolm1) samesuper = NO;
			    } else {
				/* kchild is in U: chrep = its supernode
				 * representative. If its rep has 
				 * been explored, update its repfnz[*].
				 */
				chrep = SUPER_REP( supno[chperm] );
				myfnz = repfnz[chrep];
				if ( myfnz != EMPTY ) { /* Visited before */
				    if ( myfnz > chperm )
					repfnz[chrep] = chperm;
				} else {
				    /* Continue dfs at super-rep of kchild */
				    xplore[krep] = xdfs;	
				    xplore[m + krep] = maxdfs;	
				    parent[chrep] = krep;
				    krep = chrep; /* Go deeper down G(L^t) */
				    repfnz[krep] = chperm;
				    if ( ispruned[krep] ) {
					if ( SINGLETON( supno[krep] ) )
					    xdfs = xlsub_end[krep];
					else xdfs = xlsub[krep];
					maxdfs = xprune[krep];
#ifdef PROFILE
					Gstat->procstat[pnum].pruned++;
#endif		    
				    } else {
					fsupc = SUPER_FSUPC( supno[krep] );
					xdfs = xlsub[fsupc] + krep-fsupc+1;
					maxdfs = xlsub_end[fsupc];
#ifdef PROFILE
					Gstat->procstat[pnum].unpruned++;
#endif		    
				    }
				}
			    } /* else */
			} /* if */
		    } /* while */
		    
		    /* krow has no more unexplored nbrs:
		     *    place supernode-rep krep in postorder DFS,
		     *    backtrack dfs to its parent.
		     */
		    segrep[*nseg] = krep;
		    ++(*nseg);
#if ( DEBUGlevel>=3 )
  if (jcol == BADCOL)
    printf("(%d) pzgstrf_column_dfs[inner-dfs] new nseg %d, repfnz[krep=%d] %d\n",
	   pnum, *nseg, krep, repfnz[krep]);
#endif
		    kpar = parent[krep]; /* Pop from stack, mimic recursion */
		    if ( kpar == EMPTY ) break; /* dfs done */
		    krep = kpar;
		    xdfs = xplore[krep];
		    maxdfs = xplore[m + krep];
		} while ( kpar != EMPTY ); /* Do ... until empty stack */
		
	    } /* else myfnz ... */
	} /* if kperm >= fstcol ... */
    } /* for each nonzero ... */
	
#if ( DEBUGlevel>=3 )
  if (jcol == BADCOL)
    printf("(%d) pzgstrf_column_dfs[2]: nextl %d, samesuper? %d\n",
	   pnum, nextl, samesuper);
#endif

    /* assert(no_lsub == nextl - no_usub);*/

    /* ---------------------------------------------------------
       Check to see if j belongs in the same supernode as j-1.
       --------------------------------------------------------- */
    
    /* Does it matter if jcol == 0? - xiaoye */
    if ( samesuper == YES ) {
	nsuper = supno[jcolm1];
	jcolm1size = xlsub_end[jcolm1] - xlsub[jcolm1];
#if ( DEBUGlevel>=3 )
  if (jcol == BADCOL)
    printf("(%d) pzgstrf_column_dfs[YES] jcol-1 %d, jcolm1size %d, supno[%d] %d\n",
	   pnum, jcolm1, jcolm1size, jcolm1, nsuper);
#endif	
	if ( no_lsub != jcolm1size-1 )
	    samesuper = NO;        /* Enforce T2 supernode */
	else {
	    /* Make sure the number of columns in a supernode does not
	       exceed threshold. */
	    fsupc = xsup[nsuper];
	    if ( jcol - fsupc >= maxsuper )
		samesuper = NO;
	    else {
		/* start of a supernode in H (coarser partition) */
		if ( super_bnd[jcol] != 0 ) samesuper = NO;
	    }
	}
    }
    
    /* If jcol starts a new supernode, allocate storage for 
     * the subscript set of both first and last column of
     * a previous supernode. (first for num values, last for pruning)
     */
    if ( samesuper == NO ) { /* starts a new supernode */
	nsuper = NewNsuper(pnum, pxgstrf_shared, &Glu->nsuper);
	xsup[nsuper] = jcol;
	
	/* Copy column jcol; also reserve space to store pruned graph */
	if ((mem_error = Glu_alloc(pnum, jcol, 2*no_lsub, LSUB, &ito, 
				  pxgstrf_shared)))
	    return mem_error;
	xlsub[jcol] = ito;
	lsub = Glu->lsub;
	for (ifrom = 0; ifrom < nextl; ++ifrom) {
	    krow = col_lsub[ifrom];
	    if ( perm_r[krow] == EMPTY ) /* Filter U-subscript */
		lsub[ito++] = krow;
	}
	k = ito;
	xlsub_end[jcol] = k;
	
	/* Make a copy in case it is a singleton supernode */
	for (ifrom = xlsub[jcol]; ifrom < ito; ++ifrom)
	    lsub[k++] = lsub[ifrom];
	
    } else { /* Supernode of size > 1: overwrite column jcol-1 */
	k = xlsub_end[fsupc];
	xlsub[jcol] = k;
	xprune[fsupc] = k;
	for (ifrom = 0; ifrom < nextl; ++ifrom) {
	    krow = col_lsub[ifrom];
	    if ( perm_r[krow] == EMPTY ) /* Filter U-subscript */
		lsub[k++] = krow;
	}
	xlsub_end[jcol] = k;
    }

#if ( DEBUGlevel>=3 )
  if (jcol == BADCOL) {
    printf("(%d) pzgstrf_column_dfs[3]: %d in prev s-node %d? %d\n",
	   pnum, jcol, fsupc, samesuper);
    PrintInt10("lsub", xlsub_end[jcol]-xlsub[jcol], &lsub[xlsub[jcol]]);
  }
#endif
    
    /* Tidy up the pointers before exit */
    xprune[jcol] = k;     /* upper bound for pruning */
    supno[jcol] = nsuper;
    xsup_end[nsuper] = jcol + 1;
    
    return 0;
}
int
pdgstrf_copy_to_ucol(
		     const int  pnum,    /* process number */
		     const int  jcol,	 /* current column */
		     const int  nseg,	 /* number of U-segments */
		     const int  *segrep, /* in */
		     const int  *repfnz, /* in */
		     const int  *perm_r, /* in */
		     double	 *dense,  /* modified - reset to zero on exit */
		     pxgstrf_shared_t *pxgstrf_shared /* modified */
		     )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Gather the nonzeros from SPA dense[*,jcol] into global ucol[*].
 */
    register int ksub, krep, ksupno, i, k, kfnz, segsze;
    register int fsupc, isub, irow, jsupno, colsize;
    int      nextu, mem_error;
    int      *xsup, *supno, *lsub, *xlsub, *usub;
    double   *ucol;
    GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */

    double zero = 0.0;

    xsup    = Glu->xsup;
    supno   = Glu->supno;
    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    jsupno  = supno[jcol];

    /* find the size of column jcol */
    colsize = 0;
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ++ksub) {
	krep = segrep[k--];
	ksupno = supno[krep];

	if ( ksupno != jsupno ) { /* should go into ucol[] */
	    kfnz = repfnz[krep];
	    if ( kfnz != EMPTY )  /* nonzero U-segment */
		colsize += krep - kfnz + 1;;
	}
    } /* for each segment... */

    if ( (mem_error = Glu_alloc(pnum, jcol, colsize, UCOL, &nextu, 
				pxgstrf_shared)) )
	return mem_error;
    Glu->xusub[jcol] = nextu;
    ucol = Glu->ucol;
    usub = Glu->usub;

    /* Now, it does not have to be in topological order! */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ++ksub) {
	
	krep = segrep[k--];
	ksupno = supno[krep];

	if ( ksupno != jsupno ) { /* should go into ucol[] */
	    kfnz = repfnz[krep];
	    if ( kfnz != EMPTY ) { /* nonzero U-segment */
	    	fsupc = xsup[ksupno];
	        isub = xlsub[fsupc] + kfnz - fsupc;
	        segsze = krep - kfnz + 1;
#pragma ivdep
		for (i = 0; i < segsze; i++) {
		    irow = lsub[isub];
		    usub[nextu] = perm_r[irow];
		    ucol[nextu] = dense[irow];
		    dense[irow] = zero;
#ifdef DEBUG
if (jcol == EMPTY)
    printf("(%d) pcopy_to_ucol[]: jcol %d, krep %d, irow %d, ucol %.10e\n",
	   ME, jcol, krep, irow, ucol[nextu]);
#endif		    
		    nextu++;
		    isub++;
		} 
	    }
	}

    } /* for each segment... */

    Glu->xusub_end[jcol] = nextu; /* close U[*,jcol] */
    return 0;
}
예제 #4
0
int
psgstrf_snode_dfs(
		  const int  pnum,      /* process number */
		  const int  jcol,	  /* in - start of the supernode */
		  const int  kcol, 	  /* in - end of the supernode */
		  const int  *asub,     /* in */
		  const int  *xa_begin, /* in */
		  const int  *xa_end,   /* in */
		  int        *xprune,   /* out */
		  int        *marker,   /* modified */
		  int        *col_lsub, /* values are irrelevant on entry 
					   and on return */
		  pxgstrf_shared_t *pxgstrf_shared /* modified */
		  )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *    psgstrf_snode_dfs() determines the union of the row structures of 
 *    those columns within the relaxed snode.
 *    Note: The relaxed snodes are leaves of the supernodal etree, 
 *    therefore, the portion outside the rectangular supernode must be zero.
 *
 * Return value
 * ============
 *     0   success;
 *    >0   number of bytes allocated when run out of memory.
 *
 */
    GlobalLU_t *Glu = pxgstrf_shared->Glu;
    register int i, k, ifrom, nextl, nsuper;
    int          ito;
    int          krow, kmark, mem_error;
    int          *supno, *lsub, *xlsub, *xlsub_end;
    
    supno                 = Glu->supno;
    xlsub                 = Glu->xlsub;
    xlsub_end             = Glu->xlsub_end;
    nsuper = NewNsuper(pnum, pxgstrf_shared, &Glu->nsuper);
    Glu->xsup[nsuper]     = jcol;
    Glu->xsup_end[nsuper] = kcol + 1;
    
    nextl = 0;
    for (i = jcol; i <= kcol; i++) {
	/* for each nonzero in A[*,i] */
	for (k = xa_begin[i]; k < xa_end[i]; k++) {	
	    krow = asub[k];
	    kmark = marker[krow];
	    if ( kmark != kcol ) { /* First time visit krow */
		marker[krow] = kcol;
		col_lsub[nextl++] = krow;
	    }
    	}
	supno[i] = nsuper;
    }

    if ( (mem_error = Glu_alloc(pnum, jcol, 2*nextl, LSUB, &ito, 
				pxgstrf_shared)) )
	return mem_error;
    
    xlsub[jcol] = ito;
    lsub        = Glu->lsub;
    for (ifrom = 0; ifrom < nextl; ++ifrom)
	lsub[ito++] = col_lsub[ifrom];
    xlsub_end[jcol] = ito;

    return 0;
}
int
pcgstrf_factor_snode(
		     const int pnum,  /* process number */
		     const int jcol,
		     SuperMatrix *A,
		     const float diag_pivot_thresh,
		     yes_no_t *usepr,
		     int    *perm_r,
		     int    *inv_perm_r, /* modified */
		     int    *inv_perm_c, /* in - used to find diagonal of Pc*A*Pc' */
		     int    *xprune,
		     int    *marker,
		     int    *col_lsub, /* values are irrevelant on entry 
					  and on return */
		     complex *dense,
		     complex *tempv,
		     pxgstrf_shared_t *pxgstrf_shared,
		     int    *info
		     )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 *   Factorize the artificial supernodes grouped at the bottom
 *   of the etree.
 *
 */
    GlobalLU_t   *Glu = pxgstrf_shared->Glu;
    int          singular;
    NCPformat    *Astore;
    register int kcol, icol, k, jsupno, fsupc, nsupr;
    register int ifrom, ito;
    int          nextu, nextlu;
    int          pivrow;
    complex       *a;
    int          *asub, *xa_begin, *xa_end, *xusub, *xusub_end,
                 *xsup, *supno, *xlusup, *lsub, *xlsub, *xlsub_end;

    lsub      = Glu->lsub;
    xlsub     = Glu->xlsub;
    xlsub_end = Glu->xlsub_end;
    xusub     = Glu->xusub;
    xusub_end = Glu->xusub_end;
    xsup      = Glu->xsup;
    supno     = Glu->supno;
    xlusup    = Glu->xlusup;
    
    singular = 0;
    Astore   = A->Store;
    a        = Astore->nzval;
    asub     = Astore->rowind;
    xa_begin = Astore->colbeg;
    xa_end   = Astore->colend;
    
    kcol = jcol + pxgstrf_shared->pan_status[jcol].size;
	
    /* Determine the union of the row structure of the supernode */
    if ( (*info = pcgstrf_snode_dfs(pnum, jcol, kcol-1, asub, xa_begin, xa_end,
				   xprune, marker, col_lsub, pxgstrf_shared)) )
	return 0;
    
    /*
     * Factorize the relaxed supernode (jcol:kcol-1)
     */
    nextu        = Glu->nextu; /* xiaoye - race condition (no problem!) */
    jsupno       = supno[jcol];
    fsupc        = xsup[jsupno];
    nsupr        = xlsub_end[fsupc] - xlsub[fsupc];
    if ( (*info = Glu_alloc(pnum, jcol, nsupr*(kcol-jcol), LUSUP, &nextlu,
			  pxgstrf_shared)) )
	return 0;
    
    for (icol = jcol; icol < kcol; icol++) {
	xusub[icol] = xusub_end[icol] = nextu;
	xlusup[icol] = nextlu;
	
	/* Scatter into SPA dense[*] */
	for (k = xa_begin[icol]; k < xa_end[icol]; k++)
	    dense[asub[k]] = a[k];
	
	/* Numeric update within the supernode */
	pcgstrf_snode_bmod(pnum, icol, jsupno, fsupc, dense, tempv, 
			   Glu, pxgstrf_shared->Gstat);
	
	if ( (*info = pcgstrf_pivotL
	                 (pnum, icol, diag_pivot_thresh, usepr, perm_r,
			  inv_perm_r, inv_perm_c, &pivrow, 
			  Glu, pxgstrf_shared->Gstat)) )
	    if ( singular == 0 ) singular = *info;
	
	nextlu += nsupr;

#if ( DEBUGlevel>= 2 )
  if ( icol>=LOCOL && icol<=HICOL )
    dprint_lu_col(pnum,"relax:",jcol,icol,kcol-jcol,pivrow,xprune,Glu);
#endif
	
    }

    /* Store the row subscripts of kcol-1 for pruned graph */
    k = ito = xlsub_end[jcol];
    for (ifrom = xlsub[jcol]+kcol-jcol-1; ifrom < k; ++ifrom)
	lsub[ito++] = lsub[ifrom];
    k = ito;
    xprune[kcol-1] = k;
    if (jcol < kcol-1) {    /* not a singleton */
	for (icol = jcol+1; icol < kcol; ++icol) xlsub_end[icol] = k;
	k = xlsub_end[jcol];
	xprune[jcol] = k;
	for (icol = jcol+1; icol < kcol; ++icol) xlsub[icol] = k;
    }
    
    *info = singular;
    return 0;
}