int
pcgstrf_column_bmod(
                    const int  pnum,    /* process number */
                    const int  jcol,    /* current column in the panel */
                    const int  fpanelc, /* first column in the panel */
                    const int  nseg,    /* number of s-nodes to update jcol */
                    int        *segrep, /* in */
                    int        *repfnz, /* in */
                    complex    *dense,  /* modified */
                    complex    *tempv,  /* working array */
                    pxgstrf_shared_t *pxgstrf_shared, /* modified */
                    Gstat_t    *Gstat   /* modified */
                    )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose:
 * ========
 *    Performs numeric block updates (sup-col) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Special processing on the supernodal portion of L\U[*,j].
 *
 *    The routine works in three stages:
 *      1. For every segment representative in segrep[] (visited from the
 *         last entry to the first) whose supernode differs from that of
 *         jcol, apply the update to the sparse accumulator dense[].
 *         Only the part of the supernode at or after column fpanelc is
 *         used.
 *      2. Allocate storage for column jcol in lusup[] and gather the
 *         entries of dense[] listed in the supernode's row structure,
 *         resetting those dense[] entries to zero.
 *      3. If the supernode containing jcol has earlier columns at or
 *         after fpanelc, apply the triangular solve and matrix-vector
 *         update from those columns directly on the stored column.
 *
 * Workspace:
 * ==========
 *    tempv[] is used as scratch of length segsze + nrow (sup-col case)
 *    or nrow (final in-supernode update without vendor BLAS).  Every
 *    entry written is reset to zero before the routine returns.
 *
 * Return value:
 * =============
 *      0 - successful return
 *    > 0 - number of bytes allocated when run out of space
 *          (the non-zero value returned by Glu_alloc is passed through)
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif

#ifdef USE_VENDOR_BLAS
    int          incx = 1, incy = 1;
    complex      alpha, beta;
#endif
    GlobalLU_t *Glu = pxgstrf_shared->Glu;  /* modified */

    /* krep = representative of current k-th supernode
     * fsupc = first supernodal column
     * nsupc = no of columns in supernode
     * nsupr = no of rows in supernode (used as leading dimension)
     * luptr = location of supernodal LU-block in storage
     * kfnz = first nonz in the k-th supernodal segment
     * no_zeros = no of leading zeros in a supernodal U-segment
     */
    complex      ukj, ukj1, ukj2;
    register int lptr, kfnz, isub, irow, i, no_zeros;
    register int luptr, luptr1, luptr2;
    int          fsupc, nsupc, nsupr, segsze;
    int          nrow;    /* No of rows in the matrix of matrix-vector */
    int          jsupno, k, ksub, krep, krep_ind, ksupno;
    int          ufirst, nextlu;
    int          fst_col; /* First column within small LU update */
    int          d_fsupc; /* Distance between the first column of the current
                             panel and the first column of the current snode.*/
    int          *xsup, *supno;
    int          *lsub, *xlsub, *xlsub_end;
    complex      *lusup;
    int          *xlusup, *xlusup_end;
    complex      *tempv1;
    int          mem_error;
    register float flopcnt;

    complex      zero = {0.0, 0.0};
    complex      one = {1.0, 0.0};
    complex      none = {-1.0, 0.0};
    complex      comp_temp, comp_temp1;

    xsup       = Glu->xsup;
    supno      = Glu->supno;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;
    lusup      = Glu->lusup;
    xlusup     = Glu->xlusup;
    xlusup_end = Glu->xlusup_end;
    jsupno     = supno[jcol];

    /*
     * For each nonz supernode segment of U[*,j] in topological order
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) {

        krep = segrep[k];
        k--;
        ksupno = supno[krep];
        /* The guard below used to read "DEBUGlvel", a macro nobody defines,
           so this trace could never be enabled. */
#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)
printf("(%d) pcgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n",
       pnum, jcol, nseg, krep, jsupno, ksupno);
#endif

        if ( jsupno != ksupno ) { /* Outside the rectangular supernode */

            fsupc = xsup[ksupno];
            fst_col = SUPERLU_MAX ( fsupc, fpanelc );

            /* Distance from the current supernode to the current panel;
               d_fsupc=0 if fsupc >= fpanelc. */
            d_fsupc = fst_col - fsupc;

            luptr = xlusup[fst_col] + d_fsupc;
            lptr = xlsub[fsupc] + d_fsupc;
            kfnz = repfnz[krep];
            kfnz = SUPERLU_MAX ( kfnz, fpanelc );
            segsze = krep - kfnz + 1;
            nsupc = krep - fst_col + 1;
            nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
            nrow = nsupr - d_fsupc - nsupc;
            krep_ind = lptr + nsupc - 1;

            flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;//sj
            Gstat->procstat[pnum].fcops += flopcnt;

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)
printf("(%d) pcgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\
 fsupc %d, nsupr %d, nsupc %d\n",
       pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc);
#endif

            /*
             * Case 1: Update U-segment of size 1 -- col-col update
             */
            if ( segsze == 1 ) {
                ukj = dense[lsub[krep_ind]];
                /* Jump to the first sub-diagonal entry of column krep. */
                luptr += nsupr*(nsupc-1) + nsupc;

                for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                    irow = lsub[i];
                    cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                    c_sub(&dense[irow], &dense[irow], &comp_temp);
                    luptr++;
                }

            } else if ( segsze <= 3 ) {
                ukj = dense[lsub[krep_ind]];
                /* luptr -> diagonal entry of column krep;
                   luptr1 -> same row, one column to the left. */
                luptr += nsupr*(nsupc-1) + nsupc-1;
                ukj1 = dense[lsub[krep_ind - 1]];
                luptr1 = luptr - nsupr;

                if ( segsze == 2 ) { /* Case 2: 2cols-col update */
                    cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                    c_sub(&ukj, &ukj, &comp_temp);
                    dense[lsub[krep_ind]] = ukj;
                    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                        irow = lsub[i];
                        luptr++;
                        luptr1++;
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&dense[irow], &dense[irow], &comp_temp);
                    }
                } else { /* Case 3: 3cols-col update */
                    ukj2 = dense[lsub[krep_ind - 2]];
                    luptr2 = luptr1 - nsupr;
                    cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                    c_sub(&ukj1, &ukj1, &comp_temp);

                    cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                    cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                    c_add(&comp_temp, &comp_temp, &comp_temp1);
                    c_sub(&ukj, &ukj, &comp_temp);
                    dense[lsub[krep_ind]] = ukj;
                    dense[lsub[krep_ind-1]] = ukj1;
                    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                        irow = lsub[i];
                        luptr++;
                        luptr1++;
                        luptr2++;
                        cc_mult(&comp_temp, &ukj, &lusup[luptr]);
                        cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        c_add(&comp_temp, &comp_temp, &comp_temp1);
                        c_sub(&dense[irow], &dense[irow], &comp_temp);
                    }
                }

            } else {
                /*
                 * Case: sup-col update
                 * Perform a triangular solve and block update,
                 * then scatter the result of sup-col update to dense
                 */
                no_zeros = kfnz - fst_col;

                /* Copy U[*,j] segment from dense[*] to tempv[*] */
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    tempv[i] = dense[irow];
                    ++isub;
                }

                /* Dense triangular solve -- start effective triangle */
                luptr += nsupr * no_zeros + no_zeros;

#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
                CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
                       &nsupr, tempv, &incx );
#else
                ctrsv_( "L", "N", "U", &segsze, &lusup[luptr],
                       &nsupr, tempv, &incx );
#endif

                luptr += segsze;  /* Dense matrix-vector */
                tempv1 = &tempv[segsze];
                alpha = one;
                beta = zero;
#if ( MACH==CRAY_PVP )
                CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
                       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
                cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
                       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
                clsolve ( nsupr, segsze, &lusup[luptr], tempv );

                luptr += segsze;  /* Dense matrix-vector */
                tempv1 = &tempv[segsze];
                cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
#endif

                /* Scatter tempv[] into SPA dense[*] */
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense[irow] = tempv[i]; /* Scatter */
                    tempv[i] = zero;        /* leave the workspace clean */
                    isub++;
                }

                /* Scatter tempv1[] into SPA dense[*] */
                for (i = 0; i < nrow; i++) {
                    irow = lsub[isub];
                    c_sub(&dense[irow], &dense[irow], &tempv1[i]);
                    tempv1[i] = zero;
                    ++isub;
                }
            } /* else segsze >= 4 */

        } /* if jsupno ... */

    } /* for each segment... */


    /* ------------------------------------------
       Process the supernodal portion of L\U[*,j]
       ------------------------------------------ */

    fsupc = SUPER_FSUPC (jsupno);
    nsupr = xlsub_end[fsupc] - xlsub[fsupc];
    if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu,
                                pxgstrf_shared)) )
        return mem_error;
    xlusup[jcol] = nextlu;
    /* Re-read the storage pointer after the allocation call. */
    lusup = Glu->lusup;

    /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */
    for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) {
        irow = lsub[isub];
        lusup[nextlu] = dense[irow];
        dense[irow] = zero;
#ifdef DEBUG
        /* A complex value cannot be passed to a single %e conversion;
           print the real and imaginary parts separately. */
if (jcol == -1)
    printf("(%d) pcgstrf_column_bmod[lusup] jcol %d, irow %d, lusup (%.10e, %.10e)\n",
           pnum, jcol, irow, lusup[nextlu].r, lusup[nextlu].i);
#endif
        ++nextlu;
    }
    xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */

#if ( DEBUGlevel>=2 )
    /* NOTE(review): print_double_vec is handed complex data here; the
       guard (jcol == -1) keeps it from running, but confirm the intended
       printing routine before enabling it. */
if (jcol == -1) {
    nrow = xlusup_end[jcol] - xlusup[jcol];
    print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]],
                     &lusup[xlusup[jcol]]);
}
#endif

    /*
     * For more updates within the panel (also within the current supernode),
     * should start from the first column of the panel, or the first column
     * of the supernode, whichever is bigger. There are 2 cases:
     *    (1) fsupc < fpanelc,  then fst_col := fpanelc
     *    (2) fsupc >= fpanelc, then fst_col := fsupc
     */
    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

    if ( fst_col < jcol ) {

        /* distance between the current supernode and the current panel;
           d_fsupc=0 if fsupc >= fpanelc. */
        d_fsupc = fst_col - fsupc;

        lptr = xlsub[fsupc] + d_fsupc;
        luptr = xlusup[fst_col] + d_fsupc;
        nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
        nsupc = jcol - fst_col;                  /* Excluding jcol */
        nrow = nsupr - d_fsupc - nsupc;

        /* points to the beginning of jcol in supernode L\U[*,jsupno] */
        ufirst = xlusup[jcol] + d_fsupc;

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)
printf("(%d) pcgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n",
       pnum, jcol, fsupc, nsupr, nsupc, nrow);
#endif

        flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; //sj
        Gstat->procstat[pnum].fcops += flopcnt;

/*      ops[TRSV] += nsupc * (nsupc - 1);
        ops[GEMV] += 2 * nrow * nsupc;    */

#ifdef USE_VENDOR_BLAS
        alpha = none; beta = one; /* y := beta*y + alpha*A*x */
#if ( MACH==CRAY_PVP )
        CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr],
               &nsupr, &lusup[ufirst], &incx );
        CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
               &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
        ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr],
               &nsupr, &lusup[ufirst], &incx );
        cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
               &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
        clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );

        cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
                  &lusup[ufirst], tempv );

        /* Copy updates from tempv[*] into lusup[*] */
        isub = ufirst + nsupc;
        for (i = 0; i < nrow; i++) {
            c_sub(&lusup[isub], &lusup[isub], &tempv[i]);
            tempv[i] = zero;
            ++isub;
        }
#endif
    } /* if fst_col < jcol ... */

    return 0;
}
void pcgstrf_mark_busy_descends(int pnum, int jcol, int *etree, //pcgstrf_shared_t *pxgstrf_shared, pxgstrf_shared_t *pxgstrf_shared, //sj int *bcol, int *lbusy) { /* * -- SuperLU MT routine (version 1.0) -- * Univ. of California Berkeley, Xerox Palo Alto Research Center, * and Lawrence Berkeley National Lab. * August 15, 1997 * * Purpose * ======= * * Mark busy panels in local "lbusy" array, used for linear pipelining. * * When jcol begins, its busy descendant panels (if any) are bcol and * all the e-tree ancestors of bcol between bcol and jcol. This routine * marks those columns in the array lbusy, which is local to this * processor, to preserve a snapshot regardless of what the other * processors are doing meanwhile. * * Arguments * ========= * * jcol (input) int * Current panel, with leading column jcol. * * etree (input) int* * Elimination tree parent pointers. * * bcol (input/output) int* * Farthest busy descendant of jcol. * On entry, it is the first column of the farthest busy panel. * On exit, it may be adjusted to the first column of the * farthest busy supernode. * * lbusy (input/output) int* * Initially all -1, lbusy(r) = jcol means that r was busy * at the beginning of computing jcol. * */ pxgstrf_shared_t *pcgstrf_shared = pxgstrf_shared; //sj GlobalLU_t *Glu = pcgstrf_shared->Glu; register int w, kcol, fsupc, bcol_reg; int *xsup; bcol_reg = *bcol; if ( bcol_reg < jcol ) { /* ----------------------------------------------------------- Instead of waiting for the completion of "bcol", we can pessimistically assume supno[bcol] == supno[bcol-1], hence always mark as busy the supernode containing "bcol-1". 
----------------------------------------------------------- */ if (pcgstrf_shared->pan_status[bcol_reg].type == RELAXED_SNODE) { #if 0 if ( pcgstrf_shared->pan_status[bcol_reg].size < 0 ) fsupc = bcol_reg + pcgstrf_shared->pan_status[bcol_reg].size; else fsupc = bcol_reg; #endif fsupc = bcol_reg; w = pcgstrf_shared->pan_status[fsupc].size; bcol_reg += w; for (kcol = fsupc; kcol < bcol_reg; ++kcol) lbusy[kcol] = jcol; } else { /* Find leading column "fsupc" in the supernode that contains column "bcol-1" */ #if 0 if ( pcgstrf_shared->spin_locks[bcol_reg] ) /* WORSE PERFORMANCE!! */ await( &pcgstrf_shared->spin_locks[bcol_reg] ); #endif xsup = Glu->xsup; fsupc = SUPER_FSUPC ( Glu->supno[bcol_reg-1] ); for (kcol = fsupc; kcol < bcol_reg; ++kcol) lbusy[kcol] = jcol; } #if ( DEBUGlevel>=1 ) if (jcol >= LOCOL && jcol <= HICOL) printf("(%d) mark_busy_descends[1] jcol %d, bcol_reg %d, fsupc %d\n", pnum, jcol, bcol_reg, fsupc); #endif /* Mark as busy all columns on the path between bcol_reg and jcol */ for (kcol = bcol_reg; kcol < jcol; kcol = etree[kcol]) { lbusy[kcol] = jcol; } /* INVARIANT: *bcol must be the first column of the farthest busy supernode */ *bcol = fsupc; } /* if bcol_reg < jcol */ }
int
pzgstrf_column_dfs(
                   const int  pnum,    /* process number */
                   const int  m,       /* number of rows in the matrix */
                   const int  jcol,    /* current column in the panel */
                   const int  fstcol,  /* first column in the panel */
                   int *perm_r,   /* row pivotings that are done so far */
                   int *ispruned, /* in */
                   int *col_lsub, /* the RHS vector to start the dfs */
                   int lsub_end,  /* size of col_lsub[] */
                   int *super_bnd,/* supernode partition by upper bound */
                   int *nseg,     /* modified - with new segments appended */
                   int *segrep,   /* modified - with new segments appended */
                   int *repfnz,   /* modified */
                   int *xprune,   /* modified */
                   int *marker2,  /* modified */
                   int *parent,   /* working array */
                   int *xplore,   /* working array */
                   pxgstrf_shared_t *pxgstrf_shared /* modified */
                   )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *   pzgstrf_column_dfs() performs a symbolic factorization on column jcol,
 *   and detects whether column jcol belongs in the same supernode as jcol-1.
 *
 *   Steps:
 *     1. Mark the not-yet-pivoted rows of col_lsub[] (entries of L) and
 *        test whether each was also present in column jcol-1.
 *     2. For every remaining row whose pivot column lies inside the
 *        current panel (perm_r[row] >= fstcol), run a depth-first search
 *        from its supernode representative.  Newly reached L rows are
 *        appended to col_lsub[]; finished representatives are appended
 *        to segrep[] in postorder.
 *     3. Decide whether jcol extends the supernode of jcol-1, then store
 *        the row subscripts of column jcol in lsub[] accordingly.
 *
 * Local parameters
 * ================
 *   A supernode representative is the last column of a supernode.
 *   The nonzeros in U[*,j] are segments that end at supernodal
 *   representatives. The routine returns a list of such supernodal
 *   representatives in topological order of the dfs that generates them.
 *   The location of the first nonzero in each such supernodal segment
 *   (supernodal entry location) is also returned.
 *
 *   nseg: no of segments in current U[*,j]
 *   samesuper: samesuper=NO if column j does not belong in the same
 *              supernode as j-1. Otherwise, samesuper=YES.
 *
 *   marker2: A-row --> A-row/col (0/1)
 *   repfnz: SuperA-col --> PA-row
 *   parent: SuperA-col --> SuperA-col
 *   xplore: SuperA-col --> index to L-structure
 *           (xplore[k] holds the resume position, xplore[m+k] the end)
 *
 * Return value
 * ============
 *     0  success;
 *   > 0  number of bytes allocated when run out of space.
 *
 */
    GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */
    Gstat_t *Gstat = pxgstrf_shared->Gstat; /* modified */
    register int jcolm1, jcolm1size, nextl, ifrom;
    register int k, krep, krow, kperm, samesuper, nsuper;
    register int no_lsub;
    int     fsupc = EMPTY; /* first column in a supernode; given a defined
                              value because the final debug trace may read
                              it before any branch assigns it */
    int     myfnz;      /* first nonz column in a U-segment */
    int     chperm, chmark, chrep, kchild;
    int     xdfs, maxdfs, kpar;
    int     ito;        /* Used to compress row subscripts */
    int     mem_error;
    int     *xsup, *xsup_end, *supno, *lsub, *xlsub, *xlsub_end;
    /* NOTE(review): this one-time initialisation is not synchronised;
       presumably every thread would store the same value -- confirm. */
    static  int  first = 1, maxsuper;

    if ( first ) {
        maxsuper = sp_ienv(3);
        first = 0;
    }

    /* Initialize pointers */
    xsup      = Glu->xsup;
    xsup_end  = Glu->xsup_end;
    supno     = Glu->supno;
    lsub      = Glu->lsub;
    xlsub     = Glu->xlsub;
    xlsub_end = Glu->xlsub_end;
    jcolm1    = jcol - 1;
    nextl     = lsub_end;
    no_lsub   = 0;
    samesuper = YES;

    /* Test whether the row structure of column jcol is contained
       in that of column jcol-1. */
    for (k = 0; k < lsub_end; ++k) {
        krow = col_lsub[k];
        if ( perm_r[krow] == EMPTY ) { /* krow is in L */
            ++no_lsub;
            if (marker2[krow] != jcolm1)
                samesuper = NO; /* row subset test */
            marker2[krow] = jcol;
        }
    }

#if ( DEBUGlevel>=2 )
if (jcol == BADCOL)
printf("(%d) pzgstrf_column_dfs[1] %d, fstcol %d, lsub_end %d, no_lsub %d, samesuper? %d\n",
       pnum, jcol, fstcol, lsub_end, no_lsub, samesuper);
#endif

    /*
     * For each nonzero in A[fstcol:n,jcol] perform DFS ...
     */
    for (k = 0; k < lsub_end; ++k) {
        krow = col_lsub[k];

        /* if krow was visited before, go to the next nonzero */
        if ( marker2[krow] == jcol ) continue;
        marker2[krow] = jcol;
        kperm = perm_r[krow];
#if ( DEBUGlevel>=3 )
if (jcol == BADCOL)
printf("(%d) pzgstrf_column_dfs[inner]: perm_r[krow=%d] %d\n", pnum, krow, kperm);
#endif

        /* Ignore the nonzeros in U corresponding to the busy columns
           during the panel DFS. */
        /*if ( lbusy[kperm] != fstcol ) {  xiaoye? */
        if ( kperm >= fstcol ) {
            /*
             * krow is in U: if its supernode representative krep
             * has been explored, update repfnz[*].
             */
            krep = SUPER_REP(supno[kperm]);
            myfnz = repfnz[krep];

#if ( DEBUGlevel>=3 )
if (jcol == BADCOL)
printf("(%d) pzgstrf_column_dfs[inner-U]: krep %d, myfnz %d, kperm %d\n",
       pnum, krep, myfnz, kperm);
#endif
            if ( myfnz != EMPTY ) { /* Visited before */
                if ( myfnz > kperm ) repfnz[krep] = kperm;
                /* continue; */
            } else {
                /* Otherwise, perform dfs starting at krep */
                parent[krep] = EMPTY;
                repfnz[krep] = kperm;
                if ( ispruned[krep] ) {
                    if ( SINGLETON( supno[krep] ) )
                        xdfs = xlsub_end[krep];
                    else xdfs = xlsub[krep];
                    maxdfs = xprune[krep];
#ifdef PROFILE
                    Gstat->procstat[pnum].pruned++;
#endif
                } else {
                    fsupc = SUPER_FSUPC( supno[krep] );
                    xdfs = xlsub[fsupc] + krep-fsupc+1;
                    maxdfs = xlsub_end[fsupc];
#ifdef PROFILE
                    Gstat->procstat[pnum].unpruned++;
#endif
                }

                do {
                    /*
                     * For each unmarked kchild of krep ...
                     */
                    while ( xdfs < maxdfs ) {

                        kchild = lsub[xdfs];
                        xdfs++;
                        chmark = marker2[kchild];

                        if ( chmark != jcol ) { /* Not reached yet */
                            marker2[kchild] = jcol;
                            chperm = perm_r[kchild];

                            if ( chperm == EMPTY ) {
                                /* kchild is in L: place it in L[*,k]. */
                                ++no_lsub;
                                col_lsub[nextl++] = kchild;
                                if (chmark != jcolm1) samesuper = NO;
                            } else {
                                /* kchild is in U: chrep = its supernode
                                 * representative. If its rep has
                                 * been explored, update its repfnz[*].
                                 */
                                chrep = SUPER_REP( supno[chperm] );
                                myfnz = repfnz[chrep];
                                if ( myfnz != EMPTY ) { /* Visited before */
                                    if ( myfnz > chperm )
                                        repfnz[chrep] = chperm;
                                } else {
                                    /* Continue dfs at super-rep of kchild */
                                    xplore[krep] = xdfs;
                                    xplore[m + krep] = maxdfs;
                                    parent[chrep] = krep;
                                    krep = chrep; /* Go deeper down G(L^t) */
                                    repfnz[krep] = chperm;
                                    if ( ispruned[krep] ) {
                                        if ( SINGLETON( supno[krep] ) )
                                            xdfs = xlsub_end[krep];
                                        else xdfs = xlsub[krep];
                                        maxdfs = xprune[krep];
#ifdef PROFILE
                                        Gstat->procstat[pnum].pruned++;
#endif
                                    } else {
                                        fsupc = SUPER_FSUPC( supno[krep] );
                                        xdfs = xlsub[fsupc] + krep-fsupc+1;
                                        maxdfs = xlsub_end[fsupc];
#ifdef PROFILE
                                        Gstat->procstat[pnum].unpruned++;
#endif
                                    }
                                }
                            } /* else */
                        } /* if */
                    } /* while */

                    /* krow has no more unexplored nbrs:
                     *    place supernode-rep krep in postorder DFS,
                     *    backtrack dfs to its parent.
                     */
                    segrep[*nseg] = krep;
                    ++(*nseg);
#if ( DEBUGlevel>=3 )
if (jcol == BADCOL)
printf("(%d) pzgstrf_column_dfs[inner-dfs] new nseg %d, repfnz[krep=%d] %d\n",
       pnum, *nseg, krep, repfnz[krep]);
#endif
                    kpar = parent[krep]; /* Pop from stack, mimic recursion */
                    if ( kpar == EMPTY ) break; /* dfs done */
                    krep = kpar;
                    xdfs = xplore[krep];
                    maxdfs = xplore[m + krep];
                } while ( kpar != EMPTY ); /* Do ... until empty stack */

            } /* else myfnz ... */

        } /* if kperm >= fstcol ... */

    } /* for each nonzero ... */

#if ( DEBUGlevel>=3 )
if (jcol == BADCOL)
printf("(%d) pzgstrf_column_dfs[2]: nextl %d, samesuper? %d\n",
       pnum, nextl, samesuper);
#endif

    /* assert(no_lsub == nextl - no_usub);*/

    /* ---------------------------------------------------------
       Check to see if j belongs in the same supernode as j-1.
       --------------------------------------------------------- */

    /* Column 0 has no predecessor, so it always opens a new supernode.
       Without this guard the test below would index supno[], xlsub[]
       and xlsub_end[] at position -1. */
    if ( jcol == 0 ) samesuper = NO;

    if ( samesuper == YES ) {
        nsuper = supno[jcolm1];
        jcolm1size = xlsub_end[jcolm1] - xlsub[jcolm1];
#if ( DEBUGlevel>=3 )
if (jcol == BADCOL)
printf("(%d) pzgstrf_column_dfs[YES] jcol-1 %d, jcolm1size %d, supno[%d] %d\n",
       pnum, jcolm1, jcolm1size, jcolm1, nsuper);
#endif
        if ( no_lsub != jcolm1size-1 )
            samesuper = NO;        /* Enforce T2 supernode */
        else {
            /* Make sure the number of columns in a supernode does not
               exceed threshold. */
            fsupc = xsup[nsuper];
            if ( jcol - fsupc >= maxsuper )
                samesuper = NO;
            else {
                /* start of a supernode in H (coarser partition) */
                if ( super_bnd[jcol] != 0 ) samesuper = NO;
            }
        }
    }

    /* If jcol starts a new supernode, allocate storage for
     * the subscript set of both first and last column of
     * a previous supernode. (first for num values, last for pruning)
     */
    if ( samesuper == NO ) { /* starts a new supernode */
        nsuper = NewNsuper(pnum, pxgstrf_shared, &Glu->nsuper);
        xsup[nsuper] = jcol;

        /* Copy column jcol; also reserve space to store pruned graph */
        if ((mem_error = Glu_alloc(pnum, jcol, 2*no_lsub, LSUB, &ito,
                                   pxgstrf_shared)))
            return mem_error;
        xlsub[jcol] = ito;
        /* Re-read the storage pointer after the allocation call. */
        lsub = Glu->lsub;
        for (ifrom = 0; ifrom < nextl; ++ifrom) {
            krow = col_lsub[ifrom];
            if ( perm_r[krow] == EMPTY ) /* Filter U-subscript */
                lsub[ito++] = krow;
        }
        k = ito;
        xlsub_end[jcol] = k;

        /* Make a copy in case it is a singleton supernode */
        for (ifrom = xlsub[jcol]; ifrom < ito; ++ifrom)
            lsub[k++] = lsub[ifrom];

    } else { /* Supernode of size > 1: overwrite column jcol-1 */
        k = xlsub_end[fsupc];
        xlsub[jcol] = k;
        xprune[fsupc] = k;
        for (ifrom = 0; ifrom < nextl; ++ifrom) {
            krow = col_lsub[ifrom];
            if ( perm_r[krow] == EMPTY ) /* Filter U-subscript */
                lsub[k++] = krow;
        }
        xlsub_end[jcol] = k;
    }

#if ( DEBUGlevel>=3 )
if (jcol == BADCOL) {
printf("(%d) pzgstrf_column_dfs[3]: %d in prev s-node %d? %d\n",
       pnum, jcol, fsupc, samesuper);
PrintInt10("lsub", xlsub_end[jcol]-xlsub[jcol], &lsub[xlsub[jcol]]);
}
#endif

    /* Tidy up the pointers before exit */
    xprune[jcol] = k;     /* upper bound for pruning */
    supno[jcol] = nsuper;
    xsup_end[nsuper] = jcol + 1;

    return 0;
}
void
pxgstrf_super_bnd_dfs(
                      const int  pnum, /* process number */
                      const int  m,    /* number of rows in the matrix */
                      const int  n,    /* number of columns in the matrix */
                      const int  jcol, /* first column of the H-supernode */
                      const int  w,    /* size of the H-supernode */
                      SuperMatrix *A,  /* original matrix */
                      int        *perm_r,   /* in */
                      int        *iperm_r,  /* in; inverse of perm_r */
                      int        *xprune,   /* in */
                      int        *ispruned, /* in */
                      int        *marker,   /* modified */
                      int        *parent,   /* working array */
                      int        *xplore,   /* working array */
                      pxgstrf_shared_t *pxgstrf_shared /* modified */
                      )
{
/*
 * -- SuperLU MT routine (version 1.0) --
 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
 * and Lawrence Berkeley National Lab.
 * August 15, 1997
 *
 * Purpose
 * =======
 *
 * Performs a symbolic structure prediction on a supernode in the Householder
 * matrix H, with jcol being the leading column.
 *
 * The routine counts the distinct not-yet-pivoted rows reachable from the
 * nonzeros of columns jcol .. jcol+w-1 of A, searching depth-first through
 * the stored L structure.  The count times w is then handed to
 * DynamicSetMap() for column jcol.
 *
 * marker[] is stamped with the value n + jcol; an entry carrying that
 * stamp has already been counted (row in L) or already been searched
 * (representative in U, addressed through iperm_r[]).
 *
 * The search stack is kept in parent[] (link to the caller) and xplore[]
 * (xplore[k] = resume position, xplore[m + k] = end position).
 *
 */
    GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */
    NCPformat  *Astore = A->Store;
    int *asub     = Astore->rowind;
    int *xa_begin = Astore->colbeg;
    int *xa_end   = Astore->colend;

    /* xsup / xsup_end keep their names for the SUPER_* / SINGLETON macros */
    int *xsup      = Glu->xsup;
    int *xsup_end  = Glu->xsup_end;
    int *supno     = Glu->supno;
    int *lsub      = Glu->lsub;
    int *xlsub     = Glu->xlsub;
    int *xlsub_end = Glu->xlsub_end;

    register int rep, child, child_perm, child_rep;
    register int rep_row;   /* representative numbered in the original A */
    register int row, row_perm, cursor, limit, up;
    register int first_col, nz, col;
    register int stamp = n + jcol;
    register int row_count = 0; /* union of the nonzero rows in a supernode */

    /* For each column in the H-supernode */
    for (col = jcol; col < jcol + w; ++col) {

        /* For each nonz in A[*,col] do dfs */
        for (nz = xa_begin[col]; nz < xa_end[col]; ++nz) {
            row = asub[nz];

            /* Already handled during this call. */
            if ( marker[row] == stamp ) continue;

            row_perm = perm_r[row];
            if ( row_perm == EMPTY ) {
                /* row is in L: count it once */
                marker[row] = stamp;
                ++row_count;
                continue;
            }

            /* row is in U: search from its supernode representative,
               unless that representative was searched before. */
            rep = SUPER_REP( supno[row_perm] );
            rep_row = iperm_r[rep];
            if ( marker[rep_row] == stamp ) continue;

            marker[rep_row] = stamp;
            parent[rep] = EMPTY;
            if ( ispruned[rep] ) {
                cursor = SINGLETON( supno[rep] ) ? xlsub_end[rep]
                                                 : xlsub[rep];
                limit = xprune[rep];
            } else {
                first_col = SUPER_FSUPC( supno[rep] );
                cursor = xlsub[first_col] + rep-first_col+1;
                limit = xlsub_end[first_col];
            }

            for (;;) {
                /* Scan the remaining neighbours of rep */
                while ( cursor < limit ) {
                    child = lsub[cursor];
                    cursor++;

                    if ( marker[child] == stamp ) continue;

                    child_perm = perm_r[child];
                    if ( child_perm == EMPTY ) {
                        /* child is in L */
                        marker[child] = stamp;
                        ++row_count;
                        continue;
                    }

                    /* child is in U: descend to its representative
                       unless it has been searched already. */
                    child_rep = SUPER_REP( supno[child_perm] );
                    rep_row = iperm_r[child_rep];
                    if ( marker[rep_row] == stamp ) continue;

                    marker[rep_row] = stamp;
                    xplore[rep] = cursor;      /* remember where to resume */
                    xplore[m + rep] = limit;
                    parent[child_rep] = rep;
                    rep = child_rep;           /* Go deeper down G(L^t) */
                    if ( ispruned[rep] ) {
                        cursor = SINGLETON( supno[rep] ) ? xlsub_end[rep]
                                                         : xlsub[rep];
                        limit = xprune[rep];
                    } else {
                        first_col = SUPER_FSUPC(supno[rep]);
                        cursor = xlsub[first_col] + rep-first_col+1;
                        limit = xlsub_end[first_col];
                    }
                } /* while cursor < limit */

                /* rep is exhausted: return to the representative that
                   led here, or stop when the stack is empty. */
                up = parent[rep];
                if ( up == EMPTY ) break;
                rep = up;
                cursor = xplore[rep];
                limit = xplore[m+rep];
            }
        } /* for each nonz in A[*,col] */
    } /* for col ... */

    DynamicSetMap(pnum, jcol, row_count * w, pxgstrf_shared);

/*    for (i = 1; i < w; ++i) Glu->map_in_sup[jcol + i] = -i;*/

#if ( DEBUGlevel>=1 )
printf("(%d) pxgstrf_super_bnd_dfs(): jcol= %d, w= %d, nrow= %d\n",
       pnum, jcol, w, row_count);
#endif
}
void
pdgstrf_panel_dfs(
                  const int  pnum,  /* process number */
                  const int  m,     /* number of rows in the matrix */
                  const int  w,     /* current panel width */
                  const int  jcol,  /* leading column of the current panel */
                  SuperMatrix *A,   /* original matrix */
                  int *perm_r,      /* row pivotings that are done so far */
                  int *xprune,      /* in */
                  int *ispruned,    /* in */
                  int *lbusy,       /* in; size n */
                  int *nseg,        /* out */
                  int *panel_lsub,  /* out */
                  int *w_lsub_end,  /* out; values irrelevant on entry */
                  int *segrep,      /* out */
                  int *repfnz,      /* out */
                  int *marker,      /* modified */
                  int *spa_marker,  /* modified; size n-by-w */
                  int *parent,      /* working array */
                  int *xplore,      /* working array */
                  double *dense,    /* out; size n-by-w */
                  GlobalLU_t *Glu   /* modified */
                  )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
 *   It skips all those busy descendants that are worked on by other
 *   processors along the e-tree path.
 *
 *   For each column jj of the panel the routine
 *     - copies the numerical values of A[*,jj] into that column's slice
 *       of dense[],
 *     - collects the not-yet-pivoted rows reached from A[*,jj] into that
 *       column's slice of panel_lsub[] and stores their count in
 *       w_lsub_end[jj-jcol],
 *     - records, per supernode representative, the first nonzero of the
 *       corresponding U-segment in that column's slice of repfnz[].
 *   The per-column slices of panel_lsub[], spa_marker[], repfnz[] and
 *   dense[] are advanced by m entries from one column to the next.
 *
 * Notes
 * =====
 *
 * (1) panel_lsub[0:w*n-1]: temporary for the nonzero row indices below
 *     the panel diagonal, which will be used later in the inner LU
 *     factorization. For the busy columns, some of the nonzeros in U
 *     may be mistakenly placed in this list, because "perm_r" is
 *     still "empty". Later, during dcolumn_dfs in the inner factorization,
 *     we must filter those nonzeros belonging in U.
 *
 * (2) A supernode representative is the last column of a supernode.
 *     The nonzeros in U[*,j] are segments that end at supernodal
 *     representatives.
 *
 * (3) The routine returns one list of the supernodal representatives
 *     in topological order of the DFS that generates them. This list is
 *     a superset of the topological order of each individual column within
 *     the panel. The location of the first nonzero in each supernodal
 *     segment (supernodal entry location) is also returned. Each column
 *     has a separate list for this purpose.
 *
 * (4) Two marker arrays are used to facilitate dfs:
 *     col_marker[i] == jj, if i was visited during dfs of current column jj
 *                          (col_marker is the slice of spa_marker[] that
 *                          belongs to column jj);
 *     marker1[i] == jcol, if i was visited by earlier columns in this panel
 *                          (marker1 starts m entries into marker[]);
 *
 * (5) The dfs stack is the combination of xplore[2*m] and parent[m]:
 *     xplore[k]     - pointer to k's adjacency list where search begins
 *     xplore[m + k] - pointer to k's adjacency list where search ends
 *
 * (6) Array mappings
 *     marker: A-row --> A-row/col (0/1)
 *     repfnz: SuperA-col --> PA-row
 *     parent: SuperA-col --> SuperA-col
 *     xplore: SuperA-col --> index to L-structure
 *
 */
    NCPformat *Astore;
    double    *a;
    int       *asub;
    int       *xa_begin, *xa_end;
    register int krep, chperm, chmark, chrep, kchild, myfnz;
    register int k, krow, kmark, kperm, fsupc;
    register int xdfs, maxdfs, kpar, jj, nextp;
    register int nextl_col;/* next open position in panel_lsub[*,jj] */
    int       *marker1;    /* marker1[jj] == jcol if vertex jj was visited
                              by a previous column within this panel. */
    int       *repfnz_col; /* start of each column in the panel */
    double    *dense_col;  /* start of each column in the panel */
    /* NOTE(review): xsup and xsup_end are only assigned below; presumably
       the SUPER_REP / SUPER_FSUPC / SINGLETON macros refer to them by
       name -- confirm against the macro definitions. */
    int       *xsup, *xsup_end, *supno, *lsub, *xlsub, *xlsub_end;
    int       *col_marker; /* marker array of each column in the panel */

    /* Initialize pointers */
    xsup       = Glu->xsup;
    xsup_end   = Glu->xsup_end;
    supno      = Glu->supno;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;
    Astore     = A->Store;
    a          = Astore->nzval;
    asub       = Astore->rowind;
    xa_begin   = Astore->colbeg;
    xa_end     = Astore->colend;
    marker1    = marker + m;
    repfnz_col = repfnz;
    dense_col  = dense;
    nextp      = 0;   /* offset of the current column's slice */
    *nseg      = 0;

#if ( DEBUGlevel>=2 )
if (jcol == BADPAN)
printf("(%d) pdgstrf_panel_dfs[begin] jcol %d, w %d\n", pnum, jcol, w);
#endif

    /*
     * For each column in the panel ...
     */
    for (jj = jcol; jj < jcol + w; ++jj, nextp += m) {
        nextl_col = nextp;
        col_marker = &spa_marker[nextp];

        /*
         * For each nonz in A[*,jj] perform dfs ...
         */
        for (k = xa_begin[jj]; k < xa_end[jj]; ++k) {
            krow = asub[k];
            /* The value is stored even when the row was visited before. */
            dense_col[krow] = a[k];
            kmark = col_marker[krow];

            /* if krow was visited before, go to the next nonzero */
            if ( kmark == jj ) continue;

            /*
             * For each unmarked nbr krow of jj ...
             */
            col_marker[krow] = jj;
            kperm = perm_r[krow];

            if ( kperm == EMPTY ) {
                /* krow is in L: place it in structure of L[*,jj].
                 * NOTE: some entries in U may get here, because "perm_r"
                 *       is not yet available from a preceding busy column.
                 */
                panel_lsub[nextl_col++] = krow; /* krow is indexed into A */
            } else {
                /*
                 * krow is in U (0 <= kperm < jcol): if its supernode
                 * representative krep has been explored, update repfnz[*].
                 */
                if ( lbusy[kperm] == jcol ) { /* kperm is busy */
#if ( DEBUGlevel>=3 )
if (jj == BADCOL)
printf("(%d) pdgstrf_panel_dfs(%d) skip busy krow %d, kperm %d\n",
       pnum, jj, krow, kperm);
#endif
                    continue;
                }

                /* Here, krep cannot possibly be "busy" */
                krep = SUPER_REP( supno[kperm] );
                myfnz = repfnz_col[krep];

#ifdef CHK_DFS
if (jj == BADCOL)
printf("(%d) pdgstrf_panel_dfs[1] %d, krep %d, fsupc %d, Pr[krow %d] %d, myfnz %d\n",
       pnum, jj, krep, SUPER_FSUPC(supno[krep]), krow, kperm, myfnz);
#endif
                if ( myfnz != EMPTY ) { /* Representative visited before */
                    /* Keep the smallest pivot position seen so far. */
                    if ( myfnz > kperm ) repfnz_col[krep] = kperm;
                    /* continue; */
                } else {
                    /* Otherwise, performs dfs starting from krep */
                    parent[krep] = EMPTY;
                    repfnz_col[krep] = kperm;
                    /* Select the range [xdfs, maxdfs) of lsub[] to scan
                       for krep, depending on whether it is pruned. */
                    if ( ispruned[krep] ) {
                        if ( SINGLETON( supno[krep] ) )
                            xdfs = xlsub_end[krep];
                        else xdfs = xlsub[krep];
                        maxdfs = xprune[krep];
#ifdef PROFILE
                        /*Gstat->procstat[pnum].pruned++;*/
#endif
                    } else {
                        fsupc = SUPER_FSUPC( supno[krep] );
                        xdfs = xlsub[fsupc] + krep-fsupc+1;
                        maxdfs = xlsub_end[fsupc];
#ifdef PROFILE
                        /*Gstat->procstat[pnum].unpruned++;*/
#endif
                    }

#ifdef CHK_DFS
if (jj == BADCOL) {
    register int i;
    printf("(%d) pdgstrf_panel_dfs[2] %d, ispruned[%d] %d, xdfs %d, maxdfs %d\n",
           pnum, jj, krep, ispruned[krep], xdfs, maxdfs);
    /*for (i = xdfs; i < maxdfs; i++) printf("(%d) lsub-%d", pnum, lsub[i]);*/
    printf("\n");
}
#endif
                    do {
                        while ( xdfs < maxdfs ) {
                            /* for each unmarked kchild of krep ... */
                            kchild = lsub[xdfs];
                            xdfs++;
                            chmark = col_marker[kchild];

                            if ( chmark != jj ) { /* Not reached yet */
                                col_marker[kchild] = jj;
                                chperm = perm_r[kchild];

                                if ( chperm == EMPTY ) {
                                    /* kchild is in L: place it in L[*,j]. */
                                    panel_lsub[nextl_col++] = kchild;
                                } else {
                                    /* kchild is in U (0 <= chperm < jcol):
                                     * chrep = its supernode-rep. If its rep
                                     * has been explored, update its repfnz[*].
                                     */

                                    /* A busy child is skipped; this
                                       "continue" resumes the while loop
                                       with the next entry of lsub[]. */
                                    if ( lbusy[chperm] == jcol ) {
#ifdef DEBUG
if (jj == BADCOL)
printf("(%d) pdgstrf_panel_dfs(%d) skip busy kchild %d, chperm %d\n",
       pnum, jj, kchild, chperm);
#endif
                                        continue;
                                    }

                                    chrep = SUPER_REP( supno[chperm] );
                                    myfnz = repfnz_col[chrep];
#ifdef DEBUG
if (jj == BADCOL)
printf("(%d) pdgstrf_panel_dfs[3] %d, krep %d, Pr[kchild %d] %d, chrep %d, fsupc %d, myfnz %d\n",
       pnum, jj, krep, kchild, chperm, chrep,
       SUPER_FSUPC(supno[chrep]), myfnz);
#endif
                                    if ( myfnz != EMPTY ) {/* Visited before */
                                        if ( myfnz > chperm )
                                            repfnz_col[chrep] = chperm;
                                    } else {
                                        /* Cont. dfs at snode-rep of kchild */
                                        /* Save where to resume in krep. */
                                        xplore[krep] = xdfs;
                                        xplore[m + krep] = maxdfs;
                                        parent[chrep] = krep;
                                        krep = chrep; /* Go deeper down G(L) */
                                        repfnz_col[krep] = chperm;
                                        if ( ispruned[krep] ) {
                                            if ( SINGLETON( supno[krep] ) )
                                                xdfs = xlsub_end[krep];
                                            else xdfs = xlsub[krep];
                                            maxdfs = xprune[krep];
#ifdef PROFILE
                                            /*procstat[pnum].pruned++;*/
#endif
                                        } else {
                                            fsupc = SUPER_FSUPC(supno[krep]);
                                            xdfs = xlsub[fsupc] + krep-fsupc+1;
                                            maxdfs = xlsub_end[fsupc];
#ifdef PROFILE
                                            /*procstat[pnum].unpruned++;*/
#endif
                                        }
#ifdef CHK_DFS
if (jj == BADCOL)
printf("(%d) pdgstrf_panel_dfs[4] %d, ispruned[%d] %d, xdfs %d, maxdfs %d\n",
       pnum, jj, krep, ispruned[krep], xdfs, maxdfs);
#endif
                                    } /* else */
                                } /* else */

                            } /* if... */

                        } /* while xdfs < maxdfs */

                        /* krow has no more unexplored nbrs:
                         *    Place snode-rep krep in postorder DFS, if this
                         *    segment is seen for the first time. (Note that
                         *    "repfnz[krep]" may change later.)
                         *    Backtrack dfs to its parent.
                         */
                        if ( marker1[krep] != jcol ) {
                            segrep[*nseg] = krep;
                            ++(*nseg);
                            marker1[krep] = jcol;
#ifdef CHK_DFS
if (jj == BADCOL)
printf("(%d) pdgstrf_panel_dfs(%d) repfnz[%d] %d added to top.list by jj %d\n",
       pnum, jj, krep, repfnz_col[krep], jj);
#endif
                        }

                        kpar = parent[krep]; /* Pop stack, mimic recursion */
                        if ( kpar == EMPTY ) break; /* dfs done */
                        krep = kpar;
                        xdfs = xplore[krep];
                        maxdfs = xplore[m + krep];
#ifdef CHK_DFS
if (jj == BADCOL) {
    register int i;
    printf("(%d) pdgstrf_panel_dfs[5] pop stack: %d, krep %d, xdfs %d, maxdfs %d\n",
           pnum, jj, krep, xdfs, maxdfs);
    /* for (i = xdfs; i < maxdfs; i++) printf("(%d) lsub-%d", pnum, lsub[i]);*/
    printf("\n");
}
#endif
                    } while ( kpar != EMPTY ); /* until empty stack */

                } /* else: myfnz == EMPTY */

            } /* else: kperm != EMPTY */

        } /* for each nonzero in A[*,jj] */

#if ( DEBUGlevel>=3 )
if (jj == BADCOL) {
#define REPCOL 0
    krep = REPCOL;
    printf("(%d) pdgstrf_panel_dfs(end) w_lsub_end[jj=%d] %d, repfnz_col[%d] %d\n",
           pnum, jj, nextl_col - nextp, krep, repfnz_col[krep]);
    PrintInt10("lsub", nextl_col - nextp, &panel_lsub[nextp]);
}
#endif

        /* Number of row indices collected for column jj. */
        w_lsub_end[jj-jcol] = nextl_col - nextp;
        /* Advance to the next column's slice. */
        repfnz_col += m;
        dense_col += m;

    } /* for jj ... */

}