void dgstrf (superlu_options_t *options, SuperMatrix *A, double drop_tol, int relax, int panel_size, int *etree, void *work, int lwork, int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * DGSTRF computes an LU factorization of a general sparse m-by-n * matrix A using partial pivoting with row interchanges. * The factorization has the form * Pr * A = L * U * where Pr is a row permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper * triangular (upper trapezoidal if A->nrow < A->ncol). * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * * A (input) SuperMatrix* * Original matrix A, permuted by columns, of dimension * (A->nrow, A->ncol). The type of A can be: * Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. * * drop_tol (input) double (NOT IMPLEMENTED) * Drop tolerance parameter. At step j of the Gaussian elimination, * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. * 0 <= drop_tol <= 1. The default value of drop_tol is 0. * * relax (input) int * To control degree of relaxing supernodes. If the number * of nodes (columns) in a subtree of the elimination tree is less * than relax, this subtree is considered as one supernode, * regardless of the row structures of those columns. * * panel_size (input) int * A panel consists of at most panel_size consecutive columns. * * etree (input) int*, dimension (A->ncol) * Elimination tree of A'*A. * Note: etree is a vector of parent pointers for a forest whose * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. * On input, the columns of A should be permuted so that the * etree is in a certain postorder. * * work (input/output) void*, size (lwork) (in bytes) * User-supplied work space and space for the output data structures. * Not referenced if lwork = 0; * * lwork (input) int * Specifies the size of work array in bytes. * = 0: allocate space internally by system malloc; * > 0: use user-supplied work array of length lwork in bytes, * returns error if space runs out. * = -1: the routine guesses the amount of space needed without * performing the factorization, and returns it in * *info; no other side effects. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * When searching for diagonal, perm_c[*] is applied to the * row subscripts of A, so that diagonal threshold pivoting * can find the diagonal of A, rather than that of A*Pc. * * perm_r (input/output) int*, dimension (A->nrow) * Row permutation vector which defines the permutation matrix Pr, * perm_r[i] = j means row i of A is in position j in Pr*A. * If options->Fact = SamePattern_SameRowPerm, the pivoting routine * will try to use the input perm_r, unless a certain threshold * criterion is violated. In that case, perm_r is overwritten by * a new permutation determined by partial pivoting or diagonal * threshold pivoting. * Otherwise, perm_r is output argument; * * L (output) SuperMatrix* * The factor L from the factorization Pr*A=L*U; use compressed row * subscripts storage for supernodes, i.e., L has type: * Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. * * U (output) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise * storage scheme, i.e., U has types: Stype = SLU_NC, * Dtype = SLU_D, Mtype = SLU_TRU. * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. If lwork = -1, it is * the estimated amount of space needed, plus A->ncol. * * ====================================================================== * * Local Working Arrays: * ====================== * m = number of rows in the matrix * n = number of columns in the matrix * * xprune[0:n-1]: xprune[*] points to locations in subscript * vector lsub[*]. For column i, xprune[i] denotes the point where * structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need * to be traversed for symbolic factorization. * * marker[0:3*m-1]: marker[i] = j means that node i has been * reached when working on column j. * Storage: relative to original row subscripts * NOTE: There are 3 of them: marker/marker1 are used for panel dfs, * see dpanel_dfs.c; marker2 is used for inner-factorization, * see dcolumn_dfs.c. * * parent[0:m-1]: parent vector used during dfs * Storage: relative to new row subscripts * * xplore[0:m-1]: xplore[i] gives the location of the next (dfs) * unexplored neighbor of i in lsub[*] * * segrep[0:nseg-1]: contains the list of supernodal representatives * in topological order of the dfs. A supernode representative is the * last column of a supernode. * The maximum size of segrep[] is n. * * repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a * supernodal representative r, repfnz[r] is the location of the first * nonzero in this segment. It is also used during the dfs: repfnz[r]>0 * indicates the supernode r has been explored. * NOTE: There are W of them, each used for one column of a panel. * * panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below * the panel diagonal. These are filled in during dpanel_dfs(), and are * used later in the inner LU factorization within the panel. * panel_lsub[]/dense[] pair forms the SPA data structure. * NOTE: There are W of them. * * dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values; * NOTE: there are W of them. * * tempv[0:*]: real temporary used for dense numeric kernels; * The size of this array is defined by NUM_TEMPV() in dsp_defs.h. * */ /* Local working arrays */ NCPformat *Astore; int *iperm_r = NULL; /* inverse of perm_r; used when options->Fact == SamePattern_SameRowPerm */ int *iperm_c; /* inverse of perm_c */ int *iwork; double *dwork; int *segrep, *repfnz, *parent, *xplore; int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ int *xprune; int *marker; double *dense, *tempv; int *relax_end; double *a; int *asub; int *xa_begin, *xa_end; int *xsup, *supno; int *xlsub, *xlusup, *xusub; int nzlumax; static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ /* Local scalars */ fact_t fact = options->Fact; double diag_pivot_thresh = options->DiagPivotThresh; int pivrow; /* pivotal row number in the original matrix A */ int nseg1; /* no of segments in U-column above panel row jcol */ int nseg; /* no of segments in each U-column */ register int jcol; register int kcol; /* end column of a relaxed snode */ register int icol; register int i, k, jj, new_next, iinfo; int m, n, min_mn, jsupno, fsupc, nextlu, nextu; int w_def; /* upper bound on panel width */ int usepr, iperm_r_allocated = 0; int nnzL, nnzU; int *panel_histo = stat->panel_histo; flops_t *ops = stat->ops; iinfo = 0; m = A->nrow; n = A->ncol; min_mn = SUPERLU_MIN(m, n); Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; /* Allocate storage common to the factor routines */ *info = dLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size, L, U, &Glu, &iwork, &dwork); if ( *info ) return; xsup = Glu.xsup; supno = Glu.supno; xlsub = Glu.xlsub; xlusup = Glu.xlusup; xusub = Glu.xusub; SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, &repfnz, &panel_lsub, &xprune, &marker); dSetRWork(m, panel_size, dwork, &dense, &tempv); usepr = (fact == SamePattern_SameRowPerm); if ( usepr ) { /* Compute the inverse of perm_r */ iperm_r = (int *) intMalloc(m); for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; iperm_r_allocated = 1; } iperm_c = (int *) intMalloc(n); for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; /* Identify relaxed snodes */ relax_end = (int *) intMalloc(n); if ( options->SymmetricMode == YES ) { heap_relax_snode(n, etree, relax, marker, relax_end); } else { relax_snode(n, etree, relax, marker, relax_end); } ifill (perm_r, m, EMPTY); ifill (marker, m * NO_MARKER, EMPTY); supno[0] = -1; xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; w_def = panel_size; /* * Work on one "panel" at a time. A panel is one of the following: * (a) a relaxed supernode at the bottom of the etree, or * (b) panel_size contiguous columns, defined by the user */ for (jcol = 0; jcol < min_mn; ) { if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ kcol = relax_end[jcol]; /* end of the relaxed snode */ panel_histo[kcol-jcol+1]++; /* -------------------------------------- * Factorize the relaxed supernode(jcol:kcol) * -------------------------------------- */ /* Determine the union of the row structure of the snode */ if ( (*info = dsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, xprune, marker, &Glu)) != 0 ) return; nextu = xusub[jcol]; nextlu = xlusup[jcol]; jsupno = supno[jcol]; fsupc = xsup[jsupno]; new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); nzlumax = Glu.nzlumax; while ( new_next > nzlumax ) { if ( (*info = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) ) return; } for (icol = jcol; icol<= kcol; icol++) { xusub[icol+1] = nextu; /* Scatter into SPA dense[*] */ for (k = xa_begin[icol]; k < xa_end[icol]; k++) dense[asub[k]] = a[k]; /* Numeric update within the snode */ dsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); if ( (*info = dpivotL(icol, diag_pivot_thresh, &usepr, perm_r, iperm_r, iperm_c, &pivrow, &Glu, stat)) ) if ( iinfo == 0 ) iinfo = *info; #ifdef DEBUG dprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); #endif } jcol = icol; } else { /* Work on one panel of panel_size columns */ /* Adjust panel_size so that a panel won't overlap with the next * relaxed snode. */ panel_size = w_def; for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) if ( relax_end[k] != EMPTY ) { panel_size = k - jcol; break; } if ( k == min_mn ) panel_size = min_mn - jcol; panel_histo[panel_size]++; /* symbolic factor on a panel of columns */ dpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, dense, panel_lsub, segrep, repfnz, xprune, marker, parent, xplore, &Glu); /* numeric sup-panel updates in topological order */ dpanel_bmod(m, panel_size, jcol, nseg1, dense, tempv, segrep, repfnz, &Glu, stat); /* Sparse LU within the panel, and below panel diagonal */ for ( jj = jcol; jj < jcol + panel_size; jj++) { k = (jj - jcol) * m; /* column index for w-wide arrays */ nseg = nseg1; /* Begin after all the panel segments */ if ((*info = dcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], segrep, &repfnz[k], xprune, marker, parent, xplore, &Glu)) != 0) return; /* Numeric updates */ if ((*info = dcolumn_bmod(jj, (nseg - nseg1), &dense[k], tempv, &segrep[nseg1], &repfnz[k], jcol, &Glu, stat)) != 0) return; /* Copy the U-segments to ucol[*] */ if ((*info = dcopy_to_ucol(jj, nseg, segrep, &repfnz[k], perm_r, &dense[k], &Glu)) != 0) return; if ( (*info = dpivotL(jj, diag_pivot_thresh, &usepr, perm_r, iperm_r, iperm_c, &pivrow, &Glu, stat)) ) if ( iinfo == 0 ) iinfo = *info; /* Prune columns (0:jj-1) using column jj */ dpruneL(jj, perm_r, pivrow, nseg, segrep, &repfnz[k], xprune, &Glu); /* Reset repfnz[] for this column */ resetrep_col (nseg, segrep, &repfnz[k]); #ifdef DEBUG dprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); #endif } jcol += panel_size; /* Move to the next panel */ } /* else */ } /* for */ *info = iinfo; if ( m > n ) { k = 0; for (i = 0; i < m; ++i) if ( perm_r[i] == EMPTY ) { perm_r[i] = n + k; ++k; } } countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); fixupL(min_mn, perm_r, &Glu); dLUWorkFree(iwork, dwork, &Glu); /* Free work space and compress storage */ if ( fact == SamePattern_SameRowPerm ) { /* L and U structures may have changed due to possibly different pivoting, even though the storage is available. There could also be memory expansions, so the array locations may have changed, */ ((SCformat *)L->Store)->nnz = nnzL; ((SCformat *)L->Store)->nsuper = Glu.supno[n]; ((SCformat *)L->Store)->nzval = Glu.lusup; ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; ((SCformat *)L->Store)->rowind = Glu.lsub; ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; ((NCformat *)U->Store)->nnz = nnzU; ((NCformat *)U->Store)->nzval = Glu.ucol; ((NCformat *)U->Store)->rowind = Glu.usub; ((NCformat *)U->Store)->colptr = Glu.xusub; } else { dCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, Glu.xsup, SLU_SC, SLU_D, SLU_TRLU); dCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, Glu.usub, Glu.xusub, SLU_NC, SLU_D, SLU_TRU); } ops[FACT] += ops[TRSV] + ops[GEMV]; if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); SUPERLU_FREE (iperm_c); SUPERLU_FREE (relax_end); }
void *pzgstrf_thread(void *arg) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * * Purpose * ======= * * This is the slave process, representing the main scheduling loop to * perform the factorization. Each process executes a copy of the * following code ... (SPMD paradigm) * * Working arrays local to each process * ====================================== * marker[0:3*m-1]: marker[i] == j means node i has been reached when * working on column j. * Storage: relative to original row subscripts * * THERE ARE 3 OF THEM: * marker[0 : m-1]: used by pzgstrf_factor_snode() and * pzgstrf_panel_dfs(); * marker[m : 2m-1]: used by pzgstrf_panel_dfs() and * pxgstrf_super_bnd_dfs(); * values in [0 : n-1] when used by pzgstrf_panel_dfs() * values in [n : 2n-1] when used by pxgstrf_super_bnd_dfs() * marker[2m : 3m-1]: used by pzgstrf_column_dfs() in inner-factor * * parent[0:n-1]: parent vector used during dfs * Storage: relative to new row subscripts * * xplore[0:2m-1]: xplore[i] gives the location of the next (dfs) * unexplored neighbor of i in lsub[*]; xplore[n+i] gives the * location of the last unexplored neighbor of i in lsub[*]. * * segrep[0:nseg-1]: contains the list of supernodal representatives * in topological order of the dfs. A supernode representative is the * last column of a supernode. * * repfnz[0:m-1]: for a nonzero segment U[*,j] that ends at a * supernodal representative r, repfnz[r] is the location of the first * nonzero in this segment. It is also used during the dfs: * repfnz[r]>0 indicates that supernode r has been explored. * NOTE: There are w of them, each used for one column of a panel. * * panel_lsub[0:w*m-1]: temporary for the nonzero row indices below * the panel diagonal. These are filled in during pzgstrf_panel_dfs(), * and are used later in the inner LU factorization. * panel_lsub[]/dense[] pair forms the SPA data structure. * NOTE: There are w of them. * * dense[0:w*m-1]: sparse accumulator (SPA) for intermediate values; * NOTE: there are w of them. * * tempv[0:m-1]: real temporary used for dense numeric kernels; * * * Scheduling algorithm (For each process ...) * ==================== * Shared task Q <-- { relaxed s-nodes (CANGO) }; * * WHILE (not finished) * * panel = Scheduler(Q); (see pxgstrf_scheduler.c for policy) * * IF (panel == RELAXED_SNODE) * factor_relax_snode(panel); * ELSE * * pzgstrf_panel_dfs() * - skip all BUSY s-nodes (or panels) * * * dpanel_bmod() * - updates from DONE s-nodes * - wait for BUSY s-nodes to become DONE * * * inner-factor() * - identical as it is in the sequential algorithm, * except that pruning() will interact with the * pzgstrf_panel_dfs() of other panels. * ENDIF * * END WHILE; * */ #if ( MACH==SGI || MACH==ORIGIN ) #if ( MACH==SGI ) int pnum = mpc_my_threadnum(); #elif ( MACH==ORIGIN ) int pnum = mp_my_threadnum(); #endif pzgstrf_threadarg_t *thr_arg = &((pzgstrf_threadarg_t *)arg)[pnum]; #else pzgstrf_threadarg_t *thr_arg = arg; int pnum = thr_arg->pnum; #endif /* Unpack the options argument */ superlumt_options_t *superlumt_options = thr_arg->superlumt_options; pxgstrf_shared_t *pxgstrf_shared= thr_arg->pxgstrf_shared; int panel_size = superlumt_options->panel_size; double diag_pivot_thresh = superlumt_options->diag_pivot_thresh; yes_no_t *usepr = &superlumt_options->usepr; /* may be modified */ int *etree = superlumt_options->etree; int *super_bnd = superlumt_options->part_super_h; int *perm_r = superlumt_options->perm_r; int *inv_perm_c= pxgstrf_shared->inv_perm_c; int *inv_perm_r= pxgstrf_shared->inv_perm_r; int *xprune = pxgstrf_shared->xprune; int *ispruned = pxgstrf_shared->ispruned; SuperMatrix *A = pxgstrf_shared->A; GlobalLU_t *Glu = pxgstrf_shared->Glu; Gstat_t *Gstat = pxgstrf_shared->Gstat; int *info = &thr_arg->info; /* Local working arrays */ int *iwork; doublecomplex *dwork; int *segrep, *repfnz, *parent, *xplore; int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ int *marker, *marker1, *marker2; int *lbusy; /* "Local busy" array, indicates which descendants were busy when this panel's computation began. Those columns (s-nodes) are treated specially during pzgstrf_panel_dfs() and dpanel_bmod(). */ int *spa_marker; /* size n-by-w */ int *w_lsub_end; /* record the end of each column in panel_lsub */ doublecomplex *dense, *tempv; int *lsub, *xlsub, *xlsub_end; /* Local scalars */ register int m, n, k, jj, jcolm1, itemp, singular; int pivrow; /* pivotal row number in the original matrix A */ int nseg1; /* no of segments in U-column above panel row jcol */ int nseg; /* no of segments in each U-column */ int w, bcol, jcol; #ifdef PROFILE double *utime = Gstat->utime; double t1, t2, t, stime; register float flopcnt; #endif #ifdef PREDICT_OPT flops_t *ops = Gstat->ops; register float pdiv; #endif #if ( DEBUGlevel>=1 ) printf("(%d) thr_arg-> pnum %d, info %d\n", pnum, thr_arg->pnum, thr_arg->info); #endif singular = 0; m = A->nrow; n = A->ncol; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; /* Allocate and initialize the per-process working storage. */ if ( (*info = pzgstrf_WorkInit(m, panel_size, &iwork, &dwork)) ) { *info += pzgstrf_memory_use(Glu->nzlmax, Glu->nzumax, Glu->nzlumax); return 0; } pxgstrf_SetIWork(m, panel_size, iwork, &segrep, &parent, &xplore, &repfnz, &panel_lsub, &marker, &lbusy); pzgstrf_SetRWork(m, panel_size, dwork, &dense, &tempv); /* New data structures to facilitate parallel algorithm */ spa_marker = intMalloc(m * panel_size); w_lsub_end = intMalloc(panel_size); ifill (spa_marker, m * panel_size, EMPTY); ifill (marker, m * NO_MARKER, EMPTY); ifill (lbusy, m, EMPTY); jcol = EMPTY; marker1 = marker + m; marker2 = marker + 2*m; #ifdef PROFILE stime = SuperLU_timer_(); #endif /* ------------------------- Main loop: repeatedly ... ------------------------- */ while ( pxgstrf_shared->tasks_remain > 0 ) { #ifdef PROFILE TIC(t); #endif /* Get a panel from the scheduler. */ pxgstrf_scheduler(pnum, n, etree, &jcol, &bcol, pxgstrf_shared); #if ( DEBUGlevel>=1 ) if ( jcol>=LOCOL && jcol<=HICOL ) { printf("(%d) Scheduler(): jcol %d, bcol %d, tasks_remain %d\n", pnum, jcol, bcol, pxgstrf_shared->tasks_remain); fflush(stdout); } #endif #ifdef PROFILE TOC(t2, t); Gstat->procstat[pnum].skedtime += t2; #endif if ( jcol != EMPTY ) { w = pxgstrf_shared->pan_status[jcol].size; #if ( DEBUGlevel>=3 ) printf("P%2d got panel %5d-%5d\ttime %.4f\tpanels_left %d\n", pnum, jcol, jcol+w-1, SuperLU_timer_(), pxgstrf_shared->tasks_remain); fflush(stdout); #endif /* Nondomain panels */ #ifdef PROFILE flopcnt = Gstat->procstat[pnum].fcops; Gstat->panstat[jcol].pnum = pnum; TIC(t1); Gstat->panstat[jcol].starttime = t1; #endif if ( pxgstrf_shared->pan_status[jcol].type == RELAXED_SNODE ) { #ifdef PREDICT_OPT pdiv = Gstat->procstat[pnum].fcops; #endif /* A relaxed supernode at the bottom of the etree */ pzgstrf_factor_snode (pnum, jcol, A, diag_pivot_thresh, usepr, perm_r, inv_perm_r, inv_perm_c, xprune, marker, panel_lsub, dense, tempv, pxgstrf_shared, info); if ( *info ) { if ( *info > n ) return 0; else if ( singular == 0 || *info < singular ) singular = *info; #if ( DEBUGlevel>=1 ) printf("(%d) After pzgstrf_factor_snode(): singular=%d\n", pnum, singular); #endif } /* Release the whole relaxed supernode */ for (jj = jcol; jj < jcol + w; ++jj) pxgstrf_shared->spin_locks[jj] = 0; #ifdef PREDICT_OPT pdiv = Gstat->procstat[pnum].fcops - pdiv; cp_panel[jcol].pdiv = pdiv; #endif } else { /* Regular panel */ #ifdef PROFILE TIC(t); #endif pxgstrf_mark_busy_descends(pnum, jcol, etree, pxgstrf_shared, &bcol, lbusy); /* Symbolic factor on a panel of columns */ pzgstrf_panel_dfs (pnum, m, w, jcol, A, perm_r, xprune,ispruned,lbusy, &nseg1, panel_lsub, w_lsub_end, segrep, repfnz, marker, spa_marker, parent, xplore, dense, Glu); #if ( DEBUGlevel>=2 ) if ( jcol==BADPAN ) printf("(%d) After pzgstrf_panel_dfs(): nseg1 %d, w_lsub_end %d\n", pnum, nseg1, w_lsub_end[0]); #endif #ifdef PROFILE TOC(t2, t); utime[DFS] += t2; #endif /* Numeric sup-panel updates in topological order. * On return, the update values are temporarily stored in * the n-by-w SPA dense[m,w]. */ pzgstrf_panel_bmod (pnum, m, w, jcol, bcol, inv_perm_r, etree, &nseg1, segrep, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, pxgstrf_shared); /* * All "busy" descendants are "done" now -- * Find the set of row subscripts in the preceeding column * "jcol-1" of the current panel. Column "jcol-1" is * usually taken by a process other than myself. * This row-subscripts information will be used by myself * during column dfs to detect whether "jcol" belongs * to the same supernode as "jcol-1". * * ACCORDING TO PROFILE, THE AMOUNT OF TIME SPENT HERE * IS NEGLIGIBLE. */ jcolm1 = jcol - 1; itemp = xlsub_end[jcolm1]; for (k = xlsub[jcolm1]; k < itemp; ++k) marker2[lsub[k]] = jcolm1; #ifdef PREDICT_OPT pdiv = Gstat->procstat[pnum].fcops; #endif /* Inner-factorization, using sup-col algorithm */ for ( jj = jcol; jj < jcol + w; jj++) { k = (jj - jcol) * m; /* index into w-wide arrays */ nseg = nseg1; /* begin after all the panel segments */ #ifdef PROFILE TIC(t); #endif /* Allocate storage for the current H-supernode. */ if ( Glu->dynamic_snode_bound && super_bnd[jj] ) { /* jj starts a supernode in H */ pxgstrf_super_bnd_dfs (pnum, m, n, jj, super_bnd[jj], A, perm_r, inv_perm_r, xprune, ispruned, marker1, parent, xplore, pxgstrf_shared); } if ( (*info = pzgstrf_column_dfs (pnum, m, jj, jcol, perm_r, ispruned, &panel_lsub[k],w_lsub_end[jj-jcol], super_bnd, &nseg, segrep, &repfnz[k], xprune, marker2, parent, xplore, pxgstrf_shared)) ) return 0; #ifdef PROFILE TOC(t2, t); utime[DFS] += t2; #endif /* On return, the L supernode is gathered into the global storage. */ if ( (*info = pzgstrf_column_bmod (pnum, jj, jcol, (nseg - nseg1), &segrep[nseg1], &repfnz[k], &dense[k], tempv, pxgstrf_shared, Gstat)) ) return 0; if ( (*info = pzgstrf_pivotL (pnum, jj, diag_pivot_thresh, usepr, perm_r, inv_perm_r, inv_perm_c, &pivrow, Glu, Gstat)) ) if ( singular == 0 || *info < singular ) { singular = *info; #if ( DEBUGlevel>=1 ) printf("(%d) After pzgstrf_pivotL(): singular=%d\n", pnum, singular); #endif } /* release column "jj", so that the other processes waiting for this column can proceed */ pxgstrf_shared->spin_locks[jj] = 0; /* copy the U-segments to ucol[*] */ if ( (*info = pzgstrf_copy_to_ucol (pnum,jj,nseg,segrep,&repfnz[k], perm_r, &dense[k], pxgstrf_shared)) ) return 0; /* Prune columns [0:jj-1] using column jj */ pxgstrf_pruneL(jj, perm_r, pivrow, nseg, segrep, &repfnz[k], xprune, ispruned, Glu); /* Reset repfnz[] for this column */ pxgstrf_resetrep_col (nseg, segrep, &repfnz[k]); #if ( DEBUGlevel>=2 ) /* if (jj >= LOCOL && jj <= HICOL) {*/ if ( jj==BADCOL ) { dprint_lu_col(pnum, "panel:", jcol, jj, w, pivrow, xprune, Glu); dcheck_zero_vec(pnum, "after pzgstrf_copy_to_ucol() dense_col[]", n, &dense[k]); } #endif } /* for jj ... */ #ifdef PREDICT_OPT pdiv = Gstat->procstat[pnum].fcops - pdiv; cp_panel[jcol].pdiv = pdiv; #endif } /* else regular panel ... */ STATE( jcol ) = DONE; /* Release panel jcol. */ #ifdef PROFILE TOC(Gstat->panstat[jcol].fctime, t1); Gstat->panstat[jcol].flopcnt += Gstat->procstat[pnum].fcops - flopcnt; /*if ( Glu->tasks_remain < P ) { flops_last_P_panels += Gstat->panstat[jcol].flopcnt; printf("Panel %d, flops %e\n", jcol, Gstat->panstat[jcol].flopcnt); fflush(stdout); } */ #endif } #ifdef PROFILE else { /* No panel from the task queue - wait and try again */ Gstat->procstat[pnum].skedwaits++; } #endif } /* while there are more panels */ *info = singular; /* Free work space and compress storage */ pzgstrf_WorkFree(iwork, dwork, Glu); SUPERLU_FREE (spa_marker); SUPERLU_FREE (w_lsub_end); #ifdef PROFILE Gstat->procstat[pnum].fctime = SuperLU_timer_() - stime; #endif return 0; }
int pcgstrf_factor_snode( const int pnum, /* process number */ const int jcol, SuperMatrix *A, const float diag_pivot_thresh, yes_no_t *usepr, int *perm_r, int *inv_perm_r, /* modified */ int *inv_perm_c, /* in - used to find diagonal of Pc*A*Pc' */ int *xprune, int *marker, int *col_lsub, /* values are irrevelant on entry and on return */ complex *dense, complex *tempv, pxgstrf_shared_t *pxgstrf_shared, int *info ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * Factorize the artificial supernodes grouped at the bottom * of the etree. * */ GlobalLU_t *Glu = pxgstrf_shared->Glu; int singular; NCPformat *Astore; register int kcol, icol, k, jsupno, fsupc, nsupr; register int ifrom, ito; int nextu, nextlu; int pivrow; complex *a; int *asub, *xa_begin, *xa_end, *xusub, *xusub_end, *xsup, *supno, *xlusup, *lsub, *xlsub, *xlsub_end; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; xusub = Glu->xusub; xusub_end = Glu->xusub_end; xsup = Glu->xsup; supno = Glu->supno; xlusup = Glu->xlusup; singular = 0; Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; kcol = jcol + pxgstrf_shared->pan_status[jcol].size; /* Determine the union of the row structure of the supernode */ if ( (*info = pcgstrf_snode_dfs(pnum, jcol, kcol-1, asub, xa_begin, xa_end, xprune, marker, col_lsub, pxgstrf_shared)) ) return 0; /* * Factorize the relaxed supernode (jcol:kcol-1) */ nextu = Glu->nextu; /* xiaoye - race condition (no problem!) */ jsupno = supno[jcol]; fsupc = xsup[jsupno]; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; if ( (*info = Glu_alloc(pnum, jcol, nsupr*(kcol-jcol), LUSUP, &nextlu, pxgstrf_shared)) ) return 0; for (icol = jcol; icol < kcol; icol++) { xusub[icol] = xusub_end[icol] = nextu; xlusup[icol] = nextlu; /* Scatter into SPA dense[*] */ for (k = xa_begin[icol]; k < xa_end[icol]; k++) dense[asub[k]] = a[k]; /* Numeric update within the supernode */ pcgstrf_snode_bmod(pnum, icol, jsupno, fsupc, dense, tempv, Glu, pxgstrf_shared->Gstat); if ( (*info = pcgstrf_pivotL (pnum, icol, diag_pivot_thresh, usepr, perm_r, inv_perm_r, inv_perm_c, &pivrow, Glu, pxgstrf_shared->Gstat)) ) if ( singular == 0 ) singular = *info; nextlu += nsupr; #if ( DEBUGlevel>= 2 ) if ( icol>=LOCOL && icol<=HICOL ) dprint_lu_col(pnum,"relax:",jcol,icol,kcol-jcol,pivrow,xprune,Glu); #endif } /* Store the row subscripts of kcol-1 for pruned graph */ k = ito = xlsub_end[jcol]; for (ifrom = xlsub[jcol]+kcol-jcol-1; ifrom < k; ++ifrom) lsub[ito++] = lsub[ifrom]; k = ito; xprune[kcol-1] = k; if (jcol < kcol-1) { /* not a singleton */ for (icol = jcol+1; icol < kcol; ++icol) xlsub_end[icol] = k; k = xlsub_end[jcol]; xprune[jcol] = k; for (icol = jcol+1; icol < kcol; ++icol) xlsub[icol] = k; } *info = singular; return 0; }