template <typename Entry> spqr_numeric <Entry> *spqr_factorize ( // input, optionally freed on output cholmod_sparse **Ahandle, // inputs, not modified Long freeA, // if TRUE, free A on output double tol, // for rank detection Long ntol, // apply tol only to first ntol columns spqr_symbolic *QRsym, // workspace and parameters cholmod_common *cc ) { Long *Wi, *Qfill, *PLinv, *Cm, *Sp, *Stack_size, *TaskFront, *TaskFrontp, *TaskStack, *Stack_maxstack ; Entry *Sx, **Rblock, **Cblock, **Stacks ; spqr_numeric <Entry> *QRnum ; Long nf, m, n, anz, fchunk, maxfn, rank, maxfrank, rjsize, rank1, maxstack,j, wtsize, stack, ns, ntasks, keepH, hisize ; char *Rdead ; cholmod_sparse *A ; spqr_work <Entry> *Work ; // ------------------------------------------------------------------------- // get inputs and contents of symbolic object // ------------------------------------------------------------------------- if (QRsym == NULL) { // out of memory in caller if (freeA) { // if freeA is true, A must always be freed, even on error cholmod_l_free_sparse (Ahandle, cc) ; } return (NULL) ; } A = *Ahandle ; nf = QRsym->nf ; // number of frontal matrices m = QRsym->m ; // A is m-by-n n = QRsym->n ; anz = QRsym->anz ; // nnz (A) keepH = QRsym->keepH ; rjsize = QRsym->rjsize ; Sp = QRsym->Sp ; // size m+1, row pointers for S Qfill = QRsym->Qfill ; // fill-reducing ordering PLinv = QRsym->PLinv ; // size m, leftmost column sort ns = QRsym->ns ; // number of stacks ntasks = QRsym->ntasks ; // number of tasks // FUTURE: compute a unique maxfn for each stack. Current maxfn is OK, but // it's a global max of the fn of all fronts, and need only be max fn of // the fronts in any given stack. maxfn = QRsym->maxfn ; // max # of columns in any front ASSERT (maxfn <= n) ; hisize = QRsym->hisize ; // # of integers in Hii, Householder vectors TaskFrontp = QRsym->TaskFrontp ; TaskFront = QRsym->TaskFront ; TaskStack = QRsym->TaskStack ; maxstack = QRsym->maxstack ; Stack_maxstack = QRsym->Stack_maxstack ; if (!(QRsym->do_rank_detection)) { // disable rank detection if not accounted for in analysis tol = -1 ; } // If there is one task, there is only one stack, and visa versa ASSERT ((ns == 1) == (ntasks == 1)) ; PR (("factorize with ns %ld ntasks %ld\n", ns, ntasks)) ; // ------------------------------------------------------------------------- // allocate workspace // ------------------------------------------------------------------------- cholmod_l_allocate_work (0, MAX (m,nf), 0, cc) ; // shared Long workspace Wi = (Long *) cc->Iwork ; // size m, aliased with the rest of Iwork Cm = Wi ; // size nf // Cblock is workspace shared by all threads Cblock = (Entry **) cholmod_l_malloc (nf+1, sizeof (Entry *), cc) ; Work = NULL ; // Work and its contents not yet allocated fchunk = MIN (m, FCHUNK) ; wtsize = 0 ; // ------------------------------------------------------------------------- // create S // ------------------------------------------------------------------------- // create numeric values of S = A(p,q) in row-form in Sx Sx = (Entry *) cholmod_l_malloc (anz, sizeof (Entry), cc) ; if (cc->status == CHOLMOD_OK) { // use Wi as workspace (Iwork (0:m-1)) [ spqr_stranspose2 (A, Qfill, Sp, PLinv, Sx, Wi) ; // Wi no longer needed ] } PR (("status after creating Sx: %d\n", cc->status)) ; // ------------------------------------------------------------------------- // input matrix A no longer needed; free it if the user doesn't need it // ------------------------------------------------------------------------- if (freeA) { // this is done even if out of memory, above cholmod_l_free_sparse (Ahandle, cc) ; ASSERT (*Ahandle == NULL) ; } if (cc->status < CHOLMOD_OK) { // out of memory FREE_WORK ; return (NULL) ; } // ------------------------------------------------------------------------- // allocate numeric object // ------------------------------------------------------------------------- QRnum = (spqr_numeric<Entry> *) cholmod_l_malloc (1, sizeof (spqr_numeric<Entry>), cc) ; if (cc->status < CHOLMOD_OK) { // out of memory FREE_WORK ; return (NULL) ; } Rblock = (Entry **) cholmod_l_malloc (nf, sizeof (Entry *), cc) ; Rdead = (char *) cholmod_l_calloc (n, sizeof (char), cc) ; // these may be revised (with ns=1) if we run out of memory Stacks = (Entry **) cholmod_l_calloc (ns, sizeof (Entry *), cc) ; Stack_size = (Long *) cholmod_l_calloc (ns, sizeof (Long), cc) ; QRnum->Rblock = Rblock ; QRnum->Rdead = Rdead ; QRnum->Stacks = Stacks ; QRnum->Stack_size = Stack_size ; if (keepH) { // allocate permanent space for Stair, Tau, Hii for each front QRnum->HStair= (Long *) cholmod_l_malloc (rjsize, sizeof (Long), cc) ; QRnum->HTau = (Entry *) cholmod_l_malloc (rjsize, sizeof (Entry), cc) ; QRnum->Hii = (Long *) cholmod_l_malloc (hisize, sizeof (Long), cc) ; QRnum->Hm = (Long *) cholmod_l_malloc (nf, sizeof (Long), cc) ; QRnum->Hr = (Long *) cholmod_l_malloc (nf, sizeof (Long), cc) ; QRnum->HPinv = (Long *) cholmod_l_malloc (m, sizeof (Long), cc) ; } else { // H is not kept; this part of the numeric object is not used QRnum->HStair = NULL ; QRnum->HTau = NULL ; QRnum->Hii = NULL ; QRnum->Hm = NULL ; QRnum->Hr = NULL ; QRnum->HPinv = NULL ; } QRnum->n = n ; QRnum->m = m ; QRnum->nf = nf ; QRnum->rjsize = rjsize ; QRnum->hisize = hisize ; QRnum->keepH = keepH ; QRnum->maxstack = maxstack ; QRnum->ns = ns ; QRnum->ntasks = ntasks ; QRnum->maxfm = EMPTY ; // max (Hm [0:nf-1]), computed only if H is kept if (cc->status < CHOLMOD_OK) { // out of memory spqr_freenum (&QRnum, cc) ; FREE_WORK ; return (NULL) ; } // ------------------------------------------------------------------------- // allocate workspace // ------------------------------------------------------------------------- Work = get_Work <Entry> (ns, n, maxfn, keepH, fchunk, &wtsize, cc) ; // ------------------------------------------------------------------------- // allocate and initialize each Stack // ------------------------------------------------------------------------- if (cc->status == CHOLMOD_OK) { for (stack = 0 ; stack < ns ; stack++) { Entry *Stack ; size_t stacksize = (ntasks == 1) ? maxstack : Stack_maxstack [stack] ; Stack_size [stack] = stacksize ; Stack = (Entry *) cholmod_l_malloc (stacksize, sizeof (Entry), cc) ; Stacks [stack] = Stack ; Work [stack].Stack_head = Stack ; Work [stack].Stack_top = Stack + stacksize ; } } // ------------------------------------------------------------------------- // punt to sequential case and fchunk = 1 if out of memory // ------------------------------------------------------------------------- if (cc->status < CHOLMOD_OK) { // PUNT: ran out of memory; try again with smaller workspace // out of memory; free any stacks that were successfully allocated if (Stacks != NULL) { for (stack = 0 ; stack < ns ; stack++) { size_t stacksize = (ntasks == 1) ? maxstack : Stack_maxstack [stack] ; cholmod_l_free (stacksize, sizeof (Entry), Stacks [stack], cc) ; } } cholmod_l_free (ns, sizeof (Entry *), Stacks, cc) ; cholmod_l_free (ns, sizeof (Long), Stack_size, cc) ; // free the contents of Work, and the Work array itself free_Work <Entry> (Work, ns, n, maxfn, wtsize, cc) ; cholmod_l_free (ns, sizeof (spqr_work <Entry>), Work, cc) ; // punt to a single stack, a single task, and fchunk of 1 ns = 1 ; ntasks = 1 ; fchunk = 1 ; cc->status = CHOLMOD_OK ; Work = get_Work <Entry> (ns, n, maxfn, keepH, fchunk, &wtsize, cc) ; Stacks = (Entry **) cholmod_l_calloc (ns, sizeof (Entry *), cc) ; Stack_size = (Long *) cholmod_l_calloc (ns, sizeof (Long), cc) ; QRnum->Stacks = Stacks ; QRnum->Stack_size = Stack_size ; if (cc->status == CHOLMOD_OK) { Entry *Stack ; Stack_size [0] = maxstack ; Stack = (Entry *) cholmod_l_malloc (maxstack, sizeof (Entry), cc) ; Stacks [0] = Stack ; Work [0].Stack_head = Stack ; Work [0].Stack_top = Stack + maxstack ; } } // actual # of stacks and tasks used QRnum->ns = ns ; QRnum->ntasks = ntasks ; // ------------------------------------------------------------------------- // check if everything was allocated OK // ------------------------------------------------------------------------- if (cc->status < CHOLMOD_OK) { spqr_freenum (&QRnum, cc) ; FREE_WORK ; return (NULL) ; } // At this point, the factorization is guaranteed to succeed, unless // sizeof (BLAS_INT) < sizeof (Long), in which case, you really should get // a 64-bit BLAS. // ------------------------------------------------------------------------- // create the Blob : everything the numeric factorization kernel needs // ------------------------------------------------------------------------- spqr_blob <Entry> Blob ; Blob.QRsym = QRsym ; Blob.QRnum = QRnum ; Blob.tol = tol ; Blob.Work = Work ; Blob.Cm = Cm ; Blob.Cblock = Cblock ; Blob.Sx = Sx ; Blob.ntol = ntol ; Blob.fchunk = fchunk ; Blob.cc = cc ; // ------------------------------------------------------------------------- // initialize the "pure" flop count (for performance testing only) // ------------------------------------------------------------------------- cc->other1 [0] = 0 ; // ------------------------------------------------------------------------- // numeric QR factorization // ------------------------------------------------------------------------- if (ntasks == 1) { // Just one task, with or without TBB installed: don't use TBB spqr_kernel (0, &Blob) ; // sequential case } else { #ifdef HAVE_TBB // parallel case: TBB is installed, and there is more than one task int nthreads = MAX (0, cc->SPQR_nthreads) ; spqr_parallel (ntasks, nthreads, &Blob) ; #else // TBB not installed, but the work is still split into multiple tasks. // do tasks 0 to ntasks-2 (skip the placeholder root task id = ntasks-1) for (Long id = 0 ; id < ntasks-1 ; id++) { spqr_kernel (id, &Blob) ; } #endif } // ------------------------------------------------------------------------- // check for BLAS Long overflow // ------------------------------------------------------------------------- if (CHECK_BLAS_INT && cc->status < CHOLMOD_OK) { // problem too large for the BLAS. This can only occur if, for example // you're on a 64-bit platform (with sizeof (Long) = 8) and using a // 32-bit BLAS (with sizeof (BLAS_INT) = 4). If sizeof (BLAS_INT) is // equal to sizeof (Long), then CHECK_BLAS_INT is FALSE at // compile-time, and this entire code is removed as dead code by the // compiler. spqr_freenum (&QRnum, cc) ; FREE_WORK ; return (NULL) ; } // ------------------------------------------------------------------------- // finalize the rank // ------------------------------------------------------------------------- rank = 0 ; maxfrank = 1 ; for (stack = 0 ; stack < ns ; stack++) { rank += Work [stack].sumfrank ; maxfrank = MAX (maxfrank, Work [stack].maxfrank) ; } QRnum->rank = rank ; // required by spqr_hpinv QRnum->maxfrank = maxfrank ; PR (("m %ld n %ld my QR rank %ld\n", m, n, rank)) ; // ------------------------------------------------------------------------- // finalize norm(w) for the dead column 2-norms // ------------------------------------------------------------------------- double wscale = 0 ; double wssq = 1 ; for (stack = 0 ; stack < ns ; stack++) { // norm_E_fro = norm (s.*sqrt(q)) ; see also LAPACK's dnrm2 double ws = Work [stack].wscale ; double wq = Work [stack].wssq ; if (wq != 0) { double wk = ws * sqrt (wq) ; if (wscale < wk) { double rr = wscale / wk ; wssq = 1 + wssq * rr * rr ; wscale = wk ; } else { double rr = wk / wscale ; wssq += rr * rr ; } } } QRnum->norm_E_fro = wscale * sqrt (wssq) ; cc->SPQR_xstat [2] = QRnum->norm_E_fro ; // ------------------------------------------------------------------------- // free all workspace, except Cblock and Work // ------------------------------------------------------------------------- FREE_WORK_PART1 ; // ------------------------------------------------------------------------- // shrink the Stacks to hold just R (and H, if H kept) // ------------------------------------------------------------------------- // If shrink is <= 0, then the Stacks are not modified. // If shrink is 1, each Stack is realloc'ed to the right size (default) // If shrink is > 1, then each Stack is forcibly moved and shrunk. // This option is mainly meant for testing, not production use. // It should be left at 1 for production use. Long any_moved = FALSE ; int shrink = cc->SPQR_shrink ; if (shrink > 0) { for (stack = 0 ; stack < ns ; stack++) { // stacksize is the current size of the this Stack size_t stacksize = Stack_size [stack] ; Entry *Stack = Stacks [stack] ; // Work [stack].Stack_head points to the first empty slot in stack, // so newstacksize is the size of the space in use by R and H. size_t newstacksize = Work [stack].Stack_head - Stack ; ASSERT (newstacksize <= stacksize) ; // Reduce the size of this stack. Cblock [0:nf-1] is no longer // needed for holding pointers to the C blocks of each frontal // matrix. Reuse it to hold the reallocated stacks. if (shrink > 1) { // force the block to move by malloc'ing a new one; // this option is mainly for testing only. Cblock [stack] = (Entry *) cholmod_l_malloc (newstacksize, sizeof (Entry), cc) ; if (Cblock [stack] == NULL) { // oops, the malloc failed; just use the old block cc->status = CHOLMOD_OK ; Cblock [stack] = Stack ; // update the memory usage statistics, however cc->memory_inuse += ((newstacksize-stacksize) * sizeof (Entry)) ; } else { // malloc is OK; copy the block over and free the old one memcpy (Cblock [stack], Stack, newstacksize*sizeof(Entry)) ; cholmod_l_free (stacksize, sizeof (Entry), Stack, cc) ; } // the Stack has been shrunk to the new size stacksize = newstacksize ; } else { // normal method; just realloc the block Cblock [stack] = // pointer to the new Stack (Entry *) cholmod_l_realloc ( newstacksize, // requested size of Stack, in # of Entries sizeof (Entry), // size of each Entry in the Stack Stack, // pointer to the old Stack &stacksize, // input: old stack size; output: new size cc) ; } Stack_size [stack] = stacksize ; any_moved = any_moved || (Cblock [stack] != Stack) ; // reducing the size of a block of memory always succeeds ASSERT (cc->status == CHOLMOD_OK) ; } } // ------------------------------------------------------------------------- // adjust the Rblock pointers if the Stacks have been moved // ------------------------------------------------------------------------- if (any_moved) { // at least one Stack has moved; check all fronts and adjust them for (Long task = 0 ; task < ntasks ; task++) { Long kfirst, klast ; if (ntasks == 1) { // sequential case kfirst = 0 ; klast = nf ; stack = 0 ; } else { kfirst = TaskFrontp [task] ; klast = TaskFrontp [task+1] ; stack = TaskStack [task] ; } ASSERT (stack >= 0 && stack < ns) ; Entry *Old_Stack = Stacks [stack] ; Entry *New_Stack = Cblock [stack] ; if (New_Stack != Old_Stack) { for (Long kf = kfirst ; kf < klast ; kf++) { Long f = (ntasks == 1) ? kf : TaskFront [kf] ; Rblock [f] = New_Stack + (Rblock [f] - Old_Stack) ; } } } // finalize the Stacks for (stack = 0 ; stack < ns ; stack++) { Stacks [stack] = Cblock [stack] ; } } // ------------------------------------------------------------------------- // free the rest of the workspace // ------------------------------------------------------------------------- FREE_WORK_PART2 ; // ------------------------------------------------------------------------- // extract the implicit row permutation for H // ------------------------------------------------------------------------- // this must be done sequentially, when all threads are finished if (keepH) { // use Wi as workspace (Iwork (0:m-1)) [ spqr_hpinv (QRsym, QRnum, Wi) ; // Wi no longer needed ] } // ------------------------------------------------------------------------- // find the rank and return the result // ------------------------------------------------------------------------- // find the rank of the first ntol columns of A if (ntol >= n) { rank1 = rank ; } else { rank1 = 0 ; for (j = 0 ; j < ntol ; j++) { if (!Rdead [j]) { rank1++ ; } } } QRnum->rank1 = rank1 ; return (QRnum) ; }
template <typename Entry> int spqr_1fixed ( // inputs, not modified double tol, // only accept singletons above tol Long bncols, // number of columns of B cholmod_sparse *A, // m-by-n sparse matrix // output arrays, neither allocated nor defined on input. Long **p_R1p, // size n1rows+1, R1p [k] = # of nonzeros in kth // row of R1. NULL if n1cols == 0. Long **p_P1inv, // size m, singleton row inverse permutation. // If row i of A is the kth singleton row, then // P1inv [i] = k. NULL if n1cols is zero. cholmod_sparse **p_Y, // on output, only the first n-n1cols+1 entries of // Y->p are defined (if Y is not NULL), where // Y = [A B] or Y = [A2 B2]. If B is empty and // there are no column singletons, Y is NULL Long *p_n1cols, // number of column singletons found Long *p_n1rows, // number of corresponding rows found // workspace and parameters cholmod_common *cc ) { cholmod_sparse *Y ; Long *P1inv, *R1p, *Yp, *Qrows, *Ap, *Ai ; char *Mark ; Entry *Ax ; Long i, j, k, p, d, row, n1rows, n1cols, ynz, iold, inew, kk, m, n, xtype ; // ------------------------------------------------------------------------- // get inputs // ------------------------------------------------------------------------- xtype = spqr_type <Entry> ( ) ; m = A->nrow ; n = A->ncol ; Ap = (Long *) A->p ; Ai = (Long *) A->i ; Ax = (Entry *) A->x ; // set outputs to NULL in case of early return *p_R1p = NULL ; *p_P1inv = NULL ; *p_Y = NULL ; *p_n1cols = EMPTY ; *p_n1rows = EMPTY ; // ------------------------------------------------------------------------- // allocate workspace // ------------------------------------------------------------------------- Mark = (char *) cholmod_l_calloc (m, sizeof (char), cc) ; Qrows = (Long *) cholmod_l_malloc (n, sizeof (Long), cc) ; if (cc->status < CHOLMOD_OK) { // out of memory cholmod_l_free (m, sizeof (char), Mark, cc) ; cholmod_l_free (n, sizeof (Long), Qrows, cc) ; return (FALSE) ; } // ------------------------------------------------------------------------- // find singletons; no column permutations allowed // ------------------------------------------------------------------------- n1cols = 0 ; // number of column singletons found n1rows = 0 ; // number of corresponding singleton rows for (j = 0 ; j < n ; j++) { // count the number of unmarked rows in column j Entry aij = 0 ; d = 0 ; row = EMPTY ; for (p = Ap [j] ; d < 2 && p < Ap [j+1] ; p++) { i = Ai [p] ; if (!Mark [i]) { // row i is not taken by a prior column singleton. If this // is the only unflagged row and the value is large enough, // it will become the row for this column singleton. aij = Ax [p] ; row = i ; d++ ; } } if (d == 0) { // j is a dead column singleton Qrows [n1cols++] = EMPTY ; } else if (d == 1 && spqr_abs (aij, cc) > tol) { // j is a live column singleton Qrows [n1cols++] = row ; // flag row i as taken Mark [row] = TRUE ; n1rows++ ; } else { // j is not a singleton; quit searching break ; } } // ------------------------------------------------------------------------- // construct P1inv permutation, row counts R1p, and col pointers Yp // ------------------------------------------------------------------------- if (n1cols == 0 && bncols == 0) { // --------------------------------------------------------------------- // no singletons, and B empty; Y=A will be done via pointer alias // --------------------------------------------------------------------- Y = NULL ; Yp = NULL ; P1inv = NULL ; R1p = NULL ; } else if (n1cols == 0) { // --------------------------------------------------------------------- // no singletons in the matrix; no R1 matrix, no P1inv permutation // --------------------------------------------------------------------- // Y has no entries yet; nnz(Y) will be determined later Y = cholmod_l_allocate_sparse (m, n+bncols, 0, FALSE, TRUE, 0, xtype, cc) ; if (cc->status < CHOLMOD_OK) { // out of memory cholmod_l_free (m, sizeof (char), Mark, cc) ; cholmod_l_free (n, sizeof (Long), Qrows, cc) ; return (FALSE) ; } Yp = (Long *) Y->p ; ASSERT (n1rows == 0) ; P1inv = NULL ; R1p = NULL ; // --------------------------------------------------------------------- // copy the column pointers of A for the first part of Y = [A B] // --------------------------------------------------------------------- ynz = Ap [n] ; for (k = 0 ; k <= n ; k++) { Yp [k] = Ap [k] ; } } else { // --------------------------------------------------------------------- // construct the row singleton permutation // --------------------------------------------------------------------- // Y has no entries yet; nnz(Y) will be determined later Y = cholmod_l_allocate_sparse (m-n1rows, n-n1cols+bncols, 0, TRUE, TRUE, 0, xtype, cc) ; P1inv = (Long *) cholmod_l_malloc (m, sizeof (Long), cc) ; R1p = (Long *) cholmod_l_calloc (n1rows+1, sizeof (Long), cc) ; if (cc->status < CHOLMOD_OK) { // out of memory cholmod_l_free_sparse (&Y, cc) ; cholmod_l_free (m, sizeof (Long), P1inv, cc) ; cholmod_l_free (n1rows+1, sizeof (Long), R1p, cc) ; cholmod_l_free (m, sizeof (char), Mark, cc) ; cholmod_l_free (n, sizeof (Long), Qrows, cc) ; return (FALSE) ; } Yp = (Long *) Y->p ; #ifndef NDEBUG for (i = 0 ; i < m ; i++) P1inv [i] = EMPTY ; #endif kk = 0 ; for (k = 0 ; k < n1cols ; k++) { i = Qrows [k] ; if (i != EMPTY) { // row i is the kk-th singleton row ASSERT (Mark [i]) ; ASSERT (P1inv [i] == EMPTY) ; P1inv [i] = kk ; kk++ ; } } for (i = 0 ; i < m ; i++) { if (!Mark [i]) { // row i is not a singleton row ASSERT (P1inv [i] == EMPTY) ; P1inv [i] = kk ; kk++ ; } } ASSERT (kk == m) ; // --------------------------------------------------------------------- // find row counts for R11 // --------------------------------------------------------------------- for (k = 0 ; k < n1cols ; k++) { for (p = Ap [k] ; p < Ap [k+1] ; p++) { iold = Ai [p] ; inew = P1inv [iold] ; ASSERT (inew < n1rows) ; R1p [inew]++ ; // a singleton row; in R1 } } // --------------------------------------------------------------------- // find row counts for R12 and column pointers for A2 part of Y // --------------------------------------------------------------------- ynz = 0 ; for ( ; k < n ; k++) { Yp [k-n1cols] = ynz ; for (p = Ap [k] ; p < Ap [k+1] ; p++) { iold = Ai [p] ; inew = P1inv [iold] ; if (inew < n1rows) { R1p [inew]++ ; // a singleton row; in R1 } else { ynz++ ; // not a singleton row; in A2 } } } Yp [n-n1cols] = ynz ; #ifndef NDEBUG PR (("n1cols: %ld\n", n1cols)) ; for (i = 0 ; i < n1rows ; i++) { PR (("R1p [%ld] is %ld\n", i, R1p [i])) ; ASSERT (R1p [i] > 0) ; } #endif } // ------------------------------------------------------------------------- // free workspace and return results // ------------------------------------------------------------------------- cholmod_l_free (n, sizeof (Long), Qrows, cc) ; cholmod_l_free (m, sizeof (char), Mark, cc) ; *p_R1p = R1p ; *p_P1inv = P1inv ; *p_Y = Y ; *p_n1cols = n1cols ; *p_n1rows = n1rows ; return (TRUE) ; }