void get_colamd( const int m, /* number of rows in matrix A. */ const int n, /* number of columns in matrix A. */ const int nnz,/* number of nonzeros in matrix A. */ int *colptr, /* column pointer of size n+1 for matrix A. */ int *rowind, /* row indices of size nz for matrix A. */ int *perm_c /* out - the column permutation vector. */ ) { int Alen, *A, i, info, *p; double *knobs; Alen = colamd_recommended(nnz, m, n); if ( !(knobs = (double *) SUPERLU_MALLOC(COLAMD_KNOBS * sizeof(double))) ) SUPERLU_ABORT("Malloc fails for knobs"); colamd_set_defaults(knobs); if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) ) SUPERLU_ABORT("Malloc fails for A[]"); if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) ) SUPERLU_ABORT("Malloc fails for p[]"); for (i = 0; i <= n; ++i) p[i] = colptr[i]; for (i = 0; i < nnz; ++i) A[i] = rowind[i]; info = colamd(m, n, Alen, A, p, knobs); if ( info == FALSE ) SUPERLU_ABORT("COLAMD failed"); for (i = 0; i < n; ++i) perm_c[p[i]] = i; SUPERLU_FREE(knobs); SUPERLU_FREE(A); SUPERLU_FREE(p); }
/* * Allocate storage for various statistics. */ void StatAlloc(const int n, const int nprocs, const int panel_size, const int relax, Gstat_t *Gstat) { register int w; w = SUPERLU_MAX( panel_size, relax ) + 1; Gstat->panel_histo = intCalloc(w); Gstat->utime = (double *) doubleMalloc(NPHASES); Gstat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t)); if ( !(Gstat->procstat = (procstat_t *) SUPERLU_MALLOC(nprocs*sizeof(procstat_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for procstat[]" ); #if (PRNTlevel==1) printf(".. StatAlloc(): n %d, nprocs %d, panel_size %d, relax %d\n", n, nprocs, panel_size, relax); #endif #ifdef PROFILE if ( !(Gstat->panstat = (panstat_t*) SUPERLU_MALLOC(n * sizeof(panstat_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for panstat[]" ); Gstat->panhows = intCalloc(3); Gstat->height = intCalloc(n+1); if ( !(Gstat->flops_by_height = (float *) SUPERLU_MALLOC(n * sizeof(float))) ) SUPERLU_ABORT("SUPERLU_MALLOC failed for flops_by_height[]"); #endif #ifdef PREDICT_OPT if ( !(cp_panel = (cp_panel_t *) SUPERLU_MALLOC(n * sizeof(cp_panel_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for cp_panel[]" ); if ( !(desc_eft = (desc_eft_t *) SUPERLU_MALLOC(n * sizeof(desc_eft_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for desc_eft[]" ); cp_firstkid = intMalloc(n+1); cp_nextkid = intMalloc(n+1); #endif }
/* * Check whether repfnz[] == EMPTY after reset. */ void check_repfnz(int n, int w, int jcol, int *repfnz) { int jj, k; for (jj = jcol; jj < jcol+w; jj++) for (k = 0; k < n; k++) if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj, k, repfnz[(jj-jcol)*n + k]); SUPERLU_ABORT("repfnz[] not empty."); } }
int dcheck_perm(char *what, int n, int *perm) { register int i; int *marker; marker = (int *) intCalloc(n); for (i = 0; i < n; ++i) { if ( marker[perm[i]] == 1 || perm[i] >= n ) { printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]); SUPERLU_ABORT("Invalid perm."); } else { marker[perm[i]] = 1; } } return 0; }
/* * Check whether vec[*] == 0. For the two vectors dense[*] and tempv[*], * this invariant should be mantained before and after calling some * numeric update routines, such as "panel_bmod" and "column_bmod". */ void scheck_zero_vec(int pnum, char *msg, int n, float *vec) { register int i, nonzero; nonzero = FALSE; for (i = 0; i < n; ++i) { if (vec[i] != 0.0) { printf("(%d) vec[%d] = %.10e; should be zero!\n", pnum, i, vec[i]); nonzero = TRUE; } } if ( nonzero ) { printf("(%d) %s\n", pnum, msg); SUPERLU_ABORT("Not a zero vector."); } }
int ParallelInit(int n, pxgstrf_relax_t *pxgstrf_relax, superlumt_options_t *superlumt_options, pxgstrf_shared_t *pxgstrf_shared) { int *etree = superlumt_options->etree; register int w, dad, ukids, i, j, k, rs, panel_size, relax; register int P, w_top, do_split = 0; panel_t panel_type; int *panel_histo = pxgstrf_shared->Gstat->panel_histo; register int nthr, concurrency, info; Gstat_t *Gstat = pxgstrf_shared->Gstat; #if ( MACH==SUN ) register int sync_type = USYNC_THREAD; /* Set concurrency level. */ nthr = sysconf(_SC_NPROCESSORS_ONLN); thr_setconcurrency(nthr); /* number of LWPs */ concurrency = thr_getconcurrency(); #if ( PRNTlevel==1 ) printf(".. CPUs %d, concurrency (#LWP) %d, P %d\n", nthr, concurrency, P); #endif /* Initialize mutex variables. */ pxgstrf_shared->lu_locks = (mutex_t *) SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(mutex_t)); for (i = 0; i < NO_GLU_LOCKS; ++i) mutex_init(&pxgstrf_shared->lu_locks[i], sync_type, 0); #elif ( MACH==DEC || MACH==PTHREAD ) pxgstrf_shared->lu_locks = (pthread_mutex_t *) SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(pthread_mutex_t)); for (i = 0; i < NO_GLU_LOCKS; ++i) pthread_mutex_init(&pxgstrf_shared->lu_locks[i], NULL); #else pxgstrf_shared->lu_locks = (mutex_t *) SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(mutex_t)); #endif #if ( PRNTlevel==1 ) printf(".. ParallelInit() ... nprocs %2d\n", superlumt_options->nprocs); #endif pxgstrf_shared->spin_locks = intCalloc(n); pxgstrf_shared->pan_status = (pan_status_t *) SUPERLU_MALLOC((n+1)*sizeof(pan_status_t)); pxgstrf_shared->fb_cols = intMalloc(n+1); panel_size = superlumt_options->panel_size; relax = superlumt_options->relax; w = SUPERLU_MAX(panel_size, relax) + 1; for (i = 0; i < w; ++i) panel_histo[i] = 0; pxgstrf_shared->num_splits = 0; if ( (info = queue_init(&pxgstrf_shared->taskq, n)) ) { fprintf(stderr, "ParallelInit(): %d\n", info); SUPERLU_ABORT("queue_init fails."); } /* Count children of each node in the etree. */ for (i = 0; i <= n; ++i) pxgstrf_shared->pan_status[i].ukids = 0; for (i = 0; i < n; ++i) { dad = etree[i]; ++pxgstrf_shared->pan_status[dad].ukids; } /* Find the panel partitions and initialize each panel's status */ #ifdef PROFILE Gstat->num_panels = 0; #endif pxgstrf_shared->tasks_remain = 0; rs = 1; /* index for the next relaxed s-node */ w_top = panel_size/2; if ( w_top == 0 ) w_top = 1; P = 12; for (i = 0; i < n; ) { if ( pxgstrf_relax[rs].fcol == i ) { w = pxgstrf_relax[rs++].size; panel_type = RELAXED_SNODE; pxgstrf_shared->pan_status[i].state = CANGO; } else { /* Adjust panel_size so that a panel won't overlap with the next relaxed snode. */ #if 0 /* Only works when etree is postordered. */ w = SUPERLU_MIN(panel_size, pxgstrf_relax[rs].fcol - i); #else w = panel_size; for (k = i + 1; k < SUPERLU_MIN(i + panel_size, n); ++k) if ( k == pxgstrf_relax[rs].fcol ) { w = k - i; /* panel stops at column k-1 */ break; } if ( k == n ) w = n - i; #endif #ifdef SPLIT_TOP if ( !do_split ) { if ( (n-i) < panel_size * P ) do_split = 1; } if ( do_split && w > w_top ) { /* split large panel */ w = w_top; ++pxgstrf_shared->num_splits; } #endif for (j = i+1; j < i + w; ++j) /* Do not allow panel to cross a branch point in the etree. */ if ( pxgstrf_shared->pan_status[j].ukids > 1 ) break; w = j - i; /* j should start a new panel */ panel_type = REGULAR_PANEL; pxgstrf_shared->pan_status[i].state = UNREADY; #ifdef DOMAINS if ( in_domain[i] == TREE_DOMAIN ) panel_type = TREE_DOMAIN; #endif } if ( panel_type == REGULAR_PANEL ) { ++pxgstrf_shared->tasks_remain; /*printf("nondomain panel %6d -- %6d\n", i, i+w-1); fflush(stdout);*/ } ukids = k = 0; for (j = i; j < i + w; ++j) { pxgstrf_shared->pan_status[j].size = k--; pxgstrf_shared->pan_status[j].type = panel_type; ukids += pxgstrf_shared->pan_status[j].ukids; } pxgstrf_shared->pan_status[i].size = w; /* leading column */ /* only count those kids outside the panel */ pxgstrf_shared->pan_status[i].ukids = ukids - (w-1); panel_histo[w]++; #ifdef PROFILE Gstat->panstat[i].size = w; ++Gstat->num_panels; #endif pxgstrf_shared->fb_cols[i] = i; i += w; /* move to the next panel */ } /* for i ... */ /* Dummy root */ pxgstrf_shared->pan_status[n].size = 1; pxgstrf_shared->pan_status[n].state = UNREADY; #if ( PRNTlevel==1 ) printf(".. Split: P %d, #nondomain panels %d\n", P, pxgstrf_shared->tasks_remain); #endif #ifdef DOMAINS EnqueueDomains(&pxgstrf_shared->taskq, list_head, pxgstrf_shared); #else EnqueueRelaxSnode(&pxgstrf_shared->taskq, n, pxgstrf_relax, pxgstrf_shared); #endif #if ( PRNTlevel==1 ) printf(".. # tasks %d\n", pxgstrf_shared->tasks_remain); fflush(stdout); #endif #ifdef PREDICT_OPT /* Set up structure describing children */ for (i = 0; i <= n; cp_firstkid[i++] = EMPTY); for (i = n-1; i >= 0; i--) { dad = etree[i]; cp_nextkid[i] = cp_firstkid[dad]; cp_firstkid[dad] = i; } #endif return 0; } /* ParallelInit */
int sp_zgemv(char *trans, doublecomplex alpha, SuperMatrix *A, doublecomplex *x, int incx, doublecomplex beta, doublecomplex *y, int incy) { /* Purpose ======= sp_zgemv() performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, where alpha and beta are scalars, x and y are vectors and A is a sparse A->nrow by A->ncol matrix. Parameters ========== TRANS - (input) char* On entry, TRANS specifies the operation to be performed as follows: TRANS = 'N' or 'n' y := alpha*A*x + beta*y. TRANS = 'T' or 't' y := alpha*A'*x + beta*y. TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. ALPHA - (input) doublecomplex On entry, ALPHA specifies the scalar alpha. A - (input) SuperMatrix* Before entry, the leading m by n part of the array A must contain the matrix of coefficients. X - (input) doublecomplex*, array of DIMENSION at least ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, the incremented array X must contain the vector x. INCX - (input) int On entry, INCX specifies the increment for the elements of X. INCX must not be zero. BETA - (input) doublecomplex On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. Y - (output) doublecomplex*, array of DIMENSION at least ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry with BETA non-zero, the incremented array Y must contain the vector y. On exit, Y is overwritten by the updated vector y. INCY - (input) int On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. ==== Sparse Level 2 Blas routine. */ /* Local variables */ NCformat *Astore; doublecomplex *Aval; int info; doublecomplex temp, temp1; int lenx, leny, i, j, irow; int iy, jx, jy, kx, ky; int notran; doublecomplex comp_zero = {0.0, 0.0}; doublecomplex comp_one = {1.0, 0.0}; notran = lsame_(trans, "N"); Astore = A->Store; Aval = Astore->nzval; /* Test the input parameters */ info = 0; if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; else if (incx == 0) info = 5; else if (incy == 0) info = 8; if (info != 0) { xerbla_("sp_zgemv ", &info); return 0; } /* Quick return if possible. */ if (A->nrow == 0 || A->ncol == 0 || z_eq(&alpha, &comp_zero) && z_eq(&beta, &comp_one)) return 0; /* Set LENX and LENY, the lengths of the vectors x and y, and set up the start points in X and Y. */ if (lsame_(trans, "N")) { lenx = A->ncol; leny = A->nrow; } else { lenx = A->nrow; leny = A->ncol; } if (incx > 0) kx = 0; else kx = - (lenx - 1) * incx; if (incy > 0) ky = 0; else ky = - (leny - 1) * incy; /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. */ /* First form y := beta*y. */ if ( !z_eq(&beta, &comp_one) ) { if (incy == 1) { if ( z_eq(&beta, &comp_zero) ) for (i = 0; i < leny; ++i) y[i] = comp_zero; else for (i = 0; i < leny; ++i) zz_mult(&y[i], &beta, &y[i]); } else { iy = ky; if ( z_eq(&beta, &comp_zero) ) for (i = 0; i < leny; ++i) { y[iy] = comp_zero; iy += incy; } else for (i = 0; i < leny; ++i) { zz_mult(&y[iy], &beta, &y[iy]); iy += incy; } } } if ( z_eq(&alpha, &comp_zero) ) return 0; if ( notran ) { /* Form y := alpha*A*x + y. */ jx = kx; if (incy == 1) { for (j = 0; j < A->ncol; ++j) { if ( !z_eq(&x[jx], &comp_zero) ) { zz_mult(&temp, &alpha, &x[jx]); for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; zz_mult(&temp1, &temp, &Aval[i]); z_add(&y[irow], &y[irow], &temp1); } } jx += incx; } } else { SUPERLU_ABORT("Not implemented."); } } else { /* Form y := alpha*A'*x + y. */ jy = ky; if (incx == 1) { for (j = 0; j < A->ncol; ++j) { temp = comp_zero; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; zz_mult(&temp1, &Aval[i], &x[irow]); z_add(&temp, &temp, &temp1); } zz_mult(&temp1, &alpha, &temp); z_add(&y[jy], &y[jy], &temp1); jy += incy; } } else { SUPERLU_ABORT("Not implemented."); } } return 0; } /* sp_zgemv */
void pcgstrf(superlumt_options_t *superlumt_options, SuperMatrix *A, int *perm_r, SuperMatrix *L, SuperMatrix *U, Gstat_t *Gstat, int *info) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * PCGSTRF computes an LU factorization of a general sparse nrow-by-ncol * matrix A using partial pivoting with row interchanges. The factorization * has the form * Pr * A = L * U * where Pr is a row permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is * upper triangular (upper trapezoidal if A->nrow < A->ncol). * * Arguments * ========= * * superlumt_options (input) superlumt_options_t* * The structure defines the parameters to control how the sparse * LU factorization is performed. The following fields must be set * by the user: * * o nprocs (int) * Number of processes to be spawned and used for factorization. * * o refact (yes_no_t) * Specifies whether this is first time or subsequent factorization. * = NO: this factorization is treated as the first one; * = YES: it means that a factorization was performed prior to this * one. Therefore, this factorization will re-use some * existing data structures, such as L and U storage, column * elimination tree, and the symbolic information of the * Householder matrix. * * o panel_size (int) * A panel consists of at most panel_size consecutive columns. * * o relax (int) * Degree of relaxing supernodes. If the number of nodes (columns) * in a subtree of the elimination tree is less than relax, this * subtree is considered as one supernode, regardless of the row * structures of those columns. * * o diag_pivot_thresh (float) * Diagonal pivoting threshold. At step j of Gaussian elimination, * if abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)), * use A_jj as pivot. 0 <= diag_pivot_thresh <= 1. The default * value is 1.0, corresponding to partial pivoting. * * o usepr (yes_no_t) * Whether the pivoting will use perm_r specified by the user. * = YES: use perm_r; perm_r is input, unchanged on exit. * = NO: perm_r is determined by partial pivoting, and is output. * * o drop_tol (double) (NOT IMPLEMENTED) * Drop tolerance parameter. At step j of the Gaussian elimination, * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. * 0 <= drop_tol <= 1. The default value of drop_tol is 0, * corresponding to not dropping any entry. * * o perm_c (int*) * Column permutation vector of size A->ncol, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * o perm_r (int*) * Column permutation vector of size A->nrow. * If superlumt_options->usepr = NO, this is an output argument. * * o work (void*) of size lwork * User-supplied work space and space for the output data structures. * Not referenced if lwork = 0; * * o lwork (int) * Specifies the length of work array. * = 0: allocate space internally by system malloc; * > 0: use user-supplied work array of length lwork in bytes, * returns error if space runs out. * = -1: the routine guesses the amount of space needed without * performing the factorization, and returns it in * superlu_memusage->total_needed; no other side effects. * * A (input) SuperMatrix* * Original matrix A, permuted by columns, of dimension * (A->nrow, A->ncol). The type of A can be: * Stype = NCP; Dtype = _D; Mtype = GE. * * perm_r (input/output) int*, dimension A->nrow * Row permutation vector which defines the permutation matrix Pr, * perm_r[i] = j means row i of A is in position j in Pr*A. * If superlumt_options->usepr = NO, perm_r is output argument; * If superlumt_options->usepr = YES, the pivoting routine will try * to use the input perm_r, unless a certain threshold criterion * is violated. In that case, perm_r is overwritten by a new * permutation determined by partial pivoting or diagonal * threshold pivoting. * * L (output) SuperMatrix* * The factor L from the factorization Pr*A=L*U; use compressed row * subscripts storage for supernodes, i.e., L has type: * Stype = SCP, Dtype = _D, Mtype = TRLU. * * U (output) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise * storage scheme, i.e., U has types: Stype = NCP, Dtype = _D, * Mtype = TRU. * * Gstat (output) Gstat_t* * Record all the statistics about the factorization; * See Gstat_t structure defined in slu_mt_util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * */ pcgstrf_threadarg_t *pcgstrf_threadarg; pxgstrf_shared_t pxgstrf_shared; register int nprocs = superlumt_options->nprocs; register int i, iinfo; double *utime = Gstat->utime; double usrtime, wtime; double usertimer_(); #if ( MACH==SUN ) thread_t *thread_id; #elif ( MACH==DEC || MACH==PTHREAD ) pthread_t *thread_id; void *status; #endif void *pcgstrf_thread(void *); /* -------------------------------------------------------------- Initializes the parallel data structures for pcgstrf_thread(). --------------------------------------------------------------*/ pcgstrf_threadarg = pcgstrf_thread_init(A, L, U, superlumt_options, &pxgstrf_shared, Gstat, info); if ( *info ) return; /* Start timing factorization. */ usrtime = usertimer_(); wtime = SuperLU_timer_(); /* ------------------------------------------------------------ On a SUN multiprocessor system, use Solaris thread. ------------------------------------------------------------*/ #if ( MACH==SUN ) /* Create nproc threads for concurrent factorization. */ thread_id = (thread_t *) SUPERLU_MALLOC(nprocs * sizeof(thread_t)); for (i = 1; i < nprocs; ++i) { #if ( PRNTlevel==1 ) printf(".. Create unbound threads: i %d, nprocs %d\n", i, nprocs); #endif if ( (iinfo = thr_create(NULL, 0, pcgstrf_thread, &(pcgstrf_threadarg[i]), 0, &thread_id[i])) ) { fprintf(stderr, "thr_create: %d\n", iinfo); SUPERLU_ABORT("thr_creat()"); } } pcgstrf_thread( &(pcgstrf_threadarg[0]) ); /* Wait for all threads to terminate. */ for (i = 1; i < nprocs; i++) thr_join(thread_id[i], 0, 0); SUPERLU_FREE (thread_id); /* _SOLARIS_2 */ /* ------------------------------------------------------------ On a DEC multiprocessor system, use pthread. ------------------------------------------------------------*/ #elif ( MACH==DEC ) /* Use DECthreads ... */ /* Create nproc threads for concurrent factorization. */ thread_id = (pthread_t *) SUPERLU_MALLOC(nprocs * sizeof(pthread_t)); for (i = 0; i < nprocs; ++i) { if ( iinfo = pthread_create(&thread_id[i], NULL, pcgstrf_thread, &(pcgstrf_threadarg[i])) ) { fprintf(stderr, "pthread_create: %d\n", iinfo); SUPERLU_ABORT("pthread_create()"); } /* pthread_bind_to_cpu_np(thread_id[i], i);*/ } /* Wait for all threads to terminate. */ for (i = 0; i < nprocs; i++) pthread_join(thread_id[i], &status); SUPERLU_FREE (thread_id); /* _DEC */ /* ------------------------------------------------------------ On a SGI Power Challenge or Origin multiprocessor system, use parallel C. ------------------------------------------------------------*/ #elif ( MACH==SGI || MACH==ORIGIN ) /* Use parallel C ... */ if ( getenv("MP_SET_NUMTHREADS") ) { i = atoi(getenv("MP_SET_NUMTHREADS")); if ( nprocs > i ) { printf("nprocs=%d > environment allowed: MP_SET_NUMTHREADS=%d\n", nprocs, i); exit(-1); } } #pragma parallel #pragma shared (pcgstrf_threadarg) /*#pragma numthreads (max = nprocs)*/ #pragma numthreads (nprocs) { pcgstrf_thread( pcgstrf_threadarg ); } /* _SGI or _ORIGIN */ /* ------------------------------------------------------------ On a Cray PVP multiprocessor system, use microtasking. ------------------------------------------------------------*/ #elif ( MACH==CRAY_PVP ) /* Use C microtasking. */ if ( getenv("NCPUS") ) { i = atoi(getenv("NCPUS")); if ( nprocs > i ) { printf("nprocs=%d > environment allowed: NCPUS=%d\n", nprocs, i); exit(-1); } } #pragma _CRI taskloop private (i,nprocs) shared (pcgstrf_threadarg) /* Stand-alone task loop */ for (i = 0; i < nprocs; ++i) { pcgstrf_thread( &(pcgstrf_threadarg[i]) ); } /* _CRAY_PVP */ /* ------------------------------------------------------------ Use POSIX threads. ------------------------------------------------------------*/ #elif ( MACH==PTHREAD ) /* Use pthread ... */ /* Create nproc threads for concurrent factorization. */ thread_id = (pthread_t *) SUPERLU_MALLOC(nprocs * sizeof(pthread_t)); for (i = 0; i < nprocs; ++i) { if ( iinfo = pthread_create(&thread_id[i], NULL, pcgstrf_thread, &(pcgstrf_threadarg[i])) ) { fprintf(stderr, "pthread_create: %d\n", iinfo); SUPERLU_ABORT("pthread_create()"); } } /* Wait for all threads to terminate. */ for (i = 0; i < nprocs; i++) pthread_join(thread_id[i], &status); SUPERLU_FREE (thread_id); /* _PTHREAD */ /* ------------------------------------------------------------ Use openMP. ------------------------------------------------------------*/ #elif ( MACH==OPENMP ) /* Use openMP ... */ #pragma omp parallel for shared (pcgstrf_threadarg) private (i) /* Stand-alone task loop */ for (i = 0; i < nprocs; ++i) { pcgstrf_thread( &(pcgstrf_threadarg[i]) ); } /* _OPENMP */ /* ------------------------------------------------------------ On all other systems, use single processor. ------------------------------------------------------------*/ #else printf("pcgstrf() is not parallelized on this machine.\n"); printf("pcgstrf() will be run on single processor.\n"); pcgstrf_thread( &(pcgstrf_threadarg[0]) ); #endif wtime = SuperLU_timer_() - wtime; usrtime = usertimer_() - usrtime; utime[FACT] = wtime; #if ( PRNTlevel==1 ) printf(".. pcgstrf_thread() returns info %d, usrtime %.2f, wtime %.2f\n", *info, usrtime, wtime); #endif /* check_mem_leak("after pcgstrf_thread()"); */ /* ------------------------------------------------------------ Clean up and free storage after multithreaded factorization. ------------------------------------------------------------*/ pcgstrf_thread_finalize(pcgstrf_threadarg, &pxgstrf_shared, A, perm_r, L, U); }
double zlangs(char *norm, SuperMatrix *A) { /* Purpose ======= ZLANGS returns the value of the one norm, or the Frobenius norm, or the infinity norm, or the element of largest absolute value of a real matrix A. Description =========== ZLANGS returns the value ZLANGS = ( max(abs(A(i,j))), NORM = 'M' or 'm' ( ( norm1(A), NORM = '1', 'O' or 'o' ( ( normI(A), NORM = 'I' or 'i' ( ( normF(A), NORM = 'F', 'f', 'E' or 'e' where norm1 denotes the one norm of a matrix (maximum column sum), normI denotes the infinity norm of a matrix (maximum row sum) and normF denotes the Frobenius norm of a matrix (square root of sum of squares). Note that max(abs(A(i,j))) is not a matrix norm. Arguments ========= NORM (input) CHARACTER*1 Specifies the value to be returned in ZLANGE as described above. A (input) SuperMatrix* The M by N sparse matrix A. ===================================================================== */ /* Local variables */ NCformat *Astore; doublecomplex *Aval; int i, j, irow; double value, sum; double *rwork; Astore = A->Store; Aval = Astore->nzval; if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { value = 0.; } else if (lsame_(norm, "M")) { /* Find max(abs(A(i,j))). */ value = 0.; for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) value = SUPERLU_MAX( value, z_abs( &Aval[i]) ); } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { /* Find norm1(A). */ value = 0.; for (j = 0; j < A->ncol; ++j) { sum = 0.; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) sum += z_abs( &Aval[i] ); value = SUPERLU_MAX(value,sum); } } else if (lsame_(norm, "I")) { /* Find normI(A). */ if ( !(rwork = (double *) SUPERLU_MALLOC((size_t) A->nrow * sizeof(double))) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for rwork."); for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { irow = Astore->rowind[i]; rwork[irow] += z_abs( &Aval[i] ); } value = 0.; for (i = 0; i < A->nrow; ++i) value = SUPERLU_MAX(value, rwork[i]); SUPERLU_FREE (rwork); } else if (lsame_(norm, "F") || lsame_(norm, "E")) { /* Find normF(A). */ SUPERLU_ABORT("Not implemented."); } else SUPERLU_ABORT("Illegal norm specified."); return (value); } /* zlangs */
main(int argc, char *argv[]) { SuperMatrix A, AC, L, U, B; NCformat *Astore; SCPformat *Lstore; NCPformat *Ustore; superlumt_options_t superlumt_options; pxgstrf_shared_t pxgstrf_shared; pdgstrf_threadarg_t *pdgstrf_threadarg; int nprocs; fact_t fact; trans_t trans; yes_no_t refact, usepr; double u, drop_tol; double *a; int *asub, *xa; int *perm_c; /* column permutation vector */ int *perm_r; /* row permutations from partial pivoting */ void *work; int info, lwork, nrhs, ldx; int m, n, nnz, permc_spec, panel_size, relax; int i, firstfact; double *rhsb, *xact; Gstat_t Gstat; flops_t flopcnt; void parse_command_line(); /* Default parameters to control factorization. */ nprocs = 1; fact = EQUILIBRATE; trans = NOTRANS; panel_size = sp_ienv(1); relax = sp_ienv(2); u = 1.0; usepr = NO; drop_tol = 0.0; work = NULL; lwork = 0; nrhs = 1; /* Get the number of processes from command line. */ parse_command_line(argc, argv, &nprocs); /* Read the input matrix stored in Harwell-Boeing format. */ dreadhb(&m, &n, &nnz, &a, &asub, &xa); /* Set up the sparse matrix data structure for A. */ dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE); if (!(rhsb = doubleMalloc(m * nrhs))) SUPERLU_ABORT("Malloc fails for rhsb[]."); dCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_D, SLU_GE); xact = doubleMalloc(n * nrhs); ldx = n; dGenXtrue(n, nrhs, xact, ldx); dFillRHS(trans, nrhs, xact, ldx, &A, &B); if (!(perm_r = intMalloc(m))) SUPERLU_ABORT("Malloc fails for perm_r[]."); if (!(perm_c = intMalloc(n))) SUPERLU_ABORT("Malloc fails for perm_c[]."); /******************************** * THE FIRST TIME FACTORIZATION * ********************************/ /* ------------------------------------------------------------ Allocate storage and initialize statistics variables. ------------------------------------------------------------*/ StatAlloc(n, nprocs, panel_size, relax, &Gstat); StatInit(n, nprocs, &Gstat); /* ------------------------------------------------------------ Get column permutation vector perm_c[], according to permc_spec: permc_spec = 0: natural ordering permc_spec = 1: minimum degree ordering on structure of A'*A permc_spec = 2: minimum degree ordering on structure of A'+A permc_spec = 3: approximate minimum degree for unsymmetric matrices ------------------------------------------------------------*/ permc_spec = 1; get_perm_c(permc_spec, &A, perm_c); /* ------------------------------------------------------------ Initialize the option structure superlumt_options using the user-input parameters; Apply perm_c to the columns of original A to form AC. ------------------------------------------------------------*/ refact= NO; pdgstrf_init(nprocs, fact, trans, refact, panel_size, relax, u, usepr, drop_tol, perm_c, perm_r, work, lwork, &A, &AC, &superlumt_options, &Gstat); /* ------------------------------------------------------------ Compute the LU factorization of A. The following routine will create nprocs threads. ------------------------------------------------------------*/ pdgstrf(&superlumt_options, &AC, perm_r, &L, &U, &Gstat, &info); flopcnt = 0; for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops; Gstat.ops[FACT] = flopcnt; /* ------------------------------------------------------------ Solve the system A*X=B, overwriting B with X. ------------------------------------------------------------*/ dgstrs(trans, &L, &U, perm_r, perm_c, &B, &Gstat, &info); printf("\n** Result of sparse LU **\n"); dinf_norm_error(nrhs, &B, xact); /* Check inf. norm of the error */ Destroy_CompCol_Permuted(&AC); /* Free extra arrays in AC. */ /********************************* * THE SUBSEQUENT FACTORIZATIONS * *********************************/ /* ------------------------------------------------------------ Re-initialize statistics variables and options used by the factorization routine pdgstrf(). ------------------------------------------------------------*/ StatInit(n, nprocs, &Gstat); refact= YES; pdgstrf_init(nprocs, fact, trans, refact, panel_size, relax, u, usepr, drop_tol, perm_c, perm_r, work, lwork, &A, &AC, &superlumt_options, &Gstat); /* ------------------------------------------------------------ Compute the LU factorization of A. The following routine will create nprocs threads. ------------------------------------------------------------*/ pdgstrf(&superlumt_options, &AC, perm_r, &L, &U, &Gstat, &info); flopcnt = 0; for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops; Gstat.ops[FACT] = flopcnt; /* ------------------------------------------------------------ Re-generate right-hand side B, then solve A*X= B. ------------------------------------------------------------*/ dFillRHS(trans, nrhs, xact, ldx, &A, &B); dgstrs(trans, &L, &U, perm_r, perm_c, &B, &Gstat, &info); /* ------------------------------------------------------------ Deallocate storage after factorization. ------------------------------------------------------------*/ pxgstrf_finalize(&superlumt_options, &AC); printf("\n** Result of sparse LU **\n"); dinf_norm_error(nrhs, &B, xact); /* Check inf. norm of the error */ Lstore = (SCPformat *) L.Store; Ustore = (NCPformat *) U.Store; printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); fflush(stdout); SUPERLU_FREE (rhsb); SUPERLU_FREE (xact); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); if ( lwork >= 0 ) { Destroy_SuperNode_SCP(&L); Destroy_CompCol_NCP(&U); } StatFree(&Gstat); }
int sp_dgemv(char *trans, double alpha, SuperMatrix *A, double *x, int incx, double beta, double *y, int incy) { /* Purpose ======= sp_dgemv() performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, where alpha and beta are scalars, x and y are vectors and A is a sparse A->nrow by A->ncol matrix. Parameters ========== TRANS - (input) char* On entry, TRANS specifies the operation to be performed as follows: TRANS = 'N' or 'n' y := alpha*A*x + beta*y. TRANS = 'T' or 't' y := alpha*A'*x + beta*y. TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. ALPHA - (input) double On entry, ALPHA specifies the scalar alpha. A - (input) SuperMatrix* Matrix A with a sparse format, of dimension (A->nrow, A->ncol). Currently, the type of A can be: Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. In the future, more general A can be handled. X - (input) double*, array of DIMENSION at least ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, the incremented array X must contain the vector x. INCX - (input) int On entry, INCX specifies the increment for the elements of X. INCX must not be zero. BETA - (input) double On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. Y - (output) double*, array of DIMENSION at least ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry with BETA non-zero, the incremented array Y must contain the vector y. On exit, Y is overwritten by the updated vector y. INCY - (input) int On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. ==== Sparse Level 2 Blas routine. */ /* Local variables */ NCformat *Astore; double *Aval; int info; double temp; int lenx, leny, i, j, irow; int iy, jx, jy, kx, ky; int notran; notran = lsame_(trans, "N"); Astore = A->Store; Aval = Astore->nzval; /* Test the input parameters */ info = 0; if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; else if (incx == 0) info = 5; else if (incy == 0) info = 8; if (info != 0) { xerbla_("sp_dgemv ", &info); return 0; } /* Quick return if possible. */ if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.)) return 0; /* Set LENX and LENY, the lengths of the vectors x and y, and set up the start points in X and Y. */ if (lsame_(trans, "N")) { lenx = A->ncol; leny = A->nrow; } else { lenx = A->nrow; leny = A->ncol; } if (incx > 0) kx = 0; else kx = - (lenx - 1) * incx; if (incy > 0) ky = 0; else ky = - (leny - 1) * incy; /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. */ /* First form y := beta*y. */ if (beta != 1.) { if (incy == 1) { if (beta == 0.) for (i = 0; i < leny; ++i) y[i] = 0.; else for (i = 0; i < leny; ++i) y[i] = beta * y[i]; } else { iy = ky; if (beta == 0.) for (i = 0; i < leny; ++i) { y[iy] = 0.; iy += incy; } else for (i = 0; i < leny; ++i) { y[iy] = beta * y[iy]; iy += incy; } } } if (alpha == 0.) return 0; if ( notran ) { /* Form y := alpha*A*x + y. */ jx = kx; if (incy == 1) { for (j = 0; j < A->ncol; ++j) { if (x[jx] != 0.) { temp = alpha * x[jx]; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; y[irow] += temp * Aval[i]; } } jx += incx; } } else { SUPERLU_ABORT("Not implemented."); } } else { /* Form y := alpha*A'*x + y. */ jy = ky; if (incx == 1) { for (j = 0; j < A->ncol; ++j) { temp = 0.; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; temp += Aval[i] * x[irow]; } y[jy] += alpha * temp; jy += incy; } } else { SUPERLU_ABORT("Not implemented."); } } return 0; } /* sp_dgemv */
void get_perm_c(int ispec, SuperMatrix *A, int *perm_c) /* * Purpose * ======= * * GET_PERM_C obtains a permutation matrix Pc, by applying the multiple * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A'. * or using approximate minimum degree column ordering by Davis et. al. * The LU factorization of A*Pc tends to have less fill than the LU * factorization of A. * * Arguments * ========= * * ispec (input) int * Specifies the type of column ordering to reduce fill: * = 1: minimum degree on the structure of A^T * A * = 2: minimum degree on the structure of A^T + A * = 3: approximate minimum degree for unsymmetric matrices * If ispec == 0, the natural ordering (i.e., Pc = I) is returned. * * A (input) SuperMatrix* * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number * of the linear equations is A->nrow. Currently, the type of A * can be: Stype = NC; Dtype = _D; Mtype = GE. In the future, * more general A can be handled. * * perm_c (output) int* * Column permutation vector of size A->ncol, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * */ { NCformat *Astore = A->Store; int m, n, bnz, *b_colptr, i; int delta, maxint, nofsub, *invp; int *b_rowind, *dhead, *qsize, *llist, *marker; double t, SuperLU_timer_(); m = A->nrow; n = A->ncol; t = SuperLU_timer_(); switch ( ispec ) { case 0: /* Natural ordering */ for (i = 0; i < n; ++i) perm_c[i] = i; //printf("Use natural column ordering.\n"); //FT Commented return; case 1: /* Minimum degree ordering on A'*A */ getata(m, n, Astore->nnz, Astore->colptr, Astore->rowind, &bnz, &b_colptr, &b_rowind); //printf("Use minimum degree ordering on A'*A.\n"); //FT Commented t = SuperLU_timer_() - t; /*printf("Form A'*A time = %8.3f\n", t);*/ break; case 2: /* Minimum degree ordering on A'+A */ if ( m != n ) SUPERLU_ABORT("Matrix is not square"); at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, &bnz, &b_colptr, &b_rowind); //printf("Use minimum degree ordering on A'+A.\n"); //FT Commented t = SuperLU_timer_() - t; /*printf("Form A'+A time = %8.3f\n", t);*/ break; case 3: /* Approximate minimum degree column ordering. */ get_colamd(m, n, Astore->nnz, Astore->colptr, Astore->rowind, perm_c); //printf(".. Use approximate minimum degree column ordering.\n"); //FT Commented return; default: SUPERLU_ABORT("Invalid ISPEC"); } if ( bnz != 0 ) { t = SuperLU_timer_(); /* Initialize and allocate storage for GENMMD. */ delta = 0; /* DELTA is a parameter to allow the choice of nodes whose degree <= min-degree + DELTA. */ maxint = 2147483647; /* 2**31 - 1 */ invp = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); if ( !invp ) SUPERLU_ABORT("SUPERLU_MALLOC fails for invp."); dhead = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); if ( !dhead ) SUPERLU_ABORT("SUPERLU_MALLOC fails for dhead."); qsize = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int)); if ( !qsize ) SUPERLU_ABORT("SUPERLU_MALLOC fails for qsize."); llist = (int *) SUPERLU_MALLOC(n*sizeof(int)); if ( !llist ) SUPERLU_ABORT("SUPERLU_MALLOC fails for llist."); marker = (int *) SUPERLU_MALLOC(n*sizeof(int)); if ( !marker ) SUPERLU_ABORT("SUPERLU_MALLOC fails for marker."); /* Transform adjacency list into 1-based indexing required by GENMMD.*/ for (i = 0; i <= n; ++i) ++b_colptr[i]; for (i = 0; i < bnz; ++i) ++b_rowind[i]; genmmd_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, qsize, llist, marker, &maxint, &nofsub); /* Transform perm_c into 0-based indexing. */ for (i = 0; i < n; ++i) --perm_c[i]; SUPERLU_FREE(b_colptr); SUPERLU_FREE(b_rowind); SUPERLU_FREE(invp); SUPERLU_FREE(dhead); SUPERLU_FREE(qsize); SUPERLU_FREE(llist); SUPERLU_FREE(marker); t = SuperLU_timer_() - t; /* printf("call GENMMD time = %8.3f\n", t);*/ } else { /* Empty adjacency structure */ for (i = 0; i < n; ++i) perm_c[i] = i; } }
void at_plus_a( const int n, /* number of columns in matrix A. */ const int nz, /* number of nonzeros in matrix A */ int *colptr, /* column pointer of size n+1 for matrix A. */ int *rowind, /* row indices of size nz for matrix A. */ int *bnz, /* out - on exit, returns the actual number of nonzeros in matrix A'*A. */ int **b_colptr, /* out - size n+1 */ int **b_rowind /* out - size *bnz */ ) { /* * Purpose * ======= * * Form the structure of A'+A. A is an n-by-n matrix in column oriented * format represented by (colptr, rowind). The output A'+A is in column * oriented format (symmetrically, also row oriented), represented by * (b_colptr, b_rowind). * */ register int i, j, k, col, num_nz; int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ int *marker; if ( !(marker = (int*) SUPERLU_MALLOC( n * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for marker[]"); if ( !(t_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for t_colptr[]"); if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC fails t_rowind[]"); /* Get counts of each column of T, and set up column pointers */ for (i = 0; i < n; ++i) marker[i] = 0; for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) ++marker[rowind[i]]; } t_colptr[0] = 0; for (i = 0; i < n; ++i) { t_colptr[i+1] = t_colptr[i] + marker[i]; marker[i] = t_colptr[i]; } /* Transpose the matrix from A to T */ for (j = 0; j < n; ++j) for (i = colptr[j]; i < colptr[j+1]; ++i) { col = rowind[i]; t_rowind[marker[col]] = j; ++marker[col]; } /* ---------------------------------------------------------------- compute B = A + T, where column j of B is: Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k) do not include the diagonal entry ---------------------------------------------------------------- */ /* Zero the diagonal flag */ for (i = 0; i < n; ++i) marker[i] = -1; /* First pass determines number of nonzeros in B */ num_nz = 0; for (j = 0; j < n; ++j) { /* Flag the diagonal so it's not included in the B matrix */ marker[j] = j; /* Add pattern of column A_*k to B_*j */ for (i = colptr[j]; i < colptr[j+1]; ++i) { k = rowind[i]; if ( marker[k] != j ) { marker[k] = j; ++num_nz; } } /* Add pattern of column T_*k to B_*j */ for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { k = t_rowind[i]; if ( marker[k] != j ) { marker[k] = j; ++num_nz; } } } *bnz = num_nz; /* Allocate storage for A+A' */ if ( !(*b_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for b_colptr[]"); if ( *bnz) { if ( !(*b_rowind = (int*) SUPERLU_MALLOC( *bnz * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for b_rowind[]"); } /* Zero the diagonal flag */ for (i = 0; i < n; ++i) marker[i] = -1; /* Compute each column of B, one at a time */ num_nz = 0; for (j = 0; j < n; ++j) { (*b_colptr)[j] = num_nz; /* Flag the diagonal so it's not included in the B matrix */ marker[j] = j; /* Add pattern of column A_*k to B_*j */ for (i = colptr[j]; i < colptr[j+1]; ++i) { k = rowind[i]; if ( marker[k] != j ) { marker[k] = j; (*b_rowind)[num_nz++] = k; } } /* Add pattern of column T_*k to B_*j */ for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { k = t_rowind[i]; if ( marker[k] != j ) { marker[k] = j; (*b_rowind)[num_nz++] = k; } } } (*b_colptr)[n] = num_nz; SUPERLU_FREE(marker); SUPERLU_FREE(t_colptr); SUPERLU_FREE(t_rowind); }
void sp_colorder(SuperMatrix *A, int *perm_c, superlumt_options_t *options, SuperMatrix *AC) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * * Purpose * ======= * * sp_colorder() permutes the columns of the original matrix A into AC. * It performs the following steps: * * 1. Apply column permutation perm_c[] to A's column pointers to form AC; * * 2. If options->refact = NO, then * (1) Allocate etree[], and compute column etree etree[] of AC'AC; * (2) Post order etree[] to get a postordered elimination tree etree[], * and a postorder permutation post[]; * (3) Apply post[] permutation to columns of AC; * (4) Overwrite perm_c[] with the product perm_c * post. * (5) Allocate storage, and compute the column count (colcnt_h) and the * supernode partition (part_super_h) for the Householder matrix H. * * Arguments * ========= * * A (input) SuperMatrix* * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number * of the linear equations is A->nrow. Currently, the type of A can be: * Stype = NC or NCP; Dtype = _D; Mtype = GE. * * perm_c (input/output) int* * Column permutation vector of size A->ncol, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * options (input/output) superlumt_options_t* * If options->refact = YES, then options is an * input argument. The arrays etree[], colcnt_h[] and part_super_h[] * are available from a previous factor and will be re-used. * If options->refact = NO, then options is an output argument. * * AC (output) SuperMatrix* * The resulting matrix after applied the column permutation * perm_c[] to matrix A. The type of AC can be: * Stype = NCP; Dtype = _D; Mtype = GE. * */ NCformat *Astore; NCPformat *ACstore; int i, n, nnz, nlnz; yes_no_t refact = options->refact; int *etree; int *colcnt_h; int *part_super_h; int *iwork, *post, *iperm; int *invp; int *part_super_ata; extern void at_plus_a(const int, const int, int *, int *, int *, int **, int **, int); n = A->ncol; iwork = intMalloc(n+1); part_super_ata = intMalloc(n); /* Apply column permutation perm_c to A's column pointers so to obtain NCP format in AC = A*Pc. */ AC->Stype = SLU_NCP; AC->Dtype = A->Dtype; AC->Mtype = A->Mtype; AC->nrow = A->nrow; AC->ncol = A->ncol; Astore = A->Store; ACstore = AC->Store = (void *) malloc( sizeof(NCPformat) ); ACstore->nnz = Astore->nnz; ACstore->nzval = Astore->nzval; ACstore->rowind = Astore->rowind; ACstore->colbeg = intMalloc(n); ACstore->colend = intMalloc(n); nnz = Astore->nnz; #ifdef CHK_COLORDER print_int_vec("pre_order:", n, perm_c); dcheck_perm("Initial perm_c", n, perm_c); #endif for (i = 0; i < n; i++) { ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; ACstore->colend[perm_c[i]] = Astore->colptr[i+1]; } if ( refact == NO ) { int *b_colptr, *b_rowind, bnz, j; options->etree = etree = intMalloc(n); options->colcnt_h = colcnt_h = intMalloc(n); options->part_super_h = part_super_h = intMalloc(n); if ( options->SymmetricMode ) { /* Compute the etree of C = Pc*(A'+A)*Pc'. */ int *c_colbeg, *c_colend; /* Form B = A + A'. */ at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind, &bnz, &b_colptr, &b_rowind, 1); /* Form C = Pc*B*Pc'. */ c_colbeg = (int_t*) intMalloc(n); c_colend = (int_t*) intMalloc(n); if (!(c_colbeg) || !(c_colend) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend"); for (i = 0; i < n; i++) { c_colbeg[perm_c[i]] = b_colptr[i]; c_colend[perm_c[i]] = b_colptr[i+1]; } for (j = 0; j < n; ++j) { for (i = c_colbeg[j]; i < c_colend[j]; ++i) { b_rowind[i] = perm_c[b_rowind[i]]; } iwork[perm_c[j]] = j; /* inverse perm_c */ } /* Compute etree of C. */ sp_symetree(c_colbeg, c_colend, b_rowind, n, etree); /* Restore B to be A+A', without column permutation */ for (i = 0; i < bnz; ++i) b_rowind[i] = iwork[b_rowind[i]]; SUPERLU_FREE(c_colbeg); SUPERLU_FREE(c_colend); } else { /* Compute the column elimination tree. */ sp_coletree(ACstore->colbeg, ACstore->colend, ACstore->rowind, A->nrow, A->ncol, etree); } #ifdef CHK_COLORDER print_int_vec("etree:", n, etree); #endif /* Post order etree. */ post = (int *) TreePostorder(n, etree); invp = intMalloc(n); for (i = 0; i < n; ++i) invp[post[i]] = i; #ifdef CHK_COLORDER print_int_vec("post:", n+1, post); dcheck_perm("post", n, post); #endif /* Renumber etree in postorder. */ for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]]; for (i = 0; i < n; ++i) etree[i] = iwork[i]; #ifdef CHK_COLORDER print_int_vec("postorder etree:", n, etree); #endif /* Postmultiply A*Pc by post[]. */ for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i]; for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i]; for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i]; for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i]; for (i = 0; i < n; ++i) iwork[i] = post[perm_c[i]]; /* product of perm_c and post */ for (i = 0; i < n; ++i) perm_c[i] = iwork[i]; for (i = 0; i < n; ++i) invp[perm_c[i]] = i; /* inverse of perm_c */ iperm = post; /* alias to the same address */ #ifdef ZFD_PERM /* Permute the rows of AC to have zero-free diagonal. */ printf("** Permute the rows to have zero-free diagonal....\n"); for (i = 0; i < n; ++i) iwork[i] = ACstore->colend[i] - ACstore->colbeg[i]; zfdperm(n, nnz, ACstore->rowind, ACstore->colbeg, iwork, iperm); #else for (i = 0; i < n; ++i) iperm[i] = i; #endif /* NOTE: iperm is returned as column permutation so that * the diagonal is nonzero. Since a symmetric permutation * preserves the diagonal, we can do the following: * P'(AP')P = P'A * That is, we apply the inverse of iperm to rows of A * to get zero-free diagonal. But since iperm is defined * in MC21A inversely as our definition of permutation, * so it is indeed an inverse for our purpose. We can * apply it directly. */ if ( options->SymmetricMode ) { /* Determine column count in the Cholesky factor of B = A+A' */ #if 0 cholnzcnt(n, Astore->colptr, Astore->rowind, invp, perm_c, etree, colcnt_h, &nlnz, part_super_h); #else cholnzcnt(n, b_colptr, b_rowind, invp, perm_c, etree, colcnt_h, &nlnz, part_super_h); #endif #if ( PRNTlevel>=1 ) printf(".. bnz %d\n", bnz); #endif SUPERLU_FREE(b_colptr); if ( bnz ) SUPERLU_FREE(b_rowind); } else { /* Determine the row and column counts in the QR factor. */ qrnzcnt(n, nnz, Astore->colptr, Astore->rowind, iperm, invp, perm_c, etree, colcnt_h, &nlnz, part_super_ata, part_super_h); } #if ( PRNTlevel>=2 ) dCheckZeroDiagonal(n, ACstore->rowind, ACstore->colbeg, ACstore->colend, perm_c); print_int_vec("colcnt", n, colcnt_h); dPrintSuperPart("Hpart", n, part_super_h); print_int_vec("iperm", n, iperm); #endif #ifdef CHK_COLORDER print_int_vec("Pc*post:", n, perm_c); dcheck_perm("final perm_c", n, perm_c); #endif SUPERLU_FREE (post); SUPERLU_FREE (invp); } /* if refact == NO */ SUPERLU_FREE (iwork); SUPERLU_FREE (part_super_ata); }
int main ( int argc, char *argv[] ) /******************************************************************************/ /* Purpose: MAIN is the main program for PSLINSOL. Licensing: This code is distributed under the GNU LGPL license. Modified: 10 February 2014 Author: Xiaoye Li */ { SuperMatrix A; NCformat *Astore; float *a; int *asub, *xa; int *perm_r; /* row permutations from partial pivoting */ int *perm_c; /* column permutation vector */ SuperMatrix L; /* factor L */ SCPformat *Lstore; SuperMatrix U; /* factor U */ NCPformat *Ustore; SuperMatrix B; int nrhs, ldx, info, m, n, nnz, b; int nprocs; /* maximum number of processors to use. */ int panel_size, relax, maxsup; int permc_spec; trans_t trans; float *xact, *rhs; superlu_memusage_t superlu_memusage; void parse_command_line(); timestamp ( ); printf ( "\n" ); printf ( "PSLINSOL:\n" ); printf ( " C/OpenMP version\n" ); printf ( " Call the OpenMP version of SuperLU to solve a linear system.\n" ); nrhs = 1; trans = NOTRANS; nprocs = 1; n = 1000; b = 1; panel_size = sp_ienv(1); relax = sp_ienv(2); maxsup = sp_ienv(3); /* Check for any commandline input. */ parse_command_line ( argc, argv, &nprocs, &n, &b, &panel_size, &relax, &maxsup ); #if ( PRNTlevel>=1 || DEBUGlevel>=1 ) cpp_defs(); #endif #define HB #if defined( DEN ) m = n; nnz = n * n; sband(n, n, nnz, &a, &asub, &xa); #elif defined( BAND ) m = n; nnz = (2*b+1) * n; sband(n, b, nnz, &a, &asub, &xa); #elif defined( BD ) nb = 5; bs = 200; m = n = bs * nb; nnz = bs * bs * nb; sblockdiag(nb, bs, nnz, &a, &asub, &xa); #elif defined( HB ) sreadhb(&m, &n, &nnz, &a, &asub, &xa); #else sreadmt(&m, &n, &nnz, &a, &asub, &xa); #endif sCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_S, SLU_GE); Astore = A.Store; printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz); if (!(rhs = floatMalloc(m * nrhs))) SUPERLU_ABORT("Malloc fails for rhs[]."); sCreate_Dense_Matrix(&B, m, nrhs, rhs, m, SLU_DN, SLU_S, SLU_GE); xact = floatMalloc(n * nrhs); ldx = n; sGenXtrue(n, nrhs, xact, ldx); sFillRHS(trans, nrhs, xact, ldx, &A, &B); if (!(perm_r = intMalloc(m))) SUPERLU_ABORT("Malloc fails for perm_r[]."); if (!(perm_c = intMalloc(n))) SUPERLU_ABORT("Malloc fails for perm_c[]."); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = 0: natural ordering * permc_spec = 1: minimum degree ordering on structure of A'*A * permc_spec = 2: minimum degree ordering on structure of A'+A * permc_spec = 3: approximate minimum degree for unsymmetric matrices */ permc_spec = 1; get_perm_c(permc_spec, &A, perm_c); psgssv(nprocs, &A, perm_c, perm_r, &L, &U, &B, &info); if ( info == 0 ) { sinf_norm_error(nrhs, &B, xact); /* Inf. norm of the error */ Lstore = (SCPformat *) L.Store; Ustore = (NCPformat *) U.Store; printf("#NZ in factor L = %d\n", Lstore->nnz); printf("#NZ in factor U = %d\n", Ustore->nnz); printf("#NZ in L+U = %d\n", Lstore->nnz + Ustore->nnz - L.ncol); superlu_sQuerySpace(nprocs, &L, &U, panel_size, &superlu_memusage); printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n", superlu_memusage.for_lu/1024/1024, superlu_memusage.total_needed/1024/1024, superlu_memusage.expansions); } SUPERLU_FREE (rhs); SUPERLU_FREE (xact); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); Destroy_SuperNode_SCP(&L); Destroy_CompCol_NCP(&U); /* Terminate. */ printf ( "\n" ); printf ( "PSLINSOL:\n" ); printf ( " Normal end of execution.\n" ); printf ( "\n" ); timestamp ( ); return 0; }
int sp_dtrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, double *x, int *info) { /* * Purpose * ======= * * sp_dtrsv() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A'*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = _D, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NCP, Dtype = _D, Mtype = TRU. * * x - (input/output) double* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1, ftcs2, ftcs3; #endif SCPformat *Lstore; NCPformat *Ustore; double *Lval, *Uval; int incx = 1, incy = 1; double alpha = 1.0, beta = 1.0; register int fsupc, luptr, istart, irow, k, iptr, jcol, nsuper; int nsupr, nsupc, nrow, i; double *work; flops_t solve_ops; /* Test the input parameters */ *info = 0; if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2; else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); xerbla_("sp_dtrsv", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; nsuper = Lstore->nsuper; solve_ops = 0; if ( !(work = doubleCalloc(L->nrow)) ) SUPERLU_ABORT("Malloc fails for work in sp_dtrsv()."); if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_END(fsupc) - istart; nsupc = L_LAST_SUPC(k) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1); solve_ops += 2 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_END(fsupc); ++iptr) { irow = L_SUB(iptr); ++luptr; x[irow] -= x[fsupc] * Lval[luptr]; } } else { #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif #else dlsolve (nsupr, nsupc, &Lval[luptr], &x[fsupc]); dmatvec (nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); x[irow] -= work[i]; /* Scatter */ work[i] = 0.0; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_END(fsupc) - L_SUB_START(fsupc); nsupc = L_LAST_SUPC(k) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; for (i = U_NZ_START(fsupc); i < U_NZ_END(fsupc); ++i) { irow = U_SUB(i); x[irow] -= x[fsupc] * Uval[i]; } } else { #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("N", strlen("N")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 2*(U_NZ_END(jcol) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_END(jcol); i++) { irow = U_SUB(i); x[irow] -= x[jcol] * Uval[i]; } } } } /* for k ... */ } } else { /* Form x := inv(A')*x */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_END(fsupc) - istart; nsupc = L_LAST_SUPC(k) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 2 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_LAST_SUPC(k); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_END(jcol); i++) { irow = L_SUB(iptr); x[jcol] -= x[irow] * Lval[i]; iptr++; } } if ( nsupc > 1 ) { solve_ops += nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_END(fsupc) - L_SUB_START(fsupc); nsupc = L_LAST_SUPC(k) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 2*(U_NZ_END(jcol) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_END(jcol); i++) { irow = U_SUB(i); x[jcol] -= x[irow] * Uval[i]; } } solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } SUPERLU_FREE(work); return 0; }
void getata( const int m, /* number of rows in matrix A. */ const int n, /* number of columns in matrix A. */ const int nz, /* number of nonzeros in matrix A */ int *colptr, /* column pointer of size n+1 for matrix A. */ int *rowind, /* row indices of size nz for matrix A. */ int *atanz, /* out - on exit, returns the actual number of nonzeros in matrix A'*A. */ int **ata_colptr, /* out - size n+1 */ int **ata_rowind /* out - size *atanz */ ) /* * Purpose * ======= * * Form the structure of A'*A. A is an m-by-n matrix in column oriented * format represented by (colptr, rowind). The output A'*A is in column * oriented format (symmetrically, also row oriented), represented by * (ata_colptr, ata_rowind). * * This routine is modified from GETATA routine by Tim Davis. * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2, * i.e., the sum of the square of the row counts. * * Questions * ========= * o Do I need to withhold the *dense* rows? * o How do I know the number of nonzeros in A'*A? * */ { register int i, j, k, col, num_nz, ti, trow; int *marker, *b_colptr, *b_rowind; int *t_colptr, *t_rowind; /* a column oriented form of T = A' */ if (!(marker = (int*)SUPERLU_MALLOC( (SUPERLU_MAX(m,n)+1) * sizeof(int)) )) SUPERLU_ABORT("SUPERLU_MALLOC fails for marker[]"); if ( !(t_colptr = (int*) SUPERLU_MALLOC( (m+1) * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC t_colptr[]"); if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for t_rowind[]"); /* Get counts of each column of T, and set up column pointers */ for (i = 0; i < m; ++i) marker[i] = 0; for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) ++marker[rowind[i]]; } t_colptr[0] = 0; for (i = 0; i < m; ++i) { t_colptr[i+1] = t_colptr[i] + marker[i]; marker[i] = t_colptr[i]; } /* Transpose the matrix from A to T */ for (j = 0; j < n; ++j) for (i = colptr[j]; i < colptr[j+1]; ++i) { col = rowind[i]; t_rowind[marker[col]] = j; ++marker[col]; } /* ---------------------------------------------------------------- compute B = T * A, where column j of B is: Struct (B_*j) = UNION ( Struct (T_*k) ) A_kj != 0 do not include the diagonal entry ( Partition A as: A = (A_*1, ..., A_*n) Then B = T * A = (T * A_*1, ..., T * A_*n), where T * A_*j = (T_*1, ..., T_*m) * A_*j. ) ---------------------------------------------------------------- */ /* Zero the diagonal flag */ for (i = 0; i < n; ++i) marker[i] = -1; /* First pass determines number of nonzeros in B */ num_nz = 0; for (j = 0; j < n; ++j) { /* Flag the diagonal so it's not included in the B matrix */ marker[j] = j; for (i = colptr[j]; i < colptr[j+1]; ++i) { /* A_kj is nonzero, add pattern of column T_*k to B_*j */ k = rowind[i]; for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { trow = t_rowind[ti]; if ( marker[trow] != j ) { marker[trow] = j; num_nz++; } } } } *atanz = num_nz; /* Allocate storage for A'*A */ if ( !(*ata_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for ata_colptr[]"); if ( *atanz ) { if ( !(*ata_rowind = (int*) SUPERLU_MALLOC( *atanz * sizeof(int)) ) ) SUPERLU_ABORT("SUPERLU_MALLOC fails for ata_rowind[]"); } b_colptr = *ata_colptr; /* aliasing */ b_rowind = *ata_rowind; /* Zero the diagonal flag */ for (i = 0; i < n; ++i) marker[i] = -1; /* Compute each column of B, one at a time */ num_nz = 0; for (j = 0; j < n; ++j) { b_colptr[j] = num_nz; /* Flag the diagonal so it's not included in the B matrix */ marker[j] = j; for (i = colptr[j]; i < colptr[j+1]; ++i) { /* A_kj is nonzero, add pattern of column T_*k to B_*j */ k = rowind[i]; for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) { trow = t_rowind[ti]; if ( marker[trow] != j ) { marker[trow] = j; b_rowind[num_nz++] = trow; } } } } b_colptr[n] = num_nz; SUPERLU_FREE(marker); SUPERLU_FREE(t_colptr); SUPERLU_FREE(t_rowind); }
void cgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_r, int *perm_c, equed_t equed, float *R, float *C, SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr, Gstat_t *Gstat, int *info) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * * Purpose * ======= * * cgsrfs improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solution. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) trans_t * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A**T * X = B (Transpose) * = CONJ: A**H * X = B (Conjugate transpose = Transpose) * * A (input) SuperMatrix* * The original matrix A in the system, or the scaled A if * equilibration was done. The type of A can be: * Stype = NC, Dtype = _D, Mtype = GE. * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SCP, Dtype = _D, Mtype = TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use column-wise storage scheme, * i.e., U has types: Stype = NCP, Dtype = _D, Mtype = TRU. * * perm_r (input) int*, dimension (A->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * equed (input) equed_t * Specifies the form of equilibration that was done. * = NOEQUIL: No equilibration. * = ROW: Row equilibration, i.e., A was premultiplied by diag(R). * = COL: Column equilibration, i.e., A was postmultiplied by * diag(C). * = BOTH: Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * * R (input) double*, dimension (A->nrow) * The row scale factors for A. * If equed = ROW or BOTH, A is premultiplied by diag(R). * If equed = NOEQUIL or COL, R is not accessed. * * C (input) double*, dimension (A->ncol) * The column scale factors for A. * If equed = COL or BOTH, A is postmultiplied by diag(C). * If equed = NOEQUIL or ROW, C is not accessed. * * B (input) SuperMatrix* * B has types: Stype = DN, Dtype = _D, Mtype = GE. * The right hand side matrix B. * * X (input/output) SuperMatrix* * X has types: Stype = DN, Dtype = _D, Mtype = GE. * On entry, the solution matrix X, as computed by dgstrs(). * On exit, the improved solution matrix X. * * FERR (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * info (output) int* * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * */ #define ITMAX 5 /* Table of constant values */ int ione = 1; complex ndone = {-1., 0.}; complex done = {1., 0.}; /* Local variables */ NCformat *Astore; complex *Aval; SuperMatrix Bjcol; DNformat *Bstore, *Xstore, *Bjcol_store; complex *Bmat, *Xmat, *Bptr, *Xptr; int kase; float safe1, safe2; int i, j, k, irow, nz, count, notran, rowequ, colequ; int ldb, ldx, nrhs; float s, xk, lstres, eps, safmin; char transc[1]; trans_t transt; complex *work; float *rwork; int *iwork; extern double slamch_(char *); extern int clacon_(int *, complex *, complex *, float *, int *); #ifdef _CRAY extern int CCOPY(int *, complex *, int *, complex *, int *); extern int CSAXPY(int *, complex *, complex *, int *, complex *, int *); #else extern int ccopy_(int *, complex *, int *, complex *, int *); extern int caxpy_(int *, complex *, complex *, int *, complex *, int *); #endif Astore = A->Store; Aval = Astore->nzval; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; /* Test the input parameters */ *info = 0; notran = (trans == NOTRANS); if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE ) *info = -2; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SCP || L->Dtype != SLU_C || L->Mtype != SLU_TRLU ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NCP || U->Dtype != SLU_C || U->Mtype != SLU_TRU ) *info = -4; else if ( ldb < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) *info = -10; else if ( ldx < SUPERLU_MAX(0, A->nrow) || X->Stype != SLU_DN || X->Dtype != SLU_C || X->Mtype != SLU_GE ) *info = -11; if (*info != 0) { i = -(*info); xerbla_("cgsrfs", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || nrhs == 0) { for (j = 0; j < nrhs; ++j) { ferr[j] = 0.; berr[j] = 0.; } return; } rowequ = (equed == ROW) || (equed == BOTH); colequ = (equed == COL) || (equed == BOTH); /* Allocate working space */ work = complexMalloc(2*A->nrow); rwork = (float *) SUPERLU_MALLOC( (size_t) A->nrow * sizeof(float) ); iwork = intMalloc(A->nrow); if ( !work || !rwork || !iwork ) SUPERLU_ABORT("Malloc fails for work/rwork/iwork."); if ( notran ) { *(unsigned char *)transc = 'N'; transt = TRANS; } else { *(unsigned char *)transc = 'T'; transt = NOTRANS; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = slamch_("Epsilon"); safmin = slamch_("Safe minimum"); /* Set SAFE1 essentially to be the underflow threshold times the number of additions in each row. */ safe1 = nz * safmin; safe2 = safe1 / eps; /* Compute the number of nonzeros in each row (or column) of A */ for (i = 0; i < A->nrow; ++i) iwork[i] = 0; if ( notran ) { for (k = 0; k < A->ncol; ++k) for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) ++iwork[Astore->rowind[i]]; } else { for (k = 0; k < A->ncol; ++k) iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; } /* Copy one column of RHS B into Bjcol. */ Bjcol.Stype = B->Stype; Bjcol.Dtype = B->Dtype; Bjcol.Mtype = B->Mtype; Bjcol.nrow = B->nrow; Bjcol.ncol = 1; Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); if ( !Bjcol.Store ) SUPERLU_ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); Bjcol_store = Bjcol.Store; Bjcol_store->lda = ldb; Bjcol_store->nzval = work; /* address aliasing */ /* Do for each right hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; Bptr = &Bmat[j*ldb]; Xptr = &Xmat[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ #ifdef _CRAY CCOPY(&A->nrow, Bptr, &ione, work, &ione); #else ccopy_(&A->nrow, Bptr, &ione, work, &ione); #endif sp_cgemv(transc, ndone, A, Xptr, ione, done, work, ione); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th component of the numerator before dividing. */ for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if (notran) { for (k = 0; k < A->ncol; ++k) { xk = c_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; s += c_abs1(&Aval[i]) * c_abs1(&Xptr[irow]); } rwork[k] += s; } } s = 0.; for (i = 0; i < A->nrow; ++i) { if (rwork[i] > safe2) { s = SUPERLU_MAX( s, c_abs1(&work[i]) / rwork[i] ); } else if ( rwork[i] != 0.0 ) { s = SUPERLU_MAX( s, (c_abs1(&work[i]) + safe1) / rwork[i] ); } /* If rwork[i] is exactly 0.0, then we know the true residual also must be exactly 0.0. */ } berr[j] = s; /* Test stopping criterion. Continue iterating if 1) The residual BERR(J) is larger than machine epsilon, and 2) BERR(J) decreased by at least a factor of 2 during the last iteration, and 3) At most ITMAX iterations tried. */ if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { /* Update solution and try again. */ cgstrs (trans, L, U, perm_r, perm_c, &Bjcol, Gstat, info); #ifdef _CRAY CAXPY(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #else caxpy_(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #endif lstres = berr[j]; ++count; } else { break; } } /* end while */ /* Bound error from formula: norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use CLACON to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = c_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; xk = c_abs1( &Xptr[irow] ); s += c_abs1(&Aval[i]) * xk; } rwork[k] += s; } } for (i = 0; i < A->nrow; ++i) if (rwork[i] > safe2) rwork[i] = c_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; else rwork[i] = c_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; kase = 0; do { clacon_(&A->nrow, &work[A->nrow], work, &ferr[j], &kase); if (kase == 0) break; if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], R[i]); } cgstrs (transt, L, U, perm_r, perm_c, &Bjcol, Gstat, info); for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], rwork[i]); } } else { /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], rwork[i]); } cgstrs (trans, L, U, perm_r, perm_c, &Bjcol, Gstat, info); if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], R[i]); } } } while ( kase != 0 ); /* Normalize error. */ lstres = 0.; if ( notran && colequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, C[i] * c_abs1( &Xptr[i]) ); } else if ( !notran && rowequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, R[i] * c_abs1( &Xptr[i]) ); } else { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, c_abs1( &Xptr[i]) ); } if ( lstres != 0. ) ferr[j] /= lstres; } /* for each RHS j ... */ SUPERLU_FREE(work); SUPERLU_FREE(rwork); SUPERLU_FREE(iwork); SUPERLU_FREE(Bjcol.Store); return; } /* cgsrfs */
void dgstrs(trans_t trans, SuperMatrix *L, SuperMatrix *U, int *perm_r, int *perm_c, SuperMatrix *B, Gstat_t *Gstat, int *info) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * * Purpose * ======= * * dgstrs() solves a system of linear equations A*X=B or A'*X=B * with A sparse and B dense, using the LU factorization computed by * pdgstrf(). * * Arguments * ========= * * trans (input) Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A'* X = B (Transpose) * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U as computed by * pdgstrf(). Use compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SCP, Dtype = _D, Mtype = TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * pdgstrf(). Use column-wise storage scheme, i.e., U has types: * Stype = NCP, Dtype = _D, Mtype = TRU. * * perm_r (input) int* * Row permutation vector of size L->nrow, which defines the * permutation matrix Pr; perm_r[i] = j means row i of A is in * position j in Pr*A. * * perm_c (int*) dimension A->ncol * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * B (input/output) SuperMatrix* * B has types: Stype = DN, Dtype = _D, Mtype = GE. * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * Gstat (output) Gstat_t* * Record all the statistics about the triangular solves; * See Gstat_t structure defined in slu_mt_util.h. * * info (output) Diagnostics * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; double alpha = 1.0, beta = 1.0; #endif register int j, k, jcol, iptr, luptr, ksupno, istart, irow, bptr; register int fsupc, nsuper; int i, n, nsupc, nsupr, nrow, nrhs, ldb; int *supno; DNformat *Bstore; SCPformat *Lstore; NCPformat *Ustore; double *Lval, *Uval, *Bmat; double *work, *work_col, *rhs_work, *soln; flops_t solve_ops; void dprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; if ( trans != NOTRANS && trans != TRANS ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -4; else if ( ldb < SUPERLU_MAX(0, L->nrow) ) *info = -6; if ( *info ) { i = -(*info); xerbla_("dgstrs", &i); return; } n = L->nrow; work = doubleCalloc(n * nrhs); if ( !work ) SUPERLU_ABORT("Malloc fails for local work[]."); soln = doubleMalloc(n); if ( !soln ) SUPERLU_ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; supno = Lstore->col_to_sup; nsuper = Lstore->nsuper; solve_ops = 0; if ( trans == NOTRANS ) { /* Permute right hand sides to form Pr*B */ for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) { rhs_work = &Bmat[bptr]; for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } /* Forward solve PLy=Pb. */ /*>> for (k = 0; k < n; k += nsupc) { ksupno = supno[k]; */ for (ksupno = 0; ksupno <= nsuper; ++ksupno) { fsupc = L_FST_SUPC(ksupno); istart = L_SUB_START(fsupc); nsupr = L_SUB_END(fsupc) - istart; nsupc = L_LAST_SUPC(ksupno) - fsupc; nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1) * nrhs; solve_ops += 2 * nrow * nsupc * nrhs; if ( nsupc == 1 ) { for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) { rhs_work = &Bmat[bptr]; luptr = L_NZ_START(fsupc); for (iptr=istart+1; iptr < L_SUB_END(fsupc); iptr++){ irow = L_SUB(iptr); ++luptr; rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; } } } else { luptr = L_NZ_START(fsupc); #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); SGEMM(ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #else dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #endif for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) { rhs_work = &Bmat[bptr]; work_col = &work[j*n]; iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work_col[i]; /* Scatter */ work_col[i] = 0.0; iptr++; } } #else for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) { rhs_work = &Bmat[bptr]; dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], &rhs_work[fsupc], &work[0] ); iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work[i]; work[i] = 0.0; iptr++; } } #endif } /* if-else: nsupc == 1 ... */ } /* for L-solve */ #if ( DEBUGlevel>=2 ) printf("After L-solve: y=\n"); dprint_soln(n, nrhs, Bmat); #endif /* * Back solve Ux=y. */ /*>> for (k = n-1; k >= 0; k -= nsupc) { ksupno = supno[k]; */ for (ksupno = nsuper; ksupno >= 0; --ksupno) { fsupc = L_FST_SUPC(ksupno); istart = L_SUB_START(fsupc); nsupr = L_SUB_END(fsupc) - istart; nsupc = L_LAST_SUPC(ksupno) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1) * nrhs; /* dense triangular matrix */ if ( nsupc == 1 ) { rhs_work = &Bmat[0]; for (j = 0; j < nrhs; j++) { rhs_work[fsupc] /= Lval[luptr]; rhs_work += ldb; } } else { #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("U", strlen("U")); ftcs3 = _cptofcd("N", strlen("N")); STRSM(ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #else dtrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #endif #else for (j = 0, bptr = fsupc; j < nrhs; j++, bptr += ldb) { dusolve (nsupr, nsupc, &Lval[luptr], &Bmat[bptr]); } #endif } /* matrix-vector update */ for (j = 0, bptr = 0; j < nrhs; ++j, bptr += ldb) { rhs_work = &Bmat[bptr]; for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 2*(U_NZ_END(jcol) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_END(jcol); i++ ){ irow = U_SUB(i); rhs_work[irow] -= rhs_work[jcol] * Uval[i]; } } } } /* for U-solve */ #if ( DEBUGlevel>=2 ) printf("After U-solve: x=\n"); dprint_soln(n, nrhs, Bmat); #endif /* Compute the final solution X <= Pc*X. */ for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) { rhs_work = &Bmat[bptr]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } } else { /* Solve A'*X=B */ /* Permute right hand sides to form Pc'*B. */ for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) { rhs_work = &Bmat[bptr]; for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } for (k = 0; k < nrhs; ++k) { /* Multiply by inv(U'). */ sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], info); /* Multiply by inv(L'). */ sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], info); } /* Compute the final solution X <= Pr'*X (=inv(Pr)*X) */ for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) { rhs_work = &Bmat[bptr]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } } /* if-else trans */ Gstat->ops[TRISOLVE] = solve_ops; SUPERLU_FREE(work); SUPERLU_FREE(soln); }