Пример #1
0
void
get_colamd(
	   const int m,  /* number of rows in matrix A. */
	   const int n,  /* number of columns in matrix A. */
	   const int nnz,/* number of nonzeros in matrix A. */
	   int *colptr,  /* column pointer of size n+1 for matrix A. */
	   int *rowind,  /* row indices of size nz for matrix A. */
	   int *perm_c   /* out - the column permutation vector. */
	   )
{
    int Alen, *A, i, info, *p;
    double *knobs;

    Alen = colamd_recommended(nnz, m, n);

    if ( !(knobs = (double *) SUPERLU_MALLOC(COLAMD_KNOBS * sizeof(double))) )
        SUPERLU_ABORT("Malloc fails for knobs");
    colamd_set_defaults(knobs);

    if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) )
        SUPERLU_ABORT("Malloc fails for A[]");
    if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) )
        SUPERLU_ABORT("Malloc fails for p[]");
    for (i = 0; i <= n; ++i) p[i] = colptr[i];
    for (i = 0; i < nnz; ++i) A[i] = rowind[i];
    info = colamd(m, n, Alen, A, p, knobs);
    if ( info == FALSE ) SUPERLU_ABORT("COLAMD failed");

    for (i = 0; i < n; ++i) perm_c[p[i]] = i;

    SUPERLU_FREE(knobs);
    SUPERLU_FREE(A);
    SUPERLU_FREE(p);
}
Пример #2
0
/*
 * Allocate storage for various statistics.
 */
void
StatAlloc(const int n, const int nprocs, const int panel_size, 
	  const int relax, Gstat_t *Gstat)
{
    register int w;

    w = SUPERLU_MAX( panel_size, relax ) + 1;
    Gstat->panel_histo = intCalloc(w);
    Gstat->utime = (double *) doubleMalloc(NPHASES);
    Gstat->ops   = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t));
    
    if ( !(Gstat->procstat =
	   (procstat_t *) SUPERLU_MALLOC(nprocs*sizeof(procstat_t))) )
	SUPERLU_ABORT( "SUPERLU_MALLOC failed for procstat[]" );

#if (PRNTlevel==1)
    printf(".. StatAlloc(): n %d, nprocs %d, panel_size %d, relax %d\n",
		n, nprocs, panel_size, relax);
#endif
#ifdef PROFILE    
    if ( !(Gstat->panstat =
	   (panstat_t*) SUPERLU_MALLOC(n * sizeof(panstat_t))) )
	SUPERLU_ABORT( "SUPERLU_MALLOC failed for panstat[]" );
    Gstat->panhows = intCalloc(3);
    Gstat->height = intCalloc(n+1);
    if ( !(Gstat->flops_by_height =
	   (float *) SUPERLU_MALLOC(n * sizeof(float))) )
	SUPERLU_ABORT("SUPERLU_MALLOC failed for flops_by_height[]");
    
#endif
    
#ifdef PREDICT_OPT
    if ( !(cp_panel = (cp_panel_t *) SUPERLU_MALLOC(n * sizeof(cp_panel_t))) )
	SUPERLU_ABORT( "SUPERLU_MALLOC failed for cp_panel[]" );
    if ( !(desc_eft = (desc_eft_t *) SUPERLU_MALLOC(n * sizeof(desc_eft_t))) )
	SUPERLU_ABORT( "SUPERLU_MALLOC failed for desc_eft[]" );
    cp_firstkid = intMalloc(n+1);
    cp_nextkid = intMalloc(n+1);
#endif
    
}
Пример #3
0
/*
 * Check whether repfnz[] == EMPTY after reset.
 */
void check_repfnz(int n, int w, int jcol, int *repfnz)
{
    int jj, k;

    for (jj = jcol; jj < jcol+w; jj++) 
	for (k = 0; k < n; k++)
	    if ( repfnz[(jj-jcol)*n + k] != EMPTY ) {
		fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj,
			k, repfnz[(jj-jcol)*n + k]);
		SUPERLU_ABORT("repfnz[] not empty.");
	    }
}
Пример #4
0
int dcheck_perm(char *what, int n, int *perm)
{
    register int i;
    int          *marker;
    marker = (int *) intCalloc(n);

    for (i = 0; i < n; ++i) {
	if ( marker[perm[i]] == 1 || perm[i] >= n ) {
	    printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]);
	    SUPERLU_ABORT("Invalid perm.");
	} else {
	    marker[perm[i]] = 1;
	}
    }

    return 0;
}
Пример #5
0
/*
 * Check whether vec[*] == 0. For the two vectors dense[*] and tempv[*],
 * this invariant should be mantained before and after calling some
 * numeric update routines, such as "panel_bmod" and "column_bmod".
 */
void
scheck_zero_vec(int pnum, char *msg, int n, float *vec)
{
    register int i, nonzero;

    nonzero = FALSE;
    for (i = 0; i < n; ++i) {
        if (vec[i] != 0.0) {
            printf("(%d) vec[%d] = %.10e; should be zero!\n",
                   pnum, i, vec[i]);
            nonzero = TRUE;
        }
    }
    if ( nonzero ) {
        printf("(%d) %s\n", pnum, msg);
        SUPERLU_ABORT("Not a zero vector.");
    }
}
Пример #6
0
int
ParallelInit(int n, pxgstrf_relax_t *pxgstrf_relax, 
	     superlumt_options_t *superlumt_options, 
	     pxgstrf_shared_t *pxgstrf_shared)
{
    int      *etree = superlumt_options->etree;
    register int w, dad, ukids, i, j, k, rs, panel_size, relax;
    register int P, w_top, do_split = 0;
    panel_t panel_type;
    int      *panel_histo = pxgstrf_shared->Gstat->panel_histo;
    register int nthr, concurrency, info;
    Gstat_t  *Gstat = pxgstrf_shared->Gstat;

#if ( MACH==SUN )
    register int sync_type = USYNC_THREAD;
    
    /* Set concurrency level. */
    nthr = sysconf(_SC_NPROCESSORS_ONLN);
    thr_setconcurrency(nthr);            /* number of LWPs */
    concurrency = thr_getconcurrency();

#if ( PRNTlevel==1 )    
    printf(".. CPUs %d, concurrency (#LWP) %d, P %d\n",
	   nthr, concurrency, P);
#endif

    /* Initialize mutex variables. */
    pxgstrf_shared->lu_locks = (mutex_t *) 
        SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(mutex_t));
    for (i = 0; i < NO_GLU_LOCKS; ++i)
	mutex_init(&pxgstrf_shared->lu_locks[i], sync_type, 0);

#elif ( MACH==DEC || MACH==PTHREAD )
    pxgstrf_shared->lu_locks = (pthread_mutex_t *) 
        SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(pthread_mutex_t));
    for (i = 0; i < NO_GLU_LOCKS; ++i)
	pthread_mutex_init(&pxgstrf_shared->lu_locks[i], NULL);
#else

    pxgstrf_shared->lu_locks = (mutex_t *) SUPERLU_MALLOC(NO_GLU_LOCKS * sizeof(mutex_t));

#endif    
    
#if ( PRNTlevel==1 )
    printf(".. ParallelInit() ... nprocs %2d\n", superlumt_options->nprocs);
#endif

    pxgstrf_shared->spin_locks = intCalloc(n);
    pxgstrf_shared->pan_status = 
        (pan_status_t *) SUPERLU_MALLOC((n+1)*sizeof(pan_status_t));
    pxgstrf_shared->fb_cols    = intMalloc(n+1);

    panel_size = superlumt_options->panel_size;
    relax = superlumt_options->relax;
    w = SUPERLU_MAX(panel_size, relax) + 1;
    for (i = 0; i < w; ++i) panel_histo[i] = 0;
    pxgstrf_shared->num_splits = 0;
    
    if ( (info = queue_init(&pxgstrf_shared->taskq, n)) ) {
	fprintf(stderr, "ParallelInit(): %d\n", info);
	SUPERLU_ABORT("queue_init fails.");
    }

    /* Count children of each node in the etree. */
    for (i = 0; i <= n; ++i) pxgstrf_shared->pan_status[i].ukids = 0;
    for (i = 0; i < n; ++i) {
	dad = etree[i];
	++pxgstrf_shared->pan_status[dad].ukids;
    }

    
    /* Find the panel partitions and initialize each panel's status */

#ifdef PROFILE
    Gstat->num_panels = 0;
#endif

    pxgstrf_shared->tasks_remain = 0;
    rs = 1;   /* index for the next relaxed s-node */
    w_top = panel_size/2;
    if ( w_top == 0 ) w_top = 1;
    P = 12;

    for (i = 0; i < n; ) {
	if ( pxgstrf_relax[rs].fcol == i ) {
	    w = pxgstrf_relax[rs++].size;
	    panel_type = RELAXED_SNODE;
	    pxgstrf_shared->pan_status[i].state = CANGO;
	} else {
	    /* Adjust panel_size so that a panel won't overlap with
	       the next relaxed snode.     */
#if 0
	    /* Only works when etree is postordered. */
	    w = SUPERLU_MIN(panel_size, pxgstrf_relax[rs].fcol - i);
#else
	    w = panel_size;
	    for (k = i + 1; k < SUPERLU_MIN(i + panel_size, n); ++k)
		if ( k == pxgstrf_relax[rs].fcol ) {
		    w = k - i;  /* panel stops at column k-1 */
		    break;
		}
	    if ( k == n ) w = n - i;
#endif

#ifdef SPLIT_TOP
	    if ( !do_split ) {
	  	if ( (n-i) < panel_size * P ) do_split = 1;
	    }
	    if ( do_split && w > w_top ) { /* split large panel */
	    	w = w_top;
	    	++pxgstrf_shared->num_splits;
	    }
#endif
	    for (j = i+1; j < i + w; ++j) 
		/* Do not allow panel to cross a branch point in the etree. */
		if ( pxgstrf_shared->pan_status[j].ukids > 1 ) break;
	    w = j - i;    /* j should start a new panel */
	    panel_type = REGULAR_PANEL;
	    pxgstrf_shared->pan_status[i].state = UNREADY;
#ifdef DOMAINS
	    if ( in_domain[i] == TREE_DOMAIN ) panel_type = TREE_DOMAIN;
#endif
	}

	if ( panel_type == REGULAR_PANEL ) {
	    ++pxgstrf_shared->tasks_remain;
	    /*printf("nondomain panel %6d -- %6d\n", i, i+w-1);
	    fflush(stdout);*/
	}

	ukids = k = 0;
	for (j = i; j < i + w; ++j) {
	    pxgstrf_shared->pan_status[j].size = k--;
	    pxgstrf_shared->pan_status[j].type = panel_type;
	    ukids += pxgstrf_shared->pan_status[j].ukids;
	}
	pxgstrf_shared->pan_status[i].size = w; /* leading column */
	/* only count those kids outside the panel */
	pxgstrf_shared->pan_status[i].ukids = ukids - (w-1);
	panel_histo[w]++;
	
#ifdef PROFILE
	Gstat->panstat[i].size = w;
	++Gstat->num_panels;
#endif
	
	pxgstrf_shared->fb_cols[i] = i;
	i += w;    /* move to the next panel */

    } /* for i ... */
    
    /* Dummy root */
    pxgstrf_shared->pan_status[n].size = 1;
    pxgstrf_shared->pan_status[n].state = UNREADY;

#if ( PRNTlevel==1 )
    printf(".. Split: P %d, #nondomain panels %d\n", P, pxgstrf_shared->tasks_remain);
#endif
#ifdef DOMAINS
    EnqueueDomains(&pxgstrf_shared->taskq, list_head, pxgstrf_shared);
#else
    EnqueueRelaxSnode(&pxgstrf_shared->taskq, n, pxgstrf_relax, pxgstrf_shared);
#endif
#if ( PRNTlevel==1 )
    printf(".. # tasks %d\n", pxgstrf_shared->tasks_remain);
    fflush(stdout);
#endif

#ifdef PREDICT_OPT
    /* Set up structure describing children */
    for (i = 0; i <= n; cp_firstkid[i++] = EMPTY);
    for (i = n-1; i >= 0; i--) {
	dad = etree[i];
	cp_nextkid[i] = cp_firstkid[dad];
	cp_firstkid[dad] = i;
    }
#endif

    return 0;
} /* ParallelInit */
Пример #7
0
int
sp_zgemv(char *trans, doublecomplex alpha, SuperMatrix *A, doublecomplex *x, 
	 int incx, doublecomplex beta, doublecomplex *y, int incy)
{
/*  Purpose   
    =======   

    sp_zgemv()  performs one of the matrix-vector operations   
       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
    where alpha and beta are scalars, x and y are vectors and A is a
    sparse A->nrow by A->ncol matrix.   

    Parameters   
    ==========   

    TRANS  - (input) char*
             On entry, TRANS specifies the operation to be performed as   
             follows:   
                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   

    ALPHA  - (input) doublecomplex
             On entry, ALPHA specifies the scalar alpha.   

    A      - (input) SuperMatrix*
             Before entry, the leading m by n part of the array A must   
             contain the matrix of coefficients.   

    X      - (input) doublecomplex*, array of DIMENSION at least   
             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
             and at least   
             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
             Before entry, the incremented array X must contain the   
             vector x.   

    INCX   - (input) int
             On entry, INCX specifies the increment for the elements of   
             X. INCX must not be zero.   

    BETA   - (input) doublecomplex
             On entry, BETA specifies the scalar beta. When BETA is   
             supplied as zero then Y need not be set on input.   

    Y      - (output) doublecomplex*,  array of DIMENSION at least   
             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
             and at least   
             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
             Before entry with BETA non-zero, the incremented array Y   
             must contain the vector y. On exit, Y is overwritten by the 
             updated vector y.
	     
    INCY   - (input) int
             On entry, INCY specifies the increment for the elements of   
             Y. INCY must not be zero.   

    ==== Sparse Level 2 Blas routine.   
*/

    /* Local variables */
    NCformat *Astore;
    doublecomplex   *Aval;
    int info;
    doublecomplex temp, temp1;
    int lenx, leny, i, j, irow;
    int iy, jx, jy, kx, ky;
    int notran;
    doublecomplex comp_zero = {0.0, 0.0};
    doublecomplex comp_one = {1.0, 0.0};

    notran = lsame_(trans, "N");
    Astore = A->Store;
    Aval = Astore->nzval;
    
    /* Test the input parameters */
    info = 0;
    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
    else if (incx == 0) info = 5;
    else if (incy == 0)	info = 8;
    if (info != 0) {
	xerbla_("sp_zgemv ", &info);
	return 0;
    }

    /* Quick return if possible. */
    if (A->nrow == 0 || A->ncol == 0 || 
	z_eq(&alpha, &comp_zero) && 
	z_eq(&beta, &comp_one))
	return 0;


    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
       up the start points in  X  and  Y. */
    if (lsame_(trans, "N")) {
	lenx = A->ncol;
	leny = A->nrow;
    } else {
	lenx = A->nrow;
	leny = A->ncol;
    }
    if (incx > 0) kx = 0;
    else kx =  - (lenx - 1) * incx;
    if (incy > 0) ky = 0;
    else ky =  - (leny - 1) * incy;

    /* Start the operations. In this version the elements of A are   
       accessed sequentially with one pass through A. */
    /* First form  y := beta*y. */
    if ( !z_eq(&beta, &comp_one) ) {
	if (incy == 1) {
	    if ( z_eq(&beta, &comp_zero) )
		for (i = 0; i < leny; ++i) y[i] = comp_zero;
	    else
		for (i = 0; i < leny; ++i) 
		  zz_mult(&y[i], &beta, &y[i]);
	} else {
	    iy = ky;
	    if ( z_eq(&beta, &comp_zero) )
		for (i = 0; i < leny; ++i) {
		    y[iy] = comp_zero;
		    iy += incy;
		}
	    else
		for (i = 0; i < leny; ++i) {
		    zz_mult(&y[iy], &beta, &y[iy]);
		    iy += incy;
		}
	}
    }
    
    if ( z_eq(&alpha, &comp_zero) ) return 0;

    if ( notran ) {
	/* Form  y := alpha*A*x + y. */
	jx = kx;
	if (incy == 1) {
	    for (j = 0; j < A->ncol; ++j) {
		if ( !z_eq(&x[jx], &comp_zero) ) {
		    zz_mult(&temp, &alpha, &x[jx]);
		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
			irow = Astore->rowind[i];
			zz_mult(&temp1, &temp,  &Aval[i]);
			z_add(&y[irow], &y[irow], &temp1);
		    }
		}
		jx += incx;
	    }
	} else {
	    SUPERLU_ABORT("Not implemented.");
	}
    } else {
	/* Form  y := alpha*A'*x + y. */
	jy = ky;
	if (incx == 1) {
	    for (j = 0; j < A->ncol; ++j) {
		temp = comp_zero;
		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
		    irow = Astore->rowind[i];
		    zz_mult(&temp1, &Aval[i], &x[irow]);
		    z_add(&temp, &temp, &temp1);
		}
		zz_mult(&temp1, &alpha, &temp);
		z_add(&y[jy], &y[jy], &temp1);
		jy += incy;
	    }
	} else {
	    SUPERLU_ABORT("Not implemented.");
	}
    }
    return 0;    
} /* sp_zgemv */
Пример #8
0
void
pcgstrf(superlumt_options_t *superlumt_options, SuperMatrix *A, int *perm_r,
	SuperMatrix *L, SuperMatrix *U,	Gstat_t *Gstat, int *info)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 * PCGSTRF computes an LU factorization of a general sparse nrow-by-ncol
 * matrix A using partial pivoting with row interchanges. The factorization
 * has the form
 *     Pr * A = L * U
 * where Pr is a row permutation matrix, L is lower triangular with unit
 * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is
 * upper triangular (upper trapezoidal if A->nrow < A->ncol).
 *
 * Arguments
 * =========
 * 
 * superlumt_options (input) superlumt_options_t*
 *        The structure defines the parameters to control how the sparse
 *        LU factorization is performed. The following fields must be set 
 *        by the user:
 *
 *        o nprocs (int)
 *          Number of processes to be spawned and used for factorization.
 *
 *        o refact (yes_no_t)
 *          Specifies whether this is first time or subsequent factorization.
 *          = NO:  this factorization is treated as the first one;
 *          = YES: it means that a factorization was performed prior to this
 *                 one. Therefore, this factorization will re-use some
 *                 existing data structures, such as L and U storage, column
 *                 elimination tree, and the symbolic information of the
 *                 Householder matrix.
 *
 *        o panel_size (int)
 *          A panel consists of at most panel_size consecutive columns.
 *
 *        o relax (int)
 *          Degree of relaxing supernodes. If the number of nodes (columns)
 *          in a subtree of the elimination tree is less than relax, this 
 *          subtree is considered as one supernode, regardless of the row
 *          structures of those columns.
 *
 *        o diag_pivot_thresh (float)
 *	    Diagonal pivoting threshold. At step j of Gaussian elimination,
 *          if abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)),
 *          use A_jj as pivot. 0 <= diag_pivot_thresh <= 1. The default
 *          value is 1.0, corresponding to partial pivoting.
 *
 *        o usepr (yes_no_t)
 *          Whether the pivoting will use perm_r specified by the user.
 *          = YES: use perm_r; perm_r is input, unchanged on exit.
 *          = NO:  perm_r is determined by partial pivoting, and is output.
 *
 *        o drop_tol (double) (NOT IMPLEMENTED)
 *	    Drop tolerance parameter. At step j of the Gaussian elimination,
 *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
 *          0 <= drop_tol <= 1. The default value of drop_tol is 0,
 *          corresponding to not dropping any entry.
 *
 *        o perm_c (int*)
 *	    Column permutation vector of size A->ncol, which defines the 
 *          permutation matrix Pc; perm_c[i] = j means column i of A is 
 *          in position j in A*Pc.
 *
 *        o perm_r (int*)
 *	    Column permutation vector of size A->nrow.
 *          If superlumt_options->usepr = NO, this is an output argument.
 *
 *        o work (void*) of size lwork
 *          User-supplied work space and space for the output data structures.
 *          Not referenced if lwork = 0;
 *
 *        o lwork (int)
 *          Specifies the length of work array.
 *            = 0:  allocate space internally by system malloc;
 *            > 0:  use user-supplied work array of length lwork in bytes,
 *                  returns error if space runs out.
 *            = -1: the routine guesses the amount of space needed without
 *                  performing the factorization, and returns it in
 *                  superlu_memusage->total_needed; no other side effects.
 *
 * A      (input) SuperMatrix*
 *	  Original matrix A, permuted by columns, of dimension
 *        (A->nrow, A->ncol). The type of A can be:
 *        Stype = NCP; Dtype = _D; Mtype = GE.
 *
 * perm_r (input/output) int*, dimension A->nrow
 *        Row permutation vector which defines the permutation matrix Pr,
 *        perm_r[i] = j means row i of A is in position j in Pr*A.
 *        If superlumt_options->usepr = NO, perm_r is output argument;
 *        If superlumt_options->usepr = YES, the pivoting routine will try 
 *           to use the input perm_r, unless a certain threshold criterion
 *           is violated. In that case, perm_r is overwritten by a new
 *           permutation determined by partial pivoting or diagonal 
 *           threshold pivoting.
 *
 * L      (output) SuperMatrix*
 *        The factor L from the factorization Pr*A=L*U; use compressed row 
 *        subscripts storage for supernodes, i.e., L has type: 
 *        Stype = SCP, Dtype = _D, Mtype = TRLU.
 *
 * U      (output) SuperMatrix*
 *	  The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
 *        storage scheme, i.e., U has types: Stype = NCP, Dtype = _D,
 *        Mtype = TRU.
 *
 * Gstat  (output) Gstat_t*
 *        Record all the statistics about the factorization; 
 *        See Gstat_t structure defined in slu_mt_util.h.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        > 0: if info = i, and i is
 *             <= A->ncol: U(i,i) is exactly zero. The factorization has
 *                been completed, but the factor U is exactly singular,
 *                and division by zero will occur if it is used to solve a
 *                system of equations.
 *             > A->ncol: number of bytes allocated when memory allocation
 *                failure occurred, plus A->ncol.
 *
 */
    pcgstrf_threadarg_t *pcgstrf_threadarg;
    pxgstrf_shared_t pxgstrf_shared;
    register int nprocs = superlumt_options->nprocs;
    register int i, iinfo;
    double    *utime = Gstat->utime;
    double    usrtime, wtime;
    double    usertimer_();
#if ( MACH==SUN )
    thread_t  *thread_id;
#elif ( MACH==DEC || MACH==PTHREAD )
    pthread_t *thread_id;
    void      *status;
#endif
    void      *pcgstrf_thread(void *);


    /* --------------------------------------------------------------
       Initializes the parallel data structures for pcgstrf_thread().
       --------------------------------------------------------------*/
    pcgstrf_threadarg = pcgstrf_thread_init(A, L, U, superlumt_options,
					    &pxgstrf_shared, Gstat, info);
    if ( *info ) return;

    /* Start timing factorization. */
    usrtime = usertimer_();
    wtime = SuperLU_timer_(); 

    /* ------------------------------------------------------------
       On a SUN multiprocessor system, use Solaris thread.
       ------------------------------------------------------------*/
#if ( MACH==SUN )
    
    /* Create nproc threads for concurrent factorization. */
    thread_id = (thread_t *) SUPERLU_MALLOC(nprocs * sizeof(thread_t));
    
    for (i = 1; i < nprocs; ++i) {
#if ( PRNTlevel==1 )
	printf(".. Create unbound threads: i %d, nprocs %d\n", i, nprocs);
#endif
	if ( (iinfo = thr_create(NULL, 0,
			       pcgstrf_thread, &(pcgstrf_threadarg[i]),
			       0, &thread_id[i])) )
	{
	    fprintf(stderr, "thr_create: %d\n", iinfo);
	    SUPERLU_ABORT("thr_creat()");
	}
    }
	 
    pcgstrf_thread( &(pcgstrf_threadarg[0]) );
    
    /* Wait for all threads to terminate. */
    for (i = 1; i < nprocs; i++) thr_join(thread_id[i], 0, 0);
    SUPERLU_FREE (thread_id);
/* _SOLARIS_2 */


    /* ------------------------------------------------------------
       On a DEC multiprocessor system, use pthread.
       ------------------------------------------------------------*/
#elif ( MACH==DEC )	/* Use DECthreads ... */
    
    /* Create nproc threads for concurrent factorization. */
    thread_id = (pthread_t *) SUPERLU_MALLOC(nprocs * sizeof(pthread_t));
    
    for (i = 0; i < nprocs; ++i) {
	if ( iinfo = pthread_create(&thread_id[i],
				    NULL,
				    pcgstrf_thread, 
				    &(pcgstrf_threadarg[i])) ) {
	    fprintf(stderr, "pthread_create: %d\n", iinfo);
	    SUPERLU_ABORT("pthread_create()");
	}
	/*	pthread_bind_to_cpu_np(thread_id[i], i);*/
    }
	 
    /* Wait for all threads to terminate. */
    for (i = 0; i < nprocs; i++)
	pthread_join(thread_id[i], &status);
    SUPERLU_FREE (thread_id);
/* _DEC */

    
    /* ------------------------------------------------------------
       On a SGI Power Challenge or Origin multiprocessor system,
       use parallel C.
       ------------------------------------------------------------*/
#elif ( MACH==SGI || MACH==ORIGIN )	/* Use parallel C ... */

    if ( getenv("MP_SET_NUMTHREADS") ) {
        i = atoi(getenv("MP_SET_NUMTHREADS"));
	if ( nprocs > i ) {
	    printf("nprocs=%d > environment allowed: MP_SET_NUMTHREADS=%d\n",
		   nprocs, i);
	    exit(-1);
	}
    }
#pragma parallel
#pragma shared  (pcgstrf_threadarg)
/*#pragma numthreads (max = nprocs)*/
#pragma numthreads (nprocs)
    {
      pcgstrf_thread( pcgstrf_threadarg );
    }
/* _SGI or _ORIGIN */

    /* ------------------------------------------------------------
       On a Cray PVP multiprocessor system, use microtasking.
       ------------------------------------------------------------*/
#elif ( MACH==CRAY_PVP )       /* Use C microtasking. */

    if ( getenv("NCPUS") ) {
        i = atoi(getenv("NCPUS"));
	if ( nprocs > i ) {
	    printf("nprocs=%d > environment allowed: NCPUS=%d\n",
		   nprocs, i);
	    exit(-1);
	}
    }
#pragma _CRI taskloop private (i,nprocs)  shared (pcgstrf_threadarg)
    /* Stand-alone task loop */
    for (i = 0; i < nprocs; ++i) {
	pcgstrf_thread( &(pcgstrf_threadarg[i]) );
    }
/* _CRAY_PVP */

    /* ------------------------------------------------------------
       Use POSIX threads.
       ------------------------------------------------------------*/
#elif ( MACH==PTHREAD )	/* Use pthread ... */
    
    /* Create nproc threads for concurrent factorization. */
    thread_id = (pthread_t *) SUPERLU_MALLOC(nprocs * sizeof(pthread_t));
    
    for (i = 0; i < nprocs; ++i) {
	if ( iinfo = pthread_create(&thread_id[i],
				    NULL,
				    pcgstrf_thread, 
				    &(pcgstrf_threadarg[i])) ) {
	    fprintf(stderr, "pthread_create: %d\n", iinfo);
	    SUPERLU_ABORT("pthread_create()");
	}
    }
	 
    /* Wait for all threads to terminate. */
    for (i = 0; i < nprocs; i++)
	pthread_join(thread_id[i], &status);
    SUPERLU_FREE (thread_id);
/* _PTHREAD */

    /* ------------------------------------------------------------
       Use openMP.
       ------------------------------------------------------------*/
#elif ( MACH==OPENMP ) /* Use openMP ... */

#pragma omp parallel for shared (pcgstrf_threadarg) private (i)
    /* Stand-alone task loop */
    for (i = 0; i < nprocs; ++i) {
        pcgstrf_thread( &(pcgstrf_threadarg[i]) );
    }

/* _OPENMP */

    /* ------------------------------------------------------------
       On all other systems, use single processor.
       ------------------------------------------------------------*/
#else

    printf("pcgstrf() is not parallelized on this machine.\n");
    printf("pcgstrf() will be run on single processor.\n");
    pcgstrf_thread( &(pcgstrf_threadarg[0]) );
    
#endif    

    wtime = SuperLU_timer_() - wtime;
    usrtime = usertimer_() - usrtime;
    utime[FACT] = wtime;

#if ( PRNTlevel==1 )
    printf(".. pcgstrf_thread() returns info %d, usrtime %.2f, wtime %.2f\n",
           *info, usrtime, wtime);
#endif

    /* check_mem_leak("after pcgstrf_thread()"); */
    
    /* ------------------------------------------------------------
       Clean up and free storage after multithreaded factorization.
       ------------------------------------------------------------*/
    pcgstrf_thread_finalize(pcgstrf_threadarg, &pxgstrf_shared, 
			    A, perm_r, L, U);

}
Пример #9
0
double zlangs(char *norm, SuperMatrix *A)
{
/* 
    Purpose   
    =======   

    ZLANGS returns the value of the one norm, or the Frobenius norm, or 
    the infinity norm, or the element of largest absolute value of a 
    real matrix A.   

    Description   
    ===========   

    ZLANGS returns the value   

       ZLANGS = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
                (   
                ( norm1(A),         NORM = '1', 'O' or 'o'   
                (   
                ( normI(A),         NORM = 'I' or 'i'   
                (   
                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   

    where  norm1  denotes the  one norm of a matrix (maximum column sum), 
    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
    normF  denotes the  Frobenius norm of a matrix (square root of sum of 
    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   

    Arguments   
    =========   

    NORM    (input) CHARACTER*1   
            Specifies the value to be returned in ZLANGE as described above.   
    A       (input) SuperMatrix*
            The M by N sparse matrix A. 

   ===================================================================== 
*/
    
    /* Local variables */
    NCformat *Astore;
    doublecomplex   *Aval;
    int      i, j, irow;
    double   value, sum;
    double   *rwork;

    Astore = A->Store;
    Aval   = Astore->nzval;
    
    if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) {
	value = 0.;
	
    } else if (lsame_(norm, "M")) {
	/* Find max(abs(A(i,j))). */
	value = 0.;
	for (j = 0; j < A->ncol; ++j)
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++)
		value = SUPERLU_MAX( value, z_abs( &Aval[i]) );
	
    } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') {
	/* Find norm1(A). */
	value = 0.;
	for (j = 0; j < A->ncol; ++j) {
	    sum = 0.;
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) 
		sum += z_abs( &Aval[i] );
	    value = SUPERLU_MAX(value,sum);
	}
	
    } else if (lsame_(norm, "I")) {
	/* Find normI(A). */
	if ( !(rwork = (double *) SUPERLU_MALLOC((size_t) A->nrow * sizeof(double))) )
	    SUPERLU_ABORT("SUPERLU_MALLOC fails for rwork.");
	for (i = 0; i < A->nrow; ++i) rwork[i] = 0.;
	for (j = 0; j < A->ncol; ++j)
	    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) {
		irow = Astore->rowind[i];
		rwork[irow] += z_abs( &Aval[i] );
	    }
	value = 0.;
	for (i = 0; i < A->nrow; ++i)
	    value = SUPERLU_MAX(value, rwork[i]);
	
	SUPERLU_FREE (rwork);
	
    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
	/* Find normF(A). */
	SUPERLU_ABORT("Not implemented.");
    } else
	SUPERLU_ABORT("Illegal norm specified.");

    return (value);

} /* zlangs */
Пример #10
0
main(int argc, char *argv[])
{
    SuperMatrix A, AC, L, U, B;
    NCformat    *Astore;
    SCPformat   *Lstore;
    NCPformat   *Ustore;
    superlumt_options_t superlumt_options;
    pxgstrf_shared_t pxgstrf_shared;
    pdgstrf_threadarg_t *pdgstrf_threadarg;
    int         nprocs;
    fact_t      fact;
    trans_t     trans;
    yes_no_t    refact, usepr;
    double      u, drop_tol;
    double      *a;
    int         *asub, *xa;
    int         *perm_c; /* column permutation vector */
    int         *perm_r; /* row permutations from partial pivoting */
    void        *work;
    int         info, lwork, nrhs, ldx; 
    int         m, n, nnz, permc_spec, panel_size, relax;
    int         i, firstfact;
    double      *rhsb, *xact;
    Gstat_t Gstat;
    flops_t     flopcnt;
    void parse_command_line();

    /* Default parameters to control factorization. */
    nprocs = 1;
    fact  = EQUILIBRATE;
    trans = NOTRANS;
    panel_size = sp_ienv(1);
    relax = sp_ienv(2);
    u     = 1.0;
    usepr = NO;
    drop_tol = 0.0;
    work = NULL;
    lwork = 0;
    nrhs  = 1;

    /* Get the number of processes from command line. */
    parse_command_line(argc, argv, &nprocs);

    /* Read the input matrix stored in Harwell-Boeing format. */
    dreadhb(&m, &n, &nnz, &a, &asub, &xa);

    /* Set up the sparse matrix data structure for A. */
    dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE);

    if (!(rhsb = doubleMalloc(m * nrhs))) SUPERLU_ABORT("Malloc fails for rhsb[].");
    dCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_D, SLU_GE);
    xact = doubleMalloc(n * nrhs);
    ldx = n;
    dGenXtrue(n, nrhs, xact, ldx);
    dFillRHS(trans, nrhs, xact, ldx, &A, &B);
    
    if (!(perm_r = intMalloc(m))) SUPERLU_ABORT("Malloc fails for perm_r[].");
    if (!(perm_c = intMalloc(n))) SUPERLU_ABORT("Malloc fails for perm_c[].");


    /********************************
     * THE FIRST TIME FACTORIZATION *
     ********************************/

    /* ------------------------------------------------------------
       Allocate storage and initialize statistics variables. 
       ------------------------------------------------------------*/
    StatAlloc(n, nprocs, panel_size, relax, &Gstat);
    StatInit(n, nprocs, &Gstat);

    /* ------------------------------------------------------------
       Get column permutation vector perm_c[], according to permc_spec:
       permc_spec = 0: natural ordering 
       permc_spec = 1: minimum degree ordering on structure of A'*A
       permc_spec = 2: minimum degree ordering on structure of A'+A
       permc_spec = 3: approximate minimum degree for unsymmetric matrices
       ------------------------------------------------------------*/ 	
    permc_spec = 1;
    get_perm_c(permc_spec, &A, perm_c);

    /* ------------------------------------------------------------
       Initialize the option structure superlumt_options using the
       user-input parameters;
       Apply perm_c to the columns of original A to form AC.
       ------------------------------------------------------------*/
    refact= NO;
    pdgstrf_init(nprocs, fact, trans, refact, panel_size, relax,
		 u, usepr, drop_tol, perm_c, perm_r,
		 work, lwork, &A, &AC, &superlumt_options, &Gstat);

    /* ------------------------------------------------------------
       Compute the LU factorization of A.
       The following routine will create nprocs threads.
       ------------------------------------------------------------*/
    pdgstrf(&superlumt_options, &AC, perm_r, &L, &U, &Gstat, &info);
    
    flopcnt = 0;
    for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops;
    Gstat.ops[FACT] = flopcnt;

    /* ------------------------------------------------------------
       Solve the system A*X=B, overwriting B with X.
       ------------------------------------------------------------*/
    dgstrs(trans, &L, &U, perm_r, perm_c, &B, &Gstat, &info);
    
    printf("\n** Result of sparse LU **\n");
    dinf_norm_error(nrhs, &B, xact); /* Check inf. norm of the error */

    Destroy_CompCol_Permuted(&AC); /* Free extra arrays in AC. */


    /*********************************
     * THE SUBSEQUENT FACTORIZATIONS *
     *********************************/

    /* ------------------------------------------------------------
       Re-initialize statistics variables and options used by the
       factorization routine pdgstrf().
       ------------------------------------------------------------*/
    StatInit(n, nprocs, &Gstat);
    refact= YES;
    pdgstrf_init(nprocs, fact, trans, refact, panel_size, relax,
		 u, usepr, drop_tol, perm_c, perm_r,
		 work, lwork, &A, &AC, &superlumt_options, &Gstat);

    /* ------------------------------------------------------------
       Compute the LU factorization of A.
       The following routine will create nprocs threads.
       ------------------------------------------------------------*/
    pdgstrf(&superlumt_options, &AC, perm_r, &L, &U, &Gstat, &info);
    
    flopcnt = 0;
    for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops;
    Gstat.ops[FACT] = flopcnt;

    /* ------------------------------------------------------------
       Re-generate right-hand side B, then solve A*X= B.
       ------------------------------------------------------------*/
    dFillRHS(trans, nrhs, xact, ldx, &A, &B);
    dgstrs(trans, &L, &U, perm_r, perm_c, &B, &Gstat, &info);

    
     /* ------------------------------------------------------------
       Deallocate storage after factorization.
       ------------------------------------------------------------*/
    pxgstrf_finalize(&superlumt_options, &AC);

    printf("\n** Result of sparse LU **\n");
    dinf_norm_error(nrhs, &B, xact); /* Check inf. norm of the error */

    Lstore = (SCPformat *) L.Store;
    Ustore = (NCPformat *) U.Store;
    printf("No of nonzeros in factor L = %d\n", Lstore->nnz);
    printf("No of nonzeros in factor U = %d\n", Ustore->nnz);
    printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n);
    fflush(stdout);

    SUPERLU_FREE (rhsb);
    SUPERLU_FREE (xact);
    SUPERLU_FREE (perm_r);
    SUPERLU_FREE (perm_c);
    Destroy_CompCol_Matrix(&A);
    Destroy_SuperMatrix_Store(&B);
    if ( lwork >= 0 ) {
        Destroy_SuperNode_SCP(&L);
        Destroy_CompCol_NCP(&U);
    }
    StatFree(&Gstat);
}
Пример #11
0
int
sp_dgemv(char *trans, double alpha, SuperMatrix *A, double *x, 
	 int incx, double beta, double *y, int incy)
{
/*  Purpose   
    =======   

    sp_dgemv()  performs one of the matrix-vector operations   
       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
    where alpha and beta are scalars, x and y are vectors and A is a
    sparse A->nrow by A->ncol matrix.   

    Parameters   
    ==========   

    TRANS  - (input) char*
             On entry, TRANS specifies the operation to be performed as   
             follows:   
                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   

    ALPHA  - (input) double
             On entry, ALPHA specifies the scalar alpha.   

    A      - (input) SuperMatrix*
             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
             Currently, the type of A can be:
                 Stype = NC or NCP; Dtype = SLU_D; Mtype = GE. 
             In the future, more general A can be handled.

    X      - (input) double*, array of DIMENSION at least   
             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
             and at least   
             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
             Before entry, the incremented array X must contain the   
             vector x.   

    INCX   - (input) int
             On entry, INCX specifies the increment for the elements of   
             X. INCX must not be zero.   

    BETA   - (input) double
             On entry, BETA specifies the scalar beta. When BETA is   
             supplied as zero then Y need not be set on input.   

    Y      - (output) double*,  array of DIMENSION at least   
             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
             and at least   
             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
             Before entry with BETA non-zero, the incremented array Y   
             must contain the vector y. On exit, Y is overwritten by the 
             updated vector y.
	     
    INCY   - (input) int
             On entry, INCY specifies the increment for the elements of   
             Y. INCY must not be zero.   

    ==== Sparse Level 2 Blas routine.   
*/

    /* Local variables */
    NCformat *Astore;
    double   *Aval;
    int info;
    double temp;
    int lenx, leny, i, j, irow;
    int iy, jx, jy, kx, ky;
    int notran;

    notran = lsame_(trans, "N");
    Astore = A->Store;
    Aval = Astore->nzval;
    
    /* Test the input parameters */
    info = 0;
    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
    else if (incx == 0) info = 5;
    else if (incy == 0)	info = 8;
    if (info != 0) {
	xerbla_("sp_dgemv ", &info);
	return 0;
    }

    /* Quick return if possible. */
    if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
	return 0;

    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
       up the start points in  X  and  Y. */
    if (lsame_(trans, "N")) {
	lenx = A->ncol;
	leny = A->nrow;
    } else {
	lenx = A->nrow;
	leny = A->ncol;
    }
    if (incx > 0) kx = 0;
    else kx =  - (lenx - 1) * incx;
    if (incy > 0) ky = 0;
    else ky =  - (leny - 1) * incy;

    /* Start the operations. In this version the elements of A are   
       accessed sequentially with one pass through A. */
    /* First form  y := beta*y. */
    if (beta != 1.) {
	if (incy == 1) {
	    if (beta == 0.)
		for (i = 0; i < leny; ++i) y[i] = 0.;
	    else
		for (i = 0; i < leny; ++i) y[i] = beta * y[i];
	} else {
	    iy = ky;
	    if (beta == 0.)
		for (i = 0; i < leny; ++i) {
		    y[iy] = 0.;
		    iy += incy;
		}
	    else
		for (i = 0; i < leny; ++i) {
		    y[iy] = beta * y[iy];
		    iy += incy;
		}
	}
    }
    
    if (alpha == 0.) return 0;

    if ( notran ) {
	/* Form  y := alpha*A*x + y. */
	jx = kx;
	if (incy == 1) {
	    for (j = 0; j < A->ncol; ++j) {
		if (x[jx] != 0.) {
		    temp = alpha * x[jx];
		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
			irow = Astore->rowind[i];
			y[irow] += temp * Aval[i];
		    }
		}
		jx += incx;
	    }
	} else {
	    SUPERLU_ABORT("Not implemented.");
	}
    } else {
	/* Form  y := alpha*A'*x + y. */
	jy = ky;
	if (incx == 1) {
	    for (j = 0; j < A->ncol; ++j) {
		temp = 0.;
		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
		    irow = Astore->rowind[i];
		    temp += Aval[i] * x[irow];
		}
		y[jy] += alpha * temp;
		jy += incy;
	    }
	} else {
	    SUPERLU_ABORT("Not implemented.");
	}
    }
    return 0;
} /* sp_dgemv */
Пример #12
0
void
get_perm_c(int ispec, SuperMatrix *A, int *perm_c)
/*
 * Purpose
 * =======
 *
 * GET_PERM_C obtains a permutation matrix Pc, by applying the multiple
 * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A'.
 * or using approximate minimum degree column ordering by Davis et. al.
 * The LU factorization of A*Pc tends to have less fill than the LU 
 * factorization of A.
 *
 * Arguments
 * =========
 *
 * ispec   (input) int
 *         Specifies the type of column ordering to reduce fill:
 *         = 1: minimum degree on the structure of A^T * A
 *         = 2: minimum degree on the structure of A^T + A
 *         = 3: approximate minimum degree for unsymmetric matrices
 *         If ispec == 0, the natural ordering (i.e., Pc = I) is returned.
 * 
 * A       (input) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
 *         of the linear equations is A->nrow. Currently, the type of A 
 *         can be: Stype = NC; Dtype = _D; Mtype = GE. In the future,
 *         more general A can be handled.
 *
 * perm_c  (output) int*
 *	   Column permutation vector of size A->ncol, which defines the 
 *         permutation matrix Pc; perm_c[i] = j means column i of A is 
 *         in position j in A*Pc.
 *
 */
{
    NCformat *Astore = A->Store;
    int m, n, bnz, *b_colptr, i;
    int delta, maxint, nofsub, *invp;
    int *b_rowind, *dhead, *qsize, *llist, *marker;
    double t, SuperLU_timer_();
    
    m = A->nrow;
    n = A->ncol;

    t = SuperLU_timer_();
    switch ( ispec ) {
        case 0: /* Natural ordering */
	      for (i = 0; i < n; ++i) perm_c[i] = i;
	      //printf("Use natural column ordering.\n");	//FT Commented
	      return;
        case 1: /* Minimum degree ordering on A'*A */
	      getata(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
		     &bnz, &b_colptr, &b_rowind);
	      //printf("Use minimum degree ordering on A'*A.\n");	//FT Commented
	      t = SuperLU_timer_() - t;
	      /*printf("Form A'*A time = %8.3f\n", t);*/
	      break;
        case 2: /* Minimum degree ordering on A'+A */
	      if ( m != n ) SUPERLU_ABORT("Matrix is not square");
	      at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind,
			&bnz, &b_colptr, &b_rowind);
	      //printf("Use minimum degree ordering on A'+A.\n");	//FT Commented
	      t = SuperLU_timer_() - t;
	      /*printf("Form A'+A time = %8.3f\n", t);*/
	      break;
        case 3: /* Approximate minimum degree column ordering. */
	      get_colamd(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
			 perm_c);
	      //printf(".. Use approximate minimum degree column ordering.\n");	//FT Commented
	      return; 
        default:
	      SUPERLU_ABORT("Invalid ISPEC");
    }

    if ( bnz != 0 ) {
	t = SuperLU_timer_();

	/* Initialize and allocate storage for GENMMD. */
	delta = 0; /* DELTA is a parameter to allow the choice of nodes
		      whose degree <= min-degree + DELTA. */
	maxint = 2147483647; /* 2**31 - 1 */
	invp = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
	if ( !invp ) SUPERLU_ABORT("SUPERLU_MALLOC fails for invp.");
	dhead = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
	if ( !dhead ) SUPERLU_ABORT("SUPERLU_MALLOC fails for dhead.");
	qsize = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
	if ( !qsize ) SUPERLU_ABORT("SUPERLU_MALLOC fails for qsize.");
	llist = (int *) SUPERLU_MALLOC(n*sizeof(int));
	if ( !llist ) SUPERLU_ABORT("SUPERLU_MALLOC fails for llist.");
	marker = (int *) SUPERLU_MALLOC(n*sizeof(int));
	if ( !marker ) SUPERLU_ABORT("SUPERLU_MALLOC fails for marker.");

	/* Transform adjacency list into 1-based indexing required by GENMMD.*/
	for (i = 0; i <= n; ++i) ++b_colptr[i];
	for (i = 0; i < bnz; ++i) ++b_rowind[i];
	
	genmmd_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, 
		qsize, llist, marker, &maxint, &nofsub);

	/* Transform perm_c into 0-based indexing. */
	for (i = 0; i < n; ++i) --perm_c[i];

	SUPERLU_FREE(b_colptr);
	SUPERLU_FREE(b_rowind);
	SUPERLU_FREE(invp);
	SUPERLU_FREE(dhead);
	SUPERLU_FREE(qsize);
	SUPERLU_FREE(llist);
	SUPERLU_FREE(marker);

	t = SuperLU_timer_() - t;
	/*  printf("call GENMMD time = %8.3f\n", t);*/

    } else { /* Empty adjacency structure */
	for (i = 0; i < n; ++i) perm_c[i] = i;
    }

}
Пример #13
0
void
at_plus_a(
	  const int n,      /* number of columns in matrix A. */
	  const int nz,     /* number of nonzeros in matrix A */
	  int *colptr,      /* column pointer of size n+1 for matrix A. */
	  int *rowind,      /* row indices of size nz for matrix A. */
	  int *bnz,         /* out - on exit, returns the actual number of
                               nonzeros in matrix A'*A. */
	  int **b_colptr,   /* out - size n+1 */
	  int **b_rowind    /* out - size *bnz */
	  )
{
/*
 * Purpose
 * =======
 *
 * Form the structure of A'+A. A is an n-by-n matrix in column oriented
 * format represented by (colptr, rowind). The output A'+A is in column
 * oriented format (symmetrically, also row oriented), represented by
 * (b_colptr, b_rowind).
 *
 */
    register int i, j, k, col, num_nz;
    int *t_colptr, *t_rowind; /* a column oriented form of T = A' */
    int *marker;

    if ( !(marker = (int*) SUPERLU_MALLOC( n * sizeof(int)) ) )
	SUPERLU_ABORT("SUPERLU_MALLOC fails for marker[]");
    if ( !(t_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
	SUPERLU_ABORT("SUPERLU_MALLOC fails for t_colptr[]");
    if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) )
	SUPERLU_ABORT("SUPERLU_MALLOC fails t_rowind[]");

    
    /* Get counts of each column of T, and set up column pointers */
    for (i = 0; i < n; ++i) marker[i] = 0;
    for (j = 0; j < n; ++j) {
	for (i = colptr[j]; i < colptr[j+1]; ++i)
	    ++marker[rowind[i]];
    }
    t_colptr[0] = 0;
    for (i = 0; i < n; ++i) {
	t_colptr[i+1] = t_colptr[i] + marker[i];
	marker[i] = t_colptr[i];
    }

    /* Transpose the matrix from A to T */
    for (j = 0; j < n; ++j)
	for (i = colptr[j]; i < colptr[j+1]; ++i) {
	    col = rowind[i];
	    t_rowind[marker[col]] = j;
	    ++marker[col];
	}


    /* ----------------------------------------------------------------
       compute B = A + T, where column j of B is:

       Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k)

       do not include the diagonal entry
       ---------------------------------------------------------------- */

    /* Zero the diagonal flag */
    for (i = 0; i < n; ++i) marker[i] = -1;

    /* First pass determines number of nonzeros in B */
    num_nz = 0;
    for (j = 0; j < n; ++j) {
	/* Flag the diagonal so it's not included in the B matrix */
	marker[j] = j;

	/* Add pattern of column A_*k to B_*j */
	for (i = colptr[j]; i < colptr[j+1]; ++i) {
	    k = rowind[i];
	    if ( marker[k] != j ) {
		marker[k] = j;
		++num_nz;
	    }
	}

	/* Add pattern of column T_*k to B_*j */
	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
	    k = t_rowind[i];
	    if ( marker[k] != j ) {
		marker[k] = j;
		++num_nz;
	    }
	}
    }
    *bnz = num_nz;
    
    /* Allocate storage for A+A' */
    if ( !(*b_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
	SUPERLU_ABORT("SUPERLU_MALLOC fails for b_colptr[]");
    if ( *bnz) {
      if ( !(*b_rowind = (int*) SUPERLU_MALLOC( *bnz * sizeof(int)) ) )
	SUPERLU_ABORT("SUPERLU_MALLOC fails for b_rowind[]");
    }
    
    /* Zero the diagonal flag */
    for (i = 0; i < n; ++i) marker[i] = -1;
    
    /* Compute each column of B, one at a time */
    num_nz = 0;
    for (j = 0; j < n; ++j) {
	(*b_colptr)[j] = num_nz;
	
	/* Flag the diagonal so it's not included in the B matrix */
	marker[j] = j;

	/* Add pattern of column A_*k to B_*j */
	for (i = colptr[j]; i < colptr[j+1]; ++i) {
	    k = rowind[i];
	    if ( marker[k] != j ) {
		marker[k] = j;
		(*b_rowind)[num_nz++] = k;
	    }
	}

	/* Add pattern of column T_*k to B_*j */
	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
	    k = t_rowind[i];
	    if ( marker[k] != j ) {
		marker[k] = j;
		(*b_rowind)[num_nz++] = k;
	    }
	}
    }
    (*b_colptr)[n] = num_nz;
       
    SUPERLU_FREE(marker);
    SUPERLU_FREE(t_colptr);
    SUPERLU_FREE(t_rowind);
}
Пример #14
0
void
sp_colorder(SuperMatrix *A, int *perm_c, superlumt_options_t *options,
	    SuperMatrix *AC)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======
 *
 * sp_colorder() permutes the columns of the original matrix A into AC. 
 * It performs the following steps:
 *
 *    1. Apply column permutation perm_c[] to A's column pointers to form AC;
 *
 *    2. If options->refact = NO, then
 *       (1) Allocate etree[], and compute column etree etree[] of AC'AC;
 *       (2) Post order etree[] to get a postordered elimination tree etree[],
 *           and a postorder permutation post[];
 *       (3) Apply post[] permutation to columns of AC;
 *       (4) Overwrite perm_c[] with the product perm_c * post.
 *       (5) Allocate storage, and compute the column count (colcnt_h) and the
 *           supernode partition (part_super_h) for the Householder matrix H.
 *
 * Arguments
 * =========
 *
 * A      (input) SuperMatrix*
 *        Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
 *        of the linear equations is A->nrow. Currently, the type of A can be:
 *        Stype = NC or NCP; Dtype = _D; Mtype = GE.
 *
 * perm_c (input/output) int*
 *	  Column permutation vector of size A->ncol, which defines the 
 *        permutation matrix Pc; perm_c[i] = j means column i of A is 
 *        in position j in A*Pc.
 *
 * options (input/output) superlumt_options_t*
 *        If options->refact = YES, then options is an
 *        input argument. The arrays etree[], colcnt_h[] and part_super_h[]
 *        are available from a previous factor and will be re-used.
 *        If options->refact = NO, then options is an output argument. 
 *
 * AC     (output) SuperMatrix*
 *        The resulting matrix after applied the column permutation
 *        perm_c[] to matrix A. The type of AC can be:
 *        Stype = NCP; Dtype = _D; Mtype = GE.
 *
 */

    NCformat  *Astore;
    NCPformat *ACstore;
    int i, n, nnz, nlnz;
    yes_no_t  refact = options->refact;
    int *etree;
    int *colcnt_h;
    int *part_super_h;
    int *iwork, *post, *iperm;
    int *invp;
    int *part_super_ata;

    extern void at_plus_a(const int, const int,	int *, int *,
			  int *, int **, int **, int);

    n     = A->ncol;
    iwork = intMalloc(n+1);
    part_super_ata = intMalloc(n);
    
    /* Apply column permutation perm_c to A's column pointers so to
       obtain NCP format in AC = A*Pc.  */
    AC->Stype       = SLU_NCP;
    AC->Dtype       = A->Dtype;
    AC->Mtype       = A->Mtype;
    AC->nrow        = A->nrow;
    AC->ncol        = A->ncol;
    Astore          = A->Store;
    ACstore = AC->Store = (void *) malloc( sizeof(NCPformat) );
    ACstore->nnz    = Astore->nnz;
    ACstore->nzval  = Astore->nzval;
    ACstore->rowind = Astore->rowind;
    ACstore->colbeg = intMalloc(n);
    ACstore->colend = intMalloc(n);
    nnz             = Astore->nnz;

#ifdef CHK_COLORDER
    print_int_vec("pre_order:", n, perm_c);
    dcheck_perm("Initial perm_c", n, perm_c);
#endif      

    for (i = 0; i < n; i++) {
	ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; 
	ACstore->colend[perm_c[i]] = Astore->colptr[i+1];
    }
	
    if ( refact == NO ) {
	int *b_colptr, *b_rowind, bnz, j;
	
	options->etree = etree = intMalloc(n);
	options->colcnt_h = colcnt_h = intMalloc(n);
	options->part_super_h = part_super_h = intMalloc(n);
	
	if ( options->SymmetricMode ) {
	    /* Compute the etree of C = Pc*(A'+A)*Pc'. */
	    int *c_colbeg, *c_colend;

	    /* Form B = A + A'. */
	    at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind,
		      &bnz, &b_colptr, &b_rowind, 1);
	    
	    /* Form C = Pc*B*Pc'. */
	    c_colbeg = (int_t*) intMalloc(n);
	    c_colend = (int_t*) intMalloc(n);
	    if (!(c_colbeg) || !(c_colend) )
		SUPERLU_ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend");
	    for (i = 0; i < n; i++) {
		c_colbeg[perm_c[i]] = b_colptr[i]; 
		c_colend[perm_c[i]] = b_colptr[i+1];
	    }
	    for (j = 0; j < n; ++j) {
		for (i = c_colbeg[j]; i < c_colend[j]; ++i) {
		    b_rowind[i] = perm_c[b_rowind[i]];
		}
		iwork[perm_c[j]] = j; /* inverse perm_c */
	    }

	    /* Compute etree of C. */
	    sp_symetree(c_colbeg, c_colend, b_rowind, n, etree);

	    /* Restore B to be A+A', without column permutation */
	    for (i = 0; i < bnz; ++i)
		b_rowind[i] = iwork[b_rowind[i]];

	    SUPERLU_FREE(c_colbeg);
	    SUPERLU_FREE(c_colend);
	    
	} else {
	    /* Compute the column elimination tree. */
	    sp_coletree(ACstore->colbeg, ACstore->colend, ACstore->rowind,
			A->nrow, A->ncol, etree);
	}

#ifdef CHK_COLORDER	
	print_int_vec("etree:", n, etree);
#endif	

	/* Post order etree. */
	post = (int *) TreePostorder(n, etree);
	invp  = intMalloc(n);
	for (i = 0; i < n; ++i) invp[post[i]] = i;

#ifdef CHK_COLORDER
	print_int_vec("post:", n+1, post);
	dcheck_perm("post", n, post);	
#endif	

	/* Renumber etree in postorder. */
	for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]];
	for (i = 0; i < n; ++i) etree[i] = iwork[i];

#ifdef CHK_COLORDER	
	print_int_vec("postorder etree:", n, etree);
#endif

	/* Postmultiply A*Pc by post[]. */
	for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i];
	for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i];
	for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i];
	for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i];

	for (i = 0; i < n; ++i)
	    iwork[i] = post[perm_c[i]];  /* product of perm_c and post */
	for (i = 0; i < n; ++i) perm_c[i] = iwork[i];
	for (i = 0; i < n; ++i) invp[perm_c[i]] = i; /* inverse of perm_c */

	iperm = post; /* alias to the same address */

#ifdef ZFD_PERM
	/* Permute the rows of AC to have zero-free diagonal. */
	printf("** Permute the rows to have zero-free diagonal....\n");
	for (i = 0; i < n; ++i)
	    iwork[i] = ACstore->colend[i] - ACstore->colbeg[i];
	zfdperm(n, nnz, ACstore->rowind, ACstore->colbeg, iwork, iperm);
#else
	for (i = 0; i < n; ++i) iperm[i] = i;
#endif	

	/* NOTE: iperm is returned as column permutation so that
	 * the diagonal is nonzero. Since a symmetric permutation
	 * preserves the diagonal, we can do the following:
	 *     P'(AP')P = P'A
	 * That is, we apply the inverse of iperm to rows of A
	 * to get zero-free diagonal. But since iperm is defined
	 * in MC21A inversely as our definition of permutation,
	 * so it is indeed an inverse for our purpose. We can
	 * apply it directly.
	 */

	if ( options->SymmetricMode ) {
	    /* Determine column count in the Cholesky factor of B = A+A' */
#if 0
	    cholnzcnt(n, Astore->colptr, Astore->rowind,
		      invp, perm_c, etree, colcnt_h, &nlnz, part_super_h);
#else
	    cholnzcnt(n, b_colptr, b_rowind,
		      invp, perm_c, etree, colcnt_h, &nlnz, part_super_h);
#endif
#if ( PRNTlevel>=1 ) 
	    printf(".. bnz %d\n", bnz);
#endif

	    SUPERLU_FREE(b_colptr);
	    if ( bnz ) SUPERLU_FREE(b_rowind);

	} else {
	    /* Determine the row and column counts in the QR factor. */
	    qrnzcnt(n, nnz, Astore->colptr, Astore->rowind, iperm,
		    invp, perm_c, etree, colcnt_h, &nlnz,
		    part_super_ata, part_super_h);
	}

#if ( PRNTlevel>=2 )
	dCheckZeroDiagonal(n, ACstore->rowind, ACstore->colbeg,
			   ACstore->colend, perm_c);
	print_int_vec("colcnt", n, colcnt_h);
	dPrintSuperPart("Hpart", n, part_super_h);
	print_int_vec("iperm", n, iperm);
#endif	
	
#ifdef CHK_COLORDER
	print_int_vec("Pc*post:", n, perm_c);
	dcheck_perm("final perm_c", n, perm_c);	
#endif

	SUPERLU_FREE (post);
	SUPERLU_FREE (invp);

    } /* if refact == NO */

    SUPERLU_FREE (iwork);
    SUPERLU_FREE (part_super_ata);
}
Пример #15
0
int main ( int argc, char *argv[] )

/******************************************************************************/
/*
  Purpose:

    MAIN is the main program for PSLINSOL.

  Licensing:

    This code is distributed under the GNU LGPL license. 

  Modified:

    10 February 2014

  Author:

    Xiaoye Li
*/
{
  SuperMatrix   A;
  NCformat *Astore;
  float   *a;
  int      *asub, *xa;
  int      *perm_r; /* row permutations from partial pivoting */
  int      *perm_c; /* column permutation vector */
  SuperMatrix   L;       /* factor L */
  SCPformat *Lstore;
  SuperMatrix   U;       /* factor U */
  NCPformat *Ustore;
  SuperMatrix   B;
  int      nrhs, ldx, info, m, n, nnz, b;
  int      nprocs; /* maximum number of processors to use. */
  int      panel_size, relax, maxsup;
  int      permc_spec;
  trans_t  trans;
  float   *xact, *rhs;
  superlu_memusage_t   superlu_memusage;
  void   parse_command_line();

  timestamp ( );
  printf ( "\n" );
  printf ( "PSLINSOL:\n" );
  printf ( "  C/OpenMP version\n" );
  printf ( "  Call the OpenMP version of SuperLU to solve a linear system.\n" );

  nrhs              = 1;
  trans             = NOTRANS;
  nprocs             = 1;
  n                 = 1000;
  b                 = 1;
  panel_size        = sp_ienv(1);
  relax             = sp_ienv(2);
  maxsup            = sp_ienv(3);
/*
  Check for any commandline input.
*/  
  parse_command_line ( argc, argv, &nprocs, &n, &b, &panel_size, 
    &relax, &maxsup );

#if ( PRNTlevel>=1 || DEBUGlevel>=1 )
    cpp_defs();
#endif

#define HB
#if defined( DEN )
    m = n;
    nnz = n * n;
    sband(n, n, nnz, &a, &asub, &xa);
#elif defined( BAND )
    m = n;
    nnz = (2*b+1) * n;
    sband(n, b, nnz, &a, &asub, &xa);
#elif defined( BD )
    nb = 5;
    bs = 200;
    m = n = bs * nb;
    nnz = bs * bs * nb;
    sblockdiag(nb, bs, nnz, &a, &asub, &xa);
#elif defined( HB )
    sreadhb(&m, &n, &nnz, &a, &asub, &xa);
#else    
    sreadmt(&m, &n, &nnz, &a, &asub, &xa);
#endif

    sCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_S, SLU_GE);
    Astore = A.Store;
    printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz);
    
    if (!(rhs = floatMalloc(m * nrhs))) SUPERLU_ABORT("Malloc fails for rhs[].");
    sCreate_Dense_Matrix(&B, m, nrhs, rhs, m, SLU_DN, SLU_S, SLU_GE);
    xact = floatMalloc(n * nrhs);
    ldx = n;
    sGenXtrue(n, nrhs, xact, ldx);
    sFillRHS(trans, nrhs, xact, ldx, &A, &B);

    if (!(perm_r = intMalloc(m))) SUPERLU_ABORT("Malloc fails for perm_r[].");
    if (!(perm_c = intMalloc(n))) SUPERLU_ABORT("Malloc fails for perm_c[].");

    /*
     * Get column permutation vector perm_c[], according to permc_spec:
     *   permc_spec = 0: natural ordering 
     *   permc_spec = 1: minimum degree ordering on structure of A'*A
     *   permc_spec = 2: minimum degree ordering on structure of A'+A
     *   permc_spec = 3: approximate minimum degree for unsymmetric matrices
     */    	
    permc_spec = 1;
    get_perm_c(permc_spec, &A, perm_c);

    psgssv(nprocs, &A, perm_c, perm_r, &L, &U, &B, &info);
    
    if ( info == 0 ) {
	sinf_norm_error(nrhs, &B, xact); /* Inf. norm of the error */

	Lstore = (SCPformat *) L.Store;
	Ustore = (NCPformat *) U.Store;
    	printf("#NZ in factor L = %d\n", Lstore->nnz);
    	printf("#NZ in factor U = %d\n", Ustore->nnz);
    	printf("#NZ in L+U = %d\n", Lstore->nnz + Ustore->nnz - L.ncol);
	
	superlu_sQuerySpace(nprocs, &L, &U, panel_size, &superlu_memusage);
	printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n",
	       superlu_memusage.for_lu/1024/1024, 
	       superlu_memusage.total_needed/1024/1024,
	       superlu_memusage.expansions);

    }

    SUPERLU_FREE (rhs);
    SUPERLU_FREE (xact);
    SUPERLU_FREE (perm_r);
    SUPERLU_FREE (perm_c);
    Destroy_CompCol_Matrix(&A);
    Destroy_SuperMatrix_Store(&B);
    Destroy_SuperNode_SCP(&L);
    Destroy_CompCol_NCP(&U);
/*
  Terminate.
*/
  printf ( "\n" );
  printf ( "PSLINSOL:\n" );
  printf ( "  Normal end of execution.\n" );
  printf ( "\n" );
  timestamp ( );

  return 0;
}
Пример #16
0
int
sp_dtrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, 
         SuperMatrix *U, double *x, int *info)
{
/*
 *   Purpose
 *   =======
 *
 *   sp_dtrsv() solves one of the systems of equations   
 *       A*x = b,   or   A'*x = b,
 *   where b and x are n element vectors and A is a sparse unit , or   
 *   non-unit, upper or lower triangular matrix.   
 *   No test for singularity or near-singularity is included in this   
 *   routine. Such tests must be performed before calling this routine.   
 *
 *   Parameters   
 *   ==========   
 *
 *   uplo   - (input) char*
 *            On entry, uplo specifies whether the matrix is an upper or   
 *             lower triangular matrix as follows:   
 *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
 *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
 *
 *   trans  - (input) char*
 *             On entry, trans specifies the equations to be solved as   
 *             follows:   
 *                trans = 'N' or 'n'   A*x = b.   
 *                trans = 'T' or 't'   A'*x = b.
 *                trans = 'C' or 'c'   A'*x = b.   
 *
 *   diag   - (input) char*
 *             On entry, diag specifies whether or not A is unit   
 *             triangular as follows:   
 *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
 *                diag = 'N' or 'n'   A is not assumed to be unit   
 *                                    triangular.   
 *	     
 *   L       - (input) SuperMatrix*
 *	       The factor L from the factorization Pr*A*Pc=L*U. Use
 *             compressed row subscripts storage for supernodes,
 *             i.e., L has types: Stype = SC, Dtype = _D, Mtype = TRLU.
 *
 *   U       - (input) SuperMatrix*
 *	        The factor U from the factorization Pr*A*Pc=L*U.
 *	        U has types: Stype = NCP, Dtype = _D, Mtype = TRU.
 *    
 *   x       - (input/output) double*
 *             Before entry, the incremented array X must contain the n   
 *             element right-hand side vector b. On exit, X is overwritten 
 *             with the solution vector x.
 *
 *   info    - (output) int*
 *             If *info = -i, the i-th argument had an illegal value.
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1, ftcs2, ftcs3;
#endif
    SCPformat *Lstore;
    NCPformat *Ustore;
    double   *Lval, *Uval;
    int incx = 1, incy = 1;
    double alpha = 1.0, beta = 1.0;
    register int fsupc, luptr, istart, irow, k, iptr, jcol, nsuper;
    int          nsupr, nsupc, nrow, i;
    double *work;
    flops_t solve_ops;

    /* Test the input parameters */
    *info = 0;
    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2;
    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
    if ( *info ) {
	i = -(*info);
	xerbla_("sp_dtrsv", &i);
	return 0;
    }

    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    nsuper = Lstore->nsuper;
    solve_ops = 0;

    if ( !(work = doubleCalloc(L->nrow)) )
	SUPERLU_ABORT("Malloc fails for work in sp_dtrsv().");
    
    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
	
	if ( lsame_(uplo, "L") ) {
	    /* Form x := inv(L)*x */
    	    if ( L->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = 0; k <= nsuper; k++) {
		fsupc = L_FST_SUPC(k);
		istart = L_SUB_START(fsupc);
                nsupr = L_SUB_END(fsupc) - istart;
                nsupc = L_LAST_SUPC(k) - fsupc;
		luptr = L_NZ_START(fsupc);
		nrow = nsupr - nsupc;

	        solve_ops += nsupc * (nsupc - 1);
	        solve_ops += 2 * nrow * nsupc;

		if ( nsupc == 1 ) {
		    for (iptr=istart+1; iptr < L_SUB_END(fsupc); ++iptr) {
			irow = L_SUB(iptr);
			++luptr;
			x[irow] -= x[fsupc] * Lval[luptr];
		    }
		} else {
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd("N", strlen("N"));
                    ftcs3 = _cptofcd("U", strlen("U"));

		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
		       	&x[fsupc], &incx);
		
		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#else
		    dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
		       	&x[fsupc], &incx);
		
		    dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#endif
#else
		    dlsolve (nsupr, nsupc, &Lval[luptr], &x[fsupc]);
		
		    dmatvec (nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
                             &x[fsupc], &work[0] );
#endif		
		
		    iptr = istart + nsupc;
		    for (i = 0; i < nrow; ++i, ++iptr) {
			irow = L_SUB(iptr);
			x[irow] -= work[i];	/* Scatter */
			work[i] = 0.0;

		    }
	 	}
	    } /* for k ... */
	    
	} else {
	    /* Form x := inv(U)*x */
	    
	    if ( U->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = nsuper; k >= 0; k--) {
	    	fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_END(fsupc) - L_SUB_START(fsupc);
                nsupc = L_LAST_SUPC(k) - fsupc;
	    	luptr = L_NZ_START(fsupc);
		
    	        solve_ops += nsupc * (nsupc + 1);

		if ( nsupc == 1 ) {
		    x[fsupc] /= Lval[luptr];
		    for (i = U_NZ_START(fsupc); i < U_NZ_END(fsupc); ++i) {
			irow = U_SUB(i);
			x[irow] -= x[fsupc] * Uval[i];
		    }
		} else {
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd("N", strlen("N"));
                    ftcs3 = _cptofcd("N", strlen("N"));

		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
		       &x[fsupc], &incx);
#else
		    dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
#else		
		    dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
#endif		

                    for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
		        solve_ops += 2*(U_NZ_END(jcol) - U_NZ_START(jcol));
		    	for (i = U_NZ_START(jcol); i < U_NZ_END(jcol); i++) {
			    irow = U_SUB(i);
			    x[irow] -= x[jcol] * Uval[i];
		    	}
                    }
		}
	    } /* for k ... */
	    
	}
    } else { /* Form x := inv(A')*x */
	
	if ( lsame_(uplo, "L") ) {
	    /* Form x := inv(L')*x */
    	    if ( L->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = nsuper; k >= 0; --k) {
	    	fsupc = L_FST_SUPC(k);
	    	istart = L_SUB_START(fsupc);
                nsupr = L_SUB_END(fsupc) - istart;
                nsupc = L_LAST_SUPC(k) - fsupc;
	    	luptr = L_NZ_START(fsupc);

		solve_ops += 2 * (nsupr - nsupc) * nsupc;

		for (jcol = fsupc; jcol < L_LAST_SUPC(k); jcol++) {
		    iptr = istart + nsupc;
		    for (i = L_NZ_START(jcol) + nsupc; 
				i < L_NZ_END(jcol); i++) {
			irow = L_SUB(iptr);
			x[jcol] -= x[irow] * Lval[i];
			iptr++;
		    }
		}
		
		if ( nsupc > 1 ) {
		    solve_ops += nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
			&x[fsupc], &incx);
#else
		    dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
			&x[fsupc], &incx);
#endif
		}
	    }
	} else {
	    /* Form x := inv(U')*x */
	    if ( U->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = 0; k <= nsuper; k++) {
	    	fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_END(fsupc) - L_SUB_START(fsupc);
                nsupc = L_LAST_SUPC(k) - fsupc;
	    	luptr = L_NZ_START(fsupc);

		for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
		    solve_ops += 2*(U_NZ_END(jcol) - U_NZ_START(jcol));
		    for (i = U_NZ_START(jcol); i < U_NZ_END(jcol); i++) {
			irow = U_SUB(i);
			x[jcol] -= x[irow] * Uval[i];
		    }
		}

		solve_ops += nsupc * (nsupc + 1);

		if ( nsupc == 1 ) {
		    x[fsupc] /= Lval[luptr];
		} else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
			    &x[fsupc], &incx);
#else
		    dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
			    &x[fsupc], &incx);
#endif
		}
	    } /* for k ... */
	}
    }

    SUPERLU_FREE(work);
    return 0;
}
Пример #17
0
void
getata(
       const int m,      /* number of rows in matrix A. */
       const int n,      /* number of columns in matrix A. */
       const int nz,     /* number of nonzeros in matrix A */
       int *colptr,      /* column pointer of size n+1 for matrix A. */
       int *rowind,      /* row indices of size nz for matrix A. */
       int *atanz,       /* out - on exit, returns the actual number of
                            nonzeros in matrix A'*A. */
       int **ata_colptr, /* out - size n+1 */
       int **ata_rowind  /* out - size *atanz */
       )
/*
 * Purpose
 * =======
 *
 * Form the structure of A'*A. A is an m-by-n matrix in column oriented
 * format represented by (colptr, rowind). The output A'*A is in column
 * oriented format (symmetrically, also row oriented), represented by
 * (ata_colptr, ata_rowind).
 *
 * This routine is modified from GETATA routine by Tim Davis.
 * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2,
 * i.e., the sum of the square of the row counts.
 *
 * Questions
 * =========
 *     o  Do I need to withhold the *dense* rows?
 *     o  How do I know the number of nonzeros in A'*A?
 * 
 */
{
    register int i, j, k, col, num_nz, ti, trow;
    int *marker, *b_colptr, *b_rowind;
    int *t_colptr, *t_rowind; /* a column oriented form of T = A' */

    if (!(marker = (int*)SUPERLU_MALLOC( (SUPERLU_MAX(m,n)+1) * sizeof(int)) ))
	SUPERLU_ABORT("SUPERLU_MALLOC fails for marker[]");
    if ( !(t_colptr = (int*) SUPERLU_MALLOC( (m+1) * sizeof(int)) ) )
	SUPERLU_ABORT("SUPERLU_MALLOC t_colptr[]");
    if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) )
	SUPERLU_ABORT("SUPERLU_MALLOC fails for t_rowind[]");

    
    /* Get counts of each column of T, and set up column pointers */
    for (i = 0; i < m; ++i) marker[i] = 0;
    for (j = 0; j < n; ++j) {
	for (i = colptr[j]; i < colptr[j+1]; ++i)
	    ++marker[rowind[i]];
    }
    t_colptr[0] = 0;
    for (i = 0; i < m; ++i) {
	t_colptr[i+1] = t_colptr[i] + marker[i];
	marker[i] = t_colptr[i];
    }

    /* Transpose the matrix from A to T */
    for (j = 0; j < n; ++j)
	for (i = colptr[j]; i < colptr[j+1]; ++i) {
	    col = rowind[i];
	    t_rowind[marker[col]] = j;
	    ++marker[col];
	}

    
    /* ----------------------------------------------------------------
       compute B = T * A, where column j of B is:

       Struct (B_*j) =    UNION   ( Struct (T_*k) )
                        A_kj != 0

       do not include the diagonal entry
   
       ( Partition A as: A = (A_*1, ..., A_*n)
         Then B = T * A = (T * A_*1, ..., T * A_*n), where
         T * A_*j = (T_*1, ..., T_*m) * A_*j.  )
       ---------------------------------------------------------------- */

    /* Zero the diagonal flag */
    for (i = 0; i < n; ++i) marker[i] = -1;

    /* First pass determines number of nonzeros in B */
    num_nz = 0;
    for (j = 0; j < n; ++j) {
	/* Flag the diagonal so it's not included in the B matrix */
	marker[j] = j;

	for (i = colptr[j]; i < colptr[j+1]; ++i) {
	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
	    k = rowind[i];
	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
		trow = t_rowind[ti];
		if ( marker[trow] != j ) {
		    marker[trow] = j;
		    num_nz++;
		}
	    }
	}
    }
    *atanz = num_nz;
    
    /* Allocate storage for A'*A */
    if ( !(*ata_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
	SUPERLU_ABORT("SUPERLU_MALLOC fails for ata_colptr[]");
    if ( *atanz ) {
	if ( !(*ata_rowind = (int*) SUPERLU_MALLOC( *atanz * sizeof(int)) ) )
	    SUPERLU_ABORT("SUPERLU_MALLOC fails for ata_rowind[]");
    }
    b_colptr = *ata_colptr; /* aliasing */
    b_rowind = *ata_rowind;
    
    /* Zero the diagonal flag */
    for (i = 0; i < n; ++i) marker[i] = -1;
    
    /* Compute each column of B, one at a time */
    num_nz = 0;
    for (j = 0; j < n; ++j) {
	b_colptr[j] = num_nz;
	
	/* Flag the diagonal so it's not included in the B matrix */
	marker[j] = j;

	for (i = colptr[j]; i < colptr[j+1]; ++i) {
	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
	    k = rowind[i];
	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
		trow = t_rowind[ti];
		if ( marker[trow] != j ) {
		    marker[trow] = j;
		    b_rowind[num_nz++] = trow;
		}
	    }
	}
    }
    b_colptr[n] = num_nz;
       
    SUPERLU_FREE(marker);
    SUPERLU_FREE(t_colptr);
    SUPERLU_FREE(t_rowind);
}
Пример #18
0
void
cgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U,
       int *perm_r, int *perm_c, equed_t equed, float *R, float *C,
       SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr,
       Gstat_t *Gstat, int *info)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======   
 *
 * cgsrfs improves the computed solution to a system of linear
 * equations and provides error bounds and backward error estimates for
 * the solution.
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * trans   (input) trans_t
 *         Specifies the form of the system of equations:
 *         = NOTRANS:  A * X = B     (No transpose)
 *         = TRANS:    A**T * X = B  (Transpose)
 *         = CONJ:     A**H * X = B  (Conjugate transpose = Transpose)
 *
 * A       (input) SuperMatrix*
 *         The original matrix A in the system, or the scaled A if
 *         equilibration was done. The type of A can be:
 *         Stype = NC, Dtype = _D, Mtype = GE.
 *
 * L       (input) SuperMatrix*
 *         The factor L from the factorization Pr*A*Pc=L*U. Use
 *         compressed row subscripts storage for supernodes,
 *         i.e., L has types: Stype = SCP, Dtype = _D, Mtype = TRLU.
 *
 * U       (input) SuperMatrix*
 *         The factor U from the factorization Pr*A*Pc=L*U as computed by
 *         dgstrf(). Use column-wise storage scheme,
 *         i.e., U has types: Stype = NCP, Dtype = _D, Mtype = TRU.
 *
 * perm_r  (input) int*, dimension (A->nrow)
 *         Row permutation vector, which defines the permutation matrix Pr;
 *         perm_r[i] = j means row i of A is in position j in Pr*A.
 *
 * perm_c  (input) int*, dimension (A->ncol)
 *         Column permutation vector, which defines the
 *         permutation matrix Pc; perm_c[i] = j means column i of A is 
 *         in position j in A*Pc.
 *
 * equed   (input) equed_t
 *         Specifies the form of equilibration that was done.
 *         = NOEQUIL: No equilibration.
 *         = ROW:  Row equilibration, i.e., A was premultiplied by diag(R).
 *         = COL:  Column equilibration, i.e., A was postmultiplied by
 *                 diag(C).
 *         = BOTH: Both row and column equilibration, i.e., A was replaced
 *                 by diag(R)*A*diag(C).
 *
 * R       (input) double*, dimension (A->nrow)
 *         The row scale factors for A.
 *         If equed = ROW or BOTH, A is premultiplied by diag(R).
 *         If equed = NOEQUIL or COL, R is not accessed.
 *
 * C       (input) double*, dimension (A->ncol)
 *         The column scale factors for A.
 *         If equed = COL or BOTH, A is postmultiplied by diag(C).
 *         If equed = NOEQUIL or ROW, C is not accessed.
 *
 * B       (input) SuperMatrix*
 *         B has types: Stype = DN, Dtype = _D, Mtype = GE.
 *         The right hand side matrix B.
 *
 * X       (input/output) SuperMatrix*
 *         X has types: Stype = DN, Dtype = _D, Mtype = GE.
 *         On entry, the solution matrix X, as computed by dgstrs().
 *         On exit, the improved solution matrix X.
 *
 * FERR    (output) double*, dimension (B->ncol)
 *         The estimated forward error bound for each solution vector
 *         X(j) (the j-th column of the solution matrix X).
 *         If XTRUE is the true solution corresponding to X(j), FERR(j)
 *         is an estimated upper bound for the magnitude of the largest
 *         element in (X(j) - XTRUE) divided by the magnitude of the
 *         largest element in X(j).  The estimate is as reliable as
 *         the estimate for RCOND, and is almost always a slight
 *         overestimate of the true error.
 *
 * BERR    (output) double*, dimension (B->ncol)
 *         The componentwise relative backward error of each solution
 *         vector X(j) (i.e., the smallest relative change in
 *         any element of A or B that makes X(j) an exact solution).
 *
 * info    (output) int*
 *         = 0:  successful exit
 *         < 0:  if INFO = -i, the i-th argument had an illegal value
 *
 * Internal Parameters
 * ===================
 *
 * ITMAX is the maximum number of steps of iterative refinement.
 *
 */

#define ITMAX 5
    
    /* Table of constant values */
    int    ione = 1;
    complex ndone = {-1., 0.};
    complex done = {1., 0.};
    
    /* Local variables */
    NCformat *Astore;
    complex   *Aval;
    SuperMatrix Bjcol;
    DNformat *Bstore, *Xstore, *Bjcol_store;
    complex   *Bmat, *Xmat, *Bptr, *Xptr;
    int      kase;
    float   safe1, safe2;
    int      i, j, k, irow, nz, count, notran, rowequ, colequ;
    int      ldb, ldx, nrhs;
    float   s, xk, lstres, eps, safmin;
    char     transc[1];
    trans_t  transt;
    complex   *work;
    float   *rwork;
    int      *iwork;
    extern double slamch_(char *);
    extern int clacon_(int *, complex *, complex *, float *, int *);
#ifdef _CRAY
    extern int CCOPY(int *, complex *, int *, complex *, int *);
    extern int CSAXPY(int *, complex *, complex *, int *, complex *, int *);
#else
    extern int ccopy_(int *, complex *, int *, complex *, int *);
    extern int caxpy_(int *, complex *, complex *, int *, complex *, int *);
#endif

    Astore = A->Store;
    Aval   = Astore->nzval;
    Bstore = B->Store;
    Xstore = X->Store;
    Bmat   = Bstore->nzval;
    Xmat   = Xstore->nzval;
    ldb    = Bstore->lda;
    ldx    = Xstore->lda;
    nrhs   = B->ncol;
    
    /* Test the input parameters */
    *info = 0;
    notran = (trans == NOTRANS);
    if ( !notran && trans != TRANS && trans != CONJ ) *info = -1;
    else if ( A->nrow != A->ncol || A->nrow < 0 ||
	      A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE )
	*info = -2;
    else if ( L->nrow != L->ncol || L->nrow < 0 ||
 	      L->Stype != SLU_SCP || L->Dtype != SLU_C || L->Mtype != SLU_TRLU )
	*info = -3;
    else if ( U->nrow != U->ncol || U->nrow < 0 ||
 	      U->Stype != SLU_NCP || U->Dtype != SLU_C || U->Mtype != SLU_TRU )
	*info = -4;
    else if ( ldb < SUPERLU_MAX(0, A->nrow) ||
 	      B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE )
        *info = -10;
    else if ( ldx < SUPERLU_MAX(0, A->nrow) ||
 	      X->Stype != SLU_DN || X->Dtype != SLU_C || X->Mtype != SLU_GE )
	*info = -11;
    if (*info != 0) {
	i = -(*info);
	xerbla_("cgsrfs", &i);
	return;
    }

    /* Quick return if possible */
    if ( A->nrow == 0 || nrhs == 0) {
	for (j = 0; j < nrhs; ++j) {
	    ferr[j] = 0.;
	    berr[j] = 0.;
	}
	return;
    }

    rowequ = (equed == ROW) || (equed == BOTH);
    colequ = (equed == COL) || (equed == BOTH);
    
    /* Allocate working space */
    work = complexMalloc(2*A->nrow);
    rwork = (float *) SUPERLU_MALLOC( (size_t) A->nrow * sizeof(float) );
    iwork = intMalloc(A->nrow);
    if ( !work || !rwork || !iwork ) 
        SUPERLU_ABORT("Malloc fails for work/rwork/iwork.");
    
    if ( notran ) {
	*(unsigned char *)transc = 'N';
        transt = TRANS;
    } else {
	*(unsigned char *)transc = 'T';
	transt = NOTRANS;
    }

    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
    nz     = A->ncol + 1;
    eps    = slamch_("Epsilon");
    safmin = slamch_("Safe minimum");
    /* Set SAFE1 essentially to be the underflow threshold times the
       number of additions in each row. */
    safe1  = nz * safmin;
    safe2  = safe1 / eps;

    /* Compute the number of nonzeros in each row (or column) of A */
    for (i = 0; i < A->nrow; ++i) iwork[i] = 0;
    if ( notran ) {
	for (k = 0; k < A->ncol; ++k)
	    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) 
		++iwork[Astore->rowind[i]];
    } else {
	for (k = 0; k < A->ncol; ++k)
	    iwork[k] = Astore->colptr[k+1] - Astore->colptr[k];
    }	

    /* Copy one column of RHS B into Bjcol. */
    Bjcol.Stype = B->Stype;
    Bjcol.Dtype = B->Dtype;
    Bjcol.Mtype = B->Mtype;
    Bjcol.nrow  = B->nrow;
    Bjcol.ncol  = 1;
    Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
    if ( !Bjcol.Store ) SUPERLU_ABORT("SUPERLU_MALLOC fails for Bjcol.Store");
    Bjcol_store = Bjcol.Store;
    Bjcol_store->lda = ldb;
    Bjcol_store->nzval = work; /* address aliasing */
	
    /* Do for each right hand side ... */
    for (j = 0; j < nrhs; ++j) {
	count = 0;
	lstres = 3.;
	Bptr = &Bmat[j*ldb];
	Xptr = &Xmat[j*ldx];

	while (1) { /* Loop until stopping criterion is satisfied. */

	    /* Compute residual R = B - op(A) * X,   
	       where op(A) = A, A**T, or A**H, depending on TRANS. */
	    
#ifdef _CRAY
	    CCOPY(&A->nrow, Bptr, &ione, work, &ione);
#else
	    ccopy_(&A->nrow, Bptr, &ione, work, &ione);
#endif
	    sp_cgemv(transc, ndone, A, Xptr, ione, done, work, ione);

	    /* Compute componentwise relative backward error from formula 
	       max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) )   
	       where abs(Z) is the componentwise absolute value of the matrix
	       or vector Z.  If the i-th component of the denominator is less
	       than SAFE2, then SAFE1 is added to the i-th component of the   
	       numerator before dividing. */

	    for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] );
	    
	    /* Compute abs(op(A))*abs(X) + abs(B). */
	    if (notran) {
		for (k = 0; k < A->ncol; ++k) {
		    xk = c_abs1( &Xptr[k] );
		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
			rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk;
		}
	    } else {
		for (k = 0; k < A->ncol; ++k) {
		    s = 0.;
		    for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
			irow = Astore->rowind[i];
			s += c_abs1(&Aval[i]) * c_abs1(&Xptr[irow]);
		    }
		    rwork[k] += s;
		}
	    }
	    s = 0.;
	    for (i = 0; i < A->nrow; ++i) {
		if (rwork[i] > safe2) {
		    s = SUPERLU_MAX( s, c_abs1(&work[i]) / rwork[i] );
		} else if ( rwork[i] != 0.0 ) {
		    s = SUPERLU_MAX( s, (c_abs1(&work[i]) + safe1) / rwork[i] );
                }
                /* If rwork[i] is exactly 0.0, then we know the true 
                   residual also must be exactly 0.0. */
	    }
	    berr[j] = s;

	    /* Test stopping criterion. Continue iterating if   
	       1) The residual BERR(J) is larger than machine epsilon, and   
	       2) BERR(J) decreased by at least a factor of 2 during the   
	          last iteration, and   
	       3) At most ITMAX iterations tried. */

	    if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) {
		/* Update solution and try again. */
		cgstrs (trans, L, U, perm_r, perm_c, &Bjcol, Gstat, info);
		
#ifdef _CRAY
		CAXPY(&A->nrow, &done, work, &ione,
		       &Xmat[j*ldx], &ione);
#else
		caxpy_(&A->nrow, &done, work, &ione,
		       &Xmat[j*ldx], &ione);
#endif
		lstres = berr[j];
		++count;
	    } else {
		break;
	    }
        
	} /* end while */

	/* Bound error from formula:
	   norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))*   
	   ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X)   
          where   
            norm(Z) is the magnitude of the largest component of Z   
            inv(op(A)) is the inverse of op(A)   
            abs(Z) is the componentwise absolute value of the matrix or
	       vector Z   
            NZ is the maximum number of nonzeros in any row of A, plus 1   
            EPS is machine epsilon   

          The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B))   
          is incremented by SAFE1 if the i-th component of   
          abs(op(A))*abs(X) + abs(B) is less than SAFE2.   

          Use CLACON to estimate the infinity-norm of the matrix   
             inv(op(A)) * diag(W),   
          where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */
	
	for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] );
	
	/* Compute abs(op(A))*abs(X) + abs(B). */
	if ( notran ) {
	    for (k = 0; k < A->ncol; ++k) {
		xk = c_abs1( &Xptr[k] );
		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i)
		    rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk;
	    }
	} else {
	    for (k = 0; k < A->ncol; ++k) {
		s = 0.;
		for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) {
		    irow = Astore->rowind[i];
		    xk = c_abs1( &Xptr[irow] );
		    s += c_abs1(&Aval[i]) * xk;
		}
		rwork[k] += s;
	    }
	}
	
	for (i = 0; i < A->nrow; ++i)
	    if (rwork[i] > safe2)
		rwork[i] = c_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i];
	    else
		rwork[i] = c_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1;
	kase = 0;

	do {
	    clacon_(&A->nrow, &work[A->nrow], work,
		    &ferr[j], &kase);
	    if (kase == 0) break;

	    if (kase == 1) {
		/* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */
		if ( notran && colequ )
		    for (i = 0; i < A->ncol; ++i) {
		        cs_mult(&work[i], &work[i], C[i]);
	            }
		else if ( !notran && rowequ )
		    for (i = 0; i < A->nrow; ++i) {
		        cs_mult(&work[i], &work[i], R[i]);
                    }

		cgstrs (transt, L, U, perm_r, perm_c, &Bjcol, Gstat, info);
		
		for (i = 0; i < A->nrow; ++i) {
		    cs_mult(&work[i], &work[i], rwork[i]);
	 	}
	    } else {
		/* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */
		for (i = 0; i < A->nrow; ++i) {
		    cs_mult(&work[i], &work[i], rwork[i]);
		}
		
		cgstrs (trans, L, U, perm_r, perm_c, &Bjcol, Gstat, info);
		
		if ( notran && colequ )
		    for (i = 0; i < A->ncol; ++i) {
		        cs_mult(&work[i], &work[i], C[i]);
		    }
		else if ( !notran && rowequ )
		    for (i = 0; i < A->ncol; ++i) {
		        cs_mult(&work[i], &work[i], R[i]);  
		    }
	    }
	    
	} while ( kase != 0 );

	/* Normalize error. */
	lstres = 0.;
 	if ( notran && colequ ) {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, C[i] * c_abs1( &Xptr[i]) );
  	} else if ( !notran && rowequ ) {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, R[i] * c_abs1( &Xptr[i]) );
	} else {
	    for (i = 0; i < A->nrow; ++i)
	    	lstres = SUPERLU_MAX( lstres, c_abs1( &Xptr[i]) );
	}
	if ( lstres != 0. )
	    ferr[j] /= lstres;

    } /* for each RHS j ... */
    
    SUPERLU_FREE(work);
    SUPERLU_FREE(rwork);
    SUPERLU_FREE(iwork);
    SUPERLU_FREE(Bjcol.Store);

    return;

} /* cgsrfs */
Пример #19
0
void
dgstrs(trans_t trans, SuperMatrix *L, SuperMatrix *U, 
       int *perm_r, int *perm_c, SuperMatrix *B, Gstat_t *Gstat, int *info)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======
 *
 * dgstrs() solves a system of linear equations A*X=B or A'*X=B
 * with A sparse and B dense, using the LU factorization computed by
 * pdgstrf().
 *
 * Arguments
 * =========
 *
 * trans   (input) Specifies the form of the system of equations:
 *          = NOTRANS: A * X = B  (No transpose)
 *          = TRANS:   A'* X = B  (Transpose)
 *
 * L       (input) SuperMatrix*
 *         The factor L from the factorization Pr*A*Pc=L*U as computed by
 *         pdgstrf(). Use compressed row subscripts storage for supernodes,
 *         i.e., L has types: Stype = SCP, Dtype = _D, Mtype = TRLU.
 *
 * U       (input) SuperMatrix*
 *         The factor U from the factorization Pr*A*Pc=L*U as computed by
 *         pdgstrf(). Use column-wise storage scheme, i.e., U has types:
 *         Stype = NCP, Dtype = _D, Mtype = TRU.
 *
 * perm_r  (input) int*
 *         Row permutation vector of size L->nrow, which defines the
 *         permutation matrix Pr; perm_r[i] = j means row i of A is in
 *         position j in Pr*A.
 *
 * perm_c  (int*) dimension A->ncol
 *	   Column permutation vector, which defines the 
 *         permutation matrix Pc; perm_c[i] = j means column i of A is 
 *         in position j in A*Pc.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = DN, Dtype = _D, Mtype = GE.
 *         On entry, the right hand side matrix.
 *         On exit, the solution matrix if info = 0;
 *
 * Gstat   (output) Gstat_t*
 *          Record all the statistics about the triangular solves; 
 *          See Gstat_t structure defined in slu_mt_util.h.
 *
 * info    (output) Diagnostics
 * 	   = 0: successful exit
 *	   < 0: if info = -i, the i-th argument had an illegal value
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
#endif

#ifdef USE_VENDOR_BLAS
    int      incx = 1, incy = 1;
    double   alpha = 1.0, beta = 1.0;
#endif

    register int j, k, jcol, iptr, luptr, ksupno, istart, irow, bptr;
    register int fsupc, nsuper;
    int      i, n, nsupc, nsupr, nrow, nrhs, ldb;
    int      *supno;
    DNformat *Bstore;
    SCPformat *Lstore;
    NCPformat *Ustore;
    double   *Lval, *Uval, *Bmat;
    double   *work, *work_col, *rhs_work, *soln;
    flops_t  solve_ops;
    void dprint_soln();

    /* Test input parameters ... */
    *info = 0;
    Bstore = B->Store;
    ldb = Bstore->lda;
    nrhs = B->ncol;
    if ( trans != NOTRANS && trans != TRANS ) *info = -1;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -3;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -4;
    else if ( ldb < SUPERLU_MAX(0, L->nrow) ) *info = -6;
    if ( *info ) {
        i = -(*info);
	xerbla_("dgstrs", &i);
	return;
    }

    n = L->nrow;
    work = doubleCalloc(n * nrhs);
    if ( !work ) SUPERLU_ABORT("Malloc fails for local work[].");
    soln = doubleMalloc(n);
    if ( !soln ) SUPERLU_ABORT("Malloc fails for local soln[].");

    Bmat = Bstore->nzval;
    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    supno = Lstore->col_to_sup;
    nsuper = Lstore->nsuper;
    solve_ops = 0;
    
    if ( trans == NOTRANS ) {
	/* Permute right hand sides to form Pr*B */
	for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) {
	    rhs_work = &Bmat[bptr];
	    for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
	}
	
	/* Forward solve PLy=Pb. */
/*>>	for (k = 0; k < n; k += nsupc) {
	    ksupno = supno[k];
*/
	for (ksupno = 0; ksupno <= nsuper; ++ksupno) {
	    fsupc = L_FST_SUPC(ksupno);
	    istart = L_SUB_START(fsupc);
	    nsupr = L_SUB_END(fsupc) - istart;
	    nsupc = L_LAST_SUPC(ksupno) - fsupc;
	    nrow = nsupr - nsupc;

	    solve_ops += nsupc * (nsupc - 1) * nrhs;
	    solve_ops += 2 * nrow * nsupc * nrhs;
	    
	    if ( nsupc == 1 ) {
		for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) {
		    rhs_work = &Bmat[bptr];
	    	    luptr = L_NZ_START(fsupc);
		    for (iptr=istart+1; iptr < L_SUB_END(fsupc); iptr++){
			irow = L_SUB(iptr);
			++luptr;
                        rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr];
		    }
		}
	    } else {
	    	luptr = L_NZ_START(fsupc);
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
		ftcs1 = _cptofcd("L", strlen("L"));
		ftcs2 = _cptofcd("N", strlen("N"));
		ftcs3 = _cptofcd("U", strlen("U"));
 		STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
		      &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
		
		SGEMM(ftcs2, ftcs2,  &nrow, &nrhs, &nsupc, &alpha, 
		      &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
		      &beta, &work[0], &n );
#else
 		dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
		
		dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, 
			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
			&beta, &work[0], &n );
#endif
		for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) {
		    rhs_work = &Bmat[bptr];
		    work_col = &work[j*n];
		    iptr = istart + nsupc;
		    for (i = 0; i < nrow; i++) {
			irow = L_SUB(iptr);
                        rhs_work[irow] -= work_col[i]; /* Scatter */
                        work_col[i] = 0.0;
			iptr++;
		    }
		}
#else		
		for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) {
		    rhs_work = &Bmat[bptr];
		    dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
		    dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
			     &rhs_work[fsupc], &work[0] );

		    iptr = istart + nsupc;
		    for (i = 0; i < nrow; i++) {
			irow = L_SUB(iptr);
                        rhs_work[irow] -= work[i];
                        work[i] = 0.0;
			iptr++;
		    }
		}
#endif		    
	    } /* if-else: nsupc == 1 ... */
	} /* for L-solve */

#if ( DEBUGlevel>=2 )
  	printf("After L-solve: y=\n");
	dprint_soln(n, nrhs, Bmat);
#endif

	/*
	 * Back solve Ux=y.
	 */
/*>>	for (k = n-1; k >= 0; k -= nsupc) {
	    ksupno = supno[k];
*/
	for (ksupno = nsuper; ksupno >= 0; --ksupno) {
	    fsupc = L_FST_SUPC(ksupno);
	    istart = L_SUB_START(fsupc);
	    nsupr = L_SUB_END(fsupc) - istart;
	    nsupc = L_LAST_SUPC(ksupno) - fsupc;
	    luptr = L_NZ_START(fsupc);

	    solve_ops += nsupc * (nsupc + 1) * nrhs;

	    /* dense triangular matrix */
	    if ( nsupc == 1 ) {
		rhs_work = &Bmat[0];
		for (j = 0; j < nrhs; j++) {
                    rhs_work[fsupc] /= Lval[luptr];
		    rhs_work += ldb;
		}
	    } else {
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
		ftcs1 = _cptofcd("L", strlen("L"));
		ftcs2 = _cptofcd("U", strlen("U"));
		ftcs3 = _cptofcd("N", strlen("N"));
		STRSM(ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
		      &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#else
		dtrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#endif
#else		
		for (j = 0, bptr = fsupc; j < nrhs; j++, bptr += ldb) {
		    dusolve (nsupr, nsupc, &Lval[luptr], &Bmat[bptr]);
		}
#endif		
	    }

	    /* matrix-vector update */
	    for (j = 0, bptr = 0; j < nrhs; ++j, bptr += ldb) {
		rhs_work = &Bmat[bptr];
		for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
                    solve_ops += 2*(U_NZ_END(jcol) - U_NZ_START(jcol));
		    for (i = U_NZ_START(jcol); i < U_NZ_END(jcol); i++ ){
			irow = U_SUB(i);
                        rhs_work[irow] -= rhs_work[jcol] * Uval[i];
		    }
		}
	    }
	    
	} /* for U-solve */

#if ( DEBUGlevel>=2 )
  	printf("After U-solve: x=\n");
	dprint_soln(n, nrhs, Bmat);
#endif

	/* Compute the final solution X <= Pc*X. */
	for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) {
	    rhs_work = &Bmat[bptr];
	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]];
	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
	}
	
    } else { /* Solve A'*X=B */
	/* Permute right hand sides to form Pc'*B. */
	for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) {
	    rhs_work = &Bmat[bptr];
	    for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k];
	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
	}
	
        for (k = 0; k < nrhs; ++k) {

            /* Multiply by inv(U'). */
            sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], info);

            /* Multiply by inv(L'). */
            sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], info);

        }
	/* Compute the final solution X <= Pr'*X (=inv(Pr)*X) */
	for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) {
	    rhs_work = &Bmat[bptr];
	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]];
	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
	}

    } /* if-else trans */

    Gstat->ops[TRISOLVE] = solve_ops;
    SUPERLU_FREE(work);
    SUPERLU_FREE(soln);
}