Exemple #1
0
int
f2c_strsv(char* uplo, char* trans, char* diag, integer* N,
          real* A, integer* lda,
          real* X, integer* incX)
{
    strsv_(uplo, trans, diag,
           N, A, lda, X, incX);
    return 0;
}
void
psgstrf_bmod1D(
	       const int pnum,  /* process number */
	       const int m,     /* number of rows in the matrix */
	       const int w,     /* current panel width */
	       const int jcol,  /* leading column of the current panel */
	       const int fsupc, /* leading column of the updating supernode */ 
	       const int krep,  /* last column of the updating supernode */ 
	       const int nsupc, /* number of columns in the updating s-node */ 
	       int nsupr, /* number of rows in the updating supernode */  
	       int nrow,  /* number of rows below the diagonal block of
			     the updating supernode */ 
	       int *repfnz,     /* in */
	       int *panel_lsub, /* modified */
	       int *w_lsub_end, /* modified */
	       int *spa_marker, /* modified; size n-by-w */
	       float *dense,   /* modified */
	       float *tempv,   /* working array - zeros on entry/exit */
	       GlobalLU_t *Glu, /* modified */
	       Gstat_t *Gstat   /* modified */
	       )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab,  Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 *    Performs numeric block updates (sup-panel) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Results are returned in SPA dense[*,w].
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
#ifdef USE_VENDOR_BLAS
    int          incx = 1, incy = 1;
    float       alpha, beta;
#endif

    float       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          segsze;
    register int lptr; /* start of row subscripts of the updating supernode */
    register int i, krep_ind, kfnz, isub, irow, no_zeros;
    register int jj;	      /* index through each column in the panel */
    int          *repfnz_col; /* repfnz[] for a column in the panel */
    float       *dense_col;  /* dense[] for a column in the panel */
    float      *tempv1;     /* used to store matrix-vector result */
    int          *col_marker; /* each column of the spa_marker[*,w] */
    int          *col_lsub;   /* each column of the panel_lsub[*,w] */
    int          *lsub, *xlsub_end;
    float       *lusup;
    int          *xlusup;
    register float flopcnt;

    float      zero = 0.0;
    float      one = 1.0;
    
#ifdef TIMING
    double *utime = Gstat->utime;
    double f_time;
#endif    
    
    lsub      = Glu->lsub;
    xlsub_end = Glu->xlsub_end;
    lusup     = Glu->lusup;
    xlusup    = Glu->xlusup;
    lptr      = Glu->xlsub[fsupc];
    krep_ind  = lptr + nsupc - 1;

    /* Pointers to each column of the w-wide arrays. */
    repfnz_col= repfnz;
    dense_col = dense;
    col_marker= spa_marker;
    col_lsub  = panel_lsub;

#if ( DEBUGlevel>=2 )
if (jcol == BADPAN && krep == BADREP) {
    printf("(%d) psgstrf_bmod1D[1] jcol %d, fsupc %d, krep %d, nsupc %d, nsupr %d, nrow %d\n",
	   pnum, jcol, fsupc, krep, nsupc, nsupr, nrow);
    PrintInt10("lsub[xlsub[2774]]", nsupr, &lsub[lptr]);
}    
#endif
    
    /*
     * Sequence through each column in the panel ...
     */
    for (jj = jcol; jj < jcol + w; ++jj, col_marker += m, col_lsub += m,
	 repfnz_col += m, dense_col += m) {

	kfnz = repfnz_col[krep];
	if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */

	segsze = krep - kfnz + 1;
	luptr = xlusup[fsupc];

	/* Calculate flops: tri-solve + mat-vector */
        flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;
	Gstat->procstat[pnum].fcops += flopcnt;

	/* Case 1: Update U-segment of size 1 -- col-col update */
	if ( segsze == 1 ) {
#ifdef TIMING
	    f_time = SuperLU_timer_();
#endif	    
	    ukj = dense_col[lsub[krep_ind]];
	    luptr += nsupr*(nsupc-1) + nsupc;
#if ( DEBUGlevel>=2 )
if (krep == BADCOL && jj == -1) {
    printf("(%d) psgstrf_bmod1D[segsze=1]: k %d, j %d, ukj %.10e\n",
	   pnum, lsub[krep_ind], jj, ukj);
    PrintInt10("segsze=1", nsupr, &lsub[lptr]);
}
#endif	    
	    for (i = lptr + nsupc; i < xlsub_end[fsupc]; i++) {
		irow = lsub[i];
                        dense_col[irow] -= ukj * lusup[luptr];
		++luptr;
#ifdef SCATTER_FOUND		
		if ( col_marker[irow] != jj ) {
		    col_marker[irow] = jj;
		    col_lsub[w_lsub_end[jj-jcol]++] = irow;
		}
#endif		
	    }
#ifdef TIMING
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    
	} else if ( segsze <= 3 ) {
#ifdef TIMING
	    f_time = SuperLU_timer_();
#endif	    
	    ukj = dense_col[lsub[krep_ind]];
	    luptr += nsupr*(nsupc-1) + nsupc-1;
	    ukj1 = dense_col[lsub[krep_ind - 1]];
	    luptr1 = luptr - nsupr;
	    if ( segsze == 2 ) {
                ukj -= ukj1 * lusup[luptr1];
		dense_col[lsub[krep_ind]] = ukj;
		for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    irow = lsub[i];
		    ++luptr;  ++luptr1;
                            dense_col[irow] -= (ukj * lusup[luptr]
                                                + ukj1 * lusup[luptr1]);
#ifdef SCATTER_FOUND		
		    if ( col_marker[irow] != jj ) {
			col_marker[irow] = jj;
			col_lsub[w_lsub_end[jj-jcol]++] = irow;
		    }
#endif		
		}
	    } else {
		ukj2 = dense_col[lsub[krep_ind - 2]];
		luptr2 = luptr1 - nsupr;
                ukj1 -= ukj2 * lusup[luptr2-1];
                ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
		dense_col[lsub[krep_ind]] = ukj;
		dense_col[lsub[krep_ind-1]] = ukj1;
		for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    irow = lsub[i];
		    ++luptr; ++luptr1; ++luptr2;
                    dense_col[irow] -= (ukj * lusup[luptr]
                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2]);
#ifdef SCATTER_FOUND		
		    if ( col_marker[irow] != jj ) {
			col_marker[irow] = jj;
			col_lsub[w_lsub_end[jj-jcol]++] = irow;
		    }
#endif		
		}
	    }
#ifdef TIMING
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    
	} else { /* segsze >= 4 */
	    /* 
	     * Perform a triangular solve and matrix-vector update,
	     * then scatter the result of sup-col update to dense[*].
	     */
	    no_zeros = kfnz - fsupc;

	    /* Gather U[*,j] segment from dense[*] to tempv[*]: 
	     *   The result of triangular solve is in tempv[*];
	     *   The result of matrix vector update is in dense_col[*]
	     */
	    isub = lptr + no_zeros;
/*#pragma ivdep*/
	    for (i = 0; i < segsze; ++i) {
		irow = lsub[isub];
		tempv[i] = dense_col[irow]; /* Gather */
		++isub;
	    }

	    /* start effective triangle */
	    luptr += nsupr * no_zeros + no_zeros;
#ifdef TIMING
	    f_time = SuperLU_timer_();
#endif
		
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
	    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
		  &nsupr, tempv, &incx );
#else
	    strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
		   &nsupr, tempv, &incx );
#endif
		
	    luptr += segsze;	/* Dense matrix-vector */
	    tempv1 = &tempv[segsze];

            alpha = one;
            beta = zero;
#if ( MACH==CRAY_PVP )
	    SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
		  &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
	    sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
		   &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif /* _CRAY_PVP */
#else
	    slsolve ( nsupr, segsze, &lusup[luptr], tempv );
	    
	    luptr += segsze;        /* Dense matrix-vector */
	    tempv1 = &tempv[segsze];
	    smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
#endif
		
#ifdef TIMING
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    

	    /* Scatter tempv[*] into SPA dense[*] temporarily, 
	     * such that tempv[*] can be used for the triangular solve of
	     * the next column of the panel. They will be copied into 
	     * ucol[*] after the whole panel has been finished.
	     */
	    isub = lptr + no_zeros;
/*#pragma ivdep*/
	    for (i = 0; i < segsze; i++) {
		irow = lsub[isub];
		dense_col[irow] = tempv[i]; /* Scatter */
		tempv[i] = zero;
		isub++;
#if ( DEBUGlevel>=2 )
	if (jj == -1 && krep == 3423)
	    printf("(%d) psgstrf_bmod1D[scatter] jj %d, dense_col[%d] %e\n",
		   pnum, jj, irow, dense_col[irow]);
#endif
	    }
		
	    /* Scatter the update from tempv1[*] into SPA dense[*] */
/*#pragma ivdep*/
	    for (i = 0; i < nrow; i++) {
		irow = lsub[isub];
                dense_col[irow] -= tempv1[i]; /* Scatter-add */
#ifdef SCATTER_FOUND		
		if ( col_marker[irow] != jj ) {
		    col_marker[irow] = jj;
		    col_lsub[w_lsub_end[jj-jcol]++] = irow;
		}
#endif		
		tempv1[i] = zero;
		isub++;
	    }
		
	} /* else segsze >= 4 ... */
	
    } /* for jj ... */

}
void
strsv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx)
{
   strsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
}
Exemple #4
0
/* Subroutine */ int ssygs2_(integer *itype, char *uplo, integer *n, real *a, 
	integer *lda, real *b, integer *ldb, integer *info)
{
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
    real r__1;

    /* Local variables */
    integer k;
    real ct, akk, bkk;
    extern /* Subroutine */ int ssyr2_(char *, integer *, real *, real *, 
	    integer *, real *, integer *, real *, integer *);
    extern logical lsame_(char *, char *);
    extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
    logical upper;
    extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, 
	    real *, integer *), strmv_(char *, char *, char *, integer *, 
	    real *, integer *, real *, integer *), 
	    strsv_(char *, char *, char *, integer *, real *, integer *, real 
	    *, integer *), xerbla_(char *, integer *);


/*  -- LAPACK routine (version 3.2) -- */
/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
/*     November 2006 */

/*     .. Scalar Arguments .. */
/*     .. */
/*     .. Array Arguments .. */
/*     .. */

/*  Purpose */
/*  ======= */

/*  SSYGS2 reduces a real symmetric-definite generalized eigenproblem */
/*  to standard form. */

/*  If ITYPE = 1, the problem is A*x = lambda*B*x, */
/*  and A is overwritten by inv(U')*A*inv(U) or inv(L)*A*inv(L') */

/*  If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or */
/*  B*A*x = lambda*x, and A is overwritten by U*A*U` or L'*A*L. */

/*  B must have been previously factorized as U'*U or L*L' by SPOTRF. */

/*  Arguments */
/*  ========= */

/*  ITYPE   (input) INTEGER */
/*          = 1: compute inv(U')*A*inv(U) or inv(L)*A*inv(L'); */
/*          = 2 or 3: compute U*A*U' or L'*A*L. */

/*  UPLO    (input) CHARACTER*1 */
/*          Specifies whether the upper or lower triangular part of the */
/*          symmetric matrix A is stored, and how B has been factorized. */
/*          = 'U':  Upper triangular */
/*          = 'L':  Lower triangular */

/*  N       (input) INTEGER */
/*          The order of the matrices A and B.  N >= 0. */

/*  A       (input/output) REAL array, dimension (LDA,N) */
/*          On entry, the symmetric matrix A.  If UPLO = 'U', the leading */
/*          n by n upper triangular part of A contains the upper */
/*          triangular part of the matrix A, and the strictly lower */
/*          triangular part of A is not referenced.  If UPLO = 'L', the */
/*          leading n by n lower triangular part of A contains the lower */
/*          triangular part of the matrix A, and the strictly upper */
/*          triangular part of A is not referenced. */

/*          On exit, if INFO = 0, the transformed matrix, stored in the */
/*          same format as A. */

/*  LDA     (input) INTEGER */
/*          The leading dimension of the array A.  LDA >= max(1,N). */

/*  B       (input) REAL array, dimension (LDB,N) */
/*          The triangular factor from the Cholesky factorization of B, */
/*          as returned by SPOTRF. */

/*  LDB     (input) INTEGER */
/*          The leading dimension of the array B.  LDB >= max(1,N). */

/*  INFO    (output) INTEGER */
/*          = 0:  successful exit. */
/*          < 0:  if INFO = -i, the i-th argument had an illegal value. */

/*  ===================================================================== */

/*     .. Parameters .. */
/*     .. */
/*     .. Local Scalars .. */
/*     .. */
/*     .. External Subroutines .. */
/*     .. */
/*     .. Intrinsic Functions .. */
/*     .. */
/*     .. External Functions .. */
/*     .. */
/*     .. Executable Statements .. */

/*     Test the input parameters. */

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1;
    b -= b_offset;

    /* Function Body */
    *info = 0;
    upper = lsame_(uplo, "U");
    if (*itype < 1 || *itype > 3) {
	*info = -1;
    } else if (! upper && ! lsame_(uplo, "L")) {
	*info = -2;
    } else if (*n < 0) {
	*info = -3;
    } else if (*lda < max(1,*n)) {
	*info = -5;
    } else if (*ldb < max(1,*n)) {
	*info = -7;
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("SSYGS2", &i__1);
	return 0;
    }

    if (*itype == 1) {
	if (upper) {

/*           Compute inv(U')*A*inv(U) */

	    i__1 = *n;
	    for (k = 1; k <= i__1; ++k) {

/*              Update the upper triangle of A(k:n,k:n) */

		akk = a[k + k * a_dim1];
		bkk = b[k + k * b_dim1];
/* Computing 2nd power */
		r__1 = bkk;
		akk /= r__1 * r__1;
		a[k + k * a_dim1] = akk;
		if (k < *n) {
		    i__2 = *n - k;
		    r__1 = 1.f / bkk;
		    sscal_(&i__2, &r__1, &a[k + (k + 1) * a_dim1], lda);
		    ct = akk * -.5f;
		    i__2 = *n - k;
		    saxpy_(&i__2, &ct, &b[k + (k + 1) * b_dim1], ldb, &a[k + (
			    k + 1) * a_dim1], lda);
		    i__2 = *n - k;
		    ssyr2_(uplo, &i__2, &c_b6, &a[k + (k + 1) * a_dim1], lda, 
			    &b[k + (k + 1) * b_dim1], ldb, &a[k + 1 + (k + 1) 
			    * a_dim1], lda);
		    i__2 = *n - k;
		    saxpy_(&i__2, &ct, &b[k + (k + 1) * b_dim1], ldb, &a[k + (
			    k + 1) * a_dim1], lda);
		    i__2 = *n - k;
		    strsv_(uplo, "Transpose", "Non-unit", &i__2, &b[k + 1 + (
			    k + 1) * b_dim1], ldb, &a[k + (k + 1) * a_dim1], 
			    lda);
		}
/* L10: */
	    }
	} else {

/*           Compute inv(L)*A*inv(L') */

	    i__1 = *n;
	    for (k = 1; k <= i__1; ++k) {

/*              Update the lower triangle of A(k:n,k:n) */

		akk = a[k + k * a_dim1];
		bkk = b[k + k * b_dim1];
/* Computing 2nd power */
		r__1 = bkk;
		akk /= r__1 * r__1;
		a[k + k * a_dim1] = akk;
		if (k < *n) {
		    i__2 = *n - k;
		    r__1 = 1.f / bkk;
		    sscal_(&i__2, &r__1, &a[k + 1 + k * a_dim1], &c__1);
		    ct = akk * -.5f;
		    i__2 = *n - k;
		    saxpy_(&i__2, &ct, &b[k + 1 + k * b_dim1], &c__1, &a[k + 
			    1 + k * a_dim1], &c__1);
		    i__2 = *n - k;
		    ssyr2_(uplo, &i__2, &c_b6, &a[k + 1 + k * a_dim1], &c__1, 
			    &b[k + 1 + k * b_dim1], &c__1, &a[k + 1 + (k + 1) 
			    * a_dim1], lda);
		    i__2 = *n - k;
		    saxpy_(&i__2, &ct, &b[k + 1 + k * b_dim1], &c__1, &a[k + 
			    1 + k * a_dim1], &c__1);
		    i__2 = *n - k;
		    strsv_(uplo, "No transpose", "Non-unit", &i__2, &b[k + 1 
			    + (k + 1) * b_dim1], ldb, &a[k + 1 + k * a_dim1], 
			    &c__1);
		}
/* L20: */
	    }
	}
    } else {
	if (upper) {

/*           Compute U*A*U' */

	    i__1 = *n;
	    for (k = 1; k <= i__1; ++k) {

/*              Update the upper triangle of A(1:k,1:k) */

		akk = a[k + k * a_dim1];
		bkk = b[k + k * b_dim1];
		i__2 = k - 1;
		strmv_(uplo, "No transpose", "Non-unit", &i__2, &b[b_offset], 
			ldb, &a[k * a_dim1 + 1], &c__1);
		ct = akk * .5f;
		i__2 = k - 1;
		saxpy_(&i__2, &ct, &b[k * b_dim1 + 1], &c__1, &a[k * a_dim1 + 
			1], &c__1);
		i__2 = k - 1;
		ssyr2_(uplo, &i__2, &c_b27, &a[k * a_dim1 + 1], &c__1, &b[k * 
			b_dim1 + 1], &c__1, &a[a_offset], lda);
		i__2 = k - 1;
		saxpy_(&i__2, &ct, &b[k * b_dim1 + 1], &c__1, &a[k * a_dim1 + 
			1], &c__1);
		i__2 = k - 1;
		sscal_(&i__2, &bkk, &a[k * a_dim1 + 1], &c__1);
/* Computing 2nd power */
		r__1 = bkk;
		a[k + k * a_dim1] = akk * (r__1 * r__1);
/* L30: */
	    }
	} else {

/*           Compute L'*A*L */

	    i__1 = *n;
	    for (k = 1; k <= i__1; ++k) {

/*              Update the lower triangle of A(1:k,1:k) */

		akk = a[k + k * a_dim1];
		bkk = b[k + k * b_dim1];
		i__2 = k - 1;
		strmv_(uplo, "Transpose", "Non-unit", &i__2, &b[b_offset], 
			ldb, &a[k + a_dim1], lda);
		ct = akk * .5f;
		i__2 = k - 1;
		saxpy_(&i__2, &ct, &b[k + b_dim1], ldb, &a[k + a_dim1], lda);
		i__2 = k - 1;
		ssyr2_(uplo, &i__2, &c_b27, &a[k + a_dim1], lda, &b[k + 
			b_dim1], ldb, &a[a_offset], lda);
		i__2 = k - 1;
		saxpy_(&i__2, &ct, &b[k + b_dim1], ldb, &a[k + a_dim1], lda);
		i__2 = k - 1;
		sscal_(&i__2, &bkk, &a[k + a_dim1], lda);
/* Computing 2nd power */
		r__1 = bkk;
		a[k + k * a_dim1] = akk * (r__1 * r__1);
/* L40: */
	    }
	}
    }
    return 0;

/*     End of SSYGS2 */

} /* ssygs2_ */
Exemple #5
0
void
spanel_bmod (
	    const int  m,          /* in - number of rows in the matrix */
	    const int  w,          /* in */
	    const int  jcol,       /* in */
	    const int  nseg,       /* in */
	    float     *dense,     /* out, of size n by w */
	    float     *tempv,     /* working array */
	    int        *segrep,    /* in */
	    int        *repfnz,    /* in, of size n by w */
	    GlobalLU_t *Glu,       /* modified */
	    SuperLUStat_t *stat    /* output */
	    )
{


#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    int          incx = 1, incy = 1;
    float       alpha, beta;
#endif

    register int k, ksub;
    int          fsupc, nsupc, nsupr, nrow;
    int          krep, krep_ind;
    float       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          segsze;
    int          block_nrow;  /* no of rows in a block row */
    register int lptr;	      /* Points to the row subscripts of a supernode */
    int          kfnz, irow, no_zeros; 
    register int isub, isub1, i;
    register int jj;	      /* Index through each column in the panel */
    int          *xsup, *supno;
    int          *lsub, *xlsub;
    float       *lusup;
    int          *xlusup;
    int          *repfnz_col; /* repfnz[] for a column in the panel */
    float       *dense_col;  /* dense[] for a column in the panel */
    float       *tempv1;             /* Used in 1-D update */
    float       *TriTmp, *MatvecTmp; /* used in 2-D update */
    float      zero = 0.0;
    float      one = 1.0;
    register int ldaTmp;
    register int r_ind, r_hi;
    int  maxsuper, rowblk, colblk;
    flops_t  *ops = stat->ops;
    
    xsup    = Glu->xsup;
    supno   = Glu->supno;
    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    lusup   = (float *) Glu->lusup;
    xlusup  = Glu->xlusup;
    
    maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) );
    rowblk   = sp_ienv(4);
    colblk   = sp_ienv(5);
    ldaTmp   = maxsuper + rowblk;

    /* 
     * For each nonz supernode segment of U[*,j] in topological order 
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */

	/* krep = representative of current k-th supernode
	 * fsupc = first supernodal column
	 * nsupc = no of columns in a supernode
	 * nsupr = no of rows in a supernode
	 */
        krep = segrep[k--];
	fsupc = xsup[supno[krep]];
	nsupc = krep - fsupc + 1;
	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
	nrow = nsupr - nsupc;
	lptr = xlsub[fsupc];
	krep_ind = lptr + nsupc - 1;

	repfnz_col = repfnz;
	dense_col = dense;
	
	if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */

	    TriTmp = tempv;
	
	    /* Sequence through each column in panel -- triangular solves */
	    for (jj = jcol; jj < jcol + w; jj++,
		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {

		kfnz = repfnz_col[krep];
		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
	    
		segsze = krep - kfnz + 1;
		luptr = xlusup[fsupc];

		ops[TRSV] += segsze * (segsze - 1);
		ops[GEMV] += 2 * nrow * segsze;
	
		/* Case 1: Update U-segment of size 1 -- col-col update */
		if ( segsze == 1 ) {
		    ukj = dense_col[lsub[krep_ind]];
		    luptr += nsupr*(nsupc-1) + nsupc;

		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
			irow = lsub[i];
			dense_col[irow] -= ukj * lusup[luptr];
			++luptr;
		    }

		} else if ( segsze <= 3 ) {
		    ukj = dense_col[lsub[krep_ind]];
		    ukj1 = dense_col[lsub[krep_ind - 1]];
		    luptr += nsupr*(nsupc-1) + nsupc-1;
		    luptr1 = luptr - nsupr;

		    if ( segsze == 2 ) {
			ukj -= ukj1 * lusup[luptr1];
			dense_col[lsub[krep_ind]] = ukj;
			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
			    irow = lsub[i];
			    luptr++; luptr1++;
			    dense_col[irow] -= (ukj*lusup[luptr]
						+ ukj1*lusup[luptr1]);
			}
		    } else {
			ukj2 = dense_col[lsub[krep_ind - 2]];
			luptr2 = luptr1 - nsupr;
			ukj1 -= ukj2 * lusup[luptr2-1];
			ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
			dense_col[lsub[krep_ind]] = ukj;
			dense_col[lsub[krep_ind-1]] = ukj1;
			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
			    irow = lsub[i];
			    luptr++; luptr1++; luptr2++;
			    dense_col[irow] -= ( ukj*lusup[luptr]
                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
			}
		    }

		} else  {	/* segsze >= 4 */
		    
		    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
		       holds the result of triangular solves.    */
		    no_zeros = kfnz - fsupc;
		    isub = lptr + no_zeros;
		    for (i = 0; i < segsze; ++i) {
			irow = lsub[isub];
			TriTmp[i] = dense_col[irow]; /* Gather */
			++isub;
		    }
		    
		    /* start effective triangle */
		    luptr += nsupr * no_zeros + no_zeros;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
			   &nsupr, TriTmp, &incx );
#else
		    strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
			   &nsupr, TriTmp, &incx );
#endif
#else		
		    slsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
#endif
		    

		} /* else ... */
	    
	    }  /* for jj ... end tri-solves */

	    /* Block row updates; push all the way into dense[*] block */
	    for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
		
		r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
		block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
		luptr = xlusup[fsupc] + nsupc + r_ind;
		isub1 = lptr + nsupc + r_ind;
		
		repfnz_col = repfnz;
		TriTmp = tempv;
		dense_col = dense;
		
		/* Sequence through each column in panel -- matrix-vector */
		for (jj = jcol; jj < jcol + w; jj++,
		     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
		    
		    kfnz = repfnz_col[krep];
		    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
		    
		    segsze = krep - kfnz + 1;
		    if ( segsze <= 3 ) continue;   /* skip unrolled cases */
		    
		    /* Perform a block update, and scatter the result of
		       matrix-vector to dense[].		 */
		    no_zeros = kfnz - fsupc;
		    luptr1 = luptr + nsupr * no_zeros;
		    MatvecTmp = &TriTmp[maxsuper];
		    
#ifdef USE_VENDOR_BLAS
		    alpha = one; 
                    beta = zero;
#ifdef _CRAY
		    SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 
			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
#else
		    sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 
			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
#endif
#else
		    smatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
			   TriTmp, MatvecTmp);
#endif
		    
		    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
		     * such that MatvecTmp[*] can be re-used for the
		     * the next blok row update. dense[] will be copied into 
		     * global store after the whole panel has been finished.
		     */
		    isub = isub1;
		    for (i = 0; i < block_nrow; i++) {
			irow = lsub[isub];
			dense_col[irow] -= MatvecTmp[i];
			MatvecTmp[i] = zero;
			++isub;
		    }
		    
		} /* for jj ... */
		
	    } /* for each block row ... */
	    
	    /* Scatter the triangular solves into SPA dense[*] */
	    repfnz_col = repfnz;
	    TriTmp = tempv;
	    dense_col = dense;
	    
	    for (jj = jcol; jj < jcol + w; jj++,
		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
		kfnz = repfnz_col[krep];
		if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
		
		segsze = krep - kfnz + 1;
		if ( segsze <= 3 ) continue; /* skip unrolled cases */
		
		no_zeros = kfnz - fsupc;		
		isub = lptr + no_zeros;
		for (i = 0; i < segsze; i++) {
		    irow = lsub[isub];
		    dense_col[irow] = TriTmp[i];
		    TriTmp[i] = zero;
		    ++isub;
		}
		
	    } /* for jj ... */
	    
	} else { /* 1-D block modification */
	    
	    
	    /* Sequence through each column in the panel */
	    for (jj = jcol; jj < jcol + w; jj++,
		 repfnz_col += m, dense_col += m) {
		
		kfnz = repfnz_col[krep];
		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
		
		segsze = krep - kfnz + 1;
		luptr = xlusup[fsupc];

		ops[TRSV] += segsze * (segsze - 1);
		ops[GEMV] += 2 * nrow * segsze;
		
		/* Case 1: Update U-segment of size 1 -- col-col update */
		if ( segsze == 1 ) {
		    ukj = dense_col[lsub[krep_ind]];
		    luptr += nsupr*(nsupc-1) + nsupc;

		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
			irow = lsub[i];
			dense_col[irow] -= ukj * lusup[luptr];
			++luptr;
		    }

		} else if ( segsze <= 3 ) {
		    ukj = dense_col[lsub[krep_ind]];
		    luptr += nsupr*(nsupc-1) + nsupc-1;
		    ukj1 = dense_col[lsub[krep_ind - 1]];
		    luptr1 = luptr - nsupr;

		    if ( segsze == 2 ) {
			ukj -= ukj1 * lusup[luptr1];
			dense_col[lsub[krep_ind]] = ukj;
			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
			    irow = lsub[i];
			    ++luptr;  ++luptr1;
			    dense_col[irow] -= (ukj*lusup[luptr]
						+ ukj1*lusup[luptr1]);
			}
		    } else {
			ukj2 = dense_col[lsub[krep_ind - 2]];
			luptr2 = luptr1 - nsupr;
			ukj1 -= ukj2 * lusup[luptr2-1];
			ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
			dense_col[lsub[krep_ind]] = ukj;
			dense_col[lsub[krep_ind-1]] = ukj1;
			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
			    irow = lsub[i];
			    ++luptr; ++luptr1; ++luptr2;
			    dense_col[irow] -= ( ukj*lusup[luptr]
                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
			}
		    }

		} else  { /* segsze >= 4 */
		    /* 
		     * Perform a triangular solve and block update,
		     * then scatter the result of sup-col update to dense[].
		     */
		    no_zeros = kfnz - fsupc;
		    
		    /* Copy U[*,j] segment from dense[*] to tempv[*]: 
		     *    The result of triangular solve is in tempv[*];
		     *    The result of matrix vector update is in dense_col[*]
		     */
		    isub = lptr + no_zeros;
		    for (i = 0; i < segsze; ++i) {
			irow = lsub[isub];
			tempv[i] = dense_col[irow]; /* Gather */
			++isub;
		    }
		    
		    /* start effective triangle */
		    luptr += nsupr * no_zeros + no_zeros;
		    
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
			   &nsupr, tempv, &incx );
#else
		    strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
			   &nsupr, tempv, &incx );
#endif
		    
		    luptr += segsze;	/* Dense matrix-vector */
		    tempv1 = &tempv[segsze];
                    alpha = one;
                    beta = zero;
#ifdef _CRAY
		    SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
		    sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
		    slsolve ( nsupr, segsze, &lusup[luptr], tempv );
		    
		    luptr += segsze;        /* Dense matrix-vector */
		    tempv1 = &tempv[segsze];
		    smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
#endif
		    
		    /* Scatter tempv[*] into SPA dense[*] temporarily, such
		     * that tempv[*] can be used for the triangular solve of
		     * the next column of the panel. They will be copied into 
		     * ucol[*] after the whole panel has been finished.
		     */
		    isub = lptr + no_zeros;
		    for (i = 0; i < segsze; i++) {
			irow = lsub[isub];
			dense_col[irow] = tempv[i];
			tempv[i] = zero;
			isub++;
		    }
		    
		    /* Scatter the update from tempv1[*] into SPA dense[*] */
		    /* Start dense rectangular L */
		    for (i = 0; i < nrow; i++) {
			irow = lsub[isub];
			dense_col[irow] -= tempv1[i];
			tempv1[i] = zero;
			++isub;	
		    }
		    
		} /* else segsze>=4 ... */
		
	    } /* for each column in the panel... */
	    
	} /* else 1-D update ... */

    } /* for each updating supernode ... */

}
Exemple #6
0
void
psgstrf_bmod1D_mv2(
		   const int pnum, /* process number */
		   const int n,    /* number of rows in the matrix */
		   const int w,    /* current panel width */
		   const int jcol, /* leading column of the current panel */
		   const int fsupc,/* leading column of the updating s-node */ 
		   const int krep, /* last column of the updating s-node */ 
		   const int nsupc,/* number of columns in the updating s-node */ 
		   int nsupr, /* number of rows in the updating supernode */  
		   int nrow,  /* number of rows below the diagonal block of
				 the updating supernode */ 
		   int *repfnz,    /* in */
		   int *panel_lsub,/* modified */
		   int *w_lsub_end,/* modified */
		   int *spa_marker,/* modified; size n-by-w */
		   float *dense,  /* modified */
		   float *tempv,  /* working array - zeros on entry/exit */
		   GlobalLU_t *Glu,/* modified */
		   Gstat_t *Gstat  /* modified */
		   )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 *    Performs numeric block updates (sup-panel) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Results are returned in SPA dense[*,w].
 *
 */

    float      zero = 0.0;
    float      one = 1.0;

#if ( MACH==CRAY_PVP )
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
#ifdef USE_VENDOR_BLAS
    int          incx = 1, incy = 1;
    float       alpha = one, beta = zero;
#endif

    float       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          segsze;
    register int lptr; /* start of row subscripts of the updating supernode */
    register int i, j, kfnz, krep_ind, isub, irow, no_zeros, twocols;
    register int jj;	       /* index through each column in the panel */
    int      kfnz2[2], jj2[2]; /* detect two identical columns */
    int  *repfnz_col, *repfnz_col1; /* repfnz[] for a column in the panel */
    float *dense_col, *dense_col1;  /* dense[] for a column in the panel */
    float *tri[2], *matvec[2];
    int  *col_marker, *col_marker1; /* each column of the spa_marker[*,w] */
    int  *col_lsub, *col_lsub1;   /* each column of the panel_lsub[*,w] */
    int          *lsub, *xlsub_end;
    float	*lusup;
    int          *xlusup;
    register float flopcnt;
    
#ifdef TIMING
    double *utime = Gstat->utime;
    double f_time;
#endif    
    
    lsub      = Glu->lsub;
    xlsub_end = Glu->xlsub_end;
    lusup     = Glu->lusup;
    xlusup    = Glu->xlusup;
    lptr      = Glu->xlsub[fsupc];
    krep_ind  = lptr + nsupc - 1;
    twocols = 0;
    tri[0] = tempv;
    tri[1] = tempv + n;

#ifdef DEBUG
if (jcol == BADPAN && krep == BADREP) {
    printf("(%d) dbmod1D[1] jcol %d, fsupc %d, krep %d, nsupc %d, nsupr %d, nrow %d\n",
	   pnum, jcol, fsupc, krep, nsupc, nsupr, nrow);
    PrintInt10("lsub[xlsub[2774]", nsupr, &lsub[lptr]);
}    
#endif
    
    /* -----------------------------------------------
     * Sequence through each column in the panel ...
     * ----------------------------------------------- */
    repfnz_col= repfnz;
    dense_col = dense;
    col_marker= spa_marker;
    col_lsub  = panel_lsub;
    
    for (jj = jcol; jj < jcol + w; ++jj, col_marker += n, col_lsub += n,
	 repfnz_col += n, dense_col += n) {
	
	kfnz = repfnz_col[krep];
	if ( kfnz == EMPTY ) continue;	/* skip any zero segment */

	segsze = krep - kfnz + 1;
	luptr = xlusup[fsupc];

        flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;
	Gstat->procstat[pnum].fcops += flopcnt;

	/* Case 1: Update U-segment of size 1 -- col-col update */
	if ( segsze == 1 ) {
#ifdef TIMING
	    f_time = SuperLU_timer_();
#endif	    
	    ukj = dense_col[lsub[krep_ind]];
	    luptr += nsupr*(nsupc-1) + nsupc;
#ifdef DEBUG
if (krep == BADCOL && jj == -1) {
    printf("(%d) dbmod1D[segsze=1]: k %d, j %d, ukj %.10e\n",
	   pnum, lsub[krep_ind], jj, ukj);
    PrintInt10("segsze=1", nsupr, &lsub[lptr]);
}
#endif	    
	    for (i = lptr + nsupc; i < xlsub_end[fsupc]; i++) {
		irow = lsub[i];
                dense_col[irow] -= ukj * lusup[luptr];
		++luptr;
#ifdef SCATTER_FOUND		
		if ( col_marker[irow] != jj ) {
		    col_marker[irow] = jj;
		    col_lsub[w_lsub_end[jj-jcol]++] = irow;
		}
#endif		
	    }
#ifdef TIMING
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif
	    
	} else if ( segsze <= 3 ) {
	    
#ifdef TIMING
	    f_time = SuperLU_timer_();
#endif	    
	    ukj = dense_col[lsub[krep_ind]];
	    luptr += nsupr*(nsupc-1) + nsupc-1;
	    ukj1 = dense_col[lsub[krep_ind - 1]];
	    luptr1 = luptr - nsupr;
	    if ( segsze == 2 ) {
                ukj -= ukj1 * lusup[luptr1];
		dense_col[lsub[krep_ind]] = ukj;
/*#pragma ivdep*/
		for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    irow = lsub[i];
		    ++luptr;  ++luptr1;
                    dense_col[irow] -= (ukj * lusup[luptr]
                                                + ukj1 * lusup[luptr1]);
#ifdef SCATTER_FOUND		
		    if ( col_marker[irow] != jj ) {
			col_marker[irow] = jj;
			col_lsub[w_lsub_end[jj-jcol]++] = irow;
		    }
#endif		
		}
	    } else {
		ukj2 = dense_col[lsub[krep_ind - 2]];
		luptr2 = luptr1 - nsupr;
                ukj1 -= ukj2 * lusup[luptr2-1];
                ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
		dense_col[lsub[krep_ind]] = ukj;
		dense_col[lsub[krep_ind-1]] = ukj1;
		for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    irow = lsub[i];
		    ++luptr; ++luptr1; ++luptr2;
                    dense_col[irow] -= (ukj * lusup[luptr]
                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2]);
#ifdef SCATTER_FOUND		
		    if ( col_marker[irow] != jj ) {
			col_marker[irow] = jj;
			col_lsub[w_lsub_end[jj-jcol]++] = irow;
		    }
#endif		
		}
	    }
#ifdef TIMING
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif
	} else { /* segsze >= 4 */
	    if ( twocols == 1 ) {
		jj2[1] = jj; /* got two columns */
		twocols = 0;
		
		for (j = 0; j < 2; ++j) { /* Do two tri-solves */
		    i = n * (jj2[j] - jcol);
		    repfnz_col1 = &repfnz[i];
		    dense_col1  = &dense[i];
		    kfnz2[j] = repfnz_col1[krep];		    
		    no_zeros = kfnz2[j] - fsupc;
		    segsze = krep - kfnz2[j] + 1;
		    matvec[j] = tri[j] + segsze;

		    /* Gather U[*,j] segment from dense[*] to tri[*]. */
		    isub = lptr + no_zeros;
		    for (i = 0; i < segsze; ++i) {
			irow = lsub[isub];
			tri[j][i] = dense_col1[irow]; /* Gather */
			++isub;
		    }

#ifdef TIMING
		    f_time = SuperLU_timer_();
#endif
		    /* start effective triangle */
		    luptr = xlusup[fsupc] + nsupr * no_zeros + no_zeros;
		    
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
			   &nsupr, tri[j], &incx );
#else
		    strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
			   &nsupr, tri[j], &incx );
#endif
#else
		    slsolve ( nsupr, segsze, &lusup[luptr], tri[j] );
		    
#endif

#ifdef TIMING
		    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    
		} /* end for j ... two tri-solves */

#ifdef TIMING
		f_time = SuperLU_timer_();
#endif
		
		if ( kfnz2[0] < kfnz2[1] ) { /* First column is bigger */
		    no_zeros = kfnz2[0] - fsupc;
		    segsze = kfnz2[1] - kfnz2[0];
		    luptr = xlusup[fsupc] + nsupr * no_zeros + nsupc;
#ifdef USE_VENDOR_BLAS		    
#if ( MACH==CRAY_PVP )
		    SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
			   &nsupr, tri[0], &incx, &beta, matvec[0], &incy );
#else
		    sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
			   &nsupr, tri[0], &incx, &beta, matvec[0], &incy );
#endif
#else
		    smatvec (nsupr, nrow, segsze, &lusup[luptr],
			     tri[0], matvec[0]);
#endif
		    
		} else if ( kfnz2[0] > kfnz2[1] ) {
		    no_zeros = kfnz2[1] - fsupc;
		    segsze = kfnz2[0] - kfnz2[1];
		    luptr = xlusup[fsupc] + nsupr * no_zeros + nsupc;
#ifdef USE_VENDOR_BLAS		    
#if ( MACH==CRAY_PVP )
		    SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
			   &nsupr, tri[1], &incx, &beta, matvec[1], &incy );
#else
		    sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
			   &nsupr, tri[1], &incx, &beta, matvec[1], &incy );
#endif
#else
		    smatvec (nsupr, nrow, segsze, &lusup[luptr],
			     tri[1], matvec[1]);
#endif
		}
		
		/* Do matrix-vector multiply with two destinations */
		kfnz = SUPERLU_MAX( kfnz2[0], kfnz2[1] );
		no_zeros = kfnz - fsupc;
		segsze = krep - kfnz + 1;
		luptr = xlusup[fsupc] + nsupr * no_zeros + nsupc;
#if ( MACH==DEC )
		sgemv2_(&nsupr, &nrow, &segsze, &lusup[luptr],
			&tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]],
			matvec[0], matvec[1]);
		/*#elif ( MACH==CRAY_PVP )
		SGEMV2(&nsupr, &nrow, &segsze, &lusup[luptr],
		       &tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]],
		       matvec[0], matvec[1]);*/
#else
		//smatvec2(nsupr, nrow, segsze, &lusup[luptr],
		//	 &tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]],
		//	 matvec[0], matvec[1]);
#endif

#ifdef TIMING
		utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    

		for (j = 0; j < 2; ++j) {
		    i = n * (jj2[j] - jcol);
		    dense_col1  = &dense[i];
		    col_marker1 = &spa_marker[i];
		    col_lsub1   = &panel_lsub[i];
		    no_zeros = kfnz2[j] - fsupc;
		    segsze = krep - kfnz2[j] + 1;
		    
		    /* Scatter tri[*] into SPA dense[*]. */
		    isub = lptr + no_zeros;
		    for (i = 0; i < segsze; i++) {
			irow = lsub[isub];
			dense_col1[irow] = tri[j][i]; /* Scatter */
			tri[j][i] = zero;
			++isub;
#ifdef DEBUG
	if (jj == -1 && krep == 3423)
	    printf("(%d) dbmod1D[scatter] jj %d, dense_col[%d] %e\n",
		   pnum, jj, irow, dense_col[irow]);
#endif
		    }
		    
		    /* Scatter matvec[*] into SPA dense[*]. */
/*#pragma ivdep*/
		    for (i = 0; i < nrow; i++) {
			irow = lsub[isub];
            	        dense_col1[irow] -= matvec[j][i]; /* Scatter-add */
#ifdef SCATTER_FOUND		
			if ( col_marker1[irow] != jj2[j] ) {
			    col_marker1[irow] = jj2[j];
			    col_lsub1[w_lsub_end[jj2[j]-jcol]++] = irow;
			}
#endif		
			matvec[j][i] = zero;
			++isub;
		    }
		    
		} /* end for two destination update */
		
	    } else { /* wait for a second column */
		jj2[0] = jj;
		twocols = 1;
	    }
	} /* else segsze >= 4 */
	
    } /* for jj ... */

    
    if ( twocols == 1 ) { /* one more column left */
	i = n * (jj2[0] - jcol);
	repfnz_col1 = &repfnz[i];
	dense_col1  = &dense[i];
	col_marker1 = &spa_marker[i];
	col_lsub1   = &panel_lsub[i];
	kfnz = repfnz_col1[krep];		    
	no_zeros = kfnz - fsupc;
	segsze = krep - kfnz + 1;

	/* Gather U[*,j] segment from dense[*] to tri[*]. */
	isub = lptr + no_zeros;
	for (i = 0; i < segsze; ++i) {
	    irow = lsub[isub];
	    tri[0][i] = dense_col1[irow]; /* Gather */
	    ++isub;
	}

#ifdef TIMING
	f_time = SuperLU_timer_();
#endif
	/* start effective triangle */
	luptr = xlusup[fsupc] + nsupr * no_zeros + no_zeros;
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
	STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
	       &nsupr, tri[0], &incx );
#else
	strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
	       &nsupr, tri[0], &incx );
#endif
#else
	slsolve ( nsupr, segsze, &lusup[luptr], tri[0] );
#endif
	
	luptr += segsze;	/* Dense matrix-vector */
	matvec[0] = tri[0] + segsze;
		
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
	SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
	       &nsupr, tri[0], &incx, &beta, matvec[0], &incy );
#else
	sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
	       &nsupr, tri[0], &incx, &beta, matvec[0], &incy );
#endif
#else
	smatvec (nsupr, nrow, segsze, &lusup[luptr], tri[0], matvec[0]);
#endif
#ifdef TIMING
	utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    

	/* Scatter tri[*] into SPA dense[*]. */
	isub = lptr + no_zeros;
	for (i = 0; i < segsze; i++) {
	    irow = lsub[isub];
	    dense_col1[irow] = tri[0][i]; /* Scatter */
	    tri[0][i] = zero;
	    ++isub;
#ifdef DEBUG
	if (jj == -1 && krep == 3423)
	    printf("(%d) dbmod1D[scatter] jj %d, dense_col[%d] %e\n",
		   pnum, jj, irow, dense_col[irow]);
#endif
	}
		    
	/* Scatter matvec[*] into SPA dense[*]. */
	for (i = 0; i < nrow; i++) {
	    irow = lsub[isub];
            dense_col1[irow] -= matvec[0][i]; /* Scatter-add */
#ifdef SCATTER_FOUND		
	    if ( col_marker1[irow] != jj2[0] ) {
		col_marker1[irow] = jj2[0];
		col_lsub1[w_lsub_end[jj2[0]-jcol]++] = irow;
	    }
#endif		
	    matvec[0][i] = zero;
	    ++isub;
	}
    
    } /* if twocols == 1 */

}
void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
                   const int n, const float *A, const int lda, float *x, 
                   const int incx)
{
	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
}
/*
 * Performs numeric block updates within the relaxed snode.
 */
int
ssnode_bmod (
    const int  jcol,	  /* in */
    const int  fsupc,     /* in */
    double     *dense,    /* in */
    double     *tempv,    /* working array */
    GlobalLU_t *Glu,      /* modified */
    SuperLUStat_t *stat   /* output */
)
{
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    int            incx = 1, incy = 1;
    double         alpha = -1.0, beta = 1.0;
#endif

    int            luptr, nsupc, nsupr, nrow;
    int            isub, irow, i, iptr;
    register int   ufirst, nextlu;
    int            *lsub, *xlsub;
    double         *lusup;
    int            *xlusup;
    flops_t *ops = stat->ops;

    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    lusup   = Glu->lusup;
    xlusup  = Glu->xlusup;

    nextlu = xlusup[jcol];

    /*
     *	Process the supernodal portion of L\U[*,j]
     */
    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
        irow = lsub[isub];
        lusup[nextlu] = dense[irow];
        dense[irow] = 0;
        ++nextlu;
    }

    xlusup[jcol + 1] = nextlu;	/* Initialize xlusup for next column */

    if ( fsupc < jcol ) {

        luptr = xlusup[fsupc];
        nsupr = xlsub[fsupc+1] - xlsub[fsupc];
        nsupc = jcol - fsupc;	/* Excluding jcol */
        ufirst = xlusup[jcol];	/* Points to the beginning of column
				   jcol in supernode L\U(jsupno). */
        nrow = nsupr - nsupc;

        ops[TRSV] += nsupc * (nsupc - 1);
        ops[GEMV] += 2 * nrow * nsupc;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
        STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr,
               &lusup[ufirst], &incx );
        SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
               &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
        strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr,
                &lusup[ufirst], &incx );
        sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
                &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
        slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
        smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
                  &lusup[ufirst], &tempv[0] );

        /* Scatter tempv[*] into lusup[*] */
        iptr = ufirst + nsupc;
        for (i = 0; i < nrow; i++) {
            lusup[iptr++] -= tempv[i];
            tempv[i] = 0.0;
        }
#endif

    }

    return 0;
}
int
psgstrf_snode_bmod(
		   const int  pnum,   /* process number */
		   const int  jcol,   /* in - current column in the s-node */
		   const int  jsupno, /* in */
		   const int  fsupc,  /* in - first column in the s-node */
		   float     *dense, /* in */
		   float     *tempv, /* working array */
		   GlobalLU_t *Glu,   /* modified */
		   Gstat_t *Gstat     /* modified */
		   )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Performs numeric block updates within the relaxed supernode. 
 */

    float      zero = 0.0;
    float      one = 1.0;
    float      none = -1.0;

#if ( MACH==CRAY_PVP )
    _fcd ftcs1, ftcs2, ftcs3;
#endif
#ifdef USE_VENDOR_BLAS    
    int            incx = 1, incy = 1;
    float         alpha = none, beta = one;
#endif
    
    int            luptr, nsupc, nsupr, nrow;
    int            isub, irow, i, iptr; 
    register int   ufirst, nextlu;
    float         *lusup;
    int            *lsub, *xlsub, *xlsub_end, *xlusup, *xlusup_end;
    register float flopcnt;

    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;
    lusup      = Glu->lusup;
    xlusup     = Glu->xlusup;
    xlusup_end = Glu->xlusup_end;

    nextlu = xlusup[jcol];
    
    /*
     *	Process the supernodal portion of L\U[*,j]
     */
    for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; isub++) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
	dense[irow] = zero;
	++nextlu;
    }

    xlusup_end[jcol] = nextlu;
    
    if ( fsupc < jcol ) {

	luptr = xlusup[fsupc];
	nsupr = xlsub_end[fsupc] - xlsub[fsupc];
	nsupc = jcol - fsupc;	/* Excluding jcol */
	ufirst = xlusup[jcol];	/* Points to the beginning of column
				   jcol in supernode L\U(jsupno). */
	nrow = nsupr - nsupc;
	
        flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc;
	Gstat->procstat[pnum].fcops += flopcnt;

/*	ops[TRSV] += nsupc * (nsupc - 1);
	ops[GEMV] += 2 * nrow * nsupc;    */

#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
	ftcs1 = _cptofcd("L", strlen("L"));
	ftcs2 = _cptofcd("N", strlen("N"));
	ftcs3 = _cptofcd("U", strlen("U"));
	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, 
	      &lusup[ufirst], &incx );
	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
	      &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
	strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, 
	      &lusup[ufirst], &incx );
	sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
	smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], 
		 &lusup[ufirst], &tempv[0] );

        /* Scatter tempv[*] into lusup[*] */
	iptr = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
            lusup[iptr++] -= tempv[i];
            tempv[i] = zero;
	}
#endif

    }

    return 0;
}
Exemple #10
0
/* Subroutine */ int sgglse_(integer *m, integer *n, integer *p, real *a, 
	integer *lda, real *b, integer *ldb, real *c, real *d, real *x, real *
	work, integer *lwork, integer *info)
{
/*  -- LAPACK driver routine (version 2.0) --   
       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
       Courant Institute, Argonne National Lab, and Rice University   
       September 30, 1994   


    Purpose   
    =======   

    SGGLSE solves the linear equality-constrained least squares (LSE)   
    problem:   

            minimize || c - A*x ||_2   subject to   B*x = d   

    where A is an M-by-N matrix, B is a P-by-N matrix, c is a given   
    M-vector, and d is a given P-vector. It is assumed that   
    P <= N <= M+P, and   

             rank(B) = P and  rank( ( A ) ) = N.   
                                  ( ( B ) )   

    These conditions ensure that the LSE problem has a unique solution,   
    which is obtained using a GRQ factorization of the matrices B and A. 
  

    Arguments   
    =========   

    M       (input) INTEGER   
            The number of rows of the matrix A.  M >= 0.   

    N       (input) INTEGER   
            The number of columns of the matrices A and B. N >= 0.   

    P       (input) INTEGER   
            The number of rows of the matrix B. 0 <= P <= N <= M+P.   

    A       (input/output) REAL array, dimension (LDA,N)   
            On entry, the M-by-N matrix A.   
            On exit, A is destroyed.   

    LDA     (input) INTEGER   
            The leading dimension of the array A. LDA >= max(1,M).   

    B       (input/output) REAL array, dimension (LDB,N)   
            On entry, the P-by-N matrix B.   
            On exit, B is destroyed.   

    LDB     (input) INTEGER   
            The leading dimension of the array B. LDB >= max(1,P).   

    C       (input/output) REAL array, dimension (M)   
            On entry, C contains the right hand side vector for the   
            least squares part of the LSE problem.   
            On exit, the residual sum of squares for the solution   
            is given by the sum of squares of elements N-P+1 to M of   
            vector C.   

    D       (input/output) REAL array, dimension (P)   
            On entry, D contains the right hand side vector for the   
            constrained equation.   
            On exit, D is destroyed.   

    X       (output) REAL array, dimension (N)   
            On exit, X is the solution of the LSE problem.   

    WORK    (workspace/output) REAL array, dimension (LWORK)   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK. LWORK >= max(1,M+N+P).   
            For optimum performance LWORK >= P+min(M,N)+max(M,N)*NB,   
            where NB is an upper bound for the optimal blocksizes for   
            SGEQRF, SGERQF, SORMQR and SORMRQ.   

    INFO    (output) INTEGER   
            = 0:  successful exit.   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   

    ===================================================================== 
  


       Test the input parameters   

    
   Parameter adjustments   
       Function Body */
    /* Table of constant values */
    static integer c__1 = 1;
    static real c_b11 = -1.f;
    static real c_b13 = 1.f;
    
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
    /* Local variables */
    static integer lopt;
    extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, 
	    real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), 
	    saxpy_(integer *, real *, real *, integer *, real *, integer *), 
	    strmv_(char *, char *, char *, integer *, real *, integer *, real 
	    *, integer *), strsv_(char *, char *, 
	    char *, integer *, real *, integer *, real *, integer *);
    static integer mn, nr;
    extern /* Subroutine */ int xerbla_(char *, integer *), sggrqf_(
	    integer *, integer *, integer *, real *, integer *, real *, real *
	    , integer *, real *, real *, integer *, integer *), sormqr_(char *
	    , char *, integer *, integer *, integer *, real *, integer *, 
	    real *, real *, integer *, real *, integer *, integer *), sormrq_(char *, char *, integer *, integer *, integer *, 
	    real *, integer *, real *, real *, integer *, real *, integer *, 
	    integer *);



#define C(I) c[(I)-1]
#define D(I) d[(I)-1]
#define X(I) x[(I)-1]
#define WORK(I) work[(I)-1]

#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]

    *info = 0;
    mn = min(*m,*n);
    if (*m < 0) {
	*info = -1;
    } else if (*n < 0) {
	*info = -2;
    } else if (*p < 0 || *p > *n || *p < *n - *m) {
	*info = -3;
    } else if (*lda < max(1,*m)) {
	*info = -5;
    } else if (*ldb < max(1,*p)) {
	*info = -7;
    } else /* if(complicated condition) */ {
/* Computing MAX */
	i__1 = 1, i__2 = *m + *n + *p;
	if (*lwork < max(i__1,i__2)) {
	    *info = -12;
	}
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("SGGLSE", &i__1);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }

/*     Compute the GRQ factorization of matrices B and A:   

              B*Q' = (  0  T12 ) P   Z'*A*Q' = ( R11 R12 ) N-P   
                       N-P  P                  (  0  R22 ) M+P-N   
                                                 N-P  P   

       where T12 and R11 are upper triangular, and Q and Z are   
       orthogonal. */

    i__1 = *lwork - *p - mn;
    sggrqf_(p, m, n, &B(1,1), ldb, &WORK(1), &A(1,1), lda, &WORK(*p 
	    + 1), &WORK(*p + mn + 1), &i__1, info);
    lopt = WORK(*p + mn + 1);

/*     Update c = Z'*c = ( c1 ) N-P   
                         ( c2 ) M+P-N */

    i__1 = max(1,*m);
    i__2 = *lwork - *p - mn;
    sormqr_("Left", "Transpose", m, &c__1, &mn, &A(1,1), lda, &WORK(*p + 
	    1), &C(1), &i__1, &WORK(*p + mn + 1), &i__2, info);
/* Computing MAX */
    i__1 = lopt, i__2 = (integer) WORK(*p + mn + 1);
    lopt = max(i__1,i__2);

/*     Solve T12*x2 = d for x2 */

    strsv_("Upper", "No transpose", "Non unit", p, &B(1,*n-*p+1), ldb, &D(1), &c__1);

/*     Update c1 */

    i__1 = *n - *p;
    sgemv_("No transpose", &i__1, p, &c_b11, &A(1,*n-*p+1), 
	    lda, &D(1), &c__1, &c_b13, &C(1), &c__1);

/*     Sovle R11*x1 = c1 for x1 */

    i__1 = *n - *p;
    strsv_("Upper", "No transpose", "Non unit", &i__1, &A(1,1), lda, &C(
	    1), &c__1);

/*     Put the solutions in X */

    i__1 = *n - *p;
    scopy_(&i__1, &C(1), &c__1, &X(1), &c__1);
    scopy_(p, &D(1), &c__1, &X(*n - *p + 1), &c__1);

/*     Compute the residual vector: */

    if (*m < *n) {
	nr = *m + *p - *n;
	i__1 = *n - *m;
	sgemv_("No transpose", &nr, &i__1, &c_b11, &A(*n-*p+1,*m+1), lda, &D(nr + 1), &c__1, &c_b13, &C(*n - *p + 1), &
		c__1);
    } else {
	nr = *p;
    }
    strmv_("Upper", "No transpose", "Non unit", &nr, &A(*n-*p+1,*n-*p+1), lda, &D(1), &c__1);
    saxpy_(&nr, &c_b11, &D(1), &c__1, &C(*n - *p + 1), &c__1);

/*     Backward transformation x = Q'*x */

    i__1 = *lwork - *p - mn;
    sormrq_("Left", "Transpose", n, &c__1, p, &B(1,1), ldb, &WORK(1), &X(
	    1), n, &WORK(*p + mn + 1), &i__1, info);
/* Computing MAX */
    i__1 = lopt, i__2 = (integer) WORK(*p + mn + 1);
    WORK(1) = (real) (*p + mn + max(i__1,i__2));

    return 0;

/*     End of SGGLSE */

} /* sgglse_ */
Exemple #11
0
/*! \brief Solves one of the systems of equations A*x = b,   or   A'*x = b
 *
 * <pre>
 *   Purpose
 *   =======
 *
 *   sp_strsv() solves one of the systems of equations
 *       A*x = b,   or   A'*x = b,
 *   where b and x are n element vectors and A is a sparse unit , or
 *   non-unit, upper or lower triangular matrix.
 *   No test for singularity or near-singularity is included in this
 *   routine. Such tests must be performed before calling this routine.
 *
 *   Parameters
 *   ==========
 *
 *   uplo   - (input) char*
 *            On entry, uplo specifies whether the matrix is an upper or
 *             lower triangular matrix as follows:
 *                uplo = 'U' or 'u'   A is an upper triangular matrix.
 *                uplo = 'L' or 'l'   A is a lower triangular matrix.
 *
 *   trans  - (input) char*
 *             On entry, trans specifies the equations to be solved as
 *             follows:
 *                trans = 'N' or 'n'   A*x = b.
 *                trans = 'T' or 't'   A'*x = b.
 *                trans = 'C' or 'c'   A'*x = b.
 *
 *   diag   - (input) char*
 *             On entry, diag specifies whether or not A is unit
 *             triangular as follows:
 *                diag = 'U' or 'u'   A is assumed to be unit triangular.
 *                diag = 'N' or 'n'   A is not assumed to be unit
 *                                    triangular.
 *
 *   L       - (input) SuperMatrix*
 *             The factor L from the factorization Pr*A*Pc=L*U. Use
 *             compressed row subscripts storage for supernodes,
 *             i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU.
 *
 *   U       - (input) SuperMatrix*
 *              The factor U from the factorization Pr*A*Pc=L*U.
 *              U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU.
 *
 *   x       - (input/output) float*
 *             Before entry, the incremented array X must contain the n
 *             element right-hand side vector b. On exit, X is overwritten
 *             with the solution vector x.
 *
 *   info    - (output) int*
 *             If *info = -i, the i-th argument had an illegal value.
 * </pre>
 */
int
sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L,
         SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info)
{
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    SCformat *Lstore;
    NCformat *Ustore;
    float   *Lval, *Uval;
    int incx = 1, incy = 1;
    float alpha = 1.0, beta = 1.0;
    int nrow;
    int fsupc, nsupr, nsupc, luptr, istart, irow;
    int i, k, iptr, jcol;
    float *work;
    flops_t solve_ops;

    /* Test the input parameters */
    *info = 0;
    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") &&
              !lsame_(trans, "C")) *info = -2;
    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
    if ( *info ) {
        i = -(*info);
        xerbla_("sp_strsv", &i);
        return 0;
    }

    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( !(work = floatCalloc(L->nrow)) )
        ABORT("Malloc fails for work in sp_strsv().");

    if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */

        if ( lsame_(uplo, "L") ) {
            /* Form x := inv(L)*x */
            if ( L->nrow == 0 ) return 0; /* Quick return */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);
                nrow = nsupr - nsupc;

                solve_ops += nsupc * (nsupc - 1);
                solve_ops += 2 * nrow * nsupc;

                if ( nsupc == 1 ) {
                    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
                        irow = L_SUB(iptr);
                        ++luptr;
                        x[irow] -= x[fsupc] * Lval[luptr];
                    }
                } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);

                    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
                        &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#else
                    strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);

                    sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
                        &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#endif
#else
                    slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);

                    smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
                             &x[fsupc], &work[0] );
#endif

                    iptr = istart + nsupc;
                    for (i = 0; i < nrow; ++i, ++iptr) {
                        irow = L_SUB(iptr);
                        x[irow] -= work[i];     /* Scatter */
                        work[i] = 0.0;

                    }
                }
            } /* for k ... */

        } else {
            /* Form x := inv(U)*x */

            if ( U->nrow == 0 ) return 0; /* Quick return */

            for (k = Lstore->nsuper; k >= 0; k--) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                solve_ops += nsupc * (nsupc + 1);

                if ( nsupc == 1 ) {
                    x[fsupc] /= Lval[luptr];
                    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
                        irow = U_SUB(i);
                        x[irow] -= x[fsupc] * Uval[i];
                    }
                } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
                       &x[fsupc], &incx);
#else
                    strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
#else
                    susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
#endif

                    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                        solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                        for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1);
                                i++) {
                            irow = U_SUB(i);
                            x[irow] -= x[jcol] * Uval[i];
                        }
                    }
                }
            } /* for k ... */

        }
    } else { /* Form x := inv(A')*x */

        if ( lsame_(uplo, "L") ) {
            /* Form x := inv(L')*x */
            if ( L->nrow == 0 ) return 0; /* Quick return */

            for (k = Lstore->nsuper; k >= 0; --k) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                solve_ops += 2 * (nsupr - nsupc) * nsupc;

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    iptr = istart + nsupc;
                    for (i = L_NZ_START(jcol) + nsupc;
                                i < L_NZ_START(jcol+1); i++) {
                        irow = L_SUB(iptr);
                        x[jcol] -= x[irow] * Lval[i];
                        iptr++;
                    }
                }

                if ( nsupc > 1 ) {
                    solve_ops += nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
                    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);
#else
                    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);
#endif
                }
            }
        } else {
            /* Form x := inv(U')*x */
            if ( U->nrow == 0 ) return 0; /* Quick return */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
                        irow = U_SUB(i);
                        x[jcol] -= x[irow] * Uval[i];
                    }
                }

                solve_ops += nsupc * (nsupc + 1);

                if ( nsupc == 1 ) {
                    x[fsupc] /= Lval[luptr];
                } else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
                    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                            &x[fsupc], &incx);
#else
                    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
                            &x[fsupc], &incx);
#endif
                }
            } /* for k ... */
        }
    }

    stat->ops[SOLVE] += solve_ops;
    SUPERLU_FREE(work);
    return 0;
}
Exemple #12
0
/* Subroutine */ int sggglm_(integer *n, integer *m, integer *p, real *a, 
	integer *lda, real *b, integer *ldb, real *d__, real *x, real *y, 
	real *work, integer *lwork, integer *info)
{
/*  -- LAPACK driver routine (version 3.0) --   
       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
       Courant Institute, Argonne National Lab, and Rice University   
       June 30, 1999   


    Purpose   
    =======   

    SGGGLM solves a general Gauss-Markov linear model (GLM) problem:   

            minimize || y ||_2   subject to   d = A*x + B*y   
                x   

    where A is an N-by-M matrix, B is an N-by-P matrix, and d is a   
    given N-vector. It is assumed that M <= N <= M+P, and   

               rank(A) = M    and    rank( A B ) = N.   

    Under these assumptions, the constrained equation is always   
    consistent, and there is a unique solution x and a minimal 2-norm   
    solution y, which is obtained using a generalized QR factorization   
    of A and B.   

    In particular, if matrix B is square nonsingular, then the problem   
    GLM is equivalent to the following weighted linear least squares   
    problem   

                 minimize || inv(B)*(d-A*x) ||_2   
                     x   

    where inv(B) denotes the inverse of B.   

    Arguments   
    =========   

    N       (input) INTEGER   
            The number of rows of the matrices A and B.  N >= 0.   

    M       (input) INTEGER   
            The number of columns of the matrix A.  0 <= M <= N.   

    P       (input) INTEGER   
            The number of columns of the matrix B.  P >= N-M.   

    A       (input/output) REAL array, dimension (LDA,M)   
            On entry, the N-by-M matrix A.   
            On exit, A is destroyed.   

    LDA     (input) INTEGER   
            The leading dimension of the array A. LDA >= max(1,N).   

    B       (input/output) REAL array, dimension (LDB,P)   
            On entry, the N-by-P matrix B.   
            On exit, B is destroyed.   

    LDB     (input) INTEGER   
            The leading dimension of the array B. LDB >= max(1,N).   

    D       (input/output) REAL array, dimension (N)   
            On entry, D is the left hand side of the GLM equation.   
            On exit, D is destroyed.   

    X       (output) REAL array, dimension (M)   
    Y       (output) REAL array, dimension (P)   
            On exit, X and Y are the solutions of the GLM problem.   

    WORK    (workspace/output) REAL array, dimension (LWORK)   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK. LWORK >= max(1,N+M+P).   
            For optimum performance, LWORK >= M+min(N,P)+max(N,P)*NB,   
            where NB is an upper bound for the optimal blocksizes for   
            SGEQRF, SGERQF, SORMQR and SORMRQ.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    INFO    (output) INTEGER   
            = 0:  successful exit.   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   

    ===================================================================   


       Test the input parameters   

       Parameter adjustments */
    /* Table of constant values */
    static integer c__1 = 1;
    static integer c_n1 = -1;
    static real c_b32 = -1.f;
    static real c_b34 = 1.f;
    
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
    /* Local variables */
    static integer lopt, i__;
    extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, 
	    real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), 
	    strsv_(char *, char *, char *, integer *, real *, integer *, real 
	    *, integer *);
    static integer nb, np;
    extern /* Subroutine */ int xerbla_(char *, integer *);
    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
	    integer *, integer *, ftnlen, ftnlen);
    extern /* Subroutine */ int sggqrf_(integer *, integer *, integer *, real 
	    *, integer *, real *, real *, integer *, real *, real *, integer *
	    , integer *);
    static integer nb1, nb2, nb3, nb4, lwkopt;
    static logical lquery;
    extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, 
	    integer *, real *, integer *, real *, real *, integer *, real *, 
	    integer *, integer *), sormrq_(char *, char *, 
	    integer *, integer *, integer *, real *, integer *, real *, real *
	    , integer *, real *, integer *, integer *);
#define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1]


    a_dim1 = *lda;
    a_offset = 1 + a_dim1 * 1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1 * 1;
    b -= b_offset;
    --d__;
    --x;
    --y;
    --work;

    /* Function Body */
    *info = 0;
    np = min(*n,*p);
    nb1 = ilaenv_(&c__1, "SGEQRF", " ", n, m, &c_n1, &c_n1, (ftnlen)6, (
	    ftnlen)1);
    nb2 = ilaenv_(&c__1, "SGERQF", " ", n, m, &c_n1, &c_n1, (ftnlen)6, (
	    ftnlen)1);
    nb3 = ilaenv_(&c__1, "SORMQR", " ", n, m, p, &c_n1, (ftnlen)6, (ftnlen)1);
    nb4 = ilaenv_(&c__1, "SORMRQ", " ", n, m, p, &c_n1, (ftnlen)6, (ftnlen)1);
/* Computing MAX */
    i__1 = max(nb1,nb2), i__1 = max(i__1,nb3);
    nb = max(i__1,nb4);
    lwkopt = *m + np + max(*n,*p) * nb;
    work[1] = (real) lwkopt;
    lquery = *lwork == -1;
    if (*n < 0) {
	*info = -1;
    } else if (*m < 0 || *m > *n) {
	*info = -2;
    } else if (*p < 0 || *p < *n - *m) {
	*info = -3;
    } else if (*lda < max(1,*n)) {
	*info = -5;
    } else if (*ldb < max(1,*n)) {
	*info = -7;
    } else /* if(complicated condition) */ {
/* Computing MAX */
	i__1 = 1, i__2 = *n + *m + *p;
	if (*lwork < max(i__1,i__2) && ! lquery) {
	    *info = -12;
	}
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("SGGGLM", &i__1);
	return 0;
    } else if (lquery) {
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }

/*     Compute the GQR factorization of matrices A and B:   

              Q'*A = ( R11 ) M,    Q'*B*Z' = ( T11   T12 ) M   
                     (  0  ) N-M             (  0    T22 ) N-M   
                        M                     M+P-N  N-M   

       where R11 and T22 are upper triangular, and Q and Z are   
       orthogonal. */

    i__1 = *lwork - *m - np;
    sggqrf_(n, m, p, &a[a_offset], lda, &work[1], &b[b_offset], ldb, &work[*m 
	    + 1], &work[*m + np + 1], &i__1, info);
    lopt = work[*m + np + 1];

/*     Update left-hand-side vector d = Q'*d = ( d1 ) M   
                                               ( d2 ) N-M */

    i__1 = max(1,*n);
    i__2 = *lwork - *m - np;
    sormqr_("Left", "Transpose", n, &c__1, m, &a[a_offset], lda, &work[1], &
	    d__[1], &i__1, &work[*m + np + 1], &i__2, info);
/* Computing MAX */
    i__1 = lopt, i__2 = (integer) work[*m + np + 1];
    lopt = max(i__1,i__2);

/*     Solve T22*y2 = d2 for y2 */

    i__1 = *n - *m;
    strsv_("Upper", "No transpose", "Non unit", &i__1, &b_ref(*m + 1, *m + *p 
	    - *n + 1), ldb, &d__[*m + 1], &c__1);
    i__1 = *n - *m;
    scopy_(&i__1, &d__[*m + 1], &c__1, &y[*m + *p - *n + 1], &c__1);

/*     Set y1 = 0 */

    i__1 = *m + *p - *n;
    for (i__ = 1; i__ <= i__1; ++i__) {
	y[i__] = 0.f;
/* L10: */
    }

/*     Update d1 = d1 - T12*y2 */

    i__1 = *n - *m;
    sgemv_("No transpose", m, &i__1, &c_b32, &b_ref(1, *m + *p - *n + 1), ldb,
	     &y[*m + *p - *n + 1], &c__1, &c_b34, &d__[1], &c__1);

/*     Solve triangular system: R11*x = d1 */

    strsv_("Upper", "No Transpose", "Non unit", m, &a[a_offset], lda, &d__[1],
	     &c__1);

/*     Copy D to X */

    scopy_(m, &d__[1], &c__1, &x[1], &c__1);

/*     Backward transformation y = Z'*y   

   Computing MAX */
    i__1 = 1, i__2 = *n - *p + 1;
    i__3 = max(1,*p);
    i__4 = *lwork - *m - np;
    sormrq_("Left", "Transpose", p, &c__1, &np, &b_ref(max(i__1,i__2), 1), 
	    ldb, &work[*m + 1], &y[1], &i__3, &work[*m + np + 1], &i__4, info);
/* Computing MAX */
    i__1 = lopt, i__2 = (integer) work[*m + np + 1];
    work[1] = (real) (*m + np + max(i__1,i__2));

    return 0;

/*     End of SGGGLM */

} /* sggglm_ */
/* Subroutine */ int strrfs_(char *uplo, char *trans, char *diag, integer *n, 
	integer *nrhs, real *a, integer *lda, real *b, integer *ldb, real *x, 
	integer *ldx, real *ferr, real *berr, real *work, integer *iwork, 
	integer *info)
{
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, 
	    i__3;
    real r__1, r__2, r__3;

    /* Local variables */
    integer i__, j, k;
    real s, xk;
    integer nz;
    real eps;
    integer kase;
    real safe1, safe2;
    integer isave[3];
    logical upper;
    real safmin;
    logical notran;
    char transt[1];
    logical nounit;
    real lstres;

/*  -- LAPACK routine (version 3.2) -- */
/*     November 2006 */

/*     Modified to call SLACN2 in place of SLACON, 7 Feb 03, SJH. */

/*  Purpose */
/*  ======= */

/*  STRRFS provides error bounds and backward error estimates for the */
/*  solution to a system of linear equations with a triangular */
/*  coefficient matrix. */

/*  The solution matrix X must be computed by STRTRS or some other */
/*  means before entering this routine.  STRRFS does not do iterative */
/*  refinement because doing so cannot improve the backward error. */

/*  Arguments */
/*  ========= */

/*  UPLO    (input) CHARACTER*1 */
/*          = 'U':  A is upper triangular; */
/*          = 'L':  A is lower triangular. */

/*  TRANS   (input) CHARACTER*1 */
/*          Specifies the form of the system of equations: */
/*          = 'N':  A * X = B  (No transpose) */
/*          = 'T':  A**T * X = B  (Transpose) */
/*          = 'C':  A**H * X = B  (Conjugate transpose = Transpose) */

/*  DIAG    (input) CHARACTER*1 */
/*          = 'N':  A is non-unit triangular; */
/*          = 'U':  A is unit triangular. */

/*  N       (input) INTEGER */
/*          The order of the matrix A.  N >= 0. */

/*  NRHS    (input) INTEGER */
/*          The number of right hand sides, i.e., the number of columns */
/*          of the matrices B and X.  NRHS >= 0. */

/*  A       (input) REAL array, dimension (LDA,N) */
/*          The triangular matrix A.  If UPLO = 'U', the leading N-by-N */
/*          upper triangular part of the array A contains the upper */
/*          triangular matrix, and the strictly lower triangular part of */
/*          A is not referenced.  If UPLO = 'L', the leading N-by-N lower */
/*          triangular part of the array A contains the lower triangular */
/*          matrix, and the strictly upper triangular part of A is not */
/*          referenced.  If DIAG = 'U', the diagonal elements of A are */
/*          also not referenced and are assumed to be 1. */

/*  LDA     (input) INTEGER */
/*          The leading dimension of the array A.  LDA >= max(1,N). */

/*  B       (input) REAL array, dimension (LDB,NRHS) */
/*          The right hand side matrix B. */

/*  LDB     (input) INTEGER */
/*          The leading dimension of the array B.  LDB >= max(1,N). */

/*  X       (input) REAL array, dimension (LDX,NRHS) */
/*          The solution matrix X. */

/*  LDX     (input) INTEGER */
/*          The leading dimension of the array X.  LDX >= max(1,N). */

/*  FERR    (output) REAL array, dimension (NRHS) */
/*          The estimated forward error bound for each solution vector */
/*          X(j) (the j-th column of the solution matrix X). */
/*          If XTRUE is the true solution corresponding to X(j), FERR(j) */
/*          is an estimated upper bound for the magnitude of the largest */
/*          element in (X(j) - XTRUE) divided by the magnitude of the */
/*          largest element in X(j).  The estimate is as reliable as */
/*          the estimate for RCOND, and is almost always a slight */
/*          overestimate of the true error. */

/*  BERR    (output) REAL array, dimension (NRHS) */
/*          The componentwise relative backward error of each solution */
/*          vector X(j) (i.e., the smallest relative change in */
/*          any element of A or B that makes X(j) an exact solution). */

/*  WORK    (workspace) REAL array, dimension (3*N) */

/*  IWORK   (workspace) INTEGER array, dimension (N) */

/*  INFO    (output) INTEGER */
/*          = 0:  successful exit */
/*          < 0:  if INFO = -i, the i-th argument had an illegal value */

/*  ===================================================================== */

/*     Test the input parameters. */

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1;
    b -= b_offset;
    x_dim1 = *ldx;
    x_offset = 1 + x_dim1;
    x -= x_offset;
    --ferr;
    --berr;
    --work;
    --iwork;

    /* Function Body */
    *info = 0;
    upper = lsame_(uplo, "U");
    notran = lsame_(trans, "N");
    nounit = lsame_(diag, "N");

    if (! upper && ! lsame_(uplo, "L")) {
	*info = -1;
    } else if (! notran && ! lsame_(trans, "T") && ! 
	    lsame_(trans, "C")) {
	*info = -2;
    } else if (! nounit && ! lsame_(diag, "U")) {
	*info = -3;
    } else if (*n < 0) {
	*info = -4;
    } else if (*nrhs < 0) {
	*info = -5;
    } else if (*lda < max(1,*n)) {
	*info = -7;
    } else if (*ldb < max(1,*n)) {
	*info = -9;
    } else if (*ldx < max(1,*n)) {
	*info = -11;
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("STRRFS", &i__1);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0 || *nrhs == 0) {
	i__1 = *nrhs;
	for (j = 1; j <= i__1; ++j) {
	    ferr[j] = 0.f;
	    berr[j] = 0.f;
	}
	return 0;
    }

    if (notran) {
	*(unsigned char *)transt = 'T';
    } else {
	*(unsigned char *)transt = 'N';
    }

/*     NZ = maximum number of nonzero elements in each row of A, plus 1 */

    nz = *n + 1;
    eps = slamch_("Epsilon");
    safmin = slamch_("Safe minimum");
    safe1 = nz * safmin;
    safe2 = safe1 / eps;

/*     Do for each right hand side */

    i__1 = *nrhs;
    for (j = 1; j <= i__1; ++j) {

/*        Compute residual R = B - op(A) * X, */
/*        where op(A) = A or A', depending on TRANS. */

	scopy_(n, &x[j * x_dim1 + 1], &c__1, &work[*n + 1], &c__1);
	strmv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1);
	saxpy_(n, &c_b19, &b[j * b_dim1 + 1], &c__1, &work[*n + 1], &c__1);

/*        Compute componentwise relative backward error from formula */

/*        max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) */

/*        where abs(Z) is the componentwise absolute value of the matrix */
/*        or vector Z.  If the i-th component of the denominator is less */
/*        than SAFE2, then SAFE1 is added to the i-th components of the */
/*        numerator and denominator before dividing. */

	i__2 = *n;
	for (i__ = 1; i__ <= i__2; ++i__) {
	    work[i__] = (r__1 = b[i__ + j * b_dim1], dabs(r__1));
	}

	if (notran) {

/*           Compute abs(A)*abs(X) + abs(B). */

	    if (upper) {
		if (nounit) {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			xk = (r__1 = x[k + j * x_dim1], dabs(r__1));
			i__3 = k;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    work[i__] += (r__1 = a[i__ + k * a_dim1], dabs(
				    r__1)) * xk;
			}
		    }
		} else {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			xk = (r__1 = x[k + j * x_dim1], dabs(r__1));
			i__3 = k - 1;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    work[i__] += (r__1 = a[i__ + k * a_dim1], dabs(
				    r__1)) * xk;
			}
			work[k] += xk;
		    }
		}
	    } else {
		if (nounit) {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			xk = (r__1 = x[k + j * x_dim1], dabs(r__1));
			i__3 = *n;
			for (i__ = k; i__ <= i__3; ++i__) {
			    work[i__] += (r__1 = a[i__ + k * a_dim1], dabs(
				    r__1)) * xk;
			}
		    }
		} else {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			xk = (r__1 = x[k + j * x_dim1], dabs(r__1));
			i__3 = *n;
			for (i__ = k + 1; i__ <= i__3; ++i__) {
			    work[i__] += (r__1 = a[i__ + k * a_dim1], dabs(
				    r__1)) * xk;
			}
			work[k] += xk;
		    }
		}
	    }
	} else {

/*           Compute abs(A')*abs(X) + abs(B). */

	    if (upper) {
		if (nounit) {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			s = 0.f;
			i__3 = k;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    s += (r__1 = a[i__ + k * a_dim1], dabs(r__1)) * (
				    r__2 = x[i__ + j * x_dim1], dabs(r__2));
			}
			work[k] += s;
		    }
		} else {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			s = (r__1 = x[k + j * x_dim1], dabs(r__1));
			i__3 = k - 1;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    s += (r__1 = a[i__ + k * a_dim1], dabs(r__1)) * (
				    r__2 = x[i__ + j * x_dim1], dabs(r__2));
			}
			work[k] += s;
		    }
		}
	    } else {
		if (nounit) {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			s = 0.f;
			i__3 = *n;
			for (i__ = k; i__ <= i__3; ++i__) {
			    s += (r__1 = a[i__ + k * a_dim1], dabs(r__1)) * (
				    r__2 = x[i__ + j * x_dim1], dabs(r__2));
			}
			work[k] += s;
		    }
		} else {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			s = (r__1 = x[k + j * x_dim1], dabs(r__1));
			i__3 = *n;
			for (i__ = k + 1; i__ <= i__3; ++i__) {
			    s += (r__1 = a[i__ + k * a_dim1], dabs(r__1)) * (
				    r__2 = x[i__ + j * x_dim1], dabs(r__2));
			}
			work[k] += s;
		    }
		}
	    }
	}
	s = 0.f;
	i__2 = *n;
	for (i__ = 1; i__ <= i__2; ++i__) {
	    if (work[i__] > safe2) {
/* Computing MAX */
		r__2 = s, r__3 = (r__1 = work[*n + i__], dabs(r__1)) / work[
			i__];
		s = dmax(r__2,r__3);
	    } else {
/* Computing MAX */
		r__2 = s, r__3 = ((r__1 = work[*n + i__], dabs(r__1)) + safe1)
			 / (work[i__] + safe1);
		s = dmax(r__2,r__3);
	    }
	}
	berr[j] = s;

/*        Bound error from formula */

/*        norm(X - XTRUE) / norm(X) .le. FERR = */
/*        norm( abs(inv(op(A)))* */
/*           ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) */

/*        where */
/*          norm(Z) is the magnitude of the largest component of Z */
/*          inv(op(A)) is the inverse of op(A) */
/*          abs(Z) is the componentwise absolute value of the matrix or */
/*             vector Z */
/*          NZ is the maximum number of nonzeros in any row of A, plus 1 */
/*          EPS is machine epsilon */

/*        The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) */
/*        is incremented by SAFE1 if the i-th component of */
/*        abs(op(A))*abs(X) + abs(B) is less than SAFE2. */

/*        Use SLACN2 to estimate the infinity-norm of the matrix */
/*           inv(op(A)) * diag(W), */
/*        where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */

	i__2 = *n;
	for (i__ = 1; i__ <= i__2; ++i__) {
	    if (work[i__] > safe2) {
		work[i__] = (r__1 = work[*n + i__], dabs(r__1)) + nz * eps * 
			work[i__];
	    } else {
		work[i__] = (r__1 = work[*n + i__], dabs(r__1)) + nz * eps * 
			work[i__] + safe1;
	    }
	}

	kase = 0;
L210:
	slacn2_(n, &work[(*n << 1) + 1], &work[*n + 1], &iwork[1], &ferr[j], &
		kase, isave);
	if (kase != 0) {
	    if (kase == 1) {

/*              Multiply by diag(W)*inv(op(A)'). */

		strsv_(uplo, transt, diag, n, &a[a_offset], lda, &work[*n + 1]
, &c__1);
		i__2 = *n;
		for (i__ = 1; i__ <= i__2; ++i__) {
		    work[*n + i__] = work[i__] * work[*n + i__];
		}
	    } else {

/*              Multiply by inv(op(A))*diag(W). */

		i__2 = *n;
		for (i__ = 1; i__ <= i__2; ++i__) {
		    work[*n + i__] = work[i__] * work[*n + i__];
		}
		strsv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], 
			 &c__1);
	    }
	    goto L210;
	}

/*        Normalize error. */

	lstres = 0.f;
	i__2 = *n;
	for (i__ = 1; i__ <= i__2; ++i__) {
/* Computing MAX */
	    r__2 = lstres, r__3 = (r__1 = x[i__ + j * x_dim1], dabs(r__1));
	    lstres = dmax(r__2,r__3);
	}
	if (lstres != 0.f) {
	    ferr[j] /= lstres;
	}

    }

    return 0;

/*     End of STRRFS */

} /* strrfs_ */
Exemple #14
0
/* Subroutine */ int slatrs_(char *uplo, char *trans, char *diag, char *
	normin, integer *n, real *a, integer *lda, real *x, real *scale, real 
	*cnorm, integer *info, ftnlen uplo_len, ftnlen trans_len, ftnlen 
	diag_len, ftnlen normin_len)
{
    /* System generated locals */
    integer a_dim1, a_offset, i__1, i__2, i__3;
    real r__1, r__2, r__3;

    /* Local variables */
    static integer i__, j;
    static real xj, rec, tjj;
    static integer jinc;
    static real xbnd;
    static integer imax;
    static real tmax, tjjs;
    extern doublereal sdot_(integer *, real *, integer *, real *, integer *);
    static real xmax, grow, sumj;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
    static real tscal, uscal;
    static integer jlast;
    extern doublereal sasum_(integer *, real *, integer *);
    static logical upper;
    extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, 
	    real *, integer *), strsv_(char *, char *, char *, integer *, 
	    real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen);
    extern doublereal slamch_(char *, ftnlen);
    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
    static real bignum;
    extern integer isamax_(integer *, real *, integer *);
    static logical notran;
    static integer jfirst;
    static real smlnum;
    static logical nounit;


/*  -- LAPACK auxiliary routine (version 3.0) -- */
/*     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., */
/*     Courant Institute, Argonne National Lab, and Rice University */
/*     June 30, 1992 */

/*     .. Scalar Arguments .. */
/*     .. */
/*     .. Array Arguments .. */
/*     .. */

/*  Purpose */
/*  ======= */

/*  SLATRS solves one of the triangular systems */

/*     A *x = s*b  or  A'*x = s*b */

/*  with scaling to prevent overflow.  Here A is an upper or lower */
/*  triangular matrix, A' denotes the transpose of A, x and b are */
/*  n-element vectors, and s is a scaling factor, usually less than */
/*  or equal to 1, chosen so that the components of x will be less than */
/*  the overflow threshold.  If the unscaled problem will not cause */
/*  overflow, the Level 2 BLAS routine STRSV is called.  If the matrix A */
/*  is singular (A(j,j) = 0 for some j), then s is set to 0 and a */
/*  non-trivial solution to A*x = 0 is returned. */

/*  Arguments */
/*  ========= */

/*  UPLO    (input) CHARACTER*1 */
/*          Specifies whether the matrix A is upper or lower triangular. */
/*          = 'U':  Upper triangular */
/*          = 'L':  Lower triangular */

/*  TRANS   (input) CHARACTER*1 */
/*          Specifies the operation applied to A. */
/*          = 'N':  Solve A * x = s*b  (No transpose) */
/*          = 'T':  Solve A'* x = s*b  (Transpose) */
/*          = 'C':  Solve A'* x = s*b  (Conjugate transpose = Transpose) */

/*  DIAG    (input) CHARACTER*1 */
/*          Specifies whether or not the matrix A is unit triangular. */
/*          = 'N':  Non-unit triangular */
/*          = 'U':  Unit triangular */

/*  NORMIN  (input) CHARACTER*1 */
/*          Specifies whether CNORM has been set or not. */
/*          = 'Y':  CNORM contains the column norms on entry */
/*          = 'N':  CNORM is not set on entry.  On exit, the norms will */
/*                  be computed and stored in CNORM. */

/*  N       (input) INTEGER */
/*          The order of the matrix A.  N >= 0. */

/*  A       (input) REAL array, dimension (LDA,N) */
/*          The triangular matrix A.  If UPLO = 'U', the leading n by n */
/*          upper triangular part of the array A contains the upper */
/*          triangular matrix, and the strictly lower triangular part of */
/*          A is not referenced.  If UPLO = 'L', the leading n by n lower */
/*          triangular part of the array A contains the lower triangular */
/*          matrix, and the strictly upper triangular part of A is not */
/*          referenced.  If DIAG = 'U', the diagonal elements of A are */
/*          also not referenced and are assumed to be 1. */

/*  LDA     (input) INTEGER */
/*          The leading dimension of the array A.  LDA >= max (1,N). */

/*  X       (input/output) REAL array, dimension (N) */
/*          On entry, the right hand side b of the triangular system. */
/*          On exit, X is overwritten by the solution vector x. */

/*  SCALE   (output) REAL */
/*          The scaling factor s for the triangular system */
/*             A * x = s*b  or  A'* x = s*b. */
/*          If SCALE = 0, the matrix A is singular or badly scaled, and */
/*          the vector x is an exact or approximate solution to A*x = 0. */

/*  CNORM   (input or output) REAL array, dimension (N) */

/*          If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */
/*          contains the norm of the off-diagonal part of the j-th column */
/*          of A.  If TRANS = 'N', CNORM(j) must be greater than or equal */
/*          to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */
/*          must be greater than or equal to the 1-norm. */

/*          If NORMIN = 'N', CNORM is an output argument and CNORM(j) */
/*          returns the 1-norm of the offdiagonal part of the j-th column */
/*          of A. */

/*  INFO    (output) INTEGER */
/*          = 0:  successful exit */
/*          < 0:  if INFO = -k, the k-th argument had an illegal value */

/*  Further Details */
/*  ======= ======= */

/*  A rough bound on x is computed; if that is less than overflow, STRSV */
/*  is called, otherwise, specific code is used which checks for possible */
/*  overflow or divide-by-zero at every operation. */

/*  A columnwise scheme is used for solving A*x = b.  The basic algorithm */
/*  if A is lower triangular is */

/*       x[1:n] := b[1:n] */
/*       for j = 1, ..., n */
/*            x(j) := x(j) / A(j,j) */
/*            x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j] */
/*       end */

/*  Define bounds on the components of x after j iterations of the loop: */
/*     M(j) = bound on x[1:j] */
/*     G(j) = bound on x[j+1:n] */
/*  Initially, let M(0) = 0 and G(0) = max{x(i), i=1,...,n}. */

/*  Then for iteration j+1 we have */
/*     M(j+1) <= G(j) / | A(j+1,j+1) | */
/*     G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] | */
/*            <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | ) */

/*  where CNORM(j+1) is greater than or equal to the infinity-norm of */
/*  column j+1 of A, not counting the diagonal.  Hence */

/*     G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | ) */
/*                  1<=i<=j */
/*  and */

/*     |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| ) */
/*                                   1<=i< j */

/*  Since |x(j)| <= M(j), we use the Level 2 BLAS routine STRSV if the */
/*  reciprocal of the largest M(j), j=1,..,n, is larger than */
/*  max(underflow, 1/overflow). */

/*  The bound on x(j) is also used to determine when a step in the */
/*  columnwise method can be performed without fear of overflow.  If */
/*  the computed bound is greater than a large constant, x is scaled to */
/*  prevent overflow, but if the bound overflows, x is set to 0, x(j) to */
/*  1, and scale to 0, and a non-trivial solution to A*x = 0 is found. */

/*  Similarly, a row-wise scheme is used to solve A'*x = b.  The basic */
/*  algorithm for A upper triangular is */

/*       for j = 1, ..., n */
/*            x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j) */
/*       end */

/*  We simultaneously compute two bounds */
/*       G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j */
/*       M(j) = bound on x(i), 1<=i<=j */

/*  The initial values are G(0) = 0, M(0) = max{b(i), i=1,..,n}, and we */
/*  add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1. */
/*  Then the bound on x(j) is */

/*       M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) | */

/*            <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| ) */
/*                      1<=i<=j */

/*  and we can safely call STRSV if 1/M(n) and 1/G(n) are both greater */
/*  than max(underflow, 1/overflow). */

/*  ===================================================================== */

/*     .. Parameters .. */
/*     .. */
/*     .. Local Scalars .. */
/*     .. */
/*     .. External Functions .. */
/*     .. */
/*     .. External Subroutines .. */
/*     .. */
/*     .. Intrinsic Functions .. */
/*     .. */
/*     .. Executable Statements .. */

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --x;
    --cnorm;

    /* Function Body */
    *info = 0;
    upper = lsame_(uplo, "U", (ftnlen)1, (ftnlen)1);
    notran = lsame_(trans, "N", (ftnlen)1, (ftnlen)1);
    nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);

/*     Test the input parameters. */

    if (! upper && ! lsame_(uplo, "L", (ftnlen)1, (ftnlen)1)) {
	*info = -1;
    } else if (! notran && ! lsame_(trans, "T", (ftnlen)1, (ftnlen)1) && ! 
	    lsame_(trans, "C", (ftnlen)1, (ftnlen)1)) {
	*info = -2;
    } else if (! nounit && ! lsame_(diag, "U", (ftnlen)1, (ftnlen)1)) {
	*info = -3;
    } else if (! lsame_(normin, "Y", (ftnlen)1, (ftnlen)1) && ! lsame_(normin,
	     "N", (ftnlen)1, (ftnlen)1)) {
	*info = -4;
    } else if (*n < 0) {
	*info = -5;
    } else if (*lda < max(1,*n)) {
	*info = -7;
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("SLATRS", &i__1, (ftnlen)6);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }

/*     Determine machine dependent parameters to control overflow. */

    smlnum = slamch_("Safe minimum", (ftnlen)12) / slamch_("Precision", (
	    ftnlen)9);
    bignum = 1.f / smlnum;
    *scale = 1.f;

    if (lsame_(normin, "N", (ftnlen)1, (ftnlen)1)) {

/*        Compute the 1-norm of each column, not including the diagonal. */

	if (upper) {

/*           A is upper triangular. */

	    i__1 = *n;
	    for (j = 1; j <= i__1; ++j) {
		i__2 = j - 1;
		cnorm[j] = sasum_(&i__2, &a[j * a_dim1 + 1], &c__1);
/* L10: */
	    }
	} else {

/*           A is lower triangular. */

	    i__1 = *n - 1;
	    for (j = 1; j <= i__1; ++j) {
		i__2 = *n - j;
		cnorm[j] = sasum_(&i__2, &a[j + 1 + j * a_dim1], &c__1);
/* L20: */
	    }
	    cnorm[*n] = 0.f;
	}
    }

/*     Scale the column norms by TSCAL if the maximum element in CNORM is */
/*     greater than BIGNUM. */

    imax = isamax_(n, &cnorm[1], &c__1);
    tmax = cnorm[imax];
    if (tmax <= bignum) {
	tscal = 1.f;
    } else {
	tscal = 1.f / (smlnum * tmax);
	sscal_(n, &tscal, &cnorm[1], &c__1);
    }

/*     Compute a bound on the computed solution vector to see if the */
/*     Level 2 BLAS routine STRSV can be used. */

    j = isamax_(n, &x[1], &c__1);
    xmax = (r__1 = x[j], dabs(r__1));
    xbnd = xmax;
    if (notran) {

/*        Compute the growth in A * x = b. */

	if (upper) {
	    jfirst = *n;
	    jlast = 1;
	    jinc = -1;
	} else {
	    jfirst = 1;
	    jlast = *n;
	    jinc = 1;
	}

	if (tscal != 1.f) {
	    grow = 0.f;
	    goto L50;
	}

	if (nounit) {

/*           A is non-unit triangular. */

/*           Compute GROW = 1/G(j) and XBND = 1/M(j). */
/*           Initially, G(0) = max{x(i), i=1,...,n}. */

	    grow = 1.f / dmax(xbnd,smlnum);
	    xbnd = grow;
	    i__1 = jlast;
	    i__2 = jinc;
	    for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {

/*              Exit the loop if the growth factor is too small. */

		if (grow <= smlnum) {
		    goto L50;
		}

/*              M(j) = G(j-1) / abs(A(j,j)) */

		tjj = (r__1 = a[j + j * a_dim1], dabs(r__1));
/* Computing MIN */
		r__1 = xbnd, r__2 = dmin(1.f,tjj) * grow;
		xbnd = dmin(r__1,r__2);
		if (tjj + cnorm[j] >= smlnum) {

/*                 G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */

		    grow *= tjj / (tjj + cnorm[j]);
		} else {

/*                 G(j) could overflow, set GROW to 0. */

		    grow = 0.f;
		}
/* L30: */
	    }
	    grow = xbnd;
	} else {

/*           A is unit triangular. */

/*           Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. */

/* Computing MIN */
	    r__1 = 1.f, r__2 = 1.f / dmax(xbnd,smlnum);
	    grow = dmin(r__1,r__2);
	    i__2 = jlast;
	    i__1 = jinc;
	    for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {

/*              Exit the loop if the growth factor is too small. */

		if (grow <= smlnum) {
		    goto L50;
		}

/*              G(j) = G(j-1)*( 1 + CNORM(j) ) */

		grow *= 1.f / (cnorm[j] + 1.f);
/* L40: */
	    }
	}
L50:

	;
    } else {

/*        Compute the growth in A' * x = b. */

	if (upper) {
	    jfirst = 1;
	    jlast = *n;
	    jinc = 1;
	} else {
	    jfirst = *n;
	    jlast = 1;
	    jinc = -1;
	}

	if (tscal != 1.f) {
	    grow = 0.f;
	    goto L80;
	}

	if (nounit) {

/*           A is non-unit triangular. */

/*           Compute GROW = 1/G(j) and XBND = 1/M(j). */
/*           Initially, M(0) = max{x(i), i=1,...,n}. */

	    grow = 1.f / dmax(xbnd,smlnum);
	    xbnd = grow;
	    i__1 = jlast;
	    i__2 = jinc;
	    for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {

/*              Exit the loop if the growth factor is too small. */

		if (grow <= smlnum) {
		    goto L80;
		}

/*              G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */

		xj = cnorm[j] + 1.f;
/* Computing MIN */
		r__1 = grow, r__2 = xbnd / xj;
		grow = dmin(r__1,r__2);

/*              M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */

		tjj = (r__1 = a[j + j * a_dim1], dabs(r__1));
		if (xj > tjj) {
		    xbnd *= tjj / xj;
		}
/* L60: */
	    }
	    grow = dmin(grow,xbnd);
	} else {

/*           A is unit triangular. */

/*           Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. */

/* Computing MIN */
	    r__1 = 1.f, r__2 = 1.f / dmax(xbnd,smlnum);
	    grow = dmin(r__1,r__2);
	    i__2 = jlast;
	    i__1 = jinc;
	    for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {

/*              Exit the loop if the growth factor is too small. */

		if (grow <= smlnum) {
		    goto L80;
		}

/*              G(j) = ( 1 + CNORM(j) )*G(j-1) */

		xj = cnorm[j] + 1.f;
		grow /= xj;
/* L70: */
	    }
	}
L80:
	;
    }

    if (grow * tscal > smlnum) {

/*        Use the Level 2 BLAS solve if the reciprocal of the bound on */
/*        elements of X is not too small. */

	strsv_(uplo, trans, diag, n, &a[a_offset], lda, &x[1], &c__1, (ftnlen)
		1, (ftnlen)1, (ftnlen)1);
    } else {

/*        Use a Level 1 BLAS solve, scaling intermediate results. */

	if (xmax > bignum) {

/*           Scale X so that its components are less than or equal to */
/*           BIGNUM in absolute value. */

	    *scale = bignum / xmax;
	    sscal_(n, scale, &x[1], &c__1);
	    xmax = bignum;
	}

	if (notran) {

/*           Solve A * x = b */

	    i__1 = jlast;
	    i__2 = jinc;
	    for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {

/*              Compute x(j) = b(j) / A(j,j), scaling x if necessary. */

		xj = (r__1 = x[j], dabs(r__1));
		if (nounit) {
		    tjjs = a[j + j * a_dim1] * tscal;
		} else {
		    tjjs = tscal;
		    if (tscal == 1.f) {
			goto L95;
		    }
		}
		tjj = dabs(tjjs);
		if (tjj > smlnum) {

/*                    abs(A(j,j)) > SMLNUM: */

		    if (tjj < 1.f) {
			if (xj > tjj * bignum) {

/*                          Scale x by 1/b(j). */

			    rec = 1.f / xj;
			    sscal_(n, &rec, &x[1], &c__1);
			    *scale *= rec;
			    xmax *= rec;
			}
		    }
		    x[j] /= tjjs;
		    xj = (r__1 = x[j], dabs(r__1));
		} else if (tjj > 0.f) {

/*                    0 < abs(A(j,j)) <= SMLNUM: */

		    if (xj > tjj * bignum) {

/*                       Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM */
/*                       to avoid overflow when dividing by A(j,j). */

			rec = tjj * bignum / xj;
			if (cnorm[j] > 1.f) {

/*                          Scale by 1/CNORM(j) to avoid overflow when */
/*                          multiplying x(j) times column j. */

			    rec /= cnorm[j];
			}
			sscal_(n, &rec, &x[1], &c__1);
			*scale *= rec;
			xmax *= rec;
		    }
		    x[j] /= tjjs;
		    xj = (r__1 = x[j], dabs(r__1));
		} else {

/*                    A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
/*                    scale = 0, and compute a solution to A*x = 0. */

		    i__3 = *n;
		    for (i__ = 1; i__ <= i__3; ++i__) {
			x[i__] = 0.f;
/* L90: */
		    }
		    x[j] = 1.f;
		    xj = 1.f;
		    *scale = 0.f;
		    xmax = 0.f;
		}
L95:

/*              Scale x if necessary to avoid overflow when adding a */
/*              multiple of column j of A. */

		if (xj > 1.f) {
		    rec = 1.f / xj;
		    if (cnorm[j] > (bignum - xmax) * rec) {

/*                    Scale x by 1/(2*abs(x(j))). */

			rec *= .5f;
			sscal_(n, &rec, &x[1], &c__1);
			*scale *= rec;
		    }
		} else if (xj * cnorm[j] > bignum - xmax) {

/*                 Scale x by 1/2. */

		    sscal_(n, &c_b36, &x[1], &c__1);
		    *scale *= .5f;
		}

		if (upper) {
		    if (j > 1) {

/*                    Compute the update */
/*                       x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */

			i__3 = j - 1;
			r__1 = -x[j] * tscal;
			saxpy_(&i__3, &r__1, &a[j * a_dim1 + 1], &c__1, &x[1],
				 &c__1);
			i__3 = j - 1;
			i__ = isamax_(&i__3, &x[1], &c__1);
			xmax = (r__1 = x[i__], dabs(r__1));
		    }
		} else {
		    if (j < *n) {

/*                    Compute the update */
/*                       x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */

			i__3 = *n - j;
			r__1 = -x[j] * tscal;
			saxpy_(&i__3, &r__1, &a[j + 1 + j * a_dim1], &c__1, &
				x[j + 1], &c__1);
			i__3 = *n - j;
			i__ = j + isamax_(&i__3, &x[j + 1], &c__1);
			xmax = (r__1 = x[i__], dabs(r__1));
		    }
		}
/* L100: */
	    }

	} else {

/*           Solve A' * x = b */

	    i__2 = jlast;
	    i__1 = jinc;
	    for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {

/*              Compute x(j) = b(j) - sum A(k,j)*x(k). */
/*                                    k<>j */

		xj = (r__1 = x[j], dabs(r__1));
		uscal = tscal;
		rec = 1.f / dmax(xmax,1.f);
		if (cnorm[j] > (bignum - xj) * rec) {

/*                 If x(j) could overflow, scale x by 1/(2*XMAX). */

		    rec *= .5f;
		    if (nounit) {
			tjjs = a[j + j * a_dim1] * tscal;
		    } else {
			tjjs = tscal;
		    }
		    tjj = dabs(tjjs);
		    if (tjj > 1.f) {

/*                       Divide by A(j,j) when scaling x if A(j,j) > 1. */

/* Computing MIN */
			r__1 = 1.f, r__2 = rec * tjj;
			rec = dmin(r__1,r__2);
			uscal /= tjjs;
		    }
		    if (rec < 1.f) {
			sscal_(n, &rec, &x[1], &c__1);
			*scale *= rec;
			xmax *= rec;
		    }
		}

		sumj = 0.f;
		if (uscal == 1.f) {

/*                 If the scaling needed for A in the dot product is 1, */
/*                 call SDOT to perform the dot product. */

		    if (upper) {
			i__3 = j - 1;
			sumj = sdot_(&i__3, &a[j * a_dim1 + 1], &c__1, &x[1], 
				&c__1);
		    } else if (j < *n) {
			i__3 = *n - j;
			sumj = sdot_(&i__3, &a[j + 1 + j * a_dim1], &c__1, &x[
				j + 1], &c__1);
		    }
		} else {

/*                 Otherwise, use in-line code for the dot product. */

		    if (upper) {
			i__3 = j - 1;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    sumj += a[i__ + j * a_dim1] * uscal * x[i__];
/* L110: */
			}
		    } else if (j < *n) {
			i__3 = *n;
			for (i__ = j + 1; i__ <= i__3; ++i__) {
			    sumj += a[i__ + j * a_dim1] * uscal * x[i__];
/* L120: */
			}
		    }
		}

		if (uscal == tscal) {

/*                 Compute x(j) := ( x(j) - sumj ) / A(j,j) if 1/A(j,j) */
/*                 was not used to scale the dotproduct. */

		    x[j] -= sumj;
		    xj = (r__1 = x[j], dabs(r__1));
		    if (nounit) {
			tjjs = a[j + j * a_dim1] * tscal;
		    } else {
			tjjs = tscal;
			if (tscal == 1.f) {
			    goto L135;
			}
		    }

/*                    Compute x(j) = x(j) / A(j,j), scaling if necessary. */

		    tjj = dabs(tjjs);
		    if (tjj > smlnum) {

/*                       abs(A(j,j)) > SMLNUM: */

			if (tjj < 1.f) {
			    if (xj > tjj * bignum) {

/*                             Scale X by 1/abs(x(j)). */

				rec = 1.f / xj;
				sscal_(n, &rec, &x[1], &c__1);
				*scale *= rec;
				xmax *= rec;
			    }
			}
			x[j] /= tjjs;
		    } else if (tjj > 0.f) {

/*                       0 < abs(A(j,j)) <= SMLNUM: */

			if (xj > tjj * bignum) {

/*                          Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */

			    rec = tjj * bignum / xj;
			    sscal_(n, &rec, &x[1], &c__1);
			    *scale *= rec;
			    xmax *= rec;
			}
			x[j] /= tjjs;
		    } else {

/*                       A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
/*                       scale = 0, and compute a solution to A'*x = 0. */

			i__3 = *n;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    x[i__] = 0.f;
/* L130: */
			}
			x[j] = 1.f;
			*scale = 0.f;
			xmax = 0.f;
		    }
L135:
		    ;
		} else {

/*                 Compute x(j) := x(j) / A(j,j)  - sumj if the dot */
/*                 product has already been divided by 1/A(j,j). */

		    x[j] = x[j] / tjjs - sumj;
		}
/* Computing MAX */
		r__2 = xmax, r__3 = (r__1 = x[j], dabs(r__1));
		xmax = dmax(r__2,r__3);
/* L140: */
	    }
	}
	*scale /= tscal;
    }

/*     Scale the column norms by 1/TSCAL for return. */

    if (tscal != 1.f) {
	r__1 = 1.f / tscal;
	sscal_(n, &r__1, &cnorm[1], &c__1);
    }

    return 0;

/*     End of SLATRS */

} /* slatrs_ */
Exemple #15
0
/* Subroutine */
int slatrs_(char *uplo, char *trans, char *diag, char * normin, integer *n, real *a, integer *lda, real *x, real *scale, real *cnorm, integer *info)
{
    /* System generated locals */
    integer a_dim1, a_offset, i__1, i__2, i__3;
    real r__1, r__2, r__3;
    /* Local variables */
    integer i__, j;
    real xj, rec, tjj;
    integer jinc;
    real xbnd;
    integer imax;
    real tmax, tjjs;
    extern real sdot_(integer *, real *, integer *, real *, integer *);
    real xmax, grow, sumj;
    extern logical lsame_(char *, char *);
    extern /* Subroutine */
    int sscal_(integer *, real *, real *, integer *);
    real tscal, uscal;
    integer jlast;
    extern real sasum_(integer *, real *, integer *);
    logical upper;
    extern /* Subroutine */
    int saxpy_(integer *, real *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real *, integer *);
    extern real slamch_(char *);
    extern /* Subroutine */
    int xerbla_(char *, integer *);
    real bignum;
    extern integer isamax_(integer *, real *, integer *);
    logical notran;
    integer jfirst;
    real smlnum;
    logical nounit;
    /* -- LAPACK auxiliary routine (version 3.4.2) -- */
    /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */
    /* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
    /* September 2012 */
    /* .. Scalar Arguments .. */
    /* .. */
    /* .. Array Arguments .. */
    /* .. */
    /* ===================================================================== */
    /* .. Parameters .. */
    /* .. */
    /* .. Local Scalars .. */
    /* .. */
    /* .. External Functions .. */
    /* .. */
    /* .. External Subroutines .. */
    /* .. */
    /* .. Intrinsic Functions .. */
    /* .. */
    /* .. Executable Statements .. */
    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --x;
    --cnorm;
    /* Function Body */
    *info = 0;
    upper = lsame_(uplo, "U");
    notran = lsame_(trans, "N");
    nounit = lsame_(diag, "N");
    /* Test the input parameters. */
    if (! upper && ! lsame_(uplo, "L"))
    {
        *info = -1;
    }
    else if (! notran && ! lsame_(trans, "T") && ! lsame_(trans, "C"))
    {
        *info = -2;
    }
    else if (! nounit && ! lsame_(diag, "U"))
    {
        *info = -3;
    }
    else if (! lsame_(normin, "Y") && ! lsame_(normin, "N"))
    {
        *info = -4;
    }
    else if (*n < 0)
    {
        *info = -5;
    }
    else if (*lda < max(1,*n))
    {
        *info = -7;
    }
    if (*info != 0)
    {
        i__1 = -(*info);
        xerbla_("SLATRS", &i__1);
        return 0;
    }
    /* Quick return if possible */
    if (*n == 0)
    {
        return 0;
    }
    /* Determine machine dependent parameters to control overflow. */
    smlnum = slamch_("Safe minimum") / slamch_("Precision");
    bignum = 1.f / smlnum;
    *scale = 1.f;
    if (lsame_(normin, "N"))
    {
        /* Compute the 1-norm of each column, not including the diagonal. */
        if (upper)
        {
            /* A is upper triangular. */
            i__1 = *n;
            for (j = 1;
                    j <= i__1;
                    ++j)
            {
                i__2 = j - 1;
                cnorm[j] = sasum_(&i__2, &a[j * a_dim1 + 1], &c__1);
                /* L10: */
            }
        }
        else
        {
            /* A is lower triangular. */
            i__1 = *n - 1;
            for (j = 1;
                    j <= i__1;
                    ++j)
            {
                i__2 = *n - j;
                cnorm[j] = sasum_(&i__2, &a[j + 1 + j * a_dim1], &c__1);
                /* L20: */
            }
            cnorm[*n] = 0.f;
        }
    }
    /* Scale the column norms by TSCAL if the maximum element in CNORM is */
    /* greater than BIGNUM. */
    imax = isamax_(n, &cnorm[1], &c__1);
    tmax = cnorm[imax];
    if (tmax <= bignum)
    {
        tscal = 1.f;
    }
    else
    {
        tscal = 1.f / (smlnum * tmax);
        sscal_(n, &tscal, &cnorm[1], &c__1);
    }
    /* Compute a bound on the computed solution vector to see if the */
    /* Level 2 BLAS routine STRSV can be used. */
    j = isamax_(n, &x[1], &c__1);
    xmax = (r__1 = x[j], f2c_abs(r__1));
    xbnd = xmax;
    if (notran)
    {
        /* Compute the growth in A * x = b. */
        if (upper)
        {
            jfirst = *n;
            jlast = 1;
            jinc = -1;
        }
        else
        {
            jfirst = 1;
            jlast = *n;
            jinc = 1;
        }
        if (tscal != 1.f)
        {
            grow = 0.f;
            goto L50;
        }
        if (nounit)
        {
            /* A is non-unit triangular. */
            /* Compute GROW = 1/G(j) and XBND = 1/M(j). */
            /* Initially, G(0) = max{
            x(i), i=1,...,n}
            . */
            grow = 1.f / max(xbnd,smlnum);
            xbnd = grow;
            i__1 = jlast;
            i__2 = jinc;
            for (j = jfirst;
                    i__2 < 0 ? j >= i__1 : j <= i__1;
                    j += i__2)
            {
                /* Exit the loop if the growth factor is too small. */
                if (grow <= smlnum)
                {
                    goto L50;
                }
                /* M(j) = G(j-1) / f2c_abs(A(j,j)) */
                tjj = (r__1 = a[j + j * a_dim1], f2c_abs(r__1));
                /* Computing MIN */
                r__1 = xbnd;
                r__2 = min(1.f,tjj) * grow; // , expr subst
                xbnd = min(r__1,r__2);
                if (tjj + cnorm[j] >= smlnum)
                {
                    /* G(j) = G(j-1)*( 1 + CNORM(j) / f2c_abs(A(j,j)) ) */
                    grow *= tjj / (tjj + cnorm[j]);
                }
                else
                {
                    /* G(j) could overflow, set GROW to 0. */
                    grow = 0.f;
                }
                /* L30: */
            }
            grow = xbnd;
        }
        else
        {
            /* A is unit triangular. */
            /* Compute GROW = 1/G(j), where G(0) = max{
            x(i), i=1,...,n}
            . */
            /* Computing MIN */
            r__1 = 1.f;
            r__2 = 1.f / max(xbnd,smlnum); // , expr subst
            grow = min(r__1,r__2);
            i__2 = jlast;
            i__1 = jinc;
            for (j = jfirst;
                    i__1 < 0 ? j >= i__2 : j <= i__2;
                    j += i__1)
            {
                /* Exit the loop if the growth factor is too small. */
                if (grow <= smlnum)
                {
                    goto L50;
                }
                /* G(j) = G(j-1)*( 1 + CNORM(j) ) */
                grow *= 1.f / (cnorm[j] + 1.f);
                /* L40: */
            }
        }
L50:
        ;
    }
    else
    {
        /* Compute the growth in A**T * x = b. */
        if (upper)
        {
            jfirst = 1;
            jlast = *n;
            jinc = 1;
        }
        else
        {
            jfirst = *n;
            jlast = 1;
            jinc = -1;
        }
        if (tscal != 1.f)
        {
            grow = 0.f;
            goto L80;
        }
        if (nounit)
        {
            /* A is non-unit triangular. */
            /* Compute GROW = 1/G(j) and XBND = 1/M(j). */
            /* Initially, M(0) = max{
            x(i), i=1,...,n}
            . */
            grow = 1.f / max(xbnd,smlnum);
            xbnd = grow;
            i__1 = jlast;
            i__2 = jinc;
            for (j = jfirst;
                    i__2 < 0 ? j >= i__1 : j <= i__1;
                    j += i__2)
            {
                /* Exit the loop if the growth factor is too small. */
                if (grow <= smlnum)
                {
                    goto L80;
                }
                /* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */
                xj = cnorm[j] + 1.f;
                /* Computing MIN */
                r__1 = grow;
                r__2 = xbnd / xj; // , expr subst
                grow = min(r__1,r__2);
                /* M(j) = M(j-1)*( 1 + CNORM(j) ) / f2c_abs(A(j,j)) */
                tjj = (r__1 = a[j + j * a_dim1], f2c_abs(r__1));
                if (xj > tjj)
                {
                    xbnd *= tjj / xj;
                }
                /* L60: */
            }
            grow = min(grow,xbnd);
        }
        else
        {
            /* A is unit triangular. */
            /* Compute GROW = 1/G(j), where G(0) = max{
            x(i), i=1,...,n}
            . */
            /* Computing MIN */
            r__1 = 1.f;
            r__2 = 1.f / max(xbnd,smlnum); // , expr subst
            grow = min(r__1,r__2);
            i__2 = jlast;
            i__1 = jinc;
            for (j = jfirst;
                    i__1 < 0 ? j >= i__2 : j <= i__2;
                    j += i__1)
            {
                /* Exit the loop if the growth factor is too small. */
                if (grow <= smlnum)
                {
                    goto L80;
                }
                /* G(j) = ( 1 + CNORM(j) )*G(j-1) */
                xj = cnorm[j] + 1.f;
                grow /= xj;
                /* L70: */
            }
        }
L80:
        ;
    }
    if (grow * tscal > smlnum)
    {
        /* Use the Level 2 BLAS solve if the reciprocal of the bound on */
        /* elements of X is not too small. */
        strsv_(uplo, trans, diag, n, &a[a_offset], lda, &x[1], &c__1);
    }
    else
    {
        /* Use a Level 1 BLAS solve, scaling intermediate results. */
        if (xmax > bignum)
        {
            /* Scale X so that its components are less than or equal to */
            /* BIGNUM in absolute value. */
            *scale = bignum / xmax;
            sscal_(n, scale, &x[1], &c__1);
            xmax = bignum;
        }
        if (notran)
        {
            /* Solve A * x = b */
            i__1 = jlast;
            i__2 = jinc;
            for (j = jfirst;
                    i__2 < 0 ? j >= i__1 : j <= i__1;
                    j += i__2)
            {
                /* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */
                xj = (r__1 = x[j], f2c_abs(r__1));
                if (nounit)
                {
                    tjjs = a[j + j * a_dim1] * tscal;
                }
                else
                {
                    tjjs = tscal;
                    if (tscal == 1.f)
                    {
                        goto L95;
                    }
                }
                tjj = f2c_abs(tjjs);
                if (tjj > smlnum)
                {
                    /* f2c_abs(A(j,j)) > SMLNUM: */
                    if (tjj < 1.f)
                    {
                        if (xj > tjj * bignum)
                        {
                            /* Scale x by 1/b(j). */
                            rec = 1.f / xj;
                            sscal_(n, &rec, &x[1], &c__1);
                            *scale *= rec;
                            xmax *= rec;
                        }
                    }
                    x[j] /= tjjs;
                    xj = (r__1 = x[j], f2c_abs(r__1));
                }
                else if (tjj > 0.f)
                {
                    /* 0 < f2c_abs(A(j,j)) <= SMLNUM: */
                    if (xj > tjj * bignum)
                    {
                        /* Scale x by (1/f2c_abs(x(j)))*f2c_abs(A(j,j))*BIGNUM */
                        /* to avoid overflow when dividing by A(j,j). */
                        rec = tjj * bignum / xj;
                        if (cnorm[j] > 1.f)
                        {
                            /* Scale by 1/CNORM(j) to avoid overflow when */
                            /* multiplying x(j) times column j. */
                            rec /= cnorm[j];
                        }
                        sscal_(n, &rec, &x[1], &c__1);
                        *scale *= rec;
                        xmax *= rec;
                    }
                    x[j] /= tjjs;
                    xj = (r__1 = x[j], f2c_abs(r__1));
                }
                else
                {
                    /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */
                    /* scale = 0, and compute a solution to A*x = 0. */
                    i__3 = *n;
                    for (i__ = 1;
                            i__ <= i__3;
                            ++i__)
                    {
                        x[i__] = 0.f;
                        /* L90: */
                    }
                    x[j] = 1.f;
                    xj = 1.f;
                    *scale = 0.f;
                    xmax = 0.f;
                }
L95: /* Scale x if necessary to avoid overflow when adding a */
                /* multiple of column j of A. */
                if (xj > 1.f)
                {
                    rec = 1.f / xj;
                    if (cnorm[j] > (bignum - xmax) * rec)
                    {
                        /* Scale x by 1/(2*f2c_abs(x(j))). */
                        rec *= .5f;
                        sscal_(n, &rec, &x[1], &c__1);
                        *scale *= rec;
                    }
                }
                else if (xj * cnorm[j] > bignum - xmax)
                {
                    /* Scale x by 1/2. */
                    sscal_(n, &c_b36, &x[1], &c__1);
                    *scale *= .5f;
                }
                if (upper)
                {
                    if (j > 1)
                    {
                        /* Compute the update */
                        /* x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */
                        i__3 = j - 1;
                        r__1 = -x[j] * tscal;
                        saxpy_(&i__3, &r__1, &a[j * a_dim1 + 1], &c__1, &x[1], &c__1);
                        i__3 = j - 1;
                        i__ = isamax_(&i__3, &x[1], &c__1);
                        xmax = (r__1 = x[i__], f2c_abs(r__1));
                    }
                }
                else
                {
                    if (j < *n)
                    {
                        /* Compute the update */
                        /* x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */
                        i__3 = *n - j;
                        r__1 = -x[j] * tscal;
                        saxpy_(&i__3, &r__1, &a[j + 1 + j * a_dim1], &c__1, & x[j + 1], &c__1);
                        i__3 = *n - j;
                        i__ = j + isamax_(&i__3, &x[j + 1], &c__1);
                        xmax = (r__1 = x[i__], f2c_abs(r__1));
                    }
                }
                /* L100: */
            }
        }
        else
        {
            /* Solve A**T * x = b */
            i__2 = jlast;
            i__1 = jinc;
            for (j = jfirst;
                    i__1 < 0 ? j >= i__2 : j <= i__2;
                    j += i__1)
            {
                /* Compute x(j) = b(j) - sum A(k,j)*x(k). */
                /* k<>j */
                xj = (r__1 = x[j], f2c_abs(r__1));
                uscal = tscal;
                rec = 1.f / max(xmax,1.f);
                if (cnorm[j] > (bignum - xj) * rec)
                {
                    /* If x(j) could overflow, scale x by 1/(2*XMAX). */
                    rec *= .5f;
                    if (nounit)
                    {
                        tjjs = a[j + j * a_dim1] * tscal;
                    }
                    else
                    {
                        tjjs = tscal;
                    }
                    tjj = f2c_abs(tjjs);
                    if (tjj > 1.f)
                    {
                        /* Divide by A(j,j) when scaling x if A(j,j) > 1. */
                        /* Computing MIN */
                        r__1 = 1.f;
                        r__2 = rec * tjj; // , expr subst
                        rec = min(r__1,r__2);
                        uscal /= tjjs;
                    }
                    if (rec < 1.f)
                    {
                        sscal_(n, &rec, &x[1], &c__1);
                        *scale *= rec;
                        xmax *= rec;
                    }
                }
                sumj = 0.f;
                if (uscal == 1.f)
                {
                    /* If the scaling needed for A in the dot product is 1, */
                    /* call SDOT to perform the dot product. */
                    if (upper)
                    {
                        i__3 = j - 1;
                        sumj = sdot_(&i__3, &a[j * a_dim1 + 1], &c__1, &x[1], &c__1);
                    }
                    else if (j < *n)
                    {
                        i__3 = *n - j;
                        sumj = sdot_(&i__3, &a[j + 1 + j * a_dim1], &c__1, &x[ j + 1], &c__1);
                    }
                }
                else
                {
                    /* Otherwise, use in-line code for the dot product. */
                    if (upper)
                    {
                        i__3 = j - 1;
                        for (i__ = 1;
                                i__ <= i__3;
                                ++i__)
                        {
                            sumj += a[i__ + j * a_dim1] * uscal * x[i__];
                            /* L110: */
                        }
                    }
                    else if (j < *n)
                    {
                        i__3 = *n;
                        for (i__ = j + 1;
                                i__ <= i__3;
                                ++i__)
                        {
                            sumj += a[i__ + j * a_dim1] * uscal * x[i__];
                            /* L120: */
                        }
                    }
                }
                if (uscal == tscal)
                {
                    /* Compute x(j) := ( x(j) - sumj ) / A(j,j) if 1/A(j,j) */
                    /* was not used to scale the dotproduct. */
                    x[j] -= sumj;
                    xj = (r__1 = x[j], f2c_abs(r__1));
                    if (nounit)
                    {
                        tjjs = a[j + j * a_dim1] * tscal;
                    }
                    else
                    {
                        tjjs = tscal;
                        if (tscal == 1.f)
                        {
                            goto L135;
                        }
                    }
                    /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
                    tjj = f2c_abs(tjjs);
                    if (tjj > smlnum)
                    {
                        /* f2c_abs(A(j,j)) > SMLNUM: */
                        if (tjj < 1.f)
                        {
                            if (xj > tjj * bignum)
                            {
                                /* Scale X by 1/f2c_abs(x(j)). */
                                rec = 1.f / xj;
                                sscal_(n, &rec, &x[1], &c__1);
                                *scale *= rec;
                                xmax *= rec;
                            }
                        }
                        x[j] /= tjjs;
                    }
                    else if (tjj > 0.f)
                    {
                        /* 0 < f2c_abs(A(j,j)) <= SMLNUM: */
                        if (xj > tjj * bignum)
                        {
                            /* Scale x by (1/f2c_abs(x(j)))*f2c_abs(A(j,j))*BIGNUM. */
                            rec = tjj * bignum / xj;
                            sscal_(n, &rec, &x[1], &c__1);
                            *scale *= rec;
                            xmax *= rec;
                        }
                        x[j] /= tjjs;
                    }
                    else
                    {
                        /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */
                        /* scale = 0, and compute a solution to A**T*x = 0. */
                        i__3 = *n;
                        for (i__ = 1;
                                i__ <= i__3;
                                ++i__)
                        {
                            x[i__] = 0.f;
                            /* L130: */
                        }
                        x[j] = 1.f;
                        *scale = 0.f;
                        xmax = 0.f;
                    }
L135:
                    ;
                }
                else
                {
                    /* Compute x(j) := x(j) / A(j,j) - sumj if the dot */
                    /* product has already been divided by 1/A(j,j). */
                    x[j] = x[j] / tjjs - sumj;
                }
                /* Computing MAX */
                r__2 = xmax;
                r__3 = (r__1 = x[j], f2c_abs(r__1)); // , expr subst
                xmax = max(r__2,r__3);
                /* L140: */
            }
        }
        *scale /= tscal;
    }
    /* Scale the column norms by 1/TSCAL for return. */
    if (tscal != 1.f)
    {
        r__1 = 1.f / tscal;
        sscal_(n, &r__1, &cnorm[1], &c__1);
    }
    return 0;
    /* End of SLATRS */
}
/* Subroutine */ int strrfs_(char *uplo, char *trans, char *diag, integer *n, 
	integer *nrhs, real *a, integer *lda, real *b, integer *ldb, real *x, 
	integer *ldx, real *ferr, real *berr, real *work, integer *iwork, 
	integer *info)
{
/*  -- LAPACK routine (version 3.0) --   
       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
       Courant Institute, Argonne National Lab, and Rice University   
       September 30, 1994   


    Purpose   
    =======   

    STRRFS provides error bounds and backward error estimates for the   
    solution to a system of linear equations with a triangular   
    coefficient matrix.   

    The solution matrix X must be computed by STRTRS or some other   
    means before entering this routine.  STRRFS does not do iterative   
    refinement because doing so cannot improve the backward error.   

    Arguments   
    =========   

    UPLO    (input) CHARACTER*1   
            = 'U':  A is upper triangular;   
            = 'L':  A is lower triangular.   

    TRANS   (input) CHARACTER*1   
            Specifies the form of the system of equations:   
            = 'N':  A * X = B  (No transpose)   
            = 'T':  A**T * X = B  (Transpose)   
            = 'C':  A**H * X = B  (Conjugate transpose = Transpose)   

    DIAG    (input) CHARACTER*1   
            = 'N':  A is non-unit triangular;   
            = 'U':  A is unit triangular.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    NRHS    (input) INTEGER   
            The number of right hand sides, i.e., the number of columns   
            of the matrices B and X.  NRHS >= 0.   

    A       (input) REAL array, dimension (LDA,N)   
            The triangular matrix A.  If UPLO = 'U', the leading N-by-N   
            upper triangular part of the array A contains the upper   
            triangular matrix, and the strictly lower triangular part of   
            A is not referenced.  If UPLO = 'L', the leading N-by-N lower   
            triangular part of the array A contains the lower triangular   
            matrix, and the strictly upper triangular part of A is not   
            referenced.  If DIAG = 'U', the diagonal elements of A are   
            also not referenced and are assumed to be 1.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    B       (input) REAL array, dimension (LDB,NRHS)   
            The right hand side matrix B.   

    LDB     (input) INTEGER   
            The leading dimension of the array B.  LDB >= max(1,N).   

    X       (input) REAL array, dimension (LDX,NRHS)   
            The solution matrix X.   

    LDX     (input) INTEGER   
            The leading dimension of the array X.  LDX >= max(1,N).   

    FERR    (output) REAL array, dimension (NRHS)   
            The estimated forward error bound for each solution vector   
            X(j) (the j-th column of the solution matrix X).   
            If XTRUE is the true solution corresponding to X(j), FERR(j)   
            is an estimated upper bound for the magnitude of the largest   
            element in (X(j) - XTRUE) divided by the magnitude of the   
            largest element in X(j).  The estimate is as reliable as   
            the estimate for RCOND, and is almost always a slight   
            overestimate of the true error.   

    BERR    (output) REAL array, dimension (NRHS)   
            The componentwise relative backward error of each solution   
            vector X(j) (i.e., the smallest relative change in   
            any element of A or B that makes X(j) an exact solution).   

    WORK    (workspace) REAL array, dimension (3*N)   

    IWORK   (workspace) INTEGER array, dimension (N)   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   

    =====================================================================   


       Test the input parameters.   

       Parameter adjustments */
    /* Table of constant values */
    static integer c__1 = 1;
    static real c_b19 = -1.f;
    
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, 
	    i__3;
    real r__1, r__2, r__3;
    /* Local variables */
    static integer kase;
    static real safe1, safe2;
    static integer i__, j, k;
    static real s;
    extern logical lsame_(char *, char *);
    static logical upper;
    extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, 
	    integer *), saxpy_(integer *, real *, real *, integer *, real *, 
	    integer *), strmv_(char *, char *, char *, integer *, real *, 
	    integer *, real *, integer *), strsv_(
	    char *, char *, char *, integer *, real *, integer *, real *, 
	    integer *);
    static real xk;
    extern doublereal slamch_(char *);
    static integer nz;
    static real safmin;
    extern /* Subroutine */ int xerbla_(char *, integer *), slacon_(
	    integer *, real *, real *, integer *, real *, integer *);
    static logical notran;
    static char transt[1];
    static logical nounit;
    static real lstres, eps;
#define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
#define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1]
#define x_ref(a_1,a_2) x[(a_2)*x_dim1 + a_1]


    a_dim1 = *lda;
    a_offset = 1 + a_dim1 * 1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1 * 1;
    b -= b_offset;
    x_dim1 = *ldx;
    x_offset = 1 + x_dim1 * 1;
    x -= x_offset;
    --ferr;
    --berr;
    --work;
    --iwork;

    /* Function Body */
    *info = 0;
    upper = lsame_(uplo, "U");
    notran = lsame_(trans, "N");
    nounit = lsame_(diag, "N");

    if (! upper && ! lsame_(uplo, "L")) {
	*info = -1;
    } else if (! notran && ! lsame_(trans, "T") && ! 
	    lsame_(trans, "C")) {
	*info = -2;
    } else if (! nounit && ! lsame_(diag, "U")) {
	*info = -3;
    } else if (*n < 0) {
	*info = -4;
    } else if (*nrhs < 0) {
	*info = -5;
    } else if (*lda < max(1,*n)) {
	*info = -7;
    } else if (*ldb < max(1,*n)) {
	*info = -9;
    } else if (*ldx < max(1,*n)) {
	*info = -11;
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("STRRFS", &i__1);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0 || *nrhs == 0) {
	i__1 = *nrhs;
	for (j = 1; j <= i__1; ++j) {
	    ferr[j] = 0.f;
	    berr[j] = 0.f;
/* L10: */
	}
	return 0;
    }

    if (notran) {
	*(unsigned char *)transt = 'T';
    } else {
	*(unsigned char *)transt = 'N';
    }

/*     NZ = maximum number of nonzero elements in each row of A, plus 1 */

    nz = *n + 1;
    eps = slamch_("Epsilon");
    safmin = slamch_("Safe minimum");
    safe1 = nz * safmin;
    safe2 = safe1 / eps;

/*     Do for each right hand side */

    i__1 = *nrhs;
    for (j = 1; j <= i__1; ++j) {

/*        Compute residual R = B - op(A) * X,   
          where op(A) = A or A', depending on TRANS. */

	scopy_(n, &x_ref(1, j), &c__1, &work[*n + 1], &c__1);
	strmv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1);
	saxpy_(n, &c_b19, &b_ref(1, j), &c__1, &work[*n + 1], &c__1);

/*        Compute componentwise relative backward error from formula   

          max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) )   

          where abs(Z) is the componentwise absolute value of the matrix   
          or vector Z.  If the i-th component of the denominator is less   
          than SAFE2, then SAFE1 is added to the i-th components of the   
          numerator and denominator before dividing. */

	i__2 = *n;
	for (i__ = 1; i__ <= i__2; ++i__) {
	    work[i__] = (r__1 = b_ref(i__, j), dabs(r__1));
/* L20: */
	}

	if (notran) {

/*           Compute abs(A)*abs(X) + abs(B). */

	    if (upper) {
		if (nounit) {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			xk = (r__1 = x_ref(k, j), dabs(r__1));
			i__3 = k;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    work[i__] += (r__1 = a_ref(i__, k), dabs(r__1)) * 
				    xk;
/* L30: */
			}
/* L40: */
		    }
		} else {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			xk = (r__1 = x_ref(k, j), dabs(r__1));
			i__3 = k - 1;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    work[i__] += (r__1 = a_ref(i__, k), dabs(r__1)) * 
				    xk;
/* L50: */
			}
			work[k] += xk;
/* L60: */
		    }
		}
	    } else {
		if (nounit) {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			xk = (r__1 = x_ref(k, j), dabs(r__1));
			i__3 = *n;
			for (i__ = k; i__ <= i__3; ++i__) {
			    work[i__] += (r__1 = a_ref(i__, k), dabs(r__1)) * 
				    xk;
/* L70: */
			}
/* L80: */
		    }
		} else {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			xk = (r__1 = x_ref(k, j), dabs(r__1));
			i__3 = *n;
			for (i__ = k + 1; i__ <= i__3; ++i__) {
			    work[i__] += (r__1 = a_ref(i__, k), dabs(r__1)) * 
				    xk;
/* L90: */
			}
			work[k] += xk;
/* L100: */
		    }
		}
	    }
	} else {

/*           Compute abs(A')*abs(X) + abs(B). */

	    if (upper) {
		if (nounit) {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			s = 0.f;
			i__3 = k;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    s += (r__1 = a_ref(i__, k), dabs(r__1)) * (r__2 = 
				    x_ref(i__, j), dabs(r__2));
/* L110: */
			}
			work[k] += s;
/* L120: */
		    }
		} else {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			s = (r__1 = x_ref(k, j), dabs(r__1));
			i__3 = k - 1;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    s += (r__1 = a_ref(i__, k), dabs(r__1)) * (r__2 = 
				    x_ref(i__, j), dabs(r__2));
/* L130: */
			}
			work[k] += s;
/* L140: */
		    }
		}
	    } else {
		if (nounit) {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			s = 0.f;
			i__3 = *n;
			for (i__ = k; i__ <= i__3; ++i__) {
			    s += (r__1 = a_ref(i__, k), dabs(r__1)) * (r__2 = 
				    x_ref(i__, j), dabs(r__2));
/* L150: */
			}
			work[k] += s;
/* L160: */
		    }
		} else {
		    i__2 = *n;
		    for (k = 1; k <= i__2; ++k) {
			s = (r__1 = x_ref(k, j), dabs(r__1));
			i__3 = *n;
			for (i__ = k + 1; i__ <= i__3; ++i__) {
			    s += (r__1 = a_ref(i__, k), dabs(r__1)) * (r__2 = 
				    x_ref(i__, j), dabs(r__2));
/* L170: */
			}
			work[k] += s;
/* L180: */
		    }
		}
	    }
	}
	s = 0.f;
	i__2 = *n;
	for (i__ = 1; i__ <= i__2; ++i__) {
	    if (work[i__] > safe2) {
/* Computing MAX */
		r__2 = s, r__3 = (r__1 = work[*n + i__], dabs(r__1)) / work[
			i__];
		s = dmax(r__2,r__3);
	    } else {
/* Computing MAX */
		r__2 = s, r__3 = ((r__1 = work[*n + i__], dabs(r__1)) + safe1)
			 / (work[i__] + safe1);
		s = dmax(r__2,r__3);
	    }
/* L190: */
	}
	berr[j] = s;

/*        Bound error from formula   

          norm(X - XTRUE) / norm(X) .le. FERR =   
          norm( abs(inv(op(A)))*   
             ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X)   

          where   
            norm(Z) is the magnitude of the largest component of Z   
            inv(op(A)) is the inverse of op(A)   
            abs(Z) is the componentwise absolute value of the matrix or   
               vector Z   
            NZ is the maximum number of nonzeros in any row of A, plus 1   
            EPS is machine epsilon   

          The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B))   
          is incremented by SAFE1 if the i-th component of   
          abs(op(A))*abs(X) + abs(B) is less than SAFE2.   

          Use SLACON to estimate the infinity-norm of the matrix   
             inv(op(A)) * diag(W),   
          where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */

	i__2 = *n;
	for (i__ = 1; i__ <= i__2; ++i__) {
	    if (work[i__] > safe2) {
		work[i__] = (r__1 = work[*n + i__], dabs(r__1)) + nz * eps * 
			work[i__];
	    } else {
		work[i__] = (r__1 = work[*n + i__], dabs(r__1)) + nz * eps * 
			work[i__] + safe1;
	    }
/* L200: */
	}

	kase = 0;
L210:
	slacon_(n, &work[(*n << 1) + 1], &work[*n + 1], &iwork[1], &ferr[j], &
		kase);
	if (kase != 0) {
	    if (kase == 1) {

/*              Multiply by diag(W)*inv(op(A)'). */

		strsv_(uplo, transt, diag, n, &a[a_offset], lda, &work[*n + 1]
			, &c__1);
		i__2 = *n;
		for (i__ = 1; i__ <= i__2; ++i__) {
		    work[*n + i__] = work[i__] * work[*n + i__];
/* L220: */
		}
	    } else {

/*              Multiply by inv(op(A))*diag(W). */

		i__2 = *n;
		for (i__ = 1; i__ <= i__2; ++i__) {
		    work[*n + i__] = work[i__] * work[*n + i__];
/* L230: */
		}
		strsv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1],
			 &c__1);
	    }
	    goto L210;
	}

/*        Normalize error. */

	lstres = 0.f;
	i__2 = *n;
	for (i__ = 1; i__ <= i__2; ++i__) {
/* Computing MAX */
	    r__2 = lstres, r__3 = (r__1 = x_ref(i__, j), dabs(r__1));
	    lstres = dmax(r__2,r__3);
/* L240: */
	}
	if (lstres != 0.f) {
	    ferr[j] /= lstres;
	}

/* L250: */
    }

    return 0;

/*     End of STRRFS */

} /* strrfs_ */
Exemple #17
0
/* Return value:   0 - successful return
 *               > 0 - number of bytes allocated when run out of space
 */
int
scolumn_bmod (
	     const int  jcol,	  /* in */
	     const int  nseg,	  /* in */
	     double     *dense,	  /* in */
	     double     *tempv,	  /* working array */
	     int        *segrep,  /* in */
	     int        *repfnz,  /* in */
	     int        fpanelc,  /* in -- first column in the current panel */
	     GlobalLU_t *Glu,     /* modified */
	     SuperLUStat_t *stat  /* output */
	     )
{
/*
 * Purpose:
 * ========
 *    Performs numeric block updates (sup-col) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Special processing on the supernodal portion of L\U[*,j]
 *
 */
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif

#ifdef USE_VENDOR_BLAS
    int         incx = 1, incy = 1;
    double      alpha, beta;
#endif
    
    /* krep = representative of current k-th supernode
     * fsupc = first supernodal column
     * nsupc = no of columns in supernode
     * nsupr = no of rows in supernode (used as leading dimension)
     * luptr = location of supernodal LU-block in storage
     * kfnz = first nonz in the k-th supernodal segment
     * no_zeros = no of leading zeros in a supernodal U-segment
     */
    double       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          fsupc, nsupc, nsupr, segsze;
    int          nrow;	  /* No of rows in the matrix of matrix-vector */
    int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
    register int lptr, kfnz, isub, irow, i;
    register int no_zeros, new_next; 
    int          ufirst, nextlu;
    int          fst_col; /* First column within small LU update */
    int          d_fsupc; /* Distance between the first column of the current
			     panel and the first column of the current snode. */
    int          *xsup, *supno;
    int          *lsub, *xlsub;
    double       *lusup;
    int          *xlusup;
    int          nzlumax;
    double       *tempv1;
    double      zero = 0.0;
#ifdef USE_VENDOR_BLAS
    double      one = 1.0;
    double      none = -1.0;
#endif
    int          mem_error;
    flops_t      *ops = stat->ops;

    xsup    = Glu->xsup;
    supno   = Glu->supno;
    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    lusup   = Glu->lusup;
    xlusup  = Glu->xlusup;
    nzlumax = Glu->nzlumax;
    jcolp1 = jcol + 1;
    jsupno = supno[jcol];
    
    /* 
     * For each nonz supernode segment of U[*,j] in topological order 
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) {

	krep = segrep[k];
	k--;
	ksupno = supno[krep];
	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */

	    fsupc = xsup[ksupno];
	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

  	    /* Distance from the current supernode to the current panel; 
	       d_fsupc=0 if fsupc > fpanelc. */
  	    d_fsupc = fst_col - fsupc; 

	    luptr = xlusup[fst_col] + d_fsupc;
	    lptr = xlsub[fsupc] + d_fsupc;

	    kfnz = repfnz[krep];
	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );

	    segsze = krep - kfnz + 1;
	    nsupc = krep - fst_col + 1;
	    nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
	    nrow = nsupr - d_fsupc - nsupc;
	    krep_ind = lptr + nsupc - 1;

	    ops[TRSV] += segsze * (segsze - 1);
	    ops[GEMV] += 2 * nrow * segsze;


	    /* 
	     * Case 1: Update U-segment of size 1 -- col-col update 
	     */
	    if ( segsze == 1 ) {
	  	ukj = dense[lsub[krep_ind]];
		luptr += nsupr*(nsupc-1) + nsupc;

		for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
		    irow = lsub[i];
		    dense[irow] -=  ukj*lusup[luptr];
		    luptr++;
		}

	    } else if ( segsze <= 3 ) {
		ukj = dense[lsub[krep_ind]];
		luptr += nsupr*(nsupc-1) + nsupc-1;
		ukj1 = dense[lsub[krep_ind - 1]];
		luptr1 = luptr - nsupr;

		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
		    ukj -= ukj1 * lusup[luptr1];
		    dense[lsub[krep_ind]] = ukj;
		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
		    	irow = lsub[i];
		    	luptr++;
		    	luptr1++;
		    	dense[irow] -= ( ukj*lusup[luptr]
					+ ukj1*lusup[luptr1] );
		    }
		} else { /* Case 3: 3cols-col update */
		    ukj2 = dense[lsub[krep_ind - 2]];
		    luptr2 = luptr1 - nsupr;
		    ukj1 -= ukj2 * lusup[luptr2-1];
		    ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
		    dense[lsub[krep_ind]] = ukj;
		    dense[lsub[krep_ind-1]] = ukj1;
		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
		    	irow = lsub[i];
		    	luptr++;
		    	luptr1++;
			luptr2++;
		    	dense[irow] -= ( ukj*lusup[luptr]
			     + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
		    }
		}



	    } else {
	  	/*
		 * Case: sup-col update
		 * Perform a triangular solve and block update,
		 * then scatter the result of sup-col update to dense
		 */

		no_zeros = kfnz - fst_col;

	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
	        isub = lptr + no_zeros;
	        for (i = 0; i < segsze; i++) {
	  	    irow = lsub[isub];
		    tempv[i] = dense[irow];
		    ++isub; 
	        }

	        /* Dense triangular solve -- start effective triangle */
		luptr += nsupr * no_zeros + no_zeros; 
		
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
		STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#else		
		strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#endif		
 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
                alpha = one;
                beta = zero;
#ifdef _CRAY
		SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
		sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
		slsolve ( nsupr, segsze, &lusup[luptr], tempv );

 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
#endif
		
		
                /* Scatter tempv[] into SPA dense[] as a temporary storage */
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense[irow] = tempv[i];
                    tempv[i] = zero;
                    ++isub;
                }

		/* Scatter tempv1[] into SPA dense[] */
		for (i = 0; i < nrow; i++) {
		    irow = lsub[isub];
		    dense[irow] -= tempv1[i];
		    tempv1[i] = zero;
		    ++isub;
		}
	    }
	    
	} /* if jsupno ... */

    } /* for each segment... */

    /*
     *	Process the supernodal portion of L\U[*,j]
     */
    nextlu = xlusup[jcol];
    fsupc = xsup[jsupno];

    /* Copy the SPA dense into L\U[*,j] */
    new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
    while ( new_next > nzlumax ) {
	if ((mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)))
	    return (mem_error);
	lusup = Glu->lusup;
	lsub = Glu->lsub;
    }

    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
        dense[irow] = zero;
	++nextlu;
    }

    xlusup[jcolp1] = nextlu;	/* Close L\U[*,jcol] */

    /* For more updates within the panel (also within the current supernode), 
     * should start from the first column of the panel, or the first column 
     * of the supernode, whichever is bigger. There are 2 cases:
     *    1) fsupc < fpanelc, then fst_col := fpanelc
     *    2) fsupc >= fpanelc, then fst_col := fsupc
     */
    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

    if ( fst_col < jcol ) {

  	/* Distance between the current supernode and the current panel.
	   d_fsupc=0 if fsupc >= fpanelc. */
  	d_fsupc = fst_col - fsupc;

	luptr = xlusup[fst_col] + d_fsupc;
	nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
	nsupc = jcol - fst_col;	/* Excluding jcol */
	nrow = nsupr - d_fsupc - nsupc;

	/* Points to the beginning of jcol in snode L\U(jsupno) */
	ufirst = xlusup[jcol] + d_fsupc;	

	ops[TRSV] += nsupc * (nsupc - 1);
	ops[GEMV] += 2 * nrow * nsupc;
	
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
#else
	strsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
#endif
	
	alpha = none; beta = one; /* y := beta*y + alpha*A*x */

#ifdef _CRAY
	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
	sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );

	smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
		&lusup[ufirst], tempv );
	
        /* Copy updates from tempv[*] into lusup[*] */
	isub = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
	    lusup[isub] -= tempv[i];
	    tempv[i] = 0.0;
	    ++isub;
	}

#endif
	
	
    } /* if fst_col < jcol ... */ 

    return 0;
}
Exemple #18
0
int
sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L, 
         SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info)
{
/*
 *   Purpose
 *   =======
 *
 *   sp_strsv() solves one of the systems of equations   
 *       A*x = b,   or   A'*x = b,
 *   where b and x are n element vectors and A is a sparse unit , or   
 *   non-unit, upper or lower triangular matrix.   
 *   No test for singularity or near-singularity is included in this   
 *   routine. Such tests must be performed before calling this routine.   
 *
 *   Parameters   
 *   ==========   
 *
 *   uplo   - (input) char*
 *            On entry, uplo specifies whether the matrix is an upper or   
 *             lower triangular matrix as follows:   
 *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
 *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
 *
 *   trans  - (input) char*
 *             On entry, trans specifies the equations to be solved as   
 *             follows:   
 *                trans = 'N' or 'n'   A*x = b.   
 *                trans = 'T' or 't'   A'*x = b.
 *                trans = 'C' or 'c'   A'*x = b.   
 *
 *   diag   - (input) char*
 *             On entry, diag specifies whether or not A is unit   
 *             triangular as follows:   
 *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
 *                diag = 'N' or 'n'   A is not assumed to be unit   
 *                                    triangular.   
 *	     
 *   L       - (input) SuperMatrix*
 *	       The factor L from the factorization Pr*A*Pc=L*U. Use
 *             compressed row subscripts storage for supernodes,
 *             i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU.
 *
 *   U       - (input) SuperMatrix*
 *	        The factor U from the factorization Pr*A*Pc=L*U.
 *	        U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU.
 *    
 *   x       - (input/output) float*
 *             Before entry, the incremented array X must contain the n   
 *             element right-hand side vector b. On exit, X is overwritten 
 *             with the solution vector x.
 *
 *   info    - (output) int*
 *             If *info = -i, the i-th argument had an illegal value.
 *
 */
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
	 ftcs2 = _cptofcd("N", strlen("N")),
	 ftcs3 = _cptofcd("U", strlen("U"));
#endif
    SCformat *Lstore;
    NCformat *Ustore;
    float   *Lval, *Uval;
    int incx = 1;
    int nrow;
    int fsupc, nsupr, nsupc, luptr, istart, irow;
    int i, k, iptr, jcol;
    float *work;
    flops_t solve_ops;

    /* Test the input parameters */
    *info = 0;
    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && 
              !lsame_(trans, "C")) *info = -2;
    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
    if ( *info ) {
	i = -(*info);
	xerbla_("sp_strsv", &i);
	return 0;
    }

    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( !(work = floatCalloc(L->nrow)) )
	ABORT("Malloc fails for work in sp_strsv().");
    
    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
	
	if ( lsame_(uplo, "L") ) {
	    /* Form x := inv(L)*x */
    	    if ( L->nrow == 0 ) {
			SUPERLU_FREE(work);
			return 0; /* Quick return */
		}
	    
	    for (k = 0; k <= Lstore->nsuper; k++) {
		fsupc = L_FST_SUPC(k);
		istart = L_SUB_START(fsupc);
		nsupr = L_SUB_START(fsupc+1) - istart;
		nsupc = L_FST_SUPC(k+1) - fsupc;
		luptr = L_NZ_START(fsupc);
		nrow = nsupr - nsupc;

	        solve_ops += nsupc * (nsupc - 1);
	        solve_ops += 2 * nrow * nsupc;

		if ( nsupc == 1 ) {
		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
			irow = L_SUB(iptr);
			++luptr;
			x[irow] -= x[fsupc] * Lval[luptr];
		    }
		} else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
		       	&x[fsupc], &incx);
		
		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#else
		    strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
		       	&x[fsupc], &incx);
		
		    sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#endif
#else
		    slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
		
		    smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
                             &x[fsupc], &work[0] );
#endif		
		
		    iptr = istart + nsupc;
		    for (i = 0; i < nrow; ++i, ++iptr) {
			irow = L_SUB(iptr);
			x[irow] -= work[i];	/* Scatter */
			work[i] = 0.0;

		    }
	 	}
	    } /* for k ... */
	    
	} else {
	    /* Form x := inv(U)*x */
	    
	    if ( U->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = Lstore->nsuper; k >= 0; k--) {
	    	fsupc = L_FST_SUPC(k);
	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
	    	nsupc = L_FST_SUPC(k+1) - fsupc;
	    	luptr = L_NZ_START(fsupc);
		
    	        solve_ops += nsupc * (nsupc + 1);

		if ( nsupc == 1 ) {
		    x[fsupc] /= Lval[luptr];
		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
			irow = U_SUB(i);
			x[irow] -= x[fsupc] * Uval[i];
		    }
		} else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
		    STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
		       &x[fsupc], &incx);
#else
		    strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
#else		
		    susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
#endif		

		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
		        solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
				i++) {
			    irow = U_SUB(i);
			    x[irow] -= x[jcol] * Uval[i];
		    	}
                    }
		}
	    } /* for k ... */
	    
	}
    } else { /* Form x := inv(A')*x */
	
	if ( lsame_(uplo, "L") ) {
	    /* Form x := inv(L')*x */
    	    if ( L->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = Lstore->nsuper; k >= 0; --k) {
	    	fsupc = L_FST_SUPC(k);
	    	istart = L_SUB_START(fsupc);
	    	nsupr = L_SUB_START(fsupc+1) - istart;
	    	nsupc = L_FST_SUPC(k+1) - fsupc;
	    	luptr = L_NZ_START(fsupc);

		solve_ops += 2 * (nsupr - nsupc) * nsupc;

		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
		    iptr = istart + nsupc;
		    for (i = L_NZ_START(jcol) + nsupc; 
				i < L_NZ_START(jcol+1); i++) {
			irow = L_SUB(iptr);
			x[jcol] -= x[irow] * Lval[i];
			iptr++;
		    }
		}
		
		if ( nsupc > 1 ) {
		    solve_ops += nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
			&x[fsupc], &incx);
#else
		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
			&x[fsupc], &incx);
#endif
		}
	    }
	} else {
	    /* Form x := inv(U')*x */
	    if ( U->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = 0; k <= Lstore->nsuper; k++) {
	    	fsupc = L_FST_SUPC(k);
	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
	    	nsupc = L_FST_SUPC(k+1) - fsupc;
	    	luptr = L_NZ_START(fsupc);

		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
		    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
			irow = U_SUB(i);
			x[jcol] -= x[irow] * Uval[i];
		    }
		}

		solve_ops += nsupc * (nsupc + 1);

		if ( nsupc == 1 ) {
		    x[fsupc] /= Lval[luptr];
		} else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
			    &x[fsupc], &incx);
#else
		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
			    &x[fsupc], &incx);
#endif
		}
	    } /* for k ... */
	}
    }

    stat->ops[SOLVE] += solve_ops;
    SUPERLU_FREE(work);
    return 0;
}
Exemple #19
0
/* Subroutine */ int sgglse_(integer *m, integer *n, integer *p, real *a, 
	integer *lda, real *b, integer *ldb, real *c__, real *d__, real *x, 
	real *work, integer *lwork, integer *info)
{
/*  -- LAPACK driver routine (version 3.0) --   
       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
       Courant Institute, Argonne National Lab, and Rice University   
       June 30, 1999   


    Purpose   
    =======   

    SGGLSE solves the linear equality-constrained least squares (LSE)   
    problem:   

            minimize || c - A*x ||_2   subject to   B*x = d   

    where A is an M-by-N matrix, B is a P-by-N matrix, c is a given   
    M-vector, and d is a given P-vector. It is assumed that   
    P <= N <= M+P, and   

             rank(B) = P and  rank( ( A ) ) = N.   
                                  ( ( B ) )   

    These conditions ensure that the LSE problem has a unique solution,   
    which is obtained using a GRQ factorization of the matrices B and A.   

    Arguments   
    =========   

    M       (input) INTEGER   
            The number of rows of the matrix A.  M >= 0.   

    N       (input) INTEGER   
            The number of columns of the matrices A and B. N >= 0.   

    P       (input) INTEGER   
            The number of rows of the matrix B. 0 <= P <= N <= M+P.   

    A       (input/output) REAL array, dimension (LDA,N)   
            On entry, the M-by-N matrix A.   
            On exit, A is destroyed.   

    LDA     (input) INTEGER   
            The leading dimension of the array A. LDA >= max(1,M).   

    B       (input/output) REAL array, dimension (LDB,N)   
            On entry, the P-by-N matrix B.   
            On exit, B is destroyed.   

    LDB     (input) INTEGER   
            The leading dimension of the array B. LDB >= max(1,P).   

    C       (input/output) REAL array, dimension (M)   
            On entry, C contains the right hand side vector for the   
            least squares part of the LSE problem.   
            On exit, the residual sum of squares for the solution   
            is given by the sum of squares of elements N-P+1 to M of   
            vector C.   

    D       (input/output) REAL array, dimension (P)   
            On entry, D contains the right hand side vector for the   
            constrained equation.   
            On exit, D is destroyed.   

    X       (output) REAL array, dimension (N)   
            On exit, X is the solution of the LSE problem.   

    WORK    (workspace/output) REAL array, dimension (LWORK)   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK. LWORK >= max(1,M+N+P).   
            For optimum performance LWORK >= P+min(M,N)+max(M,N)*NB,   
            where NB is an upper bound for the optimal blocksizes for   
            SGEQRF, SGERQF, SORMQR and SORMRQ.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    INFO    (output) INTEGER   
            = 0:  successful exit.   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   

    =====================================================================   


       Test the input parameters   

       Parameter adjustments */
    /* Table of constant values */
    static integer c__1 = 1;
    static integer c_n1 = -1;
    static real c_b29 = -1.f;
    static real c_b31 = 1.f;
    
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
    /* Local variables */
    static integer lopt;
    extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, 
	    real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), 
	    saxpy_(integer *, real *, real *, integer *, real *, integer *), 
	    strmv_(char *, char *, char *, integer *, real *, integer *, real 
	    *, integer *), strsv_(char *, char *, 
	    char *, integer *, real *, integer *, real *, integer *);
    static integer nb, mn, nr;
    extern /* Subroutine */ int xerbla_(char *, integer *);
    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
	    integer *, integer *, ftnlen, ftnlen);
    extern /* Subroutine */ int sggrqf_(integer *, integer *, integer *, real 
	    *, integer *, real *, real *, integer *, real *, real *, integer *
	    , integer *);
    static integer nb1, nb2, nb3, nb4, lwkopt;
    static logical lquery;
    extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, 
	    integer *, real *, integer *, real *, real *, integer *, real *, 
	    integer *, integer *), sormrq_(char *, char *, 
	    integer *, integer *, integer *, real *, integer *, real *, real *
	    , integer *, real *, integer *, integer *);
#define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1]
#define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1]


    a_dim1 = *lda;
    a_offset = 1 + a_dim1 * 1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1 * 1;
    b -= b_offset;
    --c__;
    --d__;
    --x;
    --work;

    /* Function Body */
    *info = 0;
    mn = min(*m,*n);
    nb1 = ilaenv_(&c__1, "SGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (
	    ftnlen)1);
    nb2 = ilaenv_(&c__1, "SGERQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (
	    ftnlen)1);
    nb3 = ilaenv_(&c__1, "SORMQR", " ", m, n, p, &c_n1, (ftnlen)6, (ftnlen)1);
    nb4 = ilaenv_(&c__1, "SORMRQ", " ", m, n, p, &c_n1, (ftnlen)6, (ftnlen)1);
/* Computing MAX */
    i__1 = max(nb1,nb2), i__1 = max(i__1,nb3);
    nb = max(i__1,nb4);
    lwkopt = *p + mn + max(*m,*n) * nb;
    work[1] = (real) lwkopt;
    lquery = *lwork == -1;
    if (*m < 0) {
	*info = -1;
    } else if (*n < 0) {
	*info = -2;
    } else if (*p < 0 || *p > *n || *p < *n - *m) {
	*info = -3;
    } else if (*lda < max(1,*m)) {
	*info = -5;
    } else if (*ldb < max(1,*p)) {
	*info = -7;
    } else /* if(complicated condition) */ {
/* Computing MAX */
	i__1 = 1, i__2 = *m + *n + *p;
	if (*lwork < max(i__1,i__2) && ! lquery) {
	    *info = -12;
	}
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("SGGLSE", &i__1);
	return 0;
    } else if (lquery) {
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }

/*     Compute the GRQ factorization of matrices B and A:   

              B*Q' = (  0  T12 ) P   Z'*A*Q' = ( R11 R12 ) N-P   
                       N-P  P                  (  0  R22 ) M+P-N   
                                                 N-P  P   

       where T12 and R11 are upper triangular, and Q and Z are   
       orthogonal. */

    i__1 = *lwork - *p - mn;
    sggrqf_(p, m, n, &b[b_offset], ldb, &work[1], &a[a_offset], lda, &work[*p 
	    + 1], &work[*p + mn + 1], &i__1, info);
    lopt = work[*p + mn + 1];

/*     Update c = Z'*c = ( c1 ) N-P   
                         ( c2 ) M+P-N */

    i__1 = max(1,*m);
    i__2 = *lwork - *p - mn;
    sormqr_("Left", "Transpose", m, &c__1, &mn, &a[a_offset], lda, &work[*p + 
	    1], &c__[1], &i__1, &work[*p + mn + 1], &i__2, info);
/* Computing MAX */
    i__1 = lopt, i__2 = (integer) work[*p + mn + 1];
    lopt = max(i__1,i__2);

/*     Solve T12*x2 = d for x2 */

    strsv_("Upper", "No transpose", "Non unit", p, &b_ref(1, *n - *p + 1), 
	    ldb, &d__[1], &c__1);

/*     Update c1 */

    i__1 = *n - *p;
    sgemv_("No transpose", &i__1, p, &c_b29, &a_ref(1, *n - *p + 1), lda, &
	    d__[1], &c__1, &c_b31, &c__[1], &c__1);

/*     Sovle R11*x1 = c1 for x1 */

    i__1 = *n - *p;
    strsv_("Upper", "No transpose", "Non unit", &i__1, &a[a_offset], lda, &
	    c__[1], &c__1);

/*     Put the solutions in X */

    i__1 = *n - *p;
    scopy_(&i__1, &c__[1], &c__1, &x[1], &c__1);
    scopy_(p, &d__[1], &c__1, &x[*n - *p + 1], &c__1);

/*     Compute the residual vector: */

    if (*m < *n) {
	nr = *m + *p - *n;
	i__1 = *n - *m;
	sgemv_("No transpose", &nr, &i__1, &c_b29, &a_ref(*n - *p + 1, *m + 1)
		, lda, &d__[nr + 1], &c__1, &c_b31, &c__[*n - *p + 1], &c__1);
    } else {
	nr = *p;
    }
    strmv_("Upper", "No transpose", "Non unit", &nr, &a_ref(*n - *p + 1, *n - 
	    *p + 1), lda, &d__[1], &c__1);
    saxpy_(&nr, &c_b29, &d__[1], &c__1, &c__[*n - *p + 1], &c__1);

/*     Backward transformation x = Q'*x */

    i__1 = *lwork - *p - mn;
    sormrq_("Left", "Transpose", n, &c__1, p, &b[b_offset], ldb, &work[1], &x[
	    1], n, &work[*p + mn + 1], &i__1, info);
/* Computing MAX */
    i__1 = lopt, i__2 = (integer) work[*p + mn + 1];
    work[1] = (real) (*p + mn + max(i__1,i__2));

    return 0;

/*     End of SGGLSE */

} /* sgglse_ */
int
psgstrf_column_bmod(
		    const int  pnum,   /* process number */
		    const int  jcol,   /* current column in the panel */
		    const int  fpanelc,/* first column in the panel */
		    const int  nseg,   /* number of s-nodes to update jcol */
		    int        *segrep,/* in */
		    int        *repfnz,/* in */
		    float     *dense, /* modified */
		    float     *tempv, /* working array */
		    pxgstrf_shared_t *pxgstrf_shared, /* modified */
		    Gstat_t *Gstat     /* modified */
		    )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose:
 * ========
 *    Performs numeric block updates (sup-col) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Special processing on the supernodal portion of L\U[*,j].
 *
 * Return value:
 * =============
 *      0 - successful return
 *    > 0 - number of bytes allocated when run out of space
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    
#ifdef USE_VENDOR_BLAS    
    int         incx = 1, incy = 1;
    float      alpha, beta;
#endif
    GlobalLU_t *Glu = pxgstrf_shared->Glu;   /* modified */
    
    /* krep = representative of current k-th supernode
     * fsupc = first supernodal column
     * nsupc = no of columns in supernode
     * nsupr = no of rows in supernode (used as leading dimension)
     * luptr = location of supernodal LU-block in storage
     * kfnz = first nonz in the k-th supernodal segment
     * no_zeros = no of leading zeros in a supernodal U-segment
     */
    float	  ukj, ukj1, ukj2;
    register int lptr, kfnz, isub, irow, i, no_zeros;
    register int luptr, luptr1, luptr2;
    int          fsupc, nsupc, nsupr, segsze;
    int          nrow;	  /* No of rows in the matrix of matrix-vector */
    int          jsupno, k, ksub, krep, krep_ind, ksupno;
    int          ufirst, nextlu;
    int          fst_col; /* First column within small LU update */
    int          d_fsupc; /* Distance between the first column of the current
			     panel and the first column of the current snode.*/
    int          *xsup, *supno;
    int          *lsub, *xlsub, *xlsub_end;
    float       *lusup;
    int          *xlusup, *xlusup_end;
    float       *tempv1;
    int          mem_error;
    register float flopcnt;

    float      zero = 0.0;
    float      one = 1.0;
    float      none = -1.0;

    xsup       = Glu->xsup;
    supno      = Glu->supno;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;
    lusup      = Glu->lusup;
    xlusup     = Glu->xlusup;
    xlusup_end = Glu->xlusup_end;
    jsupno     = supno[jcol];

    /* 
     * For each nonz supernode segment of U[*,j] in topological order 
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) {

	krep = segrep[k];
	k--;
	ksupno = supno[krep];
#if ( DEBUGlvel>=2 )
if (jcol==BADCOL)
printf("(%d) psgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n",
       pnum, jcol, nseg, krep, jsupno, ksupno);
#endif    
	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */

	    fsupc = xsup[ksupno];
	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

  	    /* Distance from the current supernode to the current panel; 
	       d_fsupc=0 if fsupc >= fpanelc. */
  	    d_fsupc = fst_col - fsupc; 

	    luptr = xlusup[fst_col] + d_fsupc;
	    lptr = xlsub[fsupc] + d_fsupc;
	    kfnz = repfnz[krep];
	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
	    segsze = krep - kfnz + 1;
	    nsupc = krep - fst_col + 1;
	    nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
	    nrow = nsupr - d_fsupc - nsupc;
	    krep_ind = lptr + nsupc - 1;

            flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;
	    Gstat->procstat[pnum].fcops += flopcnt;

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)	    
printf("(%d) psgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\
fsupc %d, nsupr %d, nsupc %d\n",
       pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc);

#endif


	    /* 
	     * Case 1: Update U-segment of size 1 -- col-col update 
	     */
	    if ( segsze == 1 ) {
	  	ukj = dense[lsub[krep_ind]];
		luptr += nsupr*(nsupc-1) + nsupc;

		for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    irow = lsub[i];
		    dense[irow] -=  ukj*lusup[luptr];
		    luptr++;
		}
	    } else if ( segsze <= 3 ) {
		ukj = dense[lsub[krep_ind]];
		luptr += nsupr*(nsupc-1) + nsupc-1;
		ukj1 = dense[lsub[krep_ind - 1]];
		luptr1 = luptr - nsupr;
		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
		    ukj -= ukj1 * lusup[luptr1];
		    dense[lsub[krep_ind]] = ukj;
		    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    	irow = lsub[i];
		    	luptr++;
		    	luptr1++;
		    	dense[irow] -= ( ukj*lusup[luptr]
					+ ukj1*lusup[luptr1] );
		    }
		} else { /* Case 3: 3cols-col update */
		    ukj2 = dense[lsub[krep_ind - 2]];
		    luptr2 = luptr1 - nsupr;
		    ukj1 -= ukj2 * lusup[luptr2-1];
		    ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
		    dense[lsub[krep_ind]] = ukj;
		    dense[lsub[krep_ind-1]] = ukj1;
		    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    	irow = lsub[i];
		    	luptr++;
		    	luptr1++;
			luptr2++;
		    	dense[irow] -= ( ukj*lusup[luptr]
			     + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
		    }
		}


	    } else {
	  	/*
		 * Case: sup-col update
		 * Perform a triangular solve and block update,
		 * then scatter the result of sup-col update to dense
		 */
		no_zeros = kfnz - fst_col;

	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
	        isub = lptr + no_zeros;
	        for (i = 0; i < segsze; i++) {
	  	    irow = lsub[isub];
		    tempv[i] = dense[irow];
		    ++isub; 
	        }

	        /* Dense triangular solve -- start effective triangle */
		luptr += nsupr * no_zeros + no_zeros; 
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
		STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#else
		strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#endif
		
 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		alpha = one;
		beta = zero;
#if ( MACH==CRAY_PVP )
		SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
		sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
		slsolve ( nsupr, segsze, &lusup[luptr], tempv );

 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
#endif
                /* Scatter tempv[] into SPA dense[*] */
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense[irow] = tempv[i]; /* Scatter */
                    tempv[i] = zero;
                    isub++;
                }

		/* Scatter tempv1[] into SPA dense[*] */
		for (i = 0; i < nrow; i++) {
		    irow = lsub[isub];
                    dense[irow] -= tempv1[i];
		    tempv1[i] = zero;
		    ++isub;
		}
	    } /* else segsze >= 4 */
	    
	} /* if jsupno ... */

    } /* for each segment... */

    
    /* ------------------------------------------
       Process the supernodal portion of L\U[*,j]
       ------------------------------------------ */
    
    fsupc = SUPER_FSUPC (jsupno);
    nsupr = xlsub_end[fsupc] - xlsub[fsupc];
    if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, 
			       pxgstrf_shared)) )
	return mem_error;
    xlusup[jcol] = nextlu;
    lusup = Glu->lusup;
    
    /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */
    for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
	dense[irow] = zero;
#ifdef DEBUG
if (jcol == -1)
    printf("(%d) psgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n",
	   pnum, jcol, irow, lusup[nextlu]);
#endif	
	++nextlu;
    }
    xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */

#if ( DEBUGlevel>=2 )
if (jcol == -1) {
    nrow = xlusup_end[jcol] - xlusup[jcol];
    print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]],
		     &lusup[xlusup[jcol]]);
}
#endif    
    
    /*
     * For more updates within the panel (also within the current supernode), 
     * should start from the first column of the panel, or the first column 
     * of the supernode, whichever is bigger. There are 2 cases:
     *    (1) fsupc < fpanelc,  then fst_col := fpanelc
     *    (2) fsupc >= fpanelc, then fst_col := fsupc
     */
    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

    if ( fst_col < jcol ) {

  	/* distance between the current supernode and the current panel;
	   d_fsupc=0 if fsupc >= fpanelc. */
  	d_fsupc = fst_col - fsupc;

	lptr = xlsub[fsupc] + d_fsupc;
	luptr = xlusup[fst_col] + d_fsupc;
	nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
	nsupc = jcol - fst_col;	/* Excluding jcol */
	nrow = nsupr - d_fsupc - nsupc;

	/* points to the beginning of jcol in supernode L\U[*,jsupno] */
	ufirst = xlusup[jcol] + d_fsupc;	

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)
printf("(%d) psgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n",
       pnum, jcol, fsupc, nsupr, nsupc, nrow);
#endif    

        flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc;
	Gstat->procstat[pnum].fcops += flopcnt;

/*	ops[TRSV] += nsupc * (nsupc - 1);
	ops[GEMV] += 2 * nrow * nsupc;    */
	
#ifdef USE_VENDOR_BLAS
	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
#if ( MACH==CRAY_PVP )
	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
	strsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
	sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );

	smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
		 &lusup[ufirst], tempv );
	
        /* Copy updates from tempv[*] into lusup[*] */
	isub = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
            lusup[isub] -= tempv[i];
            tempv[i] = 0.0;
	    ++isub;
	}
#endif
    } /* if fst_col < jcol ... */ 

    return 0;
}