int f2c_strsv(char* uplo, char* trans, char* diag, integer* N, real* A, integer* lda, real* X, integer* incX) { strsv_(uplo, trans, diag, N, A, lda, X, incX); return 0; }
void psgstrf_bmod1D( const int pnum, /* process number */ const int m, /* number of rows in the matrix */ const int w, /* current panel width */ const int jcol, /* leading column of the current panel */ const int fsupc, /* leading column of the updating supernode */ const int krep, /* last column of the updating supernode */ const int nsupc, /* number of columns in the updating s-node */ int nsupr, /* number of rows in the updating supernode */ int nrow, /* number of rows below the diagonal block of the updating supernode */ int *repfnz, /* in */ int *panel_lsub, /* modified */ int *w_lsub_end, /* modified */ int *spa_marker, /* modified; size n-by-w */ float *dense, /* modified */ float *tempv, /* working array - zeros on entry/exit */ GlobalLU_t *Glu, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * Performs numeric block updates (sup-panel) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Results are returned in SPA dense[*,w]. * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; float alpha, beta; #endif float ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; register int lptr; /* start of row subscripts of the updating supernode */ register int i, krep_ind, kfnz, isub, irow, no_zeros; register int jj; /* index through each column in the panel */ int *repfnz_col; /* repfnz[] for a column in the panel */ float *dense_col; /* dense[] for a column in the panel */ float *tempv1; /* used to store matrix-vector result */ int *col_marker; /* each column of the spa_marker[*,w] */ int *col_lsub; /* each column of the panel_lsub[*,w] */ int *lsub, *xlsub_end; float *lusup; int *xlusup; register float flopcnt; float zero = 0.0; float one = 1.0; #ifdef TIMING double *utime = Gstat->utime; double f_time; #endif lsub = Glu->lsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; lptr = Glu->xlsub[fsupc]; krep_ind = lptr + nsupc - 1; /* Pointers to each column of the w-wide arrays. */ repfnz_col= repfnz; dense_col = dense; col_marker= spa_marker; col_lsub = panel_lsub; #if ( DEBUGlevel>=2 ) if (jcol == BADPAN && krep == BADREP) { printf("(%d) psgstrf_bmod1D[1] jcol %d, fsupc %d, krep %d, nsupc %d, nsupr %d, nrow %d\n", pnum, jcol, fsupc, krep, nsupc, nsupr, nrow); PrintInt10("lsub[xlsub[2774]]", nsupr, &lsub[lptr]); } #endif /* * Sequence through each column in the panel ... */ for (jj = jcol; jj < jcol + w; ++jj, col_marker += m, col_lsub += m, repfnz_col += m, dense_col += m) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; /* Calculate flops: tri-solve + mat-vector */ flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze; Gstat->procstat[pnum].fcops += flopcnt; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { #ifdef TIMING f_time = SuperLU_timer_(); #endif ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; #if ( DEBUGlevel>=2 ) if (krep == BADCOL && jj == -1) { printf("(%d) psgstrf_bmod1D[segsze=1]: k %d, j %d, ukj %.10e\n", pnum, lsub[krep_ind], jj, ukj); PrintInt10("segsze=1", nsupr, &lsub[lptr]); } #endif for (i = lptr + nsupc; i < xlsub_end[fsupc]; i++) { irow = lsub[i]; dense_col[irow] -= ukj * lusup[luptr]; ++luptr; #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else if ( segsze <= 3 ) { #ifdef TIMING f_time = SuperLU_timer_(); #endif ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { ukj -= ukj1 * lusup[luptr1]; dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; dense_col[irow] -= (ukj * lusup[luptr] + ukj1 * lusup[luptr1]); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; dense_col[irow] -= (ukj * lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2]); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else { /* segsze >= 4 */ /* * Perform a triangular solve and matrix-vector update, * then scatter the result of sup-col update to dense[*]. */ no_zeros = kfnz - fsupc; /* Gather U[*,j] segment from dense[*] to tempv[*]: * The result of triangular solve is in tempv[*]; * The result of matrix vector update is in dense_col[*] */ isub = lptr + no_zeros; /*#pragma ivdep*/ for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tempv[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef TIMING f_time = SuperLU_timer_(); #endif #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else strsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif /* _CRAY_PVP */ #else slsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); #endif #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif /* Scatter tempv[*] into SPA dense[*] temporarily, * such that tempv[*] can be used for the triangular solve of * the next column of the panel. They will be copied into * ucol[*] after the whole panel has been finished. */ isub = lptr + no_zeros; /*#pragma ivdep*/ for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = tempv[i]; /* Scatter */ tempv[i] = zero; isub++; #if ( DEBUGlevel>=2 ) if (jj == -1 && krep == 3423) printf("(%d) psgstrf_bmod1D[scatter] jj %d, dense_col[%d] %e\n", pnum, jj, irow, dense_col[irow]); #endif } /* Scatter the update from tempv1[*] into SPA dense[*] */ /*#pragma ivdep*/ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense_col[irow] -= tempv1[i]; /* Scatter-add */ #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif tempv1[i] = zero; isub++; } } /* else segsze >= 4 ... */ } /* for jj ... */ }
void strsv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx) { strsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); }
/* Subroutine */ int ssygs2_(integer *itype, char *uplo, integer *n, real *a, integer *lda, real *b, integer *ldb, integer *info) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; real r__1; /* Local variables */ integer k; real ct, akk, bkk; extern /* Subroutine */ int ssyr2_(char *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); extern logical lsame_(char *, char *); extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); logical upper; extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *); /* -- LAPACK routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SSYGS2 reduces a real symmetric-definite generalized eigenproblem */ /* to standard form. */ /* If ITYPE = 1, the problem is A*x = lambda*B*x, */ /* and A is overwritten by inv(U')*A*inv(U) or inv(L)*A*inv(L') */ /* If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or */ /* B*A*x = lambda*x, and A is overwritten by U*A*U` or L'*A*L. */ /* B must have been previously factorized as U'*U or L*L' by SPOTRF. */ /* Arguments */ /* ========= */ /* ITYPE (input) INTEGER */ /* = 1: compute inv(U')*A*inv(U) or inv(L)*A*inv(L'); */ /* = 2 or 3: compute U*A*U' or L'*A*L. */ /* UPLO (input) CHARACTER*1 */ /* Specifies whether the upper or lower triangular part of the */ /* symmetric matrix A is stored, and how B has been factorized. */ /* = 'U': Upper triangular */ /* = 'L': Lower triangular */ /* N (input) INTEGER */ /* The order of the matrices A and B. N >= 0. */ /* A (input/output) REAL array, dimension (LDA,N) */ /* On entry, the symmetric matrix A. If UPLO = 'U', the leading */ /* n by n upper triangular part of A contains the upper */ /* triangular part of the matrix A, and the strictly lower */ /* triangular part of A is not referenced. If UPLO = 'L', the */ /* leading n by n lower triangular part of A contains the lower */ /* triangular part of the matrix A, and the strictly upper */ /* triangular part of A is not referenced. */ /* On exit, if INFO = 0, the transformed matrix, stored in the */ /* same format as A. */ /* LDA (input) INTEGER */ /* The leading dimension of the array A. LDA >= max(1,N). */ /* B (input) REAL array, dimension (LDB,N) */ /* The triangular factor from the Cholesky factorization of B, */ /* as returned by SPOTRF. */ /* LDB (input) INTEGER */ /* The leading dimension of the array B. LDB >= max(1,N). */ /* INFO (output) INTEGER */ /* = 0: successful exit. */ /* < 0: if INFO = -i, the i-th argument had an illegal value. */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1; b -= b_offset; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); if (*itype < 1 || *itype > 3) { *info = -1; } else if (! upper && ! lsame_(uplo, "L")) { *info = -2; } else if (*n < 0) { *info = -3; } else if (*lda < max(1,*n)) { *info = -5; } else if (*ldb < max(1,*n)) { *info = -7; } if (*info != 0) { i__1 = -(*info); xerbla_("SSYGS2", &i__1); return 0; } if (*itype == 1) { if (upper) { /* Compute inv(U')*A*inv(U) */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the upper triangle of A(k:n,k:n) */ akk = a[k + k * a_dim1]; bkk = b[k + k * b_dim1]; /* Computing 2nd power */ r__1 = bkk; akk /= r__1 * r__1; a[k + k * a_dim1] = akk; if (k < *n) { i__2 = *n - k; r__1 = 1.f / bkk; sscal_(&i__2, &r__1, &a[k + (k + 1) * a_dim1], lda); ct = akk * -.5f; i__2 = *n - k; saxpy_(&i__2, &ct, &b[k + (k + 1) * b_dim1], ldb, &a[k + ( k + 1) * a_dim1], lda); i__2 = *n - k; ssyr2_(uplo, &i__2, &c_b6, &a[k + (k + 1) * a_dim1], lda, &b[k + (k + 1) * b_dim1], ldb, &a[k + 1 + (k + 1) * a_dim1], lda); i__2 = *n - k; saxpy_(&i__2, &ct, &b[k + (k + 1) * b_dim1], ldb, &a[k + ( k + 1) * a_dim1], lda); i__2 = *n - k; strsv_(uplo, "Transpose", "Non-unit", &i__2, &b[k + 1 + ( k + 1) * b_dim1], ldb, &a[k + (k + 1) * a_dim1], lda); } /* L10: */ } } else { /* Compute inv(L)*A*inv(L') */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the lower triangle of A(k:n,k:n) */ akk = a[k + k * a_dim1]; bkk = b[k + k * b_dim1]; /* Computing 2nd power */ r__1 = bkk; akk /= r__1 * r__1; a[k + k * a_dim1] = akk; if (k < *n) { i__2 = *n - k; r__1 = 1.f / bkk; sscal_(&i__2, &r__1, &a[k + 1 + k * a_dim1], &c__1); ct = akk * -.5f; i__2 = *n - k; saxpy_(&i__2, &ct, &b[k + 1 + k * b_dim1], &c__1, &a[k + 1 + k * a_dim1], &c__1); i__2 = *n - k; ssyr2_(uplo, &i__2, &c_b6, &a[k + 1 + k * a_dim1], &c__1, &b[k + 1 + k * b_dim1], &c__1, &a[k + 1 + (k + 1) * a_dim1], lda); i__2 = *n - k; saxpy_(&i__2, &ct, &b[k + 1 + k * b_dim1], &c__1, &a[k + 1 + k * a_dim1], &c__1); i__2 = *n - k; strsv_(uplo, "No transpose", "Non-unit", &i__2, &b[k + 1 + (k + 1) * b_dim1], ldb, &a[k + 1 + k * a_dim1], &c__1); } /* L20: */ } } } else { if (upper) { /* Compute U*A*U' */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the upper triangle of A(1:k,1:k) */ akk = a[k + k * a_dim1]; bkk = b[k + k * b_dim1]; i__2 = k - 1; strmv_(uplo, "No transpose", "Non-unit", &i__2, &b[b_offset], ldb, &a[k * a_dim1 + 1], &c__1); ct = akk * .5f; i__2 = k - 1; saxpy_(&i__2, &ct, &b[k * b_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); i__2 = k - 1; ssyr2_(uplo, &i__2, &c_b27, &a[k * a_dim1 + 1], &c__1, &b[k * b_dim1 + 1], &c__1, &a[a_offset], lda); i__2 = k - 1; saxpy_(&i__2, &ct, &b[k * b_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); i__2 = k - 1; sscal_(&i__2, &bkk, &a[k * a_dim1 + 1], &c__1); /* Computing 2nd power */ r__1 = bkk; a[k + k * a_dim1] = akk * (r__1 * r__1); /* L30: */ } } else { /* Compute L'*A*L */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the lower triangle of A(1:k,1:k) */ akk = a[k + k * a_dim1]; bkk = b[k + k * b_dim1]; i__2 = k - 1; strmv_(uplo, "Transpose", "Non-unit", &i__2, &b[b_offset], ldb, &a[k + a_dim1], lda); ct = akk * .5f; i__2 = k - 1; saxpy_(&i__2, &ct, &b[k + b_dim1], ldb, &a[k + a_dim1], lda); i__2 = k - 1; ssyr2_(uplo, &i__2, &c_b27, &a[k + a_dim1], lda, &b[k + b_dim1], ldb, &a[a_offset], lda); i__2 = k - 1; saxpy_(&i__2, &ct, &b[k + b_dim1], ldb, &a[k + a_dim1], lda); i__2 = k - 1; sscal_(&i__2, &bkk, &a[k + a_dim1], lda); /* Computing 2nd power */ r__1 = bkk; a[k + k * a_dim1] = akk * (r__1 * r__1); /* L40: */ } } } return 0; /* End of SSYGS2 */ } /* ssygs2_ */
void spanel_bmod ( const int m, /* in - number of rows in the matrix */ const int w, /* in */ const int jcol, /* in */ const int nseg, /* in */ float *dense, /* out, of size n by w */ float *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in, of size n by w */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { #ifdef USE_VENDOR_BLAS #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; float alpha, beta; #endif register int k, ksub; int fsupc, nsupc, nsupr, nrow; int krep, krep_ind; float ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; int block_nrow; /* no of rows in a block row */ register int lptr; /* Points to the row subscripts of a supernode */ int kfnz, irow, no_zeros; register int isub, isub1, i; register int jj; /* Index through each column in the panel */ int *xsup, *supno; int *lsub, *xlsub; float *lusup; int *xlusup; int *repfnz_col; /* repfnz[] for a column in the panel */ float *dense_col; /* dense[] for a column in the panel */ float *tempv1; /* Used in 1-D update */ float *TriTmp, *MatvecTmp; /* used in 2-D update */ float zero = 0.0; float one = 1.0; register int ldaTmp; register int r_ind, r_hi; int maxsuper, rowblk, colblk; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = (float *) Glu->lusup; xlusup = Glu->xlusup; maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ); rowblk = sp_ienv(4); colblk = sp_ienv(5); ldaTmp = maxsuper + rowblk; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in a supernode * nsupr = no of rows in a supernode */ krep = segrep[k--]; fsupc = xsup[supno[krep]]; nsupc = krep - fsupc + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; nrow = nsupr - nsupc; lptr = xlsub[fsupc]; krep_ind = lptr + nsupc - 1; repfnz_col = repfnz; dense_col = dense; if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ TriTmp = tempv; /* Sequence through each column in panel -- triangular solves */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += segsze * (segsze - 1); ops[GEMV] += 2 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; dense_col[irow] -= ukj * lusup[luptr]; ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr += nsupr*(nsupc-1) + nsupc-1; luptr1 = luptr - nsupr; if ( segsze == 2 ) { ukj -= ukj1 * lusup[luptr1]; dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; dense_col[irow] -= (ukj*lusup[luptr] + ukj1*lusup[luptr1]); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; dense_col[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); } } } else { /* segsze >= 4 */ /* Copy U[*,j] segment from dense[*] to TriTmp[*], which holds the result of triangular solves. */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; TriTmp[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #else strsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #endif #else slsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); #endif } /* else ... */ } /* for jj ... end tri-solves */ /* Block row updates; push all the way into dense[*] block */ for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); luptr = xlusup[fsupc] + nsupc + r_ind; isub1 = lptr + nsupc + r_ind; repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; /* Sequence through each column in panel -- matrix-vector */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ /* Perform a block update, and scatter the result of matrix-vector to dense[]. */ no_zeros = kfnz - fsupc; luptr1 = luptr + nsupr * no_zeros; MatvecTmp = &TriTmp[maxsuper]; #ifdef USE_VENDOR_BLAS alpha = one; beta = zero; #ifdef _CRAY SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #else sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #endif #else smatvec(nsupr, block_nrow, segsze, &lusup[luptr1], TriTmp, MatvecTmp); #endif /* Scatter MatvecTmp[*] into SPA dense[*] temporarily * such that MatvecTmp[*] can be re-used for the * the next blok row update. dense[] will be copied into * global store after the whole panel has been finished. */ isub = isub1; for (i = 0; i < block_nrow; i++) { irow = lsub[isub]; dense_col[irow] -= MatvecTmp[i]; MatvecTmp[i] = zero; ++isub; } } /* for jj ... */ } /* for each block row ... */ /* Scatter the triangular solves into SPA dense[*] */ repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = TriTmp[i]; TriTmp[i] = zero; ++isub; } } /* for jj ... */ } else { /* 1-D block modification */ /* Sequence through each column in the panel */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += segsze * (segsze - 1); ops[GEMV] += 2 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; dense_col[irow] -= ukj * lusup[luptr]; ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { ukj -= ukj1 * lusup[luptr1]; dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; dense_col[irow] -= (ukj*lusup[luptr] + ukj1*lusup[luptr1]); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; dense_col[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); } } } else { /* segsze >= 4 */ /* * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense[]. */ no_zeros = kfnz - fsupc; /* Copy U[*,j] segment from dense[*] to tempv[*]: * The result of triangular solve is in tempv[*]; * The result of matrix vector update is in dense_col[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tempv[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else strsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else slsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[*] into SPA dense[*] temporarily, such * that tempv[*] can be used for the triangular solve of * the next column of the panel. They will be copied into * ucol[*] after the whole panel has been finished. */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = tempv[i]; tempv[i] = zero; isub++; } /* Scatter the update from tempv1[*] into SPA dense[*] */ /* Start dense rectangular L */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense_col[irow] -= tempv1[i]; tempv1[i] = zero; ++isub; } } /* else segsze>=4 ... */ } /* for each column in the panel... */ } /* else 1-D update ... */ } /* for each updating supernode ... */ }
void psgstrf_bmod1D_mv2( const int pnum, /* process number */ const int n, /* number of rows in the matrix */ const int w, /* current panel width */ const int jcol, /* leading column of the current panel */ const int fsupc,/* leading column of the updating s-node */ const int krep, /* last column of the updating s-node */ const int nsupc,/* number of columns in the updating s-node */ int nsupr, /* number of rows in the updating supernode */ int nrow, /* number of rows below the diagonal block of the updating supernode */ int *repfnz, /* in */ int *panel_lsub,/* modified */ int *w_lsub_end,/* modified */ int *spa_marker,/* modified; size n-by-w */ float *dense, /* modified */ float *tempv, /* working array - zeros on entry/exit */ GlobalLU_t *Glu,/* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * Performs numeric block updates (sup-panel) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Results are returned in SPA dense[*,w]. * */ float zero = 0.0; float one = 1.0; #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; float alpha = one, beta = zero; #endif float ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; register int lptr; /* start of row subscripts of the updating supernode */ register int i, j, kfnz, krep_ind, isub, irow, no_zeros, twocols; register int jj; /* index through each column in the panel */ int kfnz2[2], jj2[2]; /* detect two identical columns */ int *repfnz_col, *repfnz_col1; /* repfnz[] for a column in the panel */ float *dense_col, *dense_col1; /* dense[] for a column in the panel */ float *tri[2], *matvec[2]; int *col_marker, *col_marker1; /* each column of the spa_marker[*,w] */ int *col_lsub, *col_lsub1; /* each column of the panel_lsub[*,w] */ int *lsub, *xlsub_end; float *lusup; int *xlusup; register float flopcnt; #ifdef TIMING double *utime = Gstat->utime; double f_time; #endif lsub = Glu->lsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; lptr = Glu->xlsub[fsupc]; krep_ind = lptr + nsupc - 1; twocols = 0; tri[0] = tempv; tri[1] = tempv + n; #ifdef DEBUG if (jcol == BADPAN && krep == BADREP) { printf("(%d) dbmod1D[1] jcol %d, fsupc %d, krep %d, nsupc %d, nsupr %d, nrow %d\n", pnum, jcol, fsupc, krep, nsupc, nsupr, nrow); PrintInt10("lsub[xlsub[2774]", nsupr, &lsub[lptr]); } #endif /* ----------------------------------------------- * Sequence through each column in the panel ... * ----------------------------------------------- */ repfnz_col= repfnz; dense_col = dense; col_marker= spa_marker; col_lsub = panel_lsub; for (jj = jcol; jj < jcol + w; ++jj, col_marker += n, col_lsub += n, repfnz_col += n, dense_col += n) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze; Gstat->procstat[pnum].fcops += flopcnt; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { #ifdef TIMING f_time = SuperLU_timer_(); #endif ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; #ifdef DEBUG if (krep == BADCOL && jj == -1) { printf("(%d) dbmod1D[segsze=1]: k %d, j %d, ukj %.10e\n", pnum, lsub[krep_ind], jj, ukj); PrintInt10("segsze=1", nsupr, &lsub[lptr]); } #endif for (i = lptr + nsupc; i < xlsub_end[fsupc]; i++) { irow = lsub[i]; dense_col[irow] -= ukj * lusup[luptr]; ++luptr; #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else if ( segsze <= 3 ) { #ifdef TIMING f_time = SuperLU_timer_(); #endif ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { ukj -= ukj1 * lusup[luptr1]; dense_col[lsub[krep_ind]] = ukj; /*#pragma ivdep*/ for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; dense_col[irow] -= (ukj * lusup[luptr] + ukj1 * lusup[luptr1]); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; dense_col[irow] -= (ukj * lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2]); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else { /* segsze >= 4 */ if ( twocols == 1 ) { jj2[1] = jj; /* got two columns */ twocols = 0; for (j = 0; j < 2; ++j) { /* Do two tri-solves */ i = n * (jj2[j] - jcol); repfnz_col1 = &repfnz[i]; dense_col1 = &dense[i]; kfnz2[j] = repfnz_col1[krep]; no_zeros = kfnz2[j] - fsupc; segsze = krep - kfnz2[j] + 1; matvec[j] = tri[j] + segsze; /* Gather U[*,j] segment from dense[*] to tri[*]. */ isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tri[j][i] = dense_col1[irow]; /* Gather */ ++isub; } #ifdef TIMING f_time = SuperLU_timer_(); #endif /* start effective triangle */ luptr = xlusup[fsupc] + nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tri[j], &incx ); #else strsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tri[j], &incx ); #endif #else slsolve ( nsupr, segsze, &lusup[luptr], tri[j] ); #endif #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } /* end for j ... two tri-solves */ #ifdef TIMING f_time = SuperLU_timer_(); #endif if ( kfnz2[0] < kfnz2[1] ) { /* First column is bigger */ no_zeros = kfnz2[0] - fsupc; segsze = kfnz2[1] - kfnz2[0]; luptr = xlusup[fsupc] + nsupr * no_zeros + nsupc; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[0], &incx, &beta, matvec[0], &incy ); #else sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[0], &incx, &beta, matvec[0], &incy ); #endif #else smatvec (nsupr, nrow, segsze, &lusup[luptr], tri[0], matvec[0]); #endif } else if ( kfnz2[0] > kfnz2[1] ) { no_zeros = kfnz2[1] - fsupc; segsze = kfnz2[0] - kfnz2[1]; luptr = xlusup[fsupc] + nsupr * no_zeros + nsupc; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[1], &incx, &beta, matvec[1], &incy ); #else sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[1], &incx, &beta, matvec[1], &incy ); #endif #else smatvec (nsupr, nrow, segsze, &lusup[luptr], tri[1], matvec[1]); #endif } /* Do matrix-vector multiply with two destinations */ kfnz = SUPERLU_MAX( kfnz2[0], kfnz2[1] ); no_zeros = kfnz - fsupc; segsze = krep - kfnz + 1; luptr = xlusup[fsupc] + nsupr * no_zeros + nsupc; #if ( MACH==DEC ) sgemv2_(&nsupr, &nrow, &segsze, &lusup[luptr], &tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]], matvec[0], matvec[1]); /*#elif ( MACH==CRAY_PVP ) SGEMV2(&nsupr, &nrow, &segsze, &lusup[luptr], &tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]], matvec[0], matvec[1]);*/ #else //smatvec2(nsupr, nrow, segsze, &lusup[luptr], // &tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]], // matvec[0], matvec[1]); #endif #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif for (j = 0; j < 2; ++j) { i = n * (jj2[j] - jcol); dense_col1 = &dense[i]; col_marker1 = &spa_marker[i]; col_lsub1 = &panel_lsub[i]; no_zeros = kfnz2[j] - fsupc; segsze = krep - kfnz2[j] + 1; /* Scatter tri[*] into SPA dense[*]. */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col1[irow] = tri[j][i]; /* Scatter */ tri[j][i] = zero; ++isub; #ifdef DEBUG if (jj == -1 && krep == 3423) printf("(%d) dbmod1D[scatter] jj %d, dense_col[%d] %e\n", pnum, jj, irow, dense_col[irow]); #endif } /* Scatter matvec[*] into SPA dense[*]. */ /*#pragma ivdep*/ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense_col1[irow] -= matvec[j][i]; /* Scatter-add */ #ifdef SCATTER_FOUND if ( col_marker1[irow] != jj2[j] ) { col_marker1[irow] = jj2[j]; col_lsub1[w_lsub_end[jj2[j]-jcol]++] = irow; } #endif matvec[j][i] = zero; ++isub; } } /* end for two destination update */ } else { /* wait for a second column */ jj2[0] = jj; twocols = 1; } } /* else segsze >= 4 */ } /* for jj ... */ if ( twocols == 1 ) { /* one more column left */ i = n * (jj2[0] - jcol); repfnz_col1 = &repfnz[i]; dense_col1 = &dense[i]; col_marker1 = &spa_marker[i]; col_lsub1 = &panel_lsub[i]; kfnz = repfnz_col1[krep]; no_zeros = kfnz - fsupc; segsze = krep - kfnz + 1; /* Gather U[*,j] segment from dense[*] to tri[*]. */ isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tri[0][i] = dense_col1[irow]; /* Gather */ ++isub; } #ifdef TIMING f_time = SuperLU_timer_(); #endif /* start effective triangle */ luptr = xlusup[fsupc] + nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tri[0], &incx ); #else strsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tri[0], &incx ); #endif #else slsolve ( nsupr, segsze, &lusup[luptr], tri[0] ); #endif luptr += segsze; /* Dense matrix-vector */ matvec[0] = tri[0] + segsze; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[0], &incx, &beta, matvec[0], &incy ); #else sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[0], &incx, &beta, matvec[0], &incy ); #endif #else smatvec (nsupr, nrow, segsze, &lusup[luptr], tri[0], matvec[0]); #endif #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif /* Scatter tri[*] into SPA dense[*]. */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col1[irow] = tri[0][i]; /* Scatter */ tri[0][i] = zero; ++isub; #ifdef DEBUG if (jj == -1 && krep == 3423) printf("(%d) dbmod1D[scatter] jj %d, dense_col[%d] %e\n", pnum, jj, irow, dense_col[irow]); #endif } /* Scatter matvec[*] into SPA dense[*]. */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense_col1[irow] -= matvec[0][i]; /* Scatter-add */ #ifdef SCATTER_FOUND if ( col_marker1[irow] != jj2[0] ) { col_marker1[irow] = jj2[0]; col_lsub1[w_lsub_end[jj2[0]-jcol]++] = irow; } #endif matvec[0][i] = zero; ++isub; } } /* if twocols == 1 */ }
void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, const int n, const float *A, const int lda, float *x, const int incx) { strsv_(uplo, trans, diag, &n, A, &lda, x, &incx); }
/* * Performs numeric block updates within the relaxed snode. */ int ssnode_bmod ( const int jcol, /* in */ const int fsupc, /* in */ double *dense, /* in */ double *tempv, /* working array */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { #ifdef USE_VENDOR_BLAS #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; double alpha = -1.0, beta = 1.0; #endif int luptr, nsupc, nsupr, nrow; int isub, irow, i, iptr; register int ufirst, nextlu; int *lsub, *xlsub; double *lusup; int *xlusup; flops_t *ops = stat->ops; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; nextlu = xlusup[jcol]; /* * Process the supernodal portion of L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = 0; ++nextlu; } xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ if ( fsupc < jcol ) { luptr = xlusup[fsupc]; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; nsupc = jcol - fsupc; /* Excluding jcol */ ufirst = xlusup[jcol]; /* Points to the beginning of column jcol in supernode L\U(jsupno). */ nrow = nsupr - nsupc; ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], &tempv[0] ); /* Scatter tempv[*] into lusup[*] */ iptr = ufirst + nsupc; for (i = 0; i < nrow; i++) { lusup[iptr++] -= tempv[i]; tempv[i] = 0.0; } #endif } return 0; }
int psgstrf_snode_bmod( const int pnum, /* process number */ const int jcol, /* in - current column in the s-node */ const int jsupno, /* in */ const int fsupc, /* in - first column in the s-node */ float *dense, /* in */ float *tempv, /* working array */ GlobalLU_t *Glu, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Performs numeric block updates within the relaxed supernode. */ float zero = 0.0; float one = 1.0; float none = -1.0; #if ( MACH==CRAY_PVP ) _fcd ftcs1, ftcs2, ftcs3; #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; float alpha = none, beta = one; #endif int luptr, nsupc, nsupr, nrow; int isub, irow, i, iptr; register int ufirst, nextlu; float *lusup; int *lsub, *xlsub, *xlsub_end, *xlusup, *xlusup_end; register float flopcnt; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; xlusup_end = Glu->xlusup_end; nextlu = xlusup[jcol]; /* * Process the supernodal portion of L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; ++nextlu; } xlusup_end[jcol] = nextlu; if ( fsupc < jcol ) { luptr = xlusup[fsupc]; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; nsupc = jcol - fsupc; /* Excluding jcol */ ufirst = xlusup[jcol]; /* Points to the beginning of column jcol in supernode L\U(jsupno). */ nrow = nsupr - nsupc; flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; Gstat->procstat[pnum].fcops += flopcnt; /* ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; */ #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], &tempv[0] ); /* Scatter tempv[*] into lusup[*] */ iptr = ufirst + nsupc; for (i = 0; i < nrow; i++) { lusup[iptr++] -= tempv[i]; tempv[i] = zero; } #endif } return 0; }
/* Subroutine */ int sgglse_(integer *m, integer *n, integer *p, real *a, integer *lda, real *b, integer *ldb, real *c, real *d, real *x, real * work, integer *lwork, integer *info) { /* -- LAPACK driver routine (version 2.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University September 30, 1994 Purpose ======= SGGLSE solves the linear equality-constrained least squares (LSE) problem: minimize || c - A*x ||_2 subject to B*x = d where A is an M-by-N matrix, B is a P-by-N matrix, c is a given M-vector, and d is a given P-vector. It is assumed that P <= N <= M+P, and rank(B) = P and rank( ( A ) ) = N. ( ( B ) ) These conditions ensure that the LSE problem has a unique solution, which is obtained using a GRQ factorization of the matrices B and A. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrices A and B. N >= 0. P (input) INTEGER The number of rows of the matrix B. 0 <= P <= N <= M+P. A (input/output) REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). B (input/output) REAL array, dimension (LDB,N) On entry, the P-by-N matrix B. On exit, B is destroyed. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,P). C (input/output) REAL array, dimension (M) On entry, C contains the right hand side vector for the least squares part of the LSE problem. On exit, the residual sum of squares for the solution is given by the sum of squares of elements N-P+1 to M of vector C. D (input/output) REAL array, dimension (P) On entry, D contains the right hand side vector for the constrained equation. On exit, D is destroyed. X (output) REAL array, dimension (N) On exit, X is the solution of the LSE problem. WORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,M+N+P). For optimum performance LWORK >= P+min(M,N)+max(M,N)*NB, where NB is an upper bound for the optimal blocksizes for SGEQRF, SGERQF, SORMQR and SORMRQ. INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. ===================================================================== Test the input parameters Parameter adjustments Function Body */ /* Table of constant values */ static integer c__1 = 1; static real c_b11 = -1.f; static real c_b13 = 1.f; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; /* Local variables */ static integer lopt; extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); static integer mn, nr; extern /* Subroutine */ int xerbla_(char *, integer *), sggrqf_( integer *, integer *, integer *, real *, integer *, real *, real * , integer *, real *, real *, integer *, integer *), sormqr_(char * , char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *), sormrq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); #define C(I) c[(I)-1] #define D(I) d[(I)-1] #define X(I) x[(I)-1] #define WORK(I) work[(I)-1] #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] #define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)] *info = 0; mn = min(*m,*n); if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*p < 0 || *p > *n || *p < *n - *m) { *info = -3; } else if (*lda < max(1,*m)) { *info = -5; } else if (*ldb < max(1,*p)) { *info = -7; } else /* if(complicated condition) */ { /* Computing MAX */ i__1 = 1, i__2 = *m + *n + *p; if (*lwork < max(i__1,i__2)) { *info = -12; } } if (*info != 0) { i__1 = -(*info); xerbla_("SGGLSE", &i__1); return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Compute the GRQ factorization of matrices B and A: B*Q' = ( 0 T12 ) P Z'*A*Q' = ( R11 R12 ) N-P N-P P ( 0 R22 ) M+P-N N-P P where T12 and R11 are upper triangular, and Q and Z are orthogonal. */ i__1 = *lwork - *p - mn; sggrqf_(p, m, n, &B(1,1), ldb, &WORK(1), &A(1,1), lda, &WORK(*p + 1), &WORK(*p + mn + 1), &i__1, info); lopt = WORK(*p + mn + 1); /* Update c = Z'*c = ( c1 ) N-P ( c2 ) M+P-N */ i__1 = max(1,*m); i__2 = *lwork - *p - mn; sormqr_("Left", "Transpose", m, &c__1, &mn, &A(1,1), lda, &WORK(*p + 1), &C(1), &i__1, &WORK(*p + mn + 1), &i__2, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) WORK(*p + mn + 1); lopt = max(i__1,i__2); /* Solve T12*x2 = d for x2 */ strsv_("Upper", "No transpose", "Non unit", p, &B(1,*n-*p+1), ldb, &D(1), &c__1); /* Update c1 */ i__1 = *n - *p; sgemv_("No transpose", &i__1, p, &c_b11, &A(1,*n-*p+1), lda, &D(1), &c__1, &c_b13, &C(1), &c__1); /* Sovle R11*x1 = c1 for x1 */ i__1 = *n - *p; strsv_("Upper", "No transpose", "Non unit", &i__1, &A(1,1), lda, &C( 1), &c__1); /* Put the solutions in X */ i__1 = *n - *p; scopy_(&i__1, &C(1), &c__1, &X(1), &c__1); scopy_(p, &D(1), &c__1, &X(*n - *p + 1), &c__1); /* Compute the residual vector: */ if (*m < *n) { nr = *m + *p - *n; i__1 = *n - *m; sgemv_("No transpose", &nr, &i__1, &c_b11, &A(*n-*p+1,*m+1), lda, &D(nr + 1), &c__1, &c_b13, &C(*n - *p + 1), & c__1); } else { nr = *p; } strmv_("Upper", "No transpose", "Non unit", &nr, &A(*n-*p+1,*n-*p+1), lda, &D(1), &c__1); saxpy_(&nr, &c_b11, &D(1), &c__1, &C(*n - *p + 1), &c__1); /* Backward transformation x = Q'*x */ i__1 = *lwork - *p - mn; sormrq_("Left", "Transpose", n, &c__1, p, &B(1,1), ldb, &WORK(1), &X( 1), n, &WORK(*p + mn + 1), &i__1, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) WORK(*p + mn + 1); WORK(1) = (real) (*p + mn + max(i__1,i__2)); return 0; /* End of SGGLSE */ } /* sgglse_ */
/*! \brief Solves one of the systems of equations A*x = b, or A'*x = b * * <pre> * Purpose * ======= * * sp_strsv() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A'*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU. * * x - (input/output) float* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * </pre> */ int sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info) { #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif SCformat *Lstore; NCformat *Ustore; float *Lval, *Uval; int incx = 1, incy = 1; float alpha = 1.0, beta = 1.0; int nrow; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, k, iptr, jcol; float *work; flops_t solve_ops; /* Test the input parameters */ *info = 0; if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) *info = -2; else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); xerbla_("sp_strsv", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( !(work = floatCalloc(L->nrow)) ) ABORT("Malloc fails for work in sp_strsv()."); if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1); solve_ops += 2 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { irow = L_SUB(iptr); ++luptr; x[irow] -= x[fsupc] * Lval[luptr]; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif #else slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); x[irow] -= work[i]; /* Scatter */ work[i] = 0.0; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { irow = U_SUB(i); x[irow] -= x[fsupc] * Uval[i]; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); x[irow] -= x[jcol] * Uval[i]; } } } } /* for k ... */ } } else { /* Form x := inv(A')*x */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 2 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); x[jcol] -= x[irow] * Lval[i]; iptr++; } } if ( nsupc > 1 ) { solve_ops += nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); x[jcol] -= x[irow] * Uval[i]; } } solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } stat->ops[SOLVE] += solve_ops; SUPERLU_FREE(work); return 0; }
/* Subroutine */ int sggglm_(integer *n, integer *m, integer *p, real *a, integer *lda, real *b, integer *ldb, real *d__, real *x, real *y, real *work, integer *lwork, integer *info) { /* -- LAPACK driver routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University June 30, 1999 Purpose ======= SGGGLM solves a general Gauss-Markov linear model (GLM) problem: minimize || y ||_2 subject to d = A*x + B*y x where A is an N-by-M matrix, B is an N-by-P matrix, and d is a given N-vector. It is assumed that M <= N <= M+P, and rank(A) = M and rank( A B ) = N. Under these assumptions, the constrained equation is always consistent, and there is a unique solution x and a minimal 2-norm solution y, which is obtained using a generalized QR factorization of A and B. In particular, if matrix B is square nonsingular, then the problem GLM is equivalent to the following weighted linear least squares problem minimize || inv(B)*(d-A*x) ||_2 x where inv(B) denotes the inverse of B. Arguments ========= N (input) INTEGER The number of rows of the matrices A and B. N >= 0. M (input) INTEGER The number of columns of the matrix A. 0 <= M <= N. P (input) INTEGER The number of columns of the matrix B. P >= N-M. A (input/output) REAL array, dimension (LDA,M) On entry, the N-by-M matrix A. On exit, A is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) REAL array, dimension (LDB,P) On entry, the N-by-P matrix B. On exit, B is destroyed. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). D (input/output) REAL array, dimension (N) On entry, D is the left hand side of the GLM equation. On exit, D is destroyed. X (output) REAL array, dimension (M) Y (output) REAL array, dimension (P) On exit, X and Y are the solutions of the GLM problem. WORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,N+M+P). For optimum performance, LWORK >= M+min(N,P)+max(N,P)*NB, where NB is an upper bound for the optimal blocksizes for SGEQRF, SGERQF, SORMQR and SORMRQ. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. =================================================================== Test the input parameters Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static integer c_n1 = -1; static real c_b32 = -1.f; static real c_b34 = 1.f; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4; /* Local variables */ static integer lopt, i__; extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); static integer nb, np; extern /* Subroutine */ int xerbla_(char *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern /* Subroutine */ int sggqrf_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, real *, integer * , integer *); static integer nb1, nb2, nb3, nb4, lwkopt; static logical lquery; extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *), sormrq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real * , integer *, real *, integer *, integer *); #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; --d__; --x; --y; --work; /* Function Body */ *info = 0; np = min(*n,*p); nb1 = ilaenv_(&c__1, "SGEQRF", " ", n, m, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); nb2 = ilaenv_(&c__1, "SGERQF", " ", n, m, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); nb3 = ilaenv_(&c__1, "SORMQR", " ", n, m, p, &c_n1, (ftnlen)6, (ftnlen)1); nb4 = ilaenv_(&c__1, "SORMRQ", " ", n, m, p, &c_n1, (ftnlen)6, (ftnlen)1); /* Computing MAX */ i__1 = max(nb1,nb2), i__1 = max(i__1,nb3); nb = max(i__1,nb4); lwkopt = *m + np + max(*n,*p) * nb; work[1] = (real) lwkopt; lquery = *lwork == -1; if (*n < 0) { *info = -1; } else if (*m < 0 || *m > *n) { *info = -2; } else if (*p < 0 || *p < *n - *m) { *info = -3; } else if (*lda < max(1,*n)) { *info = -5; } else if (*ldb < max(1,*n)) { *info = -7; } else /* if(complicated condition) */ { /* Computing MAX */ i__1 = 1, i__2 = *n + *m + *p; if (*lwork < max(i__1,i__2) && ! lquery) { *info = -12; } } if (*info != 0) { i__1 = -(*info); xerbla_("SGGGLM", &i__1); return 0; } else if (lquery) { return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Compute the GQR factorization of matrices A and B: Q'*A = ( R11 ) M, Q'*B*Z' = ( T11 T12 ) M ( 0 ) N-M ( 0 T22 ) N-M M M+P-N N-M where R11 and T22 are upper triangular, and Q and Z are orthogonal. */ i__1 = *lwork - *m - np; sggqrf_(n, m, p, &a[a_offset], lda, &work[1], &b[b_offset], ldb, &work[*m + 1], &work[*m + np + 1], &i__1, info); lopt = work[*m + np + 1]; /* Update left-hand-side vector d = Q'*d = ( d1 ) M ( d2 ) N-M */ i__1 = max(1,*n); i__2 = *lwork - *m - np; sormqr_("Left", "Transpose", n, &c__1, m, &a[a_offset], lda, &work[1], & d__[1], &i__1, &work[*m + np + 1], &i__2, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) work[*m + np + 1]; lopt = max(i__1,i__2); /* Solve T22*y2 = d2 for y2 */ i__1 = *n - *m; strsv_("Upper", "No transpose", "Non unit", &i__1, &b_ref(*m + 1, *m + *p - *n + 1), ldb, &d__[*m + 1], &c__1); i__1 = *n - *m; scopy_(&i__1, &d__[*m + 1], &c__1, &y[*m + *p - *n + 1], &c__1); /* Set y1 = 0 */ i__1 = *m + *p - *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.f; /* L10: */ } /* Update d1 = d1 - T12*y2 */ i__1 = *n - *m; sgemv_("No transpose", m, &i__1, &c_b32, &b_ref(1, *m + *p - *n + 1), ldb, &y[*m + *p - *n + 1], &c__1, &c_b34, &d__[1], &c__1); /* Solve triangular system: R11*x = d1 */ strsv_("Upper", "No Transpose", "Non unit", m, &a[a_offset], lda, &d__[1], &c__1); /* Copy D to X */ scopy_(m, &d__[1], &c__1, &x[1], &c__1); /* Backward transformation y = Z'*y Computing MAX */ i__1 = 1, i__2 = *n - *p + 1; i__3 = max(1,*p); i__4 = *lwork - *m - np; sormrq_("Left", "Transpose", p, &c__1, &np, &b_ref(max(i__1,i__2), 1), ldb, &work[*m + 1], &y[1], &i__3, &work[*m + np + 1], &i__4, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) work[*m + np + 1]; work[1] = (real) (*m + np + max(i__1,i__2)); return 0; /* End of SGGGLM */ } /* sggglm_ */
/* Subroutine */ int strrfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real *work, integer *iwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3; real r__1, r__2, r__3; /* Local variables */ integer i__, j, k; real s, xk; integer nz; real eps; integer kase; real safe1, safe2; integer isave[3]; logical upper; real safmin; logical notran; char transt[1]; logical nounit; real lstres; /* -- LAPACK routine (version 3.2) -- */ /* November 2006 */ /* Modified to call SLACN2 in place of SLACON, 7 Feb 03, SJH. */ /* Purpose */ /* ======= */ /* STRRFS provides error bounds and backward error estimates for the */ /* solution to a system of linear equations with a triangular */ /* coefficient matrix. */ /* The solution matrix X must be computed by STRTRS or some other */ /* means before entering this routine. STRRFS does not do iterative */ /* refinement because doing so cannot improve the backward error. */ /* Arguments */ /* ========= */ /* UPLO (input) CHARACTER*1 */ /* = 'U': A is upper triangular; */ /* = 'L': A is lower triangular. */ /* TRANS (input) CHARACTER*1 */ /* Specifies the form of the system of equations: */ /* = 'N': A * X = B (No transpose) */ /* = 'T': A**T * X = B (Transpose) */ /* = 'C': A**H * X = B (Conjugate transpose = Transpose) */ /* DIAG (input) CHARACTER*1 */ /* = 'N': A is non-unit triangular; */ /* = 'U': A is unit triangular. */ /* N (input) INTEGER */ /* The order of the matrix A. N >= 0. */ /* NRHS (input) INTEGER */ /* The number of right hand sides, i.e., the number of columns */ /* of the matrices B and X. NRHS >= 0. */ /* A (input) REAL array, dimension (LDA,N) */ /* The triangular matrix A. If UPLO = 'U', the leading N-by-N */ /* upper triangular part of the array A contains the upper */ /* triangular matrix, and the strictly lower triangular part of */ /* A is not referenced. If UPLO = 'L', the leading N-by-N lower */ /* triangular part of the array A contains the lower triangular */ /* matrix, and the strictly upper triangular part of A is not */ /* referenced. If DIAG = 'U', the diagonal elements of A are */ /* also not referenced and are assumed to be 1. */ /* LDA (input) INTEGER */ /* The leading dimension of the array A. LDA >= max(1,N). */ /* B (input) REAL array, dimension (LDB,NRHS) */ /* The right hand side matrix B. */ /* LDB (input) INTEGER */ /* The leading dimension of the array B. LDB >= max(1,N). */ /* X (input) REAL array, dimension (LDX,NRHS) */ /* The solution matrix X. */ /* LDX (input) INTEGER */ /* The leading dimension of the array X. LDX >= max(1,N). */ /* FERR (output) REAL array, dimension (NRHS) */ /* The estimated forward error bound for each solution vector */ /* X(j) (the j-th column of the solution matrix X). */ /* If XTRUE is the true solution corresponding to X(j), FERR(j) */ /* is an estimated upper bound for the magnitude of the largest */ /* element in (X(j) - XTRUE) divided by the magnitude of the */ /* largest element in X(j). The estimate is as reliable as */ /* the estimate for RCOND, and is almost always a slight */ /* overestimate of the true error. */ /* BERR (output) REAL array, dimension (NRHS) */ /* The componentwise relative backward error of each solution */ /* vector X(j) (i.e., the smallest relative change in */ /* any element of A or B that makes X(j) an exact solution). */ /* WORK (workspace) REAL array, dimension (3*N) */ /* IWORK (workspace) INTEGER array, dimension (N) */ /* INFO (output) INTEGER */ /* = 0: successful exit */ /* < 0: if INFO = -i, the i-th argument had an illegal value */ /* ===================================================================== */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1; b -= b_offset; x_dim1 = *ldx; x_offset = 1 + x_dim1; x -= x_offset; --ferr; --berr; --work; --iwork; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); notran = lsame_(trans, "N"); nounit = lsame_(diag, "N"); if (! upper && ! lsame_(uplo, "L")) { *info = -1; } else if (! notran && ! lsame_(trans, "T") && ! lsame_(trans, "C")) { *info = -2; } else if (! nounit && ! lsame_(diag, "U")) { *info = -3; } else if (*n < 0) { *info = -4; } else if (*nrhs < 0) { *info = -5; } else if (*lda < max(1,*n)) { *info = -7; } else if (*ldb < max(1,*n)) { *info = -9; } else if (*ldx < max(1,*n)) { *info = -11; } if (*info != 0) { i__1 = -(*info); xerbla_("STRRFS", &i__1); return 0; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { ferr[j] = 0.f; berr[j] = 0.f; } return 0; } if (notran) { *(unsigned char *)transt = 'T'; } else { *(unsigned char *)transt = 'N'; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = *n + 1; eps = slamch_("Epsilon"); safmin = slamch_("Safe minimum"); safe1 = nz * safmin; safe2 = safe1 / eps; /* Do for each right hand side */ i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { /* Compute residual R = B - op(A) * X, */ /* where op(A) = A or A', depending on TRANS. */ scopy_(n, &x[j * x_dim1 + 1], &c__1, &work[*n + 1], &c__1); strmv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1); saxpy_(n, &c_b19, &b[j * b_dim1 + 1], &c__1, &work[*n + 1], &c__1); /* Compute componentwise relative backward error from formula */ /* max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) */ /* where abs(Z) is the componentwise absolute value of the matrix */ /* or vector Z. If the i-th component of the denominator is less */ /* than SAFE2, then SAFE1 is added to the i-th components of the */ /* numerator and denominator before dividing. */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[i__] = (r__1 = b[i__ + j * b_dim1], dabs(r__1)); } if (notran) { /* Compute abs(A)*abs(X) + abs(B). */ if (upper) { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (r__1 = x[k + j * x_dim1], dabs(r__1)); i__3 = k; for (i__ = 1; i__ <= i__3; ++i__) { work[i__] += (r__1 = a[i__ + k * a_dim1], dabs( r__1)) * xk; } } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (r__1 = x[k + j * x_dim1], dabs(r__1)); i__3 = k - 1; for (i__ = 1; i__ <= i__3; ++i__) { work[i__] += (r__1 = a[i__ + k * a_dim1], dabs( r__1)) * xk; } work[k] += xk; } } } else { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (r__1 = x[k + j * x_dim1], dabs(r__1)); i__3 = *n; for (i__ = k; i__ <= i__3; ++i__) { work[i__] += (r__1 = a[i__ + k * a_dim1], dabs( r__1)) * xk; } } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (r__1 = x[k + j * x_dim1], dabs(r__1)); i__3 = *n; for (i__ = k + 1; i__ <= i__3; ++i__) { work[i__] += (r__1 = a[i__ + k * a_dim1], dabs( r__1)) * xk; } work[k] += xk; } } } } else { /* Compute abs(A')*abs(X) + abs(B). */ if (upper) { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = 0.f; i__3 = k; for (i__ = 1; i__ <= i__3; ++i__) { s += (r__1 = a[i__ + k * a_dim1], dabs(r__1)) * ( r__2 = x[i__ + j * x_dim1], dabs(r__2)); } work[k] += s; } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = (r__1 = x[k + j * x_dim1], dabs(r__1)); i__3 = k - 1; for (i__ = 1; i__ <= i__3; ++i__) { s += (r__1 = a[i__ + k * a_dim1], dabs(r__1)) * ( r__2 = x[i__ + j * x_dim1], dabs(r__2)); } work[k] += s; } } } else { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = 0.f; i__3 = *n; for (i__ = k; i__ <= i__3; ++i__) { s += (r__1 = a[i__ + k * a_dim1], dabs(r__1)) * ( r__2 = x[i__ + j * x_dim1], dabs(r__2)); } work[k] += s; } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = (r__1 = x[k + j * x_dim1], dabs(r__1)); i__3 = *n; for (i__ = k + 1; i__ <= i__3; ++i__) { s += (r__1 = a[i__ + k * a_dim1], dabs(r__1)) * ( r__2 = x[i__ + j * x_dim1], dabs(r__2)); } work[k] += s; } } } } s = 0.f; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] > safe2) { /* Computing MAX */ r__2 = s, r__3 = (r__1 = work[*n + i__], dabs(r__1)) / work[ i__]; s = dmax(r__2,r__3); } else { /* Computing MAX */ r__2 = s, r__3 = ((r__1 = work[*n + i__], dabs(r__1)) + safe1) / (work[i__] + safe1); s = dmax(r__2,r__3); } } berr[j] = s; /* Bound error from formula */ /* norm(X - XTRUE) / norm(X) .le. FERR = */ /* norm( abs(inv(op(A)))* */ /* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) */ /* where */ /* norm(Z) is the magnitude of the largest component of Z */ /* inv(op(A)) is the inverse of op(A) */ /* abs(Z) is the componentwise absolute value of the matrix or */ /* vector Z */ /* NZ is the maximum number of nonzeros in any row of A, plus 1 */ /* EPS is machine epsilon */ /* The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) */ /* is incremented by SAFE1 if the i-th component of */ /* abs(op(A))*abs(X) + abs(B) is less than SAFE2. */ /* Use SLACN2 to estimate the infinity-norm of the matrix */ /* inv(op(A)) * diag(W), */ /* where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] > safe2) { work[i__] = (r__1 = work[*n + i__], dabs(r__1)) + nz * eps * work[i__]; } else { work[i__] = (r__1 = work[*n + i__], dabs(r__1)) + nz * eps * work[i__] + safe1; } } kase = 0; L210: slacn2_(n, &work[(*n << 1) + 1], &work[*n + 1], &iwork[1], &ferr[j], & kase, isave); if (kase != 0) { if (kase == 1) { /* Multiply by diag(W)*inv(op(A)'). */ strsv_(uplo, transt, diag, n, &a[a_offset], lda, &work[*n + 1] , &c__1); i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[*n + i__] = work[i__] * work[*n + i__]; } } else { /* Multiply by inv(op(A))*diag(W). */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[*n + i__] = work[i__] * work[*n + i__]; } strsv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1); } goto L210; } /* Normalize error. */ lstres = 0.f; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { /* Computing MAX */ r__2 = lstres, r__3 = (r__1 = x[i__ + j * x_dim1], dabs(r__1)); lstres = dmax(r__2,r__3); } if (lstres != 0.f) { ferr[j] /= lstres; } } return 0; /* End of STRRFS */ } /* strrfs_ */
/* Subroutine */ int slatrs_(char *uplo, char *trans, char *diag, char * normin, integer *n, real *a, integer *lda, real *x, real *scale, real *cnorm, integer *info, ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len, ftnlen normin_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; real r__1, r__2, r__3; /* Local variables */ static integer i__, j; static real xj, rec, tjj; static integer jinc; static real xbnd; static integer imax; static real tmax, tjjs; extern doublereal sdot_(integer *, real *, integer *, real *, integer *); static real xmax, grow, sumj; extern logical lsame_(char *, char *, ftnlen, ftnlen); extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); static real tscal, uscal; static integer jlast; extern doublereal sasum_(integer *, real *, integer *); static logical upper; extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen); extern doublereal slamch_(char *, ftnlen); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); static real bignum; extern integer isamax_(integer *, real *, integer *); static logical notran; static integer jfirst; static real smlnum; static logical nounit; /* -- LAPACK auxiliary routine (version 3.0) -- */ /* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., */ /* Courant Institute, Argonne National Lab, and Rice University */ /* June 30, 1992 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SLATRS solves one of the triangular systems */ /* A *x = s*b or A'*x = s*b */ /* with scaling to prevent overflow. Here A is an upper or lower */ /* triangular matrix, A' denotes the transpose of A, x and b are */ /* n-element vectors, and s is a scaling factor, usually less than */ /* or equal to 1, chosen so that the components of x will be less than */ /* the overflow threshold. If the unscaled problem will not cause */ /* overflow, the Level 2 BLAS routine STRSV is called. If the matrix A */ /* is singular (A(j,j) = 0 for some j), then s is set to 0 and a */ /* non-trivial solution to A*x = 0 is returned. */ /* Arguments */ /* ========= */ /* UPLO (input) CHARACTER*1 */ /* Specifies whether the matrix A is upper or lower triangular. */ /* = 'U': Upper triangular */ /* = 'L': Lower triangular */ /* TRANS (input) CHARACTER*1 */ /* Specifies the operation applied to A. */ /* = 'N': Solve A * x = s*b (No transpose) */ /* = 'T': Solve A'* x = s*b (Transpose) */ /* = 'C': Solve A'* x = s*b (Conjugate transpose = Transpose) */ /* DIAG (input) CHARACTER*1 */ /* Specifies whether or not the matrix A is unit triangular. */ /* = 'N': Non-unit triangular */ /* = 'U': Unit triangular */ /* NORMIN (input) CHARACTER*1 */ /* Specifies whether CNORM has been set or not. */ /* = 'Y': CNORM contains the column norms on entry */ /* = 'N': CNORM is not set on entry. On exit, the norms will */ /* be computed and stored in CNORM. */ /* N (input) INTEGER */ /* The order of the matrix A. N >= 0. */ /* A (input) REAL array, dimension (LDA,N) */ /* The triangular matrix A. If UPLO = 'U', the leading n by n */ /* upper triangular part of the array A contains the upper */ /* triangular matrix, and the strictly lower triangular part of */ /* A is not referenced. If UPLO = 'L', the leading n by n lower */ /* triangular part of the array A contains the lower triangular */ /* matrix, and the strictly upper triangular part of A is not */ /* referenced. If DIAG = 'U', the diagonal elements of A are */ /* also not referenced and are assumed to be 1. */ /* LDA (input) INTEGER */ /* The leading dimension of the array A. LDA >= max (1,N). */ /* X (input/output) REAL array, dimension (N) */ /* On entry, the right hand side b of the triangular system. */ /* On exit, X is overwritten by the solution vector x. */ /* SCALE (output) REAL */ /* The scaling factor s for the triangular system */ /* A * x = s*b or A'* x = s*b. */ /* If SCALE = 0, the matrix A is singular or badly scaled, and */ /* the vector x is an exact or approximate solution to A*x = 0. */ /* CNORM (input or output) REAL array, dimension (N) */ /* If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ /* contains the norm of the off-diagonal part of the j-th column */ /* of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ /* to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ /* must be greater than or equal to the 1-norm. */ /* If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ /* returns the 1-norm of the offdiagonal part of the j-th column */ /* of A. */ /* INFO (output) INTEGER */ /* = 0: successful exit */ /* < 0: if INFO = -k, the k-th argument had an illegal value */ /* Further Details */ /* ======= ======= */ /* A rough bound on x is computed; if that is less than overflow, STRSV */ /* is called, otherwise, specific code is used which checks for possible */ /* overflow or divide-by-zero at every operation. */ /* A columnwise scheme is used for solving A*x = b. The basic algorithm */ /* if A is lower triangular is */ /* x[1:n] := b[1:n] */ /* for j = 1, ..., n */ /* x(j) := x(j) / A(j,j) */ /* x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j] */ /* end */ /* Define bounds on the components of x after j iterations of the loop: */ /* M(j) = bound on x[1:j] */ /* G(j) = bound on x[j+1:n] */ /* Initially, let M(0) = 0 and G(0) = max{x(i), i=1,...,n}. */ /* Then for iteration j+1 we have */ /* M(j+1) <= G(j) / | A(j+1,j+1) | */ /* G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] | */ /* <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | ) */ /* where CNORM(j+1) is greater than or equal to the infinity-norm of */ /* column j+1 of A, not counting the diagonal. Hence */ /* G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | ) */ /* 1<=i<=j */ /* and */ /* |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| ) */ /* 1<=i< j */ /* Since |x(j)| <= M(j), we use the Level 2 BLAS routine STRSV if the */ /* reciprocal of the largest M(j), j=1,..,n, is larger than */ /* max(underflow, 1/overflow). */ /* The bound on x(j) is also used to determine when a step in the */ /* columnwise method can be performed without fear of overflow. If */ /* the computed bound is greater than a large constant, x is scaled to */ /* prevent overflow, but if the bound overflows, x is set to 0, x(j) to */ /* 1, and scale to 0, and a non-trivial solution to A*x = 0 is found. */ /* Similarly, a row-wise scheme is used to solve A'*x = b. The basic */ /* algorithm for A upper triangular is */ /* for j = 1, ..., n */ /* x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j) */ /* end */ /* We simultaneously compute two bounds */ /* G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j */ /* M(j) = bound on x(i), 1<=i<=j */ /* The initial values are G(0) = 0, M(0) = max{b(i), i=1,..,n}, and we */ /* add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1. */ /* Then the bound on x(j) is */ /* M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) | */ /* <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| ) */ /* 1<=i<=j */ /* and we can safely call STRSV if 1/M(n) and 1/G(n) are both greater */ /* than max(underflow, 1/overflow). */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; --x; --cnorm; /* Function Body */ *info = 0; upper = lsame_(uplo, "U", (ftnlen)1, (ftnlen)1); notran = lsame_(trans, "N", (ftnlen)1, (ftnlen)1); nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); /* Test the input parameters. */ if (! upper && ! lsame_(uplo, "L", (ftnlen)1, (ftnlen)1)) { *info = -1; } else if (! notran && ! lsame_(trans, "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (ftnlen)1)) { *info = -2; } else if (! nounit && ! lsame_(diag, "U", (ftnlen)1, (ftnlen)1)) { *info = -3; } else if (! lsame_(normin, "Y", (ftnlen)1, (ftnlen)1) && ! lsame_(normin, "N", (ftnlen)1, (ftnlen)1)) { *info = -4; } else if (*n < 0) { *info = -5; } else if (*lda < max(1,*n)) { *info = -7; } if (*info != 0) { i__1 = -(*info); xerbla_("SLATRS", &i__1, (ftnlen)6); return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Determine machine dependent parameters to control overflow. */ smlnum = slamch_("Safe minimum", (ftnlen)12) / slamch_("Precision", ( ftnlen)9); bignum = 1.f / smlnum; *scale = 1.f; if (lsame_(normin, "N", (ftnlen)1, (ftnlen)1)) { /* Compute the 1-norm of each column, not including the diagonal. */ if (upper) { /* A is upper triangular. */ i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j - 1; cnorm[j] = sasum_(&i__2, &a[j * a_dim1 + 1], &c__1); /* L10: */ } } else { /* A is lower triangular. */ i__1 = *n - 1; for (j = 1; j <= i__1; ++j) { i__2 = *n - j; cnorm[j] = sasum_(&i__2, &a[j + 1 + j * a_dim1], &c__1); /* L20: */ } cnorm[*n] = 0.f; } } /* Scale the column norms by TSCAL if the maximum element in CNORM is */ /* greater than BIGNUM. */ imax = isamax_(n, &cnorm[1], &c__1); tmax = cnorm[imax]; if (tmax <= bignum) { tscal = 1.f; } else { tscal = 1.f / (smlnum * tmax); sscal_(n, &tscal, &cnorm[1], &c__1); } /* Compute a bound on the computed solution vector to see if the */ /* Level 2 BLAS routine STRSV can be used. */ j = isamax_(n, &x[1], &c__1); xmax = (r__1 = x[j], dabs(r__1)); xbnd = xmax; if (notran) { /* Compute the growth in A * x = b. */ if (upper) { jfirst = *n; jlast = 1; jinc = -1; } else { jfirst = 1; jlast = *n; jinc = 1; } if (tscal != 1.f) { grow = 0.f; goto L50; } if (nounit) { /* A is non-unit triangular. */ /* Compute GROW = 1/G(j) and XBND = 1/M(j). */ /* Initially, G(0) = max{x(i), i=1,...,n}. */ grow = 1.f / dmax(xbnd,smlnum); xbnd = grow; i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L50; } /* M(j) = G(j-1) / abs(A(j,j)) */ tjj = (r__1 = a[j + j * a_dim1], dabs(r__1)); /* Computing MIN */ r__1 = xbnd, r__2 = dmin(1.f,tjj) * grow; xbnd = dmin(r__1,r__2); if (tjj + cnorm[j] >= smlnum) { /* G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */ grow *= tjj / (tjj + cnorm[j]); } else { /* G(j) could overflow, set GROW to 0. */ grow = 0.f; } /* L30: */ } grow = xbnd; } else { /* A is unit triangular. */ /* Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. */ /* Computing MIN */ r__1 = 1.f, r__2 = 1.f / dmax(xbnd,smlnum); grow = dmin(r__1,r__2); i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L50; } /* G(j) = G(j-1)*( 1 + CNORM(j) ) */ grow *= 1.f / (cnorm[j] + 1.f); /* L40: */ } } L50: ; } else { /* Compute the growth in A' * x = b. */ if (upper) { jfirst = 1; jlast = *n; jinc = 1; } else { jfirst = *n; jlast = 1; jinc = -1; } if (tscal != 1.f) { grow = 0.f; goto L80; } if (nounit) { /* A is non-unit triangular. */ /* Compute GROW = 1/G(j) and XBND = 1/M(j). */ /* Initially, M(0) = max{x(i), i=1,...,n}. */ grow = 1.f / dmax(xbnd,smlnum); xbnd = grow; i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L80; } /* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */ xj = cnorm[j] + 1.f; /* Computing MIN */ r__1 = grow, r__2 = xbnd / xj; grow = dmin(r__1,r__2); /* M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */ tjj = (r__1 = a[j + j * a_dim1], dabs(r__1)); if (xj > tjj) { xbnd *= tjj / xj; } /* L60: */ } grow = dmin(grow,xbnd); } else { /* A is unit triangular. */ /* Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. */ /* Computing MIN */ r__1 = 1.f, r__2 = 1.f / dmax(xbnd,smlnum); grow = dmin(r__1,r__2); i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L80; } /* G(j) = ( 1 + CNORM(j) )*G(j-1) */ xj = cnorm[j] + 1.f; grow /= xj; /* L70: */ } } L80: ; } if (grow * tscal > smlnum) { /* Use the Level 2 BLAS solve if the reciprocal of the bound on */ /* elements of X is not too small. */ strsv_(uplo, trans, diag, n, &a[a_offset], lda, &x[1], &c__1, (ftnlen) 1, (ftnlen)1, (ftnlen)1); } else { /* Use a Level 1 BLAS solve, scaling intermediate results. */ if (xmax > bignum) { /* Scale X so that its components are less than or equal to */ /* BIGNUM in absolute value. */ *scale = bignum / xmax; sscal_(n, scale, &x[1], &c__1); xmax = bignum; } if (notran) { /* Solve A * x = b */ i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */ xj = (r__1 = x[j], dabs(r__1)); if (nounit) { tjjs = a[j + j * a_dim1] * tscal; } else { tjjs = tscal; if (tscal == 1.f) { goto L95; } } tjj = dabs(tjjs); if (tjj > smlnum) { /* abs(A(j,j)) > SMLNUM: */ if (tjj < 1.f) { if (xj > tjj * bignum) { /* Scale x by 1/b(j). */ rec = 1.f / xj; sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } x[j] /= tjjs; xj = (r__1 = x[j], dabs(r__1)); } else if (tjj > 0.f) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if (xj > tjj * bignum) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM */ /* to avoid overflow when dividing by A(j,j). */ rec = tjj * bignum / xj; if (cnorm[j] > 1.f) { /* Scale by 1/CNORM(j) to avoid overflow when */ /* multiplying x(j) times column j. */ rec /= cnorm[j]; } sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } x[j] /= tjjs; xj = (r__1 = x[j], dabs(r__1)); } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0, and compute a solution to A*x = 0. */ i__3 = *n; for (i__ = 1; i__ <= i__3; ++i__) { x[i__] = 0.f; /* L90: */ } x[j] = 1.f; xj = 1.f; *scale = 0.f; xmax = 0.f; } L95: /* Scale x if necessary to avoid overflow when adding a */ /* multiple of column j of A. */ if (xj > 1.f) { rec = 1.f / xj; if (cnorm[j] > (bignum - xmax) * rec) { /* Scale x by 1/(2*abs(x(j))). */ rec *= .5f; sscal_(n, &rec, &x[1], &c__1); *scale *= rec; } } else if (xj * cnorm[j] > bignum - xmax) { /* Scale x by 1/2. */ sscal_(n, &c_b36, &x[1], &c__1); *scale *= .5f; } if (upper) { if (j > 1) { /* Compute the update */ /* x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */ i__3 = j - 1; r__1 = -x[j] * tscal; saxpy_(&i__3, &r__1, &a[j * a_dim1 + 1], &c__1, &x[1], &c__1); i__3 = j - 1; i__ = isamax_(&i__3, &x[1], &c__1); xmax = (r__1 = x[i__], dabs(r__1)); } } else { if (j < *n) { /* Compute the update */ /* x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */ i__3 = *n - j; r__1 = -x[j] * tscal; saxpy_(&i__3, &r__1, &a[j + 1 + j * a_dim1], &c__1, & x[j + 1], &c__1); i__3 = *n - j; i__ = j + isamax_(&i__3, &x[j + 1], &c__1); xmax = (r__1 = x[i__], dabs(r__1)); } } /* L100: */ } } else { /* Solve A' * x = b */ i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Compute x(j) = b(j) - sum A(k,j)*x(k). */ /* k<>j */ xj = (r__1 = x[j], dabs(r__1)); uscal = tscal; rec = 1.f / dmax(xmax,1.f); if (cnorm[j] > (bignum - xj) * rec) { /* If x(j) could overflow, scale x by 1/(2*XMAX). */ rec *= .5f; if (nounit) { tjjs = a[j + j * a_dim1] * tscal; } else { tjjs = tscal; } tjj = dabs(tjjs); if (tjj > 1.f) { /* Divide by A(j,j) when scaling x if A(j,j) > 1. */ /* Computing MIN */ r__1 = 1.f, r__2 = rec * tjj; rec = dmin(r__1,r__2); uscal /= tjjs; } if (rec < 1.f) { sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } sumj = 0.f; if (uscal == 1.f) { /* If the scaling needed for A in the dot product is 1, */ /* call SDOT to perform the dot product. */ if (upper) { i__3 = j - 1; sumj = sdot_(&i__3, &a[j * a_dim1 + 1], &c__1, &x[1], &c__1); } else if (j < *n) { i__3 = *n - j; sumj = sdot_(&i__3, &a[j + 1 + j * a_dim1], &c__1, &x[ j + 1], &c__1); } } else { /* Otherwise, use in-line code for the dot product. */ if (upper) { i__3 = j - 1; for (i__ = 1; i__ <= i__3; ++i__) { sumj += a[i__ + j * a_dim1] * uscal * x[i__]; /* L110: */ } } else if (j < *n) { i__3 = *n; for (i__ = j + 1; i__ <= i__3; ++i__) { sumj += a[i__ + j * a_dim1] * uscal * x[i__]; /* L120: */ } } } if (uscal == tscal) { /* Compute x(j) := ( x(j) - sumj ) / A(j,j) if 1/A(j,j) */ /* was not used to scale the dotproduct. */ x[j] -= sumj; xj = (r__1 = x[j], dabs(r__1)); if (nounit) { tjjs = a[j + j * a_dim1] * tscal; } else { tjjs = tscal; if (tscal == 1.f) { goto L135; } } /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */ tjj = dabs(tjjs); if (tjj > smlnum) { /* abs(A(j,j)) > SMLNUM: */ if (tjj < 1.f) { if (xj > tjj * bignum) { /* Scale X by 1/abs(x(j)). */ rec = 1.f / xj; sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } x[j] /= tjjs; } else if (tjj > 0.f) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if (xj > tjj * bignum) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */ rec = tjj * bignum / xj; sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } x[j] /= tjjs; } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0, and compute a solution to A'*x = 0. */ i__3 = *n; for (i__ = 1; i__ <= i__3; ++i__) { x[i__] = 0.f; /* L130: */ } x[j] = 1.f; *scale = 0.f; xmax = 0.f; } L135: ; } else { /* Compute x(j) := x(j) / A(j,j) - sumj if the dot */ /* product has already been divided by 1/A(j,j). */ x[j] = x[j] / tjjs - sumj; } /* Computing MAX */ r__2 = xmax, r__3 = (r__1 = x[j], dabs(r__1)); xmax = dmax(r__2,r__3); /* L140: */ } } *scale /= tscal; } /* Scale the column norms by 1/TSCAL for return. */ if (tscal != 1.f) { r__1 = 1.f / tscal; sscal_(n, &r__1, &cnorm[1], &c__1); } return 0; /* End of SLATRS */ } /* slatrs_ */
/* Subroutine */ int slatrs_(char *uplo, char *trans, char *diag, char * normin, integer *n, real *a, integer *lda, real *x, real *scale, real *cnorm, integer *info) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; real r__1, r__2, r__3; /* Local variables */ integer i__, j; real xj, rec, tjj; integer jinc; real xbnd; integer imax; real tmax, tjjs; extern real sdot_(integer *, real *, integer *, real *, integer *); real xmax, grow, sumj; extern logical lsame_(char *, char *); extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); real tscal, uscal; integer jlast; extern real sasum_(integer *, real *, integer *); logical upper; extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *); real bignum; extern integer isamax_(integer *, real *, integer *); logical notran; integer jfirst; real smlnum; logical nounit; /* -- LAPACK auxiliary routine (version 3.4.2) -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ /* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ /* September 2012 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; --x; --cnorm; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); notran = lsame_(trans, "N"); nounit = lsame_(diag, "N"); /* Test the input parameters. */ if (! upper && ! lsame_(uplo, "L")) { *info = -1; } else if (! notran && ! lsame_(trans, "T") && ! lsame_(trans, "C")) { *info = -2; } else if (! nounit && ! lsame_(diag, "U")) { *info = -3; } else if (! lsame_(normin, "Y") && ! lsame_(normin, "N")) { *info = -4; } else if (*n < 0) { *info = -5; } else if (*lda < max(1,*n)) { *info = -7; } if (*info != 0) { i__1 = -(*info); xerbla_("SLATRS", &i__1); return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Determine machine dependent parameters to control overflow. */ smlnum = slamch_("Safe minimum") / slamch_("Precision"); bignum = 1.f / smlnum; *scale = 1.f; if (lsame_(normin, "N")) { /* Compute the 1-norm of each column, not including the diagonal. */ if (upper) { /* A is upper triangular. */ i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j - 1; cnorm[j] = sasum_(&i__2, &a[j * a_dim1 + 1], &c__1); /* L10: */ } } else { /* A is lower triangular. */ i__1 = *n - 1; for (j = 1; j <= i__1; ++j) { i__2 = *n - j; cnorm[j] = sasum_(&i__2, &a[j + 1 + j * a_dim1], &c__1); /* L20: */ } cnorm[*n] = 0.f; } } /* Scale the column norms by TSCAL if the maximum element in CNORM is */ /* greater than BIGNUM. */ imax = isamax_(n, &cnorm[1], &c__1); tmax = cnorm[imax]; if (tmax <= bignum) { tscal = 1.f; } else { tscal = 1.f / (smlnum * tmax); sscal_(n, &tscal, &cnorm[1], &c__1); } /* Compute a bound on the computed solution vector to see if the */ /* Level 2 BLAS routine STRSV can be used. */ j = isamax_(n, &x[1], &c__1); xmax = (r__1 = x[j], f2c_abs(r__1)); xbnd = xmax; if (notran) { /* Compute the growth in A * x = b. */ if (upper) { jfirst = *n; jlast = 1; jinc = -1; } else { jfirst = 1; jlast = *n; jinc = 1; } if (tscal != 1.f) { grow = 0.f; goto L50; } if (nounit) { /* A is non-unit triangular. */ /* Compute GROW = 1/G(j) and XBND = 1/M(j). */ /* Initially, G(0) = max{ x(i), i=1,...,n} . */ grow = 1.f / max(xbnd,smlnum); xbnd = grow; i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L50; } /* M(j) = G(j-1) / f2c_abs(A(j,j)) */ tjj = (r__1 = a[j + j * a_dim1], f2c_abs(r__1)); /* Computing MIN */ r__1 = xbnd; r__2 = min(1.f,tjj) * grow; // , expr subst xbnd = min(r__1,r__2); if (tjj + cnorm[j] >= smlnum) { /* G(j) = G(j-1)*( 1 + CNORM(j) / f2c_abs(A(j,j)) ) */ grow *= tjj / (tjj + cnorm[j]); } else { /* G(j) could overflow, set GROW to 0. */ grow = 0.f; } /* L30: */ } grow = xbnd; } else { /* A is unit triangular. */ /* Compute GROW = 1/G(j), where G(0) = max{ x(i), i=1,...,n} . */ /* Computing MIN */ r__1 = 1.f; r__2 = 1.f / max(xbnd,smlnum); // , expr subst grow = min(r__1,r__2); i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L50; } /* G(j) = G(j-1)*( 1 + CNORM(j) ) */ grow *= 1.f / (cnorm[j] + 1.f); /* L40: */ } } L50: ; } else { /* Compute the growth in A**T * x = b. */ if (upper) { jfirst = 1; jlast = *n; jinc = 1; } else { jfirst = *n; jlast = 1; jinc = -1; } if (tscal != 1.f) { grow = 0.f; goto L80; } if (nounit) { /* A is non-unit triangular. */ /* Compute GROW = 1/G(j) and XBND = 1/M(j). */ /* Initially, M(0) = max{ x(i), i=1,...,n} . */ grow = 1.f / max(xbnd,smlnum); xbnd = grow; i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L80; } /* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */ xj = cnorm[j] + 1.f; /* Computing MIN */ r__1 = grow; r__2 = xbnd / xj; // , expr subst grow = min(r__1,r__2); /* M(j) = M(j-1)*( 1 + CNORM(j) ) / f2c_abs(A(j,j)) */ tjj = (r__1 = a[j + j * a_dim1], f2c_abs(r__1)); if (xj > tjj) { xbnd *= tjj / xj; } /* L60: */ } grow = min(grow,xbnd); } else { /* A is unit triangular. */ /* Compute GROW = 1/G(j), where G(0) = max{ x(i), i=1,...,n} . */ /* Computing MIN */ r__1 = 1.f; r__2 = 1.f / max(xbnd,smlnum); // , expr subst grow = min(r__1,r__2); i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L80; } /* G(j) = ( 1 + CNORM(j) )*G(j-1) */ xj = cnorm[j] + 1.f; grow /= xj; /* L70: */ } } L80: ; } if (grow * tscal > smlnum) { /* Use the Level 2 BLAS solve if the reciprocal of the bound on */ /* elements of X is not too small. */ strsv_(uplo, trans, diag, n, &a[a_offset], lda, &x[1], &c__1); } else { /* Use a Level 1 BLAS solve, scaling intermediate results. */ if (xmax > bignum) { /* Scale X so that its components are less than or equal to */ /* BIGNUM in absolute value. */ *scale = bignum / xmax; sscal_(n, scale, &x[1], &c__1); xmax = bignum; } if (notran) { /* Solve A * x = b */ i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */ xj = (r__1 = x[j], f2c_abs(r__1)); if (nounit) { tjjs = a[j + j * a_dim1] * tscal; } else { tjjs = tscal; if (tscal == 1.f) { goto L95; } } tjj = f2c_abs(tjjs); if (tjj > smlnum) { /* f2c_abs(A(j,j)) > SMLNUM: */ if (tjj < 1.f) { if (xj > tjj * bignum) { /* Scale x by 1/b(j). */ rec = 1.f / xj; sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } x[j] /= tjjs; xj = (r__1 = x[j], f2c_abs(r__1)); } else if (tjj > 0.f) { /* 0 < f2c_abs(A(j,j)) <= SMLNUM: */ if (xj > tjj * bignum) { /* Scale x by (1/f2c_abs(x(j)))*f2c_abs(A(j,j))*BIGNUM */ /* to avoid overflow when dividing by A(j,j). */ rec = tjj * bignum / xj; if (cnorm[j] > 1.f) { /* Scale by 1/CNORM(j) to avoid overflow when */ /* multiplying x(j) times column j. */ rec /= cnorm[j]; } sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } x[j] /= tjjs; xj = (r__1 = x[j], f2c_abs(r__1)); } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0, and compute a solution to A*x = 0. */ i__3 = *n; for (i__ = 1; i__ <= i__3; ++i__) { x[i__] = 0.f; /* L90: */ } x[j] = 1.f; xj = 1.f; *scale = 0.f; xmax = 0.f; } L95: /* Scale x if necessary to avoid overflow when adding a */ /* multiple of column j of A. */ if (xj > 1.f) { rec = 1.f / xj; if (cnorm[j] > (bignum - xmax) * rec) { /* Scale x by 1/(2*f2c_abs(x(j))). */ rec *= .5f; sscal_(n, &rec, &x[1], &c__1); *scale *= rec; } } else if (xj * cnorm[j] > bignum - xmax) { /* Scale x by 1/2. */ sscal_(n, &c_b36, &x[1], &c__1); *scale *= .5f; } if (upper) { if (j > 1) { /* Compute the update */ /* x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */ i__3 = j - 1; r__1 = -x[j] * tscal; saxpy_(&i__3, &r__1, &a[j * a_dim1 + 1], &c__1, &x[1], &c__1); i__3 = j - 1; i__ = isamax_(&i__3, &x[1], &c__1); xmax = (r__1 = x[i__], f2c_abs(r__1)); } } else { if (j < *n) { /* Compute the update */ /* x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */ i__3 = *n - j; r__1 = -x[j] * tscal; saxpy_(&i__3, &r__1, &a[j + 1 + j * a_dim1], &c__1, & x[j + 1], &c__1); i__3 = *n - j; i__ = j + isamax_(&i__3, &x[j + 1], &c__1); xmax = (r__1 = x[i__], f2c_abs(r__1)); } } /* L100: */ } } else { /* Solve A**T * x = b */ i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Compute x(j) = b(j) - sum A(k,j)*x(k). */ /* k<>j */ xj = (r__1 = x[j], f2c_abs(r__1)); uscal = tscal; rec = 1.f / max(xmax,1.f); if (cnorm[j] > (bignum - xj) * rec) { /* If x(j) could overflow, scale x by 1/(2*XMAX). */ rec *= .5f; if (nounit) { tjjs = a[j + j * a_dim1] * tscal; } else { tjjs = tscal; } tjj = f2c_abs(tjjs); if (tjj > 1.f) { /* Divide by A(j,j) when scaling x if A(j,j) > 1. */ /* Computing MIN */ r__1 = 1.f; r__2 = rec * tjj; // , expr subst rec = min(r__1,r__2); uscal /= tjjs; } if (rec < 1.f) { sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } sumj = 0.f; if (uscal == 1.f) { /* If the scaling needed for A in the dot product is 1, */ /* call SDOT to perform the dot product. */ if (upper) { i__3 = j - 1; sumj = sdot_(&i__3, &a[j * a_dim1 + 1], &c__1, &x[1], &c__1); } else if (j < *n) { i__3 = *n - j; sumj = sdot_(&i__3, &a[j + 1 + j * a_dim1], &c__1, &x[ j + 1], &c__1); } } else { /* Otherwise, use in-line code for the dot product. */ if (upper) { i__3 = j - 1; for (i__ = 1; i__ <= i__3; ++i__) { sumj += a[i__ + j * a_dim1] * uscal * x[i__]; /* L110: */ } } else if (j < *n) { i__3 = *n; for (i__ = j + 1; i__ <= i__3; ++i__) { sumj += a[i__ + j * a_dim1] * uscal * x[i__]; /* L120: */ } } } if (uscal == tscal) { /* Compute x(j) := ( x(j) - sumj ) / A(j,j) if 1/A(j,j) */ /* was not used to scale the dotproduct. */ x[j] -= sumj; xj = (r__1 = x[j], f2c_abs(r__1)); if (nounit) { tjjs = a[j + j * a_dim1] * tscal; } else { tjjs = tscal; if (tscal == 1.f) { goto L135; } } /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */ tjj = f2c_abs(tjjs); if (tjj > smlnum) { /* f2c_abs(A(j,j)) > SMLNUM: */ if (tjj < 1.f) { if (xj > tjj * bignum) { /* Scale X by 1/f2c_abs(x(j)). */ rec = 1.f / xj; sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } x[j] /= tjjs; } else if (tjj > 0.f) { /* 0 < f2c_abs(A(j,j)) <= SMLNUM: */ if (xj > tjj * bignum) { /* Scale x by (1/f2c_abs(x(j)))*f2c_abs(A(j,j))*BIGNUM. */ rec = tjj * bignum / xj; sscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } x[j] /= tjjs; } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and */ /* scale = 0, and compute a solution to A**T*x = 0. */ i__3 = *n; for (i__ = 1; i__ <= i__3; ++i__) { x[i__] = 0.f; /* L130: */ } x[j] = 1.f; *scale = 0.f; xmax = 0.f; } L135: ; } else { /* Compute x(j) := x(j) / A(j,j) - sumj if the dot */ /* product has already been divided by 1/A(j,j). */ x[j] = x[j] / tjjs - sumj; } /* Computing MAX */ r__2 = xmax; r__3 = (r__1 = x[j], f2c_abs(r__1)); // , expr subst xmax = max(r__2,r__3); /* L140: */ } } *scale /= tscal; } /* Scale the column norms by 1/TSCAL for return. */ if (tscal != 1.f) { r__1 = 1.f / tscal; sscal_(n, &r__1, &cnorm[1], &c__1); } return 0; /* End of SLATRS */ }
/* Subroutine */ int strrfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real *work, integer *iwork, integer *info) { /* -- LAPACK routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University September 30, 1994 Purpose ======= STRRFS provides error bounds and backward error estimates for the solution to a system of linear equations with a triangular coefficient matrix. The solution matrix X must be computed by STRTRS or some other means before entering this routine. STRRFS does not do iterative refinement because doing so cannot improve the backward error. Arguments ========= UPLO (input) CHARACTER*1 = 'U': A is upper triangular; = 'L': A is lower triangular. TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A**T * X = B (Transpose) = 'C': A**H * X = B (Conjugate transpose = Transpose) DIAG (input) CHARACTER*1 = 'N': A is non-unit triangular; = 'U': A is unit triangular. N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrices B and X. NRHS >= 0. A (input) REAL array, dimension (LDA,N) The triangular matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input) REAL array, dimension (LDB,NRHS) The right hand side matrix B. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). X (input) REAL array, dimension (LDX,NRHS) The solution matrix X. LDX (input) INTEGER The leading dimension of the array X. LDX >= max(1,N). FERR (output) REAL array, dimension (NRHS) The estimated forward error bound for each solution vector X(j) (the j-th column of the solution matrix X). If XTRUE is the true solution corresponding to X(j), FERR(j) is an estimated upper bound for the magnitude of the largest element in (X(j) - XTRUE) divided by the magnitude of the largest element in X(j). The estimate is as reliable as the estimate for RCOND, and is almost always a slight overestimate of the true error. BERR (output) REAL array, dimension (NRHS) The componentwise relative backward error of each solution vector X(j) (i.e., the smallest relative change in any element of A or B that makes X(j) an exact solution). WORK (workspace) REAL array, dimension (3*N) IWORK (workspace) INTEGER array, dimension (N) INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== Test the input parameters. Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static real c_b19 = -1.f; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3; real r__1, r__2, r__3; /* Local variables */ static integer kase; static real safe1, safe2; static integer i__, j, k; static real s; extern logical lsame_(char *, char *); static logical upper; extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *), strsv_( char *, char *, char *, integer *, real *, integer *, real *, integer *); static real xk; extern doublereal slamch_(char *); static integer nz; static real safmin; extern /* Subroutine */ int xerbla_(char *, integer *), slacon_( integer *, real *, real *, integer *, real *, integer *); static logical notran; static char transt[1]; static logical nounit; static real lstres, eps; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] #define x_ref(a_1,a_2) x[(a_2)*x_dim1 + a_1] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; x_dim1 = *ldx; x_offset = 1 + x_dim1 * 1; x -= x_offset; --ferr; --berr; --work; --iwork; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); notran = lsame_(trans, "N"); nounit = lsame_(diag, "N"); if (! upper && ! lsame_(uplo, "L")) { *info = -1; } else if (! notran && ! lsame_(trans, "T") && ! lsame_(trans, "C")) { *info = -2; } else if (! nounit && ! lsame_(diag, "U")) { *info = -3; } else if (*n < 0) { *info = -4; } else if (*nrhs < 0) { *info = -5; } else if (*lda < max(1,*n)) { *info = -7; } else if (*ldb < max(1,*n)) { *info = -9; } else if (*ldx < max(1,*n)) { *info = -11; } if (*info != 0) { i__1 = -(*info); xerbla_("STRRFS", &i__1); return 0; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { ferr[j] = 0.f; berr[j] = 0.f; /* L10: */ } return 0; } if (notran) { *(unsigned char *)transt = 'T'; } else { *(unsigned char *)transt = 'N'; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = *n + 1; eps = slamch_("Epsilon"); safmin = slamch_("Safe minimum"); safe1 = nz * safmin; safe2 = safe1 / eps; /* Do for each right hand side */ i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { /* Compute residual R = B - op(A) * X, where op(A) = A or A', depending on TRANS. */ scopy_(n, &x_ref(1, j), &c__1, &work[*n + 1], &c__1); strmv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1); saxpy_(n, &c_b19, &b_ref(1, j), &c__1, &work[*n + 1], &c__1); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th components of the numerator and denominator before dividing. */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[i__] = (r__1 = b_ref(i__, j), dabs(r__1)); /* L20: */ } if (notran) { /* Compute abs(A)*abs(X) + abs(B). */ if (upper) { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (r__1 = x_ref(k, j), dabs(r__1)); i__3 = k; for (i__ = 1; i__ <= i__3; ++i__) { work[i__] += (r__1 = a_ref(i__, k), dabs(r__1)) * xk; /* L30: */ } /* L40: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (r__1 = x_ref(k, j), dabs(r__1)); i__3 = k - 1; for (i__ = 1; i__ <= i__3; ++i__) { work[i__] += (r__1 = a_ref(i__, k), dabs(r__1)) * xk; /* L50: */ } work[k] += xk; /* L60: */ } } } else { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (r__1 = x_ref(k, j), dabs(r__1)); i__3 = *n; for (i__ = k; i__ <= i__3; ++i__) { work[i__] += (r__1 = a_ref(i__, k), dabs(r__1)) * xk; /* L70: */ } /* L80: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (r__1 = x_ref(k, j), dabs(r__1)); i__3 = *n; for (i__ = k + 1; i__ <= i__3; ++i__) { work[i__] += (r__1 = a_ref(i__, k), dabs(r__1)) * xk; /* L90: */ } work[k] += xk; /* L100: */ } } } } else { /* Compute abs(A')*abs(X) + abs(B). */ if (upper) { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = 0.f; i__3 = k; for (i__ = 1; i__ <= i__3; ++i__) { s += (r__1 = a_ref(i__, k), dabs(r__1)) * (r__2 = x_ref(i__, j), dabs(r__2)); /* L110: */ } work[k] += s; /* L120: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = (r__1 = x_ref(k, j), dabs(r__1)); i__3 = k - 1; for (i__ = 1; i__ <= i__3; ++i__) { s += (r__1 = a_ref(i__, k), dabs(r__1)) * (r__2 = x_ref(i__, j), dabs(r__2)); /* L130: */ } work[k] += s; /* L140: */ } } } else { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = 0.f; i__3 = *n; for (i__ = k; i__ <= i__3; ++i__) { s += (r__1 = a_ref(i__, k), dabs(r__1)) * (r__2 = x_ref(i__, j), dabs(r__2)); /* L150: */ } work[k] += s; /* L160: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = (r__1 = x_ref(k, j), dabs(r__1)); i__3 = *n; for (i__ = k + 1; i__ <= i__3; ++i__) { s += (r__1 = a_ref(i__, k), dabs(r__1)) * (r__2 = x_ref(i__, j), dabs(r__2)); /* L170: */ } work[k] += s; /* L180: */ } } } } s = 0.f; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] > safe2) { /* Computing MAX */ r__2 = s, r__3 = (r__1 = work[*n + i__], dabs(r__1)) / work[ i__]; s = dmax(r__2,r__3); } else { /* Computing MAX */ r__2 = s, r__3 = ((r__1 = work[*n + i__], dabs(r__1)) + safe1) / (work[i__] + safe1); s = dmax(r__2,r__3); } /* L190: */ } berr[j] = s; /* Bound error from formula norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use SLACON to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] > safe2) { work[i__] = (r__1 = work[*n + i__], dabs(r__1)) + nz * eps * work[i__]; } else { work[i__] = (r__1 = work[*n + i__], dabs(r__1)) + nz * eps * work[i__] + safe1; } /* L200: */ } kase = 0; L210: slacon_(n, &work[(*n << 1) + 1], &work[*n + 1], &iwork[1], &ferr[j], & kase); if (kase != 0) { if (kase == 1) { /* Multiply by diag(W)*inv(op(A)'). */ strsv_(uplo, transt, diag, n, &a[a_offset], lda, &work[*n + 1] , &c__1); i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[*n + i__] = work[i__] * work[*n + i__]; /* L220: */ } } else { /* Multiply by inv(op(A))*diag(W). */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[*n + i__] = work[i__] * work[*n + i__]; /* L230: */ } strsv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1); } goto L210; } /* Normalize error. */ lstres = 0.f; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { /* Computing MAX */ r__2 = lstres, r__3 = (r__1 = x_ref(i__, j), dabs(r__1)); lstres = dmax(r__2,r__3); /* L240: */ } if (lstres != 0.f) { ferr[j] /= lstres; } /* L250: */ } return 0; /* End of STRRFS */ } /* strrfs_ */
/* Return value: 0 - successful return * > 0 - number of bytes allocated when run out of space */ int scolumn_bmod ( const int jcol, /* in */ const int nseg, /* in */ double *dense, /* in */ double *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in */ int fpanelc, /* in -- first column in the current panel */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { /* * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j] * */ #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; double alpha, beta; #endif /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ double ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; register int lptr, kfnz, isub, irow, i; register int no_zeros, new_next; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode. */ int *xsup, *supno; int *lsub, *xlsub; double *lusup; int *xlusup; int nzlumax; double *tempv1; double zero = 0.0; #ifdef USE_VENDOR_BLAS double one = 1.0; double none = -1.0; #endif int mem_error; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; nzlumax = Glu->nzlumax; jcolp1 = jcol + 1; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = SUPERLU_MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc > fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = SUPERLU_MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; ops[TRSV] += segsze * (segsze - 1); ops[GEMV] += 2 * nrow * segsze; /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; dense[irow] -= ukj*lusup[luptr]; luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ ukj -= ukj1 * lusup[luptr1]; dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; dense[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] ); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; dense[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else strsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else slsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[] as a temporary storage */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; tempv[i] = zero; ++isub; } /* Scatter tempv1[] into SPA dense[] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense[irow] -= tempv1[i]; tempv1[i] = zero; ++isub; } } } /* if jsupno ... */ } /* for each segment... */ /* * Process the supernodal portion of L\U[*,j] */ nextlu = xlusup[jcol]; fsupc = xsup[jsupno]; /* Copy the SPA dense into L\U[*,j] */ new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; while ( new_next > nzlumax ) { if ((mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))) return (mem_error); lusup = Glu->lusup; lsub = Glu->lsub; } for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; ++nextlu; } xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ /* For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * 1) fsupc < fpanelc, then fst_col := fpanelc * 2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = SUPERLU_MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* Distance between the current supernode and the current panel. d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* Points to the beginning of jcol in snode L\U(jsupno) */ ufirst = xlusup[jcol] + d_fsupc; ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); #else strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); #endif alpha = none; beta = one; /* y := beta*y + alpha*A*x */ #ifdef _CRAY SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { lusup[isub] -= tempv[i]; tempv[i] = 0.0; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }
int sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * sp_strsv() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A'*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU. * * x - (input/output) float* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * */ #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif SCformat *Lstore; NCformat *Ustore; float *Lval, *Uval; int incx = 1; int nrow; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, k, iptr, jcol; float *work; flops_t solve_ops; /* Test the input parameters */ *info = 0; if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && !lsame_(trans, "C")) *info = -2; else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); xerbla_("sp_strsv", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( !(work = floatCalloc(L->nrow)) ) ABORT("Malloc fails for work in sp_strsv()."); if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) { SUPERLU_FREE(work); return 0; /* Quick return */ } for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1); solve_ops += 2 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { irow = L_SUB(iptr); ++luptr; x[irow] -= x[fsupc] * Lval[luptr]; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif #else slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); x[irow] -= work[i]; /* Scatter */ work[i] = 0.0; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { irow = U_SUB(i); x[irow] -= x[fsupc] * Uval[i]; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); x[irow] -= x[jcol] * Uval[i]; } } } } /* for k ... */ } } else { /* Form x := inv(A')*x */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 2 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); x[jcol] -= x[irow] * Lval[i]; iptr++; } } if ( nsupc > 1 ) { solve_ops += nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); x[jcol] -= x[irow] * Uval[i]; } } solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } stat->ops[SOLVE] += solve_ops; SUPERLU_FREE(work); return 0; }
/* Subroutine */ int sgglse_(integer *m, integer *n, integer *p, real *a, integer *lda, real *b, integer *ldb, real *c__, real *d__, real *x, real *work, integer *lwork, integer *info) { /* -- LAPACK driver routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University June 30, 1999 Purpose ======= SGGLSE solves the linear equality-constrained least squares (LSE) problem: minimize || c - A*x ||_2 subject to B*x = d where A is an M-by-N matrix, B is a P-by-N matrix, c is a given M-vector, and d is a given P-vector. It is assumed that P <= N <= M+P, and rank(B) = P and rank( ( A ) ) = N. ( ( B ) ) These conditions ensure that the LSE problem has a unique solution, which is obtained using a GRQ factorization of the matrices B and A. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrices A and B. N >= 0. P (input) INTEGER The number of rows of the matrix B. 0 <= P <= N <= M+P. A (input/output) REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). B (input/output) REAL array, dimension (LDB,N) On entry, the P-by-N matrix B. On exit, B is destroyed. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,P). C (input/output) REAL array, dimension (M) On entry, C contains the right hand side vector for the least squares part of the LSE problem. On exit, the residual sum of squares for the solution is given by the sum of squares of elements N-P+1 to M of vector C. D (input/output) REAL array, dimension (P) On entry, D contains the right hand side vector for the constrained equation. On exit, D is destroyed. X (output) REAL array, dimension (N) On exit, X is the solution of the LSE problem. WORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,M+N+P). For optimum performance LWORK >= P+min(M,N)+max(M,N)*NB, where NB is an upper bound for the optimal blocksizes for SGEQRF, SGERQF, SORMQR and SORMRQ. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. ===================================================================== Test the input parameters Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static integer c_n1 = -1; static real c_b29 = -1.f; static real c_b31 = 1.f; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; /* Local variables */ static integer lopt; extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); static integer nb, mn, nr; extern /* Subroutine */ int xerbla_(char *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern /* Subroutine */ int sggrqf_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, real *, integer * , integer *); static integer nb1, nb2, nb3, nb4, lwkopt; static logical lquery; extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *), sormrq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real * , integer *, real *, integer *, integer *); #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; --c__; --d__; --x; --work; /* Function Body */ *info = 0; mn = min(*m,*n); nb1 = ilaenv_(&c__1, "SGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); nb2 = ilaenv_(&c__1, "SGERQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); nb3 = ilaenv_(&c__1, "SORMQR", " ", m, n, p, &c_n1, (ftnlen)6, (ftnlen)1); nb4 = ilaenv_(&c__1, "SORMRQ", " ", m, n, p, &c_n1, (ftnlen)6, (ftnlen)1); /* Computing MAX */ i__1 = max(nb1,nb2), i__1 = max(i__1,nb3); nb = max(i__1,nb4); lwkopt = *p + mn + max(*m,*n) * nb; work[1] = (real) lwkopt; lquery = *lwork == -1; if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*p < 0 || *p > *n || *p < *n - *m) { *info = -3; } else if (*lda < max(1,*m)) { *info = -5; } else if (*ldb < max(1,*p)) { *info = -7; } else /* if(complicated condition) */ { /* Computing MAX */ i__1 = 1, i__2 = *m + *n + *p; if (*lwork < max(i__1,i__2) && ! lquery) { *info = -12; } } if (*info != 0) { i__1 = -(*info); xerbla_("SGGLSE", &i__1); return 0; } else if (lquery) { return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Compute the GRQ factorization of matrices B and A: B*Q' = ( 0 T12 ) P Z'*A*Q' = ( R11 R12 ) N-P N-P P ( 0 R22 ) M+P-N N-P P where T12 and R11 are upper triangular, and Q and Z are orthogonal. */ i__1 = *lwork - *p - mn; sggrqf_(p, m, n, &b[b_offset], ldb, &work[1], &a[a_offset], lda, &work[*p + 1], &work[*p + mn + 1], &i__1, info); lopt = work[*p + mn + 1]; /* Update c = Z'*c = ( c1 ) N-P ( c2 ) M+P-N */ i__1 = max(1,*m); i__2 = *lwork - *p - mn; sormqr_("Left", "Transpose", m, &c__1, &mn, &a[a_offset], lda, &work[*p + 1], &c__[1], &i__1, &work[*p + mn + 1], &i__2, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) work[*p + mn + 1]; lopt = max(i__1,i__2); /* Solve T12*x2 = d for x2 */ strsv_("Upper", "No transpose", "Non unit", p, &b_ref(1, *n - *p + 1), ldb, &d__[1], &c__1); /* Update c1 */ i__1 = *n - *p; sgemv_("No transpose", &i__1, p, &c_b29, &a_ref(1, *n - *p + 1), lda, & d__[1], &c__1, &c_b31, &c__[1], &c__1); /* Sovle R11*x1 = c1 for x1 */ i__1 = *n - *p; strsv_("Upper", "No transpose", "Non unit", &i__1, &a[a_offset], lda, & c__[1], &c__1); /* Put the solutions in X */ i__1 = *n - *p; scopy_(&i__1, &c__[1], &c__1, &x[1], &c__1); scopy_(p, &d__[1], &c__1, &x[*n - *p + 1], &c__1); /* Compute the residual vector: */ if (*m < *n) { nr = *m + *p - *n; i__1 = *n - *m; sgemv_("No transpose", &nr, &i__1, &c_b29, &a_ref(*n - *p + 1, *m + 1) , lda, &d__[nr + 1], &c__1, &c_b31, &c__[*n - *p + 1], &c__1); } else { nr = *p; } strmv_("Upper", "No transpose", "Non unit", &nr, &a_ref(*n - *p + 1, *n - *p + 1), lda, &d__[1], &c__1); saxpy_(&nr, &c_b29, &d__[1], &c__1, &c__[*n - *p + 1], &c__1); /* Backward transformation x = Q'*x */ i__1 = *lwork - *p - mn; sormrq_("Left", "Transpose", n, &c__1, p, &b[b_offset], ldb, &work[1], &x[ 1], n, &work[*p + mn + 1], &i__1, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) work[*p + mn + 1]; work[1] = (real) (*p + mn + max(i__1,i__2)); return 0; /* End of SGGLSE */ } /* sgglse_ */
int psgstrf_column_bmod( const int pnum, /* process number */ const int jcol, /* current column in the panel */ const int fpanelc,/* first column in the panel */ const int nseg, /* number of s-nodes to update jcol */ int *segrep,/* in */ int *repfnz,/* in */ float *dense, /* modified */ float *tempv, /* working array */ pxgstrf_shared_t *pxgstrf_shared, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j]. * * Return value: * ============= * 0 - successful return * > 0 - number of bytes allocated when run out of space * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; float alpha, beta; #endif GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ float ukj, ukj1, ukj2; register int lptr, kfnz, isub, irow, i, no_zeros; register int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jsupno, k, ksub, krep, krep_ind, ksupno; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode.*/ int *xsup, *supno; int *lsub, *xlsub, *xlsub_end; float *lusup; int *xlusup, *xlusup_end; float *tempv1; int mem_error; register float flopcnt; float zero = 0.0; float one = 1.0; float none = -1.0; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; xlusup_end = Glu->xlusup_end; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; #if ( DEBUGlvel>=2 ) if (jcol==BADCOL) printf("(%d) psgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n", pnum, jcol, nseg, krep, jsupno, ksupno); #endif if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = SUPERLU_MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = SUPERLU_MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze; Gstat->procstat[pnum].fcops += flopcnt; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) psgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\ fsupc %d, nsupr %d, nsupc %d\n", pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc); #endif /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; dense[irow] -= ukj*lusup[luptr]; luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ ukj -= ukj1 * lusup[luptr1]; dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; dense[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] ); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; dense[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else strsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else slsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; /* Scatter */ tempv[i] = zero; isub++; } /* Scatter tempv1[] into SPA dense[*] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense[irow] -= tempv1[i]; tempv1[i] = zero; ++isub; } } /* else segsze >= 4 */ } /* if jsupno ... */ } /* for each segment... */ /* ------------------------------------------ Process the supernodal portion of L\U[*,j] ------------------------------------------ */ fsupc = SUPER_FSUPC (jsupno); nsupr = xlsub_end[fsupc] - xlsub[fsupc]; if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, pxgstrf_shared)) ) return mem_error; xlusup[jcol] = nextlu; lusup = Glu->lusup; /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; #ifdef DEBUG if (jcol == -1) printf("(%d) psgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n", pnum, jcol, irow, lusup[nextlu]); #endif ++nextlu; } xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */ #if ( DEBUGlevel>=2 ) if (jcol == -1) { nrow = xlusup_end[jcol] - xlusup[jcol]; print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]], &lusup[xlusup[jcol]]); } #endif /* * For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * (1) fsupc < fpanelc, then fst_col := fpanelc * (2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = SUPERLU_MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* distance between the current supernode and the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; lptr = xlsub[fsupc] + d_fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* points to the beginning of jcol in supernode L\U[*,jsupno] */ ufirst = xlusup[jcol] + d_fsupc; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) psgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n", pnum, jcol, fsupc, nsupr, nsupc, nrow); #endif flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; Gstat->procstat[pnum].fcops += flopcnt; /* ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; */ #ifdef USE_VENDOR_BLAS alpha = none; beta = one; /* y := beta*y + alpha*A*x */ #if ( MACH==CRAY_PVP ) STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { lusup[isub] -= tempv[i]; tempv[i] = 0.0; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }