int f2c_dtrsv(char* uplo, char* trans, char* diag, integer* N, doublereal* A, integer* lda, doublereal* X, integer* incX) { dtrsv_(uplo, trans, diag, N, A, lda, X, incX); return 0; }
void dtrsv(const UPLO Uplo, const TRANSPOSE TransA, const DIAG Diag, const int N, const double *A, const int lda, double *X, const int incX) { dtrsv_(UploChar[Uplo], TransposeChar[TransA], DiagChar[Diag], &N, A, &lda, X, &incX); }
int sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, double *x, int *info) { /* * Purpose * ======= * * sp_dtrsv_dist() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A'*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = D, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NC, Dtype = D, Mtype = TRU. * * x - (input/output) double* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * */ #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3; #endif SCformat *Lstore; NCformat *Ustore; double *Lval, *Uval; int incx = 1, incy = 1; double alpha = 1.0, beta = 1.0; int nrow; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, k, iptr, jcol; double *work; flops_t solve_ops; extern SuperLUStat_t SuperLUStat; /* Test the input parameters */ *info = 0; if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2; else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); xerbla_("sp_dtrsv_dist", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( !(work = doubleCalloc_dist(L->nrow)) ) ABORT("Malloc fails for work in sp_dtrsv_dist()."); if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1); solve_ops += 2 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { irow = L_SUB(iptr); ++luptr; x[irow] -= x[fsupc] * Lval[luptr]; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else dtrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); dgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif /* _CRAY */ #else dlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); dmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); x[irow] -= work[i]; /* Scatter */ work[i] = 0.0; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { irow = U_SUB(i); x[irow] -= x[fsupc] * Uval[i]; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("N", strlen("N")); STRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else dusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); x[irow] -= x[jcol] * Uval[i]; } } } } /* for k ... */ } } else { /* Form x := inv(A')*x */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 2 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); x[jcol] -= x[irow] * Lval[i]; iptr++; } } if ( nsupc > 1 ) { solve_ops += nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); x[jcol] -= x[irow] * Uval[i]; } } solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else dtrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } SuperLUStat.ops[SOLVE] += solve_ops; SUPERLU_FREE(work); return 0; }
void dtrsv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx) { dtrsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); }
/* Subroutine */ int dsygs2_(integer *itype, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * info) { /* -- LAPACK routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University February 29, 1992 Purpose ======= DSYGS2 reduces a real symmetric-definite generalized eigenproblem to standard form. If ITYPE = 1, the problem is A*x = lambda*B*x, and A is overwritten by inv(U')*A*inv(U) or inv(L)*A*inv(L') If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or B*A*x = lambda*x, and A is overwritten by U*A*U` or L'*A*L. B must have been previously factorized as U'*U or L*L' by DPOTRF. Arguments ========= ITYPE (input) INTEGER = 1: compute inv(U')*A*inv(U) or inv(L)*A*inv(L'); = 2 or 3: compute U*A*U' or L'*A*L. UPLO (input) CHARACTER Specifies whether the upper or lower triangular part of the symmetric matrix A is stored, and how B has been factorized. = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) DOUBLE PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading n by n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n by n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the transformed matrix, stored in the same format as A. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input) DOUBLE PRECISION array, dimension (LDB,N) The triangular factor from the Cholesky factorization of B, as returned by DPOTRF. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. ===================================================================== Test the input parameters. Parameter adjustments */ /* Table of constant values */ static doublereal c_b6 = -1.; static integer c__1 = 1; static doublereal c_b27 = 1.; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; doublereal d__1; /* Local variables */ extern /* Subroutine */ int dsyr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); static integer k; extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static logical upper; extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); static doublereal ct; extern /* Subroutine */ int xerbla_(char *, integer *); static doublereal akk, bkk; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); if (*itype < 1 || *itype > 3) { *info = -1; } else if (! upper && ! lsame_(uplo, "L")) { *info = -2; } else if (*n < 0) { *info = -3; } else if (*lda < max(1,*n)) { *info = -5; } else if (*ldb < max(1,*n)) { *info = -7; } if (*info != 0) { i__1 = -(*info); xerbla_("DSYGS2", &i__1); return 0; } if (*itype == 1) { if (upper) { /* Compute inv(U')*A*inv(U) */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the upper triangle of A(k:n,k:n) */ akk = a_ref(k, k); bkk = b_ref(k, k); /* Computing 2nd power */ d__1 = bkk; akk /= d__1 * d__1; a_ref(k, k) = akk; if (k < *n) { i__2 = *n - k; d__1 = 1. / bkk; dscal_(&i__2, &d__1, &a_ref(k, k + 1), lda); ct = akk * -.5; i__2 = *n - k; daxpy_(&i__2, &ct, &b_ref(k, k + 1), ldb, &a_ref(k, k + 1) , lda); i__2 = *n - k; dsyr2_(uplo, &i__2, &c_b6, &a_ref(k, k + 1), lda, &b_ref( k, k + 1), ldb, &a_ref(k + 1, k + 1), lda); i__2 = *n - k; daxpy_(&i__2, &ct, &b_ref(k, k + 1), ldb, &a_ref(k, k + 1) , lda); i__2 = *n - k; dtrsv_(uplo, "Transpose", "Non-unit", &i__2, &b_ref(k + 1, k + 1), ldb, &a_ref(k, k + 1), lda); } /* L10: */ } } else { /* Compute inv(L)*A*inv(L') */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the lower triangle of A(k:n,k:n) */ akk = a_ref(k, k); bkk = b_ref(k, k); /* Computing 2nd power */ d__1 = bkk; akk /= d__1 * d__1; a_ref(k, k) = akk; if (k < *n) { i__2 = *n - k; d__1 = 1. / bkk; dscal_(&i__2, &d__1, &a_ref(k + 1, k), &c__1); ct = akk * -.5; i__2 = *n - k; daxpy_(&i__2, &ct, &b_ref(k + 1, k), &c__1, &a_ref(k + 1, k), &c__1); i__2 = *n - k; dsyr2_(uplo, &i__2, &c_b6, &a_ref(k + 1, k), &c__1, & b_ref(k + 1, k), &c__1, &a_ref(k + 1, k + 1), lda); i__2 = *n - k; daxpy_(&i__2, &ct, &b_ref(k + 1, k), &c__1, &a_ref(k + 1, k), &c__1); i__2 = *n - k; dtrsv_(uplo, "No transpose", "Non-unit", &i__2, &b_ref(k + 1, k + 1), ldb, &a_ref(k + 1, k), &c__1); } /* L20: */ } } } else { if (upper) { /* Compute U*A*U' */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the upper triangle of A(1:k,1:k) */ akk = a_ref(k, k); bkk = b_ref(k, k); i__2 = k - 1; dtrmv_(uplo, "No transpose", "Non-unit", &i__2, &b[b_offset], ldb, &a_ref(1, k), &c__1); ct = akk * .5; i__2 = k - 1; daxpy_(&i__2, &ct, &b_ref(1, k), &c__1, &a_ref(1, k), &c__1); i__2 = k - 1; dsyr2_(uplo, &i__2, &c_b27, &a_ref(1, k), &c__1, &b_ref(1, k), &c__1, &a[a_offset], lda); i__2 = k - 1; daxpy_(&i__2, &ct, &b_ref(1, k), &c__1, &a_ref(1, k), &c__1); i__2 = k - 1; dscal_(&i__2, &bkk, &a_ref(1, k), &c__1); /* Computing 2nd power */ d__1 = bkk; a_ref(k, k) = akk * (d__1 * d__1); /* L30: */ } } else { /* Compute L'*A*L */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the lower triangle of A(1:k,1:k) */ akk = a_ref(k, k); bkk = b_ref(k, k); i__2 = k - 1; dtrmv_(uplo, "Transpose", "Non-unit", &i__2, &b[b_offset], ldb, &a_ref(k, 1), lda); ct = akk * .5; i__2 = k - 1; daxpy_(&i__2, &ct, &b_ref(k, 1), ldb, &a_ref(k, 1), lda); i__2 = k - 1; dsyr2_(uplo, &i__2, &c_b27, &a_ref(k, 1), lda, &b_ref(k, 1), ldb, &a[a_offset], lda); i__2 = k - 1; daxpy_(&i__2, &ct, &b_ref(k, 1), ldb, &a_ref(k, 1), lda); i__2 = k - 1; dscal_(&i__2, &bkk, &a_ref(k, 1), lda); /* Computing 2nd power */ d__1 = bkk; a_ref(k, k) = akk * (d__1 * d__1); /* L40: */ } } } return 0; /* End of DSYGS2 */ } /* dsygs2_ */
/* Subroutine */ int dsygs2_(integer *itype, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * info) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; doublereal d__1; /* Local variables */ integer k; doublereal ct, akk, bkk; extern /* Subroutine */ int dsyr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *); /* -- LAPACK routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DSYGS2 reduces a real symmetric-definite generalized eigenproblem */ /* to standard form. */ /* If ITYPE = 1, the problem is A*x = lambda*B*x, */ /* and A is overwritten by inv(U')*A*inv(U) or inv(L)*A*inv(L') */ /* If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or */ /* B*A*x = lambda*x, and A is overwritten by U*A*U` or L'*A*L. */ /* B must have been previously factorized as U'*U or L*L' by DPOTRF. */ /* Arguments */ /* ========= */ /* ITYPE (input) INTEGER */ /* = 1: compute inv(U')*A*inv(U) or inv(L)*A*inv(L'); */ /* = 2 or 3: compute U*A*U' or L'*A*L. */ /* UPLO (input) CHARACTER*1 */ /* Specifies whether the upper or lower triangular part of the */ /* symmetric matrix A is stored, and how B has been factorized. */ /* = 'U': Upper triangular */ /* = 'L': Lower triangular */ /* N (input) INTEGER */ /* The order of the matrices A and B. N >= 0. */ /* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) */ /* On entry, the symmetric matrix A. If UPLO = 'U', the leading */ /* n by n upper triangular part of A contains the upper */ /* triangular part of the matrix A, and the strictly lower */ /* triangular part of A is not referenced. If UPLO = 'L', the */ /* leading n by n lower triangular part of A contains the lower */ /* triangular part of the matrix A, and the strictly upper */ /* triangular part of A is not referenced. */ /* On exit, if INFO = 0, the transformed matrix, stored in the */ /* same format as A. */ /* LDA (input) INTEGER */ /* The leading dimension of the array A. LDA >= max(1,N). */ /* B (input) DOUBLE PRECISION array, dimension (LDB,N) */ /* The triangular factor from the Cholesky factorization of B, */ /* as returned by DPOTRF. */ /* LDB (input) INTEGER */ /* The leading dimension of the array B. LDB >= max(1,N). */ /* INFO (output) INTEGER */ /* = 0: successful exit. */ /* < 0: if INFO = -i, the i-th argument had an illegal value. */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1; b -= b_offset; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); if (*itype < 1 || *itype > 3) { *info = -1; } else if (! upper && ! lsame_(uplo, "L")) { *info = -2; } else if (*n < 0) { *info = -3; } else if (*lda < max(1,*n)) { *info = -5; } else if (*ldb < max(1,*n)) { *info = -7; } if (*info != 0) { i__1 = -(*info); xerbla_("DSYGS2", &i__1); return 0; } if (*itype == 1) { if (upper) { /* Compute inv(U')*A*inv(U) */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the upper triangle of A(k:n,k:n) */ akk = a[k + k * a_dim1]; bkk = b[k + k * b_dim1]; /* Computing 2nd power */ d__1 = bkk; akk /= d__1 * d__1; a[k + k * a_dim1] = akk; if (k < *n) { i__2 = *n - k; d__1 = 1. / bkk; dscal_(&i__2, &d__1, &a[k + (k + 1) * a_dim1], lda); ct = akk * -.5; i__2 = *n - k; daxpy_(&i__2, &ct, &b[k + (k + 1) * b_dim1], ldb, &a[k + ( k + 1) * a_dim1], lda); i__2 = *n - k; dsyr2_(uplo, &i__2, &c_b6, &a[k + (k + 1) * a_dim1], lda, &b[k + (k + 1) * b_dim1], ldb, &a[k + 1 + (k + 1) * a_dim1], lda); i__2 = *n - k; daxpy_(&i__2, &ct, &b[k + (k + 1) * b_dim1], ldb, &a[k + ( k + 1) * a_dim1], lda); i__2 = *n - k; dtrsv_(uplo, "Transpose", "Non-unit", &i__2, &b[k + 1 + ( k + 1) * b_dim1], ldb, &a[k + (k + 1) * a_dim1], lda); } /* L10: */ } } else { /* Compute inv(L)*A*inv(L') */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the lower triangle of A(k:n,k:n) */ akk = a[k + k * a_dim1]; bkk = b[k + k * b_dim1]; /* Computing 2nd power */ d__1 = bkk; akk /= d__1 * d__1; a[k + k * a_dim1] = akk; if (k < *n) { i__2 = *n - k; d__1 = 1. / bkk; dscal_(&i__2, &d__1, &a[k + 1 + k * a_dim1], &c__1); ct = akk * -.5; i__2 = *n - k; daxpy_(&i__2, &ct, &b[k + 1 + k * b_dim1], &c__1, &a[k + 1 + k * a_dim1], &c__1); i__2 = *n - k; dsyr2_(uplo, &i__2, &c_b6, &a[k + 1 + k * a_dim1], &c__1, &b[k + 1 + k * b_dim1], &c__1, &a[k + 1 + (k + 1) * a_dim1], lda); i__2 = *n - k; daxpy_(&i__2, &ct, &b[k + 1 + k * b_dim1], &c__1, &a[k + 1 + k * a_dim1], &c__1); i__2 = *n - k; dtrsv_(uplo, "No transpose", "Non-unit", &i__2, &b[k + 1 + (k + 1) * b_dim1], ldb, &a[k + 1 + k * a_dim1], &c__1); } /* L20: */ } } } else { if (upper) { /* Compute U*A*U' */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the upper triangle of A(1:k,1:k) */ akk = a[k + k * a_dim1]; bkk = b[k + k * b_dim1]; i__2 = k - 1; dtrmv_(uplo, "No transpose", "Non-unit", &i__2, &b[b_offset], ldb, &a[k * a_dim1 + 1], &c__1); ct = akk * .5; i__2 = k - 1; daxpy_(&i__2, &ct, &b[k * b_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); i__2 = k - 1; dsyr2_(uplo, &i__2, &c_b27, &a[k * a_dim1 + 1], &c__1, &b[k * b_dim1 + 1], &c__1, &a[a_offset], lda); i__2 = k - 1; daxpy_(&i__2, &ct, &b[k * b_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); i__2 = k - 1; dscal_(&i__2, &bkk, &a[k * a_dim1 + 1], &c__1); /* Computing 2nd power */ d__1 = bkk; a[k + k * a_dim1] = akk * (d__1 * d__1); /* L30: */ } } else { /* Compute L'*A*L */ i__1 = *n; for (k = 1; k <= i__1; ++k) { /* Update the lower triangle of A(1:k,1:k) */ akk = a[k + k * a_dim1]; bkk = b[k + k * b_dim1]; i__2 = k - 1; dtrmv_(uplo, "Transpose", "Non-unit", &i__2, &b[b_offset], ldb, &a[k + a_dim1], lda); ct = akk * .5; i__2 = k - 1; daxpy_(&i__2, &ct, &b[k + b_dim1], ldb, &a[k + a_dim1], lda); i__2 = k - 1; dsyr2_(uplo, &i__2, &c_b27, &a[k + a_dim1], lda, &b[k + b_dim1], ldb, &a[a_offset], lda); i__2 = k - 1; daxpy_(&i__2, &ct, &b[k + b_dim1], ldb, &a[k + a_dim1], lda); i__2 = k - 1; dscal_(&i__2, &bkk, &a[k + a_dim1], lda); /* Computing 2nd power */ d__1 = bkk; a[k + k * a_dim1] = akk * (d__1 * d__1); /* L40: */ } } } return 0; /* End of DSYGS2 */ } /* dsygs2_ */
/* * Cholesky-based solution of the * sequence of Feasible Generalized Least-Squares problem * in the context of GWAS: */ int fgls_chol( FGLS_config_t cf ) { int n = cf.n, m = cf.m, p = cf.p, t = cf.t, x_b = cf.x_b, /*y_b = cf.y_b,*/ wXL = cf.wXL, wXR = cf.wXR; /* In-core operands */ double *Phi; double *M; double *ests; double *h2; double *res_sigma; double alpha; double beta; /* Out-of-core operands */ double *Bij; // Auxiliary variables /* Reusable data thanks to constant XL */ double *XL; double *XL_orig; // XL and a copy (XL is overwritten at every iteration of j) double *B_t; // Top part of b ( in inv(S) b ) double *V_tl; // Top-Left part of V /* BLAS / LAPACK constants */ double ZERO = 0.0; double ONE = 1.0; int iONE = 1; /* LAPACK error value */ int info; /* iterators and auxiliar vars */ int ib, i, j, k, l; // size_t int nn = cf.n * cf.n; // size_t size_t size_one_b_record = p + (p*(p+1))/2; // Threading int id; double *tmpBs, *tmpVs; // Buffer with one B and one V per thread double *oneB, *oneV; // Each thread pointer to its B and V if ( cf.y_b != 1 ) { fprintf(stderr, "\n[Warning] y_b not used (set to 1)\n"); cf.y_b = 1; } /* Memory allocation */ // In-core build_SPD_Phi( cf.n, cf.Z, cf.W, cf.Phi ); Phi = cf.Phi; M = ( double * ) fgls_malloc ( (size_t)cf.n * cf.n * sizeof(double) ); ests = cf.ests; h2 = ests; res_sigma = &ests[2*cf.t]; XL_orig = cf.XL; XL = ( double * ) fgls_malloc ( cf.wXL * cf.n * sizeof(double) ); B_t = ( double * ) fgls_malloc ( cf.wXL * sizeof(double) ); V_tl = ( double * ) fgls_malloc ( cf.wXL * cf.wXL * sizeof(double) ); // Temporary storage prior to copying in db_B tmpBs = ( double * ) fgls_malloc ( cf.p * cf.num_threads * sizeof(double) ); tmpVs = ( double * ) fgls_malloc ( cf.p * cf.p * cf.num_threads * sizeof(double) ); /* Files and pointers for out-of-core */ double *XR_comp, *Y_comp, *B_comp; /* Asynchronous IO data structures */ double_buffering db_XR, db_Y, db_B; double_buffering_init( &db_XR, (size_t)cf.n * cf.wXR * cf.x_b * sizeof(double), cf.XR, &cf ); // _fp double_buffering_init( &db_Y, (size_t)cf.n * cf.y_b * sizeof(double), cf.Y, &cf ); double_buffering_init( &db_B, (size_t)size_one_b_record * cf.x_b * cf.y_b * sizeof(double), cf.B, &cf ); #if VAMPIR VT_USER_START("READ_X"); #endif /* Read first block of XR's */ double_buffering_read_XR( &db_XR, IO_BUFF, 0, (size_t)MIN( cf.x_b, cf.m ) - 1 ); double_buffering_swap( &db_XR ); #if VAMPIR VT_USER_END("READ_X"); #endif #if VAMPIR VT_USER_START("READ_Y"); #endif /* Read first Y */ double_buffering_read_Y( &db_Y, IO_BUFF, 0, 0 ); double_buffering_swap( &db_Y ); #if VAMPIR VT_USER_END("READ_Y"); #endif int iter = 0; for ( j = 0; j < t; j++ ) { /* Set the number of threads for the multi-threaded BLAS */ set_multi_threaded_BLAS( cf.num_threads ); #if VAMPIR VT_USER_START("READ_Y"); #endif /* Read next Y */ size_t next_j = (j+1) >= t ? 0 : j+1; double_buffering_read_Y( &db_Y, IO_BUFF, next_j, next_j ); #if VAMPIR VT_USER_END("READ_Y"); #endif #if VAMPIR VT_USER_START("COMP_J"); #endif /* M := sigma * ( h^2 Phi - (1 - h^2) I ) */ memcpy( M, Phi, (size_t)n * n * sizeof(double) ); alpha = res_sigma[j] * h2[j]; beta = res_sigma[j] * (1 - h2[j]); dscal_(&nn, &alpha, M, &iONE); for ( i = 0; i < n; i++ ) M[i*n + i] = M[i*n + i] + beta; /* L * L' = M */ dpotrf_(LOWER, &n, M, &n, &info); if (info != 0) { char err[STR_BUFFER_SIZE]; snprintf(err, STR_BUFFER_SIZE, "dpotrf(M) failed (info: %d)", info); error_msg(err, 1); } /* XL := inv(L) * XL */ memcpy( XL, XL_orig, wXL * n * sizeof(double) ); dtrsm_(LEFT, LOWER, NO_TRANS, NON_UNIT, &n, &wXL, &ONE, M, &n, XL, &n); #if VAMPIR VT_USER_START("WAIT_Y"); #endif // Wait until current Y is available for computation double_buffering_wait( &db_Y, COMP_BUFF ); #if VAMPIR VT_USER_END("WAIT_Y"); #endif /* y := inv(L) * y */ Y_comp = double_buffering_get_comp_buffer( &db_Y ); // Sanity check average( Y_comp, n, 1, cf.threshold, "TRAIT", &cf.Y_fvi->fvi_data[n*NAMELENGTH], NAMELENGTH, 0 ); dtrsv_(LOWER, NO_TRANS, NON_UNIT, &n, M, &n, Y_comp, &iONE); /* B_t := XL' * y */ dgemv_(TRANS, &n, &wXL, &ONE, XL, &n, Y_comp, &iONE, &ZERO, B_t, &iONE); /* V_tl := XL' * XL */ dsyrk_(LOWER, TRANS, &wXL, &n, &ONE, XL, &n, &ZERO, V_tl, &wXL); #if VAMPIR VT_USER_END("COMP_J"); #endif /* Solve for x_b X's at once */ for (ib = 0; ib < m; ib += x_b) { #if VAMPIR VT_USER_START("READ_X"); #endif /* Read next block of XR's */ size_t next_x_from = ((size_t)ib + x_b) >= m ? 0 : (size_t)ib + x_b; size_t next_x_to = ((size_t)ib + x_b) >= m ? MIN( (size_t)x_b, (size_t)m ) - 1 : next_x_from + MIN( (size_t)x_b, (size_t)m - next_x_from ) - 1; double_buffering_read_XR( &db_XR, IO_BUFF, next_x_from, next_x_to ); #if VAMPIR VT_USER_END("READ_X"); #endif #if VAMPIR VT_USER_START("WAIT_X"); #endif /* Wait until current block of XR's is available for computation */ double_buffering_wait( &db_XR, COMP_BUFF ); #if VAMPIR VT_USER_END("WAIT_X"); #endif /* Set the number of threads for the multi-threaded BLAS */ set_multi_threaded_BLAS( cf.num_threads ); #if VAMPIR VT_USER_START("COMP_IB"); #endif /* XR := inv(L) XR */ XR_comp = double_buffering_get_comp_buffer( &db_XR ); // Auxiliar variables int x_inc = MIN(x_b, m - ib); int rhss = wXR * x_inc; // Sanity check average( XR_comp, n, x_inc, cf.threshold, "SNP", &cf.XR_fvi->fvi_data[(n+ib)*NAMELENGTH], NAMELENGTH, 1 ); // Computation dtrsm_(LEFT, LOWER, NO_TRANS, NON_UNIT, &n, &rhss, &ONE, M, &n, XR_comp, &n); #if VAMPIR VT_USER_END("COMP_IB"); #endif #if CHOL_MIX_PARALLELISM /* Set the number of threads for the multi-threaded BLAS to 1. * The innermost loop is parallelized using OPENMP */ set_single_threaded_BLAS(); #endif #if VAMPIR VT_USER_START("COMP_I"); #endif B_comp = double_buffering_get_comp_buffer( &db_B ); #if CHOL_MIX_PARALLELISM #pragma omp parallel for private(Bij, oneB, oneV, i, k, info, id) schedule(static) num_threads(cf.num_threads) #endif for (i = 0; i < x_inc; i++) { id = omp_get_thread_num(); oneB = &tmpBs[ id * p ]; oneV = &tmpVs[ id * p * p ]; Bij = &B_comp[i * size_one_b_record]; // Building B // Copy B_T memcpy(oneB, B_t, wXL * sizeof(double)); // B_B := XR' * y dgemv_("T", &n, &wXR, &ONE, &XR_comp[i * wXR * n], &n, Y_comp, &iONE, &ZERO, &oneB[wXL], &iONE); // Building V // Copy V_TL for( k = 0; k < wXL; k++ ) dcopy_(&wXL, &V_tl[k*wXL], &iONE, &oneV[k*p], &iONE); // V_TL // V_BL := XR' * XL dgemm_("T", "N", &wXR, &wXL, &n, &ONE, &XR_comp[i * wXR * n], &n, XL, &n, &ZERO, &oneV[wXL], &p); // V_BL // V_BR := XR' * XR dsyrk_("L", "T", &wXR, &n, &ONE, &XR_comp[i * wXR * n], &n, &ZERO, &oneV[wXL * p + wXL], &p); // V_BR // B := inv(V) * B dpotrf_(LOWER, &p, oneV, &p, &info); if (info != 0) { for ( k = 0; k < size_one_b_record; k++ ) Bij[k] = 0.0/0.0; //nan("char-sequence"); continue; } dtrsv_(LOWER, NO_TRANS, NON_UNIT, &p, oneV, &p, oneB, &iONE); dtrsv_(LOWER, TRANS, NON_UNIT, &p, oneV, &p, oneB, &iONE); /* V := res_sigma * inv( X' inv(M) X) */ dpotri_(LOWER, &p, oneV, &p, &info); if (info != 0) { char err[STR_BUFFER_SIZE]; snprintf(err, STR_BUFFER_SIZE, "dpotri failed (info: %d)", info); error_msg(err, 1); } // Copy output for ( k = 0; k < p; k++ ) Bij[k] = (float) oneB[k]; for ( k = 0; k < p; k++ ) Bij[p+k] = (float)sqrt(oneV[k*p+k]); int idx = 0; for ( k = 0; k < p-1; k++ ) // Cols of V for ( l = k+1; l < p; l++ ) // Rows of V { Bij[p+p+idx] = (float)oneV[k*p+l]; idx++; } #if 0 printf("Chi square: %.6f\n", ( (oneB[p-1] / Bij[p+p-1]) * (oneB[p-1] / Bij[p+p-1]) ) ); #endif } #if VAMPIR VT_USER_END("COMP_I"); #endif #if VAMPIR VT_USER_START("WAIT_BV"); #endif /* Wait until the previous blocks of B's and V's are written */ if ( iter > 0) double_buffering_wait( &db_B, IO_BUFF ); #if VAMPIR VT_USER_END("WAIT_BV"); #endif /* Write current blocks of B's and V's */ #if VAMPIR VT_USER_START("WRITE_BV"); #endif double_buffering_write_B( &db_B, COMP_BUFF, ib, ib+x_inc - 1, j, j ); #if VAMPIR VT_USER_END("WRITE_BV"); #endif /* Swap buffers */ double_buffering_swap( &db_XR ); double_buffering_swap( &db_B ); iter++; } /* Swap buffers */ double_buffering_swap( &db_Y ); } #if VAMPIR VT_USER_START("WAIT_ALL"); #endif /* Wait for the remaining IO operations issued */ double_buffering_wait( &db_XR, COMP_BUFF ); double_buffering_wait( &db_Y, COMP_BUFF ); double_buffering_wait( &db_B, IO_BUFF ); #if VAMPIR VT_USER_END("WAIT_ALL"); #endif /* Clean-up */ free( M ); free( XL ); free( B_t ); free( V_tl ); free( tmpBs ); free( tmpVs ); double_buffering_destroy( &db_XR ); double_buffering_destroy( &db_Y ); double_buffering_destroy( &db_B ); return 0; }
void dpanel_bmod ( const int m, /* in - number of rows in the matrix */ const int w, /* in */ const int jcol, /* in */ const int nseg, /* in */ double *dense, /* out, of size n by w */ double *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in, of size n by w */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { /* * Purpose * ======= * * Performs numeric block updates (sup-panel) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j] * * Before entering this routine, the original nonzeros in the panel * were already copied into the spa[m,w]. * * Updated/Output parameters- * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned * collectively in the m-by-w vector dense[*]. * */ #ifdef USE_VENDOR_BLAS #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; double alpha, beta; #endif register int k, ksub; int fsupc, nsupc, nsupr, nrow; int krep, krep_ind; double ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; int block_nrow; /* no of rows in a block row */ register int lptr; /* Points to the row subscripts of a supernode */ int kfnz, irow, no_zeros; register int isub, isub1, i; register int jj; /* Index through each column in the panel */ int *xsup, *supno; int *lsub, *xlsub; double *lusup; int *xlusup; int *repfnz_col; /* repfnz[] for a column in the panel */ double *dense_col; /* dense[] for a column in the panel */ double *tempv1; /* Used in 1-D update */ double *TriTmp, *MatvecTmp; /* used in 2-D update */ double zero = 0.0; double one = 1.0; register int ldaTmp; register int r_ind, r_hi; static int first = 1, maxsuper, rowblk, colblk; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; if ( first ) { maxsuper = sp_ienv(3); rowblk = sp_ienv(4); colblk = sp_ienv(5); first = 0; } ldaTmp = maxsuper + rowblk; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in a supernode * nsupr = no of rows in a supernode */ krep = segrep[k--]; fsupc = xsup[supno[krep]]; nsupc = krep - fsupc + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; nrow = nsupr - nsupc; lptr = xlsub[fsupc]; krep_ind = lptr + nsupc - 1; repfnz_col = repfnz; dense_col = dense; if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ TriTmp = tempv; /* Sequence through each column in panel -- triangular solves */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += segsze * (segsze - 1); ops[GEMV] += 2 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; dense_col[irow] -= ukj * lusup[luptr]; ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr += nsupr*(nsupc-1) + nsupc-1; luptr1 = luptr - nsupr; if ( segsze == 2 ) { ukj -= ukj1 * lusup[luptr1]; dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; dense_col[irow] -= (ukj*lusup[luptr] + ukj1*lusup[luptr1]); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; dense_col[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); } } } else { /* segsze >= 4 */ /* Copy U[*,j] segment from dense[*] to TriTmp[*], which holds the result of triangular solves. */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; TriTmp[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #else dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #endif #else dlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); #endif } /* else ... */ } /* for jj ... end tri-solves */ /* Block row updates; push all the way into dense[*] block */ for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); luptr = xlusup[fsupc] + nsupc + r_ind; isub1 = lptr + nsupc + r_ind; repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; /* Sequence through each column in panel -- matrix-vector */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ /* Perform a block update, and scatter the result of matrix-vector to dense[]. */ no_zeros = kfnz - fsupc; luptr1 = luptr + nsupr * no_zeros; MatvecTmp = &TriTmp[maxsuper]; #ifdef USE_VENDOR_BLAS alpha = one; beta = zero; #ifdef _CRAY SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #else dgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #endif #else dmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], TriTmp, MatvecTmp); #endif /* Scatter MatvecTmp[*] into SPA dense[*] temporarily * such that MatvecTmp[*] can be re-used for the * the next blok row update. dense[] will be copied into * global store after the whole panel has been finished. */ isub = isub1; for (i = 0; i < block_nrow; i++) { irow = lsub[isub]; dense_col[irow] -= MatvecTmp[i]; MatvecTmp[i] = zero; ++isub; } } /* for jj ... */ } /* for each block row ... */ /* Scatter the triangular solves into SPA dense[*] */ repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = TriTmp[i]; TriTmp[i] = zero; ++isub; } } /* for jj ... */ } else { /* 1-D block modification */ /* Sequence through each column in the panel */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += segsze * (segsze - 1); ops[GEMV] += 2 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; dense_col[irow] -= ukj * lusup[luptr]; ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { ukj -= ukj1 * lusup[luptr1]; dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; dense_col[irow] -= (ukj*lusup[luptr] + ukj1*lusup[luptr1]); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; dense_col[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); } } } else { /* segsze >= 4 */ /* * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense[]. */ no_zeros = kfnz - fsupc; /* Copy U[*,j] segment from dense[*] to tempv[*]: * The result of triangular solve is in tempv[*]; * The result of matrix vector update is in dense_col[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tempv[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else dlsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; dmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[*] into SPA dense[*] temporarily, such * that tempv[*] can be used for the triangular solve of * the next column of the panel. They will be copied into * ucol[*] after the whole panel has been finished. */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = tempv[i]; tempv[i] = zero; isub++; } /* Scatter the update from tempv1[*] into SPA dense[*] */ /* Start dense rectangular L */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense_col[irow] -= tempv1[i]; tempv1[i] = zero; ++isub; } } /* else segsze>=4 ... */ } /* for each column in the panel... */ } /* else 1-D update ... */ } /* for each updating supernode ... */ }
/*! \brief * * <pre> * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j] * Return value: 0 - successful return * > 0 - number of bytes allocated when run out of space * </pre> */ int dcolumn_bmod ( const int jcol, /* in */ const int nseg, /* in */ double *dense, /* in */ double *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in */ int fpanelc, /* in -- first column in the current panel */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; double alpha, beta; /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ double ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; register int lptr, kfnz, isub, irow, i; register int no_zeros, new_next; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode. */ int *xsup, *supno; int *lsub, *xlsub; double *lusup; int *xlusup; int nzlumax; double *tempv1; double zero = 0.0; double one = 1.0; double none = -1.0; int mem_error; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = (double *) Glu->lusup; xlusup = Glu->xlusup; nzlumax = Glu->nzlumax; jcolp1 = jcol + 1; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = SUPERLU_MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc > fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = SUPERLU_MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; ops[TRSV] += segsze * (segsze - 1); ops[GEMV] += 2 * nrow * segsze; /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; dense[irow] -= ukj*lusup[luptr]; luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ ukj -= ukj1 * lusup[luptr1]; dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; dense[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] ); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; dense[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else dlsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; dmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[] as a temporary storage */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; tempv[i] = zero; ++isub; } /* Scatter tempv1[] into SPA dense[] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense[irow] -= tempv1[i]; tempv1[i] = zero; ++isub; } } } /* if jsupno ... */ } /* for each segment... */ /* * Process the supernodal portion of L\U[*,j] */ nextlu = xlusup[jcol]; fsupc = xsup[jsupno]; /* Copy the SPA dense into L\U[*,j] */ new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; while ( new_next > nzlumax ) { if (mem_error = dLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) return (mem_error); lusup = (double *) Glu->lusup; lsub = Glu->lsub; } for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; ++nextlu; } xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ /* For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * 1) fsupc < fpanelc, then fst_col := fpanelc * 2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = SUPERLU_MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* Distance between the current supernode and the current panel. d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; lptr = xlsub[fsupc] + d_fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* Points to the beginning of jcol in snode L\U(jsupno) */ ufirst = xlusup[jcol] + d_fsupc; ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); #else dtrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); #endif alpha = none; beta = one; /* y := beta*y + alpha*A*x */ #ifdef _CRAY SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else dgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else dlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); dmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { lusup[isub] -= tempv[i]; tempv[i] = 0.0; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }
/* Subroutine */ int dlatrs_(char *uplo, char *trans, char *diag, char * normin, integer *n, doublereal *a, integer *lda, doublereal *x, doublereal *scale, doublereal *cnorm, integer *info) { /* -- LAPACK auxiliary routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University June 30, 1992 Purpose ======= DLATRS solves one of the triangular systems A *x = s*b or A'*x = s*b with scaling to prevent overflow. Here A is an upper or lower triangular matrix, A' denotes the transpose of A, x and b are n-element vectors, and s is a scaling factor, usually less than or equal to 1, chosen so that the components of x will be less than the overflow threshold. If the unscaled problem will not cause overflow, the Level 2 BLAS routine DTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j), then s is set to 0 and a non-trivial solution to A*x = 0 is returned. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the matrix A is upper or lower triangular. = 'U': Upper triangular = 'L': Lower triangular TRANS (input) CHARACTER*1 Specifies the operation applied to A. = 'N': Solve A * x = s*b (No transpose) = 'T': Solve A'* x = s*b (Transpose) = 'C': Solve A'* x = s*b (Conjugate transpose = Transpose) DIAG (input) CHARACTER*1 Specifies whether or not the matrix A is unit triangular. = 'N': Non-unit triangular = 'U': Unit triangular NORMIN (input) CHARACTER*1 Specifies whether CNORM has been set or not. = 'Y': CNORM contains the column norms on entry = 'N': CNORM is not set on entry. On exit, the norms will be computed and stored in CNORM. N (input) INTEGER The order of the matrix A. N >= 0. A (input) DOUBLE PRECISION array, dimension (LDA,N) The triangular matrix A. If UPLO = 'U', the leading n by n upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n by n lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. LDA (input) INTEGER The leading dimension of the array A. LDA >= max (1,N). X (input/output) DOUBLE PRECISION array, dimension (N) On entry, the right hand side b of the triangular system. On exit, X is overwritten by the solution vector x. SCALE (output) DOUBLE PRECISION The scaling factor s for the triangular system A * x = s*b or A'* x = s*b. If SCALE = 0, the matrix A is singular or badly scaled, and the vector x is an exact or approximate solution to A*x = 0. CNORM (input or output) DOUBLE PRECISION array, dimension (N) If NORMIN = 'Y', CNORM is an input argument and CNORM(j) contains the norm of the off-diagonal part of the j-th column of A. If TRANS = 'N', CNORM(j) must be greater than or equal to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) must be greater than or equal to the 1-norm. If NORMIN = 'N', CNORM is an output argument and CNORM(j) returns the 1-norm of the offdiagonal part of the j-th column of A. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -k, the k-th argument had an illegal value Further Details ======= ======= A rough bound on x is computed; if that is less than overflow, DTRSV is called, otherwise, specific code is used which checks for possible overflow or divide-by-zero at every operation. A columnwise scheme is used for solving A*x = b. The basic algorithm if A is lower triangular is x[1:n] := b[1:n] for j = 1, ..., n x(j) := x(j) / A(j,j) x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j] end Define bounds on the components of x after j iterations of the loop: M(j) = bound on x[1:j] G(j) = bound on x[j+1:n] Initially, let M(0) = 0 and G(0) = max{x(i), i=1,...,n}. Then for iteration j+1 we have M(j+1) <= G(j) / | A(j+1,j+1) | G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] | <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | ) where CNORM(j+1) is greater than or equal to the infinity-norm of column j+1 of A, not counting the diagonal. Hence G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | ) 1<=i<=j and |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| ) 1<=i< j Since |x(j)| <= M(j), we use the Level 2 BLAS routine DTRSV if the reciprocal of the largest M(j), j=1,..,n, is larger than max(underflow, 1/overflow). The bound on x(j) is also used to determine when a step in the columnwise method can be performed without fear of overflow. If the computed bound is greater than a large constant, x is scaled to prevent overflow, but if the bound overflows, x is set to 0, x(j) to 1, and scale to 0, and a non-trivial solution to A*x = 0 is found. Similarly, a row-wise scheme is used to solve A'*x = b. The basic algorithm for A upper triangular is for j = 1, ..., n x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j) end We simultaneously compute two bounds G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j M(j) = bound on x(i), 1<=i<=j The initial values are G(0) = 0, M(0) = max{b(i), i=1,..,n}, and we add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1. Then the bound on x(j) is M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) | <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| ) 1<=i<=j and we can safely call DTRSV if 1/M(n) and 1/G(n) are both greater than max(underflow, 1/overflow). ===================================================================== Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static doublereal c_b36 = .5; /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3; doublereal d__1, d__2, d__3; /* Local variables */ static integer jinc; extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); static doublereal xbnd; static integer imax; static doublereal tmax, tjjs, xmax, grow, sumj; static integer i__, j; extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); static doublereal tscal, uscal; extern doublereal dasum_(integer *, doublereal *, integer *); static integer jlast; extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static logical upper; extern /* Subroutine */ int dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamch_(char *); static doublereal xj; extern integer idamax_(integer *, doublereal *, integer *); extern /* Subroutine */ int xerbla_(char *, integer *); static doublereal bignum; static logical notran; static integer jfirst; static doublereal smlnum; static logical nounit; static doublereal rec, tjj; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --cnorm; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); notran = lsame_(trans, "N"); nounit = lsame_(diag, "N"); /* Test the input parameters. */ if (! upper && ! lsame_(uplo, "L")) { *info = -1; } else if (! notran && ! lsame_(trans, "T") && ! lsame_(trans, "C")) { *info = -2; } else if (! nounit && ! lsame_(diag, "U")) { *info = -3; } else if (! lsame_(normin, "Y") && ! lsame_(normin, "N")) { *info = -4; } else if (*n < 0) { *info = -5; } else if (*lda < max(1,*n)) { *info = -7; } if (*info != 0) { i__1 = -(*info); xerbla_("DLATRS", &i__1); return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Determine machine dependent parameters to control overflow. */ smlnum = dlamch_("Safe minimum") / dlamch_("Precision"); bignum = 1. / smlnum; *scale = 1.; if (lsame_(normin, "N")) { /* Compute the 1-norm of each column, not including the diagonal. */ if (upper) { /* A is upper triangular. */ i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j - 1; cnorm[j] = dasum_(&i__2, &a_ref(1, j), &c__1); /* L10: */ } } else { /* A is lower triangular. */ i__1 = *n - 1; for (j = 1; j <= i__1; ++j) { i__2 = *n - j; cnorm[j] = dasum_(&i__2, &a_ref(j + 1, j), &c__1); /* L20: */ } cnorm[*n] = 0.; } } /* Scale the column norms by TSCAL if the maximum element in CNORM is greater than BIGNUM. */ imax = idamax_(n, &cnorm[1], &c__1); tmax = cnorm[imax]; if (tmax <= bignum) { tscal = 1.; } else { tscal = 1. / (smlnum * tmax); dscal_(n, &tscal, &cnorm[1], &c__1); } /* Compute a bound on the computed solution vector to see if the Level 2 BLAS routine DTRSV can be used. */ j = idamax_(n, &x[1], &c__1); xmax = (d__1 = x[j], abs(d__1)); xbnd = xmax; if (notran) { /* Compute the growth in A * x = b. */ if (upper) { jfirst = *n; jlast = 1; jinc = -1; } else { jfirst = 1; jlast = *n; jinc = 1; } if (tscal != 1.) { grow = 0.; goto L50; } if (nounit) { /* A is non-unit triangular. Compute GROW = 1/G(j) and XBND = 1/M(j). Initially, G(0) = max{x(i), i=1,...,n}. */ grow = 1. / max(xbnd,smlnum); xbnd = grow; i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L50; } /* M(j) = G(j-1) / abs(A(j,j)) */ tjj = (d__1 = a_ref(j, j), abs(d__1)); /* Computing MIN */ d__1 = xbnd, d__2 = min(1.,tjj) * grow; xbnd = min(d__1,d__2); if (tjj + cnorm[j] >= smlnum) { /* G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */ grow *= tjj / (tjj + cnorm[j]); } else { /* G(j) could overflow, set GROW to 0. */ grow = 0.; } /* L30: */ } grow = xbnd; } else { /* A is unit triangular. Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. Computing MIN */ d__1 = 1., d__2 = 1. / max(xbnd,smlnum); grow = min(d__1,d__2); i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L50; } /* G(j) = G(j-1)*( 1 + CNORM(j) ) */ grow *= 1. / (cnorm[j] + 1.); /* L40: */ } } L50: ; } else { /* Compute the growth in A' * x = b. */ if (upper) { jfirst = 1; jlast = *n; jinc = 1; } else { jfirst = *n; jlast = 1; jinc = -1; } if (tscal != 1.) { grow = 0.; goto L80; } if (nounit) { /* A is non-unit triangular. Compute GROW = 1/G(j) and XBND = 1/M(j). Initially, M(0) = max{x(i), i=1,...,n}. */ grow = 1. / max(xbnd,smlnum); xbnd = grow; i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L80; } /* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */ xj = cnorm[j] + 1.; /* Computing MIN */ d__1 = grow, d__2 = xbnd / xj; grow = min(d__1,d__2); /* M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */ tjj = (d__1 = a_ref(j, j), abs(d__1)); if (xj > tjj) { xbnd *= tjj / xj; } /* L60: */ } grow = min(grow,xbnd); } else { /* A is unit triangular. Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. Computing MIN */ d__1 = 1., d__2 = 1. / max(xbnd,smlnum); grow = min(d__1,d__2); i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Exit the loop if the growth factor is too small. */ if (grow <= smlnum) { goto L80; } /* G(j) = ( 1 + CNORM(j) )*G(j-1) */ xj = cnorm[j] + 1.; grow /= xj; /* L70: */ } } L80: ; } if (grow * tscal > smlnum) { /* Use the Level 2 BLAS solve if the reciprocal of the bound on elements of X is not too small. */ dtrsv_(uplo, trans, diag, n, &a[a_offset], lda, &x[1], &c__1); } else { /* Use a Level 1 BLAS solve, scaling intermediate results. */ if (xmax > bignum) { /* Scale X so that its components are less than or equal to BIGNUM in absolute value. */ *scale = bignum / xmax; dscal_(n, scale, &x[1], &c__1); xmax = bignum; } if (notran) { /* Solve A * x = b */ i__1 = jlast; i__2 = jinc; for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) { /* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */ xj = (d__1 = x[j], abs(d__1)); if (nounit) { tjjs = a_ref(j, j) * tscal; } else { tjjs = tscal; if (tscal == 1.) { goto L100; } } tjj = abs(tjjs); if (tjj > smlnum) { /* abs(A(j,j)) > SMLNUM: */ if (tjj < 1.) { if (xj > tjj * bignum) { /* Scale x by 1/b(j). */ rec = 1. / xj; dscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } x[j] /= tjjs; xj = (d__1 = x[j], abs(d__1)); } else if (tjj > 0.) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if (xj > tjj * bignum) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM to avoid overflow when dividing by A(j,j). */ rec = tjj * bignum / xj; if (cnorm[j] > 1.) { /* Scale by 1/CNORM(j) to avoid overflow when multiplying x(j) times column j. */ rec /= cnorm[j]; } dscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } x[j] /= tjjs; xj = (d__1 = x[j], abs(d__1)); } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and scale = 0, and compute a solution to A*x = 0. */ i__3 = *n; for (i__ = 1; i__ <= i__3; ++i__) { x[i__] = 0.; /* L90: */ } x[j] = 1.; xj = 1.; *scale = 0.; xmax = 0.; } L100: /* Scale x if necessary to avoid overflow when adding a multiple of column j of A. */ if (xj > 1.) { rec = 1. / xj; if (cnorm[j] > (bignum - xmax) * rec) { /* Scale x by 1/(2*abs(x(j))). */ rec *= .5; dscal_(n, &rec, &x[1], &c__1); *scale *= rec; } } else if (xj * cnorm[j] > bignum - xmax) { /* Scale x by 1/2. */ dscal_(n, &c_b36, &x[1], &c__1); *scale *= .5; } if (upper) { if (j > 1) { /* Compute the update x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */ i__3 = j - 1; d__1 = -x[j] * tscal; daxpy_(&i__3, &d__1, &a_ref(1, j), &c__1, &x[1], & c__1); i__3 = j - 1; i__ = idamax_(&i__3, &x[1], &c__1); xmax = (d__1 = x[i__], abs(d__1)); } } else { if (j < *n) { /* Compute the update x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */ i__3 = *n - j; d__1 = -x[j] * tscal; daxpy_(&i__3, &d__1, &a_ref(j + 1, j), &c__1, &x[j + 1], &c__1); i__3 = *n - j; i__ = j + idamax_(&i__3, &x[j + 1], &c__1); xmax = (d__1 = x[i__], abs(d__1)); } } /* L110: */ } } else { /* Solve A' * x = b */ i__2 = jlast; i__1 = jinc; for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) { /* Compute x(j) = b(j) - sum A(k,j)*x(k). k<>j */ xj = (d__1 = x[j], abs(d__1)); uscal = tscal; rec = 1. / max(xmax,1.); if (cnorm[j] > (bignum - xj) * rec) { /* If x(j) could overflow, scale x by 1/(2*XMAX). */ rec *= .5; if (nounit) { tjjs = a_ref(j, j) * tscal; } else { tjjs = tscal; } tjj = abs(tjjs); if (tjj > 1.) { /* Divide by A(j,j) when scaling x if A(j,j) > 1. Computing MIN */ d__1 = 1., d__2 = rec * tjj; rec = min(d__1,d__2); uscal /= tjjs; } if (rec < 1.) { dscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } sumj = 0.; if (uscal == 1.) { /* If the scaling needed for A in the dot product is 1, call DDOT to perform the dot product. */ if (upper) { i__3 = j - 1; sumj = ddot_(&i__3, &a_ref(1, j), &c__1, &x[1], &c__1) ; } else if (j < *n) { i__3 = *n - j; sumj = ddot_(&i__3, &a_ref(j + 1, j), &c__1, &x[j + 1] , &c__1); } } else { /* Otherwise, use in-line code for the dot product. */ if (upper) { i__3 = j - 1; for (i__ = 1; i__ <= i__3; ++i__) { sumj += a_ref(i__, j) * uscal * x[i__]; /* L120: */ } } else if (j < *n) { i__3 = *n; for (i__ = j + 1; i__ <= i__3; ++i__) { sumj += a_ref(i__, j) * uscal * x[i__]; /* L130: */ } } } if (uscal == tscal) { /* Compute x(j) := ( x(j) - sumj ) / A(j,j) if 1/A(j,j) was not used to scale the dotproduct. */ x[j] -= sumj; xj = (d__1 = x[j], abs(d__1)); if (nounit) { tjjs = a_ref(j, j) * tscal; } else { tjjs = tscal; if (tscal == 1.) { goto L150; } } /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */ tjj = abs(tjjs); if (tjj > smlnum) { /* abs(A(j,j)) > SMLNUM: */ if (tjj < 1.) { if (xj > tjj * bignum) { /* Scale X by 1/abs(x(j)). */ rec = 1. / xj; dscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } } x[j] /= tjjs; } else if (tjj > 0.) { /* 0 < abs(A(j,j)) <= SMLNUM: */ if (xj > tjj * bignum) { /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */ rec = tjj * bignum / xj; dscal_(n, &rec, &x[1], &c__1); *scale *= rec; xmax *= rec; } x[j] /= tjjs; } else { /* A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and scale = 0, and compute a solution to A'*x = 0. */ i__3 = *n; for (i__ = 1; i__ <= i__3; ++i__) { x[i__] = 0.; /* L140: */ } x[j] = 1.; *scale = 0.; xmax = 0.; } L150: ; } else { /* Compute x(j) := x(j) / A(j,j) - sumj if the dot product has already been divided by 1/A(j,j). */ x[j] = x[j] / tjjs - sumj; } /* Computing MAX */ d__2 = xmax, d__3 = (d__1 = x[j], abs(d__1)); xmax = max(d__2,d__3); /* L160: */ } } *scale /= tscal; } /* Scale the column norms by 1/TSCAL for return. */ if (tscal != 1.) { d__1 = 1. / tscal; dscal_(n, &d__1, &cnorm[1], &c__1); } return 0; /* End of DLATRS */ } /* dlatrs_ */
void dtrpcg(int n, double *A, double *g, double delta, double *L, double tol, double stol, double *w, int *iters, int *info) { /* c ********* c c Subroutine dtrpcg c c Given a dense symmetric positive semidefinite matrix A, this c subroutine uses a preconditioned conjugate gradient method to find c an approximate minimizer of the trust region subproblem c c min { q(s) : || L'*s || <= delta }. c c where q is the quadratic c c q(s) = 0.5*s'*A*s + g'*s, c c This subroutine generates the conjugate gradient iterates for c the equivalent problem c c min { Q(w) : || w || <= delta }. c c where Q is the quadratic defined by c c Q(w) = q(s), w = L'*s. c c Termination occurs if the conjugate gradient iterates leave c the trust region, a negative curvature direction is generated, c or one of the following two convergence tests is satisfied. c c Convergence in the original variables: c c || grad q(s) || <= tol c c Convergence in the scaled variables: c c || grad Q(w) || <= stol c c Note that if w = L'*s, then L*grad Q(w) = grad q(s). c c parameters: c c n is an integer variable. c On entry n is the number of variables. c On exit n is unchanged. c c A is a double precision array of dimension n*n. c On entry A specifies the matrix A. c On exit A is unchanged. c c g is a double precision array of dimension n. c On entry g must contain the vector g. c On exit g is unchanged. c c delta is a double precision variable. c On entry delta is the trust region size. c On exit delta is unchanged. c c L is a double precision array of dimension n*n. c On entry L need not to be specified. c On exit the lower triangular part of L contains the matrix L. c c tol is a double precision variable. c On entry tol specifies the convergence test c in the un-scaled variables. c On exit tol is unchanged c c stol is a double precision variable. c On entry stol specifies the convergence test c in the scaled variables. c On exit stol is unchanged c c w is a double precision array of dimension n. c On entry w need not be specified. c On exit w contains the final conjugate gradient iterate. c c iters is an integer variable. c On entry iters need not be specified. c On exit iters is set to the number of conjugate c gradient iterations. c c info is an integer variable. c On entry info need not be specified. c On exit info is set as follows: c c info = 1 Convergence in the original variables. c || grad q(s) || <= tol c c info = 2 Convergence in the scaled variables. c || grad Q(w) || <= stol c c info = 3 Negative curvature direction generated. c In this case || w || = delta and a direction c c of negative curvature w can be recovered by c solving L'*w = p. c c info = 4 Conjugate gradient iterates exit the c trust region. In this case || w || = delta. c c info = 5 Failure to converge within itermax(n) iterations. c c ********** */ int i, inc = 1; double one = 1, zero = 0, alpha, malpha, beta, ptq, rho; double *p, *q, *t, *r, *z, sigma, rtr, rnorm, rnorm0, tnorm; p = (double *) xmalloc(sizeof(double)*n); q = (double *) xmalloc(sizeof(double)*n); t = (double *) xmalloc(sizeof(double)*n); r = (double *) xmalloc(sizeof(double)*n); z = (double *) xmalloc(sizeof(double)*n); /* Initialize the iterate w and the residual r. Initialize the residual t of grad q to -g. Initialize the residual r of grad Q by solving L*r = -g. Note that t = L*r. */ for (i=0;i<n;i++) { w[i] = 0; r[i] = t[i] = -g[i]; } dtrsv_("L", "N", "N", &n, L, &n, r, &inc); /* Initialize the direction p. */ memcpy(p, r, sizeof(double)*n); /* Initialize rho and the norms of r and t. */ rho = ddot_(&n, r, &inc, r, &inc); rnorm0 = sqrt(rho); /* Exit if g = 0. */ *iters = 0; if (rnorm0 <= stol) { *info = 1; goto return0; } for (*iters=1;*iters<=n;*iters++) { /* Compute z by solving L'*z = p. */ memcpy(z, p, sizeof(double)*n); dtrsv_("L", "T", "N", &n, L, &n, z, &inc); /* Compute q by solving L*q = A*z and save L*q for use in updating the residual t. */ dsymv_("U", &n, &one, A, &n, z, &inc, &zero, q, &inc); memcpy(z, q, sizeof(double)*n); dtrsv_("L", "N", "N", &n, L, &n, q, &inc); /* Compute alpha and determine sigma such that the trust region constraint || w + sigma*p || = delta is satisfied. */ ptq = ddot_(&n, p, &inc, q, &inc); if (ptq > 0) alpha = rho/ptq; else alpha = 0; dtrqsol(n, w, p, delta, &sigma); /* Exit if there is negative curvature or if the iterates exit the trust region. */ if (ptq <= 0 || alpha >= sigma) { daxpy_(&n, &sigma, p, &inc, w, &inc); if (ptq <= 0) *info = 3; else *info = 4; goto return0; } /* Update w and the residuals r and t. Note that t = L*r. */ malpha = -alpha; daxpy_(&n, &alpha, p, &inc, w, &inc); daxpy_(&n, &malpha, q, &inc, r, &inc); daxpy_(&n, &malpha, z, &inc, t,&inc); /* Exit if the residual convergence test is satisfied. */ rtr = ddot_(&n, r, &inc, r, &inc); rnorm = sqrt(rtr); tnorm = sqrt(ddot_(&n, t, &inc, t, &inc)); if (tnorm <= tol) { *info = 1; goto return0; } if (rnorm <= stol) { *info = 2; goto return0; } /* Compute p = r + beta*p and update rho. */ beta = rtr/rho; dscal_(&n, &beta, p, &inc); daxpy_(&n, &one, r, &inc, p, &inc); rho = rtr; } /* iters > itermax = n */ *info = 5; return0: free(p); free(q); free(r); free(t); free(z); }
/* Subroutine */ int dggglm_(integer *n, integer *m, integer *p, doublereal * a, integer *lda, doublereal *b, integer *ldb, doublereal *d__, doublereal *x, doublereal *y, doublereal *work, integer *lwork, integer *info) { /* -- LAPACK driver routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University June 30, 1999 Purpose ======= DGGGLM solves a general Gauss-Markov linear model (GLM) problem: minimize || y ||_2 subject to d = A*x + B*y x where A is an N-by-M matrix, B is an N-by-P matrix, and d is a given N-vector. It is assumed that M <= N <= M+P, and rank(A) = M and rank( A B ) = N. Under these assumptions, the constrained equation is always consistent, and there is a unique solution x and a minimal 2-norm solution y, which is obtained using a generalized QR factorization of A and B. In particular, if matrix B is square nonsingular, then the problem GLM is equivalent to the following weighted linear least squares problem minimize || inv(B)*(d-A*x) ||_2 x where inv(B) denotes the inverse of B. Arguments ========= N (input) INTEGER The number of rows of the matrices A and B. N >= 0. M (input) INTEGER The number of columns of the matrix A. 0 <= M <= N. P (input) INTEGER The number of columns of the matrix B. P >= N-M. A (input/output) DOUBLE PRECISION array, dimension (LDA,M) On entry, the N-by-M matrix A. On exit, A is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) DOUBLE PRECISION array, dimension (LDB,P) On entry, the N-by-P matrix B. On exit, B is destroyed. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). D (input/output) DOUBLE PRECISION array, dimension (N) On entry, D is the left hand side of the GLM equation. On exit, D is destroyed. X (output) DOUBLE PRECISION array, dimension (M) Y (output) DOUBLE PRECISION array, dimension (P) On exit, X and Y are the solutions of the GLM problem. WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,N+M+P). For optimum performance, LWORK >= M+min(N,P)+max(N,P)*NB, where NB is an upper bound for the optimal blocksizes for DGEQRF, SGERQF, DORMQR and SORMRQ. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. =================================================================== Test the input parameters Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static integer c_n1 = -1; static doublereal c_b32 = -1.; static doublereal c_b34 = 1.; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4; /* Local variables */ static integer lopt, i__; extern /* Subroutine */ int dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); static integer nb, np; extern /* Subroutine */ int dggqrf_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), xerbla_(char *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); static integer nb1, nb2, nb3, nb4; extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormrq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); static integer lwkopt; static logical lquery; #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; --d__; --x; --y; --work; /* Function Body */ *info = 0; np = min(*n,*p); nb1 = ilaenv_(&c__1, "DGEQRF", " ", n, m, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); nb2 = ilaenv_(&c__1, "DGERQF", " ", n, m, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); nb3 = ilaenv_(&c__1, "DORMQR", " ", n, m, p, &c_n1, (ftnlen)6, (ftnlen)1); nb4 = ilaenv_(&c__1, "DORMRQ", " ", n, m, p, &c_n1, (ftnlen)6, (ftnlen)1); /* Computing MAX */ i__1 = max(nb1,nb2), i__1 = max(i__1,nb3); nb = max(i__1,nb4); lwkopt = *m + np + max(*n,*p) * nb; work[1] = (doublereal) lwkopt; lquery = *lwork == -1; if (*n < 0) { *info = -1; } else if (*m < 0 || *m > *n) { *info = -2; } else if (*p < 0 || *p < *n - *m) { *info = -3; } else if (*lda < max(1,*n)) { *info = -5; } else if (*ldb < max(1,*n)) { *info = -7; } else /* if(complicated condition) */ { /* Computing MAX */ i__1 = 1, i__2 = *n + *m + *p; if (*lwork < max(i__1,i__2) && ! lquery) { *info = -12; } } if (*info != 0) { i__1 = -(*info); xerbla_("DGGGLM", &i__1); return 0; } else if (lquery) { return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Compute the GQR factorization of matrices A and B: Q'*A = ( R11 ) M, Q'*B*Z' = ( T11 T12 ) M ( 0 ) N-M ( 0 T22 ) N-M M M+P-N N-M where R11 and T22 are upper triangular, and Q and Z are orthogonal. */ i__1 = *lwork - *m - np; dggqrf_(n, m, p, &a[a_offset], lda, &work[1], &b[b_offset], ldb, &work[*m + 1], &work[*m + np + 1], &i__1, info); lopt = (integer) work[*m + np + 1]; /* Update left-hand-side vector d = Q'*d = ( d1 ) M ( d2 ) N-M */ i__1 = max(1,*n); i__2 = *lwork - *m - np; dormqr_("Left", "Transpose", n, &c__1, m, &a[a_offset], lda, &work[1], & d__[1], &i__1, &work[*m + np + 1], &i__2, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) work[*m + np + 1]; lopt = max(i__1,i__2); /* Solve T22*y2 = d2 for y2 */ i__1 = *n - *m; dtrsv_("Upper", "No transpose", "Non unit", &i__1, &b_ref(*m + 1, *m + *p - *n + 1), ldb, &d__[*m + 1], &c__1); i__1 = *n - *m; dcopy_(&i__1, &d__[*m + 1], &c__1, &y[*m + *p - *n + 1], &c__1); /* Set y1 = 0 */ i__1 = *m + *p - *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.; /* L10: */ } /* Update d1 = d1 - T12*y2 */ i__1 = *n - *m; dgemv_("No transpose", m, &i__1, &c_b32, &b_ref(1, *m + *p - *n + 1), ldb, &y[*m + *p - *n + 1], &c__1, &c_b34, &d__[1], &c__1); /* Solve triangular system: R11*x = d1 */ dtrsv_("Upper", "No Transpose", "Non unit", m, &a[a_offset], lda, &d__[1], &c__1); /* Copy D to X */ dcopy_(m, &d__[1], &c__1, &x[1], &c__1); /* Backward transformation y = Z'*y Computing MAX */ i__1 = 1, i__2 = *n - *p + 1; i__3 = max(1,*p); i__4 = *lwork - *m - np; dormrq_("Left", "Transpose", p, &c__1, &np, &b_ref(max(i__1,i__2), 1), ldb, &work[*m + 1], &y[1], &i__3, &work[*m + np + 1], &i__4, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) work[*m + np + 1]; work[1] = (doublereal) (*m + np + max(i__1,i__2)); return 0; /* End of DGGGLM */ } /* dggglm_ */
/* Subroutine */ int dtrrfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer * ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3; doublereal d__1, d__2, d__3; /* Local variables */ integer i__, j, k; doublereal s, xk; integer nz; doublereal eps; integer kase; doublereal safe1, safe2; extern logical lsame_(char *, char *); integer isave[3]; extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *); logical notran; char transt[1]; logical nounit; doublereal lstres; /* -- LAPACK computational routine (version 3.4.0) -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ /* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ /* November 2011 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. Local Arrays .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1; b -= b_offset; x_dim1 = *ldx; x_offset = 1 + x_dim1; x -= x_offset; --ferr; --berr; --work; --iwork; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); notran = lsame_(trans, "N"); nounit = lsame_(diag, "N"); if (! upper && ! lsame_(uplo, "L")) { *info = -1; } else if (! notran && ! lsame_(trans, "T") && ! lsame_(trans, "C")) { *info = -2; } else if (! nounit && ! lsame_(diag, "U")) { *info = -3; } else if (*n < 0) { *info = -4; } else if (*nrhs < 0) { *info = -5; } else if (*lda < max(1,*n)) { *info = -7; } else if (*ldb < max(1,*n)) { *info = -9; } else if (*ldx < max(1,*n)) { *info = -11; } if (*info != 0) { i__1 = -(*info); xerbla_("DTRRFS", &i__1); return 0; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { ferr[j] = 0.; berr[j] = 0.; /* L10: */ } return 0; } if (notran) { *(unsigned char *)transt = 'T'; } else { *(unsigned char *)transt = 'N'; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = *n + 1; eps = dlamch_("Epsilon"); safmin = dlamch_("Safe minimum"); safe1 = nz * safmin; safe2 = safe1 / eps; /* Do for each right hand side */ i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { /* Compute residual R = B - op(A) * X, */ /* where op(A) = A or A**T, depending on TRANS. */ dcopy_(n, &x[j * x_dim1 + 1], &c__1, &work[*n + 1], &c__1); dtrmv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1); daxpy_(n, &c_b19, &b[j * b_dim1 + 1], &c__1, &work[*n + 1], &c__1); /* Compute componentwise relative backward error from formula */ /* max(i) ( f2c_abs(R(i)) / ( f2c_abs(op(A))*f2c_abs(X) + f2c_abs(B) )(i) ) */ /* where f2c_abs(Z) is the componentwise absolute value of the matrix */ /* or vector Z. If the i-th component of the denominator is less */ /* than SAFE2, then SAFE1 is added to the i-th components of the */ /* numerator and denominator before dividing. */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[i__] = (d__1 = b[i__ + j * b_dim1], f2c_abs(d__1)); /* L20: */ } if (notran) { /* Compute f2c_abs(A)*f2c_abs(X) + f2c_abs(B). */ if (upper) { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (d__1 = x[k + j * x_dim1], f2c_abs(d__1)); i__3 = k; for (i__ = 1; i__ <= i__3; ++i__) { work[i__] += (d__1 = a[i__ + k * a_dim1], f2c_abs( d__1)) * xk; /* L30: */ } /* L40: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (d__1 = x[k + j * x_dim1], f2c_abs(d__1)); i__3 = k - 1; for (i__ = 1; i__ <= i__3; ++i__) { work[i__] += (d__1 = a[i__ + k * a_dim1], f2c_abs( d__1)) * xk; /* L50: */ } work[k] += xk; /* L60: */ } } } else { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (d__1 = x[k + j * x_dim1], f2c_abs(d__1)); i__3 = *n; for (i__ = k; i__ <= i__3; ++i__) { work[i__] += (d__1 = a[i__ + k * a_dim1], f2c_abs( d__1)) * xk; /* L70: */ } /* L80: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (d__1 = x[k + j * x_dim1], f2c_abs(d__1)); i__3 = *n; for (i__ = k + 1; i__ <= i__3; ++i__) { work[i__] += (d__1 = a[i__ + k * a_dim1], f2c_abs( d__1)) * xk; /* L90: */ } work[k] += xk; /* L100: */ } } } } else { /* Compute f2c_abs(A**T)*f2c_abs(X) + f2c_abs(B). */ if (upper) { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = 0.; i__3 = k; for (i__ = 1; i__ <= i__3; ++i__) { s += (d__1 = a[i__ + k * a_dim1], f2c_abs(d__1)) * ( d__2 = x[i__ + j * x_dim1], f2c_abs(d__2)); /* L110: */ } work[k] += s; /* L120: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = (d__1 = x[k + j * x_dim1], f2c_abs(d__1)); i__3 = k - 1; for (i__ = 1; i__ <= i__3; ++i__) { s += (d__1 = a[i__ + k * a_dim1], f2c_abs(d__1)) * ( d__2 = x[i__ + j * x_dim1], f2c_abs(d__2)); /* L130: */ } work[k] += s; /* L140: */ } } } else { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = 0.; i__3 = *n; for (i__ = k; i__ <= i__3; ++i__) { s += (d__1 = a[i__ + k * a_dim1], f2c_abs(d__1)) * ( d__2 = x[i__ + j * x_dim1], f2c_abs(d__2)); /* L150: */ } work[k] += s; /* L160: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = (d__1 = x[k + j * x_dim1], f2c_abs(d__1)); i__3 = *n; for (i__ = k + 1; i__ <= i__3; ++i__) { s += (d__1 = a[i__ + k * a_dim1], f2c_abs(d__1)) * ( d__2 = x[i__ + j * x_dim1], f2c_abs(d__2)); /* L170: */ } work[k] += s; /* L180: */ } } } } s = 0.; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] > safe2) { /* Computing MAX */ d__2 = s; d__3 = (d__1 = work[*n + i__], f2c_abs(d__1)) / work[ i__]; // , expr subst s = max(d__2,d__3); } else { /* Computing MAX */ d__2 = s; d__3 = ((d__1 = work[*n + i__], f2c_abs(d__1)) + safe1) / (work[i__] + safe1); // , expr subst s = max(d__2,d__3); } /* L190: */ } berr[j] = s; /* Bound error from formula */ /* norm(X - XTRUE) / norm(X) .le. FERR = */ /* norm( f2c_abs(inv(op(A)))* */ /* ( f2c_abs(R) + NZ*EPS*( f2c_abs(op(A))*f2c_abs(X)+f2c_abs(B) ))) / norm(X) */ /* where */ /* norm(Z) is the magnitude of the largest component of Z */ /* inv(op(A)) is the inverse of op(A) */ /* f2c_abs(Z) is the componentwise absolute value of the matrix or */ /* vector Z */ /* NZ is the maximum number of nonzeros in any row of A, plus 1 */ /* EPS is machine epsilon */ /* The i-th component of f2c_abs(R)+NZ*EPS*(f2c_abs(op(A))*f2c_abs(X)+f2c_abs(B)) */ /* is incremented by SAFE1 if the i-th component of */ /* f2c_abs(op(A))*f2c_abs(X) + f2c_abs(B) is less than SAFE2. */ /* Use DLACN2 to estimate the infinity-norm of the matrix */ /* inv(op(A)) * diag(W), */ /* where W = f2c_abs(R) + NZ*EPS*( f2c_abs(op(A))*f2c_abs(X)+f2c_abs(B) ))) */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] > safe2) { work[i__] = (d__1 = work[*n + i__], f2c_abs(d__1)) + nz * eps * work[i__]; } else { work[i__] = (d__1 = work[*n + i__], f2c_abs(d__1)) + nz * eps * work[i__] + safe1; } /* L200: */ } kase = 0; L210: dlacn2_(n, &work[(*n << 1) + 1], &work[*n + 1], &iwork[1], &ferr[j], & kase, isave); if (kase != 0) { if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T). */ dtrsv_(uplo, transt, diag, n, &a[a_offset], lda, &work[*n + 1] , &c__1); i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[*n + i__] = work[i__] * work[*n + i__]; /* L220: */ } } else { /* Multiply by inv(op(A))*diag(W). */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[*n + i__] = work[i__] * work[*n + i__]; /* L230: */ } dtrsv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1); } goto L210; } /* Normalize error. */ lstres = 0.; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { /* Computing MAX */ d__2 = lstres; d__3 = (d__1 = x[i__ + j * x_dim1], f2c_abs(d__1)); // , expr subst lstres = max(d__2,d__3); /* L240: */ } if (lstres != 0.) { ferr[j] /= lstres; } /* L250: */ } return 0; /* End of DTRRFS */ }
int pdgstrf_column_bmod( const int pnum, /* process number */ const int jcol, /* current column in the panel */ const int fpanelc,/* first column in the panel */ const int nseg, /* number of s-nodes to update jcol */ int *segrep,/* in */ int *repfnz,/* in */ double *dense, /* modified */ double *tempv, /* working array */ pxgstrf_shared_t *pxgstrf_shared, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 1.0) -- * Univ. of California Berkeley, Xerox Palo Alto Research Center, * and Lawrence Berkeley National Lab. * August 15, 1997 * * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j]. * * Return value: * ============= * 0 - successful return * > 0 - number of bytes allocated when run out of space * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; double alpha, beta; #endif GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ register double ukj, ukj1, ukj2; register int lptr, kfnz, isub, irow, i, no_zeros; register int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jsupno, k, ksub, krep, krep_ind, ksupno; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode.*/ int *xsup, *supno; int *lsub, *xlsub, *xlsub_end; double *lusup; int *xlusup, *xlusup_end; double *tempv1; int mem_error; register float flopcnt; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; xlusup_end = Glu->xlusup_end; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; #if ( DEBUGlvel>=2 ) if (jcol==BADCOL) printf("(%d) pdgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n", pnum, jcol, nseg, krep, jsupno, ksupno); #endif if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze; Gstat->procstat[pnum].fcops += flopcnt; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) pdgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\ fsupc %d, nsupr %d, nsupc %d\n", pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc); #endif /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; dense[irow] -= ukj*lusup[luptr]; luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ ukj -= ukj1 * lusup[luptr1]; dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; dense[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] ); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; dense[irow] -= ( ukj*lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] ); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = 1.0; beta = 0.0; #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else dlsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; dmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; /* Scatter */ tempv[i] = 0.0; isub++; } /* Scatter tempv1[] into SPA dense[*] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; dense[irow] -= tempv1[i]; /* Scatter-add */ tempv1[i] = 0.0; ++isub; } } /* else segsze >= 4 */ } /* if jsupno ... */ } /* for each segment... */ /* ------------------------------------------ Process the supernodal portion of L\U[*,j] ------------------------------------------ */ fsupc = SUPER_FSUPC (jsupno); nsupr = xlsub_end[fsupc] - xlsub[fsupc]; if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, pxgstrf_shared)) ) return mem_error; xlusup[jcol] = nextlu; lusup = Glu->lusup; /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = 0.0; #ifdef DEBUG if (jcol == -1) printf("(%d) pdgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n", pnum, jcol, irow, lusup[nextlu]); #endif ++nextlu; } xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */ #if ( DEBUGlevel>=2 ) if (jcol == -1) { nrow = xlusup_end[jcol] - xlusup[jcol]; print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]], &lusup[xlusup[jcol]]); } #endif /* * For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * (1) fsupc < fpanelc, then fst_col := fpanelc * (2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* distance between the current supernode and the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; lptr = xlsub[fsupc] + d_fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* points to the beginning of jcol in supernode L\U[*,jsupno] */ ufirst = xlusup[jcol] + d_fsupc; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) pdgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n", pnum, jcol, fsupc, nsupr, nsupc, nrow); #endif flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; Gstat->procstat[pnum].fcops += flopcnt; /* ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; */ #ifdef USE_VENDOR_BLAS alpha = -1.0; beta = 1.0; /* y := beta*y + alpha*A*x */ #if ( MACH==CRAY_PVP ) STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else dtrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); dgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else dlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); dmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { lusup[isub] -= tempv[i]; /* Scatter-add */ tempv[i] = 0.0; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }
/*! \brief Performs numeric block updates within the relaxed snode. */ int dsnode_bmod ( const int jcol, /* in */ const int jsupno, /* in */ const int fsupc, /* in */ double *dense, /* in */ double *tempv, /* working array */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { #ifdef USE_VENDOR_BLAS #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; double alpha = -1.0, beta = 1.0; #endif int luptr, nsupc, nsupr, nrow; int isub, irow, i, iptr; register int ufirst, nextlu; int *lsub, *xlsub; double *lusup; int *xlusup; flops_t *ops = stat->ops; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; nextlu = xlusup[jcol]; /* * Process the supernodal portion of L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = 0; ++nextlu; } xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ if ( fsupc < jcol ) { luptr = xlusup[fsupc]; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; nsupc = jcol - fsupc; /* Excluding jcol */ ufirst = xlusup[jcol]; /* Points to the beginning of column jcol in supernode L\U(jsupno). */ nrow = nsupr - nsupc; ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; #ifdef USE_VENDOR_BLAS #ifdef _CRAY STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else dtrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); dgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else dlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); dmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], &tempv[0] ); /* Scatter tempv[*] into lusup[*] */ iptr = ufirst + nsupc; for (i = 0; i < nrow; i++) { lusup[iptr++] -= tempv[i]; tempv[i] = 0.0; } #endif } return 0; }
/*! \brief * * <pre> * Purpose * ======= * Perform parallel triangular solves * U(k,:) := A(k,:) \ L(k,k). * Only the process column that owns block column *k* participates * in the work. * * Arguments * ========= * * m (input) int (global) * Number of rows in the matrix. * * k (input) int (global) * The row number of the block row to be factorized. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization; * See SuperLUStat_t structure defined in util.h. * </pre> */ static void pdgstrs2 /************************************************************************/ #ifdef _CRAY ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 ) #else ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat ) #endif { int iam, pkk; int incx = 1; int nsupr; /* number of rows in the block L(:,k) (LDA) */ int segsize; int_t nsupc; /* number of columns in the block */ int_t luptr, iukp, rukp; int_t b, gb, j, klst, knsupc, lk, nb; int_t *xsup = Glu_persist->xsup; int_t *usub; double *lusup, *uval; /* Quick return. */ lk = LBi( k, grid ); /* Local block number */ if ( !Llu->Unzval_br_ptr[lk] ) return; /* Initialization. */ iam = grid->iam; pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); klst = FstBlockC( k+1 ); knsupc = SuperSize( k ); usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ uval = Llu->Unzval_br_ptr[lk]; nb = usub[0]; iukp = BR_HEADER; rukp = 0; if ( iam == pkk ) { lk = LBj( k, grid ); nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ lusup = Llu->Lnzval_bc_ptr[lk]; } else { nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ lusup = Llu->Lval_buf_2[k%2]; } /* Loop through all the row blocks. */ for (b = 0; b < nb; ++b) { gb = usub[iukp]; nsupc = SuperSize( gb ); iukp += UB_DESCRIPTOR; /* Loop through all the segments in the block. */ for (j = 0; j < nsupc; ++j) { segsize = klst - usub[iukp++]; if ( segsize ) { /* Nonzero segment. */ luptr = (knsupc - segsize) * (nsupr + 1); #ifdef _CRAY STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #else dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #endif stat->ops[FACT] += segsize * (segsize + 1); rukp += segsize; } } } /* for b ... */ } /* PDGSTRS2 */
/* Subroutine */ int dgglse_(integer *m, integer *n, integer *p, doublereal * a, integer *lda, doublereal *b, integer *ldb, doublereal *c__, doublereal *d__, doublereal *x, doublereal *work, integer *lwork, integer *info) { /* -- LAPACK driver routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University June 30, 1999 Purpose ======= DGGLSE solves the linear equality-constrained least squares (LSE) problem: minimize || c - A*x ||_2 subject to B*x = d where A is an M-by-N matrix, B is a P-by-N matrix, c is a given M-vector, and d is a given P-vector. It is assumed that P <= N <= M+P, and rank(B) = P and rank( ( A ) ) = N. ( ( B ) ) These conditions ensure that the LSE problem has a unique solution, which is obtained using a GRQ factorization of the matrices B and A. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrices A and B. N >= 0. P (input) INTEGER The number of rows of the matrix B. 0 <= P <= N <= M+P. A (input/output) DOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). B (input/output) DOUBLE PRECISION array, dimension (LDB,N) On entry, the P-by-N matrix B. On exit, B is destroyed. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,P). C (input/output) DOUBLE PRECISION array, dimension (M) On entry, C contains the right hand side vector for the least squares part of the LSE problem. On exit, the residual sum of squares for the solution is given by the sum of squares of elements N-P+1 to M of vector C. D (input/output) DOUBLE PRECISION array, dimension (P) On entry, D contains the right hand side vector for the constrained equation. On exit, D is destroyed. X (output) DOUBLE PRECISION array, dimension (N) On exit, X is the solution of the LSE problem. WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,M+N+P). For optimum performance LWORK >= P+min(M,N)+max(M,N)*NB, where NB is an upper bound for the optimal blocksizes for DGEQRF, SGERQF, DORMQR and SORMRQ. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. ===================================================================== Test the input parameters Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static integer c_n1 = -1; static doublereal c_b29 = -1.; static doublereal c_b31 = 1.; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; /* Local variables */ static integer lopt; extern /* Subroutine */ int dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *) , dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(char * , char *, char *, integer *, doublereal *, integer *, doublereal * , integer *); static integer nb, mn, nr; extern /* Subroutine */ int dggrqf_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), xerbla_(char *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); static integer nb1, nb2, nb3, nb4; extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormrq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); static integer lwkopt; static logical lquery; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; --c__; --d__; --x; --work; /* Function Body */ *info = 0; mn = min(*m,*n); nb1 = ilaenv_(&c__1, "DGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); nb2 = ilaenv_(&c__1, "DGERQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); nb3 = ilaenv_(&c__1, "DORMQR", " ", m, n, p, &c_n1, (ftnlen)6, (ftnlen)1); nb4 = ilaenv_(&c__1, "DORMRQ", " ", m, n, p, &c_n1, (ftnlen)6, (ftnlen)1); /* Computing MAX */ i__1 = max(nb1,nb2), i__1 = max(i__1,nb3); nb = max(i__1,nb4); lwkopt = *p + mn + max(*m,*n) * nb; work[1] = (doublereal) lwkopt; lquery = *lwork == -1; if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*p < 0 || *p > *n || *p < *n - *m) { *info = -3; } else if (*lda < max(1,*m)) { *info = -5; } else if (*ldb < max(1,*p)) { *info = -7; } else /* if(complicated condition) */ { /* Computing MAX */ i__1 = 1, i__2 = *m + *n + *p; if (*lwork < max(i__1,i__2) && ! lquery) { *info = -12; } } if (*info != 0) { i__1 = -(*info); xerbla_("DGGLSE", &i__1); return 0; } else if (lquery) { return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Compute the GRQ factorization of matrices B and A: B*Q' = ( 0 T12 ) P Z'*A*Q' = ( R11 R12 ) N-P N-P P ( 0 R22 ) M+P-N N-P P where T12 and R11 are upper triangular, and Q and Z are orthogonal. */ i__1 = *lwork - *p - mn; dggrqf_(p, m, n, &b[b_offset], ldb, &work[1], &a[a_offset], lda, &work[*p + 1], &work[*p + mn + 1], &i__1, info); lopt = (integer) work[*p + mn + 1]; /* Update c = Z'*c = ( c1 ) N-P ( c2 ) M+P-N */ i__1 = max(1,*m); i__2 = *lwork - *p - mn; dormqr_("Left", "Transpose", m, &c__1, &mn, &a[a_offset], lda, &work[*p + 1], &c__[1], &i__1, &work[*p + mn + 1], &i__2, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) work[*p + mn + 1]; lopt = max(i__1,i__2); /* Solve T12*x2 = d for x2 */ dtrsv_("Upper", "No transpose", "Non unit", p, &b_ref(1, *n - *p + 1), ldb, &d__[1], &c__1); /* Update c1 */ i__1 = *n - *p; dgemv_("No transpose", &i__1, p, &c_b29, &a_ref(1, *n - *p + 1), lda, & d__[1], &c__1, &c_b31, &c__[1], &c__1); /* Sovle R11*x1 = c1 for x1 */ i__1 = *n - *p; dtrsv_("Upper", "No transpose", "Non unit", &i__1, &a[a_offset], lda, & c__[1], &c__1); /* Put the solutions in X */ i__1 = *n - *p; dcopy_(&i__1, &c__[1], &c__1, &x[1], &c__1); dcopy_(p, &d__[1], &c__1, &x[*n - *p + 1], &c__1); /* Compute the residual vector: */ if (*m < *n) { nr = *m + *p - *n; i__1 = *n - *m; dgemv_("No transpose", &nr, &i__1, &c_b29, &a_ref(*n - *p + 1, *m + 1) , lda, &d__[nr + 1], &c__1, &c_b31, &c__[*n - *p + 1], &c__1); } else { nr = *p; } dtrmv_("Upper", "No transpose", "Non unit", &nr, &a_ref(*n - *p + 1, *n - *p + 1), lda, &d__[1], &c__1); daxpy_(&nr, &c_b29, &d__[1], &c__1, &c__[*n - *p + 1], &c__1); /* Backward transformation x = Q'*x */ i__1 = *lwork - *p - mn; dormrq_("Left", "Transpose", n, &c__1, p, &b[b_offset], ldb, &work[1], &x[ 1], n, &work[*p + mn + 1], &i__1, info); /* Computing MAX */ i__1 = lopt, i__2 = (integer) work[*p + mn + 1]; work[1] = (doublereal) (*p + mn + max(i__1,i__2)); return 0; /* End of DGGLSE */ } /* dgglse_ */
static void pdgstrs2 /************************************************************************/ #ifdef _CRAY ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 ) #else ( int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat ) #endif /* * Purpose * ======= * Perform parallel triangular solves * U(k,:) := A(k,:) \ L(k,k). * Only the process row that owns block row *k* participates * in the work. * * Arguments * ========= * * m (input) int (global) * Number of rows in the matrix. * * k (input) int (global) * The row number of the block row to be factorized. * * Glu_persist (input) Glu_persist_t* * Global data structures (xsup, supno) replicated on all processes. * * grid (input) gridinfo_t* * The 2D process mesh. * * Llu (input/output) LocalLU_t* * Local data structures to store distributed L and U matrices. * * stat (output) SuperLUStat_t* * Record the statistics about the factorization; * See SuperLUStat_t structure defined in util.h. * */ { int iam, pkk; int incx = 1; int nsupr; /* number of rows in the block L(:,k) (LDA) */ int segsize; int_t nsupc; /* number of columns in the block */ int_t luptr, iukp, rukp; int_t b, gb, j, klst, knsupc, lk, nb; int_t *xsup = Glu_persist->xsup; int_t *usub; double *lusup, *uval; /* Quick return. */ lk = LBi( k, grid ); /* Local block number */ if ( !Llu->Unzval_br_ptr[lk] ) return; /* Initialization. */ iam = grid->iam; pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); klst = FstBlockC( k+1 ); knsupc = SuperSize( k ); usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ uval = Llu->Unzval_br_ptr[lk]; nb = usub[0]; iukp = BR_HEADER; rukp = 0; if ( iam == pkk ) { lk = LBj( k, grid ); nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ lusup = Llu->Lnzval_bc_ptr[lk]; } else { nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ lusup = Llu->Lval_buf_2[k%2]; } /* Loop through all the row blocks. */ for (b = 0; b < nb; ++b) { gb = usub[iukp]; nsupc = SuperSize( gb ); iukp += UB_DESCRIPTOR; /* Loop through all the segments in the block. */ for (j = 0; j < nsupc; ++j) { segsize = klst - usub[iukp++]; if ( segsize ) { /* Nonzero segment. */ luptr = (knsupc - segsize) * (nsupr + 1); #ifdef _CRAY STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #elif defined (USE_VENDOR_BLAS) dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx, 1, 1, 1); #else dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx); #endif stat->ops[FACT] += segsize * (segsize + 1); rukp += segsize; } } } /* for b ... */ } /* PDGSTRS2 */
/* Subroutine */ int dtrrfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer * ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, integer *info) { /* -- LAPACK routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University September 30, 1994 Purpose ======= DTRRFS provides error bounds and backward error estimates for the solution to a system of linear equations with a triangular coefficient matrix. The solution matrix X must be computed by DTRTRS or some other means before entering this routine. DTRRFS does not do iterative refinement because doing so cannot improve the backward error. Arguments ========= UPLO (input) CHARACTER*1 = 'U': A is upper triangular; = 'L': A is lower triangular. TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A**T * X = B (Transpose) = 'C': A**H * X = B (Conjugate transpose = Transpose) DIAG (input) CHARACTER*1 = 'N': A is non-unit triangular; = 'U': A is unit triangular. N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrices B and X. NRHS >= 0. A (input) DOUBLE PRECISION array, dimension (LDA,N) The triangular matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input) DOUBLE PRECISION array, dimension (LDB,NRHS) The right hand side matrix B. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). X (input) DOUBLE PRECISION array, dimension (LDX,NRHS) The solution matrix X. LDX (input) INTEGER The leading dimension of the array X. LDX >= max(1,N). FERR (output) DOUBLE PRECISION array, dimension (NRHS) The estimated forward error bound for each solution vector X(j) (the j-th column of the solution matrix X). If XTRUE is the true solution corresponding to X(j), FERR(j) is an estimated upper bound for the magnitude of the largest element in (X(j) - XTRUE) divided by the magnitude of the largest element in X(j). The estimate is as reliable as the estimate for RCOND, and is almost always a slight overestimate of the true error. BERR (output) DOUBLE PRECISION array, dimension (NRHS) The componentwise relative backward error of each solution vector X(j) (i.e., the smallest relative change in any element of A or B that makes X(j) an exact solution). WORK (workspace) DOUBLE PRECISION array, dimension (3*N) IWORK (workspace) INTEGER array, dimension (N) INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== Test the input parameters. Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static doublereal c_b19 = -1.; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, x_dim1, x_offset, i__1, i__2, i__3; doublereal d__1, d__2, d__3; /* Local variables */ static integer kase; static doublereal safe1, safe2; static integer i__, j, k; static doublereal s; extern logical lsame_(char *, char *); extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static logical upper; extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamch_(char *); extern /* Subroutine */ int dlacon_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static doublereal xk; static integer nz; static doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *); static logical notran; static char transt[1]; static logical nounit; static doublereal lstres, eps; #define a_ref(a_1,a_2) a[(a_2)*a_dim1 + a_1] #define b_ref(a_1,a_2) b[(a_2)*b_dim1 + a_1] #define x_ref(a_1,a_2) x[(a_2)*x_dim1 + a_1] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; x_dim1 = *ldx; x_offset = 1 + x_dim1 * 1; x -= x_offset; --ferr; --berr; --work; --iwork; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); notran = lsame_(trans, "N"); nounit = lsame_(diag, "N"); if (! upper && ! lsame_(uplo, "L")) { *info = -1; } else if (! notran && ! lsame_(trans, "T") && ! lsame_(trans, "C")) { *info = -2; } else if (! nounit && ! lsame_(diag, "U")) { *info = -3; } else if (*n < 0) { *info = -4; } else if (*nrhs < 0) { *info = -5; } else if (*lda < max(1,*n)) { *info = -7; } else if (*ldb < max(1,*n)) { *info = -9; } else if (*ldx < max(1,*n)) { *info = -11; } if (*info != 0) { i__1 = -(*info); xerbla_("DTRRFS", &i__1); return 0; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { ferr[j] = 0.; berr[j] = 0.; /* L10: */ } return 0; } if (notran) { *(unsigned char *)transt = 'T'; } else { *(unsigned char *)transt = 'N'; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = *n + 1; eps = dlamch_("Epsilon"); safmin = dlamch_("Safe minimum"); safe1 = nz * safmin; safe2 = safe1 / eps; /* Do for each right hand side */ i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { /* Compute residual R = B - op(A) * X, where op(A) = A or A', depending on TRANS. */ dcopy_(n, &x_ref(1, j), &c__1, &work[*n + 1], &c__1); dtrmv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1); daxpy_(n, &c_b19, &b_ref(1, j), &c__1, &work[*n + 1], &c__1); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th components of the numerator and denominator before dividing. */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[i__] = (d__1 = b_ref(i__, j), abs(d__1)); /* L20: */ } if (notran) { /* Compute abs(A)*abs(X) + abs(B). */ if (upper) { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (d__1 = x_ref(k, j), abs(d__1)); i__3 = k; for (i__ = 1; i__ <= i__3; ++i__) { work[i__] += (d__1 = a_ref(i__, k), abs(d__1)) * xk; /* L30: */ } /* L40: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (d__1 = x_ref(k, j), abs(d__1)); i__3 = k - 1; for (i__ = 1; i__ <= i__3; ++i__) { work[i__] += (d__1 = a_ref(i__, k), abs(d__1)) * xk; /* L50: */ } work[k] += xk; /* L60: */ } } } else { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (d__1 = x_ref(k, j), abs(d__1)); i__3 = *n; for (i__ = k; i__ <= i__3; ++i__) { work[i__] += (d__1 = a_ref(i__, k), abs(d__1)) * xk; /* L70: */ } /* L80: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { xk = (d__1 = x_ref(k, j), abs(d__1)); i__3 = *n; for (i__ = k + 1; i__ <= i__3; ++i__) { work[i__] += (d__1 = a_ref(i__, k), abs(d__1)) * xk; /* L90: */ } work[k] += xk; /* L100: */ } } } } else { /* Compute abs(A')*abs(X) + abs(B). */ if (upper) { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = 0.; i__3 = k; for (i__ = 1; i__ <= i__3; ++i__) { s += (d__1 = a_ref(i__, k), abs(d__1)) * (d__2 = x_ref(i__, j), abs(d__2)); /* L110: */ } work[k] += s; /* L120: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = (d__1 = x_ref(k, j), abs(d__1)); i__3 = k - 1; for (i__ = 1; i__ <= i__3; ++i__) { s += (d__1 = a_ref(i__, k), abs(d__1)) * (d__2 = x_ref(i__, j), abs(d__2)); /* L130: */ } work[k] += s; /* L140: */ } } } else { if (nounit) { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = 0.; i__3 = *n; for (i__ = k; i__ <= i__3; ++i__) { s += (d__1 = a_ref(i__, k), abs(d__1)) * (d__2 = x_ref(i__, j), abs(d__2)); /* L150: */ } work[k] += s; /* L160: */ } } else { i__2 = *n; for (k = 1; k <= i__2; ++k) { s = (d__1 = x_ref(k, j), abs(d__1)); i__3 = *n; for (i__ = k + 1; i__ <= i__3; ++i__) { s += (d__1 = a_ref(i__, k), abs(d__1)) * (d__2 = x_ref(i__, j), abs(d__2)); /* L170: */ } work[k] += s; /* L180: */ } } } } s = 0.; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] > safe2) { /* Computing MAX */ d__2 = s, d__3 = (d__1 = work[*n + i__], abs(d__1)) / work[ i__]; s = max(d__2,d__3); } else { /* Computing MAX */ d__2 = s, d__3 = ((d__1 = work[*n + i__], abs(d__1)) + safe1) / (work[i__] + safe1); s = max(d__2,d__3); } /* L190: */ } berr[j] = s; /* Bound error from formula norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use DLACON to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] > safe2) { work[i__] = (d__1 = work[*n + i__], abs(d__1)) + nz * eps * work[i__]; } else { work[i__] = (d__1 = work[*n + i__], abs(d__1)) + nz * eps * work[i__] + safe1; } /* L200: */ } kase = 0; L210: dlacon_(n, &work[(*n << 1) + 1], &work[*n + 1], &iwork[1], &ferr[j], & kase); if (kase != 0) { if (kase == 1) { /* Multiply by diag(W)*inv(op(A)'). */ dtrsv_(uplo, transt, diag, n, &a[a_offset], lda, &work[*n + 1] , &c__1); i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[*n + i__] = work[i__] * work[*n + i__]; /* L220: */ } } else { /* Multiply by inv(op(A))*diag(W). */ i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work[*n + i__] = work[i__] * work[*n + i__]; /* L230: */ } dtrsv_(uplo, trans, diag, n, &a[a_offset], lda, &work[*n + 1], &c__1); } goto L210; } /* Normalize error. */ lstres = 0.; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { /* Computing MAX */ d__2 = lstres, d__3 = (d__1 = x_ref(i__, j), abs(d__1)); lstres = max(d__2,d__3); /* L240: */ } if (lstres != 0.) { ferr[j] /= lstres; } /* L250: */ } return 0; /* End of DTRRFS */ } /* dtrrfs_ */
// BLAS Level 2 void dtrsv( char uplo, char trans, char diag, int N, double *A, int lda, double *x, int incx ){ dtrsv_( &uplo, &trans, &diag, &N, A, &lda, x, &incx ); };
int main( int argc, char** argv ) { obj_t a, x; obj_t x_save; obj_t alpha; dim_t m; dim_t p; dim_t p_begin, p_end, p_inc; int m_input; num_t dt_a, dt_x; num_t dt_alpha; int r, n_repeats; uplo_t uplo; double dtime; double dtime_save; double gflops; //bli_init(); n_repeats = 3; #ifndef PRINT p_begin = 40; p_end = 2000; p_inc = 40; m_input = -1; #else p_begin = 16; p_end = 16; p_inc = 1; m_input = 15; n_input = 15; #endif dt_alpha = dt_a = dt_x = BLIS_DOUBLE; uplo = BLIS_LOWER; // Begin with initializing the last entry to zero so that // matlab allocates space for the entire array once up-front. for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; #ifdef BLIS printf( "data_trsv_blis" ); #else printf( "data_trv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", ( unsigned long )(p - p_begin + 1)/p_inc + 1, ( unsigned long )0, 0.0 ); for ( p = p_begin; p <= p_end; p += p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); else m = ( dim_t ) m_input; bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); bli_obj_create( dt_a, m, m, 0, 0, &a ); bli_obj_create( dt_x, m, 1, 0, 0, &x ); bli_obj_create( dt_x, m, 1, 0, 0, &x_save ); bli_randm( &a ); bli_randm( &x ); bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( uplo, &a ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a ); bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a ); // Randomize A and zero the unstored triangle to ensure the // implementation reads only from the stored region. bli_randm( &a ); bli_mktrim( &a ); // Load the diagonal of A to make it more likely to be invertible. bli_shiftd( &BLIS_TWO, &a ); bli_setsc( (1.0/1.0), 0.0, &alpha ); bli_copym( &x, &x_save ); dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { bli_copym( &x_save, &x ); dtime = bli_clock(); #ifdef PRINT bli_printm( "a", &a, "%4.1f", "" ); bli_printm( "x", &x, "%4.1f", "" ); #endif #ifdef BLIS bli_trsv( &BLIS_ONE, &a, &x ); #else f77_char uploa = 'L'; f77_char transa = 'N'; f77_char diaga = 'N'; f77_int mm = bli_obj_length( &a ); f77_int lda = bli_obj_col_stride( &a ); f77_int incx = bli_obj_vector_inc( &x ); double* ap = bli_obj_buffer( &a ); double* xp = bli_obj_buffer( &x ); dtrsv_( &uploa, &transa, &diaga, &mm, ap, &lda, xp, &incx ); #endif #ifdef PRINT bli_printm( "x after", &x, "%4.1f", "" ); exit(1); #endif dtime_save = bli_clock_min_diff( dtime_save, dtime ); } gflops = ( 1.0 * m * m ) / ( dtime_save * 1.0e9 ); #ifdef BLIS printf( "data_trsv_blis" ); #else printf( "data_trsv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", ( unsigned long )(p - p_begin + 1)/p_inc + 1, ( unsigned long )m, gflops ); bli_obj_free( &alpha ); bli_obj_free( &a ); bli_obj_free( &x ); bli_obj_free( &x_save ); } //bli_finalize(); return 0; }