/*
 * f2c_ztrsm: f2c-named entry point for the triangular solve.
 *
 * Every argument is handed to ztrsm_ unchanged and in the same order.
 * The result of ztrsm_ (if any) is not inspected; this function always
 * returns 0.
 */
int f2c_ztrsm(char* side, char* uplo, char* trans, char* diag,
              integer* M, integer* N,
              doublecomplex* alpha,
              doublecomplex* A, integer* lda,
              doublecomplex* B, integer* ldb)
{
    ztrsm_(side, uplo, trans, diag, M, N, alpha, A, lda, B, ldb);

    return 0;
}
// complex double void TriangularSolve(char *side, char *uplo, char *transa, char *diag, int *m, int *n, std::complex<double> *alpha, std::complex<double> *A, int *lda, std::complex<double> *B, int *ldb) { #ifndef RELEASE CallStackEntry entry("TriangularSolve"); if (m <= 0) throw std::logic_error("Invalid matrix height for triangular solve"); if (n <= 0) throw std::logic_error("Invalid matrix width for triangular solve"); #endif assert(m > 0 && n > 0); ztrsm_(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); }
/*
 * cublasDtrsm: by-value front end that forwards to ztrsm_.
 *
 * The scalar alpha is split into a small array whose REAL_PART and
 * IMAG_PART slots receive creal(alpha) and cimag(alpha); the array is what
 * ztrsm_ receives as its alpha argument. All other scalars are passed by
 * address, as ztrsm_ takes pointers.
 *
 * NOTE(review): the name and the `double` operands suggest a real-valued
 * solve, yet the call goes to ztrsm_ -- confirm this pairing is intended.
 */
void cublasDtrsm( char side, char uplo, char trans, char diag,
                  int m, int n, double alpha,
                  double *A, int ldA,
                  double *B, int ldB )
{
    double alpha_parts[REAL_PART+IMAG_PART+1];

    alpha_parts[REAL_PART] = creal(alpha);
    alpha_parts[IMAG_PART] = cimag(alpha);

    ztrsm_( &side, &uplo, &trans, &diag,
            &m, &n,
            alpha_parts,
            (double *) A, &ldA,
            (double *) B, &ldB );
}
/*
 * zhegst_: C translation of the LAPACK routine ZHEGST. The Purpose and
 * Arguments text of the routine is embedded in the body below; the notes here
 * only explain how to read the translated code.
 *
 *  - `a` and `b` are shifted back by a_offset / b_offset (1 + leading
 *    dimension), so a[i + j * a_dim1] addresses element (i,j) with 1-based
 *    i and j, and &a[a_offset] is the pointer the caller passed in.
 *  - An invalid argument stores a negative code in *info, reports the
 *    positive code through xerbla_ and returns. The function's own return
 *    value is 0 on every path.
 *  - *n == 0 returns immediately. If the block size nb obtained from ilaenv_
 *    is <= 1 or >= *n, the whole problem is passed to zhegs2_; otherwise the
 *    matrices are processed in diagonal blocks of kb = MIN(remaining, nb)
 *    rows/columns.
 *  - Loops of the form `for (k = 1; step < 0 ? k >= end : k <= end; k += step)`
 *    run k from 1 up to *n in increments of nb.
 *  - In every blocked branch zhemm_ is called twice with identical arguments,
 *    once before and once after the zher2k_ call; the repetition is
 *    part of the algorithm as written, not a copy/paste slip to remove.
 *  - c__1, c_n1, c_b1, c_b2, c_b18 and the MIN / MAX macros are not defined
 *    inside this function; they must be provided elsewhere in the file.
 */
int zhegst_(int *itype, char *uplo, int *n, doublecomplex *a, int *lda, doublecomplex *b, int *ldb, int *info) { /* System generated locals */ int a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; doublecomplex z__1; /* Local variables */ int k, kb, nb; extern int lsame_(char *, char *); extern int zhemm_(char *, char *, int *, int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, doublecomplex *, int *); int upper; extern int ztrmm_(char *, char *, char *, char *, int *, int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *), ztrsm_(char *, char *, char *, char *, int *, int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *), zhegs2_(int *, char *, int *, doublecomplex *, int *, doublecomplex *, int *, int *), zher2k_(char *, char *, int *, int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *, double *, doublecomplex *, int *), xerbla_(char *, int *); extern int ilaenv_(int *, char *, char *, int *, int *, int *, int *); /* -- LAPACK routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZHEGST reduces a complex Hermitian-definite generalized */ /* eigenproblem to standard form. */ /* If ITYPE = 1, the problem is A*x = lambda*B*x, */ /* and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H) */ /* If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or */ /* B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L. */ /* B must have been previously factorized as U**H*U or L*L**H by ZPOTRF. */ /* Arguments */ /* ========= */ /* ITYPE (input) INTEGER */ /* = 1: compute inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H); */ /* = 2 or 3: compute U*A*U**H or L**H*A*L. 
*/ /* UPLO (input) CHARACTER*1 */ /* = 'U': Upper triangle of A is stored and B is factored as */ /* U**H*U; */ /* = 'L': Lower triangle of A is stored and B is factored as */ /* L*L**H. */ /* N (input) INTEGER */ /* The order of the matrices A and B. N >= 0. */ /* A (input/output) COMPLEX*16 array, dimension (LDA,N) */ /* On entry, the Hermitian matrix A. If UPLO = 'U', the leading */ /* N-by-N upper triangular part of A contains the upper */ /* triangular part of the matrix A, and the strictly lower */ /* triangular part of A is not referenced. If UPLO = 'L', the */ /* leading N-by-N lower triangular part of A contains the lower */ /* triangular part of the matrix A, and the strictly upper */ /* triangular part of A is not referenced. */ /* On exit, if INFO = 0, the transformed matrix, stored in the */ /* same format as A. */ /* LDA (input) INTEGER */ /* The leading dimension of the array A. LDA >= MAX(1,N). */ /* B (input) COMPLEX*16 array, dimension (LDB,N) */ /* The triangular factor from the Cholesky factorization of B, */ /* as returned by ZPOTRF. */ /* LDB (input) INTEGER */ /* The leading dimension of the array B. LDB >= MAX(1,N). */ /* INFO (output) INTEGER */ /* = 0: successful exit */ /* < 0: if INFO = -i, the i-th argument had an illegal value */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1; b -= b_offset; /* Function Body */ *info = 0; upper = lsame_(uplo, "U"); if (*itype < 1 || *itype > 3) { *info = -1; } else if (! upper && ! 
lsame_(uplo, "L")) { *info = -2; } else if (*n < 0) { *info = -3; } else if (*lda < MAX(1,*n)) { *info = -5; } else if (*ldb < MAX(1,*n)) { *info = -7; } if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGST", &i__1); return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Determine the block size for this environment. */ nb = ilaenv_(&c__1, "ZHEGST", uplo, n, &c_n1, &c_n1, &c_n1); if (nb <= 1 || nb >= *n) { /* Use unblocked code */ zhegs2_(itype, uplo, n, &a[a_offset], lda, &b[b_offset], ldb, info); } else { /* Use blocked code */ if (*itype == 1) { if (upper) { /* Compute inv(U')*A*inv(U) */ i__1 = *n; i__2 = nb; for (k = 1; i__2 < 0 ? k >= i__1 : k <= i__1; k += i__2) { /* Computing MIN */ i__3 = *n - k + 1; kb = MIN(i__3,nb); /* Update the upper triangle of A(k:n,k:n) */ zhegs2_(itype, uplo, &kb, &a[k + k * a_dim1], lda, &b[k + k * b_dim1], ldb, info); if (k + kb <= *n) { i__3 = *n - k - kb + 1; ztrsm_("Left", uplo, "Conjugate transpose", "Non-unit" , &kb, &i__3, &c_b1, &b[k + k * b_dim1], ldb, &a[k + (k + kb) * a_dim1], lda); i__3 = *n - k - kb + 1; z__1.r = -.5, z__1.i = -0.; zhemm_("Left", uplo, &kb, &i__3, &z__1, &a[k + k * a_dim1], lda, &b[k + (k + kb) * b_dim1], ldb, &c_b1, &a[k + (k + kb) * a_dim1], lda); i__3 = *n - k - kb + 1; z__1.r = -1., z__1.i = -0.; zher2k_(uplo, "Conjugate transpose", &i__3, &kb, & z__1, &a[k + (k + kb) * a_dim1], lda, &b[k + ( k + kb) * b_dim1], ldb, &c_b18, &a[k + kb + ( k + kb) * a_dim1], lda) ; i__3 = *n - k - kb + 1; z__1.r = -.5, z__1.i = -0.; zhemm_("Left", uplo, &kb, &i__3, &z__1, &a[k + k * a_dim1], lda, &b[k + (k + kb) * b_dim1], ldb, &c_b1, &a[k + (k + kb) * a_dim1], lda); i__3 = *n - k - kb + 1; ztrsm_("Right", uplo, "No transpose", "Non-unit", &kb, &i__3, &c_b1, &b[k + kb + (k + kb) * b_dim1], ldb, &a[k + (k + kb) * a_dim1], lda); } /* L10: */ } } else { /* Compute inv(L)*A*inv(L') */ i__2 = *n; i__1 = nb; for (k = 1; i__1 < 0 ? 
k >= i__2 : k <= i__2; k += i__1) { /* Computing MIN */ i__3 = *n - k + 1; kb = MIN(i__3,nb); /* Update the lower triangle of A(k:n,k:n) */ zhegs2_(itype, uplo, &kb, &a[k + k * a_dim1], lda, &b[k + k * b_dim1], ldb, info); if (k + kb <= *n) { i__3 = *n - k - kb + 1; ztrsm_("Right", uplo, "Conjugate transpose", "Non-un" "it", &i__3, &kb, &c_b1, &b[k + k * b_dim1], ldb, &a[k + kb + k * a_dim1], lda); i__3 = *n - k - kb + 1; z__1.r = -.5, z__1.i = -0.; zhemm_("Right", uplo, &i__3, &kb, &z__1, &a[k + k * a_dim1], lda, &b[k + kb + k * b_dim1], ldb, & c_b1, &a[k + kb + k * a_dim1], lda); i__3 = *n - k - kb + 1; z__1.r = -1., z__1.i = -0.; zher2k_(uplo, "No transpose", &i__3, &kb, &z__1, &a[k + kb + k * a_dim1], lda, &b[k + kb + k * b_dim1], ldb, &c_b18, &a[k + kb + (k + kb) * a_dim1], lda); i__3 = *n - k - kb + 1; z__1.r = -.5, z__1.i = -0.; zhemm_("Right", uplo, &i__3, &kb, &z__1, &a[k + k * a_dim1], lda, &b[k + kb + k * b_dim1], ldb, & c_b1, &a[k + kb + k * a_dim1], lda); i__3 = *n - k - kb + 1; ztrsm_("Left", uplo, "No transpose", "Non-unit", & i__3, &kb, &c_b1, &b[k + kb + (k + kb) * b_dim1], ldb, &a[k + kb + k * a_dim1], lda); } /* L20: */ } } } else { if (upper) { /* Compute U*A*U' */ i__1 = *n; i__2 = nb; for (k = 1; i__2 < 0 ? 
k >= i__1 : k <= i__1; k += i__2) { /* Computing MIN */ i__3 = *n - k + 1; kb = MIN(i__3,nb); /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */ i__3 = k - 1; ztrmm_("Left", uplo, "No transpose", "Non-unit", &i__3, & kb, &c_b1, &b[b_offset], ldb, &a[k * a_dim1 + 1], lda); i__3 = k - 1; zhemm_("Right", uplo, &i__3, &kb, &c_b2, &a[k + k * a_dim1], lda, &b[k * b_dim1 + 1], ldb, &c_b1, &a[ k * a_dim1 + 1], lda); i__3 = k - 1; zher2k_(uplo, "No transpose", &i__3, &kb, &c_b1, &a[k * a_dim1 + 1], lda, &b[k * b_dim1 + 1], ldb, &c_b18, &a[a_offset], lda); i__3 = k - 1; zhemm_("Right", uplo, &i__3, &kb, &c_b2, &a[k + k * a_dim1], lda, &b[k * b_dim1 + 1], ldb, &c_b1, &a[ k * a_dim1 + 1], lda); i__3 = k - 1; ztrmm_("Right", uplo, "Conjugate transpose", "Non-unit", & i__3, &kb, &c_b1, &b[k + k * b_dim1], ldb, &a[k * a_dim1 + 1], lda); zhegs2_(itype, uplo, &kb, &a[k + k * a_dim1], lda, &b[k + k * b_dim1], ldb, info); /* L30: */ } } else { /* Compute L'*A*L */ i__2 = *n; i__1 = nb; for (k = 1; i__1 < 0 ? k >= i__2 : k <= i__2; k += i__1) { /* Computing MIN */ i__3 = *n - k + 1; kb = MIN(i__3,nb); /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */ i__3 = k - 1; ztrmm_("Right", uplo, "No transpose", "Non-unit", &kb, & i__3, &c_b1, &b[b_offset], ldb, &a[k + a_dim1], lda); i__3 = k - 1; zhemm_("Left", uplo, &kb, &i__3, &c_b2, &a[k + k * a_dim1] , lda, &b[k + b_dim1], ldb, &c_b1, &a[k + a_dim1], lda); i__3 = k - 1; zher2k_(uplo, "Conjugate transpose", &i__3, &kb, &c_b1, & a[k + a_dim1], lda, &b[k + b_dim1], ldb, &c_b18, & a[a_offset], lda); i__3 = k - 1; zhemm_("Left", uplo, &kb, &i__3, &c_b2, &a[k + k * a_dim1] , lda, &b[k + b_dim1], ldb, &c_b1, &a[k + a_dim1], lda); i__3 = k - 1; ztrmm_("Left", uplo, "Conjugate transpose", "Non-unit", & kb, &i__3, &c_b1, &b[k + k * b_dim1], ldb, &a[k + a_dim1], lda); zhegs2_(itype, uplo, &kb, &a[k + k * a_dim1], lda, &b[k + k * b_dim1], ldb, info); /* L40: */ } } } } return 0; /* End of ZHEGST */ } /* zhegst_ */
/* Subroutine */ int zpotrs_(char *uplo, integer *n, integer *nrhs,
	doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb,
	integer *info)
{
/*  ZPOTRS (C translation of the LAPACK 3.0 routine)

    Solves A*X = B for a Hermitian positive definite A, given the
    Cholesky factor of A produced by ZPOTRF (A = U**H*U or A = L*L**H).

    uplo  (in)      "U": a holds the upper factor U; "L": a holds the
                    lower factor L.
    n     (in)      order of A, >= 0.
    nrhs  (in)      number of columns of B, >= 0.
    a     (in)      the triangular factor, leading dimension *lda.
    lda   (in)      >= max(1,n).
    b     (in/out)  right hand sides on entry, solution X on exit.
    ldb   (in)      >= max(1,n).
    info  (out)     0 on success; -i if the i-th argument was illegal
                    (the error is also reported through xerbla_).

    The function value itself is always 0.
*/
    /* alpha = 1 + 0i for both triangular solves */
    static doublecomplex unit_alpha = {1.,0.};

    extern logical lsame_(char *, char *);
    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
	    doublecomplex *, integer *), xerbla_(char *, integer *);

    logical use_upper;
    integer bad_arg;
    integer min_ld;
    char *triangle;
    char *first_op;
    char *second_op;

    *info = 0;
    use_upper = lsame_(uplo, "U");
    min_ld = max(1,*n);

    /* Position of the first offending argument, 0 if all are fine. */
    bad_arg = 0;
    if (! use_upper && ! lsame_(uplo, "L")) {
	bad_arg = 1;
    } else if (*n < 0) {
	bad_arg = 2;
    } else if (*nrhs < 0) {
	bad_arg = 3;
    } else if (*lda < min_ld) {
	bad_arg = 5;
    } else if (*ldb < min_ld) {
	bad_arg = 7;
    }
    if (bad_arg != 0) {
	*info = -bad_arg;
	xerbla_("ZPOTRS", &bad_arg);
	return 0;
    }

    /* Nothing to solve for an empty system or no right hand sides. */
    if (*n == 0 || *nrhs == 0) {
	return 0;
    }

    /* Upper: solve U'*X = B, then U*X = B.
       Lower: solve L*X = B,  then L'*X = B.
       B is overwritten by the result of each solve. */
    if (use_upper) {
	triangle = "Upper";
	first_op = "Conjugate transpose";
	second_op = "No transpose";
    } else {
	triangle = "Lower";
	first_op = "No transpose";
	second_op = "Conjugate transpose";
    }
    ztrsm_("Left", triangle, first_op, "Non-unit", n, nrhs, &unit_alpha,
	    a, lda, b, ldb);
    ztrsm_("Left", triangle, second_op, "Non-unit", n, nrhs, &unit_alpha,
	    a, lda, b, ldb);

    return 0;

/*     End of ZPOTRS */

} /* zpotrs_ */
/*
 * zgels_: C translation of the LAPACK driver ZGELS. The Purpose and Arguments
 * text of the routine is embedded in the body below; the notes here only
 * explain how to read the translated code.
 *
 *  - `a` and `b` are shifted back by a_offset / b_offset and `work` by one, so
 *    that a[i + j * a_dim1], b_ref(i,j) and work[i] use 1-based indices;
 *    &a[a_offset], &b[b_offset] and &work[1] are the caller's pointers.
 *  - tpsd is TRUE_ unless lsame_(trans, "N") holds.
 *  - When *info is 0 or -10 the block sizes reported by ilaenv_ are combined
 *    into wsize, which is written to work[1]. A workspace query
 *    (*lwork == -1) returns right after that.
 *  - If min(*m, *n, *nrhs) == 0, the first max(*m,*n) rows of B are set to
 *    zero through zlaset_ and the routine returns.
 *  - iascl / ibscl record how A / B were rescaled (0: not scaled, 1: scaled
 *    up to smlnum, 2: scaled down to bignum). The scaling is undone at the
 *    end on the first scllen rows of B. If A is entirely zero, B is zeroed
 *    and control jumps to L50.
 *  - work[1..mn] is passed to zgeqrf_ / zgelqf_ and later to zunmqr_ /
 *    zunmlq_ as one argument (presumably the reflector scalars -- confirm
 *    against those routines); work[mn+1..] with length *lwork - mn is the
 *    scratch area handed to them.
 *  - Most locals are declared `static`, so their storage is shared by all
 *    calls. NOTE(review): confirm before calling this from several threads.
 *  - The function value itself is always 0; status is reported in *info.
 */
/* Subroutine */ int zgels_(char *trans, integer *m, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { /* -- LAPACK driver routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University June 30, 1999 Purpose ======= ZGELS solves overdetermined or underdetermined complex linear systems involving an M-by-N matrix A, or its conjugate-transpose, using a QR or LQ factorization of A. It is assumed that A has full rank. The following options are provided: 1. If TRANS = 'N' and m >= n: find the least squares solution of an overdetermined system, i.e., solve the least squares problem minimize || B - A*X ||. 2. If TRANS = 'N' and m < n: find the minimum norm solution of an underdetermined system A * X = B. 3. If TRANS = 'C' and m >= n: find the minimum norm solution of an undetermined system A**H * X = B. 4. If TRANS = 'C' and m < n: find the least squares solution of an overdetermined system, i.e., solve the least squares problem minimize || B - A**H * X ||. Several right hand side vectors b and solution vectors x can be handled in a single call; they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix X. Arguments ========= TRANS (input) CHARACTER = 'N': the linear system involves A; = 'C': the linear system involves A**H. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrices B and X. NRHS >= 0. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the M-by-N matrix A. if M >= N, A is overwritten by details of its QR factorization as returned by ZGEQRF; if M < N, A is overwritten by details of its LQ factorization as returned by ZGELQF. 
LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). B (input/output) COMPLEX*16 array, dimension (LDB,NRHS) On entry, the matrix B of right hand side vectors, stored columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS if TRANS = 'C'. On exit, B is overwritten by the solution vectors, stored columnwise: if TRANS = 'N' and m >= n, rows 1 to n of B contain the least squares solution vectors; the residual sum of squares for the solution in each column is given by the sum of squares of elements N+1 to M in that column; if TRANS = 'N' and m < n, rows 1 to N of B contain the minimum norm solution vectors; if TRANS = 'C' and m >= n, rows 1 to M of B contain the minimum norm solution vectors; if TRANS = 'C' and m < n, rows 1 to M of B contain the least squares solution vectors; the residual sum of squares for the solution in each column is given by the sum of squares of elements M+1 to N in that column. LDB (input) INTEGER The leading dimension of the array B. LDB >= MAX(1,M,N). WORK (workspace/output) COMPLEX*16 array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max( 1, MN + max( MN, NRHS ) ). For optimal performance, LWORK >= max( 1, MN + max( MN, NRHS )*NB ). where MN = min(M,N) and NB is the optimum block size. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== Test the input arguments. 
Parameter adjustments */ /* Table of constant values */ static doublecomplex c_b1 = {0.,0.}; static doublecomplex c_b2 = {1.,0.}; static integer c__1 = 1; static integer c_n1 = -1; static integer c__0 = 0; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; doublereal d__1; /* Local variables */ static doublereal anrm, bnrm; static integer brow; static logical tpsd; static integer i__, j, iascl, ibscl; extern logical lsame_(char *, char *); static integer wsize; static doublereal rwork[1]; extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); static integer nb; extern doublereal dlamch_(char *); static integer mn; extern /* Subroutine */ int xerbla_(char *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); static integer scllen; static doublereal bignum; extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zlaset_( char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); static doublereal smlnum; static logical lquery; extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, 
doublecomplex *, integer *, integer *); #define b_subscr(a_1,a_2) (a_2)*b_dim1 + a_1 #define b_ref(a_1,a_2) b[b_subscr(a_1,a_2)] a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; --work; /* Function Body */ *info = 0; mn = min(*m,*n); lquery = *lwork == -1; if (! (lsame_(trans, "N") || lsame_(trans, "C"))) { *info = -1; } else if (*m < 0) { *info = -2; } else if (*n < 0) { *info = -3; } else if (*nrhs < 0) { *info = -4; } else if (*lda < max(1,*m)) { *info = -6; } else /* if(complicated condition) */ { /* Computing MAX */ i__1 = max(1,*m); if (*ldb < max(i__1,*n)) { *info = -8; } else /* if(complicated condition) */ { /* Computing MAX */ i__1 = 1, i__2 = mn + max(mn,*nrhs); if (*lwork < max(i__1,i__2) && ! lquery) { *info = -10; } } } /* Figure out optimal block size */ if (*info == 0 || *info == -10) { tpsd = TRUE_; if (lsame_(trans, "N")) { tpsd = FALSE_; } if (*m >= *n) { nb = ilaenv_(&c__1, "ZGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1); if (tpsd) { /* Computing MAX */ i__1 = nb, i__2 = ilaenv_(&c__1, "ZUNMQR", "LN", m, nrhs, n, & c_n1, (ftnlen)6, (ftnlen)2); nb = max(i__1,i__2); } else { /* Computing MAX */ i__1 = nb, i__2 = ilaenv_(&c__1, "ZUNMQR", "LC", m, nrhs, n, & c_n1, (ftnlen)6, (ftnlen)2); nb = max(i__1,i__2); } } else { nb = ilaenv_(&c__1, "ZGELQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1); if (tpsd) { /* Computing MAX */ i__1 = nb, i__2 = ilaenv_(&c__1, "ZUNMLQ", "LC", n, nrhs, m, & c_n1, (ftnlen)6, (ftnlen)2); nb = max(i__1,i__2); } else { /* Computing MAX */ i__1 = nb, i__2 = ilaenv_(&c__1, "ZUNMLQ", "LN", n, nrhs, m, & c_n1, (ftnlen)6, (ftnlen)2); nb = max(i__1,i__2); } } /* Computing MAX */ i__1 = 1, i__2 = mn + max(mn,*nrhs) * nb; wsize = max(i__1,i__2); d__1 = (doublereal) wsize; work[1].r = d__1, work[1].i = 0.; } if (*info != 0) { i__1 = -(*info); xerbla_("ZGELS ", &i__1); return 0; } else if (lquery) { return 0; } /* Quick return if possible Computing MIN */ i__1 
= min(*m,*n); if (min(i__1,*nrhs) == 0) { i__1 = max(*m,*n); zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); return 0; } /* Get machine parameters */ smlnum = dlamch_("S") / dlamch_("P"); bignum = 1. / smlnum; dlabad_(&smlnum, &bignum); /* Scale A, B if max element outside range [SMLNUM,BIGNUM] */ anrm = zlange_("M", m, n, &a[a_offset], lda, rwork); iascl = 0; if (anrm > 0. && anrm < smlnum) { /* Scale matrix norm up to SMLNUM */ zlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, info); iascl = 1; } else if (anrm > bignum) { /* Scale matrix norm down to BIGNUM */ zlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, info); iascl = 2; } else if (anrm == 0.) { /* Matrix all zero. Return zero solution. */ i__1 = max(*m,*n); zlaset_("F", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); goto L50; } brow = *m; if (tpsd) { brow = *n; } bnrm = zlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); ibscl = 0; if (bnrm > 0. && bnrm < smlnum) { /* Scale matrix norm up to SMLNUM */ zlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], ldb, info); ibscl = 1; } else if (bnrm > bignum) { /* Scale matrix norm down to BIGNUM */ zlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], ldb, info); ibscl = 2; } if (*m >= *n) { /* compute QR factorization of A */ i__1 = *lwork - mn; zgeqrf_(m, n, &a[a_offset], lda, &work[1], &work[mn + 1], &i__1, info) ; /* workspace at least N, optimally N*NB */ if (! 
tpsd) { /* Least-Squares Problem min || A * X - B || B(1:M,1:NRHS) := Q' * B(1:M,1:NRHS) */ i__1 = *lwork - mn; zunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, &work[1], &b[b_offset], ldb, &work[mn + 1], &i__1, info); /* workspace at least NRHS, optimally NRHS*NB B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, & c_b2, &a[a_offset], lda, &b[b_offset], ldb); scllen = *n; } else { /* Overdetermined system of equations A' * X = B B(1:N,1:NRHS) := inv(R') * B(1:N,1:NRHS) */ ztrsm_("Left", "Upper", "Conjugate transpose", "Non-unit", n, nrhs, &c_b2, &a[a_offset], lda, &b[b_offset], ldb); /* B(N+1:M,1:NRHS) = ZERO */ i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = *n + 1; i__ <= i__2; ++i__) { i__3 = b_subscr(i__, j); b[i__3].r = 0., b[i__3].i = 0.; /* L10: */ } /* L20: */ } /* B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS) */ i__1 = *lwork - mn; zunmqr_("Left", "No transpose", m, nrhs, n, &a[a_offset], lda, & work[1], &b[b_offset], ldb, &work[mn + 1], &i__1, info); /* workspace at least NRHS, optimally NRHS*NB */ scllen = *m; } } else { /* Compute LQ factorization of A */ i__1 = *lwork - mn; zgelqf_(m, n, &a[a_offset], lda, &work[1], &work[mn + 1], &i__1, info) ; /* workspace at least M, optimally M*NB. */ if (! 
tpsd) { /* underdetermined system of equations A * X = B B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ ztrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, & c_b2, &a[a_offset], lda, &b[b_offset], ldb); /* B(M+1:N,1:NRHS) = 0 */ i__1 = *nrhs; for (j = 1; j <= i__1; ++j) { i__2 = *n; for (i__ = *m + 1; i__ <= i__2; ++i__) { i__3 = b_subscr(i__, j); b[i__3].r = 0., b[i__3].i = 0.; /* L30: */ } /* L40: */ } /* B(1:N,1:NRHS) := Q(1:N,:)' * B(1:M,1:NRHS) */ i__1 = *lwork - mn; zunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, &work[1], &b[b_offset], ldb, &work[mn + 1], &i__1, info); /* workspace at least NRHS, optimally NRHS*NB */ scllen = *n; } else { /* overdetermined system min || A' * X - B || B(1:N,1:NRHS) := Q * B(1:N,1:NRHS) */ i__1 = *lwork - mn; zunmlq_("Left", "No transpose", n, nrhs, m, &a[a_offset], lda, & work[1], &b[b_offset], ldb, &work[mn + 1], &i__1, info); /* workspace at least NRHS, optimally NRHS*NB B(1:M,1:NRHS) := inv(L') * B(1:M,1:NRHS) */ ztrsm_("Left", "Lower", "Conjugate transpose", "Non-unit", m, nrhs, &c_b2, &a[a_offset], lda, &b[b_offset], ldb); scllen = *m; } } /* Undo scaling */ if (iascl == 1) { zlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] , ldb, info); } else if (iascl == 2) { zlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] , ldb, info); } if (ibscl == 1) { zlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] , ldb, info); } else if (ibscl == 2) { zlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] , ldb, info); } L50: d__1 = (doublereal) wsize; work[1].r = d__1, work[1].i = 0.; return 0; /* End of ZGELS */ } /* zgels_ */
/* Subroutine */ int zgetrs_(char *trans, integer *n, integer *nrhs,
	doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b,
	integer *ldb, integer *info)
{
/*  ZGETRS (C translation of the LAPACK 3.4.0 computational routine)

    Solves A * X = B, A**T * X = B or A**H * X = B using the triangular
    factors stored in `a` and the row interchanges in `ipiv`.
    B is overwritten with X.

    trans selects the system: "N" takes the no-transpose path; "T" and
    "C" are forwarded unchanged to ztrsm_ as its transpose option.

    On an illegal argument *info is set to -i, where i is the position
    of the argument, xerbla_ is called with i, and nothing is solved.
    The function value itself is always 0.

    c__1, c_n1 and c_b1 are constants defined outside this function.
*/
    extern logical lsame_(char *, char *);
    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
	    doublecomplex *, integer *), xerbla_(char *, integer *);
    extern /* Subroutine */ int zlaswp_(integer *, doublecomplex *,
	    integer *, integer *, integer *, integer *, integer *);

    logical no_transpose;
    integer bad_arg;
    integer min_ld;

    *info = 0;
    no_transpose = lsame_(trans, "N");
    min_ld = max(1,*n);

    /* Position of the first offending argument, 0 if all are fine. */
    bad_arg = 0;
    if (! no_transpose && ! lsame_(trans, "T") && ! lsame_(trans, "C")) {
	bad_arg = 1;
    } else if (*n < 0) {
	bad_arg = 2;
    } else if (*nrhs < 0) {
	bad_arg = 3;
    } else if (*lda < min_ld) {
	bad_arg = 5;
    } else if (*ldb < min_ld) {
	bad_arg = 8;
    }
    if (bad_arg != 0) {
	*info = -bad_arg;
	xerbla_("ZGETRS", &bad_arg);
	return 0;
    }

    /* Nothing to solve for an empty system or no right hand sides. */
    if (*n == 0 || *nrhs == 0) {
	return 0;
    }

    if (no_transpose) {
	/* A * X = B: apply the row interchanges to B, then solve with
	   the unit lower factor followed by the upper factor. */
	zlaswp_(nrhs, b, ldb, &c__1, n, ipiv, &c__1);
	ztrsm_("Left", "Lower", "No transpose", "Unit", n, nrhs, &c_b1,
		a, lda, b, ldb);
	ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1,
		a, lda, b, ldb);
	return 0;
    }

    /* A**T * X = B or A**H * X = B: solve with the upper factor first,
       then the unit lower factor, and finally apply the row
       interchanges to the solution with the last zlaswp_ argument
       set to c_n1 instead of c__1. */
    ztrsm_("Left", "Upper", trans, "Non-unit", n, nrhs, &c_b1,
	    a, lda, b, ldb);
    ztrsm_("Left", "Lower", trans, "Unit", n, nrhs, &c_b1,
	    a, lda, b, ldb);
    zlaswp_(nrhs, b, ldb, &c__1, n, ipiv, &c_n1);

    return 0;

/*     End of ZGETRS */

}
/*
 * ztrsm: by-value convenience wrapper around ztrsm_.
 *
 * ztrsm_ takes every argument by pointer. This wrapper accepts the option
 * characters and integer dimensions by value and passes the addresses of
 * its own copies; alpha, a and b are forwarded as given.
 */
void ztrsm(char side, char uplo, char transa, char diag,
           int m, int n,
           doublecomplex *alpha,
           doublecomplex *a, int lda,
           doublecomplex *b, int ldb)
{
    ztrsm_(&side, &uplo, &transa, &diag,
           &m, &n,
           alpha,
           a, &lda,
           b, &ldb);
}
/* Subroutine */ int zpbtrf_(char *uplo, integer *n, integer *kd,
        doublecomplex *ab, integer *ldab, integer *info)
{
/*  -- LAPACK routine (version 3.0) --
       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
       Courant Institute, Argonne National Lab, and Rice University
       September 30, 1994

    Purpose
    =======

    ZPBTRF computes the Cholesky factorization of a complex Hermitian
    positive definite band matrix A.

    The factorization has the form
       A = U**H * U,  if UPLO = 'U', or
       A = L  * L**H,  if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    Arguments
    =========

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    KD      (input) INTEGER
            The number of superdiagonals of the matrix A if UPLO = 'U',
            or the number of subdiagonals if UPLO = 'L'.  KD >= 0.

    AB      (input/output) COMPLEX*16 array, dimension (LDAB,N)
            On entry, the upper or lower triangle of the Hermitian band
            matrix A, stored in the first KD+1 rows of the array.  The
            j-th column of A is stored in the j-th column of the array AB
            as follows:
            if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j;
            if UPLO = 'L', AB(1+i-j,j)    = A(i,j) for j<=i<=min(n,j+kd).

            On exit, if INFO = 0, the triangular factor U or L from the
            Cholesky factorization A = U**H*U or A = L*L**H of the band
            matrix A, in the same storage format as A.

    LDAB    (input) INTEGER
            The leading dimension of the array AB.  LDAB >= KD+1.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    Further Details
    ===============

    The band storage scheme is illustrated by the following example, when
    N = 6, KD = 2, and UPLO = 'U':

    On entry:                       On exit:

        *    *   a13  a24  a35  a46      *    *   u13  u24  u35  u46
        *   a12  a23  a34  a45  a56      *   u12  u23  u34  u45  u56
       a11  a22  a33  a44  a55  a66     u11  u22  u33  u44  u55  u66

    Similarly, if UPLO = 'L' the format of A is as follows:

    On entry:                       On exit:

       a11  a22  a33  a44  a55  a66     l11  l22  l33  l44  l55  l66
       a21  a32  a43  a54  a65   *      l21  l32  l43  l54  l65   *
       a31  a42  a53  a64   *    *      l31  l42  l53  l64   *    *

    Array elements marked * are not used by the routine.

    Contributed by
    Peter Mayes and Giuseppe Radicati, IBM ECSEC, Rome, March 23, 1989

    Implementation note
    ===================

    All scratch variables, including the 33-by-32 WORK array, are
    automatic (not static) so that the routine keeps no state between
    calls.  Every WORK element that is read is written earlier in the
    same call: the strict triangle is zeroed up front and the other
    triangle is copied in from AB before each use.

    =====================================================================
*/
    /* Table of constant values (passed by address to the BLAS) */
    static doublecomplex c_b1 = {1.,0.};
    static integer c__1 = 1;
    static integer c_n1 = -1;
    static doublereal c_b21 = -1.;
    static doublereal c_b22 = 1.;
    static integer c__33 = 33;

    /* System generated locals */
    integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4, i__5, i__6;
    doublecomplex z__1;
    /* Local variables */
    doublecomplex work[1056]    /* was [33][32] */;
    integer i__, j;
    extern logical lsame_(char *, char *);
    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
            integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *, doublecomplex *, doublecomplex *,
            integer *), zherk_(char *, char *, integer *, integer *,
            doublereal *, doublecomplex *, integer *, doublereal *,
            doublecomplex *, integer *);
    integer i2, i3;
    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
            integer *, integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *), zpbtf2_(char *, integer *,
            integer *, doublecomplex *, integer *, integer *);
    integer ib, nb, ii, jj;
    extern /* Subroutine */ int zpotf2_(char *, integer *, doublecomplex *,
            integer *, integer *), xerbla_(char *, integer *);
    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
            integer *, integer *, ftnlen, ftnlen);
/* 1-based Fortran-style subscripts into WORK (leading dimension 33) and AB */
#define work_subscr(a_1,a_2) (a_2)*33 + a_1 - 34
#define work_ref(a_1,a_2) work[work_subscr(a_1,a_2)]
#define ab_subscr(a_1,a_2) (a_2)*ab_dim1 + a_1
#define ab_ref(a_1,a_2) ab[ab_subscr(a_1,a_2)]

    /* Parameter adjustments: shift AB so that ab[i + j*ab_dim1] is AB(i,j) */
    ab_dim1 = *ldab;
    ab_offset = 1 + ab_dim1 * 1;
    ab -= ab_offset;

    /* Test the input parameters. */
    *info = 0;
    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
        *info = -1;
    } else if (*n < 0) {
        *info = -2;
    } else if (*kd < 0) {
        *info = -3;
    } else if (*ldab < *kd + 1) {
        *info = -5;
    }
    if (*info != 0) {
        i__1 = -(*info);
        xerbla_("ZPBTRF", &i__1);
        return 0;
    }

    /* Quick return if possible */
    if (*n == 0) {
        return 0;
    }

    /* Determine the block size for this environment */
    nb = ilaenv_(&c__1, "ZPBTRF", uplo, n, kd, &c_n1, &c_n1, (ftnlen)6,
            (ftnlen)1);

    /* The block size must not exceed the semi-bandwidth KD, and must not
       exceed the limit set by the size of the local array WORK. */
    nb = min(nb,32);

    if (nb <= 1 || nb > *kd) {

        /* Use unblocked code */
        zpbtf2_(uplo, n, kd, &ab[ab_offset], ldab, info);

    } else {

        /* Use blocked code */
        if (lsame_(uplo, "U")) {

            /* Compute the Cholesky factorization of a Hermitian band
               matrix, given the upper triangle of the matrix in band
               storage.

               Zero the upper triangle of the work array. */
            i__1 = nb;
            for (j = 1; j <= i__1; ++j) {
                i__2 = j - 1;
                for (i__ = 1; i__ <= i__2; ++i__) {
                    i__3 = work_subscr(i__, j);
                    work[i__3].r = 0., work[i__3].i = 0.;
                }
            }

            /* Process the band matrix one diagonal block at a time. */
            i__1 = *n;
            i__2 = nb;
            for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
                /* Computing MIN */
                i__3 = nb, i__4 = *n - i__ + 1;
                ib = min(i__3,i__4);

                /* Factorize the diagonal block */
                i__3 = *ldab - 1;
                zpotf2_(uplo, &ib, &ab_ref(*kd + 1, i__), &i__3, &ii);
                if (ii != 0) {
                    /* Block-local failure index -> global minor order */
                    *info = i__ + ii - 1;
                    return 0;
                }
                if (i__ + ib <= *n) {

                    /* Update the relevant part of the trailing submatrix.
                       If A11 denotes the diagonal block which has just
                       been factorized, then we need to update the
                       remaining blocks in the diagram:

                          A11   A12   A13
                                A22   A23
                                      A33

                       The numbers of rows and columns in the partitioning
                       are IB, I2, I3 respectively.  The blocks A12, A22
                       and A23 are empty if IB = KD.  The upper triangle
                       of A13 lies outside the band. */

                    /* Computing MIN */
                    i__3 = *kd - ib, i__4 = *n - i__ - ib + 1;
                    i2 = min(i__3,i__4);
                    /* Computing MIN */
                    i__3 = ib, i__4 = *n - i__ - *kd + 1;
                    i3 = min(i__3,i__4);

                    if (i2 > 0) {

                        /* Update A12 */
                        i__3 = *ldab - 1;
                        i__4 = *ldab - 1;
                        ztrsm_("Left", "Upper", "Conjugate transpose",
                                "Non-unit", &ib, &i2, &c_b1,
                                &ab_ref(*kd + 1, i__), &i__3,
                                &ab_ref(*kd + 1 - ib, i__ + ib), &i__4);

                        /* Update A22 */
                        i__3 = *ldab - 1;
                        i__4 = *ldab - 1;
                        zherk_("Upper", "Conjugate transpose", &i2, &ib,
                                &c_b21, &ab_ref(*kd + 1 - ib, i__ + ib),
                                &i__3, &c_b22, &ab_ref(*kd + 1, i__ + ib),
                                &i__4);
                    }

                    if (i3 > 0) {

                        /* Copy the lower triangle of A13 into the work
                           array. */
                        i__3 = i3;
                        for (jj = 1; jj <= i__3; ++jj) {
                            i__4 = ib;
                            for (ii = jj; ii <= i__4; ++ii) {
                                i__5 = work_subscr(ii, jj);
                                i__6 = ab_subscr(ii - jj + 1,
                                        jj + i__ + *kd - 1);
                                work[i__5].r = ab[i__6].r,
                                        work[i__5].i = ab[i__6].i;
                            }
                        }

                        /* Update A13 (in the work array). */
                        i__3 = *ldab - 1;
                        ztrsm_("Left", "Upper", "Conjugate transpose",
                                "Non-unit", &ib, &i3, &c_b1,
                                &ab_ref(*kd + 1, i__), &i__3, work, &c__33);

                        /* Update A23 */
                        if (i2 > 0) {
                            z__1.r = -1., z__1.i = 0.;
                            i__3 = *ldab - 1;
                            i__4 = *ldab - 1;
                            zgemm_("Conjugate transpose", "No transpose",
                                    &i2, &i3, &ib, &z__1,
                                    &ab_ref(*kd + 1 - ib, i__ + ib), &i__3,
                                    work, &c__33, &c_b1,
                                    &ab_ref(ib + 1, i__ + *kd), &i__4);
                        }

                        /* Update A33 */
                        i__3 = *ldab - 1;
                        zherk_("Upper", "Conjugate transpose", &i3, &ib,
                                &c_b21, work, &c__33, &c_b22,
                                &ab_ref(*kd + 1, i__ + *kd), &i__3);

                        /* Copy the lower triangle of A13 back into
                           place. */
                        i__3 = i3;
                        for (jj = 1; jj <= i__3; ++jj) {
                            i__4 = ib;
                            for (ii = jj; ii <= i__4; ++ii) {
                                i__5 = ab_subscr(ii - jj + 1,
                                        jj + i__ + *kd - 1);
                                i__6 = work_subscr(ii, jj);
                                ab[i__5].r = work[i__6].r,
                                        ab[i__5].i = work[i__6].i;
                            }
                        }
                    }
                }
            }
        } else {

            /* Compute the Cholesky factorization of a Hermitian band
               matrix, given the lower triangle of the matrix in band
               storage.

               Zero the lower triangle of the work array. */
            i__2 = nb;
            for (j = 1; j <= i__2; ++j) {
                i__1 = nb;
                for (i__ = j + 1; i__ <= i__1; ++i__) {
                    i__3 = work_subscr(i__, j);
                    work[i__3].r = 0., work[i__3].i = 0.;
                }
            }

            /* Process the band matrix one diagonal block at a time. */
            i__2 = *n;
            i__1 = nb;
            for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
                /* Computing MIN */
                i__3 = nb, i__4 = *n - i__ + 1;
                ib = min(i__3,i__4);

                /* Factorize the diagonal block */
                i__3 = *ldab - 1;
                zpotf2_(uplo, &ib, &ab_ref(1, i__), &i__3, &ii);
                if (ii != 0) {
                    /* Block-local failure index -> global minor order */
                    *info = i__ + ii - 1;
                    return 0;
                }
                if (i__ + ib <= *n) {

                    /* Update the relevant part of the trailing submatrix.
                       If A11 denotes the diagonal block which has just
                       been factorized, then we need to update the
                       remaining blocks in the diagram:

                          A11
                          A21   A22
                          A31   A32   A33

                       The numbers of rows and columns in the partitioning
                       are IB, I2, I3 respectively.  The blocks A21, A22
                       and A32 are empty if IB = KD.  The lower triangle
                       of A31 lies outside the band. */

                    /* Computing MIN */
                    i__3 = *kd - ib, i__4 = *n - i__ - ib + 1;
                    i2 = min(i__3,i__4);
                    /* Computing MIN */
                    i__3 = ib, i__4 = *n - i__ - *kd + 1;
                    i3 = min(i__3,i__4);

                    if (i2 > 0) {

                        /* Update A21 */
                        i__3 = *ldab - 1;
                        i__4 = *ldab - 1;
                        ztrsm_("Right", "Lower", "Conjugate transpose",
                                "Non-unit", &i2, &ib, &c_b1,
                                &ab_ref(1, i__), &i__3,
                                &ab_ref(ib + 1, i__), &i__4);

                        /* Update A22 */
                        i__3 = *ldab - 1;
                        i__4 = *ldab - 1;
                        zherk_("Lower", "No transpose", &i2, &ib, &c_b21,
                                &ab_ref(ib + 1, i__), &i__3, &c_b22,
                                &ab_ref(1, i__ + ib), &i__4);
                    }

                    if (i3 > 0) {

                        /* Copy the upper triangle of A31 into the work
                           array. */
                        i__3 = ib;
                        for (jj = 1; jj <= i__3; ++jj) {
                            i__4 = min(jj,i3);
                            for (ii = 1; ii <= i__4; ++ii) {
                                i__5 = work_subscr(ii, jj);
                                i__6 = ab_subscr(*kd + 1 - jj + ii,
                                        jj + i__ - 1);
                                work[i__5].r = ab[i__6].r,
                                        work[i__5].i = ab[i__6].i;
                            }
                        }

                        /* Update A31 (in the work array). */
                        i__3 = *ldab - 1;
                        ztrsm_("Right", "Lower", "Conjugate transpose",
                                "Non-unit", &i3, &ib, &c_b1,
                                &ab_ref(1, i__), &i__3, work, &c__33);

                        /* Update A32 */
                        if (i2 > 0) {
                            z__1.r = -1., z__1.i = 0.;
                            i__3 = *ldab - 1;
                            i__4 = *ldab - 1;
                            zgemm_("No transpose", "Conjugate transpose",
                                    &i3, &i2, &ib, &z__1, work, &c__33,
                                    &ab_ref(ib + 1, i__), &i__3, &c_b1,
                                    &ab_ref(*kd + 1 - ib, i__ + ib), &i__4);
                        }

                        /* Update A33 */
                        i__3 = *ldab - 1;
                        zherk_("Lower", "No transpose", &i3, &ib, &c_b21,
                                work, &c__33, &c_b22,
                                &ab_ref(1, i__ + *kd), &i__3);

                        /* Copy the upper triangle of A31 back into
                           place. */
                        i__3 = ib;
                        for (jj = 1; jj <= i__3; ++jj) {
                            i__4 = min(jj,i3);
                            for (ii = 1; ii <= i__4; ++ii) {
                                i__5 = ab_subscr(*kd + 1 - jj + ii,
                                        jj + i__ - 1);
                                i__6 = work_subscr(ii, jj);
                                ab[i__5].r = work[i__6].r,
                                        ab[i__5].i = work[i__6].i;
                            }
                        }
                    }
                }
            }
        }
    }
    return 0;

/*     End of ZPBTRF */

} /* zpbtrf_ */
int zgetrf_(int *m, int *n, doublecomplex *a, int *lda, int *ipiv, int *info)
{
/*  -- LAPACK routine (version 3.2) -- */
/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
/*     November 2006 */

/*  Purpose */
/*  ======= */

/*  ZGETRF factors a general M-by-N matrix A as  A = P * L * U  using */
/*  partial pivoting with row interchanges.  P is a permutation matrix, */
/*  L is lower triangular with unit diagonal (lower trapezoidal when */
/*  m > n) and U is upper triangular (upper trapezoidal when m < n). */
/*  This is the right-looking Level 3 BLAS variant. */

/*  Arguments */
/*  ========= */

/*  M       (input) INTEGER   Number of rows of A.  M >= 0. */
/*  N       (input) INTEGER   Number of columns of A.  N >= 0. */
/*  A       (input/output) COMPLEX*16 array, dimension (LDA,N) */
/*          On entry, the matrix to factor.  On exit, the factors L and */
/*          U; the unit diagonal of L is not stored. */
/*  LDA     (input) INTEGER   Leading dimension of A.  LDA >= MAX(1,M). */
/*  IPIV    (output) INTEGER array, dimension (MIN(M,N)) */
/*          Pivot indices: row i was interchanged with row IPIV(i). */
/*  INFO    (output) INTEGER */
/*          = 0: successful exit */
/*          < 0: if INFO = -i, the i-th argument had an illegal value */
/*          > 0: if INFO = i, U(i,i) is exactly zero.  The factorization */
/*               has been completed, but U is exactly singular and */
/*               division by zero will occur if it is used to solve a */
/*               system of equations. */

/*  ===================================================================== */

    /* Column stride / base offset used for 1-based indexing of A */
    int a_dim1, a_offset;
    /* Scalars that are handed to the callees by address */
    int bad_arg, panel_rows, panel_last, cols_before, trail_cols, trail_rows;
    doublecomplex minus_one;
    /* Plain working scalars */
    int min_mn, remaining, m_rows, last_row;
    int row, j, jb, nb, iinfo;

    extern int zgemm_(char *, char *, int *, int *, int *, doublecomplex *,
            doublecomplex *, int *, doublecomplex *, int *, doublecomplex *,
            doublecomplex *, int *), ztrsm_(char *, char *, char *, char *,
            int *, int *, doublecomplex *, doublecomplex *, int *,
            doublecomplex *, int *), zgetf2_(int *, int *, doublecomplex *,
            int *, int *, int *), xerbla_(char *, int *);
    extern int ilaenv_(int *, char *, char *, int *, int *, int *, int *);
    extern int zlaswp_(int *, doublecomplex *, int *, int *, int *, int *,
            int *);

    /* Shift the pointers so that a[i + j*a_dim1] is A(i,j) and ipiv[i] is
       IPIV(i), both 1-based. */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;

    /* Validate the arguments; the first offending one is reported. */
    *info = 0;
    if (*m < 0) {
        *info = -1;
    } else if (*n < 0) {
        *info = -2;
    } else if (*lda < MAX(1,*m)) {
        *info = -4;
    }
    if (*info != 0) {
        bad_arg = -(*info);
        xerbla_("ZGETRF", &bad_arg);
        return 0;
    }

    /* Nothing to do for an empty matrix. */
    if (*m == 0 || *n == 0) {
        return 0;
    }

    /* Ask the environment for the block size. */
    nb = ilaenv_(&c__1, "ZGETRF", " ", m, n, &c_n1, &c_n1);
    min_mn = MIN(*m,*n);

    if (nb <= 1 || nb >= min_mn) {
        /* Blocking would not pay off: factor everything unblocked. */
        zgetf2_(m, n, &a[a_offset], lda, &ipiv[1], info);
        return 0;
    }

    /* Blocked path; nb > 1 here, so the panel index strictly increases. */
    for (j = 1; j <= min_mn; j += nb) {
        remaining = min_mn - j + 1;
        jb = MIN(remaining,nb);

        /* Factor the diagonal and subdiagonal blocks of this panel and
           test for exact singularity. */
        panel_rows = *m - j + 1;
        zgetf2_(&panel_rows, &jb, &a[j + j * a_dim1], lda, &ipiv[j], &iinfo);

        /* Keep only the first singularity, translated to a global index. */
        if (*info == 0 && iinfo > 0) {
            *info = iinfo + j - 1;
        }

        /* Panel-relative pivot rows -> global row numbers. */
        panel_last = j + jb - 1;
        m_rows = *m;
        last_row = MIN(m_rows,panel_last);
        for (row = j; row <= last_row; ++row) {
            ipiv[row] += j - 1;
        }

        /* Apply the interchanges to columns 1:J-1. */
        cols_before = j - 1;
        zlaswp_(&cols_before, &a[a_offset], lda, &j, &panel_last, &ipiv[1],
                &c__1);

        if (j + jb > *n) {
            continue;       /* no columns to the right of this panel */
        }

        /* Apply the interchanges to columns J+JB:N. */
        trail_cols = *n - j - jb + 1;
        zlaswp_(&trail_cols, &a[(j + jb) * a_dim1 + 1], lda, &j, &panel_last,
                &ipiv[1], &c__1);

        /* Compute the block row of U. */
        ztrsm_("Left", "Lower", "No transpose", "Unit", &jb, &trail_cols,
                &c_b1, &a[j + j * a_dim1], lda, &a[j + (j + jb) * a_dim1],
                lda);

        if (j + jb > *m) {
            continue;       /* no rows below this panel */
        }

        /* Update the trailing submatrix. */
        trail_rows = *m - j - jb + 1;
        minus_one.r = -1., minus_one.i = -0.;
        zgemm_("No transpose", "No transpose", &trail_rows, &trail_cols, &jb,
                &minus_one, &a[j + jb + j * a_dim1], lda,
                &a[j + (j + jb) * a_dim1], lda, &c_b1,
                &a[j + jb + (j + jb) * a_dim1], lda);
    }
    return 0;

/*     End of ZGETRF */

} /* zgetrf_ */
int main( int argc, char** argv ) { obj_t a, c; obj_t c_save; obj_t alpha; dim_t m, n; dim_t p; dim_t p_begin, p_max, p_inc; int m_input, n_input; ind_t ind; num_t dt; char dt_ch; int r, n_repeats; side_t side; uplo_t uploa; trans_t transa; diag_t diaga; f77_char f77_side; f77_char f77_uploa; f77_char f77_transa; f77_char f77_diaga; double dtime; double dtime_save; double gflops; //bli_init(); //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); n_repeats = 3; dt = DT; ind = IND; p_begin = P_BEGIN; p_max = P_MAX; p_inc = P_INC; m_input = -1; n_input = -1; // Supress compiler warnings about unused variable 'ind'. ( void )ind; #if 0 cntx_t* cntx; ind_t ind_mod = ind; // A hack to use 3m1 as 1mpb (with 1m as 1mbp). if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. cntx = bli_gks_query_ind_cntx( ind_mod, dt ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); #elif 1 //k_input = 256; #endif // Choose the char corresponding to the requested datatype. if ( bli_is_float( dt ) ) dt_ch = 's'; else if ( bli_is_double( dt ) ) dt_ch = 'd'; else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; else dt_ch = 'z'; #if 0 side = BLIS_LEFT; #else side = BLIS_RIGHT; #endif #if 0 uploa = BLIS_LOWER; #else uploa = BLIS_UPPER; #endif transa = BLIS_NO_TRANSPOSE; diaga = BLIS_NONUNIT_DIAG; bli_param_map_blis_to_netlib_side( side, &f77_side ); bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga ); // Begin with initializing the last entry to zero so that // matlab allocates space for the entire array once up-front. 
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", ( unsigned long )(p - p_begin + 1)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); for ( p = p_begin; p <= p_max; p += p_inc ) { if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); else m = ( dim_t ) m_input; if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); else n = ( dim_t ) n_input; bli_obj_create( dt, 1, 1, 0, 0, &alpha ); if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, 0, 0, &a ); else bli_obj_create( dt, n, n, 0, 0, &a ); bli_obj_create( dt, m, n, 0, 0, &c ); //bli_obj_create( dt, m, n, n, 1, &c ); bli_obj_create( dt, m, n, 0, 0, &c_save ); bli_randm( &a ); bli_randm( &c ); bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( uploa, &a ); bli_obj_set_conjtrans( transa, &a ); bli_obj_set_diag( diaga, &a ); bli_randm( &a ); bli_mktrim( &a ); // Load the diagonal of A to make it more likely to be invertible. bli_shiftd( &BLIS_TWO, &a ); bli_setsc( (2.0/1.0), 0.0, &alpha ); bli_copym( &c, &c_save ); #if 0 //def BLIS bli_ind_disable_all_dt( dt ); bli_ind_enable_dt( ind, dt ); #endif dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { bli_copym( &c_save, &c ); dtime = bli_clock(); #ifdef PRINT bli_printm( "a", &a, "%4.1f", "" ); bli_printm( "c", &c, "%4.1f", "" ); #endif #ifdef BLIS bli_trsm( side, &alpha, &a, &c ); #else if ( bli_is_float( dt ) ) { f77_int mm = bli_obj_length( &c ); f77_int kk = bli_obj_width( &c ); f77_int lda = bli_obj_col_stride( &a ); f77_int ldc = bli_obj_col_stride( &c ); float* alphap = ( float* )bli_obj_buffer( &alpha ); float* ap = ( float* )bli_obj_buffer( &a ); float* cp = ( float* )bli_obj_buffer( &c ); strsm_( &f77_side, &f77_uploa, &f77_transa, &f77_diaga, &mm, &kk, alphap, ap, &lda, cp, &ldc ); } else if ( bli_is_double( dt ) ) { f77_int mm = bli_obj_length( &c ); f77_int kk = bli_obj_width( &c ); f77_int lda = bli_obj_col_stride( &a ); f77_int ldc = 
bli_obj_col_stride( &c ); double* alphap = ( double* )bli_obj_buffer( &alpha ); double* ap = ( double* )bli_obj_buffer( &a ); double* cp = ( double* )bli_obj_buffer( &c ); dtrsm_( &f77_side, &f77_uploa, &f77_transa, &f77_diaga, &mm, &kk, alphap, ap, &lda, cp, &ldc ); } else if ( bli_is_scomplex( dt ) ) { f77_int mm = bli_obj_length( &c ); f77_int kk = bli_obj_width( &c ); f77_int lda = bli_obj_col_stride( &a ); f77_int ldc = bli_obj_col_stride( &c ); #ifdef EIGEN float* alphap = ( float* )bli_obj_buffer( &alpha ); float* ap = ( float* )bli_obj_buffer( &a ); float* cp = ( float* )bli_obj_buffer( &c ); #else scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha ); scomplex* ap = ( scomplex* )bli_obj_buffer( &a ); scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); #endif ctrsm_( &f77_side, &f77_uploa, &f77_transa, &f77_diaga, &mm, &kk, alphap, ap, &lda, cp, &ldc ); } else if ( bli_is_dcomplex( dt ) ) { f77_int mm = bli_obj_length( &c ); f77_int kk = bli_obj_width( &c ); f77_int lda = bli_obj_col_stride( &a ); f77_int ldc = bli_obj_col_stride( &c ); #ifdef EIGEN double* alphap = ( double* )bli_obj_buffer( &alpha ); double* ap = ( double* )bli_obj_buffer( &a ); double* cp = ( double* )bli_obj_buffer( &c ); #else dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha ); dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a ); dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); #endif ztrsm_( &f77_side, &f77_uploa, &f77_transa, &f77_diaga, &mm, &kk, alphap, ap, &lda, cp, &ldc ); } #endif #ifdef PRINT bli_printm( "c after", &c, "%4.1f", "" ); exit(1); #endif dtime_save = bli_clock_min_diff( dtime_save, dtime ); } if ( bli_is_left( side ) ) gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); else gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); if ( bli_is_complex( dt ) ) gflops *= 4.0; printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", ( unsigned long )(p - p_begin + 1)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, 
gflops ); bli_obj_free( &alpha ); bli_obj_free( &a ); bli_obj_free( &c ); bli_obj_free( &c_save ); } //bli_finalize(); return 0; }
/* Subroutine */ int zgetri_(integer *n, doublecomplex *a, integer *lda,
        integer *ipiv, doublecomplex *work, integer *lwork, integer *info)
{
/*  -- LAPACK routine (version 3.0) -- */
/*     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., */
/*     Courant Institute, Argonne National Lab, and Rice University */
/*     June 30, 1999 */

/*  Purpose */
/*  ======= */

/*  ZGETRI computes the inverse of a matrix using the LU factorization */
/*  computed by ZGETRF. */

/*  This method inverts U and then computes inv(A) by solving the system */
/*  inv(A)*L = inv(U) for inv(A). */

/*  Arguments */
/*  ========= */

/*  N       (input) INTEGER */
/*          The order of the matrix A.  N >= 0. */

/*  A       (input/output) COMPLEX*16 array, dimension (LDA,N) */
/*          On entry, the factors L and U from the factorization */
/*          A = P*L*U as computed by ZGETRF. */
/*          On exit, if INFO = 0, the inverse of the original matrix A. */

/*  LDA     (input) INTEGER */
/*          The leading dimension of the array A.  LDA >= max(1,N). */

/*  IPIV    (input) INTEGER array, dimension (N) */
/*          The pivot indices from ZGETRF; for 1<=i<=N, row i of the */
/*          matrix was interchanged with row IPIV(i). */

/*  WORK    (workspace/output) COMPLEX*16 array, dimension (LWORK) */
/*          On exit, if INFO=0, then WORK(1) returns the optimal LWORK. */

/*  LWORK   (input) INTEGER */
/*          The dimension of the array WORK.  LWORK >= max(1,N). */
/*          For optimal performance LWORK >= N*NB, where NB is */
/*          the optimal blocksize returned by ILAENV. */

/*          If LWORK = -1, then a workspace query is assumed; the routine */
/*          only calculates the optimal size of the WORK array, returns */
/*          this value as the first entry of the WORK array, and no error */
/*          message related to LWORK is issued by XERBLA. */

/*  INFO    (output) INTEGER */
/*          = 0:  successful exit */
/*          < 0:  if INFO = -i, the i-th argument had an illegal value */
/*          > 0:  if INFO = i, U(i,i) is exactly zero; the matrix is */
/*                singular and its inverse could not be computed. */

/*  Implementation note */
/*  =================== */

/*  All local scalars are automatic (not static), so the routine keeps no */
/*  state between calls; each of them is assigned before it is read. */

/*  ===================================================================== */

    /* System generated locals */
    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
    doublecomplex z__1;

    /* Local variables */
    integer i__, j, jb, nb, jj, jp, nn, iws, nbmin;
    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
            integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *, doublecomplex *, doublecomplex *,
            integer *, ftnlen, ftnlen), zgemv_(char *, integer *, integer *,
            doublecomplex *, doublecomplex *, integer *, doublecomplex *,
            integer *, doublecomplex *, doublecomplex *, integer *, ftnlen),
            zswap_(integer *, doublecomplex *, integer *, doublecomplex *,
            integer *), ztrsm_(char *, char *, char *, char *, integer *,
            integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen),
            xerbla_(char *, integer *, ftnlen);
    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
            integer *, integer *, ftnlen, ftnlen);
    integer ldwork, lwkopt;
    logical lquery;
    extern /* Subroutine */ int ztrtri_(char *, char *, integer *,
            doublecomplex *, integer *, integer *, ftnlen, ftnlen);

    /* Parameter adjustments: make a, ipiv and work 1-based */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    --work;

    /* Test the input parameters.  The optimal workspace size is stored in
       WORK(1) before validation so that a workspace query can return it. */
    *info = 0;
    nb = ilaenv_(&c__1, "ZGETRI", " ", n, &c_n1, &c_n1, &c_n1, (ftnlen)6,
            (ftnlen)1);
    lwkopt = *n * nb;
    work[1].r = (doublereal) lwkopt, work[1].i = 0.;
    lquery = *lwork == -1;
    if (*n < 0) {
        *info = -1;
    } else if (*lda < max(1,*n)) {
        *info = -3;
    } else if (*lwork < max(1,*n) && ! lquery) {
        *info = -6;
    }
    if (*info != 0) {
        i__1 = -(*info);
        xerbla_("ZGETRI", &i__1, (ftnlen)6);
        return 0;
    } else if (lquery) {
        return 0;
    }

    /* Quick return if possible */
    if (*n == 0) {
        return 0;
    }

    /* Form inv(U).  If INFO > 0 from ZTRTRI, then U is singular, */
    /* and the inverse is not computed. */
    ztrtri_("Upper", "Non-unit", n, &a[a_offset], lda, info, (ftnlen)5,
            (ftnlen)8);
    if (*info > 0) {
        return 0;
    }

    /* Choose the block size actually used: shrink NB when the supplied
       workspace cannot hold N*NB entries. */
    nbmin = 2;
    ldwork = *n;
    if (nb > 1 && nb < *n) {
        /* Computing MAX */
        i__1 = ldwork * nb;
        iws = max(i__1,1);
        if (*lwork < iws) {
            nb = *lwork / ldwork;
            /* Computing MAX */
            i__1 = 2, i__2 = ilaenv_(&c__2, "ZGETRI", " ", n, &c_n1, &c_n1,
                    &c_n1, (ftnlen)6, (ftnlen)1);
            nbmin = max(i__1,i__2);
        }
    } else {
        iws = *n;
    }

    /* Solve the equation inv(A)*L = inv(U) for inv(A). */
    if (nb < nbmin || nb >= *n) {

        /* Use unblocked code. */
        for (j = *n; j >= 1; --j) {

            /* Copy current column of L to WORK and replace with zeros. */
            i__1 = *n;
            for (i__ = j + 1; i__ <= i__1; ++i__) {
                i__2 = i__;
                i__3 = i__ + j * a_dim1;
                work[i__2].r = a[i__3].r, work[i__2].i = a[i__3].i;
                i__2 = i__ + j * a_dim1;
                a[i__2].r = 0., a[i__2].i = 0.;
            }

            /* Compute current column of inv(A). */
            if (j < *n) {
                i__1 = *n - j;
                z__1.r = -1., z__1.i = -0.;
                zgemv_("No transpose", n, &i__1, &z__1,
                        &a[(j + 1) * a_dim1 + 1], lda, &work[j + 1], &c__1,
                        &c_b2, &a[j * a_dim1 + 1], &c__1, (ftnlen)12);
            }
        }
    } else {

        /* Use blocked code.  NN is the first column of the last block;
           the loop walks the block columns from right to left. */
        nn = (*n - 1) / nb * nb + 1;
        i__1 = -nb;
        for (j = nn; i__1 < 0 ? j >= 1 : j <= 1; j += i__1) {
            /* Computing MIN */
            i__2 = nb, i__3 = *n - j + 1;
            jb = min(i__2,i__3);

            /* Copy current block column of L to WORK and replace with */
            /* zeros. */
            i__2 = j + jb - 1;
            for (jj = j; jj <= i__2; ++jj) {
                i__3 = *n;
                for (i__ = jj + 1; i__ <= i__3; ++i__) {
                    i__4 = i__ + (jj - j) * ldwork;
                    i__5 = i__ + jj * a_dim1;
                    work[i__4].r = a[i__5].r, work[i__4].i = a[i__5].i;
                    i__4 = i__ + jj * a_dim1;
                    a[i__4].r = 0., a[i__4].i = 0.;
                }
            }

            /* Compute current block column of inv(A). */
            if (j + jb <= *n) {
                i__2 = *n - j - jb + 1;
                z__1.r = -1., z__1.i = -0.;
                zgemm_("No transpose", "No transpose", n, &jb, &i__2, &z__1,
                        &a[(j + jb) * a_dim1 + 1], lda, &work[j + jb],
                        &ldwork, &c_b2, &a[j * a_dim1 + 1], lda, (ftnlen)12,
                        (ftnlen)12);
            }
            ztrsm_("Right", "Lower", "No transpose", "Unit", n, &jb, &c_b2,
                    &work[j], &ldwork, &a[j * a_dim1 + 1], lda, (ftnlen)5,
                    (ftnlen)5, (ftnlen)12, (ftnlen)4);
        }
    }

    /* Apply column interchanges. */
    for (j = *n - 1; j >= 1; --j) {
        jp = ipiv[j];
        if (jp != j) {
            zswap_(n, &a[j * a_dim1 + 1], &c__1, &a[jp * a_dim1 + 1], &c__1);
        }
    }

    /* Report the workspace size that was actually used. */
    work[1].r = (doublereal) iws, work[1].i = 0.;
    return 0;

/*     End of ZGETRI */

} /* zgetri_ */
void
zgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U,
        int *perm_c, int *perm_r, SuperMatrix *B,
        SuperLUStat_t *stat, int *info)
{
/*
 * Purpose
 * =======
 *
 * ZGSTRS solves a system of linear equations A*X=B or A'*X=B
 * with A sparse and B dense, using the LU factorization computed by
 * ZGSTRF.
 *
 * See supermatrix.h for the definition of 'SuperMatrix' structure.
 *
 * Arguments
 * =========
 *
 * trans   (input) trans_t
 *          Specifies the form of the system of equations:
 *          = NOTRANS: A * X = B     (No transpose)
 *          = TRANS:   A'* X = B     (Transpose)
 *          = CONJ:    A**H * X = B  (Conjugate transpose)
 *
 * L       (input) SuperMatrix*
 *         The factor L from the factorization Pr*A*Pc=L*U as computed by
 *         zgstrf(). Use compressed row subscripts storage for supernodes,
 *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
 *
 * U       (input) SuperMatrix*
 *         The factor U from the factorization Pr*A*Pc=L*U as computed by
 *         zgstrf(). Use column-wise storage scheme, i.e., U has types:
 *         Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
 *
 * perm_c  (input) int*, dimension (L->ncol)
 *         Column permutation vector, which defines the
 *         permutation matrix Pc; perm_c[i] = j means column i of A is
 *         in position j in A*Pc.
 *
 * perm_r  (input) int*, dimension (L->nrow)
 *         Row permutation vector, which defines the permutation matrix Pr;
 *         perm_r[i] = j means row i of A is in position j in Pr*A.
 *
 * B       (input/output) SuperMatrix*
 *         B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE.
 *         On entry, the right hand side matrix.
 *         On exit, the solution matrix if info = 0;
 *
 * stat    (output) SuperLUStat_t*
 *         Record the statistics on runtime and floating-point operation count.
 *         See util.h for the definition of 'SuperLUStat_t'.
 *         This routine writes stat->ops[SOLVE].
 *
 * info    (output) int*
 *         = 0: successful exit
 *         < 0: if info = -i, the i-th argument had an illegal value
 *
 */
#ifdef _CRAY
    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
#endif
    int      incx = 1, incy = 1;
#ifdef USE_VENDOR_BLAS
    doublecomplex   alpha = {1.0, 0.0}, beta = {1.0, 0.0};
    doublecomplex   *work_col;
#endif
    doublecomplex   prod;               /* scratch for one complex product */
    DNformat *Bstore;
    doublecomplex   *Bmat;
    /* NOTE(review): the L_* / U_* accessor macros presumably refer to the
       locals named Lstore and Ustore, so these two names are kept. */
    SCformat *Lstore;
    NCformat *Ustore;
    doublecomplex   *Lval, *Uval;
    int      fsupc, nrow, nsupr, nsupc, luptr, istart, row;
    int      i, j, k, sub_idx, col, n, ldb, nrhs;
    doublecomplex   *work, *rhs_col, *scratch;
    doublecomplex   *diag_rhs;
    char     *op;                       /* "T" or "C" for the transposed solves */
    flops_t  solve_ops;
    void zprint_soln();

    /* ---- Argument checks ---- */
    *info = 0;
    Bstore = B->Store;
    ldb = Bstore->lda;
    nrhs = B->ncol;
    if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1;
    else if ( L->nrow != L->ncol || L->nrow < 0 ||
              L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU )
        *info = -2;
    else if ( U->nrow != U->ncol || U->nrow < 0 ||
              U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU )
        *info = -3;
    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
              B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE )
        *info = -6;
    if ( *info ) {
        i = -(*info);
        xerbla_("zgstrs", &i);
        return;
    }

    /* ---- Workspace: an n-by-nrhs zeroed buffer plus one column ---- */
    n = L->nrow;
    work = doublecomplexCalloc(n * nrhs);
    if ( !work ) ABORT("Malloc fails for local work[].");
    scratch = doublecomplexMalloc(n);
    if ( !scratch ) ABORT("Malloc fails for local soln[].");

    Bmat = Bstore->nzval;
    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( trans == NOTRANS ) {

        /* Permute right hand sides to form Pr*B */
        for (j = 0; j < nrhs; j++) {
            rhs_col = &Bmat[j*ldb];
            for (k = 0; k < n; k++) scratch[perm_r[k]] = rhs_col[k];
            for (k = 0; k < n; k++) rhs_col[k] = scratch[k];
        }

        /* Forward solve PLy=Pb, one supernode at a time. */
        for (k = 0; k <= Lstore->nsuper; k++) {
            fsupc = L_FST_SUPC(k);
            istart = L_SUB_START(fsupc);
            nsupr = L_SUB_START(fsupc+1) - istart;
            nsupc = L_FST_SUPC(k+1) - fsupc;
            nrow = nsupr - nsupc;

            solve_ops += 4 * nsupc * (nsupc - 1) * nrhs;
            solve_ops += 8 * nrow * nsupc * nrhs;

            if ( nsupc == 1 ) {
                /* Single-column supernode: subtract the scaled column
                   from the rows listed below the diagonal. */
                for (j = 0; j < nrhs; j++) {
                    rhs_col = &Bmat[j*ldb];
                    luptr = L_NZ_START(fsupc) + 1;
                    for (sub_idx = istart+1; sub_idx < L_SUB_START(fsupc+1);
                         sub_idx++, luptr++) {
                        row = L_SUB(sub_idx);
                        zz_mult(&prod, &rhs_col[fsupc], &Lval[luptr]);
                        z_sub(&rhs_col[row], &rhs_col[row], &prod);
                    }
                }
                continue;
            }

            luptr = L_NZ_START(fsupc);
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
            ftcs1 = _cptofcd("L", strlen("L"));
            ftcs2 = _cptofcd("N", strlen("N"));
            ftcs3 = _cptofcd("U", strlen("U"));
            CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
                   &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

            CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha,
                   &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
                   &beta, &work[0], &n );
#else
            ztrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
                   &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);

            zgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha,
                    &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb,
                    &beta, &work[0], &n );
#endif
            /* Scatter the dense update back through the row subscripts
               and clear work so it can accumulate the next update. */
            for (j = 0; j < nrhs; j++) {
                rhs_col = &Bmat[j*ldb];
                work_col = &work[j*n];
                sub_idx = istart + nsupc;
                for (i = 0; i < nrow; i++, sub_idx++) {
                    row = L_SUB(sub_idx);
                    z_sub(&rhs_col[row], &rhs_col[row], &work_col[i]);
                    work_col[i].r = 0.0;
                    work_col[i].i = 0.0;
                }
            }
#else
            for (j = 0; j < nrhs; j++) {
                rhs_col = &Bmat[j*ldb];
                zlsolve (nsupr, nsupc, &Lval[luptr], &rhs_col[fsupc]);
                zmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
                         &rhs_col[fsupc], &work[0] );

                sub_idx = istart + nsupc;
                for (i = 0; i < nrow; i++, sub_idx++) {
                    row = L_SUB(sub_idx);
                    z_sub(&rhs_col[row], &rhs_col[row], &work[i]);
                    work[i].r = 0.;
                    work[i].i = 0.;
                }
            }
#endif
        } /* for L-solve */

#ifdef DEBUG
        printf("After L-solve: y=\n");
        zprint_soln(n, nrhs, Bmat);
#endif

        /*
         * Back solve Ux=y, walking the supernodes from last to first.
         */
        for (k = Lstore->nsuper; k >= 0; k--) {
            fsupc = L_FST_SUPC(k);
            istart = L_SUB_START(fsupc);
            nsupr = L_SUB_START(fsupc+1) - istart;
            nsupc = L_FST_SUPC(k+1) - fsupc;
            luptr = L_NZ_START(fsupc);

            solve_ops += 4 * nsupc * (nsupc + 1) * nrhs;

            if ( nsupc == 1 ) {
                /* Divide entry fsupc of every right-hand side by the
                   value stored at Lval[luptr]. */
                for (j = 0; j < nrhs; j++) {
                    diag_rhs = &Bmat[fsupc + j*ldb];
                    z_div(diag_rhs, diag_rhs, &Lval[luptr]);
                }
            } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                ftcs1 = _cptofcd("L", strlen("L"));
                ftcs2 = _cptofcd("U", strlen("U"));
                ftcs3 = _cptofcd("N", strlen("N"));
                CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#else
                ztrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
                       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
#endif
#else
                for (j = 0; j < nrhs; j++)
                    zusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] );
#endif
            }

            /* Push the freshly solved entries through the stored
               entries of U for the columns of this supernode. */
            for (j = 0; j < nrhs; ++j) {
                rhs_col = &Bmat[j*ldb];
                for (col = fsupc; col < fsupc + nsupc; col++) {
                    solve_ops += 8*(U_NZ_START(col+1) - U_NZ_START(col));
                    for (i = U_NZ_START(col); i < U_NZ_START(col+1); i++ ){
                        row = U_SUB(i);
                        zz_mult(&prod, &rhs_col[col], &Uval[i]);
                        z_sub(&rhs_col[row], &rhs_col[row], &prod);
                    }
                }
            }

        } /* for U-solve */

#ifdef DEBUG
        printf("After U-solve: x=\n");
        zprint_soln(n, nrhs, Bmat);
#endif

        /* Compute the final solution X := Pc*X. */
        for (j = 0; j < nrhs; j++) {
            rhs_col = &Bmat[j*ldb];
            for (k = 0; k < n; k++) scratch[k] = rhs_col[perm_c[k]];
            for (k = 0; k < n; k++) rhs_col[k] = scratch[k];
        }

        stat->ops[SOLVE] = solve_ops;

    } else { /* Solve A'*X=B or A**H*X=B */

        /* Permute right hand sides to form Pc'*B. */
        for (j = 0; j < nrhs; j++) {
            rhs_col = &Bmat[j*ldb];
            for (k = 0; k < n; k++) scratch[perm_c[k]] = rhs_col[k];
            for (k = 0; k < n; k++) rhs_col[k] = scratch[k];
        }

        stat->ops[SOLVE] = 0;

        /* Plain transpose for TRANS, conjugate transpose otherwise. */
        op = (trans == TRANS) ? "T" : "C";
        for (k = 0; k < nrhs; ++k) {
            /* Multiply by inv(U'). */
            sp_ztrsv("U", op, "N", L, U, &Bmat[k*ldb], stat, info);

            /* Multiply by inv(L'). */
            sp_ztrsv("L", op, "U", L, U, &Bmat[k*ldb], stat, info);
        }

        /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */
        for (j = 0; j < nrhs; j++) {
            rhs_col = &Bmat[j*ldb];
            for (k = 0; k < n; k++) scratch[k] = rhs_col[perm_r[k]];
            for (k = 0; k < n; k++) rhs_col[k] = scratch[k];
        }

    }

    SUPERLU_FREE(work);
    SUPERLU_FREE(scratch);
}
/* Subroutine */ int zhegv_(integer *itype, char *jobz, char *uplo, integer * n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; /* Local variables */ integer nb, neig; char trans[1]; logical upper, wantz; integer lwkopt; logical lquery; /* -- LAPACK driver routine (version 3.2) -- */ /* November 2006 */ /* Purpose */ /* ======= */ /* ZHEGV computes all the eigenvalues, and optionally, the eigenvectors */ /* of a complex generalized Hermitian-definite eigenproblem, of the form */ /* A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. */ /* Here A and B are assumed to be Hermitian and B is also */ /* positive definite. */ /* Arguments */ /* ========= */ /* ITYPE (input) INTEGER */ /* Specifies the problem type to be solved: */ /* = 1: A*x = (lambda)*B*x */ /* = 2: A*B*x = (lambda)*x */ /* = 3: B*A*x = (lambda)*x */ /* JOBZ (input) CHARACTER*1 */ /* = 'N': Compute eigenvalues only; */ /* = 'V': Compute eigenvalues and eigenvectors. */ /* UPLO (input) CHARACTER*1 */ /* = 'U': Upper triangles of A and B are stored; */ /* = 'L': Lower triangles of A and B are stored. */ /* N (input) INTEGER */ /* The order of the matrices A and B. N >= 0. */ /* A (input/output) COMPLEX*16 array, dimension (LDA, N) */ /* On entry, the Hermitian matrix A. If UPLO = 'U', the */ /* leading N-by-N upper triangular part of A contains the */ /* upper triangular part of the matrix A. If UPLO = 'L', */ /* the leading N-by-N lower triangular part of A contains */ /* the lower triangular part of the matrix A. */ /* On exit, if JOBZ = 'V', then if INFO = 0, A contains the */ /* matrix Z of eigenvectors. The eigenvectors are normalized */ /* as follows: */ /* if ITYPE = 1 or 2, Z**H*B*Z = I; */ /* if ITYPE = 3, Z**H*inv(B)*Z = I. 
*/ /* If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') */ /* or the lower triangle (if UPLO='L') of A, including the */ /* diagonal, is destroyed. */ /* LDA (input) INTEGER */ /* The leading dimension of the array A. LDA >= max(1,N). */ /* B (input/output) COMPLEX*16 array, dimension (LDB, N) */ /* On entry, the Hermitian positive definite matrix B. */ /* If UPLO = 'U', the leading N-by-N upper triangular part of B */ /* contains the upper triangular part of the matrix B. */ /* If UPLO = 'L', the leading N-by-N lower triangular part of B */ /* contains the lower triangular part of the matrix B. */ /* On exit, if INFO <= N, the part of B containing the matrix is */ /* overwritten by the triangular factor U or L from the Cholesky */ /* factorization B = U**H*U or B = L*L**H. */ /* LDB (input) INTEGER */ /* The leading dimension of the array B. LDB >= max(1,N). */ /* W (output) DOUBLE PRECISION array, dimension (N) */ /* If INFO = 0, the eigenvalues in ascending order. */ /* WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) */ /* On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ /* LWORK (input) INTEGER */ /* The length of the array WORK. LWORK >= max(1,2*N-1). */ /* For optimal efficiency, LWORK >= (NB+1)*N, */ /* where NB is the blocksize for ZHETRD returned by ILAENV. */ /* If LWORK = -1, then a workspace query is assumed; the routine */ /* only calculates the optimal size of the WORK array, returns */ /* this value as the first entry of the WORK array, and no error */ /* message related to LWORK is issued by XERBLA. 
*/ /* RWORK (workspace) DOUBLE PRECISION array, dimension (max(1, 3*N-2)) */ /* INFO (output) INTEGER */ /* = 0: successful exit */ /* < 0: if INFO = -i, the i-th argument had an illegal value */ /* > 0: ZPOTRF or ZHEEV returned an error code: */ /* <= N: if INFO = i, ZHEEV failed to converge; */ /* i off-diagonal elements of an intermediate */ /* tridiagonal form did not converge to zero; */ /* > N: if INFO = N + i, for 1 <= i <= N, then the leading */ /* minor of order i of B is not positive definite. */ /* The factorization of B could not be completed and */ /* no eigenvalues or eigenvectors were computed. */ /* ===================================================================== */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1; b -= b_offset; --w; --work; --rwork; /* Function Body */ wantz = lsame_(jobz, "V"); upper = lsame_(uplo, "U"); lquery = *lwork == -1; *info = 0; if (*itype < 1 || *itype > 3) { *info = -1; } else if (! (wantz || lsame_(jobz, "N"))) { *info = -2; } else if (! (upper || lsame_(uplo, "L"))) { *info = -3; } else if (*n < 0) { *info = -4; } else if (*lda < max(1,*n)) { *info = -6; } else if (*ldb < max(1,*n)) { *info = -8; } if (*info == 0) { nb = ilaenv_(&c__1, "ZHETRD", uplo, n, &c_n1, &c_n1, &c_n1); /* Computing MAX */ i__1 = 1, i__2 = (nb + 1) * *n; lwkopt = max(i__1,i__2); work[1].r = (doublereal) lwkopt, work[1].i = 0.; /* Computing MAX */ i__1 = 1, i__2 = (*n << 1) - 1; if (*lwork < max(i__1,i__2) && ! lquery) { *info = -11; } } if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGV ", &i__1); return 0; } else if (lquery) { return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Form a Cholesky factorization of B. */ zpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; return 0; } /* Transform problem to standard eigenvalue problem and solve. 
*/ zhegst_(itype, uplo, n, &a[a_offset], lda, &b[b_offset], ldb, info); zheev_(jobz, uplo, n, &a[a_offset], lda, &w[1], &work[1], lwork, &rwork[1] , info); if (wantz) { /* Backtransform eigenvectors to the original problem. */ neig = *n; if (*info > 0) { neig = *info - 1; } if (*itype == 1 || *itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; */ /* backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (upper) { *(unsigned char *)trans = 'N'; } else { *(unsigned char *)trans = 'C'; } ztrsm_("Left", uplo, trans, "Non-unit", n, &neig, &c_b1, &b[ b_offset], ldb, &a[a_offset], lda); } else if (*itype == 3) { /* For B*A*x=(lambda)*x; */ /* backtransform eigenvectors: x = L*y or U'*y */ if (upper) { *(unsigned char *)trans = 'C'; } else { *(unsigned char *)trans = 'N'; } ztrmm_("Left", uplo, trans, "Non-unit", n, &neig, &c_b1, &b[ b_offset], ldb, &a[a_offset], lda); } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; return 0; /* End of ZHEGV */ } /* zhegv_ */
/* Subroutine */ int ztrtrs_(char *uplo, char *trans, char *diag, integer *n,
	integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b,
	integer *ldb, integer *info)
{
/*  -- LAPACK routine (version 3.2) -- */
/*     November 2006 */

/*  Purpose */
/*  ======= */

/*  ZTRTRS solves a triangular system */
/*     A * X = B,  A**T * X = B,  or  A**H * X = B, */
/*  with A triangular of order N and B an N-by-NRHS matrix.  Before */
/*  solving, A is checked for an exactly zero diagonal entry. */

/*  Arguments */
/*  ========= */

/*  UPLO    (input) CHARACTER*1 */
/*          'U': A is upper triangular;  'L': A is lower triangular. */

/*  TRANS   (input) CHARACTER*1 */
/*          'N': A * X = B     (No transpose) */
/*          'T': A**T * X = B  (Transpose) */
/*          'C': A**H * X = B  (Conjugate transpose) */

/*  DIAG    (input) CHARACTER*1 */
/*          'N': A is non-unit triangular;  'U': A is unit triangular. */

/*  N       (input) INTEGER */
/*          Order of the matrix A.  N >= 0. */

/*  NRHS    (input) INTEGER */
/*          Number of right hand sides (columns of B).  NRHS >= 0. */

/*  A       (input) COMPLEX*16 array, dimension (LDA,N) */
/*          The triangular matrix; only the triangle selected by UPLO is */
/*          referenced.  If DIAG = 'U' the diagonal is not referenced */
/*          either and is taken to be 1. */

/*  LDA     (input) INTEGER */
/*          Leading dimension of A.  LDA >= max(1,N). */

/*  B       (input/output) COMPLEX*16 array, dimension (LDB,NRHS) */
/*          On entry, the right hand side matrix B. */
/*          On exit, if INFO = 0, the solution matrix X. */

/*  LDB     (input) INTEGER */
/*          Leading dimension of B.  LDB >= max(1,N). */

/*  INFO    (output) INTEGER */
/*          = 0:  successful exit */
/*          < 0:  if INFO = -i, the i-th argument had an illegal value */
/*          > 0:  if INFO = i, the i-th diagonal element of A is zero; */
/*                the matrix is singular and X has not been computed. */

/*  ===================================================================== */

    /* Local variables */
    integer j, order, neg_info;
    logical nounit;
    doublecomplex *pivot;

    /* Test the input parameters. */
    *info = 0;
    nounit = lsame_(diag, "N");
    if (! (lsame_(uplo, "U") || lsame_(uplo, "L"))) {
	*info = -1;
    } else if (! (lsame_(trans, "N") || lsame_(trans, "T") || lsame_(trans,
	    "C"))) {
	*info = -2;
    } else if (! (nounit || lsame_(diag, "U"))) {
	*info = -3;
    } else if (*n < 0) {
	*info = -4;
    } else if (*nrhs < 0) {
	*info = -5;
    } else if (*lda < max(1,*n)) {
	*info = -7;
    } else if (*ldb < max(1,*n)) {
	*info = -9;
    }
    if (*info != 0) {
	neg_info = -(*info);
	xerbla_("ZTRTRS", &neg_info);
	return 0;
    }

    /* Quick return if possible */
    order = *n;
    if (order == 0) {
	return 0;
    }

    /* Check for singularity: with a non-unit diagonal, report the */
    /* 1-based position of the first exactly zero diagonal entry. */
    if (nounit) {
	for (j = 0; j < order; ++j) {
	    pivot = &a[j + j * *lda];
	    if (pivot->r == 0. && pivot->i == 0.) {
		*info = j + 1;
		return 0;
	    }
	}
    }

    /* Solve A * x = b,  A**T * x = b,  or  A**H * x = b. */
    ztrsm_("Left", uplo, trans, diag, n, nrhs, &c_b2, a, lda, b, ldb);

    return 0;

/*     End of ZTRTRS */

} /* ztrtrs_ */
void zgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, SuperMatrix *B, SuperLUStat_t *stat, int *info) { #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif int incx = 1, incy = 1; #ifdef USE_VENDOR_BLAS doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; doublecomplex *work_col; #endif doublecomplex temp_comp; DNformat *Bstore; doublecomplex *Bmat; SCformat *Lstore; NCformat *Ustore; doublecomplex *Lval, *Uval; int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; int i, j, k, iptr, jcol, n, ldb, nrhs; doublecomplex *work, *rhs_work, *soln; flops_t solve_ops; void zprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU ) *info = -2; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU ) *info = -3; else if ( ldb < SUPERLU_MAX(0, L->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) *info = -6; if ( *info ) { i = -(*info); input_error("zgstrs", &i); return; } n = L->nrow; work = doublecomplexCalloc(n * nrhs); if ( !work ) ABORT("Malloc fails for local work[]."); soln = doublecomplexMalloc(n); if ( !soln ) ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( trans == NOTRANS ) { /* Permute right hand sides to form Pr*B */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } /* Forward solve PLy=Pb. 
*/ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; nrow = nsupr - nsupc; solve_ops += 4 * nsupc * (nsupc - 1) * nrhs; solve_ops += 8 * nrow * nsupc * nrhs; if ( nsupc == 1 ) { for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; luptr = L_NZ_START(fsupc); for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ irow = L_SUB(iptr); ++luptr; zz_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]); z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); } } } else { luptr = L_NZ_START(fsupc); #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #else ztrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); zgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #endif for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; work_col = &work[j*n]; iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); z_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]); work_col[i].r = 0.0; work_col[i].i = 0.0; iptr++; } } #else for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; zlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); zmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], &rhs_work[fsupc], &work[0] ); iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); z_sub(&rhs_work[irow], &rhs_work[irow], &work[i]); work[i].r = 0.; work[i].i = 0.; iptr++; } } #endif } /* else ... */ } /* for L-solve */ #ifdef DEBUG printf("After L-solve: y=\n"); zprint_soln(n, nrhs, Bmat); #endif /* * Back solve Ux=y. 
*/ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 4 * nsupc * (nsupc + 1) * nrhs; if ( nsupc == 1 ) { rhs_work = &Bmat[0]; for (j = 0; j < nrhs; j++) { z_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]); rhs_work += ldb; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("U", strlen("U")); ftcs3 = _cptofcd("N", strlen("N")); CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #else ztrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #endif #else for (j = 0; j < nrhs; j++) zusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); #endif } for (j = 0; j < nrhs; ++j) { rhs_work = &Bmat[j*ldb]; for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ irow = U_SUB(i); zz_mult(&temp_comp, &rhs_work[jcol], &Uval[i]); z_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); } } } } /* for U-solve */ #ifdef DEBUG printf("After U-solve: x=\n"); zprint_soln(n, nrhs, Bmat); #endif /* Compute the final solution X := Pc*X. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = solve_ops; } else { /* Solve A'*X=B or CONJ(A)*X=B */ /* Permute right hand sides to form Pc'*B. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = 0; if (trans == TRANS) { for (k = 0; k < nrhs; ++k) { /* Multiply by inv(U'). */ sp_ztrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); /* Multiply by inv(L'). 
*/ sp_ztrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); } } else { /* trans == CONJ */ for (k = 0; k < nrhs; ++k) { /* Multiply by conj(inv(U')). */ sp_ztrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info); /* Multiply by conj(inv(L')). */ sp_ztrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info); } } /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } } SUPERLU_FREE(work); SUPERLU_FREE(soln); }
int main( int argc, char** argv ) { obj_t a, b, c; obj_t x, y; obj_t alpha, beta; dim_t m; num_t dt_a, dt_b, dt_c; num_t dt_alpha, dt_beta; int ii; #ifdef NBLIS bli_init(); #endif m = 4000; dt_a = BLIS_DOUBLE; dt_b = BLIS_DOUBLE; dt_c = BLIS_DOUBLE; dt_alpha = BLIS_DOUBLE; dt_beta = BLIS_DOUBLE; { #ifdef NBLIS bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); bli_obj_create( dt_beta, 1, 1, 0, 0, &beta ); bli_obj_create( dt_a, m, 1, 0, 0, &x ); bli_obj_create( dt_a, m, 1, 0, 0, &y ); bli_obj_create( dt_a, m, m, 0, 0, &a ); bli_obj_create( dt_b, m, m, 0, 0, &b ); bli_obj_create( dt_c, m, m, 0, 0, &c ); bli_randm( &a ); bli_randm( &b ); bli_randm( &c ); bli_setsc( (2.0/1.0), 0.0, &alpha ); bli_setsc( -(1.0/1.0), 0.0, &beta ); #endif #ifdef NBLAS x.buffer = malloc( m * 1 * sizeof( double ) ); y.buffer = malloc( m * 1 * sizeof( double ) ); alpha.buffer = malloc( 1 * sizeof( double ) ); beta.buffer = malloc( 1 * sizeof( double ) ); a.buffer = malloc( m * m * sizeof( double ) ); a.m = m; a.n = m; a.cs = m; b.buffer = malloc( m * m * sizeof( double ) ); b.m = m; b.n = m; b.cs = m; c.buffer = malloc( m * m * sizeof( double ) ); c.m = m; c.n = m; c.cs = m; *((double*)alpha.buffer) = 2.0; *((double*)beta.buffer) = -1.0; #endif #ifdef NBLIS #if NBLIS >= 1 for ( ii = 0; ii < 2000000000; ++ii ) { bli_gemm( &BLIS_ONE, &a, &b, &BLIS_ONE, &c ); } #endif #if NBLIS >= 2 { bli_hemm( BLIS_LEFT, &BLIS_ONE, &a, &b, &BLIS_ONE, &c ); } #endif #if NBLIS >= 3 { bli_herk( &BLIS_ONE, &a, &BLIS_ONE, &c ); } #endif #if NBLIS >= 4 { bli_her2k( &BLIS_ONE, &a, &b, &BLIS_ONE, &c ); } #endif #if NBLIS >= 5 { bli_trmm( BLIS_LEFT, &BLIS_ONE, &a, &c ); } #endif #if NBLIS >= 6 { bli_trsm( BLIS_LEFT, &BLIS_ONE, &a, &c ); } #endif #endif #ifdef NBLAS #if NBLAS >= 1 for ( ii = 0; ii < 2000000000; ++ii ) { f77_char transa = 'N'; f77_char transb = 'N'; f77_int mm = bli_obj_length( c ); f77_int kk = bli_obj_width_after_trans( a ); f77_int nn = bli_obj_width( c ); f77_int lda = bli_obj_col_stride( a ); f77_int 
ldb = bli_obj_col_stride( b ); f77_int ldc = bli_obj_col_stride( c ); double* alphap = bli_obj_buffer( alpha ); double* ap = bli_obj_buffer( a ); double* bp = bli_obj_buffer( b ); double* betap = bli_obj_buffer( beta ); double* cp = bli_obj_buffer( c ); dgemm_( &transa, &transb, &mm, &nn, &kk, alphap, ap, &lda, bp, &ldb, betap, cp, &ldc ); } #endif #if NBLAS >= 2 { f77_char side = 'L'; f77_char uplo = 'L'; f77_int mm = bli_obj_length( c ); f77_int nn = bli_obj_width( c ); f77_int lda = bli_obj_col_stride( a ); f77_int ldb = bli_obj_col_stride( b ); f77_int ldc = bli_obj_col_stride( c ); double* alphap = bli_obj_buffer( alpha ); double* ap = bli_obj_buffer( a ); double* bp = bli_obj_buffer( b ); double* betap = bli_obj_buffer( beta ); double* cp = bli_obj_buffer( c ); dsymm_( &side, &uplo, &mm, &nn, alphap, ap, &lda, bp, &ldb, betap, cp, &ldc ); } #endif #if NBLAS >= 3 { f77_char uplo = 'L'; f77_char trans = 'N'; f77_int mm = bli_obj_length( c ); f77_int kk = bli_obj_width( a ); f77_int lda = bli_obj_col_stride( a ); f77_int ldc = bli_obj_col_stride( c ); double* alphap = bli_obj_buffer( alpha ); double* ap = bli_obj_buffer( a ); double* betap = bli_obj_buffer( beta ); double* cp = bli_obj_buffer( c ); dsyrk_( &uplo, &trans, &mm, &kk, alphap, ap, &lda, betap, cp, &ldc ); } #endif #if NBLAS >= 4 { f77_char uplo = 'L'; f77_char trans = 'N'; f77_int mm = bli_obj_length( c ); f77_int kk = bli_obj_width( a ); f77_int lda = bli_obj_col_stride( a ); f77_int ldb = bli_obj_col_stride( b ); f77_int ldc = bli_obj_col_stride( c ); double* alphap = bli_obj_buffer( alpha ); double* ap = bli_obj_buffer( a ); double* bp = bli_obj_buffer( b ); double* betap = bli_obj_buffer( beta ); double* cp = bli_obj_buffer( c ); dsyr2k_( &uplo, &trans, &mm, &kk, alphap, ap, &lda, bp, &ldb, betap, cp, &ldc ); } #endif #if NBLAS >= 5 { f77_char side = 'L'; f77_char uplo = 'L'; f77_char trans = 'N'; f77_char diag = 'N'; f77_int mm = bli_obj_length( c ); f77_int nn = bli_obj_width( c ); f77_int lda 
= bli_obj_col_stride( a ); f77_int ldc = bli_obj_col_stride( c ); double* alphap = bli_obj_buffer( alpha ); double* ap = bli_obj_buffer( a ); double* cp = bli_obj_buffer( c ); dtrmm_( &side, &uplo, &trans, &diag, &mm, &nn, alphap, ap, &lda, cp, &ldc ); } #endif #if NBLAS >= 6 { f77_char side = 'L'; f77_char uplo = 'L'; f77_char trans = 'N'; f77_char diag = 'N'; f77_int mm = bli_obj_length( c ); f77_int nn = bli_obj_width( c ); f77_int lda = bli_obj_col_stride( a ); f77_int ldc = bli_obj_col_stride( c ); double* alphap = bli_obj_buffer( alpha ); double* ap = bli_obj_buffer( a ); double* cp = bli_obj_buffer( c ); dtrsm_( &side, &uplo, &trans, &diag, &mm, &nn, alphap, ap, &lda, cp, &ldc ); } #endif #if NBLAS >= 7 { f77_char transa = 'N'; f77_char transb = 'N'; f77_int mm = bli_obj_length( c ); f77_int kk = bli_obj_width_after_trans( a ); f77_int nn = bli_obj_width( c ); f77_int lda = bli_obj_col_stride( a ); f77_int ldb = bli_obj_col_stride( b ); f77_int ldc = bli_obj_col_stride( c ); dcomplex* alphap = bli_obj_buffer( alpha ); dcomplex* ap = bli_obj_buffer( a ); dcomplex* bp = bli_obj_buffer( b ); dcomplex* betap = bli_obj_buffer( beta ); dcomplex* cp = bli_obj_buffer( c ); zgemm_( &transa, &transb, &mm, &nn, &kk, alphap, ap, &lda, bp, &ldb, betap, cp, &ldc ); } #endif #if NBLAS >= 8 { f77_char side = 'L'; f77_char uplo = 'L'; f77_int mm = bli_obj_length( c ); f77_int nn = bli_obj_width( c ); f77_int lda = bli_obj_col_stride( a ); f77_int ldb = bli_obj_col_stride( b ); f77_int ldc = bli_obj_col_stride( c ); dcomplex* alphap = bli_obj_buffer( alpha ); dcomplex* ap = bli_obj_buffer( a ); dcomplex* bp = bli_obj_buffer( b ); dcomplex* betap = bli_obj_buffer( beta ); dcomplex* cp = bli_obj_buffer( c ); zhemm_( &side, &uplo, &mm, &nn, alphap, ap, &lda, bp, &ldb, betap, cp, &ldc ); } #endif #if NBLAS >= 9 { f77_char uplo = 'L'; f77_char trans = 'N'; f77_int mm = bli_obj_length( c ); f77_int kk = bli_obj_width( a ); f77_int lda = bli_obj_col_stride( a ); f77_int ldc = 
bli_obj_col_stride( c ); double* alphap = bli_obj_buffer( alpha ); dcomplex* ap = bli_obj_buffer( a ); double* betap = bli_obj_buffer( beta ); dcomplex* cp = bli_obj_buffer( c ); zherk_( &uplo, &trans, &mm, &kk, alphap, ap, &lda, betap, cp, &ldc ); } #endif #if NBLAS >= 10 { f77_char uplo = 'L'; f77_char trans = 'N'; f77_int mm = bli_obj_length( c ); f77_int kk = bli_obj_width( a ); f77_int lda = bli_obj_col_stride( a ); f77_int ldb = bli_obj_col_stride( b ); f77_int ldc = bli_obj_col_stride( c ); dcomplex* alphap = bli_obj_buffer( alpha ); dcomplex* ap = bli_obj_buffer( a ); dcomplex* bp = bli_obj_buffer( b ); double* betap = bli_obj_buffer( beta ); dcomplex* cp = bli_obj_buffer( c ); zher2k_( &uplo, &trans, &mm, &kk, alphap, ap, &lda, bp, &ldb, betap, cp, &ldc ); } #endif #if NBLAS >= 11 { f77_char side = 'L'; f77_char uplo = 'L'; f77_char trans = 'N'; f77_char diag = 'N'; f77_int mm = bli_obj_length( c ); f77_int nn = bli_obj_width( c ); f77_int lda = bli_obj_col_stride( a ); f77_int ldc = bli_obj_col_stride( c ); dcomplex* alphap = bli_obj_buffer( alpha ); dcomplex* ap = bli_obj_buffer( a ); dcomplex* cp = bli_obj_buffer( c ); ztrmm_( &side, &uplo, &trans, &diag, &mm, &nn, alphap, ap, &lda, cp, &ldc ); } #endif #if NBLAS >= 12 { f77_char side = 'L'; f77_char uplo = 'L'; f77_char trans = 'N'; f77_char diag = 'N'; f77_int mm = bli_obj_length( c ); f77_int nn = bli_obj_width( c ); f77_int lda = bli_obj_col_stride( a ); f77_int ldc = bli_obj_col_stride( c ); dcomplex* alphap = bli_obj_buffer( alpha ); dcomplex* ap = bli_obj_buffer( a ); dcomplex* cp = bli_obj_buffer( c ); ztrsm_( &side, &uplo, &trans, &diag, &mm, &nn, alphap, ap, &lda, cp, &ldc ); } #endif #endif #ifdef NBLIS bli_obj_free( &x ); bli_obj_free( &y ); bli_obj_free( &alpha ); bli_obj_free( &beta ); bli_obj_free( &a ); bli_obj_free( &b ); bli_obj_free( &c ); #endif #ifdef NBLAS free( x.buffer ); free( y.buffer ); free( alpha.buffer ); free( beta.buffer ); free( a.buffer ); free( b.buffer ); free( 
c.buffer ); #endif } #ifdef NBLIS bli_finalize(); #endif return 0; }
/* Subroutine */ int zhegv_(integer *itype, char *jobz, char *uplo, integer * n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) { /* -- LAPACK driver routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University June 30, 1999 Purpose ======= ZHEGV computes all the eigenvalues, and optionally, the eigenvectors of a complex generalized Hermitian-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be Hermitian and B is also positive definite. Arguments ========= ITYPE (input) INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangles of A and B are stored; = 'L': Lower triangles of A and B are stored. N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) COMPLEX*16 array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = 'V', then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**H*B*Z = I; if ITYPE = 3, Z**H*inv(B)*Z = I. If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') or the lower triangle (if UPLO='L') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) COMPLEX*16 array, dimension (LDB, N) On entry, the Hermitian positive definite matrix B. 
If UPLO = 'U', the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = 'L', the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**H*U or B = L*L**H. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). W (output) DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. WORK (workspace/output) COMPLEX*16 array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,2*N-1). For optimal efficiency, LWORK >= (NB+1)*N, where NB is the blocksize for ZHETRD returned by ILAENV. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace) DOUBLE PRECISION array, dimension (max(1, 3*N-2)) INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: ZPOTRF or ZHEEV returned an error code: <= N: if INFO = i, ZHEEV failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. ===================================================================== Test the input parameters. 
Parameter adjustments */ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; static integer c__1 = 1; static integer c_n1 = -1; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; /* Local variables */ static integer neig; extern logical lsame_(char *, char *); extern /* Subroutine */ int zheev_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, integer *); static char trans[1]; static logical upper, wantz; extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); static integer nb; extern /* Subroutine */ int xerbla_(char *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern /* Subroutine */ int zhegst_(integer *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); static integer lwkopt; static logical lquery; extern /* Subroutine */ int zpotrf_(char *, integer *, doublecomplex *, integer *, integer *); a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1 * 1; b -= b_offset; --w; --work; --rwork; /* Function Body */ wantz = lsame_(jobz, "V"); upper = lsame_(uplo, "U"); lquery = *lwork == -1; *info = 0; if (*itype < 1 || *itype > 3) { *info = -1; } else if (! (wantz || lsame_(jobz, "N"))) { *info = -2; } else if (! (upper || lsame_(uplo, "L"))) { *info = -3; } else if (*n < 0) { *info = -4; } else if (*lda < max(1,*n)) { *info = -6; } else if (*ldb < max(1,*n)) { *info = -8; } else /* if(complicated condition) */ { /* Computing MAX */ i__1 = 1, i__2 = (*n << 1) - 1; if (*lwork < max(i__1,i__2) && ! 
lquery) { *info = -11; } } if (*info == 0) { nb = ilaenv_(&c__1, "ZHETRD", uplo, n, &c_n1, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1); lwkopt = (nb + 1) * *n; work[1].r = (doublereal) lwkopt, work[1].i = 0.; } if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGV ", &i__1); return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Form a Cholesky factorization of B. */ zpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; return 0; } /* Transform problem to standard eigenvalue problem and solve. */ zhegst_(itype, uplo, n, &a[a_offset], lda, &b[b_offset], ldb, info); zheev_(jobz, uplo, n, &a[a_offset], lda, &w[1], &work[1], lwork, &rwork[1] , info); if (wantz) { /* Backtransform eigenvectors to the original problem. */ neig = *n; if (*info > 0) { neig = *info - 1; } if (*itype == 1 || *itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (upper) { *(unsigned char *)trans = 'N'; } else { *(unsigned char *)trans = 'C'; } ztrsm_("Left", uplo, trans, "Non-unit", n, &neig, &c_b1, &b[ b_offset], ldb, &a[a_offset], lda); } else if (*itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (upper) { *(unsigned char *)trans = 'C'; } else { *(unsigned char *)trans = 'N'; } ztrmm_("Left", uplo, trans, "Non-unit", n, &neig, &c_b1, &b[ b_offset], ldb, &a[a_offset], lda); } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; return 0; /* End of ZHEGV */ } /* zhegv_ */
/* Subroutine */ int zpftrf_(char *transr, char *uplo, integer *n,
	doublecomplex *a, integer *info)
{
/*  -- LAPACK routine (version 3.2) -- */
/*  -- Contributed by Fred Gustavson of the IBM Watson Research Center -- */
/*  -- November 2008 -- */

/*  Purpose */
/*  ======= */

/*  ZPFTRF computes the Cholesky factorization of a complex Hermitian */
/*  positive definite matrix A held in Rectangular Full Packed (RFP) */
/*  format: */

/*     A = U**H * U,  if UPLO = 'U', or */
/*     A = L  * L**H, if UPLO = 'L', */

/*  with U upper triangular and L lower triangular.  This is the block */
/*  version of the algorithm; the work is done by ZPOTRF, ZTRSM and ZHERK */
/*  applied to the three sub-blocks (T1, S, T2) of the RFP array. */

/*  Arguments */
/*  ========= */

/*  TRANSR  (input) 'N': the normal form of RFP A is stored; */
/*                  'C': the conjugate-transpose form of RFP A is stored. */

/*  UPLO    (input) 'U': upper triangle of A is stored; */
/*                  'L': lower triangle of A is stored. */

/*  N       (input) The order of the matrix A.  N >= 0. */

/*  A       (input/output) doublecomplex array, dimension ( N*(N+1)/2 ). */
/*          On entry, the Hermitian matrix A in RFP format.  With k = N/2, */
/*          for TRANSR = 'N' the RFP array is (0:N,0:k-1) when N is even */
/*          and (0:N-1,0:k) when N is odd; for TRANSR = 'C' it is the */
/*          conjugate-transpose of that array.  The leading dimension of */
/*          the RFP array is (N+1)/2 when TRANSR = 'C'; when TRANSR = 'N' */
/*          it is N+1 for even N and N for odd N. */
/*          On exit, if INFO = 0, the factor U or L of the factorization. */

/*  INFO    (output) =  0: successful exit; */
/*                   <  0: if INFO = -i, the i-th argument was illegal; */
/*                   >  0: if INFO = i, the leading minor of order i is */
/*                         not positive definite and the factorization */
/*                         could not be completed. */

/*  Notes on the RFP format */
/*  ======================= */

/*  N even, example N = 6.  Standard packed storage: */

/*         AP is Upper             AP is Lower */

/*      00 01 02 03 04 05       00 */
/*         11 12 13 14 15       10 11 */
/*            22 23 24 25       20 21 22 */
/*               33 34 35       30 31 32 33 */
/*                  44 45       40 41 42 43 44 */
/*                     55       50 51 52 53 54 55 */

/*  With TRANSR = 'N': for UPLO = 'U' the upper trapezoid A(0:5,0:2) holds */
/*  the last three columns of AP upper and the lower triangle A(4:6,0:2) */
/*  holds the conjugate-transpose of the first three columns of AP upper. */
/*  For UPLO = 'L' the lower trapezoid A(1:6,0:2) holds the first three */
/*  columns of AP lower and the upper triangle A(0:2,0:2) holds the */
/*  conjugate-transpose of the last three columns of AP lower.  A "--" */
/*  above an element marks a conjugated entry: */

/*         RFP A (UPLO='U')        RFP A (UPLO='L') */

/*                                 -- -- -- */
/*         03 04 05                33 43 53 */
/*                                    -- -- */
/*         13 14 15                00 44 54 */
/*                                       -- */
/*         23 24 25                10 11 55 */

/*         33 34 35                20 21 22 */
/*         -- */
/*         00 44 45                30 31 32 */
/*         -- -- */
/*         01 11 55                40 41 42 */
/*         -- -- -- */
/*         02 12 22                50 51 52 */

/*  N odd, example N = 5.  Standard packed storage: */

/*         AP is Upper             AP is Lower */

/*      00 01 02 03 04          00 */
/*         11 12 13 14          10 11 */
/*            22 23 24          20 21 22 */
/*               33 34          30 31 32 33 */
/*                  44          40 41 42 43 44 */

/*  With TRANSR = 'N': for UPLO = 'U' the upper trapezoid A(0:4,0:2) holds */
/*  the last three columns of AP upper and the lower triangle A(3:4,0:1) */
/*  holds the conjugate-transpose of the first two columns of AP upper. */
/*  For UPLO = 'L' the lower trapezoid A(0:4,0:2) holds the first three */
/*  columns of AP lower and the upper triangle A(0:1,1:2) holds the */
/*  conjugate-transpose of the last two columns of AP lower: */

/*         RFP A (UPLO='U')        RFP A (UPLO='L') */

/*                                    -- -- */
/*         02 03 04                00 33 43 */
/*                                       -- */
/*         12 13 14                10 11 44 */

/*         22 23 24                20 21 22 */
/*         -- */
/*         00 33 34                30 31 32 */
/*         -- -- */
/*         01 11 44                40 41 42 */

/*  With TRANSR = 'C' the RFP array, for either UPLO, is simply the */
/*  conjugate-transpose of the corresponding TRANSR = 'N' array above. */

/*  ===================================================================== */

    integer arg_pos;	/* positive index of the offending argument */
    integer half;	/* N/2 */
    integer ldp1;	/* N+1, leading dimension for even N, TRANSR = 'N' */
    integer n1, n2;	/* orders of the two triangular sub-blocks */
    integer shift;	/* offset added to INFO when the second block fails */
    logical is_normal, is_lower, is_odd;

    /* Validate the arguments. */
    *info = 0;
    is_normal = lsame_(transr, "N");
    is_lower = lsame_(uplo, "L");
    if (!(is_normal || lsame_(transr, "C"))) {
	*info = -1;
    } else if (!(is_lower || lsame_(uplo, "U"))) {
	*info = -2;
    } else if (*n < 0) {
	*info = -3;
    }
    if (*info != 0) {
	arg_pos = -(*info);
	xerbla_("ZPFTRF", &arg_pos);
	return 0;
    }

    /* Nothing to factor. */
    if (*n == 0) {
	return 0;
    }

    half = *n / 2;
    ldp1 = *n + 1;
    is_odd = (*n % 2 != 0);

    /* Lower: n2 = N/2, n1 = N - n2.  Upper: n1 = N/2, n2 = N - n1. */
    n1 = is_lower ? *n - half : half;
    n2 = *n - n1;

    /* A failure in the trailing block is reported relative to the whole */
    /* matrix: the offset is n1 for odd N and N/2 for even N. */
    shift = is_odd ? n1 : half;

    /* Eight cases, encoded as: 4 = N odd, 2 = TRANSR 'N', 1 = UPLO 'L'. */
    /* Each case factors T1, solves for S, updates T2 and factors T2; a */
    /* positive INFO from the first factorization is returned unchanged. */
    switch ((is_odd ? 4 : 0) + (is_normal ? 2 : 0) + (is_lower ? 1 : 0)) {

    case 7:
	/* N odd, TRANSR = 'N', lower: a(0:n-1,0:n1-1), lda = n */
	/* T1 -> a(0), T2 -> a(n), S -> a(n1) */
	zpotrf_("L", &n1, a, n, info);
	if (*info > 0) {
	    return 0;
	}
	ztrsm_("R", "L", "C", "N", &n2, &n1, &c_b1, a, n, &a[n1], n);
	zherk_("U", "N", &n2, &n1, &c_b15, &a[n1], n, &c_b16, &a[*n], n);
	zpotrf_("U", &n2, &a[*n], n, info);
	break;

    case 6:
	/* N odd, TRANSR = 'N', upper: a(0:n-1,0:n2-1), lda = n */
	/* T1 -> a(n2), T2 -> a(n1), S -> a(0) */
	zpotrf_("L", &n1, &a[n2], n, info);
	if (*info > 0) {
	    return 0;
	}
	ztrsm_("L", "L", "N", "N", &n1, &n2, &c_b1, &a[n2], n, a, n);
	zherk_("U", "C", &n2, &n1, &c_b15, a, n, &c_b16, &a[n1], n);
	zpotrf_("U", &n2, &a[n1], n, info);
	break;

    case 5:
	/* N odd, TRANSR = 'C', lower: lda = n1 */
	/* T1 -> a(0), T2 -> a(1), S -> a(n1*n1) */
	zpotrf_("U", &n1, a, &n1, info);
	if (*info > 0) {
	    return 0;
	}
	ztrsm_("L", "U", "C", "N", &n1, &n2, &c_b1, a, &n1, &a[n1 * n1],
		&n1);
	zherk_("L", "C", &n2, &n1, &c_b15, &a[n1 * n1], &n1, &c_b16, &a[1],
		&n1);
	zpotrf_("L", &n2, &a[1], &n1, info);
	break;

    case 4:
	/* N odd, TRANSR = 'C', upper: lda = n2 */
	/* T1 -> a(n2*n2), T2 -> a(n1*n2), S -> a(0) */
	zpotrf_("U", &n1, &a[n2 * n2], &n2, info);
	if (*info > 0) {
	    return 0;
	}
	ztrsm_("R", "U", "N", "N", &n2, &n1, &c_b1, &a[n2 * n2], &n2, a,
		&n2);
	zherk_("L", "N", &n2, &n1, &c_b15, a, &n2, &c_b16, &a[n1 * n2],
		&n2);
	zpotrf_("L", &n2, &a[n1 * n2], &n2, info);
	break;

    case 3:
	/* N even, TRANSR = 'N', lower: a(0:n,0:k-1), lda = n+1 */
	/* T1 -> a(1), T2 -> a(0), S -> a(k+1) */
	zpotrf_("L", &half, &a[1], &ldp1, info);
	if (*info > 0) {
	    return 0;
	}
	ztrsm_("R", "L", "C", "N", &half, &half, &c_b1, &a[1], &ldp1,
		&a[half + 1], &ldp1);
	zherk_("U", "N", &half, &half, &c_b15, &a[half + 1], &ldp1, &c_b16,
		a, &ldp1);
	zpotrf_("U", &half, a, &ldp1, info);
	break;

    case 2:
	/* N even, TRANSR = 'N', upper: a(0:n,0:k-1), lda = n+1 */
	/* T1 -> a(k+1), T2 -> a(k), S -> a(0) */
	zpotrf_("L", &half, &a[half + 1], &ldp1, info);
	if (*info > 0) {
	    return 0;
	}
	ztrsm_("L", "L", "N", "N", &half, &half, &c_b1, &a[half + 1], &ldp1,
		a, &ldp1);
	zherk_("U", "C", &half, &half, &c_b15, a, &ldp1, &c_b16, &a[half],
		&ldp1);
	zpotrf_("U", &half, &a[half], &ldp1, info);
	break;

    case 1:
	/* N even, TRANSR = 'C', lower: lda = k (n1 equals k for even N) */
	/* T1 -> a(k), T2 -> a(0), S -> a(k*(k+1)) */
	zpotrf_("U", &half, &a[half], &half, info);
	if (*info > 0) {
	    return 0;
	}
	ztrsm_("L", "U", "C", "N", &half, &half, &c_b1, &a[half], &n1,
		&a[half * (half + 1)], &half);
	zherk_("L", "C", &half, &half, &c_b15, &a[half * (half + 1)], &half,
		&c_b16, a, &half);
	zpotrf_("L", &half, a, &half, info);
	break;

    default:
	/* N even, TRANSR = 'C', upper: lda = k */
	/* T1 -> a(k*(k+1)), T2 -> a(k*k), S -> a(0) */
	zpotrf_("U", &half, &a[half * (half + 1)], &half, info);
	if (*info > 0) {
	    return 0;
	}
	ztrsm_("R", "U", "N", "N", &half, &half, &c_b1,
		&a[half * (half + 1)], &half, a, &half);
	zherk_("L", "N", &half, &half, &c_b15, a, &half, &c_b16,
		&a[half * half], &half);
	zpotrf_("L", &half, &a[half * half], &half, info);
	break;
    }

    /* Second factorization failed: report the minor's order in A. */
    if (*info > 0) {
	*info += shift;
    }
    return 0;

/*     End of ZPFTRF */

} /* zpftrf_ */
/* Subroutine */ int ztimb3_(char *line, integer *nm, integer *mval, integer * nn, integer *nval, integer *nk, integer *kval, integer *nlda, integer *ldaval, doublereal *timmin, doublecomplex *a, doublecomplex *b, doublecomplex *c__, doublereal *reslts, integer *ldr1, integer *ldr2, integer *nout, ftnlen line_len) { /* Initialized data */ static char names[6*9] = "ZGEMM " "ZHEMM " "ZSYMM " "ZHERK " "ZHER2K" "ZSYRK " "ZSYR2K" "ZTRMM " "ZTRSM "; static char trans[1*3] = "N" "T" "C"; static char sides[1*2] = "L" "R"; static char uplos[1*2] = "U" "L"; /* Format strings */ static char fmt_9999[] = "(1x,a6,\002 timing run not attempted\002,/)"; static char fmt_9998[] = "(/\002 *** Speed of \002,a6,\002 in megaflops " "***\002)"; static char fmt_9997[] = "(5x,\002with LDA = \002,i5)"; static char fmt_9996[] = "(5x,\002line \002,i2,\002 with LDA = \002,i5)"; static char fmt_9995[] = "(/1x,\002ZGEMM with TRANSA = '\002,a1,\002', " "TRANSB = '\002,a1,\002'\002)"; static char fmt_9994[] = "(/1x,\002K = \002,i4,/)"; static char fmt_9993[] = "(/1x,a6,\002 with SIDE = '\002,a1,\002', UPLO " "= '\002,a1,\002'\002,/)"; static char fmt_9992[] = "(/1x,a6,\002 with UPLO = '\002,a1,\002', TRANS" " = '\002,a1,\002'\002,/)"; static char fmt_9991[] = "(/1x,a6,\002 with SIDE = '\002,a1,\002', UPLO " "= '\002,a1,\002',\002,\002 TRANS = '\002,a1,\002'\002,/)"; static char fmt_9990[] = "(/////)"; /* System generated locals */ integer reslts_dim1, reslts_dim2, reslts_offset, i__1, i__2, i__3, i__4; /* Builtin functions Subroutine */ int s_copy(char *, char *, ftnlen, ftnlen); integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void), s_cmp(char *, char *, ftnlen, ftnlen); /* Local variables */ static integer ilda; static char side[1]; static integer imat, info; static char path[3]; static doublereal time; static integer isub; static char uplo[1]; static integer i__, k, m, n; static char cname[6]; static integer iside; extern logical lsame_(char *, char *); extern /* Subroutine 
*/ int zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zhemm_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); static integer iuplo; static doublereal s1, s2; extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zsymm_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zsyrk_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer * , doublecomplex *, doublecomplex *, integer *); extern doublereal dopbl3_(char *, integer *, integer *, integer *) ; extern /* Subroutine */ int zher2k_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); static integer ic, ik, im, in; extern doublereal dsecnd_(void); extern /* Subroutine */ int zsyr2k_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), atimck_(integer *, char *, integer *, integer *, integer *, integer *, integer *, integer *, ftnlen); extern doublereal dmflop_(doublereal *, doublereal *, integer *); extern /* Subroutine */ int atimin_(char *, char *, integer *, char *, logical *, integer *, integer *, ftnlen, ftnlen, ftnlen), dprtbl_( char *, char *, integer *, integer *, integer *, 
integer *, integer *, doublereal *, integer *, integer *, integer *, ftnlen, ftnlen); static char transa[1], transb[1]; static doublereal untime; static logical timsub[9]; extern /* Subroutine */ int ztimmg_(integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *); static integer lda, icl, ita, itb; static doublereal ops; /* Fortran I/O blocks */ static cilist io___9 = { 0, 0, 0, fmt_9999, 0 }; static cilist io___11 = { 0, 0, 0, fmt_9998, 0 }; static cilist io___12 = { 0, 0, 0, fmt_9997, 0 }; static cilist io___14 = { 0, 0, 0, fmt_9996, 0 }; static cilist io___34 = { 0, 0, 0, fmt_9995, 0 }; static cilist io___35 = { 0, 0, 0, fmt_9994, 0 }; static cilist io___41 = { 0, 0, 0, fmt_9993, 0 }; static cilist io___42 = { 0, 0, 0, fmt_9993, 0 }; static cilist io___43 = { 0, 0, 0, fmt_9992, 0 }; static cilist io___44 = { 0, 0, 0, fmt_9992, 0 }; static cilist io___45 = { 0, 0, 0, fmt_9992, 0 }; static cilist io___46 = { 0, 0, 0, fmt_9992, 0 }; static cilist io___47 = { 0, 0, 0, fmt_9991, 0 }; static cilist io___48 = { 0, 0, 0, fmt_9991, 0 }; static cilist io___49 = { 0, 0, 0, fmt_9990, 0 }; #define names_ref(a_0,a_1) &names[(a_1)*6 + a_0 - 6] #define reslts_ref(a_1,a_2,a_3) reslts[((a_3)*reslts_dim2 + (a_2))*\ reslts_dim1 + a_1] /* -- LAPACK timing routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University March 31, 1993 Purpose ======= ZTIMB3 times the Level 3 BLAS routines. Arguments ========= LINE (input) CHARACTER*80 The input line that requested this routine. The first six characters contain either the name of a subroutine or a generic path name. The remaining characters may be used to specify the individual routines to be timed. See ATIMIN for a full description of the format of the input line. NM (input) INTEGER The number of values of M contained in the vector MVAL. MVAL (input) INTEGER array, dimension (NM) The values of the matrix row dimension M. 
NN (input) INTEGER The number of values of N contained in the vector NVAL. NVAL (input) INTEGER array, dimension (NN) The values of the matrix column dimension N. NK (input) INTEGER The number of values of K contained in the vector KVAL. KVAL (input) INTEGER array, dimension (NK) The values of K. K is used as the intermediate matrix dimension for ZGEMM (the product of an M x K matrix and a K x N matrix) and as the dimension of the rank-K update in ZHERK and ZSYRK. NLDA (input) INTEGER The number of values of LDA contained in the vector LDAVAL. LDAVAL (input) INTEGER array, dimension (NLDA) The values of the leading dimension of the array A. TIMMIN (input) DOUBLE PRECISION The minimum time a subroutine will be timed. A (workspace) COMPLEX*16 array, dimension (LDAMAX*NMAX) where LDAMAX and NMAX are the maximum values permitted for LDA and N. B (workspace) COMPLEX*16 array, dimension (LDAMAX*NMAX) C (workspace) COMPLEX*16 array, dimension (LDAMAX*NMAX) RESLTS (output) DOUBLE PRECISION array, dimension (LDR1,LDR2,NLDA) The timing results for each subroutine over the relevant values of M, N, K, and LDA. LDR1 (input) INTEGER The first dimension of RESLTS. LDR1 >= max(1,NM,NK). LDR2 (input) INTEGER The second dimension of RESLTS. LDR2 >= max(1,NN). NOUT (input) INTEGER The unit number for output. ===================================================================== Parameter adjustments */ --mval; --nval; --kval; --ldaval; --a; --b; --c__; reslts_dim1 = *ldr1; reslts_dim2 = *ldr2; reslts_offset = 1 + reslts_dim1 * (1 + reslts_dim2 * 1); reslts -= reslts_offset; /* Function Body Extract the timing request from the input line. */ s_copy(path, "Zomplex precision", (ftnlen)1, (ftnlen)17); s_copy(path + 1, "B3", (ftnlen)2, (ftnlen)2); atimin_(path, line, &c__9, names, timsub, nout, &info, (ftnlen)3, line_len, (ftnlen)6); if (info != 0) { goto L690; } /* Check that M <= LDA. 
*/ s_copy(cname, line, (ftnlen)6, (ftnlen)6); atimck_(&c__1, cname, nm, &mval[1], nlda, &ldaval[1], nout, &info, ( ftnlen)6); if (info > 0) { io___9.ciunit = *nout; s_wsfe(&io___9); do_fio(&c__1, cname, (ftnlen)6); e_wsfe(); goto L690; } /* Time each routine. */ for (isub = 1; isub <= 9; ++isub) { if (! timsub[isub - 1]) { goto L680; } /* Print header. */ s_copy(cname, names_ref(0, isub), (ftnlen)6, (ftnlen)6); io___11.ciunit = *nout; s_wsfe(&io___11); do_fio(&c__1, cname, (ftnlen)6); e_wsfe(); if (*nlda == 1) { io___12.ciunit = *nout; s_wsfe(&io___12); do_fio(&c__1, (char *)&ldaval[1], (ftnlen)sizeof(integer)); e_wsfe(); } else { i__1 = *nlda; for (i__ = 1; i__ <= i__1; ++i__) { io___14.ciunit = *nout; s_wsfe(&io___14); do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); do_fio(&c__1, (char *)&ldaval[i__], (ftnlen)sizeof(integer)); e_wsfe(); /* L10: */ } } /* Time ZGEMM */ if (s_cmp(cname, "ZGEMM ", (ftnlen)6, (ftnlen)6) == 0) { for (ita = 1; ita <= 3; ++ita) { *(unsigned char *)transa = *(unsigned char *)&trans[ita - 1]; for (itb = 1; itb <= 3; ++itb) { *(unsigned char *)transb = *(unsigned char *)&trans[itb - 1]; i__1 = *nk; for (ik = 1; ik <= i__1; ++ik) { k = kval[ik]; i__2 = *nlda; for (ilda = 1; ilda <= i__2; ++ilda) { lda = ldaval[ilda]; i__3 = *nm; for (im = 1; im <= i__3; ++im) { m = mval[im]; i__4 = *nn; for (in = 1; in <= i__4; ++in) { n = nval[in]; if (*(unsigned char *)transa == 'N') { ztimmg_(&c__1, &m, &k, &a[1], &lda, & c__0, &c__0); } else { ztimmg_(&c__1, &k, &m, &a[1], &lda, & c__0, &c__0); } if (*(unsigned char *)transb == 'N') { ztimmg_(&c__0, &k, &n, &b[1], &lda, & c__0, &c__0); } else { ztimmg_(&c__0, &n, &k, &b[1], &lda, & c__0, &c__0); } ztimmg_(&c__1, &m, &n, &c__[1], &lda, & c__0, &c__0); ic = 0; s1 = dsecnd_(); L20: zgemm_(transa, transb, &m, &n, &k, &c_b1, &a[1], &lda, &b[1], &lda, &c_b1, & c__[1], &lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&c__1, &m, &n, &c__[1], &lda, &c__0, &c__0); goto L20; } /* 
Subtract the time used in ZTIMMG. */ icl = 1; s1 = dsecnd_(); L30: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&c__1, &m, &n, &c__[1], &lda, &c__0, &c__0); goto L30; } time = (time - untime) / (doublereal) ic; ops = dopbl3_(cname, &m, &n, &k); reslts_ref(im, in, ilda) = dmflop_(&ops, & time, &c__0); /* L40: */ } /* L50: */ } /* L60: */ } if (ik == 1) { io___34.ciunit = *nout; s_wsfe(&io___34); do_fio(&c__1, transa, (ftnlen)1); do_fio(&c__1, transb, (ftnlen)1); e_wsfe(); } io___35.ciunit = *nout; s_wsfe(&io___35); do_fio(&c__1, (char *)&kval[ik], (ftnlen)sizeof( integer)); e_wsfe(); dprtbl_("M", "N", nm, &mval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, ( ftnlen)1, (ftnlen)1); /* L70: */ } /* L80: */ } /* L90: */ } /* Time ZHEMM */ } else if (s_cmp(cname, "ZHEMM ", (ftnlen)6, (ftnlen)6) == 0) { for (iside = 1; iside <= 2; ++iside) { *(unsigned char *)side = *(unsigned char *)&sides[iside - 1]; for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; if (lsame_(uplo, "U")) { imat = 6; } else { imat = -6; } i__1 = *nlda; for (ilda = 1; ilda <= i__1; ++ilda) { lda = ldaval[ilda]; i__2 = *nm; for (im = 1; im <= i__2; ++im) { m = mval[im]; i__3 = *nn; for (in = 1; in <= i__3; ++in) { n = nval[in]; if (iside == 1) { ztimmg_(&imat, &m, &m, &a[1], &lda, &c__0, &c__0); ztimmg_(&c__0, &m, &n, &b[1], &lda, &c__0, &c__0); } else { ztimmg_(&c__0, &m, &n, &b[1], &lda, &c__0, &c__0); ztimmg_(&imat, &n, &n, &a[1], &lda, &c__0, &c__0); } ztimmg_(&c__1, &m, &n, &c__[1], &lda, &c__0, & c__0); ic = 0; s1 = dsecnd_(); L100: zhemm_(side, uplo, &m, &n, &c_b1, &a[1], &lda, &b[1], &lda, &c_b1, &c__[1], &lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&c__1, &m, &n, &c__[1], &lda, & c__0, &c__0); goto L100; } /* Subtract the time used in ZTIMMG. 
*/ icl = 1; s1 = dsecnd_(); L110: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&c__1, &m, &n, &c__[1], &lda, & c__0, &c__0); goto L110; } time = (time - untime) / (doublereal) ic; i__4 = iside - 1; ops = dopbl3_(cname, &m, &n, &i__4) ; reslts_ref(im, in, ilda) = dmflop_(&ops, & time, &c__0); /* L120: */ } /* L130: */ } /* L140: */ } io___41.ciunit = *nout; s_wsfe(&io___41); do_fio(&c__1, "ZHEMM ", (ftnlen)6); do_fio(&c__1, side, (ftnlen)1); do_fio(&c__1, uplo, (ftnlen)1); e_wsfe(); dprtbl_("M", "N", nm, &mval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, (ftnlen) 1, (ftnlen)1); /* L150: */ } /* L160: */ } /* Time ZSYMM */ } else if (s_cmp(cname, "ZSYMM ", (ftnlen)6, (ftnlen)6) == 0) { for (iside = 1; iside <= 2; ++iside) { *(unsigned char *)side = *(unsigned char *)&sides[iside - 1]; for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; if (lsame_(uplo, "U")) { imat = 8; } else { imat = -8; } i__1 = *nlda; for (ilda = 1; ilda <= i__1; ++ilda) { lda = ldaval[ilda]; i__2 = *nm; for (im = 1; im <= i__2; ++im) { m = mval[im]; i__3 = *nn; for (in = 1; in <= i__3; ++in) { n = nval[in]; if (iside == 1) { ztimmg_(&imat, &m, &m, &a[1], &lda, &c__0, &c__0); ztimmg_(&c__0, &m, &n, &b[1], &lda, &c__0, &c__0); } else { ztimmg_(&c__0, &m, &n, &b[1], &lda, &c__0, &c__0); ztimmg_(&imat, &n, &n, &a[1], &lda, &c__0, &c__0); } ztimmg_(&c__1, &m, &n, &c__[1], &lda, &c__0, & c__0); ic = 0; s1 = dsecnd_(); L170: zsymm_(side, uplo, &m, &n, &c_b1, &a[1], &lda, &b[1], &lda, &c_b1, &c__[1], &lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&c__1, &m, &n, &c__[1], &lda, & c__0, &c__0); goto L170; } /* Subtract the time used in ZTIMMG. 
*/ icl = 1; s1 = dsecnd_(); L180: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&c__1, &m, &n, &c__[1], &lda, & c__0, &c__0); goto L180; } time = (time - untime) / (doublereal) ic; i__4 = iside - 1; ops = dopbl3_(cname, &m, &n, &i__4) ; reslts_ref(im, in, ilda) = dmflop_(&ops, & time, &c__0); /* L190: */ } /* L200: */ } /* L210: */ } io___42.ciunit = *nout; s_wsfe(&io___42); do_fio(&c__1, "ZSYMM ", (ftnlen)6); do_fio(&c__1, side, (ftnlen)1); do_fio(&c__1, uplo, (ftnlen)1); e_wsfe(); dprtbl_("M", "N", nm, &mval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, (ftnlen) 1, (ftnlen)1); /* L220: */ } /* L230: */ } /* Time ZHERK */ } else if (s_cmp(cname, "ZHERK ", (ftnlen)6, (ftnlen)6) == 0) { for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; if (lsame_(uplo, "U")) { imat = 6; } else { imat = -6; } for (ita = 1; ita <= 3; ++ita) { *(unsigned char *)transa = *(unsigned char *)&trans[ita - 1]; if (*(unsigned char *)transa != 'T') { i__1 = *nlda; for (ilda = 1; ilda <= i__1; ++ilda) { lda = ldaval[ilda]; i__2 = *nk; for (ik = 1; ik <= i__2; ++ik) { k = kval[ik]; if (*(unsigned char *)transa == 'N') { ztimmg_(&c__1, &n, &k, &a[1], &lda, &c__0, &c__0); } else { ztimmg_(&c__1, &k, &n, &a[1], &lda, &c__0, &c__0); } i__3 = *nn; for (in = 1; in <= i__3; ++in) { n = nval[in]; ztimmg_(&imat, &n, &n, &c__[1], &lda, & c__0, &c__0); ic = 0; s1 = dsecnd_(); L240: zherk_(uplo, transa, &n, &k, &c_b156, &a[ 1], &lda, &c_b156, &c__[1], &lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&imat, &n, &n, &c__[1], &lda, &c__0, &c__0); goto L240; } /* Subtract the time used in ZTIMMG. 
*/ icl = 1; s1 = dsecnd_(); L250: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&imat, &n, &n, &c__[1], &lda, &c__0, &c__0); goto L250; } time = (time - untime) / (doublereal) ic; ops = dopbl3_(cname, &n, &n, &k); reslts_ref(ik, in, ilda) = dmflop_(&ops, & time, &c__0); /* L260: */ } /* L270: */ } /* L280: */ } io___43.ciunit = *nout; s_wsfe(&io___43); do_fio(&c__1, cname, (ftnlen)6); do_fio(&c__1, uplo, (ftnlen)1); do_fio(&c__1, transa, (ftnlen)1); e_wsfe(); dprtbl_("K", "N", nk, &kval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, ( ftnlen)1, (ftnlen)1); } /* L290: */ } /* L300: */ } /* Time ZHER2K */ } else if (s_cmp(cname, "ZHER2K", (ftnlen)6, (ftnlen)6) == 0) { for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; if (lsame_(uplo, "U")) { imat = 6; } else { imat = -6; } for (itb = 1; itb <= 3; ++itb) { *(unsigned char *)transb = *(unsigned char *)&trans[itb - 1]; if (*(unsigned char *)transb != 'T') { i__1 = *nlda; for (ilda = 1; ilda <= i__1; ++ilda) { lda = ldaval[ilda]; i__2 = *nk; for (ik = 1; ik <= i__2; ++ik) { k = kval[ik]; if (*(unsigned char *)transb == 'N') { ztimmg_(&c__1, &n, &k, &a[1], &lda, &c__0, &c__0); ztimmg_(&c__0, &n, &k, &b[1], &lda, &c__0, &c__0); } else { ztimmg_(&c__1, &k, &n, &a[1], &lda, &c__0, &c__0); ztimmg_(&c__0, &k, &n, &b[1], &lda, &c__0, &c__0); } i__3 = *nn; for (in = 1; in <= i__3; ++in) { n = nval[in]; ztimmg_(&imat, &n, &n, &c__[1], &lda, & c__0, &c__0); ic = 0; s1 = dsecnd_(); L310: zher2k_(uplo, transb, &n, &k, &c_b1, &a[1] , &lda, &b[1], &lda, &c_b156, & c__[1], &lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&imat, &n, &n, &c__[1], &lda, &c__0, &c__0); goto L310; } /* Subtract the time used in ZTIMMG. 
*/ icl = 1; s1 = dsecnd_(); L320: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&imat, &n, &n, &c__[1], &lda, &c__0, &c__0); goto L320; } time = (time - untime) / (doublereal) ic; ops = dopbl3_(cname, &n, &n, &k); reslts_ref(ik, in, ilda) = dmflop_(&ops, & time, &c__0); /* L330: */ } /* L340: */ } /* L350: */ } io___44.ciunit = *nout; s_wsfe(&io___44); do_fio(&c__1, cname, (ftnlen)6); do_fio(&c__1, uplo, (ftnlen)1); do_fio(&c__1, transb, (ftnlen)1); e_wsfe(); dprtbl_("K", "N", nk, &kval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, ( ftnlen)1, (ftnlen)1); } /* L360: */ } /* L370: */ } /* Time ZSYRK */ } else if (s_cmp(cname, "ZSYRK ", (ftnlen)6, (ftnlen)6) == 0) { for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; if (lsame_(uplo, "U")) { imat = 8; } else { imat = -8; } for (ita = 1; ita <= 3; ++ita) { *(unsigned char *)transa = *(unsigned char *)&trans[ita - 1]; if (*(unsigned char *)transa != 'C') { i__1 = *nlda; for (ilda = 1; ilda <= i__1; ++ilda) { lda = ldaval[ilda]; i__2 = *nk; for (ik = 1; ik <= i__2; ++ik) { k = kval[ik]; if (*(unsigned char *)transa == 'N') { ztimmg_(&c__1, &n, &k, &a[1], &lda, &c__0, &c__0); } else { ztimmg_(&c__1, &k, &n, &a[1], &lda, &c__0, &c__0); } i__3 = *nn; for (in = 1; in <= i__3; ++in) { n = nval[in]; ztimmg_(&imat, &n, &n, &c__[1], &lda, & c__0, &c__0); ic = 0; s1 = dsecnd_(); L380: zsyrk_(uplo, transa, &n, &k, &c_b1, &a[1], &lda, &c_b1, &c__[1], &lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&imat, &n, &n, &c__[1], &lda, &c__0, &c__0); goto L380; } /* Subtract the time used in ZTIMMG. 
*/ icl = 1; s1 = dsecnd_(); L390: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&imat, &n, &n, &c__[1], &lda, &c__0, &c__0); goto L390; } time = (time - untime) / (doublereal) ic; ops = dopbl3_(cname, &n, &n, &k); reslts_ref(ik, in, ilda) = dmflop_(&ops, & time, &c__0); /* L400: */ } /* L410: */ } /* L420: */ } io___45.ciunit = *nout; s_wsfe(&io___45); do_fio(&c__1, cname, (ftnlen)6); do_fio(&c__1, uplo, (ftnlen)1); do_fio(&c__1, transa, (ftnlen)1); e_wsfe(); dprtbl_("K", "N", nk, &kval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, ( ftnlen)1, (ftnlen)1); } /* L430: */ } /* L440: */ } /* Time ZSYR2K */ } else if (s_cmp(cname, "ZSYR2K", (ftnlen)6, (ftnlen)6) == 0) { for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; if (lsame_(uplo, "U")) { imat = 8; } else { imat = -8; } for (itb = 1; itb <= 3; ++itb) { *(unsigned char *)transb = *(unsigned char *)&trans[itb - 1]; if (*(unsigned char *)transb != 'C') { i__1 = *nlda; for (ilda = 1; ilda <= i__1; ++ilda) { lda = ldaval[ilda]; i__2 = *nk; for (ik = 1; ik <= i__2; ++ik) { k = kval[ik]; if (*(unsigned char *)transb == 'N') { ztimmg_(&c__1, &n, &k, &a[1], &lda, &c__0, &c__0); ztimmg_(&c__0, &n, &k, &b[1], &lda, &c__0, &c__0); } else { ztimmg_(&c__1, &k, &n, &a[1], &lda, &c__0, &c__0); ztimmg_(&c__0, &k, &n, &b[1], &lda, &c__0, &c__0); } i__3 = *nn; for (in = 1; in <= i__3; ++in) { n = nval[in]; ztimmg_(&imat, &n, &n, &c__[1], &lda, & c__0, &c__0); ic = 0; s1 = dsecnd_(); L450: zsyr2k_(uplo, transb, &n, &k, &c_b1, &a[1] , &lda, &b[1], &lda, &c_b1, &c__[ 1], &lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&imat, &n, &n, &c__[1], &lda, &c__0, &c__0); goto L450; } /* Subtract the time used in ZTIMMG. 
*/ icl = 1; s1 = dsecnd_(); L460: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&imat, &n, &n, &c__[1], &lda, &c__0, &c__0); goto L460; } time = (time - untime) / (doublereal) ic; ops = dopbl3_(cname, &n, &n, &k); reslts_ref(ik, in, ilda) = dmflop_(&ops, & time, &c__0); /* L470: */ } /* L480: */ } /* L490: */ } io___46.ciunit = *nout; s_wsfe(&io___46); do_fio(&c__1, cname, (ftnlen)6); do_fio(&c__1, uplo, (ftnlen)1); do_fio(&c__1, transb, (ftnlen)1); e_wsfe(); dprtbl_("K", "N", nk, &kval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, ( ftnlen)1, (ftnlen)1); } /* L500: */ } /* L510: */ } /* Time ZTRMM */ } else if (s_cmp(cname, "ZTRMM ", (ftnlen)6, (ftnlen)6) == 0) { for (iside = 1; iside <= 2; ++iside) { *(unsigned char *)side = *(unsigned char *)&sides[iside - 1]; for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; if (lsame_(uplo, "U")) { imat = 11; } else { imat = -11; } for (ita = 1; ita <= 3; ++ita) { *(unsigned char *)transa = *(unsigned char *)&trans[ ita - 1]; i__1 = *nlda; for (ilda = 1; ilda <= i__1; ++ilda) { lda = ldaval[ilda]; i__2 = *nm; for (im = 1; im <= i__2; ++im) { m = mval[im]; i__3 = *nn; for (in = 1; in <= i__3; ++in) { n = nval[in]; if (iside == 1) { ztimmg_(&imat, &m, &m, &a[1], &lda, & c__0, &c__0); } else { ztimmg_(&imat, &n, &n, &a[1], &lda, & c__0, &c__0); } ztimmg_(&c__0, &m, &n, &b[1], &lda, &c__0, &c__0); ic = 0; s1 = dsecnd_(); L520: ztrmm_(side, uplo, transa, "Non-unit", &m, &n, &c_b1, &a[1], &lda, &b[1], & lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&c__0, &m, &n, &b[1], &lda, & c__0, &c__0); goto L520; } /* Subtract the time used in ZTIMMG. 
*/ icl = 1; s1 = dsecnd_(); L530: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&c__0, &m, &n, &b[1], &lda, & c__0, &c__0); goto L530; } time = (time - untime) / (doublereal) ic; i__4 = iside - 1; ops = dopbl3_(cname, &m, &n, &i__4); reslts_ref(im, in, ilda) = dmflop_(&ops, & time, &c__0); /* L540: */ } /* L550: */ } /* L560: */ } io___47.ciunit = *nout; s_wsfe(&io___47); do_fio(&c__1, cname, (ftnlen)6); do_fio(&c__1, side, (ftnlen)1); do_fio(&c__1, uplo, (ftnlen)1); do_fio(&c__1, transa, (ftnlen)1); e_wsfe(); dprtbl_("M", "N", nm, &mval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, ( ftnlen)1, (ftnlen)1); /* L570: */ } /* L580: */ } /* L590: */ } /* Time ZTRSM */ } else if (s_cmp(cname, "ZTRSM ", (ftnlen)6, (ftnlen)6) == 0) { for (iside = 1; iside <= 2; ++iside) { *(unsigned char *)side = *(unsigned char *)&sides[iside - 1]; for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; if (lsame_(uplo, "U")) { imat = 11; } else { imat = -11; } for (ita = 1; ita <= 3; ++ita) { *(unsigned char *)transa = *(unsigned char *)&trans[ ita - 1]; i__1 = *nlda; for (ilda = 1; ilda <= i__1; ++ilda) { lda = ldaval[ilda]; i__2 = *nm; for (im = 1; im <= i__2; ++im) { m = mval[im]; i__3 = *nn; for (in = 1; in <= i__3; ++in) { n = nval[in]; if (iside == 1) { ztimmg_(&imat, &m, &m, &a[1], &lda, & c__0, &c__0); } else { ztimmg_(&imat, &n, &n, &a[1], &lda, & c__0, &c__0); } ztimmg_(&c__0, &m, &n, &b[1], &lda, &c__0, &c__0); ic = 0; s1 = dsecnd_(); L600: ztrsm_(side, uplo, transa, "Non-unit", &m, &n, &c_b1, &a[1], &lda, &b[1], & lda); s2 = dsecnd_(); time = s2 - s1; ++ic; if (time < *timmin) { ztimmg_(&c__0, &m, &n, &b[1], &lda, & c__0, &c__0); goto L600; } /* Subtract the time used in ZTIMMG. 
*/ icl = 1; s1 = dsecnd_(); L610: s2 = dsecnd_(); untime = s2 - s1; ++icl; if (icl <= ic) { ztimmg_(&c__0, &m, &n, &b[1], &lda, & c__0, &c__0); goto L610; } time = (time - untime) / (doublereal) ic; i__4 = iside - 1; ops = dopbl3_(cname, &m, &n, &i__4); reslts_ref(im, in, ilda) = dmflop_(&ops, & time, &c__0); /* L620: */ } /* L630: */ } /* L640: */ } io___48.ciunit = *nout; s_wsfe(&io___48); do_fio(&c__1, cname, (ftnlen)6); do_fio(&c__1, side, (ftnlen)1); do_fio(&c__1, uplo, (ftnlen)1); do_fio(&c__1, transa, (ftnlen)1); e_wsfe(); dprtbl_("M", "N", nm, &mval[1], nn, &nval[1], nlda, & reslts[reslts_offset], ldr1, ldr2, nout, ( ftnlen)1, (ftnlen)1); /* L650: */ } /* L660: */ } /* L670: */ } } io___49.ciunit = *nout; s_wsfe(&io___49); e_wsfe(); L680: ; } L690: return 0; /* End of ZTIMB3 */ } /* ztimb3_ */
/* Subroutine */ int zgeqrs_(integer *m, integer *n, integer *nrhs,
        doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b,
        integer *ldb, doublecomplex *work, integer *lwork, integer *info)
{
    /* BLAS / LAPACK routines called below */
    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
            integer *, integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *), xerbla_(char *, integer *), zunmqr_(
            char *, char *, integer *, integer *, integer *, doublecomplex *,
            integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *, integer *);

    /* Locals */
    integer min_ld;     /* smallest admissible leading dimension: max(1,M) */
    integer bad_arg;    /* 1-based position of the first illegal argument */

/*  ZGEQRS  -- LAPACK routine (version 3.1), November 2006 */

/*  Purpose */
/*  ======= */

/*  Solves the least squares problem  min || A*X - B ||  from the QR */
/*  factorization A = Q*R previously computed by ZGEQRF.  This is done */
/*  in two steps: B := Q' * B (ZUNMQR), then R*X = B(1:n,:) (ZTRSM). */

/*  Arguments */
/*  ========= */

/*  M      (input)  rows of A.                    M >= 0. */
/*  N      (input)  columns of A.                 M >= N >= 0. */
/*  NRHS   (input)  columns of B.                 NRHS >= 0. */
/*  A      (input)  COMPLEX*16 (LDA,N): QR factorization details as */
/*                  returned by ZGEQRF. */
/*  LDA    (input)  leading dimension of A.       LDA >= M. */
/*  TAU    (input)  COMPLEX*16 (N): details of the orthogonal matrix Q. */
/*  B      (in/out) COMPLEX*16 (LDB,NRHS): on entry the m-by-nrhs right */
/*                  hand side; on exit the n-by-nrhs solution X. */
/*  LDB    (input)  leading dimension of B.       LDB >= M. */
/*  WORK   (workspace) COMPLEX*16 (LWORK). */
/*  LWORK  (input)  length of WORK; at least NRHS, and NRHS*NB (NB = */
/*                  block size for this environment) is recommended. */
/*  INFO   (output) = 0: successful exit */
/*                  < 0: if INFO = -i, the i-th argument was illegal */

/*  ===================================================================== */

    /* Validate the arguments; the first violation found is reported. */
    *info = 0;
    min_ld = max(1,*m);
    bad_arg = 0;

    if (*m < 0) {
        bad_arg = 1;
    } else if (*n < 0 || *n > *m) {
        bad_arg = 2;
    } else if (*nrhs < 0) {
        bad_arg = 3;
    } else if (*lda < min_ld) {
        bad_arg = 5;
    } else if (*ldb < min_ld) {
        bad_arg = 8;
    } else if (*lwork < 1 || (*lwork < *nrhs && *m > 0 && *n > 0)) {
        /* LWORK must be positive, and >= NRHS whenever there is work to do */
        bad_arg = 10;
    }

    if (bad_arg != 0) {
        *info = -bad_arg;
        xerbla_("ZGEQRS", &bad_arg);
        return 0;
    }

    /* Nothing to compute for an empty problem. */
    if (*n == 0 || *nrhs == 0 || *m == 0) {
        return 0;
    }

    /* B := Q' * B */
    zunmqr_("Left", "Conjugate transpose", m, nrhs, n, a, lda, tau, b, ldb,
            work, lwork, info);

    /* Solve R*X = B(1:n,:) */
    ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, a,
            lda, b, ldb);

    return 0;

/*     End of ZGEQRS */

} /* zgeqrs_ */
/* Subroutine */ int zgetrs_(char *trans, integer *n, integer *nrhs,
        doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b,
        integer *ldb, integer *info)
{
    /* BLAS / LAPACK routines called below */
    extern logical lsame_(char *, char *);
    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
            integer *, integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *), xerbla_(char *, integer *);
    extern /* Subroutine */ int zlaswp_(integer *, doublecomplex *, integer *,
            integer *, integer *, integer *, integer *);

    /* Locals */
    logical notran;     /* true when TRANS selects the untransposed system */
    integer min_ld;     /* smallest admissible leading dimension: max(1,N) */
    integer bad_arg;    /* 1-based position of the first illegal argument */

/*  ZGETRS  -- LAPACK routine (version 3.2), November 2006 */

/*  Purpose */
/*  ======= */

/*  Solves one of the systems */
/*     A * X = B,  A**T * X = B,  or  A**H * X = B */
/*  for a general N-by-N matrix A, using the LU factorization produced */
/*  by ZGETRF. */

/*  Arguments */
/*  ========= */

/*  TRANS  (input)  CHARACTER*1, form of the system: */
/*                  'N': A * X = B     (No transpose) */
/*                  'T': A**T * X = B  (Transpose) */
/*                  'C': A**H * X = B  (Conjugate transpose) */
/*  N      (input)  order of A.                        N >= 0. */
/*  NRHS   (input)  number of right hand sides, i.e. columns of B. */
/*                  NRHS >= 0. */
/*  A      (input)  COMPLEX*16 (LDA,N): the factors L and U of */
/*                  A = P*L*U as computed by ZGETRF. */
/*  LDA    (input)  leading dimension of A.            LDA >= max(1,N). */
/*  IPIV   (input)  INTEGER (N): pivot indices from ZGETRF; row i of the */
/*                  matrix was interchanged with row IPIV(i). */
/*  B      (in/out) COMPLEX*16 (LDB,NRHS): right hand sides on entry, */
/*                  solution X on exit. */
/*  LDB    (input)  leading dimension of B.            LDB >= max(1,N). */
/*  INFO   (output) = 0: successful exit */
/*                  < 0: if INFO = -i, the i-th argument was illegal */

/*  ===================================================================== */

    /* Validate the arguments; the first violation found is reported. */
    *info = 0;
    notran = lsame_(trans, "N");
    min_ld = max(1,*n);
    bad_arg = 0;

    if (! (notran || lsame_(trans, "T") || lsame_(trans, "C"))) {
        bad_arg = 1;
    } else if (*n < 0) {
        bad_arg = 2;
    } else if (*nrhs < 0) {
        bad_arg = 3;
    } else if (*lda < min_ld) {
        bad_arg = 5;
    } else if (*ldb < min_ld) {
        bad_arg = 8;
    }

    if (bad_arg != 0) {
        *info = -bad_arg;
        xerbla_("ZGETRS", &bad_arg);
        return 0;
    }

    /* Nothing to compute for an empty problem. */
    if (*n == 0 || *nrhs == 0) {
        return 0;
    }

    if (notran) {

        /* Solve A * X = B: permute the right hand sides first ... */
        zlaswp_(nrhs, b, ldb, &c__1, n, ipiv, &c__1);

        /* ... then L*X = B (unit lower triangle), overwriting B ... */
        ztrsm_("Left", "Lower", "No transpose", "Unit", n, nrhs, &c_b1, a,
                lda, b, ldb);

        /* ... and finally U*X = B, overwriting B with the solution. */
        ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1,
                a, lda, b, ldb);

        return 0;
    }

    /* Solve A**T * X = B or A**H * X = B: the factors are applied in the */
    /* opposite order, U' first ... */
    ztrsm_("Left", "Upper", trans, "Non-unit", n, nrhs, &c_b1, a, lda, b,
            ldb);

    /* ... then L' (unit diagonal) ... */
    ztrsm_("Left", "Lower", trans, "Unit", n, nrhs, &c_b1, a, lda, b, ldb);

    /* ... and the row interchanges are undone on the solution vectors. */
    zlaswp_(nrhs, b, ldb, &c__1, n, ipiv, &c_n1);

    return 0;

/*     End of ZGETRS */

} /* zgetrs_ */
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PZGSTRS solves a system of distributed linear equations * A*X = B with a general N-by-N matrix A using the LU factorization * computed by PZGSTRF. * If the equilibration, and row and column permutations were performed, * the LU factorization was performed for A1 where * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * and the linear system solved is * A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and * the permutation to B1 by Pc*Pr is applied internally in this routine. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures storing L and U factors. * The L and U factors are obtained from PZGSTRF for * the possibly scaled and permuted matrix A. * See superlu_zdefs.h for the definition of 'LUstruct_t'. * A may be scaled and permuted into A1, so that * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_defs.h for the definition of 'gridinfo_t'. * * B (input/output) doublecomplex* * On entry, the distributed right-hand side matrix of the possibly * equilibrated system. That is, B may be overwritten by diag(R)*B. * On exit, the distributed solution matrix Y of the possibly * equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X, * and X is the solution of the original system. * * m_loc (input) int (local) * The local row dimension of matrix B. 
* * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * SOLVEstruct (output) SOLVEstruct_t* (global) * Contains the information for the communication during the * solution phase. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. 
*/ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. 
*/ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. 
*/ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. 
*/ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. 
*/ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. 
*/ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
/* Subroutine */ int zgetrf_(integer *m, integer *n, doublecomplex *a,
        integer *lda, integer *ipiv, integer *info)
{
    /* System generated locals */
    integer a_dim1, a_offset, i__1, i__2, i__3;
    doublecomplex z__1;

    /* Local variables */
    integer i__, j, ipivstart, jpivstart, jp;
    doublecomplex tmp;
    integer kcols;
    doublereal sfmin;
    extern /* Subroutine */ int zscal_(integer *, doublecomplex *,
            doublecomplex *, integer *), zgemm_(char *, char *, integer *,
            integer *, integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *, doublecomplex *, doublecomplex *,
            integer *);
    integer nstep;
    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
            integer *, integer *, doublecomplex *, doublecomplex *, integer *,
            doublecomplex *, integer *);
    integer kahead;
    extern doublereal dlamch_(char *);
    extern logical disnan_(doublereal *);
    doublereal pivmag;
    integer npived;
    extern integer izamax_(integer *, doublecomplex *, integer *);
    integer kstart, ntopiv;
    extern /* Subroutine */ int zlaswp_(integer *, doublecomplex *, integer *,
            integer *, integer *, integer *, integer *);

/*  -- LAPACK routine (version 3.X) -- */
/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
/*     May 2008 */

/*  Purpose */
/*  ======= */

/*  ZGETRF computes an LU factorization of a general M-by-N matrix A */
/*  using partial pivoting with row interchanges. */

/*  The factorization has the form */
/*     A = P * L * U */
/*  where P is a permutation matrix, L is lower triangular with unit */
/*  diagonal elements (lower trapezoidal if m > n), and U is upper */
/*  triangular (upper trapezoidal if m < n). */

/*  This code implements an iterative version of Sivan Toledo's recursive */
/*  LU algorithm[1].  For square matrices, this iterative versions should */
/*  be within a factor of two of the optimum number of memory transfers. */

/*  The pattern is as follows, with the large blocks of U being updated */
/*  in one call to ZTRSM, and the dotted lines denoting sections that */
/*  have had all pending permutations applied: */

/*   1 2 3 4 5 6 7 8 */
/*  +-+-+---+-------+------ */
/*  | |1|   |       | */
/*  |.+-+ 2 |       | */
/*  | | |   |       | */
/*  |.|.+-+-+   4   | */
/*  | | | |1|       | */
/*  | | |.+-+       | */
/*  | | | | |       | */
/*  |.|.|.|.+-+-+---+  8 */
/*  | | | | | |1|   | */
/*  | | | | |.+-+ 2 | */
/*  | | | | | | |   | */
/*  | | | | |.|.+-+-+ */
/*  | | | | | | | |1| */
/*  | | | | | | |.+-+ */
/*  | | | | | | | | | */
/*  |.|.|.|.|.|.|.|.+----- */
/*  | | | | | | | | | */

/*  The 1-2-1-4-1-2-1-8-... pattern is the position of the last 1 bit in */
/*  the binary expansion of the current column.  Each Schur update is */
/*  applied as soon as the necessary portion of U is available. */

/*  [1] Toledo, S. 1997. Locality of Reference in LU Decomposition with */
/*  Partial Pivoting. SIAM J. Matrix Anal. Appl. 18, 4 (Oct. 1997), */
/*  1065-1081. http://dx.doi.org/10.1137/S0895479896297744 */

/*  Arguments */
/*  ========= */

/*  M       (input) INTEGER */
/*          The number of rows of the matrix A.  M >= 0. */

/*  N       (input) INTEGER */
/*          The number of columns of the matrix A.  N >= 0. */

/*  A       (input/output) COMPLEX*16 array, dimension (LDA,N) */
/*          On entry, the M-by-N matrix to be factored. */
/*          On exit, the factors L and U from the factorization */
/*          A = P*L*U; the unit diagonal elements of L are not stored. */

/*  LDA     (input) INTEGER */
/*          The leading dimension of the array A.  LDA >= max(1,M). */

/*  IPIV    (output) INTEGER array, dimension (min(M,N)) */
/*          The pivot indices; for 1 <= i <= min(M,N), row i of the */
/*          matrix was interchanged with row IPIV(i). */

/*  INFO    (output) INTEGER */
/*          = 0:  successful exit */
/*          < 0:  if INFO = -i, the i-th argument had an illegal value */
/*          > 0:  if INFO = i, U(i,i) is exactly zero. The factorization */
/*                has been completed, but the factor U is exactly */
/*                singular, and division by zero will occur if it is used */
/*                to solve a system of equations. */

/*  ===================================================================== */

    /* Parameter adjustments: shift the pointers so that a[i + j*a_dim1] */
    /* and ipiv[i] can be addressed with Fortran-style 1-based indices. */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;

    /* Test the input parameters. */
    *info = 0;
    if (*m < 0) {
        *info = -1;
    } else if (*n < 0) {
        *info = -2;
    } else if (*lda < max(1,*m)) {
        *info = -4;
    }
    if (*info != 0) {
        i__1 = -(*info);
        xerbla_("ZGETRF", &i__1);
        return 0;
    }

    /* Quick return if possible */
    if (*m == 0 || *n == 0) {
        return 0;
    }

    /* Compute machine safe minimum */
    sfmin = dlamch_("S");

    nstep = min(*m,*n);
    i__1 = nstep;
    for (j = 1; j <= i__1; ++j) {
        /* kahead = lowest set bit of j = width of the panel kstart..j that */
        /* is complete once column j has been factored. */
        kahead = j & -j;
        kstart = j + 1 - kahead;

        /* Number of columns j+1..j+kcols to bring up to date with that */
        /* panel.  The bound must be nstep - j, not M - j: the loop only */
        /* factors columns 1..nstep, and when M > N a bound of M - j lets */
        /* j + kcols exceed N, so that ZLASWP/ZTRSM/ZGEMM below would read */
        /* and write columns outside the array.  For M <= N both bounds */
        /* coincide. */
        i__2 = kahead, i__3 = nstep - j;
        kcols = min(i__2,i__3);

        /* Find pivot. */
        i__2 = *m - j + 1;
        jp = j - 1 + izamax_(&i__2, &a[j + j * a_dim1], &c__1);
        ipiv[j] = jp;

        /* Permute just this column. */
        if (jp != j) {
            i__2 = j + j * a_dim1;
            tmp.r = a[i__2].r, tmp.i = a[i__2].i;
            i__2 = j + j * a_dim1;
            i__3 = jp + j * a_dim1;
            a[i__2].r = a[i__3].r, a[i__2].i = a[i__3].i;
            i__2 = jp + j * a_dim1;
            a[i__2].r = tmp.r, a[i__2].i = tmp.i;
        }

        /* Apply pending permutations to L: blocks of 1, 2, 4, ... columns */
        /* to the left of column j, each with the pivots it has not seen. */
        ntopiv = 1;
        ipivstart = j;
        jpivstart = j - ntopiv;
        while(ntopiv < kahead) {
            zlaswp_(&ntopiv, &a[jpivstart * a_dim1 + 1], lda, &ipivstart, &j,
                    &ipiv[1], &c__1);
            ipivstart -= ntopiv;
            ntopiv <<= 1;
            jpivstart -= ntopiv;
        }

        /* Permute U block to match L */
        zlaswp_(&kcols, &a[(j + 1) * a_dim1 + 1], lda, &kstart, &j, &ipiv[1],
                &c__1);

        /* Factor the current column */
        pivmag = z_abs(&a[j + j * a_dim1]);
        if (pivmag != 0. && ! disnan_(&pivmag)) {
            if (pivmag >= sfmin) {
                /* Pivot is safely invertible: scale by its reciprocal. */
                i__2 = *m - j;
                z_div(&z__1, &c_b1, &a[j + j * a_dim1]);
                zscal_(&i__2, &z__1, &a[j + 1 + j * a_dim1], &c__1);
            } else {
                /* Tiny pivot: divide element by element instead. */
                i__2 = *m - j;
                for (i__ = 1; i__ <= i__2; ++i__) {
                    i__3 = j + i__ + j * a_dim1;
                    z_div(&z__1, &a[j + i__ + j * a_dim1], &a[j + j * a_dim1]);
                    a[i__3].r = z__1.r, a[i__3].i = z__1.i;
                }
            }
        } else if (pivmag == 0. && *info == 0) {
            /* Record only the first exactly-zero pivot. */
            *info = j;
        }

        /* Solve for U block. */
        ztrsm_("Left", "Lower", "No transpose", "Unit", &kahead, &kcols,
                &c_b1, &a[kstart + kstart * a_dim1], lda,
                &a[kstart + (j + 1) * a_dim1], lda);

        /* Schur complement. */
        i__2 = *m - j;
        zgemm_("No transpose", "No transpose", &i__2, &kcols, &kahead, &c_b2,
                &a[j + 1 + kstart * a_dim1], lda,
                &a[kstart + (j + 1) * a_dim1], lda, &c_b1,
                &a[j + 1 + (j + 1) * a_dim1], lda);
    }

    /* Handle pivot permutations on the way out of the recursion */
    npived = nstep & -nstep;
    j = nstep - npived;
    while(j > 0) {
        ntopiv = j & -j;
        i__1 = j + 1;
        zlaswp_(&ntopiv, &a[(j - ntopiv + 1) * a_dim1 + 1], lda, &i__1,
                &nstep, &ipiv[1], &c__1);
        j -= ntopiv;
    }

    /* If short and wide, handle the rest of the columns. */
    /* (kcols is 0 here: the last loop iteration ran with j == nstep == M.) */
    if (*m < *n) {
        i__1 = *n - *m;
        zlaswp_(&i__1, &a[(*m + kcols + 1) * a_dim1 + 1], lda, &c__1, m,
                &ipiv[1], &c__1);
        i__1 = *n - *m;
        ztrsm_("Left", "Lower", "No transpose", "Unit", m, &i__1, &c_b1,
                &a[a_offset], lda, &a[(*m + kcols + 1) * a_dim1 + 1], lda);
    }

    return 0;

/*     End of ZGETRF */

} /* zgetrf_ */
/* Subroutine */ int zhegv_(integer *itype, char *jobz, char *uplo, integer * n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; /* Local variables */ integer nb, neig; extern logical lsame_(char *, char *); extern /* Subroutine */ int zheev_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, integer *); char trans[1]; logical upper, wantz; extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *); extern /* Subroutine */ int zhegst_(integer *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer lwkopt; logical lquery; extern /* Subroutine */ int zpotrf_(char *, integer *, doublecomplex *, integer *, integer *); /* -- LAPACK driver routine (version 3.4.0) -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ /* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ /* November 2011 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. 
*/ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1; b -= b_offset; --w; --work; --rwork; /* Function Body */ wantz = lsame_(jobz, "V"); upper = lsame_(uplo, "U"); lquery = *lwork == -1; *info = 0; if (*itype < 1 || *itype > 3) { *info = -1; } else if (! (wantz || lsame_(jobz, "N"))) { *info = -2; } else if (! (upper || lsame_(uplo, "L"))) { *info = -3; } else if (*n < 0) { *info = -4; } else if (*lda < max(1,*n)) { *info = -6; } else if (*ldb < max(1,*n)) { *info = -8; } if (*info == 0) { nb = ilaenv_(&c__1, "ZHETRD", uplo, n, &c_n1, &c_n1, &c_n1); /* Computing MAX */ i__1 = 1; i__2 = (nb + 1) * *n; // , expr subst lwkopt = max(i__1,i__2); work[1].r = (doublereal) lwkopt; work[1].i = 0.; // , expr subst /* Computing MAX */ i__1 = 1; i__2 = (*n << 1) - 1; // , expr subst if (*lwork < max(i__1,i__2) && ! lquery) { *info = -11; } } if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGV ", &i__1); return 0; } else if (lquery) { return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Form a Cholesky factorization of B. */ zpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; return 0; } /* Transform problem to standard eigenvalue problem and solve. */ zhegst_(itype, uplo, n, &a[a_offset], lda, &b[b_offset], ldb, info); zheev_(jobz, uplo, n, &a[a_offset], lda, &w[1], &work[1], lwork, &rwork[1] , info); if (wantz) { /* Backtransform eigenvectors to the original problem. 
*/ neig = *n; if (*info > 0) { neig = *info - 1; } if (*itype == 1 || *itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; */ /* backtransform eigenvectors: x = inv(L)**H *y or inv(U)*y */ if (upper) { *(unsigned char *)trans = 'N'; } else { *(unsigned char *)trans = 'C'; } ztrsm_("Left", uplo, trans, "Non-unit", n, &neig, &c_b1, &b[ b_offset], ldb, &a[a_offset], lda); } else if (*itype == 3) { /* For B*A*x=(lambda)*x; */ /* backtransform eigenvectors: x = L*y or U**H *y */ if (upper) { *(unsigned char *)trans = 'C'; } else { *(unsigned char *)trans = 'N'; } ztrmm_("Left", uplo, trans, "Non-unit", n, &neig, &c_b1, &b[ b_offset], ldb, &a[a_offset], lda); } } work[1].r = (doublereal) lwkopt; work[1].i = 0.; // , expr subst return 0; /* End of ZHEGV */ }
/* Subroutine */ int zhegvd_(integer *itype, char *jobz, char *uplo, integer * n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; doublereal d__1, d__2; /* Local variables */ integer lopt; extern logical lsame_(char *, char *); integer lwmin; char trans[1]; integer liopt; logical upper; integer lropt; logical wantz; extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *), zheevd_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *, integer *); integer liwmin; extern /* Subroutine */ int zhegst_(integer *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer lrwmin; logical lquery; extern /* Subroutine */ int zpotrf_(char *, integer *, doublecomplex *, integer *, integer *); /* -- LAPACK driver routine (version 3.1) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZHEGVD computes all the eigenvalues, and optionally, the eigenvectors */ /* of a complex generalized Hermitian-definite eigenproblem, of the form */ /* A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and */ /* B are assumed to be Hermitian and B is also positive definite. */ /* If eigenvectors are desired, it uses a divide and conquer algorithm. 
*/ /* The divide and conquer algorithm makes very mild assumptions about */ /* floating point arithmetic. It will work on machines with a guard */ /* digit in add/subtract, or on those binary machines without guard */ /* digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or */ /* Cray-2. It could conceivably fail on hexadecimal or decimal machines */ /* without guard digits, but we know of none. */ /* Arguments */ /* ========= */ /* ITYPE (input) INTEGER */ /* Specifies the problem type to be solved: */ /* = 1: A*x = (lambda)*B*x */ /* = 2: A*B*x = (lambda)*x */ /* = 3: B*A*x = (lambda)*x */ /* JOBZ (input) CHARACTER*1 */ /* = 'N': Compute eigenvalues only; */ /* = 'V': Compute eigenvalues and eigenvectors. */ /* UPLO (input) CHARACTER*1 */ /* = 'U': Upper triangles of A and B are stored; */ /* = 'L': Lower triangles of A and B are stored. */ /* N (input) INTEGER */ /* The order of the matrices A and B. N >= 0. */ /* A (input/output) COMPLEX*16 array, dimension (LDA, N) */ /* On entry, the Hermitian matrix A. If UPLO = 'U', the */ /* leading N-by-N upper triangular part of A contains the */ /* upper triangular part of the matrix A. If UPLO = 'L', */ /* the leading N-by-N lower triangular part of A contains */ /* the lower triangular part of the matrix A. */ /* On exit, if JOBZ = 'V', then if INFO = 0, A contains the */ /* matrix Z of eigenvectors. The eigenvectors are normalized */ /* as follows: */ /* if ITYPE = 1 or 2, Z**H*B*Z = I; */ /* if ITYPE = 3, Z**H*inv(B)*Z = I. */ /* If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') */ /* or the lower triangle (if UPLO='L') of A, including the */ /* diagonal, is destroyed. */ /* LDA (input) INTEGER */ /* The leading dimension of the array A. LDA >= max(1,N). */ /* B (input/output) COMPLEX*16 array, dimension (LDB, N) */ /* On entry, the Hermitian matrix B. If UPLO = 'U', the */ /* leading N-by-N upper triangular part of B contains the */ /* upper triangular part of the matrix B. 
If UPLO = 'L', */ /* the leading N-by-N lower triangular part of B contains */ /* the lower triangular part of the matrix B. */ /* On exit, if INFO <= N, the part of B containing the matrix is */ /* overwritten by the triangular factor U or L from the Cholesky */ /* factorization B = U**H*U or B = L*L**H. */ /* LDB (input) INTEGER */ /* The leading dimension of the array B. LDB >= max(1,N). */ /* W (output) DOUBLE PRECISION array, dimension (N) */ /* If INFO = 0, the eigenvalues in ascending order. */ /* WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) */ /* On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ /* LWORK (input) INTEGER */ /* The length of the array WORK. */ /* If N <= 1, LWORK >= 1. */ /* If JOBZ = 'N' and N > 1, LWORK >= N + 1. */ /* If JOBZ = 'V' and N > 1, LWORK >= 2*N + N**2. */ /* If LWORK = -1, then a workspace query is assumed; the routine */ /* only calculates the optimal sizes of the WORK, RWORK and */ /* IWORK arrays, returns these values as the first entries of */ /* the WORK, RWORK and IWORK arrays, and no error message */ /* related to LWORK or LRWORK or LIWORK is issued by XERBLA. */ /* RWORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) */ /* On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. */ /* LRWORK (input) INTEGER */ /* The dimension of the array RWORK. */ /* If N <= 1, LRWORK >= 1. */ /* If JOBZ = 'N' and N > 1, LRWORK >= N. */ /* If JOBZ = 'V' and N > 1, LRWORK >= 1 + 5*N + 2*N**2. */ /* If LRWORK = -1, then a workspace query is assumed; the */ /* routine only calculates the optimal sizes of the WORK, RWORK */ /* and IWORK arrays, returns these values as the first entries */ /* of the WORK, RWORK and IWORK arrays, and no error message */ /* related to LWORK or LRWORK or LIWORK is issued by XERBLA. */ /* IWORK (workspace/output) INTEGER array, dimension (MAX(1,LIWORK)) */ /* On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. 
*/ /* LIWORK (input) INTEGER */ /* The dimension of the array IWORK. */ /* If N <= 1, LIWORK >= 1. */ /* If JOBZ = 'N' and N > 1, LIWORK >= 1. */ /* If JOBZ = 'V' and N > 1, LIWORK >= 3 + 5*N. */ /* If LIWORK = -1, then a workspace query is assumed; the */ /* routine only calculates the optimal sizes of the WORK, RWORK */ /* and IWORK arrays, returns these values as the first entries */ /* of the WORK, RWORK and IWORK arrays, and no error message */ /* related to LWORK or LRWORK or LIWORK is issued by XERBLA. */ /* INFO (output) INTEGER */ /* = 0: successful exit */ /* < 0: if INFO = -i, the i-th argument had an illegal value */ /* > 0: ZPOTRF or ZHEEVD returned an error code: */ /* <= N: if INFO = i and JOBZ = 'N', then the algorithm */ /* failed to converge; i off-diagonal elements of an */ /* intermediate tridiagonal form did not converge to */ /* zero; */ /* if INFO = i and JOBZ = 'V', then the algorithm */ /* failed to compute an eigenvalue while working on */ /* the submatrix lying in rows and columns INFO/(N+1) */ /* through mod(INFO,N+1); */ /* > N: if INFO = N + i, for 1 <= i <= N, then the leading */ /* minor of order i of B is not positive definite. */ /* The factorization of B could not be completed and */ /* no eigenvalues or eigenvectors were computed. */ /* Further Details */ /* =============== */ /* Based on contributions by */ /* Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* Modified so that no backsubstitution is performed if ZHEEVD fails to */ /* converge (NEIG in old code could be greater than N causing out of */ /* bounds reference to A - reported by Ralf Meyer). Also corrected the */ /* description of INFO and the test on ITYPE. Sven, 16 Feb 05. */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. 
Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; b_dim1 = *ldb; b_offset = 1 + b_dim1; b -= b_offset; --w; --work; --rwork; --iwork; /* Function Body */ wantz = lsame_(jobz, "V"); upper = lsame_(uplo, "U"); lquery = *lwork == -1 || *lrwork == -1 || *liwork == -1; *info = 0; if (*n <= 1) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if (wantz) { lwmin = (*n << 1) + *n * *n; lrwmin = *n * 5 + 1 + (*n << 1) * *n; liwmin = *n * 5 + 3; } else { lwmin = *n + 1; lrwmin = *n; liwmin = 1; } lopt = lwmin; lropt = lrwmin; liopt = liwmin; if (*itype < 1 || *itype > 3) { *info = -1; } else if (! (wantz || lsame_(jobz, "N"))) { *info = -2; } else if (! (upper || lsame_(uplo, "L"))) { *info = -3; } else if (*n < 0) { *info = -4; } else if (*lda < max(1,*n)) { *info = -6; } else if (*ldb < max(1,*n)) { *info = -8; } if (*info == 0) { work[1].r = (doublereal) lopt, work[1].i = 0.; rwork[1] = (doublereal) lropt; iwork[1] = liopt; if (*lwork < lwmin && ! lquery) { *info = -11; } else if (*lrwork < lrwmin && ! lquery) { *info = -13; } else if (*liwork < liwmin && ! lquery) { *info = -15; } } if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGVD", &i__1); return 0; } else if (lquery) { return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Form a Cholesky factorization of B. */ zpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; return 0; } /* Transform problem to standard eigenvalue problem and solve. 
*/ zhegst_(itype, uplo, n, &a[a_offset], lda, &b[b_offset], ldb, info); zheevd_(jobz, uplo, n, &a[a_offset], lda, &w[1], &work[1], lwork, &rwork[ 1], lrwork, &iwork[1], liwork, info); /* Computing MAX */ d__1 = (doublereal) lopt, d__2 = work[1].r; lopt = (integer) max(d__1,d__2); /* Computing MAX */ d__1 = (doublereal) lropt; lropt = (integer) max(d__1,rwork[1]); /* Computing MAX */ d__1 = (doublereal) liopt, d__2 = (doublereal) iwork[1]; liopt = (integer) max(d__1,d__2); if (wantz && *info == 0) { /* Backtransform eigenvectors to the original problem. */ if (*itype == 1 || *itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; */ /* backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (upper) { *(unsigned char *)trans = 'N'; } else { *(unsigned char *)trans = 'C'; } ztrsm_("Left", uplo, trans, "Non-unit", n, n, &c_b1, &b[b_offset], ldb, &a[a_offset], lda); } else if (*itype == 3) { /* For B*A*x=(lambda)*x; */ /* backtransform eigenvectors: x = L*y or U'*y */ if (upper) { *(unsigned char *)trans = 'C'; } else { *(unsigned char *)trans = 'N'; } ztrmm_("Left", uplo, trans, "Non-unit", n, n, &c_b1, &b[b_offset], ldb, &a[a_offset], lda); } } work[1].r = (doublereal) lopt, work[1].i = 0.; rwork[1] = (doublereal) lropt; iwork[1] = liopt; return 0; /* End of ZHEGVD */ } /* zhegvd_ */
/* Subroutine */ int zdrvrf3_(integer *nout, integer *nn, integer *nval, doublereal *thresh, doublecomplex *a, integer *lda, doublecomplex * arf, doublecomplex *b1, doublecomplex *b2, doublereal * d_work_zlange__, doublecomplex *z_work_zgeqrf__, doublecomplex *tau) { /* Initialized data */ static integer iseedy[4] = { 1988,1989,1990,1991 }; static char uplos[1*2] = "U" "L"; static char forms[1*2] = "N" "C"; static char sides[1*2] = "L" "R"; static char transs[1*2] = "N" "C"; static char diags[1*2] = "N" "U"; /* Format strings */ static char fmt_9999[] = "(1x,\002 *** Error(s) or Failure(s) while test" "ing ZTFSM ***\002)"; static char fmt_9997[] = "(1x,\002 Failure in \002,a5,\002, CFORM=" "'\002,a1,\002',\002,\002 SIDE='\002,a1,\002',\002,\002 UPLO='" "\002,a1,\002',\002,\002 TRANS='\002,a1,\002',\002,\002 DIAG='" "\002,a1,\002',\002,\002 M=\002,i3,\002, N =\002,i3,\002, test" "=\002,g12.5)"; static char fmt_9996[] = "(1x,\002All tests for \002,a5,\002 auxiliary r" "outine passed the \002,\002threshold (\002,i5,\002 tests run)" "\002)"; static char fmt_9995[] = "(1x,a6,\002 auxiliary routine:\002,i5,\002 out" " of \002,i5,\002 tests failed to pass the threshold\002)"; /* System generated locals */ integer a_dim1, a_offset, b1_dim1, b1_offset, b2_dim1, b2_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1, z__2; /* Local variables */ integer i__, j, m, n, na, iim, iin; doublereal eps; char diag[1], side[1]; integer info; char uplo[1]; integer nrun, idiag; doublecomplex alpha; integer nfail, iseed[4], iside; char cform[1]; integer iform; char trans[1]; integer iuplo; integer ialpha; integer itrans; doublereal result[1]; /* Fortran I/O blocks */ static cilist io___32 = { 0, 0, 0, 0, 0 }; static cilist io___33 = { 0, 0, 0, fmt_9999, 0 }; static cilist io___34 = { 0, 0, 0, fmt_9997, 0 }; static cilist io___35 = { 0, 0, 0, fmt_9996, 0 }; static cilist io___36 = { 0, 0, 0, fmt_9995, 0 }; /* -- LAPACK test routine (version 3.2.0) -- */ /* Univ. 
of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2008 */ /* .. Scalar Arguments .. */ /* .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZDRVRF3 tests the LAPACK RFP routines: */ /* ZTFSM */ /* Arguments */ /* ========= */ /* NOUT (input) INTEGER */ /* The unit number for output. */ /* NN (input) INTEGER */ /* The number of values of N contained in the vector NVAL. */ /* NVAL (input) INTEGER array, dimension (NN) */ /* The values of the matrix dimension N. */ /* THRESH (input) DOUBLE PRECISION */ /* The threshold value for the test ratios. A result is */ /* included in the output file if RESULT >= THRESH. To have */ /* every test ratio printed, use THRESH = 0. */ /* A (workspace) COMPLEX*16 array, dimension (LDA,NMAX) */ /* LDA (input) INTEGER */ /* The leading dimension of the array A. LDA >= max(1,NMAX). */ /* ARF (workspace) COMPLEX*16 array, dimension ((NMAX*(NMAX+1))/2). */ /* B1 (workspace) COMPLEX*16 array, dimension (LDA,NMAX) */ /* B2 (workspace) COMPLEX*16 array, dimension (LDA,NMAX) */ /* D_WORK_ZLANGE (workspace) DOUBLE PRECISION array, dimension (NMAX) */ /* Z_WORK_ZGEQRF (workspace) COMPLEX*16 array, dimension (NMAX) */ /* TAU (workspace) COMPLEX*16 array, dimension (NMAX) */ /* ===================================================================== */ /* .. */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. Local Arrays .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Scalars in Common .. */ /* .. */ /* .. Common blocks .. */ /* .. */ /* .. Data statements .. */ /* Parameter adjustments */ --nval; b2_dim1 = *lda; b2_offset = 1 + b2_dim1; b2 -= b2_offset; b1_dim1 = *lda; b1_offset = 1 + b1_dim1; b1 -= b1_offset; a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; --arf; --d_work_zlange__; --z_work_zgeqrf__; --tau; /* Function Body */ /* .. */ /* .. Executable Statements .. 
*/ /* Initialize constants and the random number seed. */ nrun = 0; nfail = 0; info = 0; for (i__ = 1; i__ <= 4; ++i__) { iseed[i__ - 1] = iseedy[i__ - 1]; /* L10: */ } eps = dlamch_("Precision"); i__1 = *nn; for (iim = 1; iim <= i__1; ++iim) { m = nval[iim]; i__2 = *nn; for (iin = 1; iin <= i__2; ++iin) { n = nval[iin]; for (iform = 1; iform <= 2; ++iform) { *(unsigned char *)cform = *(unsigned char *)&forms[iform - 1]; for (iuplo = 1; iuplo <= 2; ++iuplo) { *(unsigned char *)uplo = *(unsigned char *)&uplos[iuplo - 1]; for (iside = 1; iside <= 2; ++iside) { *(unsigned char *)side = *(unsigned char *)&sides[ iside - 1]; for (itrans = 1; itrans <= 2; ++itrans) { *(unsigned char *)trans = *(unsigned char *)& transs[itrans - 1]; for (idiag = 1; idiag <= 2; ++idiag) { *(unsigned char *)diag = *(unsigned char *)& diags[idiag - 1]; for (ialpha = 1; ialpha <= 3; ++ialpha) { if (ialpha == 1) { alpha.r = 0., alpha.i = 0.; } else if (ialpha == 1) { alpha.r = 1., alpha.i = 0.; } else { zlarnd_(&z__1, &c__4, iseed); alpha.r = z__1.r, alpha.i = z__1.i; } /* All the parameters are set: */ /* CFORM, SIDE, UPLO, TRANS, DIAG, M, N, */ /* and ALPHA */ /* READY TO TEST! */ ++nrun; if (iside == 1) { /* The case ISIDE.EQ.1 is when SIDE.EQ.'L' */ /* -> A is M-by-M ( B is M-by-N ) */ na = m; } else { /* The case ISIDE.EQ.2 is when SIDE.EQ.'R' */ /* -> A is N-by-N ( B is M-by-N ) */ na = n; } /* Generate A our NA--by--NA triangular */ /* matrix. */ /* Our test is based on forward error so we */ /* do want A to be well conditionned! To get */ /* a well-conditionned triangular matrix, we */ /* take the R factor of the QR/LQ factorization */ /* of a random matrix. */ i__3 = na; for (j = 1; j <= i__3; ++j) { i__4 = na; for (i__ = 1; i__ <= i__4; ++i__) { i__5 = i__ + j * a_dim1; zlarnd_(&z__1, &c__4, iseed); a[i__5].r = z__1.r, a[i__5].i = z__1.i; } } if (iuplo == 1) { /* The case IUPLO.EQ.1 is when SIDE.EQ.'U' */ /* -> QR factorization. 
*/ s_copy(srnamc_1.srnamt, "ZGEQRF", ( ftnlen)32, (ftnlen)6); zgeqrf_(&na, &na, &a[a_offset], lda, & tau[1], &z_work_zgeqrf__[1], lda, &info); } else { /* The case IUPLO.EQ.2 is when SIDE.EQ.'L' */ /* -> QL factorization. */ s_copy(srnamc_1.srnamt, "ZGELQF", ( ftnlen)32, (ftnlen)6); zgelqf_(&na, &na, &a[a_offset], lda, & tau[1], &z_work_zgeqrf__[1], lda, &info); } /* After the QR factorization, the diagonal */ /* of A is made of real numbers, we multiply */ /* by a random complex number of absolute */ /* value 1.0E+00. */ i__3 = na; for (j = 1; j <= i__3; ++j) { i__4 = j + j * a_dim1; i__5 = j + j * a_dim1; zlarnd_(&z__2, &c__5, iseed); z__1.r = a[i__5].r * z__2.r - a[i__5] .i * z__2.i, z__1.i = a[i__5] .r * z__2.i + a[i__5].i * z__2.r; a[i__4].r = z__1.r, a[i__4].i = z__1.i; } /* Store a copy of A in RFP format (in ARF). */ s_copy(srnamc_1.srnamt, "ZTRTTF", (ftnlen) 32, (ftnlen)6); ztrttf_(cform, uplo, &na, &a[a_offset], lda, &arf[1], &info); /* Generate B1 our M--by--N right-hand side */ /* and store a copy in B2. */ i__3 = n; for (j = 1; j <= i__3; ++j) { i__4 = m; for (i__ = 1; i__ <= i__4; ++i__) { i__5 = i__ + j * b1_dim1; zlarnd_(&z__1, &c__4, iseed); b1[i__5].r = z__1.r, b1[i__5].i = z__1.i; i__5 = i__ + j * b2_dim1; i__6 = i__ + j * b1_dim1; b2[i__5].r = b1[i__6].r, b2[i__5] .i = b1[i__6].i; } } /* Solve op( A ) X = B or X op( A ) = B */ /* with ZTRSM */ s_copy(srnamc_1.srnamt, "ZTRSM", (ftnlen) 32, (ftnlen)5); ztrsm_(side, uplo, trans, diag, &m, &n, & alpha, &a[a_offset], lda, &b1[ b1_offset], lda); /* Solve op( A ) X = B or X op( A ) = B */ /* with ZTFSM */ s_copy(srnamc_1.srnamt, "ZTFSM", (ftnlen) 32, (ftnlen)5); ztfsm_(cform, side, uplo, trans, diag, &m, &n, &alpha, &arf[1], &b2[ b2_offset], lda); /* Check that the result agrees. 
*/ i__3 = n; for (j = 1; j <= i__3; ++j) { i__4 = m; for (i__ = 1; i__ <= i__4; ++i__) { i__5 = i__ + j * b1_dim1; i__6 = i__ + j * b2_dim1; i__7 = i__ + j * b1_dim1; z__1.r = b2[i__6].r - b1[i__7].r, z__1.i = b2[i__6].i - b1[ i__7].i; b1[i__5].r = z__1.r, b1[i__5].i = z__1.i; } } result[0] = zlange_("I", &m, &n, &b1[ b1_offset], lda, &d_work_zlange__[ 1]); /* Computing MAX */ i__3 = max(m,n); result[0] = result[0] / sqrt(eps) / max( i__3,1); if (result[0] >= *thresh) { if (nfail == 0) { io___32.ciunit = *nout; s_wsle(&io___32); e_wsle(); io___33.ciunit = *nout; s_wsfe(&io___33); e_wsfe(); } io___34.ciunit = *nout; s_wsfe(&io___34); do_fio(&c__1, "ZTFSM", (ftnlen)5); do_fio(&c__1, cform, (ftnlen)1); do_fio(&c__1, side, (ftnlen)1); do_fio(&c__1, uplo, (ftnlen)1); do_fio(&c__1, trans, (ftnlen)1); do_fio(&c__1, diag, (ftnlen)1); do_fio(&c__1, (char *)&m, (ftnlen) sizeof(integer)); do_fio(&c__1, (char *)&n, (ftnlen) sizeof(integer)); do_fio(&c__1, (char *)&result[0], ( ftnlen)sizeof(doublereal)); e_wsfe(); ++nfail; } /* L100: */ } /* L110: */ } /* L120: */ } /* L130: */ } /* L140: */ } /* L150: */ } /* L160: */ } /* L170: */ } /* Print a summary of the results. */ if (nfail == 0) { io___35.ciunit = *nout; s_wsfe(&io___35); do_fio(&c__1, "ZTFSM", (ftnlen)5); do_fio(&c__1, (char *)&nrun, (ftnlen)sizeof(integer)); e_wsfe(); } else { io___36.ciunit = *nout; s_wsfe(&io___36); do_fio(&c__1, "ZTFSM", (ftnlen)5); do_fio(&c__1, (char *)&nfail, (ftnlen)sizeof(integer)); do_fio(&c__1, (char *)&nrun, (ftnlen)sizeof(integer)); e_wsfe(); } return 0; /* End of ZDRVRF3 */ } /* zdrvrf3_ */
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; MPI_Request *send_req, recv_req; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. */ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. 
*/ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. */ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. 
*/ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; #if 1 for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol != kcol && fmod[lk] ) mod_bit[lk] = 1; /* contribution from off-diagonal */ } } /*PrintInt10("mod_bit", nlb, mod_bit);*/ #if ( PROFlevel>=2 ) t_reduce_tmp = SuperLU_timer_(); #endif /* Every process receives the count, but it is only useful on the diagonal processes. 
*/ MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); #if ( PROFlevel>=2 ) t_reduce += SuperLU_timer_() - t_reduce_tmp; #endif for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); if ( mycol == kcol ) { /* diagonal process */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; } } } #else /* old */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } #endif } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #if 0 MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. 
*/ MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. 
*/ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #if 0 MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/ for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); Llu->SolveMsgSent = 0; MPI_Barrier( grid->comm ); /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. 
*/ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; #if 1 for (k = 0; k < nlb; ++k) mod_bit[k] = 0; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); /* root process in this row scope */ if ( mycol != kcol && bmod[lk] ) mod_bit[lk] = 1; /* Contribution from off-diagonal */ } } /* Every process receives the count, but it is only useful on the diagonal processes. */ MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* local block number */ kcol = PCOL( k, grid ); /* root process in this row scope. */ if ( mycol == kcol ) { /* diagonal process */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } #else /* old */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } #endif } /* Re-initialize lsum to zero. Each block header is already in place. 
*/ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
/*
 * ZPOTRF -- LAPACK routine (version 3.2), blocked (Level 3 BLAS) variant.
 *
 * Computes the Cholesky factorization of a complex Hermitian positive
 * definite matrix A:
 *
 *     A = U**H * U   if UPLO = 'U'   (U upper triangular)
 *     A = L * L**H   if UPLO = 'L'   (L lower triangular)
 *
 * Arguments
 *   uplo (in)      'U': the upper triangle of A is stored;
 *                  'L': the lower triangle of A is stored.
 *   n    (in)      Order of the matrix A, n >= 0.
 *   a    (in/out)  Column-major array of dimension (lda, n).  On entry the
 *                  triangle selected by uplo holds A; the opposite strict
 *                  triangle is not referenced.  On exit with info = 0 it
 *                  holds the factor U or L.
 *   lda  (in)      Leading dimension of a, lda >= max(1, n).
 *   info (out)     = 0: successful exit;
 *                  < 0: info = -i means the i-th argument was illegal;
 *                  > 0: info = i means the leading minor of order i is not
 *                       positive definite and the factorization stopped.
 *
 * The function itself always returns 0; status is reported through info.
 */
int zpotrf_(char *uplo, int *n, doublecomplex *a, int *lda, int *info)
{
    /* External BLAS / LAPACK kernels. */
    extern int lsame_(char *, char *);
    extern int zgemm_(char *, char *, int *, int *, int *, doublecomplex *,
                      doublecomplex *, int *, doublecomplex *, int *,
                      doublecomplex *, doublecomplex *, int *);
    extern int zherk_(char *, char *, int *, int *, double *,
                      doublecomplex *, int *, double *, doublecomplex *,
                      int *);
    extern int ztrsm_(char *, char *, char *, char *, int *, int *,
                      doublecomplex *, doublecomplex *, int *,
                      doublecomplex *, int *);
    extern int zpotf2_(char *, int *, doublecomplex *, int *, int *);
    extern int xerbla_(char *, int *);
    extern int ilaenv_(int *, char *, char *, int *, int *, int *, int *);

    const int ld = *lda;       /* column stride of a                        */
    int upper;                 /* nonzero when UPLO selects the upper half  */
    int bad_arg;               /* 1-based position of an illegal argument   */
    int nb;                    /* block size reported by ILAENV             */
    int order;                 /* loop bound, captured once before the loop */
    int first;                 /* zero-based start of the current block     */
    int width;                 /* columns in the current diagonal block     */
    int remaining;             /* columns from `first` to the end           */
    int done;                  /* columns already factorized (== first)     */
    int trailing;              /* columns/rows beyond the current block     */
    doublecomplex minus_one;   /* alpha = -1 for the ZGEMM update           */
    doublecomplex *diag;       /* A(first, first), the diagonal block       */
    doublecomplex *col_top;    /* A(0, first), top of the current columns   */
    doublecomplex *next_top;   /* A(0, first + width)                       */
    doublecomplex *row_left;   /* A(first, 0), left end of the current rows */

    /* Test the input parameters. */
    *info = 0;
    bad_arg = 0;
    upper = lsame_(uplo, "U");
    if (!upper && !lsame_(uplo, "L")) {
        bad_arg = 1;
    } else if (*n < 0) {
        bad_arg = 2;
    } else if (*lda < (*n > 1 ? *n : 1)) {
        bad_arg = 4;
    }
    if (bad_arg != 0) {
        *info = -bad_arg;
        xerbla_("ZPOTRF", &bad_arg);
        return 0;
    }

    /* Quick return if possible. */
    if (*n == 0) {
        return 0;
    }

    /* Determine the block size for this environment. */
    nb = ilaenv_(&c__1, "ZPOTRF", uplo, n, &c_n1, &c_n1, &c_n1);
    if (nb <= 1 || nb >= *n) {
        /* Blocking would not pay off: use the unblocked code. */
        zpotf2_(uplo, n, a, lda, info);
        return 0;
    }

    /* Blocked code.  nb > 1 here, so the block loop always steps forward. */
    order = *n;
    if (upper) {
        /* Compute the Cholesky factorization A = U'*U. */
        for (first = 0; first < order; first += nb) {
            remaining = *n - first;
            width = (nb <= remaining) ? nb : remaining;
            done = first;
            col_top = a + first * ld;
            diag = col_top + first;

            /* Update and factorize the current diagonal block and test
               for non-positive-definiteness. */
            zherk_("Upper", "Conjugate transpose", &width, &done, &c_b14,
                   col_top, lda, &c_b15, diag, lda);
            zpotf2_("Upper", &width, diag, lda, info);
            if (*info != 0) {
                /* Translate the block-local minor index to a global one. */
                *info = *info + first;
                return 0;
            }

            if (first + width < *n) {
                /* Compute the current block row. */
                trailing = *n - first - width;
                done = first;
                minus_one.r = -1., minus_one.i = -0.;
                next_top = a + (first + width) * ld;
                zgemm_("Conjugate transpose", "No transpose", &width,
                       &trailing, &done, &minus_one, col_top, lda,
                       next_top, lda, &c_b1, next_top + first, lda);
                ztrsm_("Left", "Upper", "Conjugate transpose", "Non-unit",
                       &width, &trailing, &c_b1, diag, lda,
                       next_top + first, lda);
            }
        }
    } else {
        /* Compute the Cholesky factorization A = L*L'. */
        for (first = 0; first < order; first += nb) {
            remaining = *n - first;
            width = (nb <= remaining) ? nb : remaining;
            done = first;
            row_left = a + first;
            diag = row_left + first * ld;

            /* Update and factorize the current diagonal block and test
               for non-positive-definiteness. */
            zherk_("Lower", "No transpose", &width, &done, &c_b14,
                   row_left, lda, &c_b15, diag, lda);
            zpotf2_("Lower", &width, diag, lda, info);
            if (*info != 0) {
                /* Translate the block-local minor index to a global one. */
                *info = *info + first;
                return 0;
            }

            if (first + width < *n) {
                /* Compute the current block column. */
                trailing = *n - first - width;
                done = first;
                minus_one.r = -1., minus_one.i = -0.;
                zgemm_("No transpose", "Conjugate transpose", &trailing,
                       &width, &done, &minus_one, row_left + width, lda,
                       row_left, lda, &c_b1, diag + width, lda);
                ztrsm_("Right", "Lower", "Conjugate transpose", "Non-unit",
                       &trailing, &width, &c_b1, diag, lda, diag + width,
                       lda);
            }
        }
    }
    return 0;

/*     End of ZPOTRF */

} /* zpotrf_ */
/* Address of band-storage element AB(row, col); both indices are 1-based
   and ld is the leading dimension of the band array. */
static doublecomplex *zpbtrf_ab_(doublecomplex *ab, integer ld, integer row,
                                 integer col)
{
    return ab + ((row - 1) + (col - 1) * ld);
}

/*
 * ZPBTRF -- LAPACK routine (version 3.2).
 *
 * Computes the Cholesky factorization of a complex Hermitian positive
 * definite band matrix A:
 *
 *     A = U**H * U   if UPLO = 'U'   (U upper triangular)
 *     A = L * L**H   if UPLO = 'L'   (L lower triangular)
 *
 * Arguments
 *   uplo (in)      'U': upper triangle of A is stored;
 *                  'L': lower triangle of A is stored.
 *   n    (in)      Order of the matrix A, n >= 0.
 *   kd   (in)      Number of superdiagonals (UPLO = 'U') or subdiagonals
 *                  (UPLO = 'L') of A, kd >= 0.
 *   ab   (in/out)  Array of dimension (ldab, n) holding the band in its
 *                  first kd+1 rows:
 *                    UPLO = 'U': AB(kd+1+i-j, j) = A(i,j), max(1,j-kd)<=i<=j
 *                    UPLO = 'L': AB(1+i-j, j)    = A(i,j), j<=i<=min(n,j+kd)
 *                  On exit with info = 0, the factor U or L in the same
 *                  storage format.
 *   ldab (in)      Leading dimension of ab, ldab >= kd+1.
 *   info (out)     = 0: successful exit;
 *                  < 0: info = -i means the i-th argument was illegal;
 *                  > 0: info = i means the leading minor of order i is not
 *                       positive definite and the factorization stopped.
 *
 * Band storage example for N = 6, KD = 2, UPLO = 'U':
 *
 *      *    *   a13  a24  a35  a46
 *      *   a12  a23  a34  a45  a56
 *     a11  a22  a33  a44  a55  a66
 *
 * and for UPLO = 'L':
 *
 *     a11  a22  a33  a44  a55  a66
 *     a21  a32  a43  a54  a65   *
 *     a31  a42  a53  a64   *    *
 *
 * Elements marked * are not used by the routine.
 *
 * The function itself always returns 0; status is reported through info.
 *
 * Contributed by Peter Mayes and Giuseppe Radicati, IBM ECSEC, Rome,
 * March 23, 1989.
 */
/* Subroutine */ int zpbtrf_(char *uplo, integer *n, integer *kd,
	doublecomplex *ab, integer *ldab, integer *info)
{
    /* External BLAS / LAPACK kernels. */
    extern logical lsame_(char *, char *);
    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *,
	    integer *, doublecomplex *, doublecomplex *, integer *,
	    doublecomplex *, integer *, doublecomplex *, doublecomplex *,
	    integer *);
    extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *,
	    doublereal *, doublecomplex *, integer *, doublereal *,
	    doublecomplex *, integer *);
    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *,
	    integer *, integer *, doublecomplex *, doublecomplex *,
	    integer *, doublecomplex *, integer *);
    extern /* Subroutine */ int zpbtf2_(char *, integer *, integer *,
	    doublecomplex *, integer *, integer *);
    extern /* Subroutine */ int zpotf2_(char *, integer *, doublecomplex *,
	    integer *, integer *);
    extern /* Subroutine */ int xerbla_(char *, integer *);
    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
	    integer *, integer *);

    const integer ld = *ldab;   /* column stride of ab                      */
    doublecomplex work[1056];   /* scratch block, laid out as WORK(33,32)   */
    doublecomplex minus_one;    /* alpha = -1 for the ZGEMM updates         */
    doublecomplex *diag;        /* diagonal block A11 inside ab             */
    doublecomplex *cell;        /* cursor into work while zeroing           */
    integer bad_arg;            /* 1-based position of an illegal argument  */
    integer nb;                 /* block size                               */
    integer order;              /* loop bound, captured once before a loop  */
    integer first;              /* 1-based first column of current block    */
    integer width;              /* IB: size of the current diagonal block   */
    integer n2, n3;             /* I2, I3: sizes of the trailing partitions */
    integer lhs, rhs;           /* operands of the "Computing MIN" steps    */
    integer band_ld;            /* ldab - 1: stride that walks a diagonal   */
    integer sub_info;           /* status returned by ZPOTF2                */
    integer r, c, r_end;        /* 1-based row/column cursors into work     */

    /* Test the input parameters. */
    *info = 0;
    bad_arg = 0;
    if (!lsame_(uplo, "U") && !lsame_(uplo, "L")) {
        bad_arg = 1;
    } else if (*n < 0) {
        bad_arg = 2;
    } else if (*kd < 0) {
        bad_arg = 3;
    } else if (*ldab < *kd + 1) {
        bad_arg = 5;
    }
    if (bad_arg != 0) {
        *info = -bad_arg;
        xerbla_("ZPBTRF", &bad_arg);
        return 0;
    }

    /* Quick return if possible. */
    if (*n == 0) {
        return 0;
    }

    /* Determine the block size for this environment. */
    nb = ilaenv_(&c__1, "ZPBTRF", uplo, n, kd, &c_n1, &c_n1);

    /* The block size must not exceed the semi-bandwidth KD, and must not
       exceed the limit set by the size of the local array WORK. */
    if (nb > 32) {
        nb = 32;
    }

    if (nb <= 1 || nb > *kd) {
        /* Use unblocked code. */
        zpbtf2_(uplo, n, kd, ab, ldab, info);
        return 0;
    }

    /* Use blocked code.  Every band sub-block is addressed with stride
       ldab - 1 so the BLAS kernels see it as an ordinary dense block. */
    band_ld = *ldab - 1;

    if (lsame_(uplo, "U")) {

        /* Compute the Cholesky factorization of a Hermitian band matrix,
           given the upper triangle of the matrix in band storage. */

        /* Zero the upper triangle of the work array. */
        for (c = 1; c <= nb; ++c) {
            for (r = 1; r < c; ++r) {
                cell = &work[(r - 1) + (c - 1) * 33];
                cell->r = 0., cell->i = 0.;
            }
        }

        /* Process the band matrix one diagonal block at a time. */
        order = *n;
        for (first = 1; first <= order; first += nb) {
            lhs = nb;
            rhs = *n - first + 1;
            width = (lhs <= rhs) ? lhs : rhs;

            /* Factorize the diagonal block. */
            diag = zpbtrf_ab_(ab, ld, *kd + 1, first);
            zpotf2_(uplo, &width, diag, &band_ld, &sub_info);
            if (sub_info != 0) {
                *info = first + sub_info - 1;
                return 0;
            }

            if (first + width > *n) {
                continue;   /* no trailing submatrix left to update */
            }

            /* Update the relevant part of the trailing submatrix.
               If A11 denotes the diagonal block which has just been
               factorized, then we need to update the remaining blocks
               in the diagram:

                  A11   A12   A13
                        A22   A23
                              A33

               The numbers of rows and columns in the partitioning are
               IB, I2, I3 respectively.  The blocks A12, A22 and A23 are
               empty if IB = KD.  The upper triangle of A13 lies outside
               the band. */
            lhs = *kd - width;
            rhs = *n - first - width + 1;
            n2 = (lhs <= rhs) ? lhs : rhs;

            lhs = width;
            rhs = *n - first - *kd + 1;
            n3 = (lhs <= rhs) ? lhs : rhs;

            if (n2 > 0) {
                /* Update A12 */
                ztrsm_("Left", "Upper", "Conjugate transpose", "Non-unit",
                       &width, &n2, &c_b1, diag, &band_ld,
                       zpbtrf_ab_(ab, ld, *kd + 1 - width, first + width),
                       &band_ld);

                /* Update A22 */
                zherk_("Upper", "Conjugate transpose", &n2, &width, &c_b21,
                       zpbtrf_ab_(ab, ld, *kd + 1 - width, first + width),
                       &band_ld, &c_b22,
                       zpbtrf_ab_(ab, ld, *kd + 1, first + width),
                       &band_ld);
            }

            if (n3 > 0) {
                /* Copy the lower triangle of A13 into the work array. */
                for (c = 1; c <= n3; ++c) {
                    for (r = c; r <= width; ++r) {
                        work[(r - 1) + (c - 1) * 33] =
                            *zpbtrf_ab_(ab, ld, r - c + 1,
                                        c + first + *kd - 1);
                    }
                }

                /* Update A13 (in the work array). */
                ztrsm_("Left", "Upper", "Conjugate transpose", "Non-unit",
                       &width, &n3, &c_b1, diag, &band_ld, work, &c__33);

                /* Update A23 */
                if (n2 > 0) {
                    minus_one.r = -1., minus_one.i = -0.;
                    zgemm_("Conjugate transpose", "No transpose", &n2, &n3,
                           &width, &minus_one,
                           zpbtrf_ab_(ab, ld, *kd + 1 - width,
                                      first + width),
                           &band_ld, work, &c__33, &c_b1,
                           zpbtrf_ab_(ab, ld, width + 1, first + *kd),
                           &band_ld);
                }

                /* Update A33 */
                zherk_("Upper", "Conjugate transpose", &n3, &width, &c_b21,
                       work, &c__33, &c_b22,
                       zpbtrf_ab_(ab, ld, *kd + 1, first + *kd), &band_ld);

                /* Copy the lower triangle of A13 back into place. */
                for (c = 1; c <= n3; ++c) {
                    for (r = c; r <= width; ++r) {
                        *zpbtrf_ab_(ab, ld, r - c + 1,
                                    c + first + *kd - 1) =
                            work[(r - 1) + (c - 1) * 33];
                    }
                }
            }
        }
    } else {

        /* Compute the Cholesky factorization of a Hermitian band matrix,
           given the lower triangle of the matrix in band storage. */

        /* Zero the lower triangle of the work array. */
        for (c = 1; c <= nb; ++c) {
            for (r = c + 1; r <= nb; ++r) {
                cell = &work[(r - 1) + (c - 1) * 33];
                cell->r = 0., cell->i = 0.;
            }
        }

        /* Process the band matrix one diagonal block at a time. */
        order = *n;
        for (first = 1; first <= order; first += nb) {
            lhs = nb;
            rhs = *n - first + 1;
            width = (lhs <= rhs) ? lhs : rhs;

            /* Factorize the diagonal block. */
            diag = zpbtrf_ab_(ab, ld, 1, first);
            zpotf2_(uplo, &width, diag, &band_ld, &sub_info);
            if (sub_info != 0) {
                *info = first + sub_info - 1;
                return 0;
            }

            if (first + width > *n) {
                continue;   /* no trailing submatrix left to update */
            }

            /* Update the relevant part of the trailing submatrix.
               If A11 denotes the diagonal block which has just been
               factorized, then we need to update the remaining blocks
               in the diagram:

                  A11
                  A21   A22
                  A31   A32   A33

               The numbers of rows and columns in the partitioning are
               IB, I2, I3 respectively.  The blocks A21, A22 and A32 are
               empty if IB = KD.  The lower triangle of A31 lies outside
               the band. */
            lhs = *kd - width;
            rhs = *n - first - width + 1;
            n2 = (lhs <= rhs) ? lhs : rhs;

            lhs = width;
            rhs = *n - first - *kd + 1;
            n3 = (lhs <= rhs) ? lhs : rhs;

            if (n2 > 0) {
                /* Update A21 */
                ztrsm_("Right", "Lower", "Conjugate transpose", "Non-unit",
                       &n2, &width, &c_b1, diag, &band_ld,
                       zpbtrf_ab_(ab, ld, width + 1, first), &band_ld);

                /* Update A22 */
                zherk_("Lower", "No transpose", &n2, &width, &c_b21,
                       zpbtrf_ab_(ab, ld, width + 1, first), &band_ld,
                       &c_b22, zpbtrf_ab_(ab, ld, 1, first + width),
                       &band_ld);
            }

            if (n3 > 0) {
                /* Copy the upper triangle of A31 into the work array. */
                for (c = 1; c <= width; ++c) {
                    r_end = (c <= n3) ? c : n3;
                    for (r = 1; r <= r_end; ++r) {
                        work[(r - 1) + (c - 1) * 33] =
                            *zpbtrf_ab_(ab, ld, *kd + 1 - c + r,
                                        c + first - 1);
                    }
                }

                /* Update A31 (in the work array). */
                ztrsm_("Right", "Lower", "Conjugate transpose", "Non-unit",
                       &n3, &width, &c_b1, diag, &band_ld, work, &c__33);

                /* Update A32 */
                if (n2 > 0) {
                    minus_one.r = -1., minus_one.i = -0.;
                    zgemm_("No transpose", "Conjugate transpose", &n3, &n2,
                           &width, &minus_one, work, &c__33,
                           zpbtrf_ab_(ab, ld, width + 1, first), &band_ld,
                           &c_b1,
                           zpbtrf_ab_(ab, ld, *kd + 1 - width,
                                      first + width),
                           &band_ld);
                }

                /* Update A33 */
                zherk_("Lower", "No transpose", &n3, &width, &c_b21, work,
                       &c__33, &c_b22,
                       zpbtrf_ab_(ab, ld, 1, first + *kd), &band_ld);

                /* Copy the upper triangle of A31 back into place. */
                for (c = 1; c <= width; ++c) {
                    r_end = (c <= n3) ? c : n3;
                    for (r = 1; r <= r_end; ++r) {
                        *zpbtrf_ab_(ab, ld, *kd + 1 - c + r,
                                    c + first - 1) =
                            work[(r - 1) + (c - 1) * 33];
                    }
                }
            }
        }
    }
    return 0;

/*     End of ZPBTRF */

} /* zpbtrf_ */