int main () { double a[N][N], b[N][N], c[N][N], c_ref[N][N]; init(a, b, N); work(a, b, c, N); work_ref(a, b, c_ref, N); check(c, c_ref); return 0; }
int main () { double a[N], a_ref[N], b[N], res, ref, diff; init(a, a_ref, b, N); res = work(a, b, N); ref = work_ref(a_ref, b, N); diff = res - ref; if (diff > EPS || -diff > EPS) abort (); return 0; }
extern "C" magma_int_t magma_zgeqrf2_2q_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaDoubleComplex *tau, magma_queue_t* queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaDoubleComplex_ptr dwork; magmaDoubleComplex *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return MAGMA_SUCCESS; nb = magma_get_zgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_zmalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, lwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } */ cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaDoubleComplex)*lwork, NULL, NULL); work = (magmaDoubleComplex*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(magmaDoubleComplex), 0, NULL, NULL, NULL); nbmin = 2; nx = 2*nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), ldwork, queues[0], NULL); clFlush(queues[0]); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queues[1]); magma_zsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, queues[1], NULL); clFlush(queues[1]); } magma_queue_sync(queues[0]); lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); zpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_zsetmatrix( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, queues[0]); zq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_zsetmatrix( ib, ib, hwork, ib, dwork, 0, lddwork, queues[1]); if (i+nb < k-nx){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]); magma_queue_sync(queues[1]); }else { magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]); magma_zsetmatrix( ib, ib, work_ref(i), ldwork, dA(i,i), ldda, queues[1]); clFlush(queues[1]); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows, queues[0]); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda, queues[0]); } clEnqueueUnmapMemObject(queues[0], buffer, work, 0, NULL, NULL); clReleaseMemObject(buffer); // magma_free_cpu(work); return *info; } /* magma_zgeqrf2_gpu */
extern "C" magma_int_t magma_cgeqrf_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *tau, magmaFloatComplex *dT, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them. The corresponding parts of the upper triangular R are inverted and stored separately in dT. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). dT (workspace/output) COMPLEX array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_cgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal inverses for the R matrix. The rest of the array is used as workspace. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define t_ref(a_1) (dT+(a_1)*nb) #define d_ref(a_1) (dT+(minmn+(a_1))*nb) #define dd_ref(a_1) (dT+(2*minmn+(a_1))*nb) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; magmaFloatComplex *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_cgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(magmaFloatComplex)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_cgetmatrix_async( rows, ib, a_ref(i,i), ldda, work_ref(i), ldwork, stream[1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, a_ref(old_i, old_i ), ldda, t_ref(old_i), nb, a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_csetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_cgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut and invert it. */ magma_queue_sync( stream[0] ); csplit_diag_block(ib, work_ref(i), ldwork, ut); magma_csetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_csetmatrix( ib, ib, hwork, ib, t_ref(i), nb ); if (i+nb < k-nb){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_csetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_cgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_cgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_csetmatrix( rows, ib, work, rows, a_ref(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; /* End of MAGMA_CGEQRF */ } /* magma_cgeqrf */
extern "C" magma_int_t magma_zgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version has LAPACK-complaint arguments. This version assumes the computation runs through the NULL stream and therefore is not overlapping some computation with communication. Other versions (magma_zgeqrf_gpu and magma_zgeqrf3_gpu) store the intermediate T matrices. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaDoubleComplex *dwork; magmaDoubleComplex *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_zgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if (MAGMA_SUCCESS != magma_zmalloc( &dwork, (n)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork )) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async( rows, ib, dA(i,i), ldda, work_ref(i), ldwork, stream[1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); magma_zsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); zpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_zsetmatrix( rows, ib, work_ref(i), ldwork, dA(i,i), ldda ); zq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_zsetmatrix( ib, ib, hwork, ib, dwork, lddwork ); if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); } else { magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); magma_zsetmatrix( ib, ib, work_ref(i), ldwork, dA(i,i), ldda ); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free( dwork ); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); return *info; } /* magma_zgeqrf2_gpu */
magma_err_t magma_sgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda, float *tau, magma_err_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver April 2012 Purpose ======= SGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -9, internal GPU memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaFloat_ptr dwork; float *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return MAGMA_SUCCESS; nb = magma_get_sgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_malloc( &dwork, n*nb*sizeof(float))) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ( MAGMA_SUCCESS != magma_malloc_host((void**)&work, lwork*sizeof(float)) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } magma_event_t event[2] = {NULL, NULL}; nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_queue_sync( queue ); magma_sgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queue, &event[0]); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queue); magma_ssetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork, dA(old_i, old_i), ldda, queue, &event[1]); } magma_event_sync(event[0]); lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); if (i > 0) { magma_event_sync(event[1]); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); spanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_ssetmatrix(rows, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queue); sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_ssetmatrix(ib, ib, hwork, 0, ib, dwork, 0, lddwork, queue); if (i+nb < k-nx) /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue); else { magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue); magma_ssetmatrix(ib, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queue); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_sgetmatrix(rows, ib, dA(i, i), ldda, work, 0, rows, queue); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_ssetmatrix(rows, ib, work, 0, rows, dA(i, i), ldda, queue); } magma_free_host(work); return *info; } /* magma_sgeqrf2_gpu */
/* Subroutine */ int sgtt01_(integer *n, real *dl, real *d__, real *du, real * dlf, real *df, real *duf, real *du2, integer *ipiv, real *work, integer *ldwork, real *rwork, real *resid) { /* System generated locals */ integer work_dim1, work_offset, i__1, i__2; /* Local variables */ static integer i__, j; static real anorm; static integer lastj; extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *); static real li; static integer ip; extern doublereal slamch_(char *), slangt_(char *, integer *, real *, real *, real *), slanhs_(char *, integer *, real * , integer *, real *); static real eps; #define work_ref(a_1,a_2) work[(a_2)*work_dim1 + a_1] /* -- LAPACK test routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University February 29, 1992 Purpose ======= SGTT01 reconstructs a tridiagonal matrix A from its LU factorization and computes the residual norm(L*U - A) / ( norm(A) * EPS ), where EPS is the machine epsilon. Arguments ========= N (input) INTEGTER The order of the matrix A. N >= 0. DL (input) REAL array, dimension (N-1) The (n-1) sub-diagonal elements of A. D (input) REAL array, dimension (N) The diagonal elements of A. DU (input) REAL array, dimension (N-1) The (n-1) super-diagonal elements of A. DLF (input) REAL array, dimension (N-1) The (n-1) multipliers that define the matrix L from the LU factorization of A. DF (input) REAL array, dimension (N) The n diagonal elements of the upper triangular matrix U from the LU factorization of A. DUF (input) REAL array, dimension (N-1) The (n-1) elements of the first super-diagonal of U. DU2F (input) REAL array, dimension (N-2) The (n-2) elements of the second super-diagonal of U. IPIV (input) INTEGER array, dimension (N) The pivot indices; for 1 <= i <= n, row i of the matrix was interchanged with row IPIV(i). IPIV(i) will always be either i or i+1; IPIV(i) = i indicates a row interchange was not required. WORK (workspace) REAL array, dimension (LDWORK,N) LDWORK (input) INTEGER The leading dimension of the array WORK. LDWORK >= max(1,N). RWORK (workspace) REAL array, dimension (N) RESID (output) REAL The scaled residual: norm(L*U - A) / (norm(A) * EPS) ===================================================================== Quick return if possible Parameter adjustments */ --dl; --d__; --du; --dlf; --df; --duf; --du2; --ipiv; work_dim1 = *ldwork; work_offset = 1 + work_dim1 * 1; work -= work_offset; --rwork; /* Function Body */ if (*n <= 0) { *resid = 0.f; return 0; } eps = slamch_("Epsilon"); /* Copy the matrix U to WORK. */ i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { work_ref(i__, j) = 0.f; /* L10: */ } /* L20: */ } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (i__ == 1) { work_ref(i__, i__) = df[i__]; if (*n >= 2) { work_ref(i__, i__ + 1) = duf[i__]; } if (*n >= 3) { work_ref(i__, i__ + 2) = du2[i__]; } } else if (i__ == *n) { work_ref(i__, i__) = df[i__]; } else { work_ref(i__, i__) = df[i__]; work_ref(i__, i__ + 1) = duf[i__]; if (i__ < *n - 1) { work_ref(i__, i__ + 2) = du2[i__]; } } /* L30: */ } /* Multiply on the left by L. */ lastj = *n; for (i__ = *n - 1; i__ >= 1; --i__) { li = dlf[i__]; i__1 = lastj - i__ + 1; saxpy_(&i__1, &li, &work_ref(i__, i__), ldwork, &work_ref(i__ + 1, i__), ldwork); ip = ipiv[i__]; if (ip == i__) { /* Computing MIN */ i__1 = i__ + 2; lastj = min(i__1,*n); } else { i__1 = lastj - i__ + 1; sswap_(&i__1, &work_ref(i__, i__), ldwork, &work_ref(i__ + 1, i__) , ldwork); } /* L40: */ } /* Subtract the matrix A. */ work_ref(1, 1) = work_ref(1, 1) - d__[1]; if (*n > 1) { work_ref(1, 2) = work_ref(1, 2) - du[1]; work_ref(*n, *n - 1) = work_ref(*n, *n - 1) - dl[*n - 1]; work_ref(*n, *n) = work_ref(*n, *n) - d__[*n]; i__1 = *n - 1; for (i__ = 2; i__ <= i__1; ++i__) { work_ref(i__, i__ - 1) = work_ref(i__, i__ - 1) - dl[i__ - 1]; work_ref(i__, i__) = work_ref(i__, i__) - d__[i__]; work_ref(i__, i__ + 1) = work_ref(i__, i__ + 1) - du[i__]; /* L50: */ } } /* Compute the 1-norm of the tridiagonal matrix A. */ anorm = slangt_("1", n, &dl[1], &d__[1], &du[1]); /* Compute the 1-norm of WORK, which is only guaranteed to be upper Hessenberg. */ *resid = slanhs_("1", n, &work[work_offset], ldwork, &rwork[1]) ; /* Compute norm(L*U - A) / (norm(A) * EPS) */ if (anorm <= 0.f) { if (*resid != 0.f) { *resid = 1.f / eps; } } else { *resid = *resid / anorm / eps; } return 0; /* End of SGTT01 */ } /* sgtt01_ */
/** Purpose ------- DGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version has LAPACK-complaint arguments. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Other versions (magma_dgeqrf_gpu and magma_dgeqrf3_gpu) store the intermediate T matrices. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf2_gpu( magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, double *tau, magma_int_t *info ) { #define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) double *dwork; double *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_dgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork )) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; /* download i-th panel */ magma_queue_sync( stream[1] ); magma_dgetmatrix_async( rows, ib, dA(i,i), ldda, work_ref(i), ldwork, stream[0] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); magma_dsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, stream[1] ); } magma_queue_sync( stream[0] ); lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); dpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); /* download the i-th V matrix */ magma_dsetmatrix_async( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_queue_sync( stream[1] ); magma_dsetmatrix_async( ib, ib, hwork, ib, dwork, lddwork, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); } else { magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_dsetmatrix_async( ib, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[1] ); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free( dwork ); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix_async( rows, ib, dA(i, i), ldda, work, rows, stream[1] ); magma_queue_sync( stream[1] ); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix_async( rows, ib, work, rows, dA(i, i), ldda, stream[1] ); } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dgeqrf2_gpu */
/* Subroutine */ int sget03_(integer *n, real *a, integer *lda, real *ainv, integer *ldainv, real *work, integer *ldwork, real *rwork, real * rcond, real *resid) { /* System generated locals */ integer a_dim1, a_offset, ainv_dim1, ainv_offset, work_dim1, work_offset, i__1; /* Local variables */ static integer i__; extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); static real anorm; extern doublereal slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); static real ainvnm, eps; #define work_ref(a_1,a_2) work[(a_2)*work_dim1 + a_1] /* -- LAPACK test routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University June 30, 1999 Purpose ======= SGET03 computes the residual for a general matrix times its inverse: norm( I - AINV*A ) / ( N * norm(A) * norm(AINV) * EPS ), where EPS is the machine epsilon. Arguments ========== N (input) INTEGER The number of rows and columns of the matrix A. N >= 0. A (input) REAL array, dimension (LDA,N) The original N x N matrix A. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). AINV (input) REAL array, dimension (LDAINV,N) The inverse of the matrix A. LDAINV (input) INTEGER The leading dimension of the array AINV. LDAINV >= max(1,N). WORK (workspace) REAL array, dimension (LDWORK,N) LDWORK (input) INTEGER The leading dimension of the array WORK. LDWORK >= max(1,N). RWORK (workspace) REAL array, dimension (N) RCOND (output) REAL The reciprocal of the condition number of A, computed as ( 1/norm(A) ) / norm(AINV). RESID (output) REAL norm(I - AINV*A) / ( N * norm(A) * norm(AINV) * EPS ) ===================================================================== Quick exit if N = 0. Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; ainv_dim1 = *ldainv; ainv_offset = 1 + ainv_dim1 * 1; ainv -= ainv_offset; work_dim1 = *ldwork; work_offset = 1 + work_dim1 * 1; work -= work_offset; --rwork; /* Function Body */ if (*n <= 0) { *rcond = 1.f; *resid = 0.f; return 0; } /* Exit with RESID = 1/EPS if ANORM = 0 or AINVNM = 0. */ eps = slamch_("Epsilon"); anorm = slange_("1", n, n, &a[a_offset], lda, &rwork[1]); ainvnm = slange_("1", n, n, &ainv[ainv_offset], ldainv, &rwork[1]); if (anorm <= 0.f || ainvnm <= 0.f) { *rcond = 0.f; *resid = 1.f / eps; return 0; } *rcond = 1.f / anorm / ainvnm; /* Compute I - A * AINV */ sgemm_("No transpose", "No transpose", n, n, n, &c_b7, &ainv[ainv_offset], ldainv, &a[a_offset], lda, &c_b8, &work[work_offset], ldwork); i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { work_ref(i__, i__) = work_ref(i__, i__) + 1.f; /* L10: */ } /* Compute norm(I - AINV*A) / (N * norm(A) * norm(AINV) * EPS) */ *resid = slange_("1", n, n, &work[work_offset], ldwork, &rwork[1]); *resid = *resid * *rcond / eps / (real) (*n); return 0; /* End of SGET03 */ } /* sget03_ */
/* Subroutine */ int ztrsna_(char *job, char *howmny, logical *select, integer *n, doublecomplex *t, integer *ldt, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, doublereal *s, doublereal *sep, integer *mm, integer *m, doublecomplex *work, integer *ldwork, doublereal *rwork, integer *info) { /* -- LAPACK routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University September 30, 1994 Purpose ======= ZTRSNA estimates reciprocal condition numbers for specified eigenvalues and/or right eigenvectors of a complex upper triangular matrix T (or of any matrix Q*T*Q**H with Q unitary). Arguments ========= JOB (input) CHARACTER*1 Specifies whether condition numbers are required for eigenvalues (S) or eigenvectors (SEP): = 'E': for eigenvalues only (S); = 'V': for eigenvectors only (SEP); = 'B': for both eigenvalues and eigenvectors (S and SEP). HOWMNY (input) CHARACTER*1 = 'A': compute condition numbers for all eigenpairs; = 'S': compute condition numbers for selected eigenpairs specified by the array SELECT. SELECT (input) LOGICAL array, dimension (N) If HOWMNY = 'S', SELECT specifies the eigenpairs for which condition numbers are required. To select condition numbers for the j-th eigenpair, SELECT(j) must be set to .TRUE.. If HOWMNY = 'A', SELECT is not referenced. N (input) INTEGER The order of the matrix T. N >= 0. T (input) COMPLEX*16 array, dimension (LDT,N) The upper triangular matrix T. LDT (input) INTEGER The leading dimension of the array T. LDT >= max(1,N). VL (input) COMPLEX*16 array, dimension (LDVL,M) If JOB = 'E' or 'B', VL must contain left eigenvectors of T (or of any Q*T*Q**H with Q unitary), corresponding to the eigenpairs specified by HOWMNY and SELECT. The eigenvectors must be stored in consecutive columns of VL, as returned by ZHSEIN or ZTREVC. If JOB = 'V', VL is not referenced. LDVL (input) INTEGER The leading dimension of the array VL. LDVL >= 1; and if JOB = 'E' or 'B', LDVL >= N. VR (input) COMPLEX*16 array, dimension (LDVR,M) If JOB = 'E' or 'B', VR must contain right eigenvectors of T (or of any Q*T*Q**H with Q unitary), corresponding to the eigenpairs specified by HOWMNY and SELECT. The eigenvectors must be stored in consecutive columns of VR, as returned by ZHSEIN or ZTREVC. If JOB = 'V', VR is not referenced. LDVR (input) INTEGER The leading dimension of the array VR. LDVR >= 1; and if JOB = 'E' or 'B', LDVR >= N. S (output) DOUBLE PRECISION array, dimension (MM) If JOB = 'E' or 'B', the reciprocal condition numbers of the selected eigenvalues, stored in consecutive elements of the array. Thus S(j), SEP(j), and the j-th columns of VL and VR all correspond to the same eigenpair (but not in general the j-th eigenpair, unless all eigenpairs are selected). If JOB = 'V', S is not referenced. SEP (output) DOUBLE PRECISION array, dimension (MM) If JOB = 'V' or 'B', the estimated reciprocal condition numbers of the selected eigenvectors, stored in consecutive elements of the array. If JOB = 'E', SEP is not referenced. MM (input) INTEGER The number of elements in the arrays S (if JOB = 'E' or 'B') and/or SEP (if JOB = 'V' or 'B'). MM >= M. M (output) INTEGER The number of elements of the arrays S and/or SEP actually used to store the estimated condition numbers. If HOWMNY = 'A', M is set to N. WORK (workspace) COMPLEX*16 array, dimension (LDWORK,N+1) If JOB = 'E', WORK is not referenced. LDWORK (input) INTEGER The leading dimension of the array WORK. LDWORK >= 1; and if JOB = 'V' or 'B', LDWORK >= N. RWORK (workspace) DOUBLE PRECISION array, dimension (N) If JOB = 'E', RWORK is not referenced. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== The reciprocal of the condition number of an eigenvalue lambda is defined as S(lambda) = |v'*u| / (norm(u)*norm(v)) where u and v are the right and left eigenvectors of T corresponding to lambda; v' denotes the conjugate transpose of v, and norm(u) denotes the Euclidean norm. These reciprocal condition numbers always lie between zero (very badly conditioned) and one (very well conditioned). If n = 1, S(lambda) is defined to be 1. An approximate error bound for a computed eigenvalue W(i) is given by EPS * norm(T) / S(i) where EPS is the machine precision. The reciprocal of the condition number of the right eigenvector u corresponding to lambda is defined as follows. Suppose T = ( lambda c ) ( 0 T22 ) Then the reciprocal condition number is SEP( lambda, T22 ) = sigma-min( T22 - lambda*I ) where sigma-min denotes the smallest singular value. We approximate the smallest singular value by the reciprocal of an estimate of the one-norm of the inverse of T22 - lambda*I. If n = 1, SEP(1) is defined to be abs(T(1,1)). An approximate error bound for a computed right eigenvector VR(i) is given by EPS * norm(T) / SEP(i) ===================================================================== Decode and test the input parameters Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; /* System generated locals */ integer t_dim1, t_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, work_dim1, work_offset, i__1, i__2, i__3, i__4, i__5; doublereal d__1, d__2; doublecomplex z__1; /* Builtin functions */ double z_abs(doublecomplex *), d_imag(doublecomplex *); /* Local variables */ static integer kase, ierr; static doublecomplex prod; static doublereal lnrm, rnrm; static integer i__, j, k; static doublereal scale; extern logical lsame_(char *, char *); extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); static doublecomplex dummy[1]; static logical wants; static doublereal xnorm; extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *), dlamch_( char *); static integer ks, ix; extern /* Subroutine */ int xerbla_(char *, integer *); static doublereal bignum; static logical wantbh; extern /* Subroutine */ int zlacon_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *); extern integer izamax_(integer *, doublecomplex *, integer *); static logical somcon; extern /* Subroutine */ int zdrscl_(integer *, doublereal *, doublecomplex *, integer *); static char normin[1]; extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); static doublereal smlnum; static logical wantsp; extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *), ztrexc_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, integer *); static doublereal eps, est; #define work_subscr(a_1,a_2) (a_2)*work_dim1 + a_1 #define work_ref(a_1,a_2) work[work_subscr(a_1,a_2)] #define t_subscr(a_1,a_2) (a_2)*t_dim1 + a_1 #define t_ref(a_1,a_2) t[t_subscr(a_1,a_2)] #define vl_subscr(a_1,a_2) (a_2)*vl_dim1 + a_1 #define vl_ref(a_1,a_2) vl[vl_subscr(a_1,a_2)] #define vr_subscr(a_1,a_2) (a_2)*vr_dim1 + a_1 #define vr_ref(a_1,a_2) vr[vr_subscr(a_1,a_2)] --select; t_dim1 = *ldt; t_offset = 1 + t_dim1 * 1; t -= t_offset; vl_dim1 = *ldvl; vl_offset = 1 + vl_dim1 * 1; vl -= vl_offset; vr_dim1 = *ldvr; vr_offset = 1 + vr_dim1 * 1; vr -= vr_offset; --s; --sep; work_dim1 = *ldwork; work_offset = 1 + work_dim1 * 1; work -= work_offset; --rwork; /* Function Body */ wantbh = lsame_(job, "B"); wants = lsame_(job, "E") || wantbh; wantsp = lsame_(job, "V") || wantbh; somcon = lsame_(howmny, "S"); /* Set M to the number of eigenpairs for which condition numbers are to be computed. */ if (somcon) { *m = 0; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (select[j]) { ++(*m); } /* L10: */ } } else { *m = *n; } *info = 0; if (! wants && ! wantsp) { *info = -1; } else if (! lsame_(howmny, "A") && ! somcon) { *info = -2; } else if (*n < 0) { *info = -4; } else if (*ldt < max(1,*n)) { *info = -6; } else if (*ldvl < 1 || wants && *ldvl < *n) { *info = -8; } else if (*ldvr < 1 || wants && *ldvr < *n) { *info = -10; } else if (*mm < *m) { *info = -13; } else if (*ldwork < 1 || wantsp && *ldwork < *n) { *info = -16; } if (*info != 0) { i__1 = -(*info); xerbla_("ZTRSNA", &i__1); return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } if (*n == 1) { if (somcon) { if (! select[1]) { return 0; } } if (wants) { s[1] = 1.; } if (wantsp) { sep[1] = z_abs(&t_ref(1, 1)); } return 0; } /* Get machine constants */ eps = dlamch_("P"); smlnum = dlamch_("S") / eps; bignum = 1. / smlnum; dlabad_(&smlnum, &bignum); ks = 1; i__1 = *n; for (k = 1; k <= i__1; ++k) { if (somcon) { if (! select[k]) { goto L50; } } if (wants) { /* Compute the reciprocal condition number of the k-th eigenvalue. */ zdotc_(&z__1, n, &vr_ref(1, ks), &c__1, &vl_ref(1, ks), &c__1); prod.r = z__1.r, prod.i = z__1.i; rnrm = dznrm2_(n, &vr_ref(1, ks), &c__1); lnrm = dznrm2_(n, &vl_ref(1, ks), &c__1); s[ks] = z_abs(&prod) / (rnrm * lnrm); } if (wantsp) { /* Estimate the reciprocal condition number of the k-th eigenvector. Copy the matrix T to the array WORK and swap the k-th diagonal element to the (1,1) position. */ zlacpy_("Full", n, n, &t[t_offset], ldt, &work[work_offset], ldwork); ztrexc_("No Q", n, &work[work_offset], ldwork, dummy, &c__1, &k, & c__1, &ierr); /* Form C = T22 - lambda*I in WORK(2:N,2:N). */ i__2 = *n; for (i__ = 2; i__ <= i__2; ++i__) { i__3 = work_subscr(i__, i__); i__4 = work_subscr(i__, i__); i__5 = work_subscr(1, 1); z__1.r = work[i__4].r - work[i__5].r, z__1.i = work[i__4].i - work[i__5].i; work[i__3].r = z__1.r, work[i__3].i = z__1.i; /* L20: */ } /* Estimate a lower bound for the 1-norm of inv(C'). The 1st and (N+1)th columns of WORK are used to store work vectors. */ sep[ks] = 0.; est = 0.; kase = 0; *(unsigned char *)normin = 'N'; L30: i__2 = *n - 1; zlacon_(&i__2, &work_ref(1, *n + 1), &work[work_offset], &est, & kase); if (kase != 0) { if (kase == 1) { /* Solve C'*x = scale*b */ i__2 = *n - 1; zlatrs_("Upper", "Conjugate transpose", "Nonunit", normin, &i__2, &work_ref(2, 2), ldwork, &work[ work_offset], &scale, &rwork[1], &ierr); } else { /* Solve C*x = scale*b */ i__2 = *n - 1; zlatrs_("Upper", "No transpose", "Nonunit", normin, &i__2, &work_ref(2, 2), ldwork, &work[work_offset], & scale, &rwork[1], &ierr); } *(unsigned char *)normin = 'Y'; if (scale != 1.) { /* Multiply by 1/SCALE if doing so will not cause overflow. */ i__2 = *n - 1; ix = izamax_(&i__2, &work[work_offset], &c__1); i__2 = work_subscr(ix, 1); xnorm = (d__1 = work[i__2].r, abs(d__1)) + (d__2 = d_imag( &work_ref(ix, 1)), abs(d__2)); if (scale < xnorm * smlnum || scale == 0.) { goto L40; } zdrscl_(n, &scale, &work[work_offset], &c__1); } goto L30; } sep[ks] = 1. / max(est,smlnum); } L40: ++ks; L50: ; } return 0; /* End of ZTRSNA */ } /* ztrsna_ */
/* Subroutine */ int dpbtrf_(char *uplo, integer *n, integer *kd, doublereal * ab, integer *ldab, integer *info) { /* -- LAPACK routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University March 31, 1993 Purpose ======= DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. The factorization has the form A = U**T * U, if UPLO = 'U', or A = L * L**T, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. KD (input) INTEGER The number of superdiagonals of the matrix A if UPLO = 'U', or the number of subdiagonals if UPLO = 'L'. KD >= 0. AB (input/output) DOUBLE PRECISION array, dimension (LDAB,N) On entry, the upper or lower triangle of the symmetric band matrix A, stored in the first KD+1 rows of the array. The j-th column of A is stored in the j-th column of the array AB as follows: if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). On exit, if INFO = 0, the triangular factor U or L from the Cholesky factorization A = U**T*U or A = L*L**T of the band matrix A, in the same storage format as A. LDAB (input) INTEGER The leading dimension of the array AB. LDAB >= KD+1. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. Further Details =============== The band storage scheme is illustrated by the following example, when N = 6, KD = 2, and UPLO = 'U': On entry: On exit: * * a13 a24 a35 a46 * * u13 u24 u35 u46 * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 Similarly, if UPLO = 'L' the format of A is as follows: On entry: On exit: a11 a22 a33 a44 a55 a66 l11 l22 l33 l44 l55 l66 a21 a32 a43 a54 a65 * l21 l32 l43 l54 l65 * a31 a42 a53 a64 * * l31 l42 l53 l64 * * Array elements marked * are not used by the routine. Contributed by Peter Mayes and Giuseppe Radicati, IBM ECSEC, Rome, March 23, 1989 ===================================================================== Test the input parameters. Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static integer c_n1 = -1; static doublereal c_b18 = 1.; static doublereal c_b21 = -1.; static integer c__33 = 33; /* System generated locals */ integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4; /* Local variables */ static doublereal work[1056] /* was [33][32] */; static integer i__, j; extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static integer i2, i3; extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *), dpbtf2_(char *, integer *, integer *, doublereal *, integer *, integer *), dpotf2_(char *, integer *, doublereal *, integer *, integer *); static integer ib, nb, ii, jj; extern /* Subroutine */ int xerbla_(char *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); #define work_ref(a_1,a_2) work[(a_2)*33 + a_1 - 34] #define ab_ref(a_1,a_2) ab[(a_2)*ab_dim1 + a_1] ab_dim1 = *ldab; ab_offset = 1 + ab_dim1 * 1; ab -= ab_offset; /* Function Body */ *info = 0; if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*kd < 0) { *info = -3; } else if (*ldab < *kd + 1) { *info = -5; } if (*info != 0) { i__1 = -(*info); xerbla_("DPBTRF", &i__1); return 0; } /* Quick return if possible */ if (*n == 0) { return 0; } /* Determine the block size for this environment */ nb = ilaenv_(&c__1, "DPBTRF", uplo, n, kd, &c_n1, &c_n1, (ftnlen)6, ( ftnlen)1); /* The block size must not exceed the semi-bandwidth KD, and must not exceed the limit set by the size of the local array WORK. */ nb = min(nb,32); if (nb <= 1 || nb > *kd) { /* Use unblocked code */ dpbtf2_(uplo, n, kd, &ab[ab_offset], ldab, info); } else { /* Use blocked code */ if (lsame_(uplo, "U")) { /* Compute the Cholesky factorization of a symmetric band matrix, given the upper triangle of the matrix in band storage. Zero the upper triangle of the work array. */ i__1 = nb; for (j = 1; j <= i__1; ++j) { i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { work_ref(i__, j) = 0.; /* L10: */ } /* L20: */ } /* Process the band matrix one diagonal block at a time. */ i__1 = *n; i__2 = nb; for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { /* Computing MIN */ i__3 = nb, i__4 = *n - i__ + 1; ib = min(i__3,i__4); /* Factorize the diagonal block */ i__3 = *ldab - 1; dpotf2_(uplo, &ib, &ab_ref(*kd + 1, i__), &i__3, &ii); if (ii != 0) { *info = i__ + ii - 1; goto L150; } if (i__ + ib <= *n) { /* Update the relevant part of the trailing submatrix. If A11 denotes the diagonal block which has just been factorized, then we need to update the remaining blocks in the diagram: A11 A12 A13 A22 A23 A33 The numbers of rows and columns in the partitioning are IB, I2, I3 respectively. The blocks A12, A22 and A23 are empty if IB = KD. The upper triangle of A13 lies outside the band. Computing MIN */ i__3 = *kd - ib, i__4 = *n - i__ - ib + 1; i2 = min(i__3,i__4); /* Computing MIN */ i__3 = ib, i__4 = *n - i__ - *kd + 1; i3 = min(i__3,i__4); if (i2 > 0) { /* Update A12 */ i__3 = *ldab - 1; i__4 = *ldab - 1; dtrsm_("Left", "Upper", "Transpose", "Non-unit", &ib, &i2, &c_b18, &ab_ref(*kd + 1, i__), &i__3, & ab_ref(*kd + 1 - ib, i__ + ib), &i__4); /* Update A22 */ i__3 = *ldab - 1; i__4 = *ldab - 1; dsyrk_("Upper", "Transpose", &i2, &ib, &c_b21, & ab_ref(*kd + 1 - ib, i__ + ib), &i__3, &c_b18, &ab_ref(*kd + 1, i__ + ib), &i__4); } if (i3 > 0) { /* Copy the lower triangle of A13 into the work array. */ i__3 = i3; for (jj = 1; jj <= i__3; ++jj) { i__4 = ib; for (ii = jj; ii <= i__4; ++ii) { work_ref(ii, jj) = ab_ref(ii - jj + 1, jj + i__ + *kd - 1); /* L30: */ } /* L40: */ } /* Update A13 (in the work array). */ i__3 = *ldab - 1; dtrsm_("Left", "Upper", "Transpose", "Non-unit", &ib, &i3, &c_b18, &ab_ref(*kd + 1, i__), &i__3, work, &c__33); /* Update A23 */ if (i2 > 0) { i__3 = *ldab - 1; i__4 = *ldab - 1; dgemm_("Transpose", "No Transpose", &i2, &i3, &ib, &c_b21, &ab_ref(*kd + 1 - ib, i__ + ib), &i__3, work, &c__33, &c_b18, &ab_ref(ib + 1, i__ + *kd), &i__4); } /* Update A33 */ i__3 = *ldab - 1; dsyrk_("Upper", "Transpose", &i3, &ib, &c_b21, work, & c__33, &c_b18, &ab_ref(*kd + 1, i__ + *kd), & i__3); /* Copy the lower triangle of A13 back into place. */ i__3 = i3; for (jj = 1; jj <= i__3; ++jj) { i__4 = ib; for (ii = jj; ii <= i__4; ++ii) { ab_ref(ii - jj + 1, jj + i__ + *kd - 1) = work_ref(ii, jj); /* L50: */ } /* L60: */ } } } /* L70: */ } } else { /* Compute the Cholesky factorization of a symmetric band matrix, given the lower triangle of the matrix in band storage. Zero the lower triangle of the work array. */ i__2 = nb; for (j = 1; j <= i__2; ++j) { i__1 = nb; for (i__ = j + 1; i__ <= i__1; ++i__) { work_ref(i__, j) = 0.; /* L80: */ } /* L90: */ } /* Process the band matrix one diagonal block at a time. */ i__2 = *n; i__1 = nb; for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { /* Computing MIN */ i__3 = nb, i__4 = *n - i__ + 1; ib = min(i__3,i__4); /* Factorize the diagonal block */ i__3 = *ldab - 1; dpotf2_(uplo, &ib, &ab_ref(1, i__), &i__3, &ii); if (ii != 0) { *info = i__ + ii - 1; goto L150; } if (i__ + ib <= *n) { /* Update the relevant part of the trailing submatrix. If A11 denotes the diagonal block which has just been factorized, then we need to update the remaining blocks in the diagram: A11 A21 A22 A31 A32 A33 The numbers of rows and columns in the partitioning are IB, I2, I3 respectively. The blocks A21, A22 and A32 are empty if IB = KD. The lower triangle of A31 lies outside the band. Computing MIN */ i__3 = *kd - ib, i__4 = *n - i__ - ib + 1; i2 = min(i__3,i__4); /* Computing MIN */ i__3 = ib, i__4 = *n - i__ - *kd + 1; i3 = min(i__3,i__4); if (i2 > 0) { /* Update A21 */ i__3 = *ldab - 1; i__4 = *ldab - 1; dtrsm_("Right", "Lower", "Transpose", "Non-unit", &i2, &ib, &c_b18, &ab_ref(1, i__), &i__3, &ab_ref( ib + 1, i__), &i__4); /* Update A22 */ i__3 = *ldab - 1; i__4 = *ldab - 1; dsyrk_("Lower", "No Transpose", &i2, &ib, &c_b21, & ab_ref(ib + 1, i__), &i__3, &c_b18, &ab_ref(1, i__ + ib), &i__4); } if (i3 > 0) { /* Copy the upper triangle of A31 into the work array. */ i__3 = ib; for (jj = 1; jj <= i__3; ++jj) { i__4 = min(jj,i3); for (ii = 1; ii <= i__4; ++ii) { work_ref(ii, jj) = ab_ref(*kd + 1 - jj + ii, jj + i__ - 1); /* L100: */ } /* L110: */ } /* Update A31 (in the work array). */ i__3 = *ldab - 1; dtrsm_("Right", "Lower", "Transpose", "Non-unit", &i3, &ib, &c_b18, &ab_ref(1, i__), &i__3, work, & c__33); /* Update A32 */ if (i2 > 0) { i__3 = *ldab - 1; i__4 = *ldab - 1; dgemm_("No transpose", "Transpose", &i3, &i2, &ib, &c_b21, work, &c__33, &ab_ref(ib + 1, i__), &i__3, &c_b18, &ab_ref(*kd + 1 - ib, i__ + ib), &i__4); } /* Update A33 */ i__3 = *ldab - 1; dsyrk_("Lower", "No Transpose", &i3, &ib, &c_b21, work, &c__33, &c_b18, &ab_ref(1, i__ + *kd), & i__3); /* Copy the upper triangle of A31 back into place. */ i__3 = ib; for (jj = 1; jj <= i__3; ++jj) { i__4 = min(jj,i3); for (ii = 1; ii <= i__4; ++ii) { ab_ref(*kd + 1 - jj + ii, jj + i__ - 1) = work_ref(ii, jj); /* L120: */ } /* L130: */ } } } /* L140: */ } } } return 0; L150: return 0; /* End of DPBTRF */ } /* dpbtrf_ */
/* Subroutine */ int dlarfb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, doublereal *v, integer * ldv, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, doublereal *work, integer *ldwork) { /* -- LAPACK auxiliary routine (version 3.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University February 29, 1992 Purpose ======= DLARFB applies a real block reflector H or its transpose H' to a real m by n matrix C, from either the left or the right. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply H or H' from the Left = 'R': apply H or H' from the Right TRANS (input) CHARACTER*1 = 'N': apply H (No transpose) = 'T': apply H' (Transpose) DIRECT (input) CHARACTER*1 Indicates how H is formed from a product of elementary reflectors = 'F': H = H(1) H(2) . . . H(k) (Forward) = 'B': H = H(k) . . . H(2) H(1) (Backward) STOREV (input) CHARACTER*1 Indicates how the vectors which define the elementary reflectors are stored: = 'C': Columnwise = 'R': Rowwise M (input) INTEGER The number of rows of the matrix C. N (input) INTEGER The number of columns of the matrix C. K (input) INTEGER The order of the matrix T (= the number of elementary reflectors whose product defines the block reflector). V (input) DOUBLE PRECISION array, dimension (LDV,K) if STOREV = 'C' (LDV,M) if STOREV = 'R' and SIDE = 'L' (LDV,N) if STOREV = 'R' and SIDE = 'R' The matrix V. See further details. LDV (input) INTEGER The leading dimension of the array V. If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); if STOREV = 'R', LDV >= K. T (input) DOUBLE PRECISION array, dimension (LDT,K) The triangular k by k matrix T in the representation of the block reflector. LDT (input) INTEGER The leading dimension of the array T. LDT >= K. C (input/output) DOUBLE PRECISION array, dimension (LDC,N) On entry, the m by n matrix C. On exit, C is overwritten by H*C or H'*C or C*H or C*H'. LDC (input) INTEGER The leading dimension of the array C. LDA >= max(1,M). WORK (workspace) DOUBLE PRECISION array, dimension (LDWORK,K) LDWORK (input) INTEGER The leading dimension of the array WORK. If SIDE = 'L', LDWORK >= max(1,N); if SIDE = 'R', LDWORK >= max(1,M). ===================================================================== Quick return if possible Parameter adjustments */ /* Table of constant values */ static integer c__1 = 1; static doublereal c_b14 = 1.; static doublereal c_b25 = -1.; /* System generated locals */ integer c_dim1, c_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1, work_offset, i__1, i__2; /* Local variables */ static integer i__, j; extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); static char transt[1]; #define work_ref(a_1,a_2) work[(a_2)*work_dim1 + a_1] #define c___ref(a_1,a_2) c__[(a_2)*c_dim1 + a_1] #define v_ref(a_1,a_2) v[(a_2)*v_dim1 + a_1] v_dim1 = *ldv; v_offset = 1 + v_dim1 * 1; v -= v_offset; t_dim1 = *ldt; t_offset = 1 + t_dim1 * 1; t -= t_offset; c_dim1 = *ldc; c_offset = 1 + c_dim1 * 1; c__ -= c_offset; work_dim1 = *ldwork; work_offset = 1 + work_dim1 * 1; work -= work_offset; /* Function Body */ if (*m <= 0 || *n <= 0) { return 0; } if (lsame_(trans, "N")) { *(unsigned char *)transt = 'T'; } else { *(unsigned char *)transt = 'N'; } if (lsame_(storev, "C")) { if (lsame_(direct, "F")) { /* Let V = ( V1 ) (first K rows) ( V2 ) where V1 is unit lower triangular. */ if (lsame_(side, "L")) { /* Form H * C or H' * C where C = ( C1 ) ( C2 ) W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK) W := C1' */ i__1 = *k; for (j = 1; j <= i__1; ++j) { dcopy_(n, &c___ref(j, 1), ldc, &work_ref(1, j), &c__1); /* L10: */ } /* W := W * V1 */ dtrmm_("Right", "Lower", "No transpose", "Unit", n, k, &c_b14, &v[v_offset], ldv, &work[work_offset], ldwork); if (*m > *k) { /* W := W + C2'*V2 */ i__1 = *m - *k; dgemm_("Transpose", "No transpose", n, k, &i__1, &c_b14, & c___ref(*k + 1, 1), ldc, &v_ref(*k + 1, 1), ldv, & c_b14, &work[work_offset], ldwork); } /* W := W * T' or W * T */ dtrmm_("Right", "Upper", transt, "Non-unit", n, k, &c_b14, &t[ t_offset], ldt, &work[work_offset], ldwork); /* C := C - V * W' */ if (*m > *k) { /* C2 := C2 - V2 * W' */ i__1 = *m - *k; dgemm_("No transpose", "Transpose", &i__1, n, k, &c_b25, & v_ref(*k + 1, 1), ldv, &work[work_offset], ldwork, &c_b14, &c___ref(*k + 1, 1), ldc); } /* W := W * V1' */ dtrmm_("Right", "Lower", "Transpose", "Unit", n, k, &c_b14, & v[v_offset], ldv, &work[work_offset], ldwork); /* C1 := C1 - W' */ i__1 = *k; for (j = 1; j <= i__1; ++j) { i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { c___ref(j, i__) = c___ref(j, i__) - work_ref(i__, j); /* L20: */ } /* L30: */ } } else if (lsame_(side, "R")) { /* Form C * H or C * H' where C = ( C1 C2 ) W := C * V = (C1*V1 + C2*V2) (stored in WORK) W := C1 */ i__1 = *k; for (j = 1; j <= i__1; ++j) { dcopy_(m, &c___ref(1, j), &c__1, &work_ref(1, j), &c__1); /* L40: */ } /* W := W * V1 */ dtrmm_("Right", "Lower", "No transpose", "Unit", m, k, &c_b14, &v[v_offset], ldv, &work[work_offset], ldwork); if (*n > *k) { /* W := W + C2 * V2 */ i__1 = *n - *k; dgemm_("No transpose", "No transpose", m, k, &i__1, & c_b14, &c___ref(1, *k + 1), ldc, &v_ref(*k + 1, 1) , ldv, &c_b14, &work[work_offset], ldwork); } /* W := W * T or W * T' */ dtrmm_("Right", "Upper", trans, "Non-unit", m, k, &c_b14, &t[ t_offset], ldt, &work[work_offset], ldwork); /* C := C - W * V' */ if (*n > *k) { /* C2 := C2 - W * V2' */ i__1 = *n - *k; dgemm_("No transpose", "Transpose", m, &i__1, k, &c_b25, & work[work_offset], ldwork, &v_ref(*k + 1, 1), ldv, &c_b14, &c___ref(1, *k + 1), ldc); } /* W := W * V1' */ dtrmm_("Right", "Lower", "Transpose", "Unit", m, k, &c_b14, & v[v_offset], ldv, &work[work_offset], ldwork); /* C1 := C1 - W */ i__1 = *k; for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { c___ref(i__, j) = c___ref(i__, j) - work_ref(i__, j); /* L50: */ } /* L60: */ } } } else { /* Let V = ( V1 ) ( V2 ) (last K rows) where V2 is unit upper triangular. */ if (lsame_(side, "L")) { /* Form H * C or H' * C where C = ( C1 ) ( C2 ) W := C' * V = (C1'*V1 + C2'*V2) (stored in WORK) W := C2' */ i__1 = *k; for (j = 1; j <= i__1; ++j) { dcopy_(n, &c___ref(*m - *k + j, 1), ldc, &work_ref(1, j), &c__1); /* L70: */ } /* W := W * V2 */ dtrmm_("Right", "Upper", "No transpose", "Unit", n, k, &c_b14, &v_ref(*m - *k + 1, 1), ldv, &work[work_offset], ldwork); if (*m > *k) { /* W := W + C1'*V1 */ i__1 = *m - *k; dgemm_("Transpose", "No transpose", n, k, &i__1, &c_b14, & c__[c_offset], ldc, &v[v_offset], ldv, &c_b14, & work[work_offset], ldwork); } /* W := W * T' or W * T */ dtrmm_("Right", "Lower", transt, "Non-unit", n, k, &c_b14, &t[ t_offset], ldt, &work[work_offset], ldwork); /* C := C - V * W' */ if (*m > *k) { /* C1 := C1 - V1 * W' */ i__1 = *m - *k; dgemm_("No transpose", "Transpose", &i__1, n, k, &c_b25, & v[v_offset], ldv, &work[work_offset], ldwork, & c_b14, &c__[c_offset], ldc) ; } /* W := W * V2' */ dtrmm_("Right", "Upper", "Transpose", "Unit", n, k, &c_b14, & v_ref(*m - *k + 1, 1), ldv, &work[work_offset], ldwork); /* C2 := C2 - W' */ i__1 = *k; for (j = 1; j <= i__1; ++j) { i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { c___ref(*m - *k + j, i__) = c___ref(*m - *k + j, i__) - work_ref(i__, j); /* L80: */ } /* L90: */ } } else if (lsame_(side, "R")) { /* Form C * H or C * H' where C = ( C1 C2 ) W := C * V = (C1*V1 + C2*V2) (stored in WORK) W := C2 */ i__1 = *k; for (j = 1; j <= i__1; ++j) { dcopy_(m, &c___ref(1, *n - *k + j), &c__1, &work_ref(1, j) , &c__1); /* L100: */ } /* W := W * V2 */ dtrmm_("Right", "Upper", "No transpose", "Unit", m, k, &c_b14, &v_ref(*n - *k + 1, 1), ldv, &work[work_offset], ldwork); if (*n > *k) { /* W := W + C1 * V1 */ i__1 = *n - *k; dgemm_("No transpose", "No transpose", m, k, &i__1, & c_b14, &c__[c_offset], ldc, &v[v_offset], ldv, & c_b14, &work[work_offset], ldwork); } /* W := W * T or W * T' */ dtrmm_("Right", "Lower", trans, "Non-unit", m, k, &c_b14, &t[ t_offset], ldt, &work[work_offset], ldwork); /* C := C - W * V' */ if (*n > *k) { /* C1 := C1 - W * V1' */ i__1 = *n - *k; dgemm_("No transpose", "Transpose", m, &i__1, k, &c_b25, & work[work_offset], ldwork, &v[v_offset], ldv, & c_b14, &c__[c_offset], ldc) ; } /* W := W * V2' */ dtrmm_("Right", "Upper", "Transpose", "Unit", m, k, &c_b14, & v_ref(*n - *k + 1, 1), ldv, &work[work_offset], ldwork); /* C2 := C2 - W */ i__1 = *k; for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { c___ref(i__, *n - *k + j) = c___ref(i__, *n - *k + j) - work_ref(i__, j); /* L110: */ } /* L120: */ } } } } else if (lsame_(storev, "R")) { if (lsame_(direct, "F")) { /* Let V = ( V1 V2 ) (V1: first K columns) where V1 is unit upper triangular. */ if (lsame_(side, "L")) { /* Form H * C or H' * C where C = ( C1 ) ( C2 ) W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK) W := C1' */ i__1 = *k; for (j = 1; j <= i__1; ++j) { dcopy_(n, &c___ref(j, 1), ldc, &work_ref(1, j), &c__1); /* L130: */ } /* W := W * V1' */ dtrmm_("Right", "Upper", "Transpose", "Unit", n, k, &c_b14, & v[v_offset], ldv, &work[work_offset], ldwork); if (*m > *k) { /* W := W + C2'*V2' */ i__1 = *m - *k; dgemm_("Transpose", "Transpose", n, k, &i__1, &c_b14, & c___ref(*k + 1, 1), ldc, &v_ref(1, *k + 1), ldv, & c_b14, &work[work_offset], ldwork); } /* W := W * T' or W * T */ dtrmm_("Right", "Upper", transt, "Non-unit", n, k, &c_b14, &t[ t_offset], ldt, &work[work_offset], ldwork); /* C := C - V' * W' */ if (*m > *k) { /* C2 := C2 - V2' * W' */ i__1 = *m - *k; dgemm_("Transpose", "Transpose", &i__1, n, k, &c_b25, & v_ref(1, *k + 1), ldv, &work[work_offset], ldwork, &c_b14, &c___ref(*k + 1, 1), ldc); } /* W := W * V1 */ dtrmm_("Right", "Upper", "No transpose", "Unit", n, k, &c_b14, &v[v_offset], ldv, &work[work_offset], ldwork); /* C1 := C1 - W' */ i__1 = *k; for (j = 1; j <= i__1; ++j) { i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { c___ref(j, i__) = c___ref(j, i__) - work_ref(i__, j); /* L140: */ } /* L150: */ } } else if (lsame_(side, "R")) { /* Form C * H or C * H' where C = ( C1 C2 ) W := C * V' = (C1*V1' + C2*V2') (stored in WORK) W := C1 */ i__1 = *k; for (j = 1; j <= i__1; ++j) { dcopy_(m, &c___ref(1, j), &c__1, &work_ref(1, j), &c__1); /* L160: */ } /* W := W * V1' */ dtrmm_("Right", "Upper", "Transpose", "Unit", m, k, &c_b14, & v[v_offset], ldv, &work[work_offset], ldwork); if (*n > *k) { /* W := W + C2 * V2' */ i__1 = *n - *k; dgemm_("No transpose", "Transpose", m, k, &i__1, &c_b14, & c___ref(1, *k + 1), ldc, &v_ref(1, *k + 1), ldv, & c_b14, &work[work_offset], ldwork); } /* W := W * T or W * T' */ dtrmm_("Right", "Upper", trans, "Non-unit", m, k, &c_b14, &t[ t_offset], ldt, &work[work_offset], ldwork); /* C := C - W * V */ if (*n > *k) { /* C2 := C2 - W * V2 */ i__1 = *n - *k; dgemm_("No transpose", "No transpose", m, &i__1, k, & c_b25, &work[work_offset], ldwork, &v_ref(1, *k + 1), ldv, &c_b14, &c___ref(1, *k + 1), ldc); } /* W := W * V1 */ dtrmm_("Right", "Upper", "No transpose", "Unit", m, k, &c_b14, &v[v_offset], ldv, &work[work_offset], ldwork); /* C1 := C1 - W */ i__1 = *k; for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { c___ref(i__, j) = c___ref(i__, j) - work_ref(i__, j); /* L170: */ } /* L180: */ } } } else { /* Let V = ( V1 V2 ) (V2: last K columns) where V2 is unit lower triangular. */ if (lsame_(side, "L")) { /* Form H * C or H' * C where C = ( C1 ) ( C2 ) W := C' * V' = (C1'*V1' + C2'*V2') (stored in WORK) W := C2' */ i__1 = *k; for (j = 1; j <= i__1; ++j) { dcopy_(n, &c___ref(*m - *k + j, 1), ldc, &work_ref(1, j), &c__1); /* L190: */ } /* W := W * V2' */ dtrmm_("Right", "Lower", "Transpose", "Unit", n, k, &c_b14, & v_ref(1, *m - *k + 1), ldv, &work[work_offset], ldwork); if (*m > *k) { /* W := W + C1'*V1' */ i__1 = *m - *k; dgemm_("Transpose", "Transpose", n, k, &i__1, &c_b14, & c__[c_offset], ldc, &v[v_offset], ldv, &c_b14, & work[work_offset], ldwork); } /* W := W * T' or W * T */ dtrmm_("Right", "Lower", transt, "Non-unit", n, k, &c_b14, &t[ t_offset], ldt, &work[work_offset], ldwork); /* C := C - V' * W' */ if (*m > *k) { /* C1 := C1 - V1' * W' */ i__1 = *m - *k; dgemm_("Transpose", "Transpose", &i__1, n, k, &c_b25, &v[ v_offset], ldv, &work[work_offset], ldwork, & c_b14, &c__[c_offset], ldc); } /* W := W * V2 */ dtrmm_("Right", "Lower", "No transpose", "Unit", n, k, &c_b14, &v_ref(1, *m - *k + 1), ldv, &work[work_offset], ldwork); /* C2 := C2 - W' */ i__1 = *k; for (j = 1; j <= i__1; ++j) { i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { c___ref(*m - *k + j, i__) = c___ref(*m - *k + j, i__) - work_ref(i__, j); /* L200: */ } /* L210: */ } } else if (lsame_(side, "R")) { /* Form C * H or C * H' where C = ( C1 C2 ) W := C * V' = (C1*V1' + C2*V2') (stored in WORK) W := C2 */ i__1 = *k; for (j = 1; j <= i__1; ++j) { dcopy_(m, &c___ref(1, *n - *k + j), &c__1, &work_ref(1, j) , &c__1); /* L220: */ } /* W := W * V2' */ dtrmm_("Right", "Lower", "Transpose", "Unit", m, k, &c_b14, & v_ref(1, *n - *k + 1), ldv, &work[work_offset], ldwork); if (*n > *k) { /* W := W + C1 * V1' */ i__1 = *n - *k; dgemm_("No transpose", "Transpose", m, k, &i__1, &c_b14, & c__[c_offset], ldc, &v[v_offset], ldv, &c_b14, & work[work_offset], ldwork); } /* W := W * T or W * T' */ dtrmm_("Right", "Lower", trans, "Non-unit", m, k, &c_b14, &t[ t_offset], ldt, &work[work_offset], ldwork); /* C := C - W * V */ if (*n > *k) { /* C1 := C1 - W * V1 */ i__1 = *n - *k; dgemm_("No transpose", "No transpose", m, &i__1, k, & c_b25, &work[work_offset], ldwork, &v[v_offset], ldv, &c_b14, &c__[c_offset], ldc); } /* W := W * V2 */ dtrmm_("Right", "Lower", "No transpose", "Unit", m, k, &c_b14, &v_ref(1, *n - *k + 1), ldv, &work[work_offset], ldwork); /* C1 := C1 - W */ i__1 = *k; for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { c___ref(i__, *n - *k + j) = c___ref(i__, *n - *k + j) - work_ref(i__, j); /* L230: */ } /* L240: */ } } } } return 0; /* End of DLARFB */ } /* dlarfb_ */