/* //////////////////////////////////////////////////////////////////////////// -- Testing dormbr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Cnorm, error, dwork[1]; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; double *C, *R, *A, *work, *tau, *tauq, *taup; double *d, *e; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_vect_t vect [] = { MagmaQ, MagmaP }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K vect side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int ivect = 0; ivect < 2; ++ivect ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_dgebrd_nb( m, n ); ldc = m; // A is nq x k (vect=Q) or k x nq (vect=P) // where nq=m (left) or nq=n (right) nq = (side[iside] == MagmaLeft ? m : n ); mm = (vect[ivect] == MagmaQ ? nq : k ); nn = (vect[ivect] == MagmaQ ? k : nq); lda = mm; // MBR calls either MQR or MLQ in various ways if ( vect[ivect] == MagmaQ ) { if ( nq >= k ) { gflops = FLOPS_DORMQR( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_DORMQR( mi, ni, nq-1, side[iside] ) / 1e9; } } else { if ( nq > k ) { gflops = FLOPS_DORMLQ( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_DORMLQ( mi, ni, nq-1, side[iside] ) / 1e9; } } // workspace for gebrd is (mm + nn)*nb // workspace for unmbr is m*nb or n*nb, depending on side lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb )); // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_dmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, double, ldc*n ); TESTING_MALLOC_CPU( R, double, ldc*n ); TESTING_MALLOC_CPU( A, double, lda*nn ); TESTING_MALLOC_CPU( work, double, lwork_max ); TESTING_MALLOC_CPU( d, double, min(mm,nn) ); TESTING_MALLOC_CPU( e, double, min(mm,nn) ); TESTING_MALLOC_CPU( tauq, double, min(mm,nn) ); TESTING_MALLOC_CPU( taup, double, min(mm,nn) ); // C is full, m x n size = ldc*n; lapackf77_dlarnv( &ione, ISEED, &size, C ); lapackf77_dlacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*nn; lapackf77_dlarnv( &ione, ISEED, &size, A ); // compute BRD factorization to get Householder vectors in A, tauq, taup //lapackf77_dgebrd( &mm, &nn, A, &lda, d, e, tauq, taup, work, &lwork_max, &info ); magma_dgebrd( mm, nn, A, lda, d, e, tauq, taup, work, lwork_max, &info ); if (info != 0) { printf("magma_dgebrd returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( vect[ivect] == MagmaQ ) { tau = tauq; } else { tau = taup; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dormbr( lapack_vect_const( vect[ivect] ), lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dormbr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_dormbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); if (info != 0) { printf("magma_dormbr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_D_REAL( work[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_dormbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dormbr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_dlange( "Fro", &m, &n, C, &ldc, dwork ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, dwork ) / (magma_dsqrt(m*n) * Cnorm); printf( "%5d %5d %5d %c %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_vect_const( vect[ivect] ), lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( d ); TESTING_FREE_CPU( e ); TESTING_FREE_CPU( taup ); TESTING_FREE_CPU( tauq ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}} // end ivect, iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- DGEBRD reduces a general real M-by-N matrix A to upper or lower bidiagonal form B by an orthogonal transformation: Q**H * A * P = B. If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal. Arguments --------- @param[in] m INTEGER The number of rows in the matrix A. M >= 0. @param[in] n INTEGER The number of columns in the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N general matrix to be reduced. On exit, if m >= n, the diagonal and the first superdiagonal are overwritten with the upper bidiagonal matrix B; the elements below the diagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the first superdiagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors; \n if m < n, the diagonal and the first subdiagonal are overwritten with the lower bidiagonal matrix B; the elements below the first subdiagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the diagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] d double precision array, dimension (min(M,N)) The diagonal elements of the bidiagonal matrix B: D(i) = A(i,i). @param[out] e double precision array, dimension (min(M,N)-1) The off-diagonal elements of the bidiagonal matrix B: if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. @param[out] tauq DOUBLE_PRECISION array dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. @param[out] taup DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= (M+N)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrices Q and P are represented as products of elementary reflectors: If m >= n, Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors; v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The contents of A on exit are illustrated by the following examples: @verbatim m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) ( v1 v2 v3 v4 v5 ) @endverbatim where d and e denote diagonal and off-diagonal elements of B, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). @ingroup magma_dgesvd_comp ********************************************************************/ extern "C" magma_int_t magma_dgebrd(magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *d, double *e, double *tauq, double *taup, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double *dA, *dwork; magma_int_t ncol, nrow, jmax, nb, ldda; magma_int_t i, j, nx; magma_int_t iinfo; magma_int_t minmn; magma_int_t ldwrkx, ldwrky, lwkopt; magma_int_t lquery; nb = magma_get_dgebrd_nb(n); ldda = m; lwkopt = (m + n) * nb; work[0] = MAGMA_D_MAKE( lwkopt, 0. ); lquery = (lwork == -1); /* Check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < lwkopt && (! lquery) ) { *info = -10; } if (*info < 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ minmn = min(m,n); if (minmn == 0) { work[0] = c_one; return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + (m + n)*nb )) { fprintf (stderr, "!!!! device memory allocation error in dgebrd\n" ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = dA + n*ldda; ldwrkx = m; ldwrky = n; /* Set the block/unblock crossover point NX. */ nx = 128; /* Copy the matrix to the GPU */ if (minmn - nx >= 1) { magma_dsetmatrix( m, n, A, lda, dA, ldda ); } for (i=0; i < (minmn - nx); i += nb) { /* Reduce rows and columns i:i+nb-1 to bidiagonal form and return the matrices X and Y which are needed to update the unreduced part of the matrix */ nrow = m - i; ncol = n - i; /* Get the current panel (no need for the 1st iteration) */ if ( i > 0 ) { magma_dgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda ); magma_dgetmatrix( nb, ncol - nb, dA(i, i+nb), ldda, A( i, i+nb), lda ); } magma_dlabrd_gpu(nrow, ncol, nb, A(i, i), lda, dA(i, i), ldda, d+i, e+i, tauq+i, taup+i, work, ldwrkx, dwork, ldwrkx, // x, dx work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky); // y, dy /* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update of the form A := A - V*Y' - X*U' */ nrow = m - i - nb; ncol = n - i - nb; // Send Y back to the GPU magma_dsetmatrix( nrow, nb, work + nb, ldwrkx, dwork + nb, ldwrkx ); magma_dsetmatrix( ncol, nb, work + (ldwrkx+1)*nb, ldwrky, dwork + (ldwrkx+1)*nb, ldwrky ); magma_dgemm( MagmaNoTrans, MagmaConjTrans, nrow, ncol, nb, c_neg_one, dA(i+nb, i ), ldda, dwork+(ldwrkx+1)*nb, ldwrky, c_one, dA(i+nb, i+nb), ldda); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nrow, ncol, nb, c_neg_one, dwork+nb, ldwrkx, dA( i, i+nb ), ldda, c_one, dA( i+nb, i+nb ), ldda); /* Copy diagonal and off-diagonal elements of B back into A */ if (m >= n) { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_D_MAKE( d[j], 0. ); *A(j, j+1) = MAGMA_D_MAKE( e[j], 0. ); } } else { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_D_MAKE( d[j], 0. ); *A(j+1, j ) = MAGMA_D_MAKE( e[j], 0. ); } } } /* Use unblocked code to reduce the remainder of the matrix */ nrow = m - i; ncol = n - i; if ( 0 < minmn - nx ) { magma_dgetmatrix( nrow, ncol, dA(i, i), ldda, A(i, i), lda ); } lapackf77_dgebrd( &nrow, &ncol, A(i, i), &lda, d+i, e+i, tauq+i, taup+i, work, &lwork, &iinfo); work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( dA ); return *info; } /* magma_dgebrd */
/** Purpose ------- If VECT = MagmaQ, DORMBR overwrites the general real M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q*C C*Q TRANS = MagmaTrans: Q**H*C C*Q**H If VECT = MagmaP, DORMBR overwrites the general real M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: P*C C*P TRANS = MagmaTrans: P**H*C C*P**H Here Q and P**H are the unitary matrices determined by DGEBRD when reducing A real matrix A to bidiagonal form: A = Q*B * P**H. Q and P**H are defined as products of elementary reflectors H(i) and G(i) respectively. Let nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Thus nq is the order of the unitary matrix Q or P**H that is applied. If VECT = MagmaQ, A is assumed to have been an NQ-by-K matrix: if nq >= k, Q = H(1) H(2) . . . H(k); if nq < k, Q = H(1) H(2) . . . H(nq-1). If VECT = MagmaP, A is assumed to have been A K-by-NQ matrix: if k < nq, P = G(1) G(2) . . . G(k); if k >= nq, P = G(1) G(2) . . . G(nq-1). Arguments --------- @param[in] vect magma_vect_t - = MagmaQ: apply Q or Q**H; - = MagmaP: apply P or P**H. @param[in] side magma_side_t - = MagmaLeft: apply Q, Q**H, P or P**H from the Left; - = MagmaRight: apply Q, Q**H, P or P**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q or P; - = MagmaTrans: Conjugate transpose, apply Q**H or P**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER If VECT = MagmaQ, the number of columns in the original matrix reduced by DGEBRD. If VECT = MagmaP, the number of rows in the original matrix reduced by DGEBRD. K >= 0. @param[in] A DOUBLE_PRECISION array, dimension (LDA,min(nq,K)) if VECT = MagmaQ (LDA,nq) if VECT = MagmaP The vectors which define the elementary reflectors H(i) and G(i), whose products determine the matrices Q and P, as returned by DGEBRD. @param[in] lda INTEGER The leading dimension of the array A. If VECT = MagmaQ, LDA >= max(1,nq); if VECT = MagmaP, LDA >= max(1,min(nq,K)). @param[in] tau DOUBLE_PRECISION array, dimension (min(nq,K)) TAU(i) must contain the scalar factor of the elementary reflector H(i) or G(i) which determines Q or P, as returned by DGEBRD in the array argument TAUQ or TAUP. @param[in,out] C DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q or P*C or P**H*C or C*P or C*P**H. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M); if N = 0 or M = 0, LWORK >= 1. For optimum performance if SIDE = MagmaLeft, LWORK >= max(1,N*NB); if SIDE = MagmaRight, LWORK >= max(1,M*NB), where NB is the optimal blocksize. (NB = 0 if M = 0 or N = 0.) \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgesvd_comp ********************************************************************/ extern "C" magma_int_t magma_dormbr( magma_vect_t vect, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i,j) (A + (i) + (j)*lda) #define C(i,j) (C + (i) + (j)*ldc) magma_int_t i1, i2, nb, mi, ni, nq, nq_1, nw, iinfo, lwkopt; magma_int_t left, notran, applyq, lquery; magma_trans_t transt; MAGMA_UNUSED( nq_1 ); // used only in version 1 *info = 0; applyq = (vect == MagmaQ); left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q or P and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (m == 0 || n == 0) { nw = 0; } /* check arguments */ if (! applyq && vect != MagmaP) { *info = -1; } else if (! left && side != MagmaRight) { *info = -2; } else if (! notran && trans != MagmaTrans) { *info = -3; } else if (m < 0) { *info = -4; } else if (n < 0) { *info = -5; } else if (k < 0) { *info = -6; } else if ( ( applyq && lda < max(1,nq) ) || ( ! applyq && lda < max(1,min(nq,k)) ) ) { *info = -8; } else if (ldc < max(1,m)) { *info = -11; } else if (lwork < max(1,nw) && ! lquery) { *info = -13; } if (*info == 0) { if (nw > 0) { // TODO have get_dormqr_nb and get_dormlq_nb routines? see original LAPACK dormbr. // TODO make them dependent on m, n, and k? nb = magma_get_dgebrd_nb( min( m, n )); lwkopt = max(1, nw*nb); } else { lwkopt = 1; } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0) { return *info; } if (applyq) { /* Apply Q */ if (nq >= k) { /* Q was determined by a call to DGEBRD with nq >= k */ #if VERSION == 1 lapackf77_dormqr( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); #else magma_dormqr( side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo); #endif } else if (nq > 1) { /* Q was determined by a call to DGEBRD with nq < k */ if (left) { mi = m - 1; ni = n; i1 = 1; i2 = 0; } else { mi = m; ni = n - 1; i1 = 0; i2 = 1; } #if VERSION == 1 nq_1 = nq - 1; lapackf77_dormqr( lapack_side_const(side), lapack_trans_const(trans), &mi, &ni, &nq_1, A(1,0), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo); #else magma_dormqr( side, trans, mi, ni, nq-1, A(1,0), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo); #endif } } else { /* Apply P */ if (notran) { transt = MagmaTrans; } else { transt = MagmaNoTrans; } if (nq > k) { /* P was determined by a call to DGEBRD with nq > k */ #if VERSION == 1 lapackf77_dormlq( lapack_side_const(side), lapack_trans_const(transt), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); #else magma_dormlq( side, transt, m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo); #endif } else if (nq > 1) { /* P was determined by a call to DGEBRD with nq <= k */ if (left) { mi = m - 1; ni = n; i1 = 1; i2 = 0; } else { mi = m; ni = n - 1; i1 = 0; i2 = 1; } #if VERSION == 1 nq_1 = nq - 1; lapackf77_dormlq( lapack_side_const(side), lapack_trans_const(transt), &mi, &ni, &nq_1, A(0,1), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo); #else magma_dormlq( side, transt, mi, ni, nq-1, A(0,1), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo); #endif } } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); return *info; } /* magma_dormbr */
magma_int_t magma_get_dgesvd_nb( magma_int_t m ) { return magma_get_dgebrd_nb( m ); }
magma_int_t magmaf_get_dgebrd_nb( magma_int_t *m ) { return magma_get_dgebrd_nb( *m ); }
extern "C" magma_int_t magma_dgebrd(magma_int_t m, magma_int_t n, double *a, magma_int_t lda, double *d, double *e, double *tauq, double *taup, double *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DGEBRD reduces a general real M-by-N matrix A to upper or lower bidiagonal form B by an orthogonal transformation: Q**T * A * P = B. If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal. Arguments ========= M (input) INTEGER The number of rows in the matrix A. M >= 0. N (input) INTEGER The number of columns in the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N general matrix to be reduced. On exit, if m >= n, the diagonal and the first superdiagonal are overwritten with the upper bidiagonal matrix B; the elements below the diagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the first superdiagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors; if m < n, the diagonal and the first subdiagonal are overwritten with the lower bidiagonal matrix B; the elements below the first subdiagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the diagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). D (output) double precision array, dimension (min(M,N)) The diagonal elements of the bidiagonal matrix B: D(i) = A(i,i). E (output) double precision array, dimension (min(M,N)-1) The off-diagonal elements of the bidiagonal matrix B: if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. TAUQ (output) DOUBLE_PRECISION array dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. TAUP (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= (M+N)*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrices Q and P are represented as products of elementary reflectors: If m >= n, Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors; v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The contents of A on exit are illustrated by the following examples: m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) ( v1 v2 v3 v4 v5 ) where d and e denote diagonal and off-diagonal elements of B, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). ===================================================================== */ double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double *da, *dwork; magma_int_t ncol, nrow, jmax, nb, ldda; magma_int_t i, j, nx; magma_int_t iinfo; magma_int_t minmn; magma_int_t ldwrkx, ldwrky, lwkopt; magma_int_t lquery; nb = magma_get_dgebrd_nb(n); ldda = m; lwkopt = (m + n) * nb; work[0] = MAGMA_D_MAKE( lwkopt, 0. ); lquery = (lwork == -1); /* Check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < lwkopt && (! lquery) ) { *info = -10; } if (*info < 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ minmn = min(m,n); if (minmn == 0) { work[0] = c_one; return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &da, n*ldda + (m + n)*nb )) { fprintf (stderr, "!!!! device memory allocation error in dgebrd\n" ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = da + (n)*ldda; ldwrkx = m; ldwrky = n; /* Set the block/unblock crossover point NX. */ nx = 128; /* Copy the matrix to the GPU */ if (minmn - nx >= 1) { magma_dsetmatrix( m, n, a, lda, da, ldda ); } for (i=0; i< (minmn - nx); i += nb) { /* Reduce rows and columns i:i+nb-1 to bidiagonal form and return the matrices X and Y which are needed to update the unreduced part of the matrix */ nrow = m - i; ncol = n - i; /* Get the current panel (no need for the 1st iteration) */ if ( i > 0 ) { magma_dgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda ); magma_dgetmatrix( nb, ncol - nb, dA(i, i+nb), ldda, A( i, i+nb), lda ); } magma_dlabrd_gpu(nrow, ncol, nb, A(i, i), lda, dA(i, i), ldda, d+i, e+i, tauq+i, taup+i, work, ldwrkx, dwork, ldwrkx, // x, dx work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky); // y, dy /* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update of the form A := A - V*Y' - X*U' */ nrow = m - i - nb; ncol = n - i - nb; // Send Y back to the GPU magma_dsetmatrix( nrow, nb, work + nb, ldwrkx, dwork + nb, ldwrkx ); magma_dsetmatrix( ncol, nb, work + (ldwrkx+1)*nb, ldwrky, dwork + (ldwrkx+1)*nb, ldwrky ); magma_dgemm( MagmaNoTrans, MagmaTrans, nrow, ncol, nb, c_neg_one, dA(i+nb, i ), ldda, dwork+(ldwrkx+1)*nb, ldwrky, c_one, dA(i+nb, i+nb), ldda); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nrow, ncol, nb, c_neg_one, dwork+nb, ldwrkx, dA( i, i+nb ), ldda, c_one, dA( i+nb, i+nb ), ldda); /* Copy diagonal and off-diagonal elements of B back into A */ if (m >= n) { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_D_MAKE( d[j], 0. ); *A(j, j+1) = MAGMA_D_MAKE( e[j], 0. ); } } else { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_D_MAKE( d[j], 0. ); *A(j+1, j ) = MAGMA_D_MAKE( e[j], 0. ); } } } /* Use unblocked code to reduce the remainder of the matrix */ nrow = m - i; ncol = n - i; if ( 0 < minmn - nx ) { magma_dgetmatrix( nrow, ncol, dA(i, i), ldda, A(i, i), lda ); } lapackf77_dgebrd( &nrow, &ncol, A(i, i), &lda, d+i, e+i, tauq+i, taup+i, work, &lwork, &iinfo); work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( da ); return *info; } /* dgebrd */