extern "C" magma_int_t magma_cunmql(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *c, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CUNMQL overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'C': Q**H * C C * Q**H where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by CGEQLF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'C': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQLF in the last k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQLF. C (input/output) COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magma_int_t i__4, i__; magmaFloatComplex *T; magma_int_t i1, i2, i3, ib, nb, mi, ni, nq, nw; magma_int_t iinfo, ldwork, lwkopt=0; int lquery, left, notran; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = max(1,n); } else { nq = n; nw = max(1,m); } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(trans_, "C")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } if (*info == 0) { if (m == 0 || n == 0) { lwkopt = 1; } else { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = nw * nb; } work[0] = MAGMA_C_MAKE( lwkopt, 0 ); if (lwork < nw && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0) { return *info; } /* Allocate work space on the GPU */ magmaFloatComplex *dwork, *dc; magma_cmalloc( &dc, (m)*(n) ); magma_cmalloc( &dwork, 2*(m + 64)*64 ); /* Copy matrix C from the CPU to the GPU */ magma_csetmatrix( m, n, c, ldc, dc, m ); /* work space on CPU */ if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &T, 2*nb*nb ) ) { magma_free( dc ); magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } ldwork = nw; if ( nb >= k ) { /* Use CPU code */ lapackf77_cunmql(side_, trans_, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ((left && notran) || (! left && ! notran)) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; } else { mi = m; } for (i__ = i1; (i3 < 0 ? i__ >= i2 : i__ <= i2); i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ i__4 = nq - k + i__ + ib - 1; lapackf77_clarft("Backward", "Columnwise", &i__4, &ib, &a[(i__-1) * lda], &lda, &tau[i__-1], T, &ib); /* 1) Put 0s in the lower triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ cpanel_to_q('L', ib, &a[i__-1 + (i__-1) * lda], lda, T+ib*ib); magma_csetmatrix( i__4, ib, &a[(i__-1) * lda], lda, dwork, i__4 ); cq_to_panel('L', ib, &a[i__-1 + (i__-1) * lda], lda, T+ib*ib); if (left) { /* H or H' is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i__ + ib - 1; } else { /* H or H' is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i__ + ib - 1; } /* Apply H or H'; First copy T to the GPU */ magma_csetmatrix( ib, ib, T, ib, dwork+i__4*ib, ib ); magma_clarfb_gpu(side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, dwork, i__4, dwork+i__4*ib, ib, dc, m, dwork+i__4*ib + ib*ib, ldwork); } magma_cgetmatrix( m, n, dc, m, c, ldc ); } work[0] = MAGMA_C_MAKE( lwkopt, 0 ); magma_free( dc ); magma_free( dwork ); magma_free_pinned( T); return *info; } /* magma_cunmql */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmql */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgeqlf_nb( m ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_CUNMQL( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqlf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*k ); TESTING_MALLOC_CPU( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QL factorization to get Householder vectors in A, tau magma_cgeqlf( mm, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_cgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmql( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmql returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmql( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_cunmql (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmql( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmql returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/** Purpose ------- CUNMQL overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by CGEQLF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQLF in the last k columns of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQLF. @param[in,out] C COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgeqlf_comp ********************************************************************/ extern "C" magma_int_t magma_cunmql( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) magmaFloatComplex *T, *T2; magma_int_t i, i1, i2, ib, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; *info = 0; bool left = (side == MagmaLeft); bool notran = (trans == MagmaNoTrans); bool lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_cgelqf_nb( m, n ); lwkopt = max(1,nw)*nb; work[0] = magma_cmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_C_ONE; return *info; } ldwork = nw; if ( nb >= k ) { /* Use CPU code */ lapackf77_cunmql( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo ); } else { /* Use hybrid CPU-GPU code */ /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = magma_roundup( m, 32 ); magmaFloatComplex *dwork, *dV, *dT, *dC; magma_cmalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_cmalloc_pinned( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Copy matrix C from the CPU to the GPU */ magma_csetmatrix( m, n, C, ldc, dC, lddc, queue ); if ( (left && notran) || (! left && ! notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; } else { mi = m; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ nq_i = nq - k + i + ib; lapackf77_clarft("Backward", "Columnwise", &nq_i, &ib, A(0,i), &lda, &tau[i], T, &ib); /* 1) set lower triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ magma_cpanel_to_q( MagmaLower, ib, A(nq_i-ib,i), lda, T2 ); magma_csetmatrix( nq_i, ib, A(0,i), lda, dV, nq_i, queue ); magma_cq_to_panel( MagmaLower, ib, A(nq_i-ib,i), lda, T2 ); if (left) { /* H or H**H is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i + ib; } else { /* H or H**H is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i + ib; } /* Apply H or H**H; First copy T to the GPU */ magma_csetmatrix( ib, ib, T, ib, dT, ib, queue ); magma_clarfb_gpu( side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, dV, nq_i, dT, ib, dC, lddc, dwork, ldwork, queue ); } magma_cgetmatrix( m, n, dC, lddc, C, ldc, queue ); magma_queue_destroy( queue ); magma_free( dwork ); magma_free_pinned( T ); } work[0] = magma_cmake_lwork( lwkopt ); return *info; } /* magma_cunmql */