extern "C" magma_int_t magma_sgeqlf(magma_int_t m, magma_int_t n, float *a, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SGEQLF computes a QL factorization of a REAL M-by-N matrix A: A = Q * L. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, if m >= n, the lower triangle of the subarray A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L; if m <= n, the elements on and below the (n-m)-th superdiagonal contain the M-by-N lower trapezoidal matrix L; the remaining elements, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB can be obtained through magma_get_sgeqlf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in A(1:m-k+i-1,n-k+i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) float *da, *dwork; float c_one = MAGMA_S_ONE; magma_int_t i, k, lddwork, old_i, old_ib, nb; magma_int_t rows, cols; magma_int_t ib, ki, kk, mu, nu, iinfo, ldda; int lquery; nb = magma_get_sgeqlf_nb(m); *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } if (*info == 0) { k = min(m,n); if (k == 0) work[0] = c_one; else { work[0] = MAGMA_S_MAKE( n*nb, 0 ); } if (lwork < max(1,n) && ! lquery) *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (k == 0) return *info; lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; if (MAGMA_SUCCESS != magma_smalloc( &da, (n)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = da + ldda*(n); cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. The last kk columns are handled by the block method. First, copy the matrix on the GPU except the last kk columns */ magma_ssetmatrix_async( (m), (n-nb), a_ref(0, 0), lda, da_ref(0, 0), ldda, stream[0] ); ki = ((k - nb - 1) / nb) * nb; kk = min(k, ki + nb); for (i = k - kk + ki; i >= k -kk; i -= nb) { ib = min(k-i,nb); if (i< k - kk + ki){ /* 1. Copy asynchronously the current panel to the CPU. 2. Copy asynchronously the submatrix below the panel to the CPU) */ rows = m - k + i + ib; magma_sgetmatrix_async( rows, ib, da_ref(0, n-k+i), ldda, a_ref(0, n-k+i), lda, stream[1] ); magma_sgetmatrix_async( (m-rows), ib, da_ref(rows, n-k+i), ldda, a_ref(rows, n-k+i), lda, stream[0] ); /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in two steps - implementing the lookahead techniques. This is the main update from the lookahead techniques. */ rows = m - k + old_i + old_ib; cols = n - k + old_i - old_ib; magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaBackward, MagmaColumnwise, rows, cols, old_ib, da_ref(0, cols+old_ib), ldda, dwork, lddwork, da_ref(0, 0 ), ldda, dwork+old_ib, lddwork); } magma_queue_sync( stream[1] ); /* Compute the QL factorization of the current block A(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1) */ rows = m - k + i + ib; cols = n - k + i; lapackf77_sgeqlf(&rows,&ib, a_ref(0,cols), &lda, tau+i, work, &lwork, &iinfo); if (cols > 0) { /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ lapackf77_slarft( MagmaBackwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(0, cols), &lda, tau + i, work, &ib); spanel_to_q( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib); magma_ssetmatrix( rows, ib, a_ref(0,cols), lda, da_ref(0,cols), ldda ); sq_to_panel( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib); // Send the triangular part on the GPU magma_ssetmatrix( ib, ib, work, ib, dwork, lddwork ); /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in two steps - implementing the lookahead techniques. This is the update of first ib columns. */ if (i-ib >= k -kk) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaBackward, MagmaColumnwise, rows, ib, ib, da_ref(0, cols), ldda, dwork, lddwork, da_ref(0,cols-ib), ldda, dwork+ib, lddwork); else{ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaBackward, MagmaColumnwise, rows, cols, ib, da_ref(0, cols), ldda, dwork, lddwork, da_ref(0, 0 ), ldda, dwork+ib, lddwork); } old_i = i; old_ib = ib; } } mu = m - k + i + nb; nu = n - k + i + nb; magma_sgetmatrix( m, nu, da_ref(0,0), ldda, a_ref(0,0), lda ); } else { mu = m; nu = n; } /* Use unblocked code to factor the last or only block */ if (mu > 0 && nu > 0) lapackf77_sgeqlf(&mu, &nu, a_ref(0,0), &lda, tau, work, &lwork, &iinfo); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_sgeqlf */
magma_int_t magmaf_get_sgeqlf_nb( magma_int_t *m ) { return magma_get_sgeqlf_nb( *m ); }
/** Purpose ------- SGEQLF computes a QL factorization of a REAL M-by-N matrix A: A = Q * L. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, if m >= n, the lower triangle of the subarray A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L; if m <= n, the elements on and below the (n-m)-th superdiagonal contain the M-by-N lower trapezoidal matrix L; the remaining elements, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max(1,N,2*NB^2). For optimum performance LWORK >= max(N*NB, 2*NB^2) where NB can be obtained through magma_get_sgeqlf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in A(1:m-k+i-1,n-k+i), and tau in TAU(i). @ingroup magma_sgeqlf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqlf( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dwork(i_) (dwork + (i_)) /* Constants */ const float c_one = MAGMA_S_ONE; /* Local variables */ magmaFloat_ptr dA, dwork; magma_int_t i, minmn, lddwork, old_i, old_ib, nb; magma_int_t rows, cols; magma_int_t ib, ki, kk, mu, nu, iinfo, ldda; nb = magma_get_sgeqlf_nb( m, n ); *info = 0; bool lquery = (lwork == -1); // silence "uninitialized" warnings old_ib = nb; old_i = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } minmn = min(m,n); if (*info == 0) { if (minmn == 0) { work[0] = c_one; } else { work[0] = magma_smake_lwork( max(n*nb, 2*nb*nb) ); } if (lwork < max(max(1,n), 2*nb*nb) && ! lquery) *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (minmn == 0) return *info; lddwork = magma_roundup( n, 32 ); ldda = magma_roundup( m, 32 ); if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = dA + ldda*n; magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if ( (nb > 1) && (nb < minmn) ) { /* Use blocked code initially. The last kk columns are handled by the block method. First, copy the matrix on the GPU except the last kk columns */ magma_ssetmatrix_async( m, n-nb, A(0, 0), lda, dA(0, 0), ldda, queues[0] ); ki = ((minmn - nb - 1) / nb) * nb; kk = min( minmn, ki + nb ); for (i = minmn - kk + ki; i >= minmn - kk; i -= nb) { ib = min( minmn-i, nb ); if (i < minmn - kk + ki) { // 1. Copy asynchronously the current panel to the CPU. // 2. Copy asynchronously the submatrix below the panel to the CPU rows = m - minmn + i + ib; magma_sgetmatrix_async( rows, ib, dA(0, n-minmn+i), ldda, A(0, n-minmn+i), lda, queues[1] ); magma_sgetmatrix_async( m-rows, ib, dA(rows, n-minmn+i), ldda, A(rows, n-minmn+i), lda, queues[0] ); /* Apply H^H to A(1:m-minmn+i+ib-1,1:n-minmn+i-1) from the left in two steps - implementing the lookahead techniques. This is the main update from the lookahead techniques. */ rows = m - minmn + old_i + old_ib; cols = n - minmn + old_i - old_ib; magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, cols, old_ib, dA(0, cols+old_ib), ldda, dwork(0), lddwork, dA(0, 0 ), ldda, dwork(old_ib), lddwork, queues[0] ); } magma_queue_sync( queues[1] ); // wait for panel /* Compute the QL factorization of the current block A(1:m-minmn+i+ib-1,n-minmn+i:n-minmn+i+ib-1) */ rows = m - minmn + i + ib; cols = n - minmn + i; lapackf77_sgeqlf( &rows, &ib, A(0,cols), &lda, tau+i, work, &lwork, &iinfo ); if (cols > 0) { /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ lapackf77_slarft( MagmaBackwardStr, MagmaColumnwiseStr, &rows, &ib, A(0, cols), &lda, tau + i, work, &ib ); magma_spanel_to_q( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib ); magma_ssetmatrix( rows, ib, A(0,cols), lda, dA(0,cols), ldda, queues[1] ); magma_sq_to_panel( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib ); // wait for main update (above) to finish with dwork magma_queue_sync( queues[0] ); // Send the triangular part to the GPU magma_ssetmatrix( ib, ib, work, ib, dwork(0), lddwork, queues[1] ); /* Apply H^H to A(1:m-minmn+i+ib-1,1:n-minmn+i-1) from the left in two steps - implementing the lookahead techniques. This is the update of first ib columns. */ if (i-ib >= minmn - kk) { magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, ib, ib, dA(0, cols), ldda, dwork(0), lddwork, dA(0,cols-ib), ldda, dwork(ib), lddwork, queues[1] ); // wait for larfb to finish with dwork before larfb in next iteration starts magma_queue_sync( queues[1] ); } else { magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, cols, ib, dA(0, cols), ldda, dwork(0), lddwork, dA(0, 0 ), ldda, dwork(ib), lddwork, queues[1] ); } old_i = i; old_ib = ib; } } mu = m - minmn + i + nb; nu = n - minmn + i + nb; magma_sgetmatrix( m, nu, dA(0,0), ldda, A(0,0), lda, queues[1] ); } else { mu = m; nu = n; } /* Use unblocked code to factor the last or only block */ if (mu > 0 && nu > 0) { lapackf77_sgeqlf( &mu, &nu, A(0,0), &lda, tau, work, &lwork, &iinfo ); } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; } /* magma_sgeqlf */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sormql */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Cnorm, error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; float *C, *R, *A, *hwork, *tau; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_sgeqlf_nb( m, n ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_SORMQL( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqlf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_smake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, float, ldc*n ); TESTING_MALLOC_CPU( R, float, ldc*n ); TESTING_MALLOC_CPU( A, float, lda*k ); TESTING_MALLOC_CPU( hwork, float, lwork_max ); TESTING_MALLOC_CPU( tau, float, k ); // C is full, m x n size = ldc*n; lapackf77_slarnv( &ione, ISEED, &size, C ); lapackf77_slacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_slarnv( &ione, ISEED, &size, A ); // compute QL factorization to get Householder vectors in A, tau magma_sgeqlf( mm, k, A, lda, tau, hwork, lwork_max, &info ); if (info != 0) { printf("magma_sgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sormql( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, hwork, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sormql returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_sormql( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, hwork, lwork, &info ); if (info != 0) { printf("magma_sormql (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_S_REAL( hwork[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_sormql( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, hwork, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sormql returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_slange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_slange( "Fro", &m, &n, R, &ldc, work ) / (magma_ssqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }