/* //////////////////////////////////////////////////////////////////////////// -- Testing sormqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; float *C, *R, *A, *W, *tau; float *dC, *dA, *dT; magma_opts opts; parse_opts( argc, argv, &opts ); // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaTransStr, MagmaNoTransStr }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; nb = magma_get_sgeqrf_nb( m ); ldc = ((m + 31)/32)*32; lda = ((max(m,n) + 31)/32)*32; gflops = FLOPS_SORMQR( m, n, k, *side[iside] ) / 1e9; if ( *side[iside] == 'L' && m < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=left and m < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'R' && n < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=right and n < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'L' ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + ((k + 31)/32)*32 )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + ((k + 31)/32)*32 )*nb; } TESTING_MALLOC_CPU( C, float, ldc*n ); TESTING_MALLOC_CPU( R, float, ldc*n ); TESTING_MALLOC_CPU( A, float, lda*k ); TESTING_MALLOC_CPU( W, float, lwork_max ); TESTING_MALLOC_CPU( tau, float, k ); TESTING_MALLOC_DEV( dC, float, ldc*n ); TESTING_MALLOC_DEV( dA, float, lda*k ); TESTING_MALLOC_DEV( dT, float, dt_size ); // C is full, m x n size = ldc*n; lapackf77_slarnv( &ione, ISEED, &size, C ); magma_ssetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (*side[iside] == 'L' ? m : n); size = lda*k; lapackf77_slarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_ssetmatrix( lda, k, A, lda, dA, lda ); magma_sgeqrf_gpu( lda, k, dA, lda, tau, dT, &info ); magma_sgetmatrix( lda, k, dA, lda, A, lda ); if (info != 0) printf("magma_sgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sormqr( side[iside], trans[itran], &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sormqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_sormqr_gpu( *side[iside], *trans[itran], m, n, k, dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info ); if (info != 0) printf("magma_sormqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_S_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", (int) lwork, (int) lwork_max ); gpu_time = magma_sync_wtime( 0 ); // sync needed for L,N and R,T cases magma_sormqr_gpu( *side[iside], *trans[itran], m, n, k, dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sormqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_slange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_slange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-5s %-9s %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, side[iside], trans[itran], cpu_perf, cpu_time, gpu_perf, gpu_time, error ); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return 0; }
/** Purpose ------- Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by SGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by SGEQRF_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau REAL array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_SGEQRF_GPU. @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] dT REAL array that is the output (the 6th argument) of magma_sgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_sgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgels_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrs_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *tau, float *dT, float *dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (lddwork+(a_1))*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_sgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -9; else if (lwork < lwkopt && ! lquery) *info = -11; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_sormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; if (nb < k) dwork = dT+2*lddwork*nb; else dwork = dT; // To do: Why did we have this line originally; seems to be a bug (Stan)? // dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_sormqr_gpu, hwork contains // the last block of A and B (i.e., C in sormqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_strsm and drop the ssetmatrix. if ( nrhs == 1 ) { blasf77_strsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_strsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_ssetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork ); // update c if (nrhs == 1) magma_sgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1); else magma_sgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); int start = i-nb; if (nb < k) { for (i = start; i >= 0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_sgemv( MagmaNoTrans, ib, ib, c_one, dT(i), ib, dB+i, 1, c_zero, dwork+i, 1); magma_sgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1); } else { magma_sgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, dT(i), ib, dB+i, lddb, c_zero, dwork+i, lddwork); magma_sgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); } } } } magma_scopymatrix( (n), nrhs, dwork, lddwork, dB, lddb ); return *info; }
extern "C" magma_int_t magma_sgeqrs3_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, float *dA, magma_int_t ldda, float *tau, float *dT, float *dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by SGEQRF3_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input) REAL array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by SGEQRF3_GPU in the first n columns of its array argument A. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. TAU (input) REAL array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_SGEQRF_GPU. DB (input/output) REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. DT (input) REAL array that is the output (the 6th argument) of magma_sgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK, LWORK >= max(1,NRHS). For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where NB is the blocksize given by magma_get_sgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define d_ref(a_1) (dT+(lddwork+(a_1))*nb) float c_one = MAGMA_S_ONE; magma_int_t k, lddwork; magma_int_t nb = magma_get_sgeqrf_nb(m); magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb); int lquery = (lwork == -1); hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } lddwork= k; /* B := Q' * B */ magma_sormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, a_ref(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) 1. Move the block diagonal submatrices from d_ref to R 2. Solve 3. Restore the data format moving data from R back to d_ref */ magmablas_sswapdblk(k, nb, a_ref(0,0), ldda, 1, d_ref(0), nb, 0); if ( nrhs == 1 ) { magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, a_ref(0,0), ldda, dB, 1); } else { magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, a_ref(0,0), ldda, dB, lddb); } magmablas_sswapdblk(k, nb, d_ref(0), nb, 0, a_ref(0,0), ldda, 1); return *info; }
/** Purpose ------- Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by SGEQRF3_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by SGEQRF3_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau REAL array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_SGEQRF_GPU. @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] dT REAL array that is the output (the 6th argument) of magma_sgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_sgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgels_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrs3_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloat_ptr dA, magma_int_t ldda, float *tau, magmaFloat_ptr dT, magmaFloat_ptr dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (lddwork+(a_1))*nb) float c_one = MAGMA_S_ONE; magma_int_t k, lddwork; magma_int_t nb = magma_get_sgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } lddwork = k; /* B := Q' * B */ magma_sormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) 1. Move the (k-1)/nb block diagonal submatrices from dT to R 2. Solve 3. Restore the data format moving data from R back to dT */ magmablas_sswapdblk(k-1, nb, dA(0,0), ldda, 1, dT(0), nb, 0); if ( nrhs == 1 ) { magma_strsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA(0,0), ldda, dB, 1); } else { magma_strsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA(0,0), ldda, dB, lddb); } magmablas_sswapdblk(k-1, nb, dT(0), nb, 0, dA(0,0), ldda, 1); return *info; }
extern "C" magma_err_t magma_sgeqrs_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda, float *tau, magmaFloat_ptr dT, size_t dT_offset, magmaFloat_ptr dB, size_t dB_offset, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- clMagma (version 0.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by SGEQRF_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input) REAL array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by SGEQRF_GPU in the first n columns of its array argument A. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. TAU (input) REAL array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_SGEQRF_GPU. DB (input/output) REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. DT (input) REAL array that is the output (the 6th argument) of magma_sgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK, LWORK >= max(1,NRHS). For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where NB is the blocksize given by magma_get_sgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define d_ref(a_1) dT, (dT_offset + (lddwork+(a_1))*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magmaFloat_ptr dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_sgeqrf_nb(m); magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb); long int lquery = (lwork == -1); hwork[0] = MAGMA_S_MAKE( (float)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_sormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, a_ref(0,0), ldda, tau, dB, dB_offset, lddb, hwork, lwork, dT, dT_offset, nb, info, queue ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; int ldtwork; size_t dwork_offset = 0; if (nb < k) { dwork = dT; dwork_offset = dT_offset+2*lddwork*nb; } else { ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; magma_smalloc( &dwork, ldtwork ); } // To do: Why did we have this line originally; seems to be a bug (Stan)? //dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; if ( nrhs == 1 ) { blasf77_strsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_strsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_ssetmatrix( ib, nrhs, hwork+rows*ib, 0, rows, dwork, dwork_offset+i, lddwork, queue ); // update c if (nrhs == 1) magma_sgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, 1, c_one, dB, dB_offset, 1, queue ); else magma_sgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset + i, lddwork, c_one, dB, dB_offset, lddb, queue ); int start = i-nb; if (nb < k) { for (i = start; i >=0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_sgemv( MagmaNoTrans, ib, ib, c_one, d_ref(i), ib, dB, dB_offset+i, 1, c_zero, dwork, dwork_offset+i, 1, queue ); magma_sgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, 1, c_one, dB, dB_offset, 1, queue ); } else { magma_sgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, d_ref(i), ib, dB, dB_offset+i, lddb, c_zero, dwork, dwork_offset+i, lddwork, queue ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, lddwork, c_one, dB, dB_offset, lddb, queue ); } } } } magma_scopymatrix( (n), nrhs, dwork, dwork_offset, lddwork, dB, dB_offset, lddb, queue ); if (nb >= k) magma_free(dwork); magma_queue_sync( queue ); return *info; }
/** Purpose ------- SGEQRS solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by SGEQRF3_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by SGEQRF3_GPU in the first n columns of its array argument A. dA is modified by the routine but restored on exit. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau REAL array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_SGEQRF_GPU. @param[in,out] dB REAL array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in,out] dT REAL array that is the output (the 6th argument) of magma_sgeqrf_gpu of size 2*MIN(M, N)*NB + ceil(N/32)*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size (ceil(N/32)*32)* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_sgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgels_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrs3_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloat_ptr dA, magma_int_t ldda, float const *tau, magmaFloat_ptr dT, magmaFloat_ptr dB, magma_int_t lddb, float *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_) (dT + (lddwork + (i_))*nb) float c_one = MAGMA_S_ONE; magma_int_t min_mn, lddwork; magma_int_t nb = magma_get_sgeqrf_nb( m, n ); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; bool lquery = (lwork == -1); hwork[0] = magma_smake_lwork( lwkopt ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { hwork[0] = c_one; return *info; } lddwork = min_mn; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* B := Q^H * B */ magma_sormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { magma_queue_destroy( queue ); return *info; } /* Solve R*X = B(1:n,:) 1. Move the (min_mn - 1)/nb block diagonal submatrices from dT to R 2. Solve 3. Restore the data format moving data from R back to dT */ magmablas_sswapdblk( min_mn-1, nb, dA(0,0), ldda, 1, dT(0), nb, 0, queue ); if ( nrhs == 1 ) { magma_strsv( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA(0,0), ldda, dB, 1, queue ); } else { magma_strsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA(0,0), ldda, dB, lddb, queue ); } magmablas_sswapdblk( min_mn-1, nb, dT(0), nb, 0, dA(0,0), ldda, 1, queue ); magma_queue_destroy( queue ); return *info; }