/** Purpose ------- SORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**T * C C * Q**T @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**T from the Left; - = MagmaRight: apply Q or Q**T from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**T. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A REAL array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF. @param[in,out] C REAL array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sormqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *C, magma_int_t ldc, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); // TODO fix memory leak (alloc after argument checks) magma_int_t nb = 128; float *T; magma_smalloc_pinned(&T, nb*nb); //printf("calling sormqr_m with nb=%d\n", (int) nb); float* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_device_t igpu; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_S_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_sormqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+ngpu-1)/ngpu*nb_l; ngpu = min(ngpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. 
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_smalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_ssetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/ngpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_ssetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_slaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_slarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_ssetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_sgetmatrix( m, kb, dC(igpu, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_sgetmatrix_async( m, kb, // dC(igpu, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { // TODO fix memory leak T, dw, event, stream fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . 
H(i+ib-1) i__4 = nq - i; lapackf77_slarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_ssetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_slaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_ssetmatrix( ib, ib, T, ib, dT, ib ); magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_S_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_sormqr */
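/* A minimal calling sketch for magma_sormqr_m above -- illustrative only, not
   part of the MAGMA sources; the helper name apply_Qt_multi_gpu is ours. It
   assumes magma_init() has been called and that A and tau already hold an
   SGEQRF factorization of an m-by-k matrix. It shows the two-phase LWORK
   protocol documented above: query with lwork = -1, then call again with the
   returned size. */
#include "magma.h"

static magma_int_t apply_Qt_multi_gpu(
    magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda, float *tau, float *C, magma_int_t ldc )
{
    magma_int_t info;
    float query;

    /* workspace query: the optimal LWORK is returned in query */
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, &query, -1, &info );
    if (info != 0)
        return info;

    magma_int_t lwork = (magma_int_t) query;
    float *work;
    if (MAGMA_SUCCESS != magma_smalloc_cpu( &work, lwork ))
        return MAGMA_ERR_HOST_ALLOC;

    /* apply: C := Q**T * C, distributed over ngpu GPUs */
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, work, lwork, &info );

    magma_free_cpu( work );
    return info;
}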
/* //////////////////////////////////////////////////////////////////////////// -- Testing sormqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; float *C, *R, *A, *W, *tau; float *dC, *dA, *dT; magma_opts opts; parse_opts( argc, argv, &opts ); // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaTransStr, MagmaNoTransStr }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; nb = magma_get_sgeqrf_nb( m ); ldc = ((m + 31)/32)*32; lda = ((max(m,n) + 31)/32)*32; gflops = FLOPS_SORMQR( m, n, k, *side[iside] ) / 1e9; if ( *side[iside] == 'L' && m < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=left and m < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'R' && n < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=right and n < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'L' ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + ((k + 31)/32)*32 )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + ((k + 31)/32)*32 )*nb; } TESTING_MALLOC_CPU( C, float, ldc*n ); TESTING_MALLOC_CPU( R, float, ldc*n ); TESTING_MALLOC_CPU( A, float, lda*k ); TESTING_MALLOC_CPU( W, float, lwork_max ); TESTING_MALLOC_CPU( tau, float, k ); TESTING_MALLOC_DEV( dC, float, ldc*n ); TESTING_MALLOC_DEV( dA, float, lda*k ); TESTING_MALLOC_DEV( dT, float, dt_size ); // C is full, m x n size = ldc*n; lapackf77_slarnv( &ione, ISEED, &size, C ); magma_ssetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (*side[iside] == 'L' ? 
m : n); size = lda*k; lapackf77_slarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_ssetmatrix( lda, k, A, lda, dA, lda ); magma_sgeqrf_gpu( lda, k, dA, lda, tau, dT, &info ); magma_sgetmatrix( lda, k, dA, lda, A, lda ); if (info != 0) printf("magma_sgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sormqr( side[iside], trans[itran], &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sormqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_sormqr_gpu( *side[iside], *trans[itran], m, n, k, dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info ); if (info != 0) printf("magma_sormqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_S_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", (int) lwork, (int) lwork_max ); gpu_time = magma_sync_wtime( 0 ); // sync needed for L,N and R,T cases magma_sormqr_gpu( *side[iside], *trans[itran], m, n, k, dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sormqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_slange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_slange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-5s %-9s %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, side[iside], trans[itran], cpu_perf, cpu_time, gpu_perf, gpu_time, error ); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_sormqr_gpu(char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, float *dA, magma_int_t ldda, float *tau, float *dC, magma_int_t lddc, float *hwork, magma_int_t lwork, float *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SORMQR_GPU overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. DA (input) REAL array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF in the first k columns of its array argument DA. DA is modified by the routine but restored on exit. LDDA (input) INTEGER The leading dimension of the array DA. If SIDE = 'L', LDDA >= max(1,M); if SIDE = 'R', LDDA >= max(1,N). TAU (input) REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF. DC (input/output) REAL array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. LDDC (input) INTEGER The leading dimension of the array DC. LDDC >= max(1,M). HWORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) Currently, sgetrs_gpu assumes that on exit, hwork contains the last block of A and C. This will change and *should not be relied on*! LWORK (input) INTEGER The dimension of the array HWORK. LWORK >= (M-K+NB)*(N+NB) + N*NB if SIDE = 'L', and LWORK >= (N-K+NB)*(M+NB) + M*NB if SIDE = 'R', where NB is the given blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array, and no error message related to LWORK is issued by XERBLA. DT (input) REAL array on the GPU that is the output (the 9th argument) of magma_sgeqrf_gpu. NB (input) INTEGER This is the blocking size that was used in pre-computing DT, e.g., the blocking size used in magma_sgeqrf_gpu. 
INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define dA(a_1,a_2) (dA + (a_1) + (a_2)*ldda) #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc) #define dT(a_1) (dT + (a_1)*nb) float c_one = MAGMA_S_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; float *dwork; magma_int_t i, lddwork; magma_int_t i1, i2, step, ib, ic, jc, ma, mi, ni, nq, nw; int left, notran, lquery; magma_int_t lwkopt; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } lwkopt = (nq - k + nb)*(nw + nb) + nw*nb; hwork[0] = MAGMA_S_MAKE( lwkopt, 0 ); if ( (!left) && (!lapackf77_lsame(side_, "R")) ) { *info = -1; } else if ( (!notran) && (!lapackf77_lsame(trans_, MagmaTransStr)) ) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (lwork < lwkopt && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { hwork[0] = c_one; return *info; } lddwork = k; dwork = dT(2*lddwork); if ( (left && (! notran)) || ((! left) && notran) ) { // left trans: Q^T C // right notrans: C Q // multiply from first block, i = 0, to next-to-last block, i < k-nb i1 = 0; i2 = k-nb; step = nb; } else { // left notrans: Q C // right trans: C Q^T // multiply from next-to-last block, i = floor((k-1-nb)/nb)*nb, to first block, i = 0 i1 = ((k - 1 - nb) / nb) * nb; i2 = 0; step = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } /* Use unblocked code to multiply last or only block (cases Q*C or C*Q^T). */ // workspace left: A(mi*nb) + C(mi*ni) + work(ni*nb_la) = (m-k-nb)*nb + (m-k-nb)*n + n*nb // workspace right: A(ni*nb) + C(mi*ni) + work(mi*nb_la) = (n-k-nb)*nb + m*(n-k-nb) + m*nb if ( step < 0 ) { // i is beginning of last block i = i1 - step; if ( i >= k ) { i = i1; } ib = k - i; if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ma = mi; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; ma = ni; jc = i; } float* hA = hwork; float* hC = hwork + ma*ib; float* hW = hwork + ma*ib + mi*ni; magma_int_t lhwork = lwork - (ma*ib + mi*ni); magma_sgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma ); magma_sgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi ); lapackf77_sormqr( side_, trans_, &mi, &ni, &ib, hA, &ma, tau+i, hC, &mi, hW, &lhwork, info ); // send the updated part of C back to the GPU magma_ssetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc ); } /* Use blocked code to multiply blocks */ if (nb < k) { for( i=i1; (step<0 ? i>=i2 : i<i2); i+=step ) { ib = min(nb, k - i); if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; jc = i; } magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, i ), ldda, dT(i), nb, dC(ic, jc), lddc, dwork, nw ); } } else { i = i1; } /* Use unblocked code to multiply the last or only block (cases Q^T*C or C*Q). 
*/ if ( step > 0 ) { ib = k-i; if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ma = mi; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; ma = ni; jc = i; } float* hA = hwork; float* hC = hwork + ma*ib; float* hW = hwork + ma*ib + mi*ni; magma_int_t lhwork = lwork - (ma*ib + mi*ni); magma_sgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma ); magma_sgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi ); lapackf77_sormqr( side_, trans_, &mi, &ni, &ib, hA, &ma, tau+i, hC, &mi, hW, &lhwork, info ); // send the updated part of C back to the GPU magma_ssetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc ); } // TODO sync. For cases Q*C and C*Q^T, last call is magma_slarfb_gpu, // which is async magma_gemm calls, so sormqr can be unfinished. // TODO: sgeqrs_gpu ASSUMES that hwork contains the last block of A and C. // That needs to be fixed, but until then, don't modify hwork[0] here. // In LAPACK: On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. //hwork[0] = MAGMA_S_MAKE( lwkopt, 0 ); return *info; } /* end of magma_sormqr_gpu */
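/* Standalone sketch (illustrative, not from the MAGMA sources) of the block
   traversal chosen above: ascending blocks for Q**T*C and C*Q, descending for
   Q*C and C*Q**T, since Q = H(1)...H(k) must have its reflectors applied to C
   in the reverse of the order in which they multiply. For simplicity this
   sweep covers all k reflectors; the routine above peels the last block off
   to the unblocked CPU path instead. */
#include <stdio.h>

int main( void )
{
    int k = 10, nb = 3;
    int ascending = 1;  /* 1 for (left && trans) or (right && notrans) */
    int i1   = ascending ? 0  : ((k - 1) / nb) * nb;
    int i2   = ascending ? k  : 0;
    int step = ascending ? nb : -nb;
    for (int i = i1; (step < 0 ? i >= i2 : i < i2); i += step) {
        int ib = (k - i < nb ? k - i : nb);  /* width of this block */
        printf( "apply block reflector built from reflectors [%d, %d)\n",
                i, i + ib );
    }
    return 0;
}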
/* //////////////////////////////////////////////////////////////////////////// -- Testing sormqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; float *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_sgeqrf_nb( m ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_SORMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqrf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, float, ldc*n ); TESTING_MALLOC_CPU( R, float, ldc*n ); TESTING_MALLOC_CPU( A, float, lda*k ); TESTING_MALLOC_CPU( W, float, lwork_max ); TESTING_MALLOC_CPU( tau, float, k ); // C is full, m x n size = ldc*n; lapackf77_slarnv( &ione, ISEED, &size, C ); lapackf77_slacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_slarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_sgeqrf( mm, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sormqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sormqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_sormqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_sormqr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = 
(magma_int_t) MAGMA_S_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_sormqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sormqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_slange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_slange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/** Purpose ------- SORMQR_GPU overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**T * C C * Q**T @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**T from the Left; - = MagmaRight: apply Q or Q**T from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**T. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] dA REAL array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF in the first k columns of its array argument DA. DA is modified by the routine but restored on exit. @param[in] ldda INTEGER The leading dimension of the array DA. If SIDE = MagmaLeft, LDDA >= max(1,M); if SIDE = MagmaRight, LDDA >= max(1,N). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF. @param[in,out] dC REAL array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. @param[in] lddc INTEGER The leading dimension of the array DC. LDDC >= max(1,M). @param[out] hwork (workspace) REAL array, dimension (MAX(1,LWORK)) \n Currently, sgeqrs_gpu assumes that on exit, hwork contains the last block of A and C. This will change and *should not be relied on*! @param[in] lwork INTEGER The dimension of the array HWORK. LWORK >= (M-K+NB)*(N+NB) + N*NB if SIDE = MagmaLeft, and LWORK >= (N-K+NB)*(M+NB) + M*NB if SIDE = MagmaRight, where NB is the given blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array, and no error message related to LWORK is issued by XERBLA. @param[in] dT REAL array on the GPU that is the output (the 9th argument) of magma_sgeqrf_gpu. @param[in] nb INTEGER This is the blocking size that was used in pre-computing DT, e.g., the blocking size used in magma_sgeqrf_gpu. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sormqr_gpu(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, float *dA, magma_int_t ldda, float *tau, float *dC, magma_int_t lddc, float *hwork, magma_int_t lwork, float *dT, magma_int_t nb, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_1) + (a_2)*ldda) #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc) #define dT(a_1) (dT + (a_1)*nb) float c_one = MAGMA_S_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); float *dwork; magma_int_t i, lddwork; magma_int_t i1, i2, step, ib, ic, jc, ma, mi, ni, nq, nw; int left, notran, lquery; magma_int_t lwkopt; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } lwkopt = (nq - k + nb)*(nw + nb) + nw*nb; hwork[0] = MAGMA_S_MAKE( lwkopt, 0 ); if ( ! left && side != MagmaRight ) { *info = -1; } else if ( ! notran && trans != MagmaTrans ) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (lwork < lwkopt && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { hwork[0] = c_one; return *info; } lddwork = k; dwork = dT(2*lddwork); if ( (left && (! notran)) || ((! left) && notran) ) { // left trans: Q^T C // right notrans: C Q // multiply from first block, i = 0, to next-to-last block, i < k-nb i1 = 0; i2 = k-nb; step = nb; } else { // left notrans: Q C // right trans: C Q^T // multiply from next-to-last block, i = floor((k-1-nb)/nb)*nb, to first block, i = 0 i1 = ((k - 1 - nb) / nb) * nb; i2 = 0; step = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } /* Use unblocked code to multiply last or only block (cases Q*C or C*Q^T). */ // workspace left: A(mi*nb) + C(mi*ni) + work(ni*nb_la) = (m-k-nb)*nb + (m-k-nb)*n + n*nb // workspace right: A(ni*nb) + C(mi*ni) + work(mi*nb_la) = (n-k-nb)*nb + m*(n-k-nb) + m*nb if ( step < 0 ) { // i is beginning of last block i = i1 - step; if ( i >= k ) { i = i1; } ib = k - i; if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ma = mi; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; ma = ni; jc = i; } float* hA = hwork; float* hC = hwork + ma*ib; float* hW = hwork + ma*ib + mi*ni; magma_int_t lhwork = lwork - (ma*ib + mi*ni); magma_sgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma ); magma_sgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi ); lapackf77_sormqr( side_, trans_, &mi, &ni, &ib, hA, &ma, tau+i, hC, &mi, hW, &lhwork, info ); // send the updated part of C back to the GPU magma_ssetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc ); } /* Use blocked code to multiply blocks */ if (nb < k) { for( i=i1; (step < 0 ? 
i >= i2 : i < i2); i += step ) { ib = min(nb, k - i); if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; jc = i; } magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, i ), ldda, dT(i), nb, dC(ic, jc), lddc, dwork, nw ); } } else { i = i1; } /* Use unblocked code to multiply the last or only block (cases Q^T*C or C*Q). */ if ( step > 0 ) { ib = k-i; if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ma = mi; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; ma = ni; jc = i; } float* hA = hwork; float* hC = hwork + ma*ib; float* hW = hwork + ma*ib + mi*ni; magma_int_t lhwork = lwork - (ma*ib + mi*ni); magma_sgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma ); magma_sgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi ); lapackf77_sormqr( side_, trans_, &mi, &ni, &ib, hA, &ma, tau+i, hC, &mi, hW, &lhwork, info ); // send the updated part of C back to the GPU magma_ssetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc ); } // TODO sync. For cases Q*C and C*Q^T, last call is magma_slarfb_gpu, // which is async magma_gemm calls, so sormqr can be unfinished. // TODO: sgeqrs_gpu ASSUMES that hwork contains the last block of A and C. // That needs to be fixed, but until then, don't modify hwork[0] here. // In LAPACK: On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. //hwork[0] = MAGMA_S_MAKE( lwkopt, 0 ); return *info; } /* magma_sormqr_gpu */
/** Purpose ------- SGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the orthogonal matrix Q as a product of min(M,N) elementary reflectors. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the K-th column of A. @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors. @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace, for [cz]geqp3 only) REAL array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqp3_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqp3( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *jpvt, float *tau, float *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dwork + (i) + (j)*(ldda)) float *dwork, *df; magma_int_t ione = 1; magma_int_t n_j, ldda, ldwork; magma_int_t j, jb, na, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt=0, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_sgeqp3_nb(min(m, n)); minmn = min(m,n); if (*info == 0) { if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #ifdef REAL lwkopt += 2*n; #endif } work[0] = MAGMA_S_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #ifdef REAL float *rwork = work + (n + 1)*nb; #endif ldda = ((m+31)/32)*32; ldwork = n*ldda + (n+1)*nb; if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } df = dwork + n*ldda; // dwork used for dA magma_queue_t stream; magma_queue_create( &stream ); /* Move initial columns up front. 
* Note jpvt uses 1-based indices for historical compatibility. */ nfxd = 0; for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_sswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. */ if (nfxd > 0) { na = min(m,nfxd); lapackf77_sgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_sormqr( MagmaLeftStr, MagmaTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } } /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_ssetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream ); } /* Initialize partial column norms. */ for (j = nfxd; j < n; ++j) { rwork[j] = magma_cblas_snrm2( sm, A(nfxd,j), ione ); rwork[n + j] = rwork[j]; } j = nfxd; if (nb < sminmn) { /* Use blocked code initially. */ magma_queue_sync( stream ); /* Compute factorization: while loop. */ topbmn = minmn - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; if (j > nfxd) { // Get panel to the CPU magma_sgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_sgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); } magma_slaqps( m, n_j, j, jb, &fjb, A (0, j), lda, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &work[jb], n_j, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. */ if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_sgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_slaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); } } work[0] = MAGMA_S_MAKE( lwkopt, 0. ); magma_free( dwork ); magma_queue_destroy( stream ); return *info; } /* magma_sgeqp3 */
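/* Illustrative calling sketch for magma_sgeqp3 above (not part of the MAGMA
   sources; the helper name qr_with_column_pivoting is ours). jpvt is zeroed
   on entry so that every column is free to be pivoted; on exit jpvt holds the
   1-based pivot order. The workspace bound is the real-precision one from the
   documentation above. */
#include "magma.h"

static magma_int_t qr_with_column_pivoting(
    magma_int_t m, magma_int_t n,
    float *A, magma_int_t lda, magma_int_t *jpvt, float *tau )
{
    magma_int_t info;
    magma_int_t nb    = magma_get_sgeqp3_nb( m < n ? m : n );
    magma_int_t lwork = (n + 1)*nb + 2*n;   /* [sd]geqp3: (N+1)*NB + 2*N */
    float *work;

    if (MAGMA_SUCCESS != magma_smalloc_cpu( &work, lwork ))
        return MAGMA_ERR_HOST_ALLOC;

    for (magma_int_t j = 0; j < n; ++j)
        jpvt[j] = 0;                        /* mark all columns as free */

    magma_sgeqp3( m, n, A, lda, jpvt, tau, work, lwork, &info );

    magma_free_cpu( work );
    return info;
}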
extern "C" magma_int_t magma_sormqr(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, float *a, magma_int_t lda, float *tau, float *c, magma_int_t ldc, float *work, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- MAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver September 2012 Purpose ======= SORMQR overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) REAL array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF. C (input/output) REAL array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ float c_one = MAGMA_S_ONE; magma_side_t side_ = side; magma_trans_t trans_ = trans; /* Allocate work space on the GPU */ magmaFloat_ptr dwork, dc; magma_malloc( &dc, (m)*(n)*sizeof(float) ); magma_malloc( &dwork, (m + n + 64)*64*sizeof(float) ); /* Copy matrix C from the CPU to the GPU */ magma_ssetmatrix( m, n, c, 0, ldc, dc, 0, m, queue ); //dc -= (1 + m); size_t dc_offset = -(1+m); magma_int_t a_offset, c_offset, i__4, lddwork; magma_int_t i__; float t[2*4160] /* was [65][64] */; magma_int_t i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; a_offset = 1 + lda; a -= a_offset; --tau; c_offset = 1 + ldc; c -= c_offset; *info = 0; left = lapackf77_lsame(lapack_const(side_), "L"); notran = lapackf77_lsame(lapack_const(trans_), "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(lapack_const(side_), "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(lapack_const(trans_), "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = max(1,nw) * nb; // ACD // MAGMA_S_SET2REAL( work[0], lwkopt ); MAGMA_S_SET2REAL( work[0], (float) lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_sormqr(lapack_const(side_), lapack_const(trans_), &m, &n, &k, &a[a_offset], &lda, &tau[1], &c[c_offset], &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( ( left && (! notran) ) || ( (! left) && notran ) ) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } if (left) { ni = n; jc = 1; } else { mi = m; ic = 1; } for (i__ = i1; i3 < 0 ? i__ >= i2 : i__ <= i2; i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . 
H(i+ib-1) */ i__4 = nq - i__ + 1; lapackf77_slarft("F", "C", &i__4, &ib, &a[i__ + i__ * lda], &lda, &tau[i__], t, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ spanel_to_q(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); magma_ssetmatrix( i__4, ib, &a[i__ + i__ * lda], 0, lda, dwork, 0, i__4, queue ); sq_to_panel(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i__ + 1; ic = i__; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i__ + 1; jc = i__; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_ssetmatrix( ib, ib, t, 0, ib, dwork, i__4*ib, ib, queue ); magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, 0, i__4, dwork, i__4*ib, ib, dc, dc_offset+(ic + jc * m), m, dwork, (i__4*ib + ib*ib), lddwork, queue); } magma_sgetmatrix( m, n, dc, dc_offset+(1+m), m, &c[c_offset], 0, ldc, queue ); } // ACD // MAGMA_S_SET2REAL( work[0], lwkopt ); MAGMA_S_SET2REAL( work[0], (float) lwkopt ); //dc += (1 + m); magma_free( dc ); magma_free( dwork ); return *info; } /* magma_sormqr */
/** Purpose ------- SORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**T * C C * Q**T @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**T from the Left; - = MagmaRight: apply Q or Q**T from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**T. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A REAL array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF. @param[in,out] C REAL array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sormqr( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *C, magma_int_t ldc, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) float *T, *T2; magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; magma_int_t left, notran, lquery; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! left && side != MagmaRight) { *info = -1; } else if (! 
notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_sgelqf_nb( min( m, n )); lwkopt = max(1,nw)*nb; work[0] = MAGMA_S_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_S_ONE; return *info; } ldwork = nw; if (nb >= k) { /* Use CPU code */ lapackf77_sormqr( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = ((m+31)/32)*32; float *dwork, *dV, *dT, *dC; magma_smalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_smalloc_cpu( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; /* Copy matrix C from the CPU to the GPU */ magma_ssetmatrix( m, n, C, ldc, dC, lddc ); if ( (left && ! notran) || (! left && notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ nq_i = nq - i; lapackf77_slarft("Forward", "Columnwise", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib); /* 1) set upper triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ spanel_to_q( MagmaUpper, ib, A(i,i), lda, T2 ); magma_ssetmatrix( nq_i, ib, A(i,i), lda, dV, nq_i ); sq_to_panel( MagmaUpper, ib, A(i,i), lda, T2 ); if (left) { /* H or H**H is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H**H is applied to C(1:m,i:n) */ ni = n - i; jc = i; } /* Apply H or H**H; First copy T to the GPU */ magma_ssetmatrix( ib, ib, T, ib, dT, ib ); magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dV, nq_i, dT, ib, dC(ic,jc), lddc, dwork, ldwork ); } magma_sgetmatrix( m, n, dC, lddc, C, ldc ); magma_free( dwork ); magma_free_cpu( T ); } work[0] = MAGMA_S_MAKE( lwkopt, 0 ); return *info; } /* magma_sormqr */
/***************************************************************************//** Purpose ------- SORMBR multiplies by Q or P as part of the SVD decomposition. If VECT = MagmaQ, SORMBR overwrites the general real M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q*C C*Q TRANS = MagmaTrans: Q**T*C C*Q**T If VECT = MagmaP, SORMBR overwrites the general real M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: P*C C*P TRANS = MagmaTrans: P**T*C C*P**T Here Q and P**T are the orthogonal matrices determined by SGEBRD when reducing a real matrix A to bidiagonal form: A = Q*B * P**T. Q and P**T are defined as products of elementary reflectors H(i) and G(i) respectively. Let nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Thus nq is the order of the orthogonal matrix Q or P**T that is applied. If VECT = MagmaQ, A is assumed to have been an NQ-by-K matrix: if nq >= k, Q = H(1) H(2) . . . H(k); if nq < k, Q = H(1) H(2) . . . H(nq-1). If VECT = MagmaP, A is assumed to have been a K-by-NQ matrix: if k < nq, P = G(1) G(2) . . . G(k); if k >= nq, P = G(1) G(2) . . . G(nq-1). Arguments --------- @param[in] vect magma_vect_t - = MagmaQ: apply Q or Q**T; - = MagmaP: apply P or P**T. @param[in] side magma_side_t - = MagmaLeft: apply Q, Q**T, P or P**T from the Left; - = MagmaRight: apply Q, Q**T, P or P**T from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q or P; - = MagmaTrans: Transpose, apply Q**T or P**T. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER If VECT = MagmaQ, the number of columns in the original matrix reduced by SGEBRD. If VECT = MagmaP, the number of rows in the original matrix reduced by SGEBRD. K >= 0. @param[in] A REAL array, dimension (LDA,min(nq,K)) if VECT = MagmaQ (LDA,nq) if VECT = MagmaP The vectors which define the elementary reflectors H(i) and G(i), whose products determine the matrices Q and P, as returned by SGEBRD. @param[in] lda INTEGER The leading dimension of the array A. If VECT = MagmaQ, LDA >= max(1,nq); if VECT = MagmaP, LDA >= max(1,min(nq,K)). @param[in] tau REAL array, dimension (min(nq,K)) TAU(i) must contain the scalar factor of the elementary reflector H(i) or G(i) which determines Q or P, as returned by SGEBRD in the array argument TAUQ or TAUP. @param[in,out] C REAL array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q or P*C or P**T*C or C*P or C*P**T. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M); if N = 0 or M = 0, LWORK >= 1. For optimum performance if SIDE = MagmaLeft, LWORK >= max(1,N*NB); if SIDE = MagmaRight, LWORK >= max(1,M*NB), where NB is the optimal blocksize. (NB = 0 if M = 0 or N = 0.) \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
    @param[out]
    info    INTEGER
      - = 0:  successful exit
      - < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_unmbr
*******************************************************************************/
extern "C" magma_int_t
magma_sormbr(
    magma_vect_t vect, magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda,
    float *tau,
    float *C, magma_int_t ldc,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define A(i,j) (A + (i) + (j)*lda)
    #define C(i,j) (C + (i) + (j)*ldc)

    magma_int_t i1, i2, nb, mi, ni, nq, nq_1, minwrk, iinfo, lwkopt;
    magma_int_t left, notran, applyq, lquery;
    magma_trans_t transt;

    MAGMA_UNUSED( nq_1 );  // used only in version 1

    *info = 0;
    applyq = (vect  == MagmaQ);
    left   = (side  == MagmaLeft);
    notran = (trans == MagmaNoTrans);
    lquery = (lwork == -1);

    /* NQ is the order of Q or P and MINWRK (previously "nw") is the
       minimum dimension of WORK */
    if (left) {
        nq = m;
        minwrk = n;
    }
    else {
        nq = n;
        minwrk = m;
    }
    if (m == 0 || n == 0) {
        minwrk = 0;
    }

    /* check arguments */
    if (! applyq && vect != MagmaP) {
        *info = -1;
    }
    else if (! left && side != MagmaRight) {
        *info = -2;
    }
    else if (! notran && trans != MagmaTrans) {
        *info = -3;
    }
    else if (m < 0) {
        *info = -4;
    }
    else if (n < 0) {
        *info = -5;
    }
    else if (k < 0) {
        *info = -6;
    }
    else if ( (   applyq && lda < max(1,nq)        ) ||
              ( ! applyq && lda < max(1,min(nq,k)) ) ) {
        *info = -8;
    }
    else if (ldc < max(1,m)) {
        *info = -11;
    }
    else if (lwork < max(1,minwrk) && ! lquery) {
        *info = -13;
    }

    if (*info == 0) {
        if (minwrk > 0) {
            // TODO have get_sormqr_nb and get_sormlq_nb routines? see original LAPACK sormbr.
            // TODO make them dependent on m, n, and k?
            nb = magma_get_sgebrd_nb( m, n );
            lwkopt = max(1, minwrk*nb);
        }
        else {
            lwkopt = 1;
        }
        work[0] = magma_smake_lwork( lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0) {
        return *info;
    }

    if (applyq) {
        /* Apply Q */
        if (nq >= k) {
            /* Q was determined by a call to SGEBRD with nq >= k */
            #if VERSION == 1
            lapackf77_sormqr( lapack_side_const(side), lapack_trans_const(trans),
                              &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
            #else
            magma_sormqr( side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo);
            #endif
        }
        else if (nq > 1) {
            /* Q was determined by a call to SGEBRD with nq < k */
            if (left) {
                mi = m - 1;
                ni = n;
                i1 = 1;
                i2 = 0;
            }
            else {
                mi = m;
                ni = n - 1;
                i1 = 0;
                i2 = 1;
            }
            #if VERSION == 1
            nq_1 = nq - 1;
            lapackf77_sormqr( lapack_side_const(side), lapack_trans_const(trans),
                              &mi, &ni, &nq_1, A(1,0), &lda, tau,
                              C(i1,i2), &ldc, work, &lwork, &iinfo);
            #else
            magma_sormqr( side, trans, mi, ni, nq-1, A(1,0), lda, tau,
                          C(i1,i2), ldc, work, lwork, &iinfo);
            #endif
        }
    }
    else {
        /* Apply P */
        if (notran) {
            transt = MagmaTrans;
        }
        else {
            transt = MagmaNoTrans;
        }
        if (nq > k) {
            /* P was determined by a call to SGEBRD with nq > k */
            #if VERSION == 1
            lapackf77_sormlq( lapack_side_const(side), lapack_trans_const(transt),
                              &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
            #else
            magma_sormlq( side, transt, m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo);
            #endif
        }
        else if (nq > 1) {
            /* P was determined by a call to SGEBRD with nq <= k */
            if (left) {
                mi = m - 1;
                ni = n;
                i1 = 1;
                i2 = 0;
            }
            else {
                mi = m;
                ni = n - 1;
                i1 = 0;
                i2 = 1;
            }
            #if VERSION == 1
            nq_1 = nq - 1;
            lapackf77_sormlq( lapack_side_const(side), lapack_trans_const(transt),
                              &mi, &ni, &nq_1, A(0,1), &lda, tau,
                              C(i1,i2), &ldc, work, &lwork, &iinfo);
            #else
            magma_sormlq( side, transt, mi, ni, nq-1, A(0,1), lda, tau,
                          C(i1,i2), ldc, work, lwork, &iinfo);
            #endif
        }
    }

    work[0] = magma_smake_lwork( lwkopt );

    return *info;
} /* magma_sormbr */
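/* A minimal usage sketch (ours, not library code), assuming magma_init() has
   been called and that the magma_sgebrd / magma_sormbr interfaces are as
   declared in this file: bidiagonalize A = Q*B*P**T, then form Q**T * C.
   The function name and sizes are illustrative placeholders. */
static void
example_sormbr( magma_int_t m, magma_int_t n,
                float *A, magma_int_t lda,     // m-by-n, overwritten by SGEBRD
                float *C, magma_int_t ldc )    // m-by-n, overwritten by Q**T * C
{
    magma_int_t k = min(m, n), info;
    float *d, *e, *tauq, *taup, *work, query;

    magma_smalloc_cpu( &d,    k );
    magma_smalloc_cpu( &e,    k );
    magma_smalloc_cpu( &tauq, k );
    magma_smalloc_cpu( &taup, k );

    // workspace queries (lwork = -1); take the max of both requirements
    magma_int_t lwork = -1;
    magma_sgebrd( m, n, A, lda, d, e, tauq, taup, &query, lwork, &info );
    lwork = (magma_int_t) query;
    magma_sormbr( MagmaQ, MagmaLeft, MagmaTrans, m, n, k,
                  A, lda, tauq, C, ldc, &query, -1, &info );
    lwork = max( lwork, (magma_int_t) query );
    magma_smalloc_cpu( &work, lwork );

    magma_sgebrd( m, n, A, lda, d, e, tauq, taup, work, lwork, &info );

    // vect = MagmaQ selects the reflectors stored below the diagonal (Q);
    // MagmaP would select those stored above the diagonal (P).
    magma_sormbr( MagmaQ, MagmaLeft, MagmaTrans, m, n, k,
                  A, lda, tauq, C, ldc, work, lwork, &info );

    magma_free_cpu( d );    magma_free_cpu( e );
    magma_free_cpu( tauq ); magma_free_cpu( taup );
    magma_free_cpu( work );
}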
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sgeqrs_gpu
*/
int main( int argc, char** argv)
{
    //#if defined(PRECISION_s)
    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;

    magma_init();
    err = magma_get_devices( device, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device[0], &queue );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", err );
        exit(-1);
    }

    real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float matnorm, work[1];
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1];
    magmaFloat_ptr d_A, d_B;

    /* Matrix size */
    magma_int_t M = 0, N = 0, n2;
    magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork;
    magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000};

    magma_int_t i, info, min_mn, nb, l1, l2;
    magma_int_t ione     = 1;
    magma_int_t nrhs     = 3;
    magma_int_t ISEED[4] = {0,0,0,1};

    if (argc != 1){
        for(i = 1; i < argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
            else if (strcmp("-nrhs", argv[i])==0)
                nrhs = atoi(argv[++i]);
        }
        if (N > 0 && M > 0 && M >= N)
            printf("  testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N);
        else {
            printf("\nUsage: \n");
            printf("  testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N);
            printf("  M has to be >= N, exit.\n");
            exit(1);
        }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024);
        M = N = size[6];
    }

    ldda   = ((M+31)/32)*32;
    lddb   = ldda;
    n2     = M * N;
    min_mn = min(M, N);
    nb     = magma_get_sgeqrf_nb(M);
    lda = ldb = M;
    lworkgpu = (M-N + nb)*(nrhs+2*nb);

    /* Allocate host memory for the matrix */
    TESTING_MALLOC_PIN( tau,  float, min_mn   );
    TESTING_MALLOC_PIN( h_A,  float, lda*N    );
    TESTING_MALLOC_PIN( h_A2, float, lda*N    );
    TESTING_MALLOC_PIN( h_B,  float, ldb*nrhs );
    TESTING_MALLOC_PIN( h_X,  float, ldb*nrhs );
    TESTING_MALLOC_PIN( h_R,  float, ldb*nrhs );

    TESTING_MALLOC_DEV( d_A, float, ldda*N    );
    TESTING_MALLOC_DEV( d_B, float, lddb*nrhs );

    /*
     * Get size for host workspace
     */
    lhwork = -1;
    lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info);
    l1 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
    lhwork = -1;
    lapackf77_sormqr( MagmaLeftStr, MagmaTransStr,
                      &M, &nrhs, &min_mn, h_A, &lda, tau,
                      h_X, &ldb, tmp, &lhwork, &info);
    l2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
    lhwork = max( max( l1, l2 ), lworkgpu );

    TESTING_MALLOC_PIN( hwork, float, lhwork );

    printf("\n");
    printf("                                           ||b-Ax|| / (N||A||)\n");
    printf("    M     N   CPU GFlop/s   GPU GFlop/s      CPU       GPU    \n");
    printf("============================================================\n");
    for(i=0; i<7; i++){
        if (argc == 1){
            M = N = size[i];
        }
        min_mn = min(M, N);
        ldb = lda = M;
        n2   = lda*N;
        ldda = ((M+31)/32)*32;
        gflops = (FLOPS_GEQRF( (float)M, (float)N )
                  + FLOPS_GEQRS( (float)M, (float)N, (float)nrhs )) / 1e9;

        /* Initialize the matrices */
        lapackf77_slarnv( &ione, ISEED, &n2, h_A );
        lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda );

        n2 = M*nrhs;
        lapackf77_slarnv( &ione, ISEED, &n2, h_B );
        lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb );

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        /* Warm up to measure the performance */
        magma_ssetmatrix( M, N,    h_A, 0, lda, d_A, 0, ldda, queue );
        magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue );
        magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda,
                         d_B, 0, lddb, hwork, lworkgpu, &info, queue);

        magma_ssetmatrix( M, N,    h_A, 0, lda, d_A, 0, ldda, queue );
        magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue );

        gpu_time = magma_wtime();
        magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda,
                         d_B, 0, lddb, hwork, lworkgpu, &info, queue);
        gpu_time = magma_wtime() - gpu_time;
        if (info < 0)
            printf("Argument %d of magma_sgels had an illegal value.\n", -info);

        gpu_perf = gflops / gpu_time;

        // Get the solution in h_X
        magma_sgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue );

        // compute the residual
        blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                       &c_neg_one, h_A, &lda,
                                   h_X, &ldb,
                       &c_one,     h_R, &ldb);
        matnorm = lapackf77_slange("f", &M, &N, h_A, &lda, work);

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb );

        cpu_time = magma_wtime();
        lapackf77_sgels( MagmaNoTransStr, &M, &N, &nrhs,
                         h_A, &lda, h_X, &ldb, hwork, &lhwork, &info);
        cpu_time = magma_wtime() - cpu_time;
        cpu_perf = gflops / cpu_time;
        if (info < 0)
            printf("Argument %d of lapackf77_sgels had an illegal value.\n", -info);

        blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                       &c_neg_one, h_A2, &lda,
                                   h_X,  &ldb,
                       &c_one,     h_B,  &ldb);

        printf("%5d %5d   %6.1f       %6.1f       %7.2e   %7.2e\n",
               M, N, cpu_perf, gpu_perf,
               lapackf77_slange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm),
               lapackf77_slange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) );

        if (argc != 1)
            break;
    }

    /* Memory clean up */
    TESTING_FREE_PIN( tau );
    TESTING_FREE_PIN( h_A );
    TESTING_FREE_PIN( h_A2 );
    TESTING_FREE_PIN( h_B );
    TESTING_FREE_PIN( h_X );
    TESTING_FREE_PIN( h_R );
    TESTING_FREE_PIN( hwork );
    TESTING_FREE_DEV( d_A );
    TESTING_FREE_DEV( d_B );

    /* Shutdown */
    magma_queue_destroy( queue );
    magma_finalize();
    return 0;
}
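/* A hedged, self-contained sketch (the helper name is ours, not from the
   tester): the relative residual printed above, ||B - A*X||_F / (N*||A||_F),
   computed with the same BLAS/LAPACK kernels the tester uses. */
static float
residual_check( magma_int_t M, magma_int_t N, magma_int_t nrhs,
                float *A, magma_int_t lda,    // original matrix, unmodified
                float *X, magma_int_t ldx,    // computed solution
                float *B, magma_int_t ldb )   // right-hand sides; overwritten with B - A*X
{
    float c_one = MAGMA_S_ONE, c_neg_one = MAGMA_S_NEG_ONE, work[1];
    // B := B - A*X
    blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                   &c_neg_one, A, &lda, X, &ldx, &c_one, B, &ldb );
    float Anorm = lapackf77_slange( "f", &M, &N,    A, &lda, work );
    float Rnorm = lapackf77_slange( "f", &M, &nrhs, B, &ldb, work );
    return Rnorm / (N * Anorm);
}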
/***************************************************************************//**
    Purpose
    -------
    SORMQR overwrites the general real M-by-N matrix C with

    @verbatim
                               SIDE = MagmaLeft   SIDE = MagmaRight
    TRANS = MagmaNoTrans:      Q * C              C * Q
    TRANS = MagmaTrans:        Q**H * C           C * Q**H
    @endverbatim

    where Q is a real orthogonal matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by SGEQRF. Q is of order M if SIDE = MagmaLeft
    and of order N if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    side    magma_side_t
      -     = MagmaLeft:   apply Q or Q**H from the Left;
      -     = MagmaRight:  apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:  No transpose, apply Q;
      -     = MagmaTrans:    Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       REAL array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            SGEQRF in the first k columns of its array argument A.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If SIDE = MagmaLeft,  LDA >= max(1,M);
            if SIDE = MagmaRight, LDA >= max(1,N).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF.

    @param[in,out]
    C       REAL array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and
            LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal
            blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      - = 0:  successful exit
      - < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_unmqr
*******************************************************************************/
extern "C" magma_int_t
magma_sormqr_m(
    magma_int_t ngpu,
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A,    magma_int_t lda,
    float *tau,
    float *C,    magma_int_t ldc,
    float *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define A(i, j)  (A + (j)*lda + (i))
    #define C(i, j)  (C + (j)*ldc + (i))

    #define dC(gpui, i, j)         (dw[gpui] + (j)*lddc + (i))
    #define dA_c(gpui, ind, i, j)  (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
    #define dA_r(gpui, ind, i, j)  (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
    #define dT(gpui, ind)          (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
    #define dwork(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

    /* Constants */
    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    /* Local variables */
    const char* side_  = lapack_side_const( side );
    const char* trans_ = lapack_trans_const( trans );

    magma_int_t nb = 128;
    float *T = NULL;
    magmaFloat_ptr dw[MagmaMaxGPUs] = { NULL };
    magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }};
    magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }};
    magma_int_t ind_c;
    magma_device_t dev;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    *info = 0;

    magma_int_t left   = (side == MagmaLeft);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        work[0] = magma_smake_lwork( lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_sormqr(side_, trans_, &m, &n, &k, A, &lda, tau,
                         C, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = magma_roundup( m, 64 );  // TODO why 64 instead of 32 ?
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l = 256;
    magma_int_t nbl = magma_ceildiv( n, nb_l );  // number of blocks
    magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l;

    ngpu = min( ngpu, magma_ceildiv( n, nb_l ));  // Don't use GPU that will not have data.
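    /* Per-GPU workspace layout in dw[dev] (see the macros above):
           [ dC: maxnlocal*lddc | dA(0), dA(1): lddac*lddar each |
             dT(0), dT(1): (nb+1)*nb each | dwork(0), dwork(1): lddwork*nb each ]
       The two copies (ind_c = 0, 1) double-buffer the panels: while panel
       ind_c is applied on queues[dev][1], the next panel is uploaded on
       queues[dev][0]; events[dev][ind_c] guard buffer reuse. */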
    magma_int_t ldw = maxnlocal*lddc            // dC
                    + 2*lddac*lddar             // 2*dA
                    + 2*(nb + 1 + lddwork)*nb;  // 2*(dT and dwork)

    if (MAGMA_SUCCESS != magma_smalloc_pinned( &T, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }

    for (dev = 0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        if (MAGMA_SUCCESS != magma_smalloc( &dw[dev], ldw )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        magma_queue_create( dev, &queues[dev][0] );
        magma_queue_create( dev, &queues[dev][1] );
        magma_event_create( &events[dev][0] );
        magma_event_create( &events[dev][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {
        // copy C to mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            dev = i % ngpu;
            magma_setdevice( dev );
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_ssetmatrix_async( m, kb,
                                    C(0, i*nb_l), ldc,
                                    dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] );
            nlocal[dev] += kb;
        }

        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            // start the copy of A panel
            magma_int_t kb = min(nb, k - i);
            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_event_sync( events[dev][ind_c] );  // check if the new data can be copied
                magma_ssetmatrix_async(nq-i, kb,
                                       A(i, i),                lda,
                                       dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] );
                // set upper triangular part of dA to identity
                magmablas_slaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one,
                                       dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] );
            }

            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_slarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], T, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; First copy T to the GPU */
            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_ssetmatrix_async(kb, kb,
                                       T,              kb,
                                       dT(dev, ind_c), kb, queues[dev][0] );
            }

            for (dev = 0; dev < ngpu; ++dev) {
                magma_setdevice( dev );
                magma_queue_sync( queues[dev][0] );  // check if the data was copied
                magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                  m-i, nlocal[dev], kb,
                                  dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb,
                                  dC(dev, i, 0), lddc,
                                  dwork(dev, ind_c), lddwork, queues[dev][1] );
                magma_event_record(events[dev][ind_c], queues[dev][1] );
            }

            ind_c = (ind_c+1) % 2;
        }

        for (dev = 0; dev < ngpu; ++dev) {
            magma_setdevice( dev );
            magma_queue_sync( queues[dev][1] );
        }

        // copy C from mgpus
        for (magma_int_t i = 0; i < nbl; ++i) {
            dev = i % ngpu;
            magma_setdevice( dev );
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_sgetmatrix( m, kb,
                              dC(dev, 0, i/ngpu*nb_l), lddc,
                              C(0, i*nb_l), ldc, queues[dev][1] );
            // magma_sgetmatrix_async( m, kb,
            //                         dC(dev, 0, i/ngpu*nb_l), lddc,
            //                         C(0, i*nb_l), ldc, queues[dev][0] );
        }
    } else {
        *info = MAGMA_ERR_NOT_IMPLEMENTED;
        magma_xerbla( __func__, -(*info) );
        goto cleanup;

        /*
        if ( notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        mi = m;
        ic = 0;

        for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) {
            ib = min(nb, k - i);

            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            i__4 = nq - i;
            lapackf77_slarft("F", "C", &i__4, &ib, A(i, i), &lda,
                             &tau[i], T, &ib);

            // 1) copy the panel from A to the GPU, and
            // 2) set upper triangular part of dA to identity
            magma_ssetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] );
            magmablas_slaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one,
                                   dA(i, 0), ldda, queues[dev][1] );

            // H or H' is applied to C(1:m,i:n)
            ni = n - i;
            jc = i;

            // Apply H or H'; First copy T to the GPU
            magma_ssetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] );
            magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                              mi, ni, ib,
                              dA(i, 0), ldda, dT, ib,
                              dC(ic, jc), lddc,
                              dwork, lddwork, queues[dev][1] );
        }
        */
    }

cleanup:
    work[0] = magma_smake_lwork( lwkopt );

    for (dev = 0; dev < ngpu; ++dev) {
        magma_setdevice( dev );
        magma_event_destroy( events[dev][0] );
        magma_event_destroy( events[dev][1] );
        magma_queue_destroy( queues[dev][0] );
        magma_queue_destroy( queues[dev][1] );
        magma_free( dw[dev] );
    }
    magma_setdevice( orig_dev );
    magma_free_pinned( T );

    return *info;
} /* magma_sormqr */
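/* A minimal usage sketch (ours, not library code) of the LWORK = -1 workspace
   query described in the documentation above, followed by the actual call.
   It assumes A (m-by-k) already holds the output of SGEQRF and tau its k
   reflector scalars; the function name and sizes are illustrative. */
static void
example_sormqr_m( magma_int_t ngpu,
                  magma_int_t m, magma_int_t n, magma_int_t k,
                  float *A, magma_int_t lda, float *tau,
                  float *C, magma_int_t ldc )
{
    magma_int_t info;
    float query;

    // workspace query: lwork = -1 only returns the optimal size in work[0]
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, &query, -1, &info );
    magma_int_t lwork = (magma_int_t) query;

    float *work;
    magma_smalloc_cpu( &work, lwork );

    // apply Q**T from the left: C := Q**T * C, distributed over ngpu GPUs
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, work, lwork, &info );

    magma_free_cpu( work );
}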
extern "C" magma_int_t magma_sormqr_gpu(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda, float *tau, magmaFloat_ptr dC, size_t dC_offset, magma_int_t lddc, float *hwork, magma_int_t lwork, magmaFloat_ptr dT, size_t dT_offset, magma_int_t nb, magma_int_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver April 2012 Purpose ======= SORMQR_GPU overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. DA (input) REAL array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF in the first k columns of its array argument DA. DA is modified by the routine but restored on exit. LDDA (input) INTEGER The leading dimension of the array DA. If SIDE = 'L', LDDA >= max(1,M); if SIDE = 'R', LDDA >= max(1,N). TAU (input) REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF. DC (input/output) REAL array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. LDDC (input) INTEGER The leading dimension of the array DC. LDDC >= max(1,M). HWORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array HWORK. LWORK >= (M-K+NB)*(N+2*NB) if SIDE = 'L', and LWORK >= (N-K+NB)*(M+2*NB) if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array, and no error message related to LWORK is issued by XERBLA. DT (input) REAL array on the GPU that is the output (the 9th argument) of magma_sgeqrf_gpu. NB (input) INTEGER This is the blocking size that was used in pre-computing DT, e.g., the blocking size used in magma_sgeqrf_gpu. 
INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) dA, (dA_offset+(a_1)+(a_2)*(ldda)) #define c_ref(a_1,a_2) dC, (dC_offset+(a_1)+(a_2)*(lddc)) #define t_ref(a_1) dT, (dT_offset+(a_1)*nb) float c_one = MAGMA_S_ONE; magma_side_t side_ = side; magma_trans_t trans_ = trans; magmaFloat_ptr dwork; magma_int_t i, lddwork; magma_int_t i1, i2, i3, ib, ic, jc, mi, ni, nq, nw, ret; long int left, notran, lquery; static magma_int_t lwkopt; *info = 0; left = lapackf77_lsame(lapack_const(side_), lapack_const(MagmaLeft)); notran = lapackf77_lsame(lapack_const(trans_), lapack_const(MagmaNoTrans)); lquery = (lwork == -1); if (!left || notran) printf("sormqr_gpu called with arguments not yet supported\n"); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if ( (!left) && (!lapackf77_lsame(lapack_const(side_), lapack_const(MagmaRight))) ) { *info = -1; } else if ( (!notran) && (!lapackf77_lsame(lapack_const(trans_), lapack_const(MagmaTrans))) ) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } lwkopt = (m-k+nb)*(n+2*nb); hwork[0] = MAGMA_S_MAKE( lwkopt, 0 ); if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { hwork[0] = c_one; return *info; } lddwork= k; dwork = dT; size_t dwork_offset = 2*lddwork*nb; if ( (left && (! notran)) || ( (!left) && notran ) ) { i1 = 0; i2 = k-nb; i3 = nb; } else { i1 = (k - 1 - nb) / nb * nb; i2 = 0; i3 = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } if (nb < k) { for (i=i1; i3<0 ? i>i2 : i<i2; i+=i3) { ib = min(nb, k - i); if (left){ mi = m - i; ic = i; } else { ni = n - i; jc = i; } ret = magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, mi, ni, ib, a_ref(i, i ), ldda, t_ref(i), nb, c_ref(ic, jc), lddc, dwork, dwork_offset, nw, queue); if ( ret != MAGMA_SUCCESS ) return ret; } } else { i = i1; } /* Use unblocked code to multiply the last or only block. */ if (i < k) { ib = k-i; if (left){ mi = m - i; ic = i; } else { ni = n - i; jc = i; } magma_sgetmatrix(mi, ib, a_ref(i, i), ldda, hwork, 0, mi, queue); magma_sgetmatrix(mi, ni, c_ref(ic, jc), lddc, hwork+mi*ib, 0, mi, queue); magma_int_t lhwork = lwork - mi*(ib + ni); lapackf77_sormqr( MagmaLeftStr, MagmaTransStr, &mi, &ni, &ib, hwork, &mi, tau+i, hwork+mi*ib, &mi, hwork+mi*(ib+ni), &lhwork, info); // send the updated part of c back to the GPU magma_ssetmatrix(mi, ni, hwork+mi*ib, 0, mi, c_ref(ic, jc), lddc, queue); } return *info; /* End of MAGMA_SORMQR_GPU */ }
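/* A sketch (ours, not library code) of the index walk used above for the one
   supported case, SIDE = 'L' with TRANS = 'T', assuming nb < k: the forward
   sweep deliberately stops at k-nb, so the last (possibly smaller) block
   falls through to the unblocked CPU path. */
#include <stdio.h>

static void sormqr_gpu_sweep( int k, int nb )
{
    int i1 = 0, i2 = k - nb, i3 = nb, i;
    // each GPU step applies a full nb-wide panel via slarfb
    for (i = i1; (i3 < 0 ? i > i2 : i < i2); i += i3)
        printf( "GPU slarfb: apply reflectors [%d, %d)\n", i, i + nb );
    // the tail block (ib = k - i reflectors) goes through lapackf77_sormqr
    printf( "CPU sormqr: apply reflectors [%d, %d)\n", i, k );
}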