/* Update thread: CPU worker that applies the block reflectors produced so far
   (via zlarfb) to this thread's trailing block of ob columns. */
extern "C" void *cpu_thread(void *a)
{
    magma_int_t i;
    t_params *tp = (t_params*)a;
    magma_qr_params *mp = (magma_qr_params*)tp->params;
    long int t = (long int) tp->tid;
    magma_int_t M, N, K;
    cuDoubleComplex *WORK;

loop:
    /* Wait for the master to signal that panels are being produced. */
    while (mp->sync0 == 0) {
        sched_yield();
    }

    for (i = 0; i < mp->np_gpu; i++) {
        /* Spin until panel i's T factor has been published. */
        while (mp->p[i] == NULL) {
            sched_yield();
        }

        M = mp->m - i*mp->nb;
        N = mp->ob;
        K = mp->nb;

        /* The last panel may be narrower than nb. */
        if (mp->m >= (mp->n - (mp->nthreads * mp->ob))) {
            if (i == (mp->np_gpu - 1)) {
                K = mp->n - mp->nthreads*mp->ob - (mp->np_gpu - 1)*mp->nb;
            }
        }

        WORK = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex)*M*N);

        /* Apply H' from the left to this thread's block of ob columns. */
        lapackf77_zlarfb( MagmaLeftStr, MagmaTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &M, &N, &K,
                          mp->a + i*mp->nb*mp->lda + i*mp->nb, &(mp->lda),
                          mp->t + i*mp->nb*mp->nb, &K,
                          mp->a + mp->m*mp->n - (mp->nthreads - t)*mp->ob*mp->lda + i*mp->nb, &(mp->lda),
                          WORK, &N );
        free(WORK);
    }

    /* Signal completion and wait for the master before the next pass. */
    mp->sync1[t] = 1;
    while (mp->sync2 == 0) {
        sched_yield();
    }
    goto loop;

    return (void*)NULL;  /* unreachable; silences compiler warnings */
}
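/* For context: a minimal, hypothetical sketch of how such workers might be
   launched and synchronized. The t_params fields (tid, params) follow the
   usage above; the launcher itself is illustrative, not part of MAGMA. */
#include <pthread.h>

static void launch_update_threads( magma_qr_params *mp, magma_int_t nthreads,
                                   pthread_t *threads, t_params *tp )
{
    for (long int t = 0; t < nthreads; ++t) {
        tp[t].tid    = t;    // worker index; selects its block of ob columns
        tp[t].params = mp;   // shared QR bookkeeping and sync flags
        pthread_create( &threads[t], NULL, cpu_thread, &tp[t] );
    }
    // Setting mp->sync0 = 1 releases the workers; each worker sets
    // mp->sync1[t] when its pass is done, and mp->sync2 restarts the loop.
}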
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zlarfb_gpu
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, nv;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double error, work[1];

    // test all combinations of input parameters
    const char side[]   = { MagmaLeft,       MagmaRight    };
    const char trans[]  = { MagmaConjTrans,  MagmaNoTrans  };
    const char direct[] = { MagmaForward,    MagmaBackward };
    const char storev[] = { MagmaColumnwise, MagmaRowwise  };

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n");
    printf("========================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        M = opts.msize[i];
        N = opts.nsize[i];
        K = opts.ksize[i];
        if ( M < K || N < K || K <= 0 ) {
            printf( "skipping M %d, N %d, K %d; requires M >= K, N >= K, K > 0.\n",
                    (int) M, (int) N, (int) K );
            continue;
        }
        for( int istor = 0; istor < 2; ++istor ) {
        for( int iside = 0; iside < 2; ++iside ) {
        for( int idir  = 0; idir  < 2; ++idir  ) {
        for( int itran = 0; itran < 2; ++itran ) {
            ldc = ((M+31)/32)*32;
            ldt = ((K+31)/32)*32;
            ldw = (side[iside] == MagmaLeft ? N : M);
            // (ldv, nv) get swapped later if rowwise
            ldv = (side[iside] == MagmaLeft ? M : N);
            nv  = K;

            // Allocate memory for matrices
            magmaDoubleComplex *C, *R, *V, *T, *W;
            TESTING_MALLOC( C, magmaDoubleComplex, ldc*N );
            TESTING_MALLOC( R, magmaDoubleComplex, ldc*N );
            TESTING_MALLOC( V, magmaDoubleComplex, ldv*K );
            TESTING_MALLOC( T, magmaDoubleComplex, ldt*K );
            TESTING_MALLOC( W, magmaDoubleComplex, ldw*K );

            magmaDoubleComplex *dC, *dV, *dT, *dW;
            TESTING_DEVALLOC( dC, magmaDoubleComplex, ldc*N );
            TESTING_DEVALLOC( dV, magmaDoubleComplex, ldv*K );
            TESTING_DEVALLOC( dT, magmaDoubleComplex, ldt*K );
            TESTING_DEVALLOC( dW, magmaDoubleComplex, ldw*K );

            // C is M x N.
            size = ldc*N;
            lapackf77_zlarnv( &ione, ISEED, &size, C );
            //printf( "C=" );  magma_zprint( M, N, C, ldc );

            // V is ldv x nv. See larfb docs for description.
            // if column-wise and left,  M x K
            // if column-wise and right, N x K
            // if row-wise and left,     K x M
            // if row-wise and right,    K x N
            size = ldv*nv;
            lapackf77_zlarnv( &ione, ISEED, &size, V );
            if ( storev[istor] == MagmaColumnwise ) {
                if ( direct[idir] == MagmaForward ) {
                    lapackf77_zlaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv );
                }
                else {
                    lapackf77_zlaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv );
                }
            }
            else {
                // rowwise, swap V's dimensions
                std::swap( ldv, nv );
                if ( direct[idir] == MagmaForward ) {
                    lapackf77_zlaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv );
                }
                else {
                    lapackf77_zlaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv );
                }
            }
            //printf( "# ldv %d, nv %d\n", ldv, nv );
            //printf( "V=" );  magma_zprint( ldv, nv, V, ldv );

            // T is K x K, upper triangular for forward, and lower triangular for backward
            magma_int_t k1 = K-1;
            size = ldt*K;
            lapackf77_zlarnv( &ione, ISEED, &size, T );
            if ( direct[idir] == MagmaForward ) {
                lapackf77_zlaset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt );
            }
            else {
                lapackf77_zlaset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt );
            }
            //printf( "T=" );  magma_zprint( K, K, T, ldt );

            magma_zsetmatrix( M,   N,  C, ldc, dC, ldc );
            magma_zsetmatrix( ldv, nv, V, ldv, dV, ldv );
            magma_zsetmatrix( K,   K,  T, ldt, dT, ldt );

            lapackf77_zlarfb( &side[iside], &trans[itran], &direct[idir], &storev[istor],
                              &M, &N, &K,
                              V, &ldv, T, &ldt, C, &ldc, W, &ldw );
            //printf( "HC=" );  magma_zprint( M, N, C, ldc );

            magma_zlarfb_gpu( side[iside], trans[itran], direct[idir], storev[istor],
                              M, N, K,
                              dV, ldv, dT, ldt, dC, ldc, dW, ldw );
            magma_zgetmatrix( M, N, dC, ldc, R, ldc );
            //printf( "dHC=" );  magma_zprint( M, N, R, ldc );

            // compute relative error |HC_magma - HC_lapack| / |HC_lapack|
            error = lapackf77_zlange( "Fro", &M, &N, C, &ldc, work );
            size = ldc*N;
            blasf77_zaxpy( &size, &c_neg_one, C, &ione, R, &ione );
            error = lapackf77_zlange( "Fro", &M, &N, R, &ldc, work ) / error;
            printf( "%5d %5d %5d %c %c %c %c %8.2e\n",
                    (int) M, (int) N, (int) K,
                    storev[istor], side[iside], direct[idir], trans[itran], error );

            TESTING_FREE( C );
            TESTING_FREE( R );
            TESTING_FREE( V );
            TESTING_FREE( T );
            TESTING_FREE( W );

            TESTING_DEVFREE( dC );
            TESTING_DEVFREE( dV );
            TESTING_DEVFREE( dT );
            TESTING_DEVFREE( dW );
        }}}}
        printf( "\n" );
    }

    TESTING_FINALIZE();
    return 0;
}
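/* The tester prints the raw relative error. A pass/fail check against a
   scaled machine epsilon, as is common in MAGMA testers, could look like the
   sketch below; the 2*eps tolerance is an assumption, not taken from this file. */
// double eps = lapackf77_dlamch( "E" );           // machine epsilon
// double tol = 2.0 * eps;                         // assumed tolerance
// printf( "%s\n", (error < tol ? "ok" : "failed") );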
/**
    Purpose
    -------
    ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       COMPLEX_16 array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector which
            defines the elementary reflector H(i), for i = 1,2,...,k, as
            returned by ZGEQRF_GPU in the first k columns of its array
            argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF_GPU.

    @param[in]
    dT      COMPLEX_16 array on the GPU device.
            DT contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_zgeqrf_gpu.

    @param[in]
    nb      INTEGER
            This is the block size used in ZGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in DT.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zungqr(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex_ptr dT, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j) ( A + (i) + (j)*lda )
#define dA(i,j) (dA + (i) + (j)*ldda)
#define dT(j)   (dT + (j)*nb)

    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldda;
    magma_int_t i, ib, ki, kk;
    magma_int_t lddwork;
    magmaDoubleComplex *dA=NULL, *dV=NULL, *dW=NULL;
    magmaDoubleComplex *work=NULL;
    magma_queue_t queue=NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate GPU work space
    // ldda*n     for matrix dA
    // ldda*nb    for dV
    // lddwork*nb for dW larfb workspace
    ldda    = magma_roundup( m, 32 );
    lddwork = magma_roundup( n, 32 );
    if (MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        goto cleanup;
    }
    dV = dA + ldda*n;
    dW = dA + ldda*n + ldda*nb;

    // Allocate CPU work space
    // n*nb  for larfb work
    // m*nb  for V
    // nb*nb for T
    lwork = (n + m + nb) * nb;
    magma_zmalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    magmaDoubleComplex *work_T, *work_V;
    work_T = work + n*nb;
    work_V = work + n*nb + nb*nb;

    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    // Use unblocked code for the last or only block.
    if (kk < n) {
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        // zungqr requires less workspace (n*nb), but is slow if k < zungqr's block size.
        // replacing it with the 4 routines below is much faster (e.g., 60x).
        //magma_int_t iinfo;
        //lapackf77_zungqr( &m_kk, &n_kk, &k_kk,
        //                  A(kk, kk), &lda,
        //                  &tau[kk], work, &lwork, &iinfo );
        lapackf77_zlacpy( MagmaFullStr, &m_kk, &k_kk,
                          A(kk,kk), &lda, work_V, &m_kk );
        lapackf77_zlaset( MagmaFullStr, &m_kk, &n_kk,
                          &c_zero, &c_one, A(kk, kk), &lda );
        lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          work_V, &m_kk, &tau[kk], work_T, &k_kk );
        lapackf77_zlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk );

        if (kk > 0) {
            magma_zsetmatrix( m_kk, n_kk,
                              A(kk, kk),  lda,
                              dA(kk, kk), ldda, queue );

            // Set A(1:kk,kk+1:n) to zero.
            magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero,
                              dA(0, kk), ldda, queue );
        }
    }

    if (kk > 0) {
        // Use blocked code
        // queue: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for (i = ki; i >= 0; i -= nb) {
            ib = min(nb, k - i);
            // Send current panel to dV on the GPU
            mi = m - i;
            lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            magma_zsetmatrix_async( mi, ib,
                                    A(i, i), lda,
                                    dV,      ldda, queue );
            // set panel to identity
            magmablas_zlaset( MagmaFull, i,  ib, c_zero, c_zero, dA(0, i), ldda, queue );
            magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one,  dA(i, i), ldda, queue );

            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                  mi, n-i, ib,
                                  dV, ldda, dT(i), nb,
                                  dA(i, i), ldda, dW, lddwork, queue );
            }
        }

        // copy result back to CPU
        magma_zgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda, queue );
    }

cleanup:
    magma_queue_destroy( queue );
    magma_free( dA );
    magma_free_cpu( work );
    return *info;
} /* magma_zungqr */
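/* A minimal usage sketch, assuming the classic MAGMA calling sequence:
   factor on the GPU with magma_zgeqrf_gpu (whose 6th argument supplies dT,
   per the documentation above), copy the reflectors back, then form Q in
   place with magma_zungqr. Exact signatures (queue arguments in particular)
   vary across MAGMA releases, so treat this as illustrative. */
// magma_int_t info;
// magma_int_t nb = magma_get_zgeqrf_nb( m );            // block size used by zgeqrf
// magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );   // A = Q*R; T factors land in dT
// magma_zgetmatrix( m, n, dA, ldda, A, lda );           // reflectors back to the CPU
// magma_zungqr( m, n, k, A, lda, tau, dT, nb, &info );  // A(:,1:n) overwritten with Q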
static void
magma_ztile_bulge_applyQ(
    magma_int_t core_id, magma_side_t side, magma_int_t n_loc, magma_int_t n,
    magma_int_t nb, magma_int_t Vblksiz,
    magmaDoubleComplex *E, magma_int_t lde,
    magmaDoubleComplex *V, magma_int_t ldv,
    magmaDoubleComplex *TAU,
    magmaDoubleComplex *T, magma_int_t ldt)
    //, magma_int_t* info)
{
    //%===========================
    //%   local variables
    //%===========================
    magma_int_t firstcolj;
    magma_int_t bg, rownbm;
    magma_int_t st, ed, fst, vlen, vnb, colj;
    magma_int_t vpos, tpos;
    magma_int_t cur_blksiz, avai_blksiz, ncolinvolvd;
    magma_int_t nbgr, colst, coled;

    if (n <= 0)
        return;
    if (n_loc <= 0)
        return;

    //info = 0;
    magma_int_t INFO = 0;

    magma_int_t nbGblk = magma_ceildiv(n-1, Vblksiz);

    /*
     * version v1: for each chunk, apply all the V's, then move to the next
     *             chunk. The locality here is inside each chunk: thread t
     *             applies V_k and then moves on to V_{k+1}, which overlaps
     *             with V_k, meaning that E_{k+1} overlaps with E_k. So there
     *             is locality, but thread t has to read V_{k+1} and T_{k+1}
     *             at each apply. Note that if all threads run at the same
     *             speed, they may be reading the same V_k and T_k at the
     *             same time.
     */

    magma_int_t nb_loc = 128; //$$$$$$$$

    magma_int_t lwork = 2*nb_loc*max(Vblksiz, 64);
    magmaDoubleComplex *work, *work2;

    magma_zmalloc_cpu(&work, lwork);
    magma_zmalloc_cpu(&work2, lwork);

    magma_int_t nbchunk = magma_ceildiv(n_loc, nb_loc);

    /* SIDE LEFT:  apply E = Q*E = (q_1*q_2*...*q_n) * E ==> traverse the V's
     *             in reverse order from q_n to q_1. Each q_i applies V to a
     *             block of rows E(row_i,:), and the applies overlap, meaning
     *             that q_{i+1} overlaps a portion of E(row_i,:).
     *             In parallel, E is split into vertical blocks over the threads.
     */
    /* SIDE RIGHT: apply E = E*Q = E * (q_1*q_2*...*q_n) ==> traverse the V's
     *             in normal order from q_1 to q_n. Each q_i applies V to a
     *             block of columns E(:,col_i), and the applies overlap, meaning
     *             that q_{i+1} overlaps a portion of E(:,col_i).
     *             In parallel, E is split into horizontal blocks over the threads.
     */
    #ifdef ENABLE_DEBUG
    if ((core_id == 0) || (core_id == 1))
        printf(" APPLY Q2_cpu zbulge_back_m N %d N_loc %d nbchunk %d NB %d Vblksiz %d SIDE %c \n",
               n, n_loc, nbchunk, nb, Vblksiz, side);
    #endif

    for (magma_int_t i = 0; i < nbchunk; i++) {
        magma_int_t ib_loc = min(nb_loc, (n_loc - i*nb_loc));

        if (side == MagmaLeft) {
            for (bg = nbGblk; bg > 0; bg--) {
                firstcolj = (bg-1)*Vblksiz + 1;
                rownbm    = magma_ceildiv((n-(firstcolj+1)), nb);
                if (bg == nbGblk)
                    rownbm = magma_ceildiv((n-(firstcolj)), nb);  // last blk has size=1 used for complex to handle A(N,N-1)

                for (magma_int_t j = rownbm; j > 0; j--) {
                    vlen = 0;
                    vnb  = 0;
                    colj = (bg-1)*Vblksiz;  // for k=0; compute fst here, so it can be removed from the loop
                    fst  = (rownbm -j)*nb + colj + 1;
                    for (magma_int_t k = 0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -j)*nb + colj + 1;
                        ed   = min(st+nb-1, n-1);
                        if (st > ed)
                            break;
                        if ((st == ed) && (colj != n-2))
                            break;
                        vlen = ed - fst + 1;
                        vnb  = k + 1;
                    }
                    colst = (bg-1)*Vblksiz;
                    magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos);

                    if ((vlen > 0) && (vnb > 0)) {
                        lapackf77_zlarfb( "L", "N", "F", "C",
                                          &vlen, &ib_loc, &vnb,
                                          V(vpos), &ldv, T(tpos), &ldt,
                                          E(fst, i*nb_loc), &lde, work, &ib_loc );
                    }
                    if (INFO != 0)
                        printf("ERROR ZUNMQR INFO %d \n", (int) INFO);
                }
            }
        } else if (side == MagmaRight) {
            rownbm = magma_ceildiv((n-1), nb);
            for (magma_int_t k = 1; k <= rownbm; k++) {
                ncolinvolvd = min(n-1, k*nb);
                avai_blksiz = min(Vblksiz, ncolinvolvd);
                nbgr = magma_ceildiv(ncolinvolvd, avai_blksiz);
                for (magma_int_t j = 1; j <= nbgr; j++) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(j-1)*avai_blksiz, avai_blksiz);
                    colst = (j-1)*avai_blksiz;
                    coled = colst + cur_blksiz - 1;
                    fst   = (rownbm -k)*nb + colst + 1;
                    for (colj = colst; colj <= coled; colj++) {
                        st = (rownbm -k)*nb + colj + 1;
                        ed = min(st+nb-1, n-1);
                        if (st > ed)
                            break;
                        if ((st == ed) && (colj != n-2))
                            break;
                        vlen = ed - fst + 1;
                        vnb  = vnb + 1;
                    }
                    magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos);
                    if ((vlen > 0) && (vnb > 0)) {
                        lapackf77_zlarfb( "R", "N", "F", "C",
                                          &ib_loc, &vlen, &vnb,
                                          V(vpos), &ldv, T(tpos), &ldt,
                                          E(i*nb_loc, fst), &lde, work, &ib_loc );
                    }
                }
            }
        } else {
            printf("ERROR SIDE %d \n", (int) side);
        }
    } // END loop over the chunks

    magma_free_cpu(work);
    magma_free_cpu(work2);
}
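/* For intuition on the chunking above: this thread's n_loc columns of E
   (rows, for the MagmaRight case) are split into nb_loc-wide chunks, with the
   last chunk taking the remainder. A small standalone example with
   illustrative numbers: */
#include <stdio.h>

int main( void )
{
    int n_loc = 300, nb_loc = 128;
    int nbchunk = (n_loc + nb_loc - 1) / nb_loc;   // magma_ceildiv(n_loc, nb_loc) == 3
    for (int i = 0; i < nbchunk; i++) {
        // same formula as ib_loc above: min(nb_loc, n_loc - i*nb_loc)
        int ib_loc = (n_loc - i*nb_loc < nb_loc) ? (n_loc - i*nb_loc) : nb_loc;
        printf( "chunk %d: width %d\n", i, ib_loc );  // prints 128, 128, 44
    }
    return 0;
}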
/**
    Purpose
    -------
    ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       COMPLEX_16 array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector which
            defines the elementary reflector H(i), for i = 1,2,...,k, as
            returned by ZGEQRF_GPU in the first k columns of its array
            argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF_GPU.

    @param[in]
    dT      COMPLEX_16 array on the GPU device.
            DT contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_zgeqrf_gpu.

    @param[in]
    nb      INTEGER
            This is the block size used in ZGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in DT.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zungqr(magma_int_t m, magma_int_t n, magma_int_t k,
             magmaDoubleComplex *A, magma_int_t lda,
             magmaDoubleComplex *tau,
             magmaDoubleComplex *dT, magma_int_t nb,
             magma_int_t *info)
{
#define  A(i,j) ( A + (i) + (j)*lda )
#define dA(i,j) (dA + (i) + (j)*ldda)
#define dT(j)   (dT + (j)*nb)

    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldda;
    magma_int_t i, ib, ki, kk;  //, iinfo;
    magma_int_t lddwork;
    magmaDoubleComplex *dA, *dV, *dW;
    magmaDoubleComplex *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate GPU work space
    // ldda*n     for matrix dA
    // ldda*nb    for dV
    // lddwork*nb for dW larfb workspace
    ldda    = ((m + 31) / 32) * 32;
    lddwork = ((n + 31) / 32) * 32;
    if (MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dV = dA + ldda*n;
    dW = dA + ldda*n + ldda*nb;

    // Allocate CPU work space
    lwork = (n + m + nb) * nb;
    magma_zmalloc_cpu( &work, lwork );
    if (work == NULL) {
        magma_free( dA );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    magmaDoubleComplex *V = work + (n+nb)*nb;

    magma_queue_t stream;
    magma_queue_create( &stream );

    // Use unblocked code for the last or only block.
    if (kk < n) {
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        /*
        // lapackf77_zungqr requires less workspace but is slow for k smaller
        // than zungqr's blocking size; replacing it with the 4 routines below
        // can be up to 60x faster.
        lapackf77_zungqr( &m_kk, &n_kk, &k_kk,
                          A(kk, kk), &lda,
                          &tau[kk], work, &lwork, &iinfo );
        */
        lapackf77_zlacpy( MagmaUpperLowerStr, &m_kk, &k_kk,
                          A(kk,kk), &lda, V, &m_kk );
        lapackf77_zlaset( MagmaUpperLowerStr, &m_kk, &n_kk,
                          &c_zero, &c_one, A(kk, kk), &lda );
        lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          V, &m_kk, &tau[kk], work, &k_kk );
        lapackf77_zlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          V, &m_kk, work, &k_kk, A(kk, kk), &lda,
                          work+k_kk*k_kk, &n_kk );

        if (kk > 0) {
            magma_zsetmatrix( m_kk, n_kk,
                              A(kk, kk),  lda,
                              dA(kk, kk), ldda );

            // Set A(1:kk,kk+1:n) to zero.
            magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero,
                              dA(0, kk), ldda );
        }
    }

    if (kk > 0) {
        // Use blocked code
        // stream: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        magmablasSetKernelStream( stream );

        for (i = ki; i >= 0; i -= nb) {
            ib = min(nb, k - i);
            // Send current panel to the GPU
            mi = m - i;
            lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            magma_zsetmatrix_async( mi, ib,
                                    A(i, i), lda,
                                    dV,      ldda, stream );
            // set panel to identity
            magmablas_zlaset( MagmaFull, i,  ib, c_zero, c_zero, dA(0, i), ldda );
            magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one,  dA(i, i), ldda );

            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                  mi, n-i, ib,
                                  dV, ldda, dT(i), nb,
                                  dA(i, i), ldda, dW, lddwork );
            }
        }

        // copy result back to CPU
        magma_zgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda );
    }

    magma_queue_destroy( stream );
    magma_free( dA );
    magma_free_cpu( work );

    magmablasSetKernelStream( orig_stream );

    return *info;
} /* magma_zungqr */
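/* This older interface routes device work through the implicit magmablas
   kernel stream. The save/restore pattern it uses generalizes to any routine
   that temporarily switches streams; a minimal sketch, assuming the MAGMA
   1.x stream API used above: */
// magma_queue_t orig_stream, stream;
// magmablasGetKernelStream( &orig_stream );   // remember the caller's stream
// magma_queue_create( &stream );
// magmablasSetKernelStream( stream );         // route kernels to our stream
// /* ... magmablas_* and magma_*_async work here ... */
// magma_queue_destroy( stream );
// magmablasSetKernelStream( orig_stream );    // restore on every exit path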