extern "C" magma_int_t magma_zunmqr_gpu( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dC, size_t dC_offset, magma_int_t lddc, magmaDoubleComplex *hwork, magma_int_t lwork, magmaDoubleComplex_ptr dT, size_t dT_offset, magma_int_t nb, magma_queue_t queue, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZUNMQR_GPU overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. DA (input) COMPLEX_16 array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument DA. DA is modified by the routine but restored on exit. LDDA (input) INTEGER The leading dimension of the array DA. If SIDE = 'L', LDDA >= max(1,M); if SIDE = 'R', LDDA >= max(1,N). TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. DC (input/output) COMPLEX_16 array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDDC (input) INTEGER The leading dimension of the array DC. LDDC >= max(1,M). HWORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array HWORK. LWORK >= (M-K+NB)*(N+2*NB) if SIDE = 'L', and LWORK >= (N-K+NB)*(M+2*NB) if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array, and no error message related to LWORK is issued by XERBLA. DT (input) COMPLEX_16 array on the GPU that is the output (the 9th argument) of magma_zgeqrf_gpu. NB (input) INTEGER This is the blocking size that was used in pre-computing DT, e.g., the blocking size used in magma_zgeqrf_gpu. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) dA, (dA_offset+(a_1)+(a_2)*(ldda)) #define c_ref(a_1,a_2) dC, (dC_offset+(a_1)+(a_2)*(lddc)) #define t_ref(a_1) dT, (dT_offset+(a_1)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex_ptr dwork; magma_int_t i, lddwork; magma_int_t i1, i2, i3, ib, ic, jc, mi, ni, nq, nw, ret; int left, notran, lquery; magma_int_t lwkopt; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); if (!left || notran) printf("zunmqr_gpu called with arguments not yet supported\n"); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if ( (!left) && (side != MagmaRight) ) { *info = -1; } else if ( (!notran) && (trans != MagmaConjTrans) ) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } lwkopt = (m-k+nb)*(n+2*nb); hwork[0] = MAGMA_Z_MAKE( lwkopt, 0 ); if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { hwork[0] = c_one; return *info; } lddwork= k; dwork = dT; size_t dwork_offset = 2*lddwork*nb; if ( (left && (! notran)) || ( (!left) && notran ) ) { i1 = 0; i2 = k-nb; i3 = nb; } else { i1 = (k - 1 - nb) / nb * nb; i2 = 0; i3 = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } if (nb < k) { for (i=i1; i3<0 ? i>i2 : i<i2; i+=i3) { ib = min(nb, k - i); if (left){ mi = m - i; ic = i; } else { ni = n - i; jc = i; } ret = magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, mi, ni, ib, a_ref(i, i ), ldda, t_ref(i), nb, c_ref(ic, jc), lddc, dwork, dwork_offset, nw, queue); if ( ret != MAGMA_SUCCESS ) return ret; } } else { i = i1; } /* Use unblocked code to multiply the last or only block. */ if (i < k) { ib = k-i; if (left){ mi = m - i; ic = i; } else { ni = n - i; jc = i; } magma_zgetmatrix(mi, ib, a_ref(i, i), ldda, hwork, mi, queue); magma_zgetmatrix(mi, ni, c_ref(ic, jc), lddc, hwork+mi*ib, mi, queue); magma_int_t lhwork = lwork - mi*(ib + ni); lapackf77_zunmqr( MagmaLeftStr, MagmaConjTransStr, &mi, &ni, &ib, hwork, &mi, tau+i, hwork+mi*ib, &mi, hwork+mi*(ib+ni), &lhwork, info); // send the updated part of c back to the GPU magma_zsetmatrix(mi, ni, hwork+mi*ib, mi, c_ref(ic, jc), lddc, queue); } return *info; /* End of MAGMA_ZUNMQR_GPU */ }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrs */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, matnorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex *d_A, *d_B; magma_int_t M, N, n2, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===============================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; if ( M < N ) { printf( "skipping M=%d, N=%d because M < N is not yet supported.\n", (int) M, (int) N ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; n2 = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; // query for workspace size lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; lhwork = -1; lapackf77_zgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = -1; lapackf77_zunmqr( MagmaLeftStr, MagmaConjTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_work, magmaDoubleComplex, lhwork ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*N ); TESTING_DEVALLOC( d_B, magmaDoubleComplex, lddb*nrhs ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS n2 = M*nrhs; lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //n2 = N*nrhs; //lapackf77_zlarnv( &ione, ISEED, &n2, h_X ); //blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_zgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*matnorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*matnorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error ); printf("%s\n", (gpu_error < tol ? "" : " failed")); status |= ! (gpu_error < tol); TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_A2 ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( h_R ); TESTING_FREE( h_work ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_zunmqr(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *C, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. C (input/output) COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define A(a_1,a_2) ( A + (a_1) + (a_2)*lda) #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc) magma_int_t nb = magma_get_zgeqrf_nb( min( m, n )); cuDoubleComplex c_one = MAGMA_Z_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magma_int_t nq_i, lddwork; magma_int_t i; cuDoubleComplex T[ 2*nb*nb ]; magma_int_t i1, i2, step, ib, ic, jc, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } lwkopt = max(1,nw) * nb; work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(trans_, MagmaConjTransStr)) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } /* Allocate work space on the GPU */ magma_int_t lddc = m; cuDoubleComplex *dwork, *dC; magma_zmalloc( &dC, lddc*n ); magma_zmalloc( &dwork, (m + n + nb)*nb ); /* Copy matrix C from the CPU to the GPU */ magma_zsetmatrix( m, n, C, ldc, dC, lddc ); if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, &tau[1], C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( (left && (! notran)) || ((! left) && notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } for( i=i1; (step<0 ? i>=i2 : i<i2); i += step ) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ nq_i = nq - i; lapackf77_zlarft("F", "C", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ zpanel_to_q('U', ib, A(i,i), lda, T+ib*ib); magma_zsetmatrix( nq_i, ib, A(i,i), lda, dwork, nq_i ); zq_to_panel('U', ib, A(i,i), lda, T+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i; jc = i; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_zsetmatrix( ib, ib, T, ib, dwork+nq_i*ib, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, nq_i, dwork+nq_i*ib, ib, dC(ic,jc), lddc, dwork+nq_i*ib + ib*ib, lddwork); } magma_zgetmatrix( m, n, dC, lddc, C, ldc ); } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); magma_free( dC ); magma_free( dwork ); return *info; } /* magma_zunmqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zunmqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaDoubleComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaConjTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_zgeqrf_nb( m ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_ZUNMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqrf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, magmaDoubleComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaDoubleComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaDoubleComplex, lda*k ); TESTING_MALLOC_CPU( W, magmaDoubleComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, k ); // C is full, m x n size = ldc*n; lapackf77_zlarnv( &ione, ISEED, &size, C ); lapackf77_zlacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_zlarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_zgeqrf( mm, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_zunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_zunmqr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_Z_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_zunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_zlange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_zaxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_zlange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) /* Constants */ magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; /* Local variables */ const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; magmaDoubleComplex *T = NULL; magmaDoubleComplex_ptr dw[MagmaMaxGPUs] = { NULL }; magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }}; magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }}; magma_int_t ind_c; magma_device_t dev; magma_device_t orig_dev; magma_getdevice( &orig_dev ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = magma_zmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = magma_roundup( m, 64 ); // TODO why 64 instead of 32 ? magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l; ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) if (MAGMA_SUCCESS != magma_zmalloc_pinned( &T, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_zmalloc( &dw[dev], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &events[dev][0] ); magma_event_create( &events[dev][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] ); nlocal[dev] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied magma_zsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); // set upper triangular part of dA to identity magmablas_zlaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_zsetmatrix_async(kb, kb, T, kb, dT(dev, ind_c), kb, queues[dev][0] ); } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); // check if the data was copied magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[dev], kb, dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb, dC(dev, i, 0), lddc, dwork(dev, ind_c), lddwork, queues[dev][1] ); magma_event_record(events[dev][ind_c], queues[dev][1] ); } ind_c = (ind_c+1)%2; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zgetmatrix( m, kb, dC(dev, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc, queues[dev][1] ); // magma_zgetmatrix_async( m, kb, // dC(dev, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, queues[dev][0] ); } } else { *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); goto cleanup; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] ); magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork, queues[dev][1] ); } */ } cleanup: work[0] = magma_zmake_lwork( lwkopt ); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_destroy( events[dev][0] ); magma_event_destroy( events[dev][1] ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_free( dw[dev] ); } magma_setdevice( orig_dev ); magma_free_pinned( T ); return *info; } /* magma_zunmqr */
extern "C" magma_int_t magma_zunmqr(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *c, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- MAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= ZUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. C (input/output) COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_side_t side_ = side; magma_trans_t trans_ = trans; /* Allocate work space on the GPU */ magmaDoubleComplex_ptr dwork, dc; magma_zmalloc( &dc, (m)*(n) ); magma_zmalloc( &dwork, (m + n + 64)*64 ); /* Copy matrix C from the CPU to the GPU */ magma_zsetmatrix( m, n, c, 0, ldc, dc, 0, m, queue ); //dc -= (1 + m); size_t dc_offset = -(1+m); magma_int_t a_offset, c_offset, i__4, lddwork; magma_int_t i__; magmaDoubleComplex t[2*4160] /* was [65][64] */; magma_int_t i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; a_offset = 1 + lda; a -= a_offset; --tau; c_offset = 1 + ldc; c -= c_offset; *info = 0; left = lapackf77_lsame(lapack_const(side_), "L"); notran = lapackf77_lsame(lapack_const(trans_), "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(lapack_const(side_), "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(lapack_const(trans_), "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = max(1,nw) * nb; MAGMA_Z_SET2REAL( work[0], lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(lapack_const(side_), lapack_const(trans_), &m, &n, &k, &a[a_offset], &lda, &tau[1], &c[c_offset], &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( ( left && (! notran) ) || ( (! left) && notran ) ) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } if (left) { ni = n; jc = 1; } else { mi = m; ic = 1; } for (i__ = i1; i3 < 0 ? i__ >= i2 : i__ <= i2; i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i__ + 1; lapackf77_zlarft("F", "C", &i__4, &ib, &a[i__ + i__ * lda], &lda, &tau[i__], t, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ zpanel_to_q(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); magma_zsetmatrix( i__4, ib, &a[i__ + i__ * lda], 0, lda, dwork, 0, i__4, queue ); zq_to_panel(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i__ + 1; ic = i__; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i__ + 1; jc = i__; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_zsetmatrix( ib, ib, t, 0, ib, dwork, i__4*ib, ib, queue ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, 0, i__4, dwork, i__4*ib, ib, dc, dc_offset+(ic + jc * m), m, dwork, (i__4*ib + ib*ib), lddwork, queue); } magma_zgetmatrix( m, n, dc, dc_offset+(1+m), m, &c[c_offset], 0, ldc, queue ); } MAGMA_Z_SET2REAL( work[0], lwkopt ); //dc += (1 + m); magma_free( dc ); magma_free( dwork ); return *info; } /* magma_zunmqr */
extern "C" magma_int_t magma_zunmqr_m(magma_int_t nrgpu, char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. C (input/output) COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ cuDoubleComplex c_one = MAGMA_Z_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; cuDoubleComplex* dw[MagmaMaxGPUs]; cudaStream_t stream [MagmaMaxGPUs][2]; magma_int_t ind_c, kb; magma_int_t i__4; magma_int_t i; cuDoubleComplex t[4160]; /* was [65][64] */ magma_int_t i1, i2, i3, ib, nb, nq, nw; magma_int_t left, notran, lquery; magma_int_t iinfo, lwkopt; magma_int_t igpu = 0; int gpu_b; magma_getdevice(&gpu_b); *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(trans_, "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = max(1,nw) * nb; MAGMA_Z_SET2REAL( work[0], lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar =nb; magma_int_t lddwork = nw; magma_int_t n_l = (n+nrgpu-1)/nrgpu; // local n n_l = ((n_l+63)/64)*64; if (n_l<256) n_l=256; nrgpu = min(nrgpu, (n+n_l-1)/n_l); // Don't use GPU that will not have data. for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magmablasSetKernelStream(NULL); if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (n_l*lddc + 2*lddac*lddar + 2*(nb + 1 + lddwork)*nb))) { magma_xerbla( __func__, -(*info) ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); kb = min(n_l, n-igpu*n_l); magma_zsetmatrix_async( m, kb, C(0, igpu*n_l), ldc, dC(igpu, 0, 0), lddc, stream[igpu][0] ); } if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } kb = min(nb, k-i1); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_zsetmatrix_async( nq-i1, kb, A(i1, i1), lda, dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] ); } ind_c = 0; for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], t, &ib); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_zsetmatrix_async( ib, ib, t, ib, dt(igpu, ind_c), ib, stream[igpu][ind_c] ); magma_queue_sync( stream[igpu][ind_c] ); // Makes sure that we can change t next iteration. } // start the copy of next A panel kb = min(nb, k - i - i3); if (kb > 0 && i+i3 >= 0){ for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_zsetmatrix_async( nq-(i+i3), kb, A(i+i3, i+i3), lda, dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] ); } } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); // Put 0s in the upper triangular part of dA; magmablas_zsetdiag1subdiag0_stream('L', ib, ib, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][ind_c]); kb = min(n_l, n-igpu*n_l); magmablasSetKernelStream(stream[igpu][ind_c]); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, kb, ib, dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), ib, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); } ind_c = (ind_c+1)%2; } //copy C from mgpus for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); magma_queue_sync( stream[igpu][1] ); kb = min(n_l, n-igpu*n_l); //asynchronous copy gives problems sometimes... // magma_zgetmatrix_async( m, kb, // dC(igpu, 0, 0), lddc, // C(0, igpu*n_l), ldc, stream[igpu][0] ); magma_zgetmatrix( m, kb, dC(igpu, 0, 0), lddc, C(0, igpu*n_l), ldc ); } } else { fprintf(stderr, "The case (side == right) is not implemented\n"); magma_xerbla( __func__, 1 ); return *info; /*if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], t, &ib); // 1) copy the panel from A to the GPU, and // 2) Put 0s in the upper triangular part of dA; magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_zsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, t, ib, dt, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dt, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } } MAGMA_Z_SET2REAL( work[0], lwkopt ); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); magmablasSetKernelStream(NULL); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice(gpu_b); return *info; } /* magma_zunmqr */
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] nrgpu INTEGER Number of GPUs to use. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr_m(magma_int_t nrgpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; magmaDoubleComplex *T; magma_zmalloc_pinned(&T, nb*nb); //printf("calling zunmqr_m with nb=%d\n", (int) nb); magmaDoubleComplex* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_int_t igpu = 0; int gpu_b; magma_getdevice(&gpu_b); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l; nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], ldw )) { magma_xerbla( __func__, -(*info) ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_zsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_zlaset_band_stream( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_zsetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zgetmatrix( m, kb, dC(igpu, 0, i/nrgpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_zgetmatrix_async( m, kb, // dC(igpu, 0, i/nrgpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, T, ib, dT, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magmablasSetKernelStream(NULL); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice(gpu_b); return *info; } /* magma_zunmqr */
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define dV(i_,j_) (dV + (i_) + (j_)*nq_i) #define dT(i_,j_) (dT + (i_) + (j_)*ib) #define dwork(i_) (dwork + (i_)) magmaDoubleComplex *T, *T2; magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; magma_int_t left, notran, lquery; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_zgelqf_nb( m, n ); lwkopt = max(1,nw)*nb; work[0] = magma_zmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_Z_ONE; return *info; } ldwork = nw; if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = magma_roundup( m, 32 ); magmaDoubleComplex_ptr dwork, dV, dT, dC; magma_zmalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_zmalloc_cpu( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; /* Copy matrix C from the CPU to the GPU */ magma_zsetmatrix( m, n, C, ldc, dC(0,0), lddc, queue ); if ( (left && ! notran) || (! left && notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ nq_i = nq - i; lapackf77_zlarft( "Forward", "Columnwise", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib ); /* 1) set upper triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ magma_zpanel_to_q( MagmaUpper, ib, A(i,i), lda, T2 ); magma_zsetmatrix( nq_i, ib, A(i,i), lda, dV(0,0), nq_i, queue ); magma_zq_to_panel( MagmaUpper, ib, A(i,i), lda, T2 ); if (left) { /* H or H**H is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H**H is applied to C(1:m,i:n) */ ni = n - i; jc = i; } /* Apply H or H**H; First copy T to the GPU */ magma_zsetmatrix( ib, ib, T, ib, dT(0,0), ib, queue ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dV(0,0), nq_i, dT(0,0), ib, dC(ic,jc), lddc, dwork(0), ldwork, queue ); } magma_zgetmatrix( m, n, dC(0,0), lddc, C, ldc, queue ); magma_queue_destroy( queue ); magma_free( dwork ); magma_free_cpu( T ); } work[0] = magma_zmake_lwork( lwkopt ); return *info; } /* magma_zunmqr */