void magmaf_dormqr2_gpu( magma_side_t *side, magma_trans_t *trans, magma_int_t *m, magma_int_t *n, magma_int_t *k, devptr_t *da, magma_int_t *ldda, double *tau, devptr_t *dc, magma_int_t *lddc, double *wa, magma_int_t *ldwa, magma_int_t *info ) { magma_dormqr2_gpu( *side, *trans, *m, *n, *k, magma_ddevptr(da), *ldda, tau, magma_ddevptr(dc), *lddc, wa, *ldwa, info ); }
extern "C" magma_int_t magma_dormtr_gpu(char side, char uplo, char trans, magma_int_t m, magma_int_t n, double *da, magma_int_t ldda, double *tau, double *dc, magma_int_t lddc, double *wa, magma_int_t ldwa, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= DORMTR overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix of order nq, with nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of nq-1 elementary reflectors, as returned by SSYTRD: if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1); if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1). Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. UPLO (input) CHARACTER*1 = 'U': Upper triangle of A contains elementary reflectors from SSYTRD; = 'L': Lower triangle of A contains elementary reflectors from SSYTRD. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. DA (device input) DOUBLE_PRECISION array, dimension (LDDA,M) if SIDE = 'L' (LDDA,N) if SIDE = 'R' The vectors which define the elementary reflectors, as returned by DSYTRD_GPU. On output the diagonal, the subdiagonal and the upper part(UPLO='L') or lower part(UPLO='U') are destroyed. LDDA (input) INTEGER The leading dimension of the array DA. LDDA >= max(1,M) if SIDE = 'L'; LDDA >= max(1,N) if SIDE = 'R'. TAU (input) DOUBLE_PRECISION array, dimension (M-1) if SIDE = 'L' (N-1) if SIDE = 'R' TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SSYTRD. DC (device input/output) DOUBLE_PRECISION array, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. LDDC (input) INTEGER The leading dimension of the array C. LDDC >= max(1,M). WA (input/workspace) DOUBLE_PRECISION array, dimension (LDWA,M) if SIDE = 'L' (LDWA,N) if SIDE = 'R' The vectors which define the elementary reflectors, as returned by DSYTRD_GPU. LDWA (input) INTEGER The leading dimension of the array A. LDWA >= max(1,M) if SIDE = 'L'; LDWA >= max(1,N) if SIDE = 'R'. WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ double c_one = MAGMA_D_ONE; char side_[2] = {side, 0}; char uplo_[2] = {uplo, 0}; char trans_[2] = {trans, 0}; magma_int_t i__2; magma_int_t i1, i2, mi, ni, nq, nw; int left, upper; magma_int_t iinfo; *info = 0; left = lapackf77_lsame(side_, "L"); upper = lapackf77_lsame(uplo_, "U"); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -2; } else if (! lapackf77_lsame(trans_, "N") && ! lapackf77_lsame(trans_, "C")) { *info = -3; } else if (m < 0) { *info = -4; } else if (n < 0) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (ldwa < max(1,nq)) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || nq == 1) { return *info; } if (left) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } if (upper) { i__2 = nq - 1; magma_dormql2_gpu(side, trans, mi, ni, i__2, &da[ldda], ldda, tau, dc, lddc, &wa[ldwa], ldwa, &iinfo); } else { /* Q was determined by a call to SSYTRD with UPLO = 'L' */ if (left) { i1 = 1; i2 = 0; } else { i1 = 0; i2 = 1; } i__2 = nq - 1; magma_dormqr2_gpu(side, trans, mi, ni, i__2, &da[1], ldda, tau, &dc[i1 + i2 * lddc], lddc, &wa[1], ldwa, &iinfo); } return *info; } /* dormtr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dormqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Cnorm, error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; double *C, *R, *A, *hwork, *tau; magmaDouble_ptr dC, dA, dT; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; ldc = magma_roundup( m, opts.align ); // multiple of 32 by default // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); nb = magma_get_dgeqrf_nb( mm, k ); lda = magma_roundup( mm, opts.align ); // multiple of 32 by default gflops = FLOPS_DORMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaLeft ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + magma_roundup( max(m,n), 32) )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + magma_roundup( max(m,n), 32 ) )*nb; } // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_dmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, double, ldc*n ); TESTING_MALLOC_CPU( R, double, ldc*n ); TESTING_MALLOC_CPU( A, double, lda*k ); TESTING_MALLOC_CPU( hwork, double, lwork_max ); TESTING_MALLOC_CPU( tau, double, k ); TESTING_MALLOC_DEV( dC, double, ldc*n ); TESTING_MALLOC_DEV( dA, double, lda*k ); TESTING_MALLOC_DEV( dT, double, dt_size ); // C is full, m x n size = ldc*n; lapackf77_dlarnv( &ione, ISEED, &size, C ); magma_dsetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) size = lda*k; lapackf77_dlarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_dsetmatrix( mm, k, A, lda, dA, lda ); magma_dgeqrf_gpu( mm, k, dA, lda, tau, dT, &info ); magma_dgetmatrix( mm, k, dA, lda, A, lda ); if (info != 0) { printf("magma_dgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dormqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, hwork, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dormqr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_dormqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); if (info != 0) { printf("magma_dormqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_D_REAL( hwork[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } // dormqr2 takes a copy of dA in CPU memory if ( opts.version == 2 ) { magma_dgetmatrix( mm, k, dA, lda, A, lda ); } magmablasSetKernelStream( opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); // sync needed for L,N and R,T cases if ( opts.version == 1 ) { magma_dormqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); } else if ( opts.version == 2 ) { magma_dormqr2_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, A, lda, &info ); } gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dormqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_dlange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, work ) / (magma_dsqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- DORMTR overwrites the general real M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H where Q is a real unitary matrix of order nq, with nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Q is defined as the product of nq-1 elementary reflectors, as returned by DSYTRD: if UPLO = MagmaUpper, Q = H(nq-1) . . . H(2) H(1); if UPLO = MagmaLower, Q = H(1) H(2) . . . H(nq-1). Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A contains elementary reflectors from DSYTRD; - = MagmaLower: Lower triangle of A contains elementary reflectors from DSYTRD. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] dA DOUBLE_PRECISION array, dimension (LDDA,M) if SIDE = MagmaLeft (LDDA,N) if SIDE = MagmaRight The vectors which define the elementary reflectors, as returned by DSYTRD_GPU. On output the diagonal, the subdiagonal and the upper part (UPLO=MagmaLower) or lower part (UPLO=MagmaUpper) are destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,M) if SIDE = MagmaLeft; LDDA >= max(1,N) if SIDE = MagmaRight. @param[in] tau DOUBLE_PRECISION array, dimension (M-1) if SIDE = MagmaLeft (N-1) if SIDE = MagmaRight TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DSYTRD. @param[in,out] dC DOUBLE_PRECISION array, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q). @param[in] lddc INTEGER The leading dimension of the array C. LDDC >= max(1,M). @param[in] wA (workspace) DOUBLE_PRECISION array, dimension (LDWA,M) if SIDE = MagmaLeft (LDWA,N) if SIDE = MagmaRight The vectors which define the elementary reflectors, as returned by DSYTRD_GPU. @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,M) if SIDE = MagmaLeft; LDWA >= max(1,N) if SIDE = MagmaRight. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dsyev_comp ********************************************************************/ extern "C" magma_int_t magma_dormtr_gpu( magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, double *tau, magmaDouble_ptr dC, magma_int_t lddc, double *wA, magma_int_t ldwa, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define wA(i_,j_) (wA + (i_) + (j_)*ldwa) magma_int_t i1, i2, mi, ni, nq; int left, upper; magma_int_t iinfo; *info = 0; left = (side == MagmaLeft); upper = (uplo == MagmaUpper); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; //nw = n; } else { nq = n; //nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! upper && uplo != MagmaLower) { *info = -2; } else if (trans != MagmaNoTrans && trans != MagmaTrans) { *info = -3; } else if (m < 0) { *info = -4; } else if (n < 0) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (ldwa < max(1,nq)) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || nq == 1) { return *info; } if (left) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } if (upper) { magma_dormql2_gpu(side, trans, mi, ni, nq-1, dA(0,1), ldda, tau, dC, lddc, wA(0,1), ldwa, &iinfo); } else { /* Q was determined by a call to DSYTRD with UPLO = 'L' */ if (left) { i1 = 1; i2 = 0; } else { i1 = 0; i2 = 1; } magma_dormqr2_gpu(side, trans, mi, ni, nq-1, dA(1,0), ldda, tau, dC(i1,i2), lddc, wA(1,0), ldwa, &iinfo); } return *info; } /* magma_dormtr */