/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Cnorm, error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; magmaFloatComplex *C, *R, *A, *hwork, *tau; magmaFloatComplex_ptr dC, dA, dT; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; ldc = magma_roundup( m, opts.align ); // multiple of 32 by default // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); nb = magma_get_cgeqrf_nb( mm, k ); lda = magma_roundup( mm, opts.align ); // multiple of 32 by default gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaLeft ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + magma_roundup( max(m,n), 32) )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + magma_roundup( max(m,n), 32 ) )*nb; } // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_cmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*k ); TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); TESTING_MALLOC_DEV( dC, magmaFloatComplex, ldc*n ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, lda*k ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, dt_size ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_csetmatrix( mm, k, A, lda, dA, lda ); magma_cgeqrf_gpu( mm, k, dA, lda, tau, dT, &info ); magma_cgetmatrix( mm, k, dA, lda, A, lda ); if (info != 0) { printf("magma_cgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, hwork, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); if (info != 0) { printf("magma_cunmqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_C_REAL( hwork[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } // cunmqr2 takes a copy of dA in CPU memory if ( opts.version == 2 ) { magma_cgetmatrix( mm, k, dA, lda, A, lda ); } magmablasSetKernelStream( opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); // sync needed for L,N and R,T cases if ( opts.version == 1 ) { magma_cunmqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); } else if ( opts.version == 2 ) { magma_cunmqr2_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, A, lda, &info ); } gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cunmqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / (magma_ssqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- CUNMTR overwrites the general complex M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H where Q is a complex unitary matrix of order nq, with nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Q is defined as the product of nq-1 elementary reflectors, as returned by CHETRD: if UPLO = MagmaUpper, Q = H(nq-1) . . . H(2) H(1); if UPLO = MagmaLower, Q = H(1) H(2) . . . H(nq-1). Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A contains elementary reflectors from CHETRD; - = MagmaLower: Lower triangle of A contains elementary reflectors from CHETRD. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] dA COMPLEX array, dimension (LDDA,M) if SIDE = MagmaLeft (LDDA,N) if SIDE = MagmaRight The vectors which define the elementary reflectors, as returned by CHETRD_GPU. On output the diagonal, the subdiagonal and the upper part (UPLO=MagmaLower) or lower part (UPLO=MagmaUpper) are destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,M) if SIDE = MagmaLeft; LDDA >= max(1,N) if SIDE = MagmaRight. @param[in] tau COMPLEX array, dimension (M-1) if SIDE = MagmaLeft (N-1) if SIDE = MagmaRight TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CHETRD. @param[in,out] dC COMPLEX array, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q). @param[in] lddc INTEGER The leading dimension of the array C. LDDC >= max(1,M). @param[in] wA (workspace) COMPLEX array, dimension (LDWA,M) if SIDE = MagmaLeft (LDWA,N) if SIDE = MagmaRight The vectors which define the elementary reflectors, as returned by CHETRD_GPU. @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,M) if SIDE = MagmaLeft; LDWA >= max(1,N) if SIDE = MagmaRight. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cheev_comp ********************************************************************/ extern "C" magma_int_t magma_cunmtr_gpu(magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_int_t m, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *tau, magmaFloatComplex *dC, magma_int_t lddc, magmaFloatComplex *wA, magma_int_t ldwa, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define wA(i_,j_) (wA + (i_) + (j_)*ldwa) magma_int_t i1, i2, mi, ni, nq, nw; int left, upper; magma_int_t iinfo; *info = 0; left = (side == MagmaLeft); upper = (uplo == MagmaUpper); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! upper && uplo != MagmaLower) { *info = -2; } else if (trans != MagmaNoTrans && trans != Magma_ConjTrans) { *info = -3; } else if (m < 0) { *info = -4; } else if (n < 0) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (ldwa < max(1,nq)) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || nq == 1) { return *info; } if (left) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } if (upper) { magma_cunmql2_gpu(side, trans, mi, ni, nq-1, dA(0,1), ldda, tau, dC, lddc, wA(0,1), ldwa, &iinfo); } else { /* Q was determined by a call to CHETRD with UPLO = 'L' */ if (left) { i1 = 1; i2 = 0; } else { i1 = 0; i2 = 1; } magma_cunmqr2_gpu(side, trans, mi, ni, nq-1, dA(1,0), ldda, tau, dC(i1,i2), lddc, wA(1,0), ldwa, &iinfo); } return *info; } /* magma_cunmtr */