inline void trmv( const Order, const UpLo, const Trans, const Diag, const int n, const std::complex<double>* a, const int lda, std::complex<double>* x, const int incx ) { BOOST_STATIC_ASSERT( (is_same<Order, tag::column_major>::value) ); cublasZtrmv( blas_option< UpLo >::value, blas_option< Trans >::value, blas_option< Diag >::value, n, a, lda, x, incx ); }
/** Perform triangular matrix-vector product. \f$ x = A x \f$ (trans == MagmaNoTrans), or \n \f$ x = A^T x \f$ (trans == MagmaTrans), or \n \f$ x = A^H x \f$ (trans == MagmaConjTrans). @param[in] uplo Whether the upper or lower triangle of A is referenced. @param[in] trans Operation to perform on A. @param[in] diag Whether the diagonal of A is assumed to be unit or non-unit. @param[in] n Number of rows and columns of A. n >= 0. @param[in] dA COMPLEX_16 array of dimension (ldda,n), ldda >= max(1,n). The n-by-n matrix A, on GPU device. @param[in] ldda Leading dimension of dA. @param[in] dx COMPLEX_16 array on GPU device. The n element vector x of dimension (1 + (n-1)*incx). @param[in] incx Stride between consecutive elements of dx. incx != 0. @ingroup magma_zblas2 */ extern "C" void magma_ztrmv( magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t n, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dx, magma_int_t incx ) { cublasZtrmv( cublas_uplo_const( uplo ), cublas_trans_const( trans ), cublas_diag_const( diag ), n, dA, ldda, dx, incx ); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrmv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N; magma_int_t Ak; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_x, *h_xcublas; magmaDoubleComplex *d_A, *d_x; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "uplo = %c, transA = %c, diag = %c \n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_ZTRMM(opts.side, N, 1) / 1e9; lda = N; Ak = N; ldda = ((lda+31)/32)*32; sizeA = lda*Ak; TESTING_MALLOC( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC( h_x, magmaDoubleComplex, N ); TESTING_MALLOC( h_xcublas, magmaDoubleComplex, N ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_DEVALLOC( d_x, magmaDoubleComplex, N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &N, h_x ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_zsetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasZtrmv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &ione, h_x, &N, work ); blasf77_zaxpy( &N, &c_neg_one, h_x, &ione, h_xcublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &ione, h_xcublas, &N, work ) / Cnorm; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE( h_A ); TESTING_FREE( h_x ); TESTING_FREE( h_xcublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrmv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N; magma_int_t Ak; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_x, *h_xcublas; magmaDoubleComplex_ptr d_A, d_x; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s, diag = %s \n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; gflops = FLOPS_ZTRMM(opts.side, N, 1) / 1e9; lda = N; Ak = N; ldda = ((lda+31)/32)*32; sizeA = lda*Ak; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC_CPU( h_x, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( h_xcublas, magmaDoubleComplex, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_x, magmaDoubleComplex, N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &N, h_x ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_zsetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasZtrmv( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrmv( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &ione, h_x, &N, work ); blasf77_zaxpy( &N, &c_neg_one, h_x, &ione, h_xcublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &ione, h_xcublas, &N, work ) / Cnorm; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_x ); TESTING_FREE_CPU( h_xcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_x ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }