void magma_zsyrk(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k,
    cuDoubleComplex alpha, cuDoubleComplex const* dA, magma_int_t lda,
    cuDoubleComplex beta,  cuDoubleComplex* dC, magma_int_t ldc )
{
    cublasZsyrk( cublas_uplo_const( uplo ), cublas_trans_const( trans ),
                 n, k, alpha, dA, lda, beta, dC, ldc );
}

void magma_ssyr(
    magma_uplo_t uplo, magma_int_t n,
    float alpha,
    const float *dx, magma_int_t incx,
    float       *dA, magma_int_t ldda )
{
    cublasSsyr( cublas_uplo_const( uplo ),
                n, alpha, dx, incx, dA, ldda );
}

void magma_dsyrk(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k,
    double alpha, double const* dA, magma_int_t lda,
    double beta,  double* dC, magma_int_t ldc )
{
    cublasDsyrk( cublas_uplo_const( uplo ), cublas_trans_const( trans ),
                 n, k, alpha, dA, lda, beta, dC, ldc );
}

void magma_cher(
    magma_uplo_t uplo, magma_int_t n,
    float alpha,
    const magmaFloatComplex *dx, magma_int_t incx,
    magmaFloatComplex       *dA, magma_int_t ldda )
{
    cublasCher( cublas_uplo_const( uplo ),
                n, alpha, dx, incx, dA, ldda );
}

void magma_ssyrk(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k,
    float alpha, float const* dA, magma_int_t lda,
    float beta,  float* dC, magma_int_t ldc )
{
    cublasSsyrk( cublas_uplo_const( uplo ), cublas_trans_const( trans ),
                 n, k, alpha, dA, lda, beta, dC, ldc );
}

void magma_dsyr(
    magma_uplo_t uplo, magma_int_t n,
    double alpha,
    const double *dx, magma_int_t incx,
    double       *dA, magma_int_t ldda )
{
    cublasDsyr( cublas_uplo_const( uplo ),
                n, alpha, dx, incx, dA, ldda );
}

void magma_ssymv(
    magma_uplo_t uplo, magma_int_t n,
    float alpha, float const* dA, magma_int_t lda,
                 float const* dx, magma_int_t incx,
    float beta,  float*       dy, magma_int_t incy )
{
    cublasSsymv( cublas_uplo_const( uplo ),
                 n, alpha, dA, lda, dx, incx, beta, dy, incy );
}

void magma_ctrsv(
    magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t n,
    const magmaFloatComplex *dA, magma_int_t ldda,
    magmaFloatComplex       *dx, magma_int_t incx )
{
    cublasCtrsv( cublas_uplo_const( uplo ), cublas_trans_const( trans ), cublas_diag_const( diag ),
                 n, dA, ldda, dx, incx );
}

void magma_dsymv(
    magma_uplo_t uplo, magma_int_t n,
    double alpha, double const* dA, magma_int_t lda,
                  double const* dx, magma_int_t incx,
    double beta,  double*       dy, magma_int_t incy )
{
    cublasDsymv( cublas_uplo_const( uplo ),
                 n, alpha, dA, lda, dx, incx, beta, dy, incy );
}
/**
    Perform Hermitian rank-1 update, \f$ A = \alpha x x^H + A \f$.

    @param[in]  uplo    Whether the upper or lower triangle of A is referenced.
    @param[in]  n       Number of rows and columns of A. n >= 0.
    @param[in]  alpha   Scalar \f$ \alpha \f$
    @param[in]  dx      COMPLEX_16 array on GPU device.
                        The n element vector x of dimension (1 + (n-1)*incx).
    @param[in]  incx    Stride between consecutive elements of dx. incx != 0.
    @param[in,out] dA   COMPLEX_16 array of dimension (ldda,n), ldda >= max(1,n).
                        The n-by-n matrix A, on GPU device.
    @param[in]  ldda    Leading dimension of dA.

    @ingroup magma_zblas2
*/
extern "C" void magma_zher(
    magma_uplo_t uplo, magma_int_t n,
    double alpha,
    magmaDoubleComplex_const_ptr dx, magma_int_t incx,
    magmaDoubleComplex_ptr       dA, magma_int_t ldda )
{
    cublasZher( cublas_uplo_const( uplo ),
                n, alpha, dx, incx, dA, ldda );
}
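/*
    Example (not part of the interface above): a minimal sketch of how magma_zher
    might be used to accumulate A := alpha * x x^H + A on the GPU. It assumes the
    v1-style MAGMA BLAS interface shown here (no queue argument), that magma.h is
    included and magma_init() has been called, and it omits error checking. The
    helper name example_zher_update is hypothetical.
*/
static void example_zher_update( magma_int_t n, const magmaDoubleComplex *hx )
{
    magmaDoubleComplex *dA, *dx, *hA;
    magma_zmalloc( &dA, (size_t)n * n );        // n-by-n Hermitian accumulator, ldda = n
    magma_zmalloc( &dx, n );
    magma_zmalloc_cpu( &hA, (size_t)n * n );

    // start from A = 0, then apply one rank-1 update touching only the lower triangle
    for( magma_int_t i = 0; i < n*n; ++i )
        hA[i] = MAGMA_Z_ZERO;
    magma_zsetmatrix( n, n, hA, n, dA, n );
    magma_zsetvector( n, hx, 1, dx, 1 );

    magma_zher( MagmaLower, n, 1.0, dx, 1, dA, n );   // A := 1.0 * x x^H + A

    magma_zgetmatrix( n, n, dA, n, hA, n );           // copy the result back if needed
    magma_free_cpu( hA );
    magma_free( dA );
    magma_free( dx );
}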
void magma_strsv(
    magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t n,
    float const *dA, magma_int_t lda,
    float       *dx, magma_int_t incx )
{
    cublasStrsv( cublas_uplo_const( uplo ), cublas_trans_const( trans ), cublas_diag_const( diag ),
                 n, dA, lda, dx, incx );
}

void magma_dtrsv(
    magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t n,
    const double *dA, magma_int_t ldda,
    double       *dx, magma_int_t incx )
{
    cublasDtrsv( cublas_uplo_const( uplo ), cublas_trans_const( trans ), cublas_diag_const( diag ),
                 n, dA, ldda, dx, incx );
}

void magma_dtrmm(
    magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
    magma_int_t m, magma_int_t n,
    double alpha, double const *dA, magma_int_t lda,
                  double       *dB, magma_int_t ldb )
{
    cublasDtrmm( cublas_side_const( side ), cublas_uplo_const( uplo ),
                 cublas_trans_const( trans ), cublas_diag_const( diag ),
                 m, n, alpha, dA, lda, dB, ldb );
}

void magma_strsm(
    magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
    magma_int_t m, magma_int_t n,
    float alpha, float const* dA, magma_int_t lda,
                 float*       dB, magma_int_t ldb )
{
    cublasStrsm( cublas_side_const( side ), cublas_uplo_const( uplo ),
                 cublas_trans_const( trans ), cublas_diag_const( diag ),
                 m, n, alpha, dA, lda, dB, ldb );
}

void magma_cherk(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k,
    float alpha, const magmaFloatComplex *dA, magma_int_t ldda,
    float beta,  magmaFloatComplex       *dC, magma_int_t lddc )
{
    cublasCherk( cublas_uplo_const( uplo ), cublas_trans_const( trans ),
                 n, k, alpha, dA, ldda, beta, dC, lddc );
}

void magma_ssymm(
    magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    float alpha, float const* dA, magma_int_t lda,
                 float const* dB, magma_int_t ldb,
    float beta,  float*       dC, magma_int_t ldc )
{
    cublasSsymm( cublas_side_const( side ), cublas_uplo_const( uplo ),
                 m, n, alpha, dA, lda, dB, ldb, beta, dC, ldc );
}

void magma_dsymm(
    magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    double alpha, double const* dA, magma_int_t lda,
                  double const* dB, magma_int_t ldb,
    double beta,  double*       dC, magma_int_t ldc )
{
    cublasDsymm( cublas_side_const( side ), cublas_uplo_const( uplo ),
                 m, n, alpha, dA, lda, dB, ldb, beta, dC, ldc );
}
/**
    Solve triangular matrix-vector system (one right-hand side).
        \f$ A   x = b \f$ (trans == MagmaNoTrans), or \n
        \f$ A^T x = b \f$ (trans == MagmaTrans),   or \n
        \f$ A^H x = b \f$ (trans == MagmaConjTrans).

    @param[in]  uplo    Whether the upper or lower triangle of A is referenced.
    @param[in]  trans   Operation to perform on A.
    @param[in]  diag    Whether the diagonal of A is assumed to be unit or non-unit.
    @param[in]  n       Number of rows and columns of A. n >= 0.
    @param[in]  dA      COMPLEX_16 array of dimension (ldda,n), ldda >= max(1,n).
                        The n-by-n matrix A, on GPU device.
    @param[in]  ldda    Leading dimension of dA.
    @param[in,out] dx   COMPLEX_16 array on GPU device.
                        On entry, the n element RHS vector b of dimension (1 + (n-1)*incx).
                        On exit, overwritten with the solution vector x.
    @param[in]  incx    Stride between consecutive elements of dx. incx != 0.

    @ingroup magma_zblas2
*/
extern "C" void magma_ztrsv(
    magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
    magma_int_t n,
    magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_ptr       dx, magma_int_t incx )
{
    cublasZtrsv( cublas_uplo_const( uplo ), cublas_trans_const( trans ), cublas_diag_const( diag ),
                 n, dA, ldda, dx, incx );
}
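/*
    Example (illustrative sketch only): solving a single lower-triangular system
    L x = b on the GPU with magma_ztrsv, overwriting the right-hand side in place.
    It assumes the v1-style interface above, dL holding an n-by-n lower-triangular
    factor with leading dimension lddl and db holding b on the device; the helper
    name is hypothetical and error checking is omitted.
*/
static void example_ztrsv_solve( magma_int_t n,
                                 magmaDoubleComplex_const_ptr dL, magma_int_t lddl,
                                 magmaDoubleComplex_ptr db )
{
    // On exit db contains x = L^{-1} b; use MagmaUnit instead of MagmaNonUnit
    // if the diagonal of L is implicitly 1 (e.g. an LU factor).
    magma_ztrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit,
                 n, dL, lddl, db, 1 );
}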
void magma_ctrsm(
    magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
    magma_int_t m, magma_int_t n,
    magmaFloatComplex alpha, const magmaFloatComplex *dA, magma_int_t ldda,
                             magmaFloatComplex       *dB, magma_int_t lddb )
{
    cublasCtrsm( cublas_side_const( side ), cublas_uplo_const( uplo ),
                 cublas_trans_const( trans ), cublas_diag_const( diag ),
                 m, n, alpha, dA, ldda, dB, lddb );
}
/**
    Perform Hermitian rank-k update.
        \f$ C = \alpha A A^H + \beta C \f$ (trans == MagmaNoTrans), or \n
        \f$ C = \alpha A^H A + \beta C \f$ (trans == MagmaConjTrans), \n
    where \f$ C \f$ is Hermitian.

    @param[in]  uplo    Whether the upper or lower triangle of C is referenced.
    @param[in]  trans   Operation to perform on A.
    @param[in]  n       Number of rows and columns of C. n >= 0.
    @param[in]  k       Number of columns of A (for MagmaNoTrans) or rows of A
                        (for MagmaConjTrans). k >= 0.
    @param[in]  alpha   Scalar \f$ \alpha \f$
    @param[in]  dA      COMPLEX_16 array on GPU device.
                        If trans == MagmaNoTrans, the n-by-k matrix A of dimension (ldda,k), ldda >= max(1,n); \n
                        otherwise, the k-by-n matrix A of dimension (ldda,n), ldda >= max(1,k).
    @param[in]  ldda    Leading dimension of dA.
    @param[in]  beta    Scalar \f$ \beta \f$
    @param[in,out] dC   COMPLEX_16 array on GPU device.
                        The n-by-n Hermitian matrix C of dimension (lddc,n), lddc >= max(1,n).
    @param[in]  lddc    Leading dimension of dC.

    @ingroup magma_zblas3
*/
extern "C" void magma_zherk(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k,
    double alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
    double beta,  magmaDoubleComplex_ptr       dC, magma_int_t lddc )
{
    cublasZherk( cublas_uplo_const( uplo ), cublas_trans_const( trans ),
                 n, k, alpha, dA, ldda, beta, dC, lddc );
}
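/*
    Example (illustrative sketch only): forming the Gram matrix C = A^H A with
    magma_zherk, which fills only the requested triangle of C. Note that alpha and
    beta are real doubles even though A and C are complex, as required for a
    Hermitian update. It assumes dA is k-by-n on the GPU with leading dimension
    ldda and dC is n-by-n with leading dimension lddc; names are hypothetical.
*/
static void example_zherk_gram( magma_int_t n, magma_int_t k,
                                magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
                                magmaDoubleComplex_ptr dC, magma_int_t lddc )
{
    // C := 1.0 * A^H A + 0.0 * C, upper triangle of C referenced
    magma_zherk( MagmaUpper, MagmaConjTrans, n, k,
                 1.0, dA, ldda,
                 0.0, dC, lddc );
}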
void magma_chemv(
    magma_uplo_t uplo, magma_int_t n,
    magmaFloatComplex alpha, const magmaFloatComplex *dA, magma_int_t ldda,
                             const magmaFloatComplex *dx, magma_int_t incx,
    magmaFloatComplex beta,  magmaFloatComplex       *dy, magma_int_t incy )
{
    cublasChemv( cublas_uplo_const( uplo ),
                 n, alpha, dA, ldda, dx, incx, beta, dy, incy );
}

void magma_chemm(
    magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magmaFloatComplex alpha, const magmaFloatComplex *dA, magma_int_t ldda,
                             const magmaFloatComplex *dB, magma_int_t lddb,
    magmaFloatComplex beta,  magmaFloatComplex       *dC, magma_int_t lddc )
{
    cublasChemm( cublas_side_const( side ), cublas_uplo_const( uplo ),
                 m, n, alpha, dA, ldda, dB, lddb, beta, dC, lddc );
}

void magma_csyr2k(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k,
    magmaFloatComplex alpha, const magmaFloatComplex *dA, magma_int_t ldda,
                             const magmaFloatComplex *dB, magma_int_t lddb,
    magmaFloatComplex beta,  magmaFloatComplex       *dC, magma_int_t lddc )
{
    cublasCsyr2k( cublas_uplo_const( uplo ), cublas_trans_const( trans ),
                  n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc );
}
/**
    Solve triangular matrix-matrix system (multiple right-hand sides).
        \f$ op(A) X = \alpha B \f$ (side == MagmaLeft), or \n
        \f$ X op(A) = \alpha B \f$ (side == MagmaRight), \n
    where \f$ A \f$ is triangular.

    @param[in]  side    Whether A is on the left or right.
    @param[in]  uplo    Whether A is upper or lower triangular.
    @param[in]  trans   Operation to perform on A.
    @param[in]  diag    Whether the diagonal of A is assumed to be unit or non-unit.
    @param[in]  m       Number of rows of B. m >= 0.
    @param[in]  n       Number of columns of B. n >= 0.
    @param[in]  alpha   Scalar \f$ \alpha \f$
    @param[in]  dA      COMPLEX_16 array on GPU device.
                        If side == MagmaLeft, the m-by-m triangular matrix A of dimension (ldda,m), ldda >= max(1,m); \n
                        otherwise, the n-by-n triangular matrix A of dimension (ldda,n), ldda >= max(1,n).
    @param[in]  ldda    Leading dimension of dA.
    @param[in,out] dB   COMPLEX_16 array on GPU device.
                        On entry, m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m).
                        On exit, overwritten with the solution matrix X.
    @param[in]  lddb    Leading dimension of dB.

    @ingroup magma_zblas3
*/
extern "C" void magma_ztrsm(
    magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex alpha,
    magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_ptr       dB, magma_int_t lddb )
{
    cublasZtrsm( cublas_side_const( side ), cublas_uplo_const( uplo ),
                 cublas_trans_const( trans ), cublas_diag_const( diag ),
                 m, n, alpha, dA, ldda, dB, lddb );
}
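/*
    Example (illustrative sketch only): using magma_ztrsm to solve A X = B for a
    block of nrhs right-hand sides, e.g. applying the inverse of a lower-triangular
    Cholesky factor to several columns at once. It assumes dA is n-by-n lower
    triangular with leading dimension ldda and dB is n-by-nrhs with leading
    dimension lddb, both already on the GPU; names are hypothetical and B is
    overwritten with X.
*/
static void example_ztrsm_solve( magma_int_t n, magma_int_t nrhs,
                                 magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
                                 magmaDoubleComplex_ptr dB, magma_int_t lddb )
{
    magmaDoubleComplex one = MAGMA_Z_ONE;
    // B := alpha * A^{-1} B with A applied from the left; alpha = 1 leaves B unscaled
    magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                 n, nrhs, one, dA, ldda, dB, lddb );
}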
/**
    Perform Hermitian matrix-vector product, \f$ y = \alpha A x + \beta y \f$.

    @param[in]  uplo    Whether the upper or lower triangle of A is referenced.
    @param[in]  n       Number of rows and columns of A. n >= 0.
    @param[in]  alpha   Scalar \f$ \alpha \f$
    @param[in]  dA      COMPLEX_16 array of dimension (ldda,n), ldda >= max(1,n).
                        The n-by-n matrix A, on GPU device.
    @param[in]  ldda    Leading dimension of dA.
    @param[in]  dx      COMPLEX_16 array on GPU device.
                        The n element vector x of dimension (1 + (n-1)*incx).
    @param[in]  incx    Stride between consecutive elements of dx. incx != 0.
    @param[in]  beta    Scalar \f$ \beta \f$
    @param[in,out] dy   COMPLEX_16 array on GPU device.
                        The n element vector y of dimension (1 + (n-1)*incy).
    @param[in]  incy    Stride between consecutive elements of dy. incy != 0.

    @ingroup magma_zblas2
*/
extern "C" void magma_zhemv(
    magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex alpha,
    magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_const_ptr dx, magma_int_t incx,
    magmaDoubleComplex beta,
    magmaDoubleComplex_ptr       dy, magma_int_t incy )
{
    cublasZhemv( cublas_uplo_const( uplo ),
                 n, alpha, dA, ldda, dx, incx, beta, dy, incy );
}
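/*
    Example (illustrative sketch only): one Hermitian matrix-vector product
    y = alpha*A*x + beta*y with magma_zhemv. As in the testers below, the device
    leading dimension ldda may be padded (here rounded up to a multiple of 32)
    independently of the host lda. It assumes hA (n-by-n, Hermitian), hx, and hy
    are host arrays; the helper name is hypothetical and error checking is omitted.
*/
static void example_zhemv( magma_int_t n,
                           const magmaDoubleComplex *hA, magma_int_t lda,
                           const magmaDoubleComplex *hx,
                           magmaDoubleComplex *hy )
{
    magma_int_t ldda = ((n + 31)/32)*32;   // padded leading dimension on the GPU
    magmaDoubleComplex *dA, *dx, *dy;
    magma_zmalloc( &dA, (size_t)ldda * n );
    magma_zmalloc( &dx, n );
    magma_zmalloc( &dy, n );

    magma_zsetmatrix( n, n, hA, lda, dA, ldda );
    magma_zsetvector( n, hx, 1, dx, 1 );
    magma_zsetvector( n, hy, 1, dy, 1 );

    magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.0, 0.0 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( 0.0, 0.0 );
    magma_zhemv( MagmaLower, n, alpha, dA, ldda, dx, 1, beta, dy, 1 );

    magma_zgetvector( n, dy, 1, hy, 1 );
    magma_free( dA );
    magma_free( dx );
    magma_free( dy );
}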
/**
    Perform symmetric rank-2k update.
        \f$ C = \alpha A B^T + \alpha B A^T + \beta C \f$ (trans == MagmaNoTrans), or \n
        \f$ C = \alpha A^T B + \alpha B^T A + \beta C \f$ (trans == MagmaTrans), \n
    where \f$ C \f$ is symmetric.

    @param[in]  uplo    Whether the upper or lower triangle of C is referenced.
    @param[in]  trans   Operation to perform on A and B.
    @param[in]  n       Number of rows and columns of C. n >= 0.
    @param[in]  k       Number of columns of A and B (for MagmaNoTrans) or rows of A and B
                        (for MagmaTrans). k >= 0.
    @param[in]  alpha   Scalar \f$ \alpha \f$
    @param[in]  dA      COMPLEX_16 array on GPU device.
                        If trans == MagmaNoTrans, the n-by-k matrix A of dimension (ldda,k), ldda >= max(1,n); \n
                        otherwise, the k-by-n matrix A of dimension (ldda,n), ldda >= max(1,k).
    @param[in]  ldda    Leading dimension of dA.
    @param[in]  dB      COMPLEX_16 array on GPU device.
                        If trans == MagmaNoTrans, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n); \n
                        otherwise, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k).
    @param[in]  lddb    Leading dimension of dB.
    @param[in]  beta    Scalar \f$ \beta \f$
    @param[in,out] dC   COMPLEX_16 array on GPU device.
                        The n-by-n symmetric matrix C of dimension (lddc,n), lddc >= max(1,n).
    @param[in]  lddc    Leading dimension of dC.

    @ingroup magma_zblas3
*/
extern "C" void magma_zsyr2k(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k,
    magmaDoubleComplex alpha,
    magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_const_ptr dB, magma_int_t lddb,
    magmaDoubleComplex beta,
    magmaDoubleComplex_ptr       dC, magma_int_t lddc )
{
    cublasZsyr2k( cublas_uplo_const( uplo ), cublas_trans_const( trans ),
                  n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc );
}
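/*
    Example (illustrative sketch only): a symmetric rank-2k update
    C := alpha*A*B^T + alpha*B*A^T + beta*C with magma_zsyr2k, touching only the
    lower triangle of C. It assumes dA and dB are n-by-k on the GPU (trans ==
    MagmaNoTrans) and dC is n-by-n; names are hypothetical.
*/
static void example_zsyr2k( magma_int_t n, magma_int_t k,
                            magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
                            magmaDoubleComplex_const_ptr dB, magma_int_t lddb,
                            magmaDoubleComplex_ptr dC, magma_int_t lddc )
{
    magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.0, 0.0 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( 1.0, 0.0 );   // accumulate into existing C
    magma_zsyr2k( MagmaLower, MagmaNoTrans, n, k,
                  alpha, dA, ldda, dB, lddb,
                  beta,  dC, lddc );
}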
/**
    Perform Hermitian matrix-matrix product.
        \f$ C = \alpha A B + \beta C \f$ (side == MagmaLeft), or \n
        \f$ C = \alpha B A + \beta C \f$ (side == MagmaRight), \n
    where \f$ A \f$ is Hermitian.

    @param[in]  side    Whether A is on the left or right.
    @param[in]  uplo    Whether the upper or lower triangle of A is referenced.
    @param[in]  m       Number of rows of C. m >= 0.
    @param[in]  n       Number of columns of C. n >= 0.
    @param[in]  alpha   Scalar \f$ \alpha \f$
    @param[in]  dA      COMPLEX_16 array on GPU device.
                        If side == MagmaLeft, the m-by-m Hermitian matrix A of dimension (ldda,m), ldda >= max(1,m); \n
                        otherwise, the n-by-n Hermitian matrix A of dimension (ldda,n), ldda >= max(1,n).
    @param[in]  ldda    Leading dimension of dA.
    @param[in]  dB      COMPLEX_16 array on GPU device.
                        The m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m).
    @param[in]  lddb    Leading dimension of dB.
    @param[in]  beta    Scalar \f$ \beta \f$
    @param[in,out] dC   COMPLEX_16 array on GPU device.
                        The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m).
    @param[in]  lddc    Leading dimension of dC.

    @ingroup magma_zblas3
*/
extern "C" void magma_zhemm(
    magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magmaDoubleComplex alpha,
    magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_const_ptr dB, magma_int_t lddb,
    magmaDoubleComplex beta,
    magmaDoubleComplex_ptr       dC, magma_int_t lddc )
{
    cublasZhemm( cublas_side_const( side ), cublas_uplo_const( uplo ),
                 m, n, alpha, dA, ldda, dB, lddb, beta, dC, lddc );
}
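/*
    Example (illustrative sketch only): multiplying by a Hermitian matrix stored in
    one triangle, C = A*B, with magma_zhemm. Only the lower triangle of A needs to
    be filled; the opposite triangle is never referenced. It assumes dA is m-by-m
    Hermitian, dB and dC are m-by-n, all on the GPU; names are hypothetical.
*/
static void example_zhemm_left( magma_int_t m, magma_int_t n,
                                magmaDoubleComplex_const_ptr dA, magma_int_t ldda,
                                magmaDoubleComplex_const_ptr dB, magma_int_t lddb,
                                magmaDoubleComplex_ptr dC, magma_int_t lddc )
{
    magmaDoubleComplex one  = MAGMA_Z_ONE;
    magmaDoubleComplex zero = MAGMA_Z_ZERO;
    // C := 1*A*B + 0*C, with Hermitian A applied from the left
    magma_zhemm( MagmaLeft, MagmaLower, m, n,
                 one, dA, ldda, dB, lddb,
                 zero, dC, lddc );
}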
int main(int argc, char **argv)
{
    TESTING_INIT();

    const float c_neg_one = MAGMA_S_NEG_ONE;
    const magma_int_t ione = 1;

    real_Double_t atomics_perf, atomics_time;
    real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time;
    float magma_error, atomics_error, cublas_error, work[1];
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    magma_int_t nb   = 64;
    float alpha = MAGMA_S_MAKE( 1.5, -2.3 );
    float beta  = MAGMA_S_MAKE( -0.6, 0.8 );
    float *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma;
    magmaFloat_ptr dA, dX, dY, dwork;
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    float tol = opts.tolerance * lapackf77_slamch("E");

    printf("uplo = %s\n", lapack_uplo_const(opts.uplo) );
    printf(" N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n");
    printf("======================================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N + 31)/32)*32;
            sizeA  = N*lda;
            sizeX  = N*incx;
            sizeY  = N*incy;
            gflops = FLOPS_SSYMV( N ) / 1e9;

            TESTING_MALLOC_CPU( A,        float, sizeA );
            TESTING_MALLOC_CPU( X,        float, sizeX );
            TESTING_MALLOC_CPU( Y,        float, sizeY );
            TESTING_MALLOC_CPU( Yatomics, float, sizeY );
            TESTING_MALLOC_CPU( Ycublas,  float, sizeY );
            TESTING_MALLOC_CPU( Ymagma,   float, sizeY );

            TESTING_MALLOC_DEV( dA, float, ldda*N );
            TESTING_MALLOC_DEV( dX, float, sizeX );
            TESTING_MALLOC_DEV( dY, float, sizeY );

            blocks = (N + nb - 1) / nb;
            ldwork = ldda*blocks;
            TESTING_MALLOC_DEV( dwork, float, ldwork );

            magmablas_slaset( MagmaFull, ldwork, 1, MAGMA_S_NAN, MAGMA_S_NAN, dwork, ldwork );
            magmablas_slaset( MagmaFull, ldda,   N, MAGMA_S_NAN, MAGMA_S_NAN, dA,    ldda );

            /* Initialize the matrix */
            lapackf77_slarnv( &ione, ISEED, &sizeA, A );
            magma_smake_symmetric( N, A, lda );

            // should not use data from the opposite triangle -- fill with NAN to check
            magma_int_t N1 = N-1;
            if ( opts.uplo == MagmaUpper ) {
                lapackf77_slaset( "Lower", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[1], &lda );
            }
            else {
                lapackf77_slaset( "Upper", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[lda], &lda );
            }

            lapackf77_slarnv( &ione, ISEED, &sizeX, X );
            lapackf77_slarnv( &ione, ISEED, &sizeY, Y );

            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_ssetmatrix( N, N, A, lda, dA, ldda );
            magma_ssetvector( N, X, incx, dX, incx );
            magma_ssetvector( N, Y, incy, dY, incy );

            cublas_time = magma_sync_wtime( 0 );
            cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo),
                         N, &alpha, dA, ldda, dX, incx, &beta, dY, incy );
            cublas_time = magma_sync_wtime( 0 ) - cublas_time;
            cublas_perf = gflops / cublas_time;

            magma_sgetvector( N, dY, incy, Ycublas, incy );

            /* =====================================================================
               Performs operation using CUBLAS - using atomics
               =================================================================== */
            cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED );
            magma_ssetvector( N, Y, incy, dY, incy );

            atomics_time = magma_sync_wtime( 0 );
            cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo),
                         N, &alpha, dA, ldda, dX, incx, &beta, dY, incy );
            atomics_time = magma_sync_wtime( 0 ) - atomics_time;
            atomics_perf = gflops / atomics_time;

            magma_sgetvector( N, dY, incy, Yatomics, incy );
            cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED );

            /* =====================================================================
               Performs operation using MAGMABLAS
               =================================================================== */
            magma_ssetvector( N, Y, incy, dY, incy );

            magma_time = magma_sync_wtime( 0 );
            if ( opts.version == 1 ) {
                magmablas_ssymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy,
                                      dwork, ldwork, opts.queue );
            }
            else {
                // non-work interface (has added overhead)
                magmablas_ssymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy );
            }
            magma_time = magma_sync_wtime( 0 ) - magma_time;
            magma_perf = gflops / magma_time;

            magma_sgetvector( N, dY, incy, Ymagma, incy );

            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_ssymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;

            /* =====================================================================
               Check the result
               =================================================================== */
            blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy );
            magma_error = lapackf77_slange( "M", &N, &ione, Ymagma, &N, work ) / N;

            blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy );
            cublas_error = lapackf77_slange( "M", &N, &ione, Ycublas, &N, work ) / N;

            blasf77_saxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy );
            atomics_error = lapackf77_slange( "M", &N, &ione, Yatomics, &N, work ) / N;

            bool ok = (magma_error < tol && cublas_error < tol && atomics_error < tol);
            status += ! ok;
            // print errors in the same order as the column header: MAGMA, Atomics, CUBLAS
            printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n",
                   (int) N,
                   magma_perf,   1000.*magma_time,
                   atomics_perf, 1000.*atomics_time,
                   cublas_perf,  1000.*cublas_time,
                   cpu_perf,     1000.*cpu_time,
                   magma_error, atomics_error, cublas_error,
                   (ok ? "ok" : "failed"));

            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ycublas );
            TESTING_FREE_CPU( Yatomics );
            TESTING_FREE_CPU( Ymagma );

            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            TESTING_FREE_DEV( dwork );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zher2k
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time;
    double cublas_error, Cnorm, work[1];
    magma_int_t N, K;
    magma_int_t Ak, An, Bk, Bn;
    magma_int_t sizeA, sizeB, sizeC;
    magma_int_t lda, ldb, ldc, ldda, lddb, lddc;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magmaDoubleComplex *h_A, *h_B, *h_C, *h_Ccublas;
    magmaDoubleComplex *d_A, *d_B, *d_C;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 );
    double beta  = MAGMA_D_MAKE( -0.48, 0.38 );
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    opts.lapack |= opts.check;  // check (-c) implies lapack (-l)

    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("If running lapack (option --lapack), CUBLAS error is computed\n"
           "relative to CPU BLAS result.\n\n");
    printf("uplo = %s, transA = %s\n",
           lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) );
    printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n");
    printf("==================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.msize[itest];
            K = opts.ksize[itest];
            gflops = FLOPS_ZHER2K(K, N) / 1e9;

            if ( opts.transA == MagmaNoTrans ) {
                lda = An = N;
                Ak = K;
                ldb = Bn = N;
                Bk = K;
            }
            else {
                lda = An = K;
                Ak = N;
                ldb = Bn = K;
                Bk = N;
            }

            ldc = N;

            ldda = ((lda+31)/32)*32;
            lddb = ((ldb+31)/32)*32;
            lddc = ((ldc+31)/32)*32;

            sizeA = lda*Ak;
            sizeB = ldb*Ak;
            sizeC = ldc*N;

            TESTING_MALLOC_CPU( h_A,       magmaDoubleComplex, lda*Ak );
            TESTING_MALLOC_CPU( h_B,       magmaDoubleComplex, ldb*Bk );
            TESTING_MALLOC_CPU( h_C,       magmaDoubleComplex, ldc*N  );
            TESTING_MALLOC_CPU( h_Ccublas, magmaDoubleComplex, ldc*N  );

            TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak );
            TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*Bk );
            TESTING_MALLOC_DEV( d_C, magmaDoubleComplex, lddc*N  );

            /* Initialize the matrices */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
            lapackf77_zlarnv( &ione, ISEED, &sizeC, h_C );

            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( An, Ak, h_A, lda, d_A, ldda );
            magma_zsetmatrix( Bn, Bk, h_B, ldb, d_B, lddb );
            magma_zsetmatrix( N, N, h_C, ldc, d_C, lddc );

            cublas_time = magma_sync_wtime( NULL );
            cublasZher2k( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA),
                          N, K,
                          &alpha, d_A, ldda,
                                  d_B, lddb,
                          &beta,  d_C, lddc );
            cublas_time = magma_sync_wtime( NULL ) - cublas_time;
            cublas_perf = gflops / cublas_time;

            magma_zgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc );

            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_zher2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA),
                                &N, &K,
                                &alpha, h_A, &lda,
                                        h_B, &ldb,
                                &beta,  h_C, &ldc );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }

            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.lapack ) {
                // compute relative error for both magma & cublas, relative to lapack,
                // |C_magma - C_lapack| / |C_lapack|
                Cnorm = lapackf77_zlange( "M", &N, &N, h_C, &ldc, work );

                blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione );
                cublas_error = lapackf77_zlange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm;

                printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n",
                       (int) N, (int) K,
                       cublas_perf, 1000.*cublas_time,
                       cpu_perf,    1000.*cpu_time,
                       cublas_error, (cublas_error < tol ? "ok" : "failed"));
                status += ! (cublas_error < tol);
            }
            else {
                printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n",
                       (int) N, (int) K,
                       cublas_perf, 1000.*cublas_time);
            }

            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_B );
            TESTING_FREE_CPU( h_C );
            TESTING_FREE_CPU( h_Ccublas );

            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            TESTING_FREE_DEV( d_C );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dsyr2k
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time;
    double cublas_error, Cnorm, work[1];
    magma_int_t N, K;
    magma_int_t Ak, An, Bk, Bn;
    magma_int_t sizeA, sizeB, sizeC;
    magma_int_t lda, ldb, ldc, ldda, lddb, lddc;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double *h_A, *h_B, *h_C, *h_Ccublas;
    magmaDouble_ptr d_A, d_B, d_C;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double alpha = MAGMA_D_MAKE( 0.29, -0.86 );
    double beta  = MAGMA_D_MAKE( -0.48, 0.38 );
    magma_int_t status = 0;

    magma_opts opts;
    opts.parse_opts( argc, argv );
    opts.lapack |= opts.check;  // check (-c) implies lapack (-l)

    double tol = opts.tolerance * lapackf77_dlamch("E");

    #ifdef COMPLEX
    if (opts.transA == MagmaTrans) {
        opts.transA = MagmaConjTrans;
        printf("%% WARNING: transA = MagmaTrans changed to MagmaConjTrans\n");
    }
    #endif

    printf("%% If running lapack (option --lapack), CUBLAS error is computed\n"
           "%% relative to CPU BLAS result.\n\n");
    printf("%% uplo = %s, transA = %s\n",
           lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) );
    printf("%% N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n");
    printf("%%=================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.msize[itest];
            K = opts.ksize[itest];
            gflops = FLOPS_DSYR2K(K, N) / 1e9;

            if ( opts.transA == MagmaNoTrans ) {
                lda = An = N;
                Ak = K;
                ldb = Bn = N;
                Bk = K;
            }
            else {
                lda = An = K;
                Ak = N;
                ldb = Bn = K;
                Bk = N;
            }

            ldc = N;

            ldda = magma_roundup( lda, opts.align );  // multiple of 32 by default
            lddb = magma_roundup( ldb, opts.align );  // multiple of 32 by default
            lddc = magma_roundup( ldc, opts.align );  // multiple of 32 by default

            sizeA = lda*Ak;
            sizeB = ldb*Ak;
            sizeC = ldc*N;

            TESTING_MALLOC_CPU( h_A,       double, lda*Ak );
            TESTING_MALLOC_CPU( h_B,       double, ldb*Bk );
            TESTING_MALLOC_CPU( h_C,       double, ldc*N  );
            TESTING_MALLOC_CPU( h_Ccublas, double, ldc*N  );

            TESTING_MALLOC_DEV( d_A, double, ldda*Ak );
            TESTING_MALLOC_DEV( d_B, double, lddb*Bk );
            TESTING_MALLOC_DEV( d_C, double, lddc*N  );

            /* Initialize the matrices */
            lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B );
            lapackf77_dlarnv( &ione, ISEED, &sizeC, h_C );

            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_dsetmatrix( An, Ak, h_A, lda, d_A, ldda );
            magma_dsetmatrix( Bn, Bk, h_B, ldb, d_B, lddb );
            magma_dsetmatrix( N, N, h_C, ldc, d_C, lddc );

            magmablasSetKernelStream( opts.queue );  // opts.handle also uses opts.queue
            cublas_time = magma_sync_wtime( opts.queue );
            #ifdef HAVE_CUBLAS
                cublasDsyr2k( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA),
                              N, K,
                              &alpha, d_A, ldda,
                                      d_B, lddb,
                              &beta,  d_C, lddc );
            #else
                magma_dsyr2k( opts.uplo, opts.transA, N, K,
                              alpha, d_A, 0, ldda,
                                     d_B, 0, lddb,
                              beta,  d_C, 0, lddc, opts.queue );
            #endif
            cublas_time = magma_sync_wtime( opts.queue ) - cublas_time;
            cublas_perf = gflops / cublas_time;

            magma_dgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc );

            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_dsyr2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA),
                                &N, &K,
                                &alpha, h_A, &lda,
                                        h_B, &ldb,
                                &beta,  h_C, &ldc );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }

            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.lapack ) {
                // compute relative error for both magma & cublas, relative to lapack,
                // |C_magma - C_lapack| / |C_lapack|
                Cnorm = lapackf77_dlange( "M", &N, &N, h_C, &ldc, work );

                blasf77_daxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione );
                cublas_error = lapackf77_dlange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm;

                printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n",
                       (int) N, (int) K,
                       cublas_perf, 1000.*cublas_time,
                       cpu_perf,    1000.*cpu_time,
                       cublas_error, (cublas_error < tol ? "ok" : "failed"));
                status += ! (cublas_error < tol);
            }
            else {
                printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n",
                       (int) N, (int) K,
                       cublas_perf, 1000.*cublas_time);
            }

            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_B );
            TESTING_FREE_CPU( h_C );
            TESTING_FREE_CPU( h_Ccublas );

            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            TESTING_FREE_DEV( d_C );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}