/** Perform Hermitian matrix-vector product, \f$ y = \alpha A x + \beta y \f$. @param[in] uplo Whether the upper or lower triangle of A is referenced. @param[in] n Number of rows and columns of A. n >= 0. @param[in] alpha Scalar \f$ \alpha \f$ @param[in] dA COMPLEX_16 array of dimension (ldda,n), ldda >= max(1,n). The n-by-n matrix A, on GPU device. @param[in] ldda Leading dimension of dA. @param[in] dx COMPLEX_16 array on GPU device. The m element vector x of dimension (1 + (m-1)*incx). @param[in] incx Stride between consecutive elements of dx. incx != 0. @param[in] beta Scalar \f$ \beta \f$ @param[in,out] dy COMPLEX_16 array on GPU device. The n element vector y of dimension (1 + (n-1)*incy). @param[in] incy Stride between consecutive elements of dy. incy != 0. @ingroup magma_zblas2 */ extern "C" void magma_zhemv( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dx, magma_int_t incx, magmaDoubleComplex beta, magmaDoubleComplex_ptr dy, magma_int_t incy ) { cublasZhemv( cublas_uplo_const( uplo ), n, alpha, dA, ldda, dx, incx, beta, dy, incy ); }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaDoubleComplex *A, *B, *C, *C2, *LU; magmaDoubleComplex *dA, *dB, *dC1, *dC2; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_zmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_zmalloc( &dA, size ); assert( err == 0 ); err = magma_zmalloc( &dB, size ); assert( err == 0 ); err = magma_zmalloc( &dC1, size ); assert( err == 0 ); err = magma_zmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); lapackf77_zlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test ZSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetmatrix( m, n, A, ld, dB, ld ); magma_zswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_zswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_zgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "zswap diff %.2g\n", error ); } else { printf( "zswap skipped for n < 3\n" ); } // ----- test IZAMAX // get argmax of column of A magma_zsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_izamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIzamax( handle, m, dA(0,j), 1, &i2 ); // todo need sync here? assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "izamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test ZGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetvector( maxn, B, 1, dB, 1 ); magma_zsetvector( maxn, C, 1, dC1, 1 ); magma_zsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemv( handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasZaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMV( m, n ) / 1e9; printf( "zgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetvector( m, B, 1, dB, 1 ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemv( handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMV( m ) / 1e9; printf( "zhemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_zsetmatrix( m, m, LU, ld, dA, ld ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ztrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test ZGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMM( m, n, k ) / 1e9; printf( "zgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetmatrix( m, n, B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9; printf( "zhemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_zsetmatrix( n, k, A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHERK( k, n ) / 1e9; printf( "zherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHER2K( k, n ) / 1e9; printf( "zher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. t2 = magma_sync_wtime( 0 ); cublasZtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9; printf( "ztrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test ZTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9; printf( "ztrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); fflush( stdout ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
// -------------------- int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_time=0, cpu_perf=0, gpu_time, gpu_perf, mgpu_time, mgpu_perf, cuda_time, cuda_perf; double Ynorm, error=0, error2=0, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t n_local[MagmaMaxGPUs]; magma_int_t N, Noffset, lda, ldda, blocks, lhwork, ldwork, matsize, vecsize; magma_int_t incx = 1; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ylapack, *Ycublas, *Ymagma, *Ymagma1, *hwork; magmaDoubleComplex_ptr dA, dX, dY; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magma_device_t dev; magma_queue_t queues[MagmaMaxGPUs]; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); magma_int_t nb = 64; // required by magmablas_zhemv_mgpu implementation for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_create( dev, &queues[dev] ); } // currently, tests all offsets in the offsets array; // comment out loop below to test a specific offset. magma_int_t offset = opts.offset; magma_int_t offsets[] = { 0, 1, 31, 32, 33, 63, 64, 65, 100, 200 }; magma_int_t noffsets = sizeof(offsets) / sizeof(*offsets); printf("%% uplo = %s, ngpu %d, block size = %d, offset %d\n", lapack_uplo_const(opts.uplo), (int) opts.ngpu, (int) nb, (int) offset ); printf( "%% BLAS CUBLAS MAGMA 1 GPU MAGMA MGPU Error rel Error rel\n" "%% N offset Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) to CUBLAS to LAPACK\n" "%%==================================================================================================================\n" ); for( int itest = 0; itest < opts.ntest; ++itest ) { // comment out these two lines & end of loop to test a specific offset for( int ioffset=0; ioffset < noffsets; ioffset += 1 ) { offset = offsets[ioffset]; for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; Noffset = N + offset; lda = Noffset; ldda = magma_roundup( Noffset, opts.align ); // multiple of 32 by default matsize = Noffset*ldda; vecsize = (Noffset-1)*incx + 1; gflops = FLOPS_ZHEMV( N ) / 1e9; blocks = magma_ceildiv( N + (offset % nb), nb ); lhwork = N*opts.ngpu; ldwork = ldda*(blocks + 1); TESTING_MALLOC_CPU( A, magmaDoubleComplex, matsize ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma1, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ylapack, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( X, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( hwork, magmaDoubleComplex, lhwork ); magma_setdevice( opts.device ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, matsize ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, vecsize ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, vecsize ); // TODO make magma_zmalloc_bcyclic helper function? for( dev=0; dev < opts.ngpu; dev++ ) { n_local[dev] = ((Noffset/nb)/opts.ngpu)*nb; if (dev < (Noffset/nb) % opts.ngpu) n_local[dev] += nb; else if (dev == (Noffset/nb) % opts.ngpu) n_local[dev] += Noffset % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*n_local[dev] ); TESTING_MALLOC_DEV( dwork[dev], magmaDoubleComplex, ldwork ); } ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &matsize, A ); magma_zmake_hermitian( Noffset, A, lda ); lapackf77_zlarnv( &ione, ISEED, &vecsize, X ); lapackf77_zlarnv( &ione, ISEED, &vecsize, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_setdevice( opts.device ); magma_zsetmatrix( Noffset, Noffset, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Noffset, X, incx, dX, incx, opts.queue ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); cuda_time = magma_sync_wtime(0); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, &beta, dY + offset, incx ); cuda_time = magma_sync_wtime(0) - cuda_time; cuda_perf = gflops / cuda_time; magma_zgetvector( Noffset, dY, incx, Ycublas, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (1 GPU) =================================================================== */ magma_setdevice( opts.device ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_zhemv_work( opts.uplo, N, alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, beta, dY + offset, incx, dwork[ opts.device ], ldwork, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; magma_zgetvector( Noffset, dY, incx, Ymagma1, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (multi-GPU) =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( Noffset, Noffset, A, lda, d_lA, ldda, opts.ngpu, nb, queues ); blasf77_zcopy( &Noffset, Y, &incx, Ymagma, &incx ); // workspaces do NOT need to be zero -- set to NAN to prove for( dev=0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork[dev], ldwork, opts.queue ); } lapackf77_zlaset( "Full", &lhwork, &ione, &MAGMA_Z_NAN, &MAGMA_Z_NAN, hwork, &lhwork ); mgpu_time = magma_sync_wtime(0); magma_int_t info; info = magmablas_zhemv_mgpu( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } info = magmablas_zhemv_mgpu_sync( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_sync returned error %d: %s.\n", (int) info, magma_strerror( info )); } mgpu_time = magma_sync_wtime(0) - mgpu_time; mgpu_perf = gflops / mgpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { blasf77_zcopy( &Noffset, Y, &incx, Ylapack, &incx ); cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A + offset + offset*lda, &lda, X + offset, &incx, &beta, Ylapack + offset, &incx ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Compute the Difference LAPACK vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ylapack, &incx ); error2 = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ) / Ynorm; } /* ===================================================================== Compute the Difference Cublas vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ycublas, &incx ); error = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ) / Ynorm; bool okay = (error < tol && error2 < tol); status += ! okay; if ( opts.lapack ) { printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) N, (int) offset, cpu_perf, cpu_time*1000., cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, error2, (okay ? "ok" : "failed") ); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e --- %s\n", (int) N, (int) offset, cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, (okay ? "ok" : "failed") ); } /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( Ymagma1 ); TESTING_FREE_CPU( Ylapack ); TESTING_FREE_PIN( X ); TESTING_FREE_PIN( hwork ); magma_setdevice( opts.device ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); for( dev=0; dev < opts.ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); TESTING_FREE_DEV( dwork[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } // comment out these two lines line & top of loop test a specific offset } // end for ioffset printf( "\n" ); } for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_destroy( queues[dev] ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY, *dC_work; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC( Ymagma, magmaDoubleComplex, sizeY ); TESTING_DEVALLOC( dA, magmaDoubleComplex, sizeA ); TESTING_DEVALLOC( dX, magmaDoubleComplex, sizeX ); TESTING_DEVALLOC( dY, magmaDoubleComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = lda * (blocks + 1); TESTING_DEVALLOC( dC_work, magmaDoubleComplex, ldwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, lda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZhemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMA BLAS =================================================================== */ magma_zsetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); #if (GPUSHMEM >= 200) magmablas_zhemv2( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy, dC_work, ldwork ); #else magmablas_zhemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); #endif magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( &opts.uplo, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); TESTING_DEVFREE( dC_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
int main(int argc, char **argv) { TESTING_INIT(); const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magma_int_t ione = 1; real_Double_t atomics_perf=0, atomics_time=0; real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error=0, atomics_error=0, cublas_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n"); printf("%%=====================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Yatomics, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); blocks = magma_ceildiv( N, nb ); ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork ); magmablas_zlaset( MagmaFull, ldda, N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_zlaset( "Lower", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[1], &lda ); } else { lapackf77_zlaset( "Upper", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[lda], &lda ); } lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, ldda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); magmablasSetKernelStream( opts.queue ); // opts.handle also uses opts.queue cublas_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zhemv( opts.uplo, N, alpha, dA, 0, ldda, dX, 0, incx, beta, dY, 0, incy, opts.queue ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using CUBLAS - using atomics =================================================================== */ #ifdef HAVE_CUBLAS cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED ); magma_zsetvector( N, Y, incy, dY, incy ); // sync on queue doesn't work -- need device sync or use NULL stream -- bug in CUBLAS? atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ) - atomics_time; atomics_perf = gflops / atomics_time; magma_zgetvector( N, dY, incy, Yatomics, incy ); cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED ); #endif /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( opts.queue ); if ( opts.version == 1 ) { magmablas_zhemv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue ); } else { // non-work interface (has added overhead) magmablas_zhemv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; #ifdef HAVE_CUBLAS blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy ); atomics_error = lapackf77_zlange( "M", &N, &ione, Yatomics, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; #endif bool okay = (magma_error < tol && cublas_error < tol && atomics_error < tol); status += ! okay; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, atomics_perf, 1000.*atomics_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, atomics_error, (okay ? "ok" : "failed")); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Yatomics ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY, *dwork; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); if ( opts.uplo == MagmaUpper ) { printf("** for uplo=MagmaUpper, magmablas_zhemv simply calls cublas_zhemv.\n"); } printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork ); magmablas_zlaset( MagmaFull, ldda, N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, ldda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZhemv( handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetvector( N, Y, incy, dY, incy ); //magma_zprint_gpu( ldda, blocks, dwork, ldda ); magma_time = magma_sync_wtime( 0 ); magmablas_zhemv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork ); // TODO provide option to test non-work interface //magmablas_zhemv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); //magma_zprint_gpu( ldda, blocks, dwork, ldda ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }