/** Purpose ------- SLATRD2 reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by SSYTRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e REAL array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W REAL array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). @param dA TODO: dimension (ldda, n) ?? @param ldda TODO: ldda >= n ?? @param dW TODO: dimension (lddw, 2*nb) ?? @param lddw TODO: lddw >= n ?? @param dwork TODO: dimension (ldwork) ?? @param ldwork TODO: ldwork >= ceil(n/64)*ldda ?? Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slatrd2( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, float *A, magma_int_t lda, float *e, float *tau, float *W, magma_int_t ldw, magmaFloat_ptr dA, magma_int_t ldda, magmaFloat_ptr dW, magma_int_t lddw, magmaFloat_ptr dwork, magma_int_t ldwork) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define W(i_, j_) (W + (i_) + (j_)*ldw) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dW(i_, j_) (dW + (i_) + (j_)*lddw) const float c_neg_one = MAGMA_S_NEG_ONE; const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; const magma_int_t ione = 1; float alpha, value; magma_int_t i, i_n, i_1, iw; /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper ) { info = -1; } else if ( n < 0 ) { info = -2; } else if ( nb < 1 ) { info = -3; } else if ( lda < max(1,n) ) { info = -5; } else if ( ldw < max(1,n) ) { info = -9; } else if ( ldda < max(1,n) ) { info = -11; } else if ( lddw < max(1,n) ) { info = -13; } else if ( ldwork < ldda*ceildiv(n,64) ) { info = -15; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0) { return info; } magma_queue_t stream; magma_queue_create( &stream ); float *f; magma_smalloc_cpu( &f, n ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, W(i, iw+1), &ldw ); #endif blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, W(i, iw+1), &ldw ); lapackf77_slacgv( &i_n, A(i, i+1), &lda ); #endif blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, A(i, i+1), &lda ); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_slarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] ); e[i-1] = MAGMA_S_REAL( alpha ); *A(i-1,i) = MAGMA_S_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_ssetvector_async( i, A(0, i), 1, dA(0, i), 1, stream ); magmablas_ssymv_work( MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork, stream ); // 2. Start getting the result back (asynchronously) magma_sgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, stream ); if (i < n-1) { blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); } // 3. Here we need ssymv result W(0, iw) magma_queue_sync( stream ); if (i < n-1) { blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); } blasf77_sscal( &i, &tau[i - 1], W(0, iw), &ione ); value = magma_cblas_sdot( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_saxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione ); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, W(i, 0), &ldw ); #endif blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, W(i, 0), &ldw ); lapackf77_slacgv( &i, A(i, 0), &lda ); #endif blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, A(i, 0), &lda ); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_slarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] ); e[i] = MAGMA_S_REAL( alpha ); *A(i+1,i) = MAGMA_S_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_ssetvector_async( i_n, A(i+1, i), 1, dA(i+1, i), 1, stream ); magmablas_ssymv_work( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork, stream ); // 2. Start getting the result back (asynchronously) magma_sgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione ); blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); // 3. Here we need ssymv result W(i+1, i) magma_queue_sync( stream ); if (i != 0) blasf77_saxpy( &i_n, &c_one, f, &ione, W(i+1, i), &ione ); blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione ); blasf77_sscal( &i_n, &tau[i], W(i+1,i), &ione ); value = magma_cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_saxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione ); } } } magma_free_cpu( f ); magma_queue_destroy( stream ); return info; } /* magma_slatrd */
int main(int argc, char **argv) { TESTING_INIT(); const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; real_Double_t atomics_perf, atomics_time; real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error, atomics_error, cublas_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; float alpha = MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma; magmaFloat_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n"); printf("======================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_SSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, float, sizeA ); TESTING_MALLOC_CPU( X, float, sizeX ); TESTING_MALLOC_CPU( Y, float, sizeY ); TESTING_MALLOC_CPU( Yatomics, float, sizeY ); TESTING_MALLOC_CPU( Ycublas, float, sizeY ); TESTING_MALLOC_CPU( Ymagma, float, sizeY ); TESTING_MALLOC_DEV( dA, float, ldda*N ); TESTING_MALLOC_DEV( dX, float, sizeX ); TESTING_MALLOC_DEV( dY, float, sizeY ); blocks = (N + nb - 1) / nb; ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, float, ldwork ); magmablas_slaset( MagmaFull, ldwork, 1, MAGMA_S_NAN, MAGMA_S_NAN, dwork, ldwork ); magmablas_slaset( MagmaFull, ldda, N, MAGMA_S_NAN, MAGMA_S_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &sizeA, A ); magma_smake_symmetric( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_slaset( "Lower", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[1], &lda ); } else { lapackf77_slaset( "Upper", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[lda], &lda ); } lapackf77_slarnv( &ione, ISEED, &sizeX, X ); lapackf77_slarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( N, N, A, lda, dA, ldda ); magma_ssetvector( N, X, incx, dX, incx ); magma_ssetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using CUBLAS - using atomics =================================================================== */ cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED ); magma_ssetvector( N, Y, incy, dY, incy ); atomics_time = magma_sync_wtime( 0 ); cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); atomics_time = magma_sync_wtime( 0 ) - atomics_time; atomics_perf = gflops / atomics_time; magma_sgetvector( N, dY, incy, Yatomics, incy ); cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_ssetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); if ( opts.version == 1 ) { magmablas_ssymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue ); } else { // non-work interface (has added overhead) magmablas_ssymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); } magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_sgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_ssymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_slange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_slange( "M", &N, &ione, Ycublas, &N, work ) / N; blasf77_saxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy ); atomics_error = lapackf77_slange( "M", &N, &ione, Yatomics, &N, work ) / N; bool ok = (magma_error < tol && cublas_error < tol && atomics_error < tol); status += ! ok; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, atomics_perf, 1000.*atomics_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, atomics_error, (ok ? "ok" : "failed")); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Yatomics ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y, *Ycublas, *Ymagma; float *dA, *dX, *dY, *dwork; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_SSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, float, sizeA ); TESTING_MALLOC_CPU( X, float, sizeX ); TESTING_MALLOC_CPU( Y, float, sizeY ); TESTING_MALLOC_CPU( Ycublas, float, sizeY ); TESTING_MALLOC_CPU( Ymagma, float, sizeY ); TESTING_MALLOC_DEV( dA, float, sizeA ); TESTING_MALLOC_DEV( dX, float, sizeX ); TESTING_MALLOC_DEV( dY, float, sizeY ); blocks = (N + nb - 1) / nb; ldwork = lda * (blocks + 1); TESTING_MALLOC_DEV( dwork, float, ldwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &sizeA, A ); magma_smake_symmetric( N, A, lda ); lapackf77_slarnv( &ione, ISEED, &sizeX, X ); lapackf77_slarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( N, N, A, lda, dA, lda ); magma_ssetvector( N, X, incx, dX, incx ); magma_ssetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasSsymv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMA BLAS =================================================================== */ magma_ssetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_ssymv_work( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy, dwork, ldwork ); // TODO provide option to test non-work interface //magmablas_ssymv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_sgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_ssymv( &opts.uplo, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_slange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_slange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }