/*
    By-reference front end for magma_dprint.

    The three integer arguments arrive as pointers (presumably so the routine
    can be called from Fortran, which passes scalars by reference -- confirm
    against the interface generator); they are dereferenced and forwarded.
    The matrix pointer A is forwarded unchanged.

    m, n, and lda must be valid, non-null pointers.
*/
void magmaf_dprint( magma_int_t *m, magma_int_t *n, const double *A, magma_int_t *lda )
{
    const magma_int_t num_rows = *m;
    const magma_int_t num_cols = *n;
    const magma_int_t lead_dim = *lda;

    magma_dprint( num_rows, num_cols, A, lead_dim );
}
/*
    Prints an m-by-n matrix that is stored on the device.

    The matrix is copied with magma_dgetmatrix into a temporary host buffer
    whose leading dimension is m, printed with magma_dprint, and the buffer is
    released again.

    Arguments
      m, n       dimensions of the matrix to print
      dA         device matrix
      dA_offset  offset into dA, forwarded to magma_dgetmatrix
      ldda       leading dimension of dA
      queue      queue forwarded to magma_dgetmatrix

    If the host buffer cannot be allocated (or its byte size is not
    representable), an error is written to stderr and nothing is printed.
*/
void magma_dprint_gpu( int m, int n, magmaDouble_ptr dA, size_t dA_offset, int ldda, magma_queue_t queue )
{
    int lda = m;

    // Widen before multiplying: lda*n evaluated in int can overflow long
    // before the product is converted to size_t for malloc.
    const size_t count     = (size_t) lda * (size_t) n;
    const size_t max_count = ((size_t) -1) / sizeof(double);

    double* A = NULL;
    if ( count <= max_count ) {
        A = (double*) malloc( count * sizeof(double) );
    }

    // malloc(0) may legitimately return NULL, so only a non-empty request
    // that yields NULL is treated as a failure.
    if ( A == NULL && count > 0 ) {
        fprintf( stderr, "ERROR: dprint_gpu could not allocate host workspace.\n" );
        return;
    }

    magma_dgetmatrix( m, n, dA, dA_offset, ldda, A, 0, lda, queue );
    magma_dprint( m, n, A, lda );

    free( A );
}
/*
    Prints an m-by-n matrix that is stored on the device.

    The pointer is first checked with magma_is_devptr; when that returns 0 the
    routine reports that it was handed a host pointer and terminates the
    program with exit(1). Otherwise the matrix is copied into a temporary host
    buffer (leading dimension m) with magma_dgetmatrix, printed with
    magma_dprint, and the buffer is released.

    Arguments
      m, n   dimensions of the matrix to print
      dA     device matrix
      ldda   leading dimension of dA

    If the temporary host buffer cannot be allocated, an error is written to
    stderr and the routine returns without printing.
*/
void magma_dprint_gpu( magma_int_t m, magma_int_t n, const double *dA, magma_int_t ldda )
{
    if ( magma_is_devptr( dA ) == 0 ) {
        fprintf( stderr, "ERROR: dprint_gpu called with host pointer.\n" );
        exit(1);
    }

    magma_int_t lda = m;
    double* A = NULL;

    // A nonzero return is treated as failure (MAGMA_SUCCESS is 0 in MAGMA's
    // headers). Without this check a failed allocation would leave A unusable
    // and the copy below would write through it.
    if ( magma_dmalloc_cpu( &A, lda*n ) != 0 ) {
        fprintf( stderr, "ERROR: dprint_gpu could not allocate host workspace.\n" );
        return;
    }

    magma_dgetmatrix( m, n, dA, ldda, A, lda );
    magma_dprint( m, n, A, lda );

    magma_free_cpu( A );
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing dprint */ int main( int argc, char** argv) { TESTING_INIT(); double *hA; magmaDouble_ptr dA; //magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, lda, ldda; //size magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default //size = lda*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( hA, double, lda *N ); TESTING_MALLOC_DEV( dA, double, ldda*N ); //lapackf77_dlarnv( &ione, ISEED, &size, hA ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { hA[i + j*lda] = MAGMA_D_MAKE( i + j*0.01, 0. ); } } magma_dsetmatrix( M, N, hA, lda, dA, ldda ); printf( "A=" ); magma_dprint( M, N, hA, lda ); printf( "dA=" ); magma_dprint_gpu( M, N, dA, ldda ); TESTING_FREE_CPU( hA ); TESTING_FREE_DEV( dA ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf */ int main( int argc, char** argv) { TESTING_INIT(); magma_setdevice( 0 ); double *hA, *dA; /* Matrix size */ magma_int_t m = 5; magma_int_t n = 10; //magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; //magma_int_t size; magma_int_t lda, ldda; lda = ((m + 31)/32)*32; ldda = ((m + 31)/32)*32; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( hA, double, lda *n ); TESTING_MALLOC_DEV( dA, double, ldda*n ); //size = lda*n; //lapackf77_dlarnv( &ione, ISEED, &size, hA ); for( int j = 0; j < n; ++j ) { for( int i = 0; i < m; ++i ) { hA[i + j*lda] = MAGMA_D_MAKE( i + j*0.01, 0. ); } } magma_dsetmatrix( m, n, hA, lda, dA, ldda ); printf( "A=" ); magma_dprint( m, n, hA, lda ); printf( "dA=" ); magma_dprint_gpu( m, n, dA, ldda ); //printf( "dA=" ); //magma_dprint( m, n, dA, ldda ); //printf( "A=" ); //magma_dprint_gpu( m, n, hA, lda ); /* Memory clean up */ TESTING_FREE_CPU( hA ); TESTING_FREE_DEV( dA ); /* Shutdown */ TESTING_FINALIZE(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dtrtri */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time=0; //, cpu_perf=0, cpu_time=0; double magma_error, norm_invA, work[1]; magma_int_t N, lda, ldda, info; magma_int_t jb, nb, nblock, sizeA, size_inv; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; double *h_A, *h_dinvA; double *d_A, *d_dinvA; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); const char *uplo_ = lapack_uplo_const(opts.uplo); // this is the NB hard coded into dtrtri_diag. nb = 128; printf("uplo = %s, diag = %s\n", lapack_uplo_const(opts.uplo), lapack_diag_const(opts.diag) ); printf(" N MAGMA Gflop/s (ms) MAGMA error\n"); printf("=======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((lda+31)/32)*32; nblock = (N+nb-1)/nb; gflops = nblock * FLOPS_DTRTRI( nb ) / 1e9; TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); size_inv = nblock*nb*nb; TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_dinvA, double, size_inv ); TESTING_MALLOC_CPU( h_dinvA, double, size_inv ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. 
*/ sizeA = lda*N; lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); magma_time = magma_sync_wtime( NULL ); magmablas_dtrtri_diag( opts.uplo, opts.diag, N, d_A, ldda, d_dinvA ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_dgetvector( size_inv, d_dinvA, 1, h_dinvA, 1 ); if ( opts.verbose ) { printf( "A%d=", (int) N ); magma_dprint( N, N, h_A, lda ); printf( "d_dinvA%d=", (int) N ); magma_dprint( min(N+4, nb), min(N+4, nblock*nb), h_dinvA, nb ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { //cpu_time = magma_wtime(); lapackf77_dtrtri( lapack_uplo_const(opts.uplo), lapack_diag_const(opts.diag), &N, h_A, &lda, &info ); //cpu_time = magma_wtime() - cpu_time; //cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { // |invA - invA_magma| / |invA|, accumulated over all diagonal blocks magma_error = 0; norm_invA = 0; for( int i=0; i < N; i += nb ) { jb = min( nb, N-i ); dgeadd( jb, jb, c_neg_one, h_A(i, i), lda, h_dinvA(0, i), nb ); magma_error = max( magma_error, lapackf77_dlantr( "M", uplo_, MagmaNonUnitStr, &jb, &jb, h_dinvA(0, i), &nb, work )); norm_invA = max( norm_invA, lapackf77_dlantr( "M", uplo_, MagmaNonUnitStr, &jb, &jb, h_A(i, i), &lda, work )); } magma_error /= norm_invA; // CPU is doing N-by-N inverse, while GPU is doing (N/NB) NB-by-NB inverses. 
// So don't compare performance. printf("%5d %7.2f (%7.2f) %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, //cpu_perf, 1000.*cpu_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); } else { printf("%5d %7.2f (%7.2f) ---\n", (int) N, magma_perf, 1000.*magma_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_dinvA ); TESTING_FREE_CPU( h_dinvA ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlacpy */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_B, *h_R; magmaDouble_ptr d_A, d_B; magma_int_t M, N, size, lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("%% uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("%%================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldb = lda; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default lddb = ldda; size = lda*N; if ( uplo[iuplo] == MagmaLower ) { // load & save lower trapezoid (with diagonal) if ( M > N ) { gbytes = 2. * sizeof(double) * (1.*M*N - 0.5*N*(N-1)) / 1e9; } else { gbytes = 2. * sizeof(double) * 0.5*M*(M+1) / 1e9; } } else if ( uplo[iuplo] == MagmaUpper ) { // load & save upper trapezoid (with diagonal) if ( N > M ) { gbytes = 2. * sizeof(double) * (1.*M*N - 0.5*M*(M-1)) / 1e9; } else { gbytes = 2. * sizeof(double) * 0.5*N*(N+1) / 1e9; } } else { // load & save entire matrix gbytes = 2. * sizeof(double) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, double, size ); TESTING_MALLOC_CPU( h_B, double, size ); TESTING_MALLOC_CPU( h_R, double, size ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_D_MAKE( i + j/10000., j ); h_B[i + j*ldb] = MAGMA_D_MAKE( i - j/10000. 
+ 10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda, opts.queue ); magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); //magmablas_dlacpy( uplo[iuplo], M-2, N-2, d_A+1+ldda, ldda, d_B+1+lddb, lddb, opts.queue ); // inset by 1 row & col magmablas_dlacpy( uplo[iuplo], M, N, d_A, ldda, d_B, lddb, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_dlacpy( lapack_uplo_const(uplo[iuplo]), &M2, &N2, h_A+1+lda, &lda, h_B+1+ldb, &ldb ); lapackf77_dlacpy( lapack_uplo_const(uplo[iuplo]), &M, &N, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_dprint( M, N, h_A, lda ); printf( "B= " ); magma_dprint( M, N, h_B, ldb ); printf( "dA=" ); magma_dprint_gpu( M, N, d_A, ldda ); printf( "dB=" ); magma_dprint_gpu( M, N, d_B, lddb ); } /* ===================================================================== Check the result =================================================================== */ magma_dgetmatrix( M, N, d_B, lddb, h_R, lda, opts.queue ); blasf77_daxpy(&size, &c_neg_one, h_B, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work); printf("%5s %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const(uplo[iuplo]), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! 
(error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlaset Code is very similar to testing_dlacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R; magmaDouble_ptr d_A; double offdiag = MAGMA_D_MAKE( 1.2000, 6.7000 ); double diag = MAGMA_D_MAKE( 3.1415, 2.7183 ); magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("=================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldda = ((M+31)/32)*32; size = lda*N; if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { // save triangle (with diagonal) // TODO wrong for trapezoid gbytes = sizeof(double) * 0.5*N*(N+1) / 1e9; } else { // save entire matrix gbytes = sizeof(double) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, double, size ); TESTING_MALLOC_CPU( h_R, double, size ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_D_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, 0, ldda, opts.queue ); gpu_time = magma_sync_wtime( 0 ); //magmablas_dlaset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, 0, ldda, opts.queue ); // inset by 1 row & col magmablas_dlaset( uplo[iuplo], M, N, 
offdiag, diag, d_A, 0, ldda, opts.queue ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_dlaset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_dlaset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_dprint( M, N, h_A, lda ); printf( "dA=" ); magma_dprint_gpu( M, N, d_A, 0, ldda, opts.queue ); } /* ===================================================================== Check the result =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, lda, opts.queue ); blasf77_daxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work); printf("%5s %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const( uplo[iuplo] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlat2s and slat2d */ int main( int argc, char** argv ) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define SA(i_,j_) (SA + (i_) + (j_)*lda) TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; float serror, swork[1]; double c_neg_one = MAGMA_D_NEG_ONE; float s_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t n, lda, ldda, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; float *SA, *SR; double *A, *R; float *dSA; double *dA; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; printf("func uplo N CPU GB/s (ms) GPU GB/s (ms) ||R||_F\n"); printf("=====================================================================\n"); for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { n = opts.nsize[itest]; lda = n; ldda = ((n+31)/32)*32; // 0.5*(n+1)*n double-real loads and 0.5*(n+1)*n single-real stores (and vice-versa for slat2d) gbytes = (real_Double_t) 0.5*(n+1)*n * (sizeof(double) + sizeof(float)) / 1e9; size = ldda*n; // ldda >= lda TESTING_MALLOC_CPU( SA, float, size ); TESTING_MALLOC_CPU( A, double, size ); TESTING_MALLOC_CPU( SR, float, size ); TESTING_MALLOC_CPU( R, double, size ); TESTING_MALLOC_DEV( dSA, float, size ); TESTING_MALLOC_DEV( dA, double, size ); lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); /* ===================================================================== Performs operation using LAPACK dlat2s =================================================================== */ info = 0; cpu_time = magma_wtime(); lapackf77_dlat2s( lapack_uplo_const(uplo[iuplo]), &n, A, &lda, SA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; 
cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_dlat2s returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA dlat2s =================================================================== */ gpu_time = magma_sync_wtime(0); magmablas_dlat2s( uplo[iuplo], n, dA, ldda, dSA, ldda, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_dlat2s returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( n, n, dSA, ldda, SR, lda ); if ( opts.verbose ) { printf( "A= " ); magma_dprint( n, n, A, lda ); printf( "SA= " ); magma_sprint( n, n, SA, lda ); printf( "dA= " ); magma_dprint_gpu( n, n, dA, ldda ); printf( "dSA=" ); magma_sprint_gpu( n, n, dSA, ldda ); } /* ===================================================================== compute error |SA_magma - SA_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_saxpy( &size, &s_neg_one, SA, &ione, SR, &ione ); serror = lapackf77_slange( "Fro", &n, &n, SR, &lda, swork ); printf( "dlat2s %5s %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", lapack_uplo_const(uplo[iuplo]), (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., serror, (serror == 0 ? "ok" : "failed") ); status += ! (serror == 0); /* ===================================================================== Reset matrices =================================================================== */ lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); /* ===================================================================== Performs operation using LAPACK slat2d LAPACK doesn't implement slat2d; use our own simple implementation. 
=================================================================== */ cpu_time = magma_wtime(); if ( uplo[iuplo] == MagmaLower ) { for( int j=0; j < n; ++j ) { for( int i=j; i < n; ++i ) { *A(i,j) = MAGMA_D_MAKE( real(*SA(i,j)), imag(*SA(i,j)) ); } } } else { // upper for( int j=0; j < n; ++j ) { for( int i=0; i <= j; ++i ) { *A(i,j) = MAGMA_D_MAKE( real(*SA(i,j)), imag(*SA(i,j)) ); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_slat2d returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA slat2d =================================================================== */ magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); gpu_time = magma_sync_wtime(0); magmablas_slat2d( uplo[iuplo], n, dSA, ldda, dA, ldda, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_slat2d returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix( n, n, dA, ldda, R, lda ); if ( opts.verbose ) { printf( "A= " ); magma_dprint( n, n, A, lda ); printf( "SA= " ); magma_sprint( n, n, SA, lda ); printf( "dA= " ); magma_dprint_gpu( n, n, dA, ldda ); printf( "dSA=" ); magma_sprint_gpu( n, n, dSA, ldda ); } /* ===================================================================== compute error |A_magma - A_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_daxpy( &size, &c_neg_one, A, &ione, R, &ione ); error = lapackf77_dlange( "Fro", &n, &n, R, &lda, work ); printf( "slat2d %5s %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", lapack_uplo_const(uplo[iuplo]), (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error == 0 ? "ok" : "failed") ); status += ! 
(error == 0); TESTING_FREE_CPU( SA ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( SR ); TESTING_FREE_CPU( R ); TESTING_FREE_DEV( dSA ); TESTING_FREE_DEV( dA ); printf( "\n" ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }