/* //////////////////////////////////////////////////////////////////////////// -- Testing sormql */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Cnorm, error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; float *C, *R, *A, *hwork, *tau; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_sgeqlf_nb( m, n ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_SORMQL( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqlf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_smake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, float, ldc*n ); TESTING_MALLOC_CPU( R, float, ldc*n ); TESTING_MALLOC_CPU( A, float, lda*k ); TESTING_MALLOC_CPU( hwork, float, lwork_max ); TESTING_MALLOC_CPU( tau, float, k ); // C is full, m x n size = ldc*n; lapackf77_slarnv( &ione, ISEED, &size, C ); lapackf77_slacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_slarnv( &ione, ISEED, &size, A ); // compute QL factorization to get Householder vectors in A, tau magma_sgeqlf( mm, k, A, lda, tau, hwork, lwork_max, &info ); if (info != 0) { printf("magma_sgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sormql( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, hwork, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sormql returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_sormql( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, hwork, lwork, &info ); if (info != 0) { printf("magma_sormql (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_S_REAL( hwork[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_sormql( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, hwork, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sormql returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_slange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_slange( "Fro", &m, &n, R, &ldc, work ) / (magma_ssqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgels */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float gpu_error, cpu_error, error, Anorm, work[1]; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; float *d_A, *d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_sgeqrf_nb(M); gflops = (FLOPS_SGEQRF( M, N ) + FLOPS_SGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_sgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_S_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, lda*N ); TESTING_MALLOC_CPU( h_A2, float, lda*N ); TESTING_MALLOC_CPU( h_B, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, float, lhwork ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, lddb*nrhs ); /* Initialize the matrices */ size = lda*N; lapackf77_slarnv( &ione, ISEED, &size, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_slarnv( &ione, ISEED, &size, h_B ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_slarnv( &ione, ISEED, &size, h_X ); //blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); magma_ssetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgels_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute the residual magma_sgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb ); Anorm = lapackf77_slange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_sgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_slange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_slange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_saxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_slange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); if ( M == N ) { printf( " %s\n", (gpu_error < tol && error < tol ? "ok" : "failed")); status += ! (gpu_error < tol && error < tol); } else { printf( " %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing spotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float *h_A, *h_R; magma_int_t N, n2, lda, info; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; gflops = FLOPS_SPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); magma_smake_hpd( N, h_A, lda ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_spotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_spotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_spotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_spotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_slange("f", &N, &N, h_A, &lda, work); blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssygvd */ int main( int argc, char** argv) { TESTING_INIT(); float *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #ifdef COMPLEX float *rwork; #endif float *w1, *w2, result[2]={0, 0}; magma_int_t *iwork; real_Double_t mgpu_time, gpu_time, cpu_time; /* Matrix size */ magma_int_t N, n2, nb; magma_int_t info; magma_int_t ione = 1; float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); // checking NoVec requires LAPACK opts.lapack |= (opts.check && opts.jobz == MagmaNoVec); printf("using: ngpu = %d, itype = %d, jobz = %s, uplo = %s, check = %d\n", (int) opts.ngpu, (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), (int) opts.check); printf(" N CPU Time (sec) GPU Time (sec) MGPU Time (sec)\n"); printf("=========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // TODO define lda N = opts.nsize[itest]; n2 = N*N; nb = magma_get_ssytrd_nb(N); #ifdef COMPLEX magma_int_t lwork = max( N + N*nb, 2*N + N*N ); magma_int_t lrwork = 1 + 5*N +2*N*N; #else magma_int_t lwork = max( 2*N + N*nb, 1 + 6*N + 2*N*N ); #endif magma_int_t liwork = 3 + 5*N; TESTING_MALLOC_PIN( h_A, float, n2 ); TESTING_MALLOC_PIN( h_B, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, float, lrwork ); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slarnv( &ione, ISEED, &n2, h_B ); magma_smake_hpd( N, h_B, N ); magma_smake_symmetric( N, h_A, N ); if ( opts.warmup || opts.check ) { TESTING_MALLOC_CPU( h_Ainit, float, n2 ); TESTING_MALLOC_CPU( h_Binit, float, n2 ); lapackf77_slacpy( MagmaFullStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_slacpy( MagmaFullStr, &N, &N, h_B, &N, h_Binit, &N ); } if (opts.warmup) { // ================================================================== // Warmup using MAGMA. // ================================================================== magma_ssygvd_m( opts.ngpu, opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info); lapackf77_slacpy( MagmaFullStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_slacpy( MagmaFullStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== mgpu_time = magma_wtime(); magma_ssygvd_m( opts.ngpu, opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info); mgpu_time = magma_wtime() - mgpu_time; if (info != 0) printf("magma_ssygvd_m returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check && opts.jobz != MagmaNoVec ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) =================================================================== */ #ifdef REAL float *rwork = h_work + N*N; #endif result[0] = 1.; result[0] /= lapackf77_slansy("1", lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, rwork); result[0] /= lapackf77_slange("1", &N, &N, h_A, &N, rwork); if (opts.itype == 1) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i < N; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result[0] *= lapackf77_slange("1", &N, &N, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i < N; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result[0] *= lapackf77_slange("1", &N, &N, h_A, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i < N; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result[0] *= lapackf77_slange("1", &N, &N, h_A, &N, rwork)/N; } } if ( opts.lapack ) { lapackf77_slacpy( MagmaFullStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_slacpy( MagmaFullStr, &N, &N, h_Binit, &N, h_B, &N ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_ssygvd(opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w2, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_ssygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_ssygvd(&opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, h_Binit, &N, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_ssygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); float maxw=0, diff=0; for(int j=0; j < N; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (N*maxw); /* ===================================================================== Print execution time =================================================================== */ printf("%5d %7.2f %7.2f %7.2f\n", (int) N, cpu_time, gpu_time, mgpu_time); } else { printf("%5d --- --- %7.2f\n", (int) N, mgpu_time); } if ( opts.check && opts.jobz != MagmaNoVec ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype == 1) { printf(" | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if (opts.itype == 2) { printf(" | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if (opts.itype == 3) { printf(" | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } status += ! (result[0] < tol); } if ( opts.lapack ) { printf( " | D_mgpu - D_lapack | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed") ); status += ! (result[1] < tolulp); } /* Memory clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #ifdef COMPLEX TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); if ( opts.warmup || opts.check ) { TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing spotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float *h_A, *h_R; magmaFloat_ptr d_A; magma_int_t N, n2, lda, ldda, info; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float Anorm, error, work[1]; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("%%=======================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default gflops = FLOPS_SPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); magma_smake_hpd( N, h_A, lda ); lapackf77_slacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_spotrf_gpu( opts.uplo, N, d_A, ldda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_spotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_spotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_spotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_sgetmatrix( N, N, d_A, ldda, h_R, lda ); blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); Anorm = lapackf77_slange("f", &N, &N, h_A, &lda, work); error = lapackf77_slange("f", &N, &N, h_R, &lda, work) / Anorm; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrs_gpu */ int main( int argc, char** argv) { //#if defined(PRECISION_s) /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float matnorm, work[1]; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A, d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024); M = N = size[6]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_sgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( tau, float, min_mn ); TESTING_MALLOC_PIN( h_A, float, lda*N ); TESTING_MALLOC_PIN( h_A2, float, lda*N ); TESTING_MALLOC_PIN( h_B, float, ldb*nrhs ); TESTING_MALLOC_PIN( h_X, float, ldb*nrhs ); TESTING_MALLOC_PIN( h_R, float, ldb*nrhs ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lhwork = -1; lapackf77_sormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC_PIN( hwork, float, lhwork ); printf("\n"); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<7; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_GEQRF( (float)M, (float)N ) + FLOPS_GEQRS( (float)M, (float)N, (float)nrhs )) / 1e9; /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_slarnv( &ione, ISEED, &n2, h_B ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Warm up to measure the performance */ magma_ssetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); magma_ssetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); gpu_time = magma_wtime(); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgels had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; // Get the solution in h_X magma_sgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue ); // compute the residual blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_slange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_sgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, hwork, &lhwork, &info); cpu_time = magma_wtime()-cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapackf77_sgels had an illegal value.\n", -info); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", M, N, cpu_perf, gpu_perf, lapackf77_slange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_slange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( hwork ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing slarfb_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, nv; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float error, work[1]; // test all combinations of input parameters const char side[] = { MagmaLeft, MagmaRight }; const char trans[] = { MagmaTrans, MagmaNoTrans }; const char direct[] = { MagmaForward, MagmaBackward }; const char storev[] = { MagmaColumnwise, MagmaRowwise }; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { M = opts.msize[i]; N = opts.nsize[i]; K = opts.ksize[i]; if ( M < K || N < K || K <= 0 ) { printf( "skipping M %d, N %d, K %d; requires M >= K, N >= K, K >= 0.\n", (int) M, (int) N, (int) K ); continue; } for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { ldc = ((M+31)/32)*32; ldt = ((K+31)/32)*32; ldw = (side[iside] == MagmaLeft ? N : M); // (ldv, nv) get swapped later if rowwise ldv = (side[iside] == MagmaLeft ? M : N); nv = K; // Allocate memory for matrices float *C, *R, *V, *T, *W; TESTING_MALLOC( C, float, ldc*N ); TESTING_MALLOC( R, float, ldc*N ); TESTING_MALLOC( V, float, ldv*K ); TESTING_MALLOC( T, float, ldt*K ); TESTING_MALLOC( W, float, ldw*K ); float *dC, *dV, *dT, *dW; TESTING_DEVALLOC( dC, float, ldc*N ); TESTING_DEVALLOC( dV, float, ldv*K ); TESTING_DEVALLOC( dT, float, ldt*K ); TESTING_DEVALLOC( dW, float, ldw*K ); // C is M x N. size = ldc*N; lapackf77_slarnv( &ione, ISEED, &size, C ); //printf( "C=" ); magma_sprint( M, N, C, ldc ); // V is ldv x nv. See larfb docs for description. // if column-wise and left, M x K // if column-wise and right, N x K // if row-wise and left, K x M // if row-wise and right, K x N size = ldv*nv; lapackf77_slarnv( &ione, ISEED, &size, V ); if ( storev[istor] == MagmaColumnwise ) { if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_slaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_slaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_sprint( ldv, nv, V, ldv ); // T is K x K, upper triangular for forward, and lower triangular for backward magma_int_t k1 = K-1; size = ldt*K; lapackf77_slarnv( &ione, ISEED, &size, T ); if ( direct[idir] == MagmaForward ) { lapackf77_slaset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_slaset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_sprint( K, K, T, ldt ); magma_ssetmatrix( M, N, C, ldc, dC, ldc ); magma_ssetmatrix( ldv, nv, V, ldv, dV, ldv ); magma_ssetmatrix( K, K, T, ldt, dT, ldt ); lapackf77_slarfb( &side[iside], &trans[itran], &direct[idir], &storev[istor], &M, &N, &K, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_sprint( M, N, C, ldc ); magma_slarfb_gpu( side[iside], trans[itran], direct[idir], storev[istor], M, N, K, dV, ldv, dT, ldt, dC, ldc, dW, ldw ); magma_sgetmatrix( M, N, dC, ldc, R, ldc ); //printf( "dHC=" ); magma_sprint( M, N, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| error = lapackf77_slange( "Fro", &M, &N, C, &ldc, work ); size = ldc*N; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_slange( "Fro", &M, &N, R, &ldc, work ) / error; printf( "%5d %5d %5d %c %c %c %c %8.2e\n", (int) M, (int) N, (int) K, storev[istor], side[iside], direct[idir], trans[itran], error ); TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( V ); TESTING_FREE( T ); TESTING_FREE( W ); TESTING_DEVFREE( dC ); TESTING_DEVFREE( dV ); TESTING_DEVFREE( dT ); TESTING_DEVFREE( dW ); }}}} printf( "\n" ); } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlat2s and slat2d */ int main( int argc, char** argv ) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define SA(i_,j_) (SA + (i_) + (j_)*lda) TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; float serror, swork[1]; double c_neg_one = MAGMA_D_NEG_ONE; float s_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t n, lda, ldda, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; float *SA, *SR; double *A, *R; float *dSA; double *dA; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; printf("func uplo N CPU GB/s (ms) GPU GB/s (ms) ||R||_F\n"); printf("=====================================================================\n"); for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { n = opts.nsize[itest]; lda = n; ldda = ((n+31)/32)*32; // 0.5*(n+1)*n double-real loads and 0.5*(n+1)*n single-real stores (and vice-versa for slat2d) gbytes = (real_Double_t) 0.5*(n+1)*n * (sizeof(double) + sizeof(float)) / 1e9; size = ldda*n; // ldda >= lda TESTING_MALLOC_CPU( SA, float, size ); TESTING_MALLOC_CPU( A, double, size ); TESTING_MALLOC_CPU( SR, float, size ); TESTING_MALLOC_CPU( R, double, size ); TESTING_MALLOC_DEV( dSA, float, size ); TESTING_MALLOC_DEV( dA, double, size ); lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); /* ===================================================================== Performs operation using LAPACK dlat2s =================================================================== */ info = 0; cpu_time = magma_wtime(); lapackf77_dlat2s( lapack_uplo_const(uplo[iuplo]), &n, A, &lda, SA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_dlat2s returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA dlat2s =================================================================== */ gpu_time = magma_sync_wtime(0); magmablas_dlat2s( uplo[iuplo], n, dA, ldda, dSA, ldda, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_dlat2s returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( n, n, dSA, ldda, SR, lda ); if ( opts.verbose ) { printf( "A= " ); magma_dprint( n, n, A, lda ); printf( "SA= " ); magma_sprint( n, n, SA, lda ); printf( "dA= " ); magma_dprint_gpu( n, n, dA, ldda ); printf( "dSA=" ); magma_sprint_gpu( n, n, dSA, ldda ); } /* ===================================================================== compute error |SA_magma - SA_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_saxpy( &size, &s_neg_one, SA, &ione, SR, &ione ); serror = lapackf77_slange( "Fro", &n, &n, SR, &lda, swork ); printf( "dlat2s %5s %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", lapack_uplo_const(uplo[iuplo]), (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., serror, (serror == 0 ? "ok" : "failed") ); status += ! (serror == 0); /* ===================================================================== Reset matrices =================================================================== */ lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); /* ===================================================================== Performs operation using LAPACK slat2d LAPACK doesn't implement slat2d; use our own simple implementation. =================================================================== */ cpu_time = magma_wtime(); if ( uplo[iuplo] == MagmaLower ) { for( int j=0; j < n; ++j ) { for( int i=j; i < n; ++i ) { *A(i,j) = MAGMA_D_MAKE( real(*SA(i,j)), imag(*SA(i,j)) ); } } } else { // upper for( int j=0; j < n; ++j ) { for( int i=0; i <= j; ++i ) { *A(i,j) = MAGMA_D_MAKE( real(*SA(i,j)), imag(*SA(i,j)) ); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_slat2d returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA slat2d =================================================================== */ magma_ssetmatrix( n, n, SA, lda, dSA, ldda ); gpu_time = magma_sync_wtime(0); magmablas_slat2d( uplo[iuplo], n, dSA, ldda, dA, ldda, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_slat2d returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix( n, n, dA, ldda, R, lda ); if ( opts.verbose ) { printf( "A= " ); magma_dprint( n, n, A, lda ); printf( "SA= " ); magma_sprint( n, n, SA, lda ); printf( "dA= " ); magma_dprint_gpu( n, n, dA, ldda ); printf( "dSA=" ); magma_sprint_gpu( n, n, dSA, ldda ); } /* ===================================================================== compute error |A_magma - A_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_daxpy( &size, &c_neg_one, A, &ione, R, &ione ); error = lapackf77_dlange( "Fro", &n, &n, R, &lda, work ); printf( "slat2d %5s %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", lapack_uplo_const(uplo[iuplo]), (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error == 0 ? "ok" : "failed") ); status += ! (error == 0); TESTING_FREE_CPU( SA ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( SR ); TESTING_FREE_CPU( R ); TESTING_FREE_DEV( dSA ); TESTING_FREE_DEV( dA ); printf( "\n" ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; float magma_error, dev_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y, *Ydev, *Ymagma; magmaFloat_ptr dA, dX, dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); #ifdef HAVE_CUBLAS printf(" M N MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else printf(" M N %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_SGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, float, sizeA ); TESTING_MALLOC_CPU( X, float, sizeX ); TESTING_MALLOC_CPU( Y, float, sizeY ); TESTING_MALLOC_CPU( Ydev, float, sizeY ); TESTING_MALLOC_CPU( Ymagma, float, sizeY ); TESTING_MALLOC_DEV( dA, float, sizeA ); TESTING_MALLOC_DEV( dX, float, sizeX ); TESTING_MALLOC_DEV( dY, float, sizeY ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &sizeA, A ); lapackf77_slarnv( &ione, ISEED, &sizeX, X ); lapackf77_slarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( M, N, A, lda, dA, 0, lda, opts.queue ); magma_ssetvector( Xm, X, incx, dX, 0, incx, opts.queue ); magma_ssetvector( Ym, Y, incy, dY, 0, incy, opts.queue ); #ifdef HAVE_CUBLAS dev_time = magma_sync_wtime( 0 ); cublasSgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); dev_time = magma_sync_wtime( 0 ) - dev_time; #else dev_time = magma_sync_wtime( opts.queue ); magma_sgemv( opts.transA, M, N, alpha, dA, 0, lda, dX, 0, incx, beta, dY, 0, incy, opts.queue ); dev_time = magma_sync_wtime( opts.queue ) - dev_time; #endif dev_perf = gflops / dev_time; magma_sgetvector( Ym, dY, 0, incy, Ydev, incy, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_ssetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_sgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_sgetvector( Ym, dY, incy, Ymagma, incy ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_sgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ float Anorm = lapackf77_slange( "F", &M, &N, A, &lda, work ); float Xnorm = lapackf77_slange( "F", &Xm, &ione, X, &Xm, work ); blasf77_saxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy ); dev_error = lapackf77_slange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm); #ifdef HAVE_CUBLAS blasf77_saxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_slange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (magma_error < tol && dev_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && dev_error < tol); #else printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (dev_error < tol ? "ok" : "failed")); status += ! (dev_error < tol); #endif TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ydev ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssygst */ int main( int argc, char** argv) { TESTING_INIT(); // Constants const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; // Local variables real_Double_t gpu_time, cpu_time; float *h_A, *h_B, *h_R; magmaFloat_ptr d_A, d_B; float Anorm, error, work[1]; magma_int_t N, n2, lda, ldda, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% itype N CPU time (sec) GPU time (sec) |R| \n"); printf("%%=======================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( lda, opts.align ); n2 = N*lda; TESTING_MALLOC_CPU( h_A, float, lda*N ); TESTING_MALLOC_CPU( h_B, float, lda*N ); TESTING_MALLOC_PIN( h_R, float, lda*N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, ldda*N ); /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slarnv( &ione, ISEED, &n2, h_B ); magma_smake_symmetric( N, h_A, lda ); magma_smake_hpd( N, h_B, lda ); magma_spotrf( opts.uplo, N, h_B, lda, &info ); if (info != 0) { printf("magma_spotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_ssetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_ssetmatrix( N, N, h_B, lda, d_B, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_ssygst_gpu( opts.itype, opts.uplo, N, d_A, ldda, d_B, ldda, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_ssygst_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_ssygst( &opts.itype, lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_B, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) { printf("lapackf77_ssygst returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_sgetmatrix( N, N, d_A, ldda, h_R, lda, opts.queue ); blasf77_saxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); Anorm = safe_lapackf77_slansy("f", lapack_uplo_const(opts.uplo), &N, h_A, &lda, work ); error = safe_lapackf77_slansy("f", lapack_uplo_const(opts.uplo), &N, h_R, &lda, work ) / Anorm; bool okay = (error < tol); status += ! okay; printf("%3d %5d %7.2f %7.2f %8.2e %s\n", (int) opts.itype, (int) N, cpu_time, gpu_time, error, (okay ? "ok" : "failed")); } else { printf("%3d %5d --- %7.2f\n", (int) opts.itype, (int) N, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { #if (GPUSHMEM >= 200) TESTING_INIT(); magma_setdevice(0); magma_timestr_t start, end; float flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t n_local[4]; FILE *fp ; magma_int_t N, m, i, j, lda, LDA, M; magma_int_t matsize; magma_int_t vecsize; magma_int_t istart = 64; magma_int_t incx = 1; char uplo = MagmaLower; float alpha = MAGMA_S_MAKE(1., 0.); // MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE(0., 0.); // MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y[4], *Ycublas, *Ymagma; float *dA, *dX[4], *dY[4], *d_lA[4], *dYcublas ; magma_queue_t stream[4][10]; float *C_work; float *dC_work[4]; int max_num_gpus; magma_int_t num_gpus = 1, nb; magma_int_t blocks, workspace; magma_int_t offset = 0; M = 0; N = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); istart = N; } else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); else if (strcmp("-offset", argv[i])==0) offset = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) { printf(" testing_ssymv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); printf(" in %c side \n", uplo); } else { printf("\nUsage: \n"); printf(" testing_ssymv_mgpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { #if defined(PRECISION_z) M = N = 8000; #else M = N = 12480; #endif num_gpus = 2; offset = 0; printf("\nUsage: \n"); printf(" testing_ssymv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); } /////////////////////////////////////////////////////////////////////////////////////// cudaGetDeviceCount(&max_num_gpus); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); for(int i=0; i< num_gpus; i++) { magma_queue_create(&stream[i][0]); } LDA = ((N+31)/32)*32; matsize = N*LDA; vecsize = N*incx; nb = 32; //nb = 64; printf("block size = %d\n", (int) nb); TESTING_MALLOC( A, float, matsize ); TESTING_MALLOC( X, float, vecsize ); TESTING_MALLOC( Ycublas, float, vecsize ); TESTING_MALLOC( Ymagma, float, vecsize ); for(i=0; i<num_gpus; i++) { TESTING_MALLOC( Y[i], float, vecsize ); } magma_setdevice(0); TESTING_DEVALLOC( dA, float, matsize ); TESTING_DEVALLOC( dYcublas, float, vecsize ); for(i=0; i<num_gpus; i++) { n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; magma_setdevice(i); TESTING_DEVALLOC( d_lA[i], float, LDA*n_local[i] );// potentially bugged TESTING_DEVALLOC( dX[i], float, vecsize ); TESTING_DEVALLOC( dY[i], float, vecsize ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } magma_setdevice(0); /////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &matsize, A ); /* Make A symmetric */ { magma_int_t i, j; for(i=0; i<N; i++) { A[i*LDA+i] = MAGMA_S_MAKE( MAGMA_S_REAL(A[i*LDA+i]), 0. ); for(j=0; j<i; j++) A[i*LDA+j] = (A[j*LDA+i]); } } blocks = N / nb + (N % nb != 0); workspace = LDA * (blocks + 1); TESTING_MALLOC( C_work, float, workspace ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); TESTING_DEVALLOC( dC_work[i], float, workspace ); //fillZero(dC_work[i], workspace); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////////////////////////// fp = fopen ("results_ssymv_mgpu.csv", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("SSYMV float precision\n\n"); printf( " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); // for( offset = 0; offset< N; offset ++ ) for(int size = istart ; size <= N ; size += 128) { // printf("offset = %d ", offset); m = size ; // m = N; // lda = ((m+31)/32)*32;// lda = LDA; flops = FLOPS( (float)m ) / 1e6; printf( "N %5d ", (int) m ); fprintf( fp, "%5d, ", (int) m ); vecsize = m * incx; lapackf77_slarnv( &ione, ISEED, &vecsize, X ); lapackf77_slarnv( &ione, ISEED, &vecsize, Y[0] ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_setdevice(0); magma_ssetmatrix_1D_col_bcyclic(m, m, A, LDA, d_lA, lda, num_gpus, nb); magma_setdevice(0); magma_ssetmatrix( m, m, A, LDA, dA, lda ); magma_ssetvector( m, Y[0], incx, dYcublas, incx ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); magma_ssetvector( m, X, incx, dX[i], incx ); magma_ssetvector( m, Y[0], incx, dY[i], incx ); blocks = m / nb + (m % nb != 0); magma_ssetmatrix( lda, blocks, C_work, LDA, dC_work[i], lda ); } magma_setdevice(0); start = get_current_time(); cublasSsymv( uplo, m-offset, alpha, dA + offset + offset * lda, lda, dX[0] + offset, incx, beta, dYcublas + offset, incx ); end = get_current_time(); magma_sgetvector( m, dYcublas, incx, Ycublas, incx ); cuda_perf = flops / GetTimerValue(start,end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f,", cuda_perf ); magma_setdevice(0); start = get_current_time(); if(nb == 32) { magmablas_ssymv2_mgpu_32_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, workspace, num_gpus, nb, offset); } else // nb = 64 { magmablas_ssymv2_mgpu_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, workspace, num_gpus, nb, offset); } for(i=1; i<num_gpus; i++) { magma_setdevice(i); cudaDeviceSynchronize(); } end = get_current_time(); magma_perf = flops / GetTimerValue(start,end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f,", magma_perf ); for(i=0; i<num_gpus; i++) { magma_setdevice(i); magma_sgetvector( m, dY[i], incx, Y[i], incx ); } magma_setdevice(0); #ifdef validate for( j= offset;j<m;j++) { for(i=1; i<num_gpus; i++) { // printf("Y[%d][%d] = %15.14f\n", i, j, Y[i][j].x); #if defined(PRECISION_z) || defined(PRECISION_c) Y[0][j].x = Y[0][j].x + Y[i][j].x; Y[0][j].y = Y[0][j].y + Y[i][j].y; #else Y[0][j] = Y[0][j] + Y[i][j]; #endif } } /* #if defined(PRECISION_z) || defined(PRECISION_c) for( j=offset;j<m;j++) { if(Y[0][j].x != Ycublas[j].x) { printf("Y-multi[%d] = %f, %f\n", j, Y[0][j].x, Y[0][j].y ); printf("Ycublas[%d] = %f, %f\n", j, Ycublas[j].x, Ycublas[j].y); } } #else for( j=offset;j<m;j++) { if(Y[0][j] != Ycublas[j]) { printf("Y-multi[%d] = %f\n", j, Y[0][j] ); printf("Ycublas[%d] = %f\n", j, Ycublas[j]); } } #endif */ /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ magma_int_t nw = m - offset ; blasf77_saxpy( &nw, &c_neg_one, Y[0] + offset, &incx, Ycublas + offset, &incx); error = lapackf77_slange( "M", &nw, &ione, Ycublas + offset, &nw, work ); #if 0 printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); /* * Extra check with cblas vs magma */ cblas_scopy( m, Y, incx, Ycublas, incx ); cblas_ssymv( CblasColMajor, CblasLower, m, (alpha), A, LDA, X, incx, (beta), Ycublas, incx ); blasf77_saxpy( &m, &c_neg_one, Ymagma, &incx, Ycublas, &incx); error = lapackf77_slange( "M", &m, &ione, Ycublas, &m, work ); #endif printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); #endif printf("\n"); fprintf(fp, "\n"); } fclose( fp ) ; /* Free Memory */ TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_FREE( C_work ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dYcublas ); for(i=0; i<num_gpus; i++) { TESTING_FREE( Y[i] ); magma_setdevice(i); TESTING_DEVFREE( d_lA[i] ); TESTING_DEVFREE( dX[i] ); TESTING_DEVFREE( dY[i] ); TESTING_DEVFREE( dC_work[i] ); } magma_setdevice(0); /////////////////////////////////////////////////////////// /* Free device */ TESTING_FINALIZE(); #endif return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeadd */ int main( int argc, char** argv) { #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define h_B(i_, j_) (h_B + (i_) + (j_)*lda) // B uses lda TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Bnorm, error, work[1]; float *h_A, *h_B, *d_A, *d_B; float alpha = MAGMA_S_MAKE( 3.1415, 2.71828 ); float beta = MAGMA_S_MAKE( 6.0221, 6.67408 ); float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); /* Uncomment these lines to check parameters. * magma_xerbla calls lapack's xerbla to print out error. */ //magmablas_sgeadd( -1, N, alpha, d_A, ldda, d_B, ldda, opts.queue ); //magmablas_sgeadd( M, -1, alpha, d_A, ldda, d_B, ldda, opts.queue ); //magmablas_sgeadd( M, N, alpha, d_A, M-1, d_B, ldda, opts.queue ); //magmablas_sgeadd( M, N, alpha, d_A, ldda, d_B, N-1, opts.queue ); printf("%% M N CPU Gflop/s (ms) GPU Gflop/s (ms) |Bl-Bm|/|Bl|\n"); printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default size = lda*N; gflops = 2.*M*N / 1e9; TESTING_MALLOC_CPU( h_A, float, lda *N ); TESTING_MALLOC_CPU( h_B, float, lda *N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, ldda*N ); lapackf77_slarnv( &ione, ISEED, &size, h_A ); lapackf77_slarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda, opts.queue ); magma_ssetmatrix( M, N, h_B, lda, d_B, ldda, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); if ( opts.version == 1 ) { magmablas_sgeadd( M, N, alpha, d_A, ldda, d_B, ldda, opts.queue ); } else { magmablas_sgeadd2( M, N, alpha, d_A, ldda, beta, d_B, ldda, opts.queue ); } gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if ( opts.version == 1 ) { for( int j = 0; j < N; ++j ) { blasf77_saxpy( &M, &alpha, &h_A[j*lda], &ione, &h_B[j*lda], &ione ); } } else { for( int j = 0; j < N; ++j ) { // daxpby for( int i=0; i < M; ++i ) { *h_B(i,j) = alpha * (*h_A(i,j)) + beta * (*h_B(i,j)); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check result =================================================================== */ magma_sgetmatrix( M, N, d_B, ldda, h_A, lda, opts.queue ); blasf77_saxpy( &size, &c_neg_one, h_B, &ione, h_A, &ione ); Bnorm = lapackf77_slange( "F", &M, &N, h_B, &lda, work ); error = lapackf77_slange( "F", &M, &N, h_A, &lda, work ) / Bnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); const float d_neg_one = MAGMA_D_NEG_ONE; const float d_one = MAGMA_D_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float Anorm, error=0, error2=0; float *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloat_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol = opts.tolerance * lapackf77_slamch("E"); printf( "version %d\n", (int) opts.version ); if ( opts.version == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |R - Q^H*A| |I - Q^H*Q|\n"); printf("===============================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |b - A*x|\n"); printf("================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_work, float, lwork ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { // LAPACK complaint arguments magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info ); } else { nb = magma_get_sgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, float, size ); if ( opts.version == 1 ) { // stores dT, V blocks have zeros, R blocks inverted & stored in dT magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // stores dT, V blocks have zeros, R blocks stored in dT magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); exit(1); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check && opts.version == 2 ) { /* ===================================================================== Check the result, following zqrt01 except using the reduced Q. This works for any M,N (square, tall, wide). Only for version 2, which has LAPACK complaint output. =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, lda ); magma_int_t ldq = M; magma_int_t ldr = min_mn; float *Q, *R; float *work; TESTING_MALLOC_CPU( Q, float, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, float, ldr*N ); // K by N TESTING_MALLOC_CPU( work, float, min_mn ); // generate M by K matrix Q, where K = min(M,N) lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy K by N matrix R lapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr ); lapackf77_slacpy( "Upper", &min_mn, &N, h_R, &lda, R, &ldr ); // error = || R - Q^H*A || / (N * ||A||) blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr ); Anorm = lapackf77_slange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_slange( "1", &min_mn, &N, R, &ldr, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set R = I (K by K identity), then R = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr ); blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr ); error2 = lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; } else if ( opts.check && M >= N ) { /* ===================================================================== Check the result by solving consistent linear system, A*x = b. Only for versions 1 & 3 with M >= N. =================================================================== */ magma_int_t lwork; float *x, *b, *hwork; magmaFloat_ptr d_B; const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, float, N ); TESTING_MALLOC_CPU( b, float, M ); lapackf77_slarnv( &ione, ISEED, &N, x ); blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, float, M ); magma_ssetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_sgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lwork ); // solve linear system magma_sgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_sgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_sgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lwork ); // solve linear system magma_sgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_sgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); exit(1); } magma_sgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) float norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check ) { if ( opts.version == 2 ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else if ( M >= N ) { bool okay = (error < tol); status += ! okay; printf( "%10.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf( "(error check only for M >= N)\n" ); } } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgegqr */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float e, e1, e2, e3, e4, e5, *work; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; float *h_A, *h_R, *tau, *dtau, *h_work, *h_rwork, tmp[1]; magmaFloat_ptr d_A, dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1, ldwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // versions 1...4 are valid if (opts.version < 1 || opts.version > 4) { printf("Unknown version %d; exiting\n", (int) opts.version ); return -1; } float tol = 10. * opts.tolerance * lapackf77_slamch("E"); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||I-Q'Q||_F / M ||I-Q'Q||_I / M ||A-Q R||_I\n"); printf(" MAGMA / LAPACK MAGMA / LAPACK\n"); printf("==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because sgegqr requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because sgegqr requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGEQRF( M, N ) / 1e9 + FLOPS_SORGQR( M, N, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lwork = max(lwork, 3*N*N); ldwork = N*N; if (opts.version == 2) { ldwork = 3*N*N + min_mn + 2; } TESTING_MALLOC_PIN( tau, float, min_mn ); TESTING_MALLOC_PIN( h_work, float, lwork ); TESTING_MALLOC_PIN(h_rwork, float, lwork ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_R, float, n2 ); TESTING_MALLOC_CPU( work, float, M ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( dtau, float, min_mn ); TESTING_MALLOC_DEV( dwork, float, ldwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup if ( opts.warmup ) { magma_sgegqr_gpu( 1, M, N, d_A, ldda, dwork, h_work, &info ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_sgegqr_gpu( opts.version, M, N, d_A, ldda, dwork, h_rwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgegqr returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( M, N, d_A, ldda, h_R, M ); // Regenerate R // blasf77_sgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_A, &M, &c_zero, h_rwork, &N); // magma_sprint(N, N, h_work, N); blasf77_strmm("r", "u", "n", "n", &M, &N, &c_one, h_rwork, &N, h_R, &M); blasf77_saxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); e5 = lapackf77_slange("i", &M, &N, h_R, &M, work) / lapackf77_slange("i", &M, &N, h_A, &lda, work); magma_sgetmatrix( M, N, d_A, ldda, h_R, M ); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); /* Orthogonalize on the CPU */ lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_sorgqr(&M, &N, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sorgqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_sgemm("c", "n", &N, &N, &M, &c_one, h_R, &M, h_R, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_S_SUB(h_work[ii], c_one); } e1 = lapackf77_slange("f", &N, &N, h_work, &N, work) / N; e3 = lapackf77_slange("i", &N, &N, h_work, &N, work) / N; blasf77_sgemm("c", "n", &N, &N, &M, &c_one, h_A, &M, h_A, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_S_SUB(h_work[ii], c_one); } e2 = lapackf77_slange("f", &N, &N, h_work, &N, work) / N; e4 = lapackf77_slange("i", &N, &N, h_work, &N, work) / N; if (opts.version != 4) e = e1; else e = e1 / (10.*max(M,N)); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e / %8.2e %8.2e / %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, e1, e2, e3, e4, e5, (e < tol ? "ok" : "failed")); status += ! (e < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_rwork ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssygvdx */ int main( int argc, char** argv) { TESTING_INIT(); /* Constants */ const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; /* Local variables */ real_Double_t gpu_time; float *h_A, *h_R, *h_B, *h_S, *h_work; #ifdef COMPLEX float *rwork; magma_int_t lrwork; #endif float *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lda, lwork, liwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% itype = %d, jobz = %s, range = %s, uplo = %s, fraction = %6.4f, ngpu = %d\n", int(opts.itype), lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), opts.fraction, int(abs_ngpu) ); if (opts.itype == 1) { printf("%% N M GPU Time (sec) |AZ-BZD| |D - D_magma|\n"); } else if (opts.itype == 2) { printf("%% N M GPU Time (sec) |ABZ-ZD| |D - D_magma|\n"); } else if (opts.itype == 3) { printf("%% N M GPU Time (sec) |BAZ-ZD| |D - D_magma|\n"); } printf("%%======================================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; // TODO: test vl-vu range magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (opts.fraction == 0) { il = max( 1, magma_int_t(0.1*N) ); iu = max( 1, magma_int_t(0.3*N) ); } else { il = 1; iu = max( 1, magma_int_t(opts.fraction*N) ); } magma_ssyevdx_getworksize(N, threads, (opts.jobz == MagmaVec), &lwork, #ifdef COMPLEX &lrwork, #endif &liwork); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_B, float, n2 ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_PIN( h_S, float, n2 ); TESTING_MALLOC_PIN( h_work, float, max( lwork, N*N )); // check needs N*N #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, float, lrwork); #endif /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slarnv( &ione, ISEED, &n2, h_B ); magma_smake_hpd( N, h_B, lda ); magma_smake_symmetric( N, h_A, lda ); lapackf77_slacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_slacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); // =================================================================== // Performs operation using MAGMA // =================================================================== gpu_time = magma_wtime(); if (opts.ngpu == 1) { magma_ssygvdx_2stage( opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } else { magma_ssygvdx_2stage_m( abs_ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_ssygvdx_2stage returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A| |Z| N ) (itype = 1) | A B Z - Z D | / ( |A| |Z| N ) (itype = 2) | B A Z - Z D | / ( |A| |Z| N ) (itype = 3) (2) | D(with V, magma) - D(w/o V, lapack) | / | D | =================================================================== */ #ifdef REAL float *rwork = h_work + N*N; #endif if ( opts.jobz != MagmaNoVec ) { result[0] = 1.; result[0] /= safe_lapackf77_slansy("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, rwork); result[0] /= lapackf77_slange("1", &N, &m1, h_R, &lda, rwork); if (opts.itype == 1) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_sscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_slange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_sscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_slange("1", &N, &m1, h_R, &lda, rwork)/N; } else if (opts.itype == 3) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_sscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_slange("1", &N, &m1, h_R, &lda, rwork)/N; } } lapackf77_slacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_slacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); lapackf77_ssygvd( &opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &lda, h_S, &lda, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info ); if (info != 0) { printf("lapackf77_ssygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); } float maxw=0, diff=0; for (int j=0; j < m1; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m1*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %9.4f ", (int) N, (int) m1, gpu_time); if ( opts.check ) { bool okay = (result[1] < tolulp); if ( opts.jobz != MagmaNoVec ) { okay = okay && (result[0] < tol); printf(" %8.2e", result[0] ); } else { printf(" --- "); } printf(" %8.2e %s\n", result[1], (okay ? "ok" : "failed")); status += ! okay; } else { printf(" ---\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #ifdef COMPLEX TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeev */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; float *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; float *w1i, *w2i; magmaFloatComplex *w1copy, *w2copy; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float tnrm, result[9]; magma_int_t N, n2, lda, nb, lwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float ulp, ulpinv, error; magma_int_t status = 0; ulp = lapackf77_slamch( "P" ); ulpinv = 1./ulp; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); // enable at least some minimal checks, if requested if ( opts.check && !opts.lapack && opts.jobvl == MagmaNoVec && opts.jobvr == MagmaNoVec ) { fprintf( stderr, "NOTE: Some checks require vectors to be computed;\n" " set jobvl=V (option -LV), or jobvr=V (option -RV), or both.\n" " Some checks require running lapack (-l); setting lapack.\n\n"); opts.lapack = true; } printf(" N CPU Time (sec) GPU Time (sec) |W_magma - W_lapack| / |W_lapack|\n"); printf("===========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; nb = magma_get_sgehrd_nb(N); lwork = N*(2 + nb); // generous workspace - required by sget22 lwork = max( lwork, N*(5 + 2*N) ); TESTING_MALLOC_CPU( w1copy, magmaFloatComplex, N ); TESTING_MALLOC_CPU( w2copy, magmaFloatComplex, N ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( w1i, float, N ); TESTING_MALLOC_CPU( w2i, float, N ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_PIN( VL, float, n2 ); TESTING_MALLOC_PIN( VR, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgeev( opts.jobvl, opts.jobvr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_sgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { /* =================================================================== * Check the result following LAPACK's [zcds]drvev routine. * The following tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (3) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * transpose of A, and W is as above. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. * * (5) W(full) = W(partial, W only) -- currently skipped * (6) W(full) = W(partial, W and VR) * (7) W(full) = W(partial, W and VL) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. * * (8) VR(full) = VR(partial, W and VR) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (9) VL(full) = VL(partial, W and VL) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. * * (1, 2) only if jobvr = V * (3, 4) only if jobvl = V * (5-9) only if check = 2 (option -c2) ================================================================= */ float vmx, vrmx, vtst; // Initialize result. -1 indicates test was not run. for( int j = 0; j < 9; ++j ) result[j] = -1.; if ( opts.jobvr == MagmaVec ) { // Do test 1: | A * VR - VR * W | / ( n |A| ) // Note this writes result[1] also lapackf77_sget22( MagmaNoTransStr, MagmaNoTransStr, MagmaNoTransStr, &N, h_A, &lda, VR, &lda, w1, w1i, h_work, &result[0] ); result[0] *= ulp; // Do test 2: | |VR(i)| - 1 | and whether largest component real result[1] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_snrm2(N, &VR[j*lda], ione); else if (w1i[j] > 0.) tnrm = magma_slapy2( cblas_snrm2(N, &VR[j *lda], ione), cblas_snrm2(N, &VR[(j+1)*lda], ione) ); result[1] = max( result[1], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_slapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && MAGMA_S_ABS( VR[jj+j*lda] ) > vrmx) { vrmx = MAGMA_S_ABS( VR[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[1] = ulpinv; } } result[1] *= ulp; } if ( opts.jobvl == MagmaVec ) { // Do test 3: | A**T * VL - VL * W**T | / ( n |A| ) // Note this writes result[3] also lapackf77_sget22( MagmaTransStr, MagmaNoTransStr, MagmaTransStr, &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[2] ); result[2] *= ulp; // Do test 4: | |VL(i)| - 1 | and whether largest component real result[3] = -1.; for( int j = 0; j < N; ++j ) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_snrm2(N, &VL[j*lda], ione); else if (w1i[j] > 0.) tnrm = magma_slapy2( cblas_snrm2(N, &VL[j *lda], ione), cblas_snrm2(N, &VL[(j+1)*lda], ione) ); result[3] = max( result[3], min( ulpinv, MAGMA_S_ABS(tnrm-1.)/ulp )); if (w1i[j] > 0.) { vmx = vrmx = 0.; for( int jj = 0; jj < N; ++jj ) { vtst = magma_slapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && MAGMA_S_ABS( VL[jj+j*lda]) > vrmx) { vrmx = MAGMA_S_ABS( VL[jj+j*lda] ); } } if (vrmx / vmx < 1. - ulp*2.) result[3] = ulpinv; } } result[3] *= ulp; } } if ( opts.check == 2 ) { // more extensive tests // this is really slow because it calls magma_zgeev multiple times float *LRE, DUM; TESTING_MALLOC_PIN( LRE, float, n2 ); lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); // ---------- // Compute eigenvalues, left and right eigenvectors magma_sgeev( MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info ); if (info != 0) printf("magma_zgeev (case V, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // ---------- // Compute eigenvalues only // These are not exactly equal, and not in the same order, so skip for now. //lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); //magma_sgeev( MagmaNoVec, MagmaNoVec, // N, h_R, lda, w2, w2i, // &DUM, 1, &DUM, 1, // h_work, lwork, &info ); //if (info != 0) // printf("magma_sgeev (case N, N) returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //// Do test 5: W(full) = W(partial, W only) //result[4] = 1; //for( int j = 0; j < N; ++j ) // if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) // result[4] = 0; // ---------- // Compute eigenvalues and right eigenvectors lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_sgeev( MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, &info ); if (info != 0) printf("magma_sgeev (case N, V) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 6: W(full) = W(partial, W and VR) result[5] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[5] = 0; // Do test 8: VR(full) = VR(partial, W and VR) result[7] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_S_EQUAL( VR[j+jj*lda], LRE[j+jj*lda] )) result[7] = 0; // ---------- // Compute eigenvalues and left eigenvectors lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_sgeev( MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, &info ); if (info != 0) printf("magma_sgeev (case V, N) returned error %d: %s.\n", (int) info, magma_strerror( info )); // Do test 7: W(full) = W(partial, W and VL) result[6] = 1; for( int j = 0; j < N; ++j ) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[6] = 0; // Do test 9: VL(full) = VL(partial, W and VL) result[8] = 1; for( int j = 0; j < N; ++j ) for( int jj = 0; jj < N; ++jj ) if ( ! MAGMA_S_EQUAL( VL[j+jj*lda], LRE[j+jj*lda] )) result[8] = 0; TESTING_FREE_PIN( LRE ); } /* ===================================================================== Performs operation using LAPACK Do this after checks, because it overwrites VL and VR. =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgeev( lapack_vec_const(opts.jobvl), lapack_vec_const(opts.jobvr), &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_sgeev returned error %d: %s.\n", (int) info, magma_strerror( info )); // check | W_magma - W_lapack | / | W | // need to sort eigenvalues first // copy them into complex vectors for ease for( int j=0; j < N; ++j ) { w1copy[j] = MAGMA_C_MAKE( w1[j], w1i[j] ); w2copy[j] = MAGMA_C_MAKE( w2[j], w2i[j] ); } std::sort( w1copy, &w1copy[N], lessthan ); std::sort( w2copy, &w2copy[N], lessthan ); // adjust sorting to deal with numerical inaccuracy // search down w2 for eigenvalue that matches w1's eigenvalue for( int j=0; j < N; ++j ) { for( int j2=j; j2 < N; ++j2 ) { magmaFloatComplex diff = MAGMA_C_SUB( w1copy[j], w2copy[j2] ); float diff2 = magma_szlapy2( diff ) / max( magma_szlapy2( w1copy[j] ), tol ); if ( diff2 < 100*tol ) { if ( j != j2 ) { std::swap( w2copy[j], w2copy[j2] ); } break; } } } blasf77_caxpy( &N, &c_neg_one, w2copy, &ione, w1copy, &ione ); error = cblas_scnrm2( N, w1copy, 1 ); error /= cblas_scnrm2( N, w2copy, 1 ); printf("%5d %7.2f %7.2f %8.2e %s\n", (int) N, cpu_time, gpu_time, error, (error < tolulp ? "ok" : "failed")); status += ! (error < tolulp); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } if ( opts.check ) { // -1 indicates test was not run if ( result[0] != -1 ) { printf(" | A * VR - VR * W | / ( n |A| ) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } if ( result[1] != -1 ) { printf(" | |VR(i)| - 1 | = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed")); } if ( result[2] != -1 ) { printf(" | A'* VL - VL * W'| / ( n |A| ) = %8.2e %s\n", result[2], (result[2] < tol ? "ok" : "failed")); } if ( result[3] != -1 ) { printf(" | |VL(i)| - 1 | = %8.2e %s\n", result[3], (result[3] < tol ? "ok" : "failed")); } if ( result[4] != -1 ) { printf(" W (full) == W (partial, W only) %s\n", (result[4] == 1. ? "ok" : "failed")); } if ( result[5] != -1 ) { printf(" W (full) == W (partial, W and VR) %s\n", (result[5] == 1. ? "ok" : "failed")); } if ( result[6] != -1 ) { printf(" W (full) == W (partial, W and VL) %s\n", (result[6] == 1. ? "ok" : "failed")); } if ( result[7] != -1 ) { printf(" VR (full) == VR (partial, W and VR) %s\n", (result[7] == 1. ? "ok" : "failed")); } if ( result[8] != -1 ) { printf(" VL (full) == VL (partial, W and VL) %s\n", (result[8] == 1. ? "ok" : "failed")); } int newline = 0; if ( result[0] != -1 ) { status += ! (result[0] < tol); newline = 1; } if ( result[1] != -1 ) { status += ! (result[1] < tol); newline = 1; } if ( result[2] != -1 ) { status += ! (result[2] < tol); newline = 1; } if ( result[3] != -1 ) { status += ! (result[3] < tol); newline = 1; } if ( result[4] != -1 ) { status += ! (result[4] == 1.); newline = 1; } if ( result[5] != -1 ) { status += ! (result[5] == 1.); newline = 1; } if ( result[6] != -1 ) { status += ! (result[6] == 1.); newline = 1; } if ( result[7] != -1 ) { status += ! (result[7] == 1.); newline = 1; } if ( result[8] != -1 ) { status += ! (result[8] == 1.); newline = 1; } if ( newline ) { printf( "\n" ); } } TESTING_FREE_CPU( w1copy ); TESTING_FREE_CPU( w2copy ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgesv */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, lerror, Rnorm, Anorm, Xnorm, *work; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_LU, *h_B, *h_B0, *h_X; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; printf("%% ngpu %d\n", (int) opts.ngpu ); if (opts.lapack) { printf("%% N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) ||B - AX|| / N*||A||*||X|| ||B - AX|| / N*||A||*||X||_CPU\n"); printf("%%================================================================================================================\n"); } else { printf("%% N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("%%===============================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; gflops = ( FLOPS_SGETRF( N, N ) + FLOPS_SGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, float, lda*N ); TESTING_MALLOC_CPU( h_LU, float, lda*N ); TESTING_MALLOC_CPU( h_B0, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_B, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, float, ldb*nrhs ); TESTING_MALLOC_CPU( work, float, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeB, h_B ); // copy A to LU and B to X; save A and B for residual lapackf77_slacpy( "F", &N, &N, h_A, &lda, h_LU, &lda ); lapackf77_slacpy( "F", &N, &nrhs, h_B, &ldb, h_X, &ldb ); lapackf77_slacpy( "F", &N, &nrhs, h_B, &ldb, h_B0, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgesv( N, nrhs, h_LU, lda, ipiv, h_X, ldb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } //===================================================================== // Residual //===================================================================== Anorm = lapackf77_slange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_slange("I", &N, &nrhs, h_X, &ldb, work); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_slange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); bool okay = (error < tol); status += ! okay; /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lapackf77_slacpy( "F", &N, &N, h_A, &lda, h_LU, &lda ); lapackf77_slacpy( "F", &N, &nrhs, h_B0, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_sgesv( &N, &nrhs, h_LU, &lda, ipiv, h_X, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } //Anorm = lapackf77_slange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_slange("I", &N, &nrhs, h_X, &ldb, work); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B0, &ldb); Rnorm = lapackf77_slange("I", &N, &nrhs, h_B0, &ldb, work); lerror = Rnorm/(N*Anorm*Xnorm); bool lokay = (lerror < tol); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %-6s %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (okay ? "ok" : "failed"), lerror, (lokay ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (okay ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_LU ); TESTING_FREE_CPU( h_B0 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeadd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float *h_A, *h_B, *d_A, *d_B; float alpha = MAGMA_S_MAKE( 3.1415, 2.718 ); float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); /* Uncomment these lines to check parameters. * magma_xerbla calls lapack's xerbla to print out error. */ //magmablas_sgeadd( -1, N, alpha, d_A, ldda, d_B, ldda ); //magmablas_sgeadd( M, -1, alpha, d_A, ldda, d_B, ldda ); //magmablas_sgeadd( M, N, alpha, d_A, M-1, d_B, ldda ); //magmablas_sgeadd( M, N, alpha, d_A, ldda, d_B, N-1 ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) |Bl-Bm|/|Bl|\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = ((M+31)/32)*32; size = lda*N; gflops = 2.*M*N / 1e9; TESTING_MALLOC_CPU( h_A, float, lda *N ); TESTING_MALLOC_CPU( h_B, float, lda *N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, ldda*N ); lapackf77_slarnv( &ione, ISEED, &size, h_A ); lapackf77_slarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); magma_ssetmatrix( M, N, h_B, lda, d_B, ldda ); gpu_time = magma_sync_wtime( NULL ); magmablas_sgeadd( M, N, alpha, d_A, ldda, d_B, ldda ); gpu_time = magma_sync_wtime( NULL ) - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( int j = 0; j < N; ++j ) { blasf77_saxpy( &M, &alpha, &h_A[j*lda], &ione, &h_B[j*lda], &ione ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check result =================================================================== */ magma_sgetmatrix( M, N, d_B, ldda, h_A, lda ); error = lapackf77_slange( "F", &M, &N, h_B, &lda, work ); blasf77_saxpy( &size, &c_neg_one, h_A, &ione, h_B, &ione ); error = lapackf77_slange( "F", &M, &N, h_B, &lda, work ) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeadd_batched Code is very similar to testing_slacpy_batched.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_B; magmaFloat_ptr d_A, d_B; float **hAarray, **hBarray, **dAarray, **dBarray; float alpha = MAGMA_S_MAKE( 3.1415, 2.718 ); magma_int_t M, N, mb, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); mb = (opts.nb == 0 ? 32 : opts.nb); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*mb; nstride = 3*nb; printf("mb=%d, nb=%d, mstride=%d, nstride=%d\n", (int) mb, (int) nb, (int) mstride, (int) nstride ); printf(" M N ntile CPU GFlop/s (ms) GPU GFlop/s (ms) error \n"); printf("====================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = ((M+31)/32)*32; size = lda*N; if ( N < nb || M < nb ) { ntile = 0; } else { ntile = min( (M - nb)/mstride + 1, (N - nb)/nstride + 1 ); } gflops = 2.*mb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, float, lda *N ); TESTING_MALLOC_CPU( h_B, float, lda *N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, ldda*N ); TESTING_MALLOC_CPU( hAarray, float*, ntile ); TESTING_MALLOC_CPU( hBarray, float*, ntile ); TESTING_MALLOC_DEV( dAarray, float*, ntile ); TESTING_MALLOC_DEV( dBarray, float*, ntile ); lapackf77_slarnv( &ione, ISEED, &size, h_A ); lapackf77_slarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); magma_ssetmatrix( M, N, h_B, lda, d_B, ldda ); // setup pointers for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*ldda; hAarray[tile] = &d_A[offset]; hBarray[tile] = &d_B[offset]; } magma_setvector( ntile, sizeof(float*), hAarray, 1, dAarray, 1 ); magma_setvector( ntile, sizeof(float*), hBarray, 1, dBarray, 1 ); gpu_time = magma_sync_wtime( 0 ); magmablas_sgeadd_batched( mb, nb, alpha, dAarray, ldda, dBarray, ldda, ntile ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*lda; for( int j = 0; j < nb; ++j ) { blasf77_saxpy( &mb, &alpha, &h_A[offset + j*lda], &ione, &h_B[offset + j*lda], &ione ); } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( M, N, d_B, ldda, h_A, lda ); error = lapackf77_slange( "F", &M, &N, h_B, &lda, work ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_B, &ione); error = lapackf77_slange("f", &M, &N, h_B, &lda, work) / error; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, (int) ntile, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_CPU( hAarray ); TESTING_FREE_CPU( hBarray ); TESTING_FREE_DEV( dAarray ); TESTING_FREE_DEV( dBarray ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_sidr_strms( magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x, magma_s_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_IDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_n_one = MAGMA_S_NEG_ONE; // internal user options const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const float angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; magma_int_t q; float residual; float nrm; float nrmb; float nrmr; float nrmt; float rho; float om; float gamma; // matrices and vectors magma_s_matrix dxs = {Magma_CSR}; magma_s_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_s_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_s_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_s_matrix dU = {Magma_CSR}; magma_s_matrix dM = {Magma_CSR}; magma_s_matrix df = {Magma_CSR}; magma_s_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_s_matrix dc = {Magma_CSR}; magma_s_matrix dv = {Magma_CSR}; magma_s_matrix dskp = {Magma_CSR}; magma_s_matrix dalpha = {Magma_CSR}; magma_s_matrix dbeta = {Magma_CSR}; float *hMdiag = NULL; float *hskp = NULL; float *halpha = NULL; float *hbeta = NULL; float *d1 = NULL, *d2 = NULL; // queue variables const magma_int_t nqueues = 3; // number of queues magma_queue_t queues[nqueues]; // chronometry real_Double_t tempo1, tempo2; // create additional queues queues[0] = queue; for ( q = 1; q < nqueues; q++ ) { magma_queue_create( queue->device(), &(queues[q]) ); } // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_snrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_sscal( x->num_rows, MAGMA_S_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_svinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_svinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_sresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_svinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_slarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_smtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_smfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_sqr(P1), QR factorization CHECK( magma_sqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_snrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_sscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_smtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_smfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_smalloc_pinned( &hskp, 5 )); CHECK( magma_svinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_smalloc_pinned( &halpha, s )); CHECK( magma_svinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_smalloc_pinned( &hbeta, s )); CHECK( magma_svinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_smalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_smalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_smtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_svinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_svinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_scopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_svinit( &dG, Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_svinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_svinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_svinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_svinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_svinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_smalloc_pinned( &hMdiag, s )); magmablas_slaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_svinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_svinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = r CHECK( magma_smtransfer( dr, &dv, Magma_DEV, Magma_DEV, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } cudaProfilerStart(); om = MAGMA_S_ONE; gamma = MAGMA_S_ZERO; innerflag = 0; // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // start iteration do { solver_par->numiter++; // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; dGcol.dval = dG.dval + k * dG.ld; // v = r - G(:,k:s) c(k:s) // Q1 magmablas_sgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, dGcol.dval, dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queues[1] ); // U(:,k) = om * v + U(:,k:s) c(k:s) // Q1 magmablas_sgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queues[1] ); // G(:,k) = A U(:,k) // Q1 CHECK( magma_s_spmv( c_one, A, dv, c_zero, dGcol, queues[1] )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) // Q1 halpha[i] = magma_sdot( dP.num_rows, &dP.dval[i*dP.ld], 1, dGcol.dval, 1, queues[1] ); // implicit sync Q1 --> alpha = P(:,i)' G(:,k) // alpha = alpha / M(i,i) halpha[i] = halpha[i] / hMdiag[i]; // G(:,k) = G(:,k) - alpha * G(:,i) // Q1 magma_saxpy( dG.num_rows, -halpha[i], &dG.dval[i*dG.ld], 1, dGcol.dval, 1, queues[1] ); } // sync Q1 --> G(:,k) = G(:,k) - alpha * G(:,i), skp[4] = f(k) magma_queue_sync( queues[1] ); // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) // Q2 magma_sgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], dGcol.dval, d1, d2, &dM.dval[k*dM.ld+k], queues[2] ); // non-first s iteration if ( k > 0 ) { // alpha = dalpha // Q0 magma_ssetvector_async( k, halpha, 1, dalpha.dval, 1, queues[0] ); // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) // Q0 magmablas_sgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, dv.dval, 1, queues[0] ); } // Mdiag(k) = M(k,k) // Q2 magma_sgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag[k], 1, queues[2] ); // implicit sync Q2 --> Mdiag(k) = M(k,k) // U(:,k) = v // Q0 magma_scopyvector_async( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queues[0] ); // check M(k,k) == 0 if ( MAGMA_S_EQUAL(hMdiag[k], MAGMA_S_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) hbeta[k] = hskp[4] / hMdiag[k]; // check for nan if ( magma_s_isnan( hbeta[k] ) || magma_s_isinf( hbeta[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) // Q2 magma_saxpy( dr.num_rows, -hbeta[k], dGcol.dval, 1, dr.dval, 1, queues[2] ); // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) // Q1 magma_saxpy( sk-1, -hbeta[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queues[1] ); // c(k+1:s) = f(k+1:s) // Q1 magma_scopyvector_async( sk-1, &df.dval[k+1], 1, &dc.dval[k+1], 1, queues[1] ); // c(k+1:s) = M(k+1:s,k+1:s) \ f(k+1:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk-1, &dM.dval[(k+1)*dM.ld+(k+1)], dM.ld, &dc.dval[k+1], 1, queues[1] ); // skp[4] = f(k+1) // Q1 magma_sgetvector_async( 1, &df.dval[k+1], 1, &hskp[4], 1, queues[1] ); } // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // x = x + beta * U(:,k) // Q0 magma_saxpy( x->num_rows, hbeta[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queues[0] ); // t't // t'rs // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // rs = rs - gamma * t // Q1 magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[1] ); // xs = xs - gamma * (xs - x) // Q0 magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // |rs| // Q1 nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[1] ); // implicit sync Q0 --> |r| //--------------------------------------- } // v = r // Q1 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // last s iteration if ( (k + 1) == s ) { // t = A r // Q2 CHECK( magma_s_spmv( c_one, A, dr, c_zero, dt, queues[2] )); solver_par->spmv_count++; // t't // t'r // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queues[2] )); } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // dbeta(1:s) = beta(1:s) // Q0 magma_ssetvector_async( s, hbeta, 1, dbeta.dval, 1, queues[0] ); // x = x + U(:,1:s) * beta(1:s) // Q0 magmablas_sgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queues[0] ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // computation of a new omega //--------------------------------------- // skp[0-2] = dskp[0-2] // Q2 magma_sgetvector( 2, dskp.dval, 1, hskp, 1, queues[2] ); // implicit sync Q2 --> skp = dskp // |t| nrmt = magma_ssqrt( MAGMA_S_REAL(hskp[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_S_REAL(hskp[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp[1] / hskp[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_S_EQUAL(om, MAGMA_S_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // sync Q1 --> v = r magma_queue_sync( queues[1] ); // r = r - om * t // Q2 magma_saxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queues[2] ); // x = x + om * v // Q0 magma_saxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queues[0] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_snrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q0 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] ); // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_sidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_sgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_sgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // rs = rs - gamma * (rs - r) // Q2 magma_saxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // xs = xs - gamma * (xs - x) // Q0 magma_sidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q0 magma_scopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[0] ); // new RHS for small systems // f = P' r // Q1 magma_sgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_sgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_scopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // |rs| // Q2 nrmr = magma_snrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_strsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } // sync Q0 --> v = r magma_queue_sync( queues[0] ); } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // sync all queues for ( q = 0; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_scopyvector_async( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_scopyvector_async( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } cudaProfilerStop(); // get last iteration timing tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_sresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // sync all queues, destory additional queues magma_queue_sync( queues[0] ); for ( q = 1; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); magma_queue_destroy( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_smfree( &dxs, queue ); magma_smfree( &drs, queue ); magma_smfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_smfree( &dr, queue ); magma_smfree( &dP, queue ); magma_smfree( &dP1, queue ); magma_smfree( &dG, queue ); magma_smfree( &dGcol, queue ); magma_smfree( &dU, queue ); magma_smfree( &dM, queue ); magma_smfree( &df, queue ); magma_smfree( &dt, queue ); magma_smfree( &dc, queue ); magma_smfree( &dv, queue ); magma_smfree( &dskp, queue ); magma_smfree( &dalpha, queue ); magma_smfree( &dbeta, queue ); magma_free_pinned( hMdiag ); magma_free_pinned( hskp ); magma_free_pinned( halpha ); magma_free_pinned( hbeta ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_sidr_strms */ }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgesv_gpu */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_B, *h_X; magmaFloat_ptr d_A, d_B; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; printf(" N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_SGETRF( N, N ) + FLOPS_SGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, float, lda*N ); TESTING_MALLOC_CPU( h_B, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, float, ldb*nrhs ); TESTING_MALLOC_CPU( work, float, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, lddb*nrhs ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeB, h_B ); magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); magma_ssetmatrix( N, nrhs, h_B, ldb, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgesv_gpu( N, nrhs, d_A, ldda, ipiv, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgesv_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== magma_sgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); Anorm = lapackf77_slange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_slange("I", &N, &nrhs, h_X, &ldb, work); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_slange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; float *A, *B, *C, *C2, *LU; float *dA, *dB, *dC1, *dC2; float alpha = MAGMA_S_MAKE( 0.5, 0.1 ); float beta = MAGMA_S_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_smalloc_pinned( &A, size ); assert( err == 0 ); err = magma_smalloc_pinned( &B, size ); assert( err == 0 ); err = magma_smalloc_pinned( &C, size ); assert( err == 0 ); err = magma_smalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_smalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_smalloc( &dA, size ); assert( err == 0 ); err = magma_smalloc( &dB, size ); assert( err == 0 ); err = magma_smalloc( &dC1, size ); assert( err == 0 ); err = magma_smalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_slarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, B ); lapackf77_slarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test SSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_ssetmatrix( m, n, A, ld, dA, ld ); magma_ssetmatrix( m, n, A, ld, dB, ld ); magma_sswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_sswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasSaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_sgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_slange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "sswap diff %.2g\n", error ); // ----- test ISAMAX // get argmax of column of A magma_ssetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_isamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIsamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "isamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test SGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_ssetmatrix( m, n, A, ld, dA, ld ); magma_ssetvector( maxn, B, 1, dB, 1 ); magma_ssetvector( maxn, C, 1, dC1, 1 ); magma_ssetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_sgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasSaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_SGEMV( m, n ) / 1e9; printf( "sgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test SSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_ssetmatrix( m, m, A, ld, dA, ld ); magma_ssetvector( m, B, 1, dB, 1 ); magma_ssetvector( m, C, 1, dC1, 1 ); magma_ssetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ssymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYMV( m ) / 1e9; printf( "ssymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test STRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_slacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_sgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_ssetmatrix( m, m, LU, ld, dA, ld ); magma_ssetvector( m, C, 1, dC1, 1 ); magma_ssetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_strsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_STRSM( MagmaLeft, m, 1 ) / 1e9; printf( "strsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test SGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_ssetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_ssetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_sgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SGEMM( m, n, k ) / 1e9; printf( "sgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_ssetmatrix( m, m, A, ld, dA, ld ); magma_ssetmatrix( m, n, B, ld, dB, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYMM( side[is], m, n ) / 1e9; printf( "ssymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_ssetmatrix( n, k, A, ld, dA, ld ); magma_ssetmatrix( n, n, C, ld, dC1, ld ); magma_ssetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYRK( k, n ) / 1e9; printf( "ssyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_ssetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_ssetmatrix( n, n, C, ld, dC1, ld ); magma_ssetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYR2K( k, n ) / 1e9; printf( "ssyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test STRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_ssetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_strmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_STRMM( side[is], m, n ) / 1e9; printf( "strmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test STRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_ssetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_strsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_STRSM( side[is], m, n ) / 1e9; printf( "strsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgemm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; float magma_error, dev_error, Cnorm, work[1]; magma_int_t M, N, K; magma_int_t Am, An, Bm, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; float *h_A, *h_B, *h_C, *h_Cmagma, *h_Cdev; magmaFloat_ptr d_A, d_B, d_C; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 0.29, -0.86 ); float beta = MAGMA_S_MAKE( -0.48, 0.38 ); magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); #ifdef HAVE_CUBLAS // for CUDA, we can check MAGMA vs. CUBLAS, without running LAPACK printf("%% If running lapack (option --lapack), MAGMA and %s error are both computed\n" "%% relative to CPU BLAS result. Else, MAGMA error is computed relative to %s result.\n\n", g_platform_str, g_platform_str ); printf("%% transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf("%% M N K MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else // for others, we need LAPACK for check opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf("%% transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf("%% M N K %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("%%========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_SGEMM( M, N, K ) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if ( opts.transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } ldc = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default lddc = magma_roundup( ldc, opts.align ); // multiple of 32 by default sizeA = lda*An; sizeB = ldb*Bn; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, float, lda*An ); TESTING_MALLOC_CPU( h_B, float, ldb*Bn ); TESTING_MALLOC_CPU( h_C, float, ldc*N ); TESTING_MALLOC_CPU( h_Cmagma, float, ldc*N ); TESTING_MALLOC_CPU( h_Cdev, float, ldc*N ); TESTING_MALLOC_DEV( d_A, float, ldda*An ); TESTING_MALLOC_DEV( d_B, float, lddb*Bn ); TESTING_MALLOC_DEV( d_C, float, lddc*N ); /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_slarnv( &ione, ISEED, &sizeC, h_C ); magma_ssetmatrix( Am, An, h_A, lda, d_A, ldda, opts.queue ); magma_ssetmatrix( Bm, Bn, h_B, ldb, d_B, lddb, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_ssetmatrix( M, N, h_C, ldc, d_C, lddc, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); magmablas_sgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc, opts.queue ); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_sgetmatrix( M, N, d_C, lddc, h_Cmagma, ldc, opts.queue ); #endif /* ===================================================================== Performs operation using CUBLAS / clBLAS / Xeon Phi MKL =================================================================== */ magma_ssetmatrix( M, N, h_C, ldc, d_C, lddc, opts.queue ); dev_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS // opts.handle also uses opts.queue cublasSgemm( opts.handle, cublas_trans_const(opts.transA), cublas_trans_const(opts.transB), M, N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); #else magma_sgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, opts.queue ); #endif dev_time = magma_sync_wtime( opts.queue ) - dev_time; dev_perf = gflops / dev_time; magma_sgetmatrix( M, N, d_C, lddc, h_Cdev, ldc, opts.queue ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_sgemm( lapack_trans_const(opts.transA), lapack_trans_const(opts.transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & dev, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_slange( "F", &M, &N, h_C, &ldc, work ); blasf77_saxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cdev, &ione ); dev_error = lapackf77_slange( "F", &M, &N, h_Cdev, &ldc, work ) / Cnorm; #ifdef HAVE_CUBLAS blasf77_saxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cmagma, &ione ); magma_error = lapackf77_slange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (magma_error < tol && dev_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && dev_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (dev_error < tol ? "ok" : "failed")); status += ! (dev_error < tol); #endif } else { #ifdef HAVE_CUBLAS // compute relative error for magma, relative to dev (currently only with CUDA) Cnorm = lapackf77_slange( "F", &M, &N, h_Cdev, &ldc, work ); blasf77_saxpy( &sizeC, &c_neg_one, h_Cdev, &ione, h_Cmagma, &ione ); magma_error = lapackf77_slange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e --- %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time ); #endif } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Cmagma ); TESTING_FREE_CPU( h_Cdev ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssyevd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; float *h_A, *h_R, *h_work; float *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork, lda, aux_iwork[1]; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float result[3], eps, aux_work[1]; eps = lapackf77_slamch( "E" ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: jobz = %s, uplo = %s\n", lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; lda = N; // query for workspace sizes magma_ssyevd( opts.jobz, opts.uplo, N, NULL, lda, NULL, aux_work, -1, aux_iwork, -1, &info ); lwork = (magma_int_t) aux_work[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, float, N*lda ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, float, N*lda ); TESTING_MALLOC_PIN( h_work, float, lwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* warm up run */ if ( opts.warmup ) { magma_ssyevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_ssyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_ssyevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_ssyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ float temp1, temp2; // tau=NULL is unused since itype=1 lapackf77_ssyt21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, h_work, h_R, &lda, h_R, &lda, NULL, h_work, &result[0] ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_ssyevd( MagmaNoVec, opts.uplo, N, h_R, lda, w2, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_ssyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for( int j=0; j<N; j++ ) { temp1 = max(temp1, fabsf(w1[j])); temp1 = max(temp1, fabsf(w2[j])); temp2 = max(temp2, fabsf(w1[j]-w2[j])); } result[2] = temp2 / (((float)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_ssyevd( lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, w2, h_work, &lwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_ssyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %8.2e %s\n", result[0]*eps, (result[0]*eps < tol ? "ok" : "failed") ); printf("(2) | I - U'U | / N = %8.2e %s\n", result[1]*eps, (result[1]*eps < tol ? "ok" : "failed") ); printf("(3) | S(w/ U) - S(w/o U) | / |S| = %8.2e %s\n\n", result[2] , (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0]*eps < tol && result[1]*eps < tol && result[2] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sormbr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, dwork[1]; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; float *C, *R, *A, *work, *tau, *tauq, *taup; float *d, *e; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_vect_t vect [] = { MagmaQ, MagmaP }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf(" M N K vect side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int ivect = 0; ivect < 2; ++ivect ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_sgebrd_nb( m ); ldc = m; // A is nq x k (vect=Q) or k x nq (vect=P) // where nq=m (left) or nq=n (right) nq = (side[iside] == MagmaLeft ? m : n ); mm = (vect[ivect] == MagmaQ ? nq : k ); nn = (vect[ivect] == MagmaQ ? k : nq); lda = mm; // MBR calls either MQR or MLQ in various ways if ( vect[ivect] == MagmaQ ) { if ( nq >= k ) { gflops = FLOPS_SORMQR( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_SORMQR( mi, ni, nq-1, side[iside] ) / 1e9; } } else { if ( nq > k ) { gflops = FLOPS_SORMLQ( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_SORMLQ( mi, ni, nq-1, side[iside] ) / 1e9; } } // workspace for gebrd is (mm + nn)*nb // workspace for unmbr is m*nb or n*nb, depending on side lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb )); TESTING_MALLOC_CPU( C, float, ldc*n ); TESTING_MALLOC_CPU( R, float, ldc*n ); TESTING_MALLOC_CPU( A, float, lda*nn ); TESTING_MALLOC_CPU( work, float, lwork_max ); TESTING_MALLOC_CPU( d, float, min(mm,nn) ); TESTING_MALLOC_CPU( e, float, min(mm,nn) ); TESTING_MALLOC_CPU( tauq, float, min(mm,nn) ); TESTING_MALLOC_CPU( taup, float, min(mm,nn) ); // C is full, m x n size = ldc*n; lapackf77_slarnv( &ione, ISEED, &size, C ); lapackf77_slacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*nn; lapackf77_slarnv( &ione, ISEED, &size, A ); // compute BRD factorization to get Householder vectors in A, tauq, taup //lapackf77_sgebrd( &mm, &nn, A, &lda, d, e, tauq, taup, work, &lwork_max, &info ); magma_sgebrd( mm, nn, A, lda, d, e, tauq, taup, work, lwork_max, &info ); if (info != 0) printf("magma_sgebrd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( vect[ivect] == MagmaQ ) { tau = tauq; } else { tau = taup; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sormbr( lapack_vect_const( vect[ivect] ), lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sormbr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_sormbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); if (info != 0) printf("magma_sormbr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_S_REAL( work[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_sormbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sormbr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_slange( "Fro", &m, &n, C, &ldc, dwork ); size = ldc*n; blasf77_saxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_slange( "Fro", &m, &n, R, &ldc, dwork ) / error; printf( "%5d %5d %5d %c %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_vect_const( vect[ivect] ), lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( d ); TESTING_FREE_CPU( e ); TESTING_FREE_CPU( taup ); TESTING_FREE_CPU( tauq ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}} // end ivect, iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A, *h_R; float *d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC( ipiv, magma_int_t, min_mn ); TESTING_MALLOC( h_A, float, n2 ); TESTING_HOSTALLOC( h_R, float, n2 ); TESTING_DEVALLOC( d_A, float, ldda*N ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgetf2_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetf2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time*1000. ); } if ( opts.check ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_R, lda, h_A, ipiv ); printf(" %8.2e\n", error ); } else { printf(" --- \n"); } TESTING_FREE( ipiv ); TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing strsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float cublas_error, normA, normx, normr, work[1]; magma_int_t N, info; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; float *h_A, *h_b, *h_x, *h_xcublas; float *d_A, *d_x; float c_neg_one = MAGMA_S_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("uplo = %c, transA = %c, diag = %c\n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("============================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_STRSM(opts.side, N, 1) / 1e9; lda = N; ldda = ((lda+31)/32)*32; sizeA = lda*N; TESTING_MALLOC( ipiv, magma_int_t, N ); TESTING_MALLOC( h_A, float, lda*N ); TESTING_MALLOC( h_b, float, N ); TESTING_MALLOC( h_x, float, N ); TESTING_MALLOC( h_xcublas, float, N ); TESTING_DEVALLOC( d_A, float, ldda*N ); TESTING_DEVALLOC( d_x, float, N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_sgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_slarnv( &ione, ISEED, &N, h_b ); blasf77_scopy( &N, h_b, &ione, h_x, &ione ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); magma_ssetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasStrsv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_strsv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) // error for CUBLAS normA = lapackf77_slange( "F", &N, &N, h_A, &lda, work ); normx = lapackf77_slange( "F", &N, &ione, h_xcublas, &ione, work ); blasf77_strmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_xcublas, &ione ); blasf77_saxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione ); normr = lapackf77_slange( "F", &N, &ione, h_xcublas, &N, work ); cublas_error = normr / (normA*normx); if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cublas_error ); } TESTING_FREE( h_A ); TESTING_FREE( h_x ); TESTING_FREE( h_xcublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetri */ int main( int argc, char** argv ) { TESTING_INIT(); // constants const float c_zero = MAGMA_S_ZERO; const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float *h_A, *h_Ainv, *h_R, *work; magmaFloat_ptr d_A, dwork; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float tmp; float error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% N CPU Gflop/s (sec) GPU Gflop/s (sec) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default ldwork = N * magma_get_sgetri_nb( N ); gflops = FLOPS_SGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_sgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_sgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_S_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, float, lwork ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_Ainv, float, n2 ); TESTING_MALLOC_CPU( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( dwork, float, ldwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_ssetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_sgetrf_gpu( N, N, d_A, ldda, ipiv, &info ); magma_sgetmatrix( N, N, d_A, ldda, h_Ainv, lda, opts.queue ); if (info != 0) { printf("magma_sgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } // check for exact singularity //h_Ainv[ 10 + 10*lda ] = MAGMA_S_MAKE( 0.0, 0.0 ); //magma_ssetmatrix( N, N, h_Ainv, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgetri_gpu( N, d_A, ldda, ipiv, dwork, ldwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgetri( &N, h_Ainv, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f)", (int) N, gpu_perf, gpu_time ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_sgetmatrix( N, N, d_A, ldda, h_Ainv, lda, opts.queue ); // compute 1-norm condition number estimate, following LAPACK's zget03 float normA, normAinv, rcond; normA = lapackf77_slange( "1", &N, &N, h_A, &lda, rwork ); normAinv = lapackf77_slange( "1", &N, &N, h_Ainv, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; error = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_slaset( "full", &N, &N, &c_zero, &c_one, h_R, &lda ); blasf77_sgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A, &lda, h_Ainv, &lda, &c_one, h_R, &lda ); error = lapackf77_slange( "1", &N, &N, h_R, &lda, rwork ); error = error * rcond / N; } bool okay = (error < tol); status += ! okay; printf( " %8.2e %s\n", error, (okay ? "ok" : "failed")); } else { printf( "\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssyrk */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An; magma_int_t sizeA, sizeC; magma_int_t lda, ldc, ldda, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float *h_A, *h_C, *h_Ccublas; float *d_A, *d_C; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_D_MAKE( 0.29, -0.86 ); float beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_SSYRK(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; } else { lda = An = K; Ak = N; } ldc = N; ldda = ((lda+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, float, lda*Ak ); TESTING_MALLOC_CPU( h_C, float, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, float, ldc*N ); TESTING_MALLOC_DEV( d_A, float, ldda*Ak ); TESTING_MALLOC_DEV( d_C, float, lddc*N ); /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_ssetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasSsyrk( handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ssyrk( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_slansy("fro", lapack_uplo_const(opts.uplo), &N, h_C, &ldc, work); blasf77_saxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_slansy( "fro", lapack_uplo_const(opts.uplo), &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y, *Ycublas, *Ymagma; float *dA, *dX, *dY, *dwork; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_SSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, float, sizeA ); TESTING_MALLOC_CPU( X, float, sizeX ); TESTING_MALLOC_CPU( Y, float, sizeY ); TESTING_MALLOC_CPU( Ycublas, float, sizeY ); TESTING_MALLOC_CPU( Ymagma, float, sizeY ); TESTING_MALLOC_DEV( dA, float, sizeA ); TESTING_MALLOC_DEV( dX, float, sizeX ); TESTING_MALLOC_DEV( dY, float, sizeY ); blocks = (N + nb - 1) / nb; ldwork = lda * (blocks + 1); TESTING_MALLOC_DEV( dwork, float, ldwork ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &sizeA, A ); magma_smake_symmetric( N, A, lda ); lapackf77_slarnv( &ione, ISEED, &sizeX, X ); lapackf77_slarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( N, N, A, lda, dA, lda ); magma_ssetvector( N, X, incx, dX, incx ); magma_ssetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasSsymv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMA BLAS =================================================================== */ magma_ssetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_ssymv_work( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy, dwork, ldwork ); // TODO provide option to test non-work interface //magmablas_ssymv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_sgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_ssymv( &opts.uplo, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_slange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_slange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }