/* //////////////////////////////////////////////////////////////////////////// -- Testing dsygvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; double *h_A, *h_R, *h_B, *h_S, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif /* Matrix size */ double *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_dbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_dbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_B, double, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( h_S, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); magma_dmake_hpd( N, h_B, N ); magma_dmake_symmetric( N, h_A, N ); magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } // ================================================================== // Warmup using MAGMA // ================================================================== if (opts.warmup) { lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_dsygvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_dsygvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif double temp1, temp2; result[0] = 1.; result[0] /= lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_int_t m2 = m1; lapackf77_dsygvd(&opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &N, h_S, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); temp1 = temp2 = 0; for(int j=0; j<m2; j++) { temp1 = max(temp1, fabs(w1[j])); temp1 = max(temp1, fabs(w2[j])); temp2 = max(temp2, fabs(w1[j]-w2[j])); } result[1] = temp2 / (((double)m2)*temp1); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( "(2) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R; double *d_A; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t N, n2, lda, ldda, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS_DPOTRI( N ) / 1e9; TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_hpd( N, h_A, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* factorize matrix */ magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); magma_dpotrf_gpu( opts.uplo, N, d_A, ldda, &info ); // check for exact singularity //magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); //h_R[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 ); //magma_dsetmatrix( N, N, h_R, lda, d_A, ldda ); gpu_time = magma_wtime(); magma_dpotri_gpu( opts.uplo, N, d_A, ldda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dpotri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lapackf77_dpotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime(); lapackf77_dpotri( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dpotri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); error = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dpotf2_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R; magmaDouble_ptr d_A; magma_int_t N, n2, lda, ldda, info; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS_DPOTRF( N ) / 1e9; if ( N > 512 ) { printf( "%5d skipping because dpotf2 does not support N > 512\n", (int) N ); continue; } TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_hpd( N, h_A, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dpotf2_gpu( opts.uplo, N, d_A, ldda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dpotf2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); error = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time*1000. ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsysv_nopiv_gpu */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_B, *h_X, temp, *hwork; magmaDouble_ptr d_A, d_B; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB, lwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_DGETRF( N, N ) + FLOPS_DGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); bool nopiv = true; if ( nopiv ) { magma_dmake_hpd( N, h_A, lda ); // SPD / HPD does not require pivoting } else { magma_dmake_symmetric( N, h_A, lda ); // symmetric/symmetric generally requires pivoting } magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( N, nrhs, h_B, ldb, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dsysv_nopiv_gpu( opts.uplo, N, nrhs, d_A, ldda, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgesv_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); Anorm = lapackf77_dlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_dlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_dlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_dsysv( lapack_uplo_const(opts.uplo), &N,&nrhs, h_A, &lda, ipiv, h_B, &ldb, &temp, &lwork, &info ); lwork = (magma_int_t) MAGMA_D_REAL( temp ); TESTING_MALLOC_PIN( hwork, double, lwork ); cpu_time = magma_wtime(); lapackf77_dsysv( lapack_uplo_const(opts.uplo), &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, hwork, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dsysv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); TESTING_FREE_CPU( hwork ); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dpotrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R; double *d_lA[ MagmaMaxGPUs ]; magma_int_t N, n2, lda, ldda, max_size, ngpu; magma_int_t info, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu = %d, uplo = %s\n", (int) opts.ngpu, lapack_uplo_const(opts.uplo) ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; nb = magma_get_dpotrf_nb( N ); gflops = FLOPS_DPOTRF( N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); // Allocate device memory // matrix is distributed by block-rows or block-columns // this is maximum size that any GPU stores; // size is rounded up to full blocks in both rows and columns max_size = nb*(1+N/(nb*ngpu)) * nb*((N+nb-1)/nb); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, max_size ); } /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_hpd( N, h_A, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ if ( opts.uplo == MagmaUpper ) { ldda = ((N+nb-1)/nb)*nb; magma_dsetmatrix_1D_col_bcyclic( N, N, h_R, lda, d_lA, ldda, ngpu, nb ); } else { ldda = (1+N/(nb*ngpu))*nb; magma_dsetmatrix_1D_row_bcyclic( N, N, h_R, lda, d_lA, ldda, ngpu, nb ); } gpu_time = magma_wtime(); magma_dpotrf_mgpu( ngpu, opts.uplo, N, d_lA, ldda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dpotrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.uplo == MagmaUpper ) { magma_dgetmatrix_1D_col_bcyclic( N, N, d_lA, ldda, h_R, lda, ngpu, nb ); } else { magma_dgetmatrix_1D_row_bcyclic( N, N, d_lA, ldda, h_R, lda, ngpu, nb ); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ for( int dev=0; dev < ngpu; dev++ ){ magma_setdevice( dev ); magma_device_sync(); } if ( opts.lapack ) { error = lapackf77_dlange("f", &N, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work ) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); for( int dev=0; dev < ngpu; dev++ ){ magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsygvd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *h_B, *h_S, *h_work; double *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, nb, lwork, liwork, lda; double result[4] = {0}; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double d_zero = 0.; double d_one = 1.; double d_neg_one = -1.; //magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, uplo = %s\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time(sec)\n"); printf("======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = N*lda; nb = magma_get_dsytrd_nb(N); lwork = 1 + 6*N*nb + 2* N*N; liwork = 3 + 5*N; TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_B, double, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( h_S, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); magma_dmake_hpd( N, h_B, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); /* warmup */ if ( opts.warmup ) { magma_dsygvd( opts.itype, opts.jobz, opts.uplo, N, h_R, lda, h_S, lda, w1, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_dsygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dsygvd( opts.itype, opts.jobz, opts.uplo, N, h_R, lda, h_S, lda, w1, h_work, lwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dsygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | I - V V' B | / ( N ) (itype = 1,2) | B - V V' | / ( |B| N ) (itype = 3) (3) | S(with V) - S(w/o V) | / | S | =================================================================== */ double temp1, temp2; //double *tau; if ( opts.itype == 1 || opts.itype == 2 ) { lapackf77_dlaset( "A", &N, &N, &d_zero, &c_one, h_S, &lda); blasf77_dgemm("N", "C", &N, &N, &N, &c_one, h_R, &lda, h_R, &lda, &d_zero, h_work, &N); blasf77_dsymm("R", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_B, &lda, h_work, &N, &c_one, h_S, &lda); result[1] = lapackf77_dlange("1", &N, &N, h_S, &lda, h_work) / N; } else if ( opts.itype == 3 ) { lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda); blasf77_dsyrk(lapack_uplo_const(opts.uplo), "N", &N, &N, &d_neg_one, h_R, &lda, &d_one, h_S, &lda); result[1] = lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_S, &lda, h_work) / N / lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_B, &lda, h_work); } result[0] = 1.; result[0] /= lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_work); result[0] /= lapackf77_dlange("1", &N, &N, h_R, &lda, h_work); if ( opts.itype == 1 ) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_R, &lda, &d_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_dlange("1", &N, &N, h_work, &N, &temp1)/N; } else if ( opts.itype == 2 ) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_B, &lda, h_R, &lda, &d_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_dlange("1", &N, &N, h_R, &lda, &temp1)/N; } else if ( opts.itype == 3 ) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_R, &lda, &d_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_dlange("1", &N, &N, h_R, &lda, &temp1)/N; } /* lapackf77_dsyt21(&ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, w1, h_R, &lda, h_R, &lda, tau, h_work, rwork, &result[0]); */ lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); magma_dsygvd( opts.itype, MagmaNoVec, opts.uplo, N, h_R, lda, h_S, lda, w2, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_dsygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for(int j=0; j<N; j++) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / temp1; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dsygvd( &opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_B, &lda, w2, h_work, &lwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_dsygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if ( opts.itype==1 ) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if ( opts.itype==2 ) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if ( opts.itype==3 ) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } if ( opts.itype==1 || opts.itype==2 ) { printf("(2) | I - Z Z' B | / N = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); } else { printf("(2) | B - Z Z' | / (|B| N) = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); } printf( "(3) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[2], (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0] < tol && result[1] < tol && result[2] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R; magma_int_t N, n2, lda, info; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; gflops = FLOPS_DPOTRF( N ) / 1e9; TESTING_MALLOC( h_A, double, n2 ); TESTING_HOSTALLOC( h_R, double, n2 ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_hpd( N, h_A, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dpotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dpotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); } } TESTING_FINALIZE(); return status; }