/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT(); /* Constants */ const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; /* Local variables */ real_Double_t gpu_time; magmaFloatComplex *h_A, *h_R, *h_B, *h_S, *h_work; #ifdef COMPLEX float *rwork; magma_int_t lrwork; #endif float *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lda, lwork, liwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% itype = %d, jobz = %s, range = %s, uplo = %s, fraction = %6.4f, ngpu = %d\n", int(opts.itype), lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), opts.fraction, int(abs_ngpu) ); if (opts.itype == 1) { printf("%% N M GPU Time (sec) |AZ-BZD| |D - D_magma|\n"); } else if (opts.itype == 2) { printf("%% N M GPU Time (sec) |ABZ-ZD| |D - D_magma|\n"); } else if (opts.itype == 3) { printf("%% N M GPU Time (sec) |BAZ-ZD| |D - D_magma|\n"); } printf("%%======================================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; // TODO: test vl-vu range magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (opts.fraction == 0) { il = max( 1, magma_int_t(0.1*N) ); iu = max( 1, magma_int_t(0.3*N) ); } else { il = 1; iu = max( 1, magma_int_t(opts.fraction*N) ); } magma_cheevdx_getworksize(N, threads, (opts.jobz == MagmaVec), &lwork, #ifdef COMPLEX &lrwork, #endif &liwork); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, max( lwork, N*N )); // check needs N*N #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, float, lrwork); #endif /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, lda ); magma_cmake_hermitian( N, h_A, lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); // =================================================================== // Performs operation using MAGMA // =================================================================== gpu_time = magma_wtime(); if (opts.ngpu == 1) { magma_chegvdx_2stage( opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } else { magma_chegvdx_2stage_m( abs_ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_chegvdx_2stage returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A| |Z| N ) (itype = 1) | A B Z - Z D | / ( |A| |Z| N ) (itype = 2) | B A Z - Z D | / ( |A| |Z| N ) (itype = 3) (2) | D(with V, magma) - D(w/o V, lapack) | / | D | =================================================================== */ #ifdef REAL float *rwork = h_work + N*N; #endif if ( opts.jobz != MagmaNoVec ) { result[0] = 1.; result[0] /= safe_lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, rwork); result[0] /= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_clange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork)/N; } } lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); lapackf77_chegvd( &opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &lda, h_S, &lda, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info ); if (info != 0) { printf("lapackf77_chegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); } float maxw=0, diff=0; for (int j=0; j < m1; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m1*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %9.4f ", (int) N, (int) m1, gpu_time); if ( opts.check ) { bool okay = (result[1] < tolulp); if ( opts.jobz != MagmaNoVec ) { okay = okay && (result[0] < tol); printf(" %8.2e", result[0] ); } else { printf(" --- "); } printf(" %8.2e %s\n", result[1], (okay ? "ok" : "failed")); status += ! okay; } else { printf(" ---\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #ifdef COMPLEX TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cposv_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B, *h_X; magmaFloatComplex_ptr d_A, d_B; magma_int_t N, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = ldb = N; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( work, float, N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*opts.nrhs ); /* ==================================================================== Initialize the matrix =================================================================== */ sizeA = lda*N; sizeB = ldb*opts.nrhs; lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); magma_cmake_hpd( N, h_A, lda ); magma_csetmatrix( N, N, h_A, N, d_A, ldda ); magma_csetmatrix( N, opts.nrhs, h_B, N, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cposv_gpu( opts.uplo, N, opts.nrhs, d_A, ldda, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Residual =================================================================== */ magma_cgetmatrix( N, opts.nrhs, d_B, lddb, h_X, ldb ); Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_clange("I", &N, &opts.nrhs, h_X, &ldb, work); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb ); Rnorm = lapackf77_clange("I", &N, &opts.nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cposv( lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cposv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magma_int_t N, n2, lda, info; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu = %d, uplo = %s\n", (int) opts.ngpu, lapack_uplo_const(opts.uplo) ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; gflops = FLOPS_CPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hpd( N, h_A, lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cpotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegst */ int main( int argc, char** argv) { TESTING_INIT(); // Constants const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; // Local variables real_Double_t gpu_time, cpu_time; magmaFloatComplex *h_A, *h_B, *h_R; magmaFloatComplex_ptr d_A, d_B; float Anorm, error, work[1]; magma_int_t N, n2, lda, ldda, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% itype N CPU time (sec) GPU time (sec) |R| \n"); printf("%%=======================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( lda, opts.align ); n2 = N*lda; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, ldda*N ); /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hermitian( N, h_A, lda ); magma_cmake_hpd( N, h_B, lda ); magma_cpotrf( opts.uplo, N, h_B, lda, &info ); if (info != 0) { printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_csetmatrix( N, N, h_B, lda, d_B, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_chegst_gpu( opts.itype, opts.uplo, N, d_A, ldda, d_B, ldda, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_chegst_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_chegst( &opts.itype, lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_B, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) { printf("lapackf77_chegst returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); blasf77_caxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); Anorm = safe_lapackf77_clanhe("f", lapack_uplo_const(opts.uplo), &N, h_A, &lda, work ); error = safe_lapackf77_clanhe("f", lapack_uplo_const(opts.uplo), &N, h_R, &lda, work ) / Anorm; bool okay = (error < tol); status += ! okay; printf("%3d %5d %7.2f %7.2f %8.2e %s\n", (int) opts.itype, (int) N, cpu_time, gpu_time, error, (okay ? "ok" : "failed")); } else { printf("%3d %5d --- %7.2f\n", (int) opts.itype, (int) N, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t mgpu_time; magmaFloatComplex *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; #endif float *w1, result=0; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: ngpu = %d, itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.ngpu, (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M ngpu MGPU Time (sec)\n"); printf("====================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_cbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_cbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; //magma_int_t NB = 96;//magma_bulge_get_nb(N); //magma_int_t sizvblg = magma_cbulge_get_lq2(N, threads); //magma_int_t siz = max(sizvblg,n2)+2*(N*NB+N)+24*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, float, lrwork); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, N ); magma_cmake_hermitian( N, h_A, N ); if ( opts.warmup || opts.check ) { TESTING_MALLOC_CPU( h_Ainit, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_Binit, magmaFloatComplex, n2 ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } if ( opts.warmup ) { // ================================================================== // Warmup using MAGMA. I prefer to use smalltest to warmup A- // ================================================================== magma_chegvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== mgpu_time = magma_wtime(); magma_chegvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); mgpu_time = magma_wtime() - mgpu_time; if ( opts.check ) { // =================================================================== // Check the results following the LAPACK's [zc]hegvdx routine. // A x = lambda B x is solved // and the following 3 tests computed: // (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) // | A B Z - Z D | / ( |A||Z| N ) (itype = 2) // | B A Z - Z D | / ( |A||Z| N ) (itype = 3) // =================================================================== #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, rwork); result /= lapackf77_clange("1", &N , &m1, h_A, &N, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_clange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &m1, h_A, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &m1, h_A, &N, rwork)/N; } } // =================================================================== // Print execution time // =================================================================== printf("%5d %5d %4d %7.2f\n", (int) N, (int) m1, (int) opts.ngpu, mgpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } printf("\n"); status += ! (result < tol); } TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( iwork ); if ( opts.warmup || opts.check ) { TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return status; }
int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magma_int_t N, n2, lda, ldda, info; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magmaFloatComplex **d_A_array = NULL; magma_int_t *dinfo_magma; magma_int_t batchCount; magma_queue_t queue = magma_stream; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) batchCount = opts.batchcount; float tol = opts.tolerance * lapackf77_slamch("E"); printf("BatchCount N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; ldda = lda = ((N+31)/32)*32; n2 = lda* N * batchCount; gflops = batchCount * FLOPS_CPOTRF( N ) / 1e9 ; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda * N * batchCount); TESTING_MALLOC_DEV( dinfo_magma, magma_int_t, batchCount); magma_malloc((void**)&d_A_array, batchCount * sizeof(*d_A_array)); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); for(int i=0; i<batchCount; i++) { magma_cmake_hpd( N, h_A + i * lda * N, lda );// need modification } magma_int_t columns = N * batchCount; lapackf77_clacpy( MagmaUpperLowerStr, &N, &(columns), h_A, &lda, h_R, &lda ); magma_csetmatrix( N, columns, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ cset_pointer(d_A_array, d_A, ldda, 0, 0, ldda * N, batchCount, queue); gpu_time = magma_sync_wtime(NULL); info = magma_cpotrf_batched( opts.uplo, N, d_A_array, ldda, dinfo_magma, batchCount, queue); gpu_time = magma_sync_wtime(NULL) - gpu_time; gpu_perf = gflops / gpu_time; magma_int_t *cpu_info = (magma_int_t*) malloc(batchCount*sizeof(magma_int_t)); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_magma, 1, cpu_info, 1); for(int i=0; i<batchCount; i++) { if(cpu_info[i] != 0 ){ printf("magma_cpotrf_batched matrix %d returned internal error %d\n",i, (int)cpu_info[i] ); } } if (info != 0) printf("magma_cpotrf_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++) { lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A + i * lda * N, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( N, columns, d_A, ldda, h_R, lda ); magma_int_t NN = lda*N; char const uplo = 'l'; // lapack_uplo_const(opts.uplo) float err = 0.0; for(int i=0; i<batchCount; i++) { error = lapackf77_clanhe("f", &uplo, &N, h_A + i * lda*N, &lda, work); blasf77_caxpy(&NN, &c_neg_one, h_A + i * lda*N, &ione, h_R + i * lda*N, &ione); error = lapackf77_clanhe("f", &uplo, &N, h_R + i * lda*N, &lda, work) / error; if ( isnan(error) || isinf(error) ) { err = error; break; } err = max(fabs(error),err); } printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., err, (error < tol ? "ok" : "failed")); status += ! (err < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int)batchCount, (int) N, gpu_perf, gpu_time*1000. ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_A_array ); TESTING_FREE_DEV( dinfo_magma ); free(cpu_info); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvd */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); magmaFloatComplex *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; #endif float *w1, *w2, result; magma_int_t *iwork; real_Double_t mgpu_time, gpu_time, cpu_time; /* Matrix size */ magma_int_t N, n2, nb; magma_int_t info; magma_int_t ione = 1; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: ngpu = %d, itype = %d, jobz = %s, uplo = %s, check = %d\n", (int) opts.ngpu, (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), (int) opts.check); printf(" N CPU Time (sec) GPU Time (sec) MGPU Time (sec)\n"); printf("=========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // TODO define lda N = opts.nsize[itest]; n2 = N*N; nb = magma_get_chetrd_nb(N); #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t lwork = max( N + N*nb, 2*N + N*N ); magma_int_t lrwork = 1 + 5*N +2*N*N; #else magma_int_t lwork = max( 2*N + N*nb, 1 + 6*N + 2*N*N ); #endif magma_int_t liwork = 3 + 5*N; TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, float, lrwork ); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, N ); magma_cmake_hermitian( N, h_A, N ); if ( opts.warmup || opts.check ) { TESTING_MALLOC_CPU( h_Ainit, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_Binit, magmaFloatComplex, n2 ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } if (opts.warmup) { // ================================================================== // Warmup using MAGMA. // ================================================================== magma_chegvd_m( opts.ngpu, opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== mgpu_time = magma_wtime(); magma_chegvd_m( opts.ngpu, opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); mgpu_time = magma_wtime() - mgpu_time; if (info != 0) printf("magma_chegvd_m returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, rwork); result /= lapackf77_clange("1", &N, &N, h_A, &N, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_clange("1", &N, &N, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &N, h_A, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &N, h_A, &N, rwork)/N; } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_chegvd(opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w2, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_chegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_chegvd(&opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, h_Binit, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_chegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); float temp1 = 0; float temp2 = 0; for(int j=0; j<N; j++) { temp1 = max(temp1, fabs(w1[j])); temp1 = max(temp1, fabs(w2[j])); temp2 = max(temp2, fabs(w1[j]-w2[j])); } float result2 = temp2 / (((float)N)*temp1); /* ===================================================================== Print execution time =================================================================== */ printf("%5d %7.2f %7.2f %7.2f\n", (int) N, cpu_time, gpu_time, mgpu_time); printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } printf( "(3) | D(MGPU)-D(LAPACK) |/ |D| = %8.2e %s\n\n", result2, (result2 < tolulp ? "ok" : "failed") ); status += ! (result < tol && result2 < tolulp); } else { printf("%5d --- --- %7.2f\n", (int) N, mgpu_time); } /* Memory clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); if ( opts.warmup || opts.check ) { TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time /*cpu_time*/; magmaFloatComplex *h_A, *h_R, *h_B, *h_S, *h_work; float *w1, *w2, vl=0, vu=0; float result[2] = {0}; magma_int_t *iwork; magma_int_t N, n2, info, il, iu, m1, nb, lwork, liwork; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; #ifdef COMPLEX float *rwork; magma_int_t lrwork; #endif //float d_one = 1.; //float d_ten = 10.; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); //float tolulp = opts.tolerance * lapackf77_slamch("P"); // TODO: how to check NoVec? This doesn't have an equivalent call to LAPACK. if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "WARNING: cannot currently check results when not computing vectors (option -JN).\n" ); } printf("using: itype = %d, jobz = %s, uplo = %s, check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; nb = magma_get_chetrd_nb(N); #ifdef COMPLEX lwork = max( N + N*nb, 2*N + N*N ); lrwork = 1 + 5*N +2*N*N; #else lwork = max( 2*N + N*nb, 1 + 6*N + 2*N*N ); #endif liwork = 3 + 5*N; if ( opts.fraction == 0 ) { il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int) (opts.fraction*N); if (iu < 1) iu = 1; } TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, float, lrwork); #endif /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, N ); magma_cmake_hermitian( N, h_A, N ); // ================================================================== // Warmup using MAGMA // ================================================================== if ( opts.warmup ) { lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &N, h_S, &N ); magma_chegvdx( opts.itype, opts.jobz, MagmaRangeI, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_chegvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_chegvdx( opts.itype, opts.jobz, MagmaRangeI, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_chegvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check && opts.jobz != MagmaNoVec ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | D(with V) - D(w/o V) | / | D | =================================================================== */ #ifdef REAL float *rwork = h_work + N*N; #endif result[0] = 1.; result[0] /= lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_clange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_clange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &N, rwork)/N; } // Disable eigenvalue check which calls routine again -- // it obscures whether error occurs in first call above or in this call. // TODO: add comparison to LAPACK, as in testing_chegvd.cpp? // //lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &N, h_R, &N ); //lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &N, h_S, &N ); // //magma_int_t m2; //magma_chegvdx( opts.itype, MagmaNoVec, MagmaRangeI, opts.uplo, // N, h_R, N, h_S, N, vl, vu, il, iu, &m2, w2, // h_work, lwork, // #ifdef COMPLEX // rwork, lrwork, // #endif // iwork, liwork, // &info ); //if (info != 0) // printf("magma_chegvdx returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //float maxw=0, diff=0; //for(int j=0; j < m2; j++) { // maxw = max(maxw, fabs(w1[j])); // maxw = max(maxw, fabs(w2[j])); // diff = max(diff, fabs(w1[j]-w2[j])); //} //result[1] = diff / (m2*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check && opts.jobz != MagmaNoVec ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype == 1) { printf(" | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype == 2) { printf(" | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype == 3) { printf(" | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } //printf( " | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol); // && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #ifdef COMPLEX TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cposv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *h_B, *h_X; magma_int_t N, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ldb = N; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC( h_B, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC( h_X, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC( work, float, N ); /* ==================================================================== Initialize the matrix =================================================================== */ sizeA = lda*N; sizeB = ldb*opts.nrhs; lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); magma_cmake_hpd( N, h_A, lda ); // copy A to R and B to X; save A and B for residual lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cposv( opts.uplo, N, opts.nrhs, h_R, lda, h_X, ldb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Residual =================================================================== */ Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_clange("I", &N, &opts.nrhs, h_X, &ldb, work); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb ); Rnorm = lapackf77_clange("I", &N, &opts.nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status |= ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cposv( &opts.uplo, &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cposv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) opts.nrhs, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } TESTING_FREE( h_A ); TESTING_FREE( h_R ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }