/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT(); /* Constants */ const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; /* Local variables */ real_Double_t gpu_time; magmaFloatComplex *h_A, *h_R, *h_B, *h_S, *h_work; #ifdef COMPLEX float *rwork; magma_int_t lrwork; #endif float *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lda, lwork, liwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% itype = %d, jobz = %s, range = %s, uplo = %s, fraction = %6.4f, ngpu = %d\n", int(opts.itype), lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), opts.fraction, int(abs_ngpu) ); if (opts.itype == 1) { printf("%% N M GPU Time (sec) |AZ-BZD| |D - D_magma|\n"); } else if (opts.itype == 2) { printf("%% N M GPU Time (sec) |ABZ-ZD| |D - D_magma|\n"); } else if (opts.itype == 3) { printf("%% N M GPU Time (sec) |BAZ-ZD| |D - D_magma|\n"); } printf("%%======================================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; // TODO: test vl-vu range magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (opts.fraction == 0) { il = max( 1, magma_int_t(0.1*N) ); iu = max( 1, magma_int_t(0.3*N) ); } else { il = 1; iu = max( 1, magma_int_t(opts.fraction*N) ); } magma_cheevdx_getworksize(N, threads, (opts.jobz == MagmaVec), &lwork, #ifdef COMPLEX &lrwork, #endif &liwork); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, max( lwork, N*N )); // check needs N*N #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, float, lrwork); #endif /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, lda ); magma_cmake_hermitian( N, h_A, lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); // =================================================================== // Performs operation using MAGMA // =================================================================== gpu_time = magma_wtime(); if (opts.ngpu == 1) { magma_chegvdx_2stage( opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } else { magma_chegvdx_2stage_m( abs_ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_chegvdx_2stage returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A| |Z| N ) (itype = 1) | A B Z - Z D | / ( |A| |Z| N ) (itype = 2) | B A Z - Z D | / ( |A| |Z| N ) (itype = 3) (2) | D(with V, magma) - D(w/o V, lapack) | / | D | =================================================================== */ #ifdef REAL float *rwork = h_work + N*N; #endif if ( opts.jobz != MagmaNoVec ) { result[0] = 1.; result[0] /= safe_lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, rwork); result[0] /= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_clange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork)/N; } } lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); lapackf77_chegvd( &opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &lda, h_S, &lda, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info ); if (info != 0) { printf("lapackf77_chegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); } float maxw=0, diff=0; for (int j=0; j < m1; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m1*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %9.4f ", (int) N, (int) m1, gpu_time); if ( opts.check ) { bool okay = (result[1] < tolulp); if ( opts.jobz != MagmaNoVec ) { okay = okay && (result[0] < tol); printf(" %8.2e", result[0] ); } else { printf(" --- "); } printf(" %8.2e %s\n", result[1], (okay ? "ok" : "failed")); status += ! okay; } else { printf(" ---\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #ifdef COMPLEX TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t mgpu_time; magmaFloatComplex *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; #endif float *w1, result=0; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: ngpu = %d, itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.ngpu, (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M ngpu MGPU Time (sec)\n"); printf("====================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_cbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_cbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; //magma_int_t NB = 96;//magma_bulge_get_nb(N); //magma_int_t sizvblg = magma_cbulge_get_lq2(N, threads); //magma_int_t siz = max(sizvblg,n2)+2*(N*NB+N)+24*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, float, lrwork); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, N ); magma_cmake_hermitian( N, h_A, N ); if ( opts.warmup || opts.check ) { TESTING_MALLOC_CPU( h_Ainit, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_Binit, magmaFloatComplex, n2 ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } if ( opts.warmup ) { // ================================================================== // Warmup using MAGMA. I prefer to use smalltest to warmup A- // ================================================================== magma_chegvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== mgpu_time = magma_wtime(); magma_chegvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); mgpu_time = magma_wtime() - mgpu_time; if ( opts.check ) { // =================================================================== // Check the results following the LAPACK's [zc]hegvdx routine. // A x = lambda B x is solved // and the following 3 tests computed: // (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) // | A B Z - Z D | / ( |A||Z| N ) (itype = 2) // | B A Z - Z D | / ( |A||Z| N ) (itype = 3) // =================================================================== #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, rwork); result /= lapackf77_clange("1", &N , &m1, h_A, &N, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_clange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &m1, h_A, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &m1, h_A, &N, rwork)/N; } } // =================================================================== // Print execution time // =================================================================== printf("%5d %5d %4d %7.2f\n", (int) N, (int) m1, (int) opts.ngpu, mgpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } printf("\n"); status += ! (result < tol); } TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( iwork ); if ( opts.warmup || opts.check ) { TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return status; }