/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t mgpu_time; magmaDoubleComplex *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif double *w1, result; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_timestr_t start, end; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); char jobz = opts.jobz; int checkres = opts.check; char range = 'A'; char uplo = opts.uplo; magma_int_t itype = opts.itype; double f = opts.fraction; if (f != 1) range='I'; if ( checkres && jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); jobz = MagmaVec; } printf("using: nrgpu = %d, itype = %d, jobz = %c, range = %c, uplo = %c, checkres = %d, fraction = %6.4f\n", (int) opts.ngpu, (int) itype, jobz, range, uplo, (int) checkres, f); printf(" N M nr GPU MGPU Time(s) \n"); printf("====================================\n"); magma_int_t threads = magma_get_numthreads(); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_zbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_zbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; //magma_int_t NB = 96;//magma_bulge_get_nb(N); //magma_int_t sizvblg = magma_zbulge_get_lq2(N, threads); //magma_int_t siz = max(sizvblg,n2)+2*(N*NB+N)+24*N; /* Allocate host memory for the matrix */ TESTING_HOSTALLOC( h_A, magmaDoubleComplex, n2); TESTING_HOSTALLOC( h_B, magmaDoubleComplex, n2); TESTING_MALLOC( w1, double , N); TESTING_HOSTALLOC(h_work, magmaDoubleComplex, lwork); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_HOSTALLOC( rwork, double, lrwork); #endif TESTING_MALLOC( iwork, magma_int_t, liwork); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); /* increase the diagonal */ { for(int i=0; i<N; i++) { MAGMA_Z_SET2REAL( h_B[i*N+i], ( MAGMA_Z_REAL(h_B[i*N+i]) + 1.*N ) ); MAGMA_Z_SET2REAL( h_A[i*N+i], MAGMA_Z_REAL(h_A[i*N+i]) ); } } if((opts.warmup)||( checkres )){ TESTING_MALLOC(h_Ainit, magmaDoubleComplex, n2); TESTING_MALLOC(h_Binit, magmaDoubleComplex, n2); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == 'I'){ il = 1; iu = (int) (f*N); } if(opts.warmup){ // ================================================================== // Warmup using MAGMA. I prefer to use smalltest to warmup A- // ================================================================== magma_zhegvdx_2stage_m(opts.ngpu, itype, jobz, range, uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== start = get_current_time(); magma_zhegvdx_2stage_m(opts.ngpu, itype, jobz, range, uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); end = get_current_time(); mgpu_time = GetTimerValue(start,end)/1000.; if ( checkres ) { // =================================================================== // Check the results following the LAPACK's [zc]hegvdx routine. // A x = lambda B x is solved // and the following 3 tests computed: // (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) // | A B Z - Z D | / ( |A||Z| N ) (itype = 2) // | B A Z - Z D | / ( |A||Z| N ) (itype = 3) // =================================================================== #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_zlanhe("1",&uplo, &N, h_Ainit, &N, rwork); result /= lapackf77_zlange("1",&N , &m1, h_A, &N, rwork); if (itype == 1){ blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_zhemm("L", &uplo, &N, &m1, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (itype == 2){ blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_zlange("1", &N, &m1, h_A, &N, rwork)/N; } else if (itype == 3){ blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_zhemm("L", &uplo, &N, &m1, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_zlange("1", &N, &m1, h_A, &N, rwork)/N; } } // =================================================================== // Print execution time // =================================================================== printf("%5d %5d %2d %6.2f\n", (int) N, (int) m1, (int) opts.ngpu, mgpu_time); if ( checkres ){ printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if(itype==1) printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); else if(itype==2) printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); else if(itype==3) printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); } TESTING_HOSTFREE( h_A); TESTING_HOSTFREE( h_B); TESTING_FREE( w1); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_HOSTFREE( rwork); #endif TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); if((opts.warmup)||( checkres )){ TESTING_FREE( h_Ainit); TESTING_FREE( h_Binit); } } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssygvdx */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t mgpu_time; float *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; #endif float *w1, result=0; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: ngpu = %d, itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.ngpu, (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M ngpu MGPU Time (sec)\n"); printf("====================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_sbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_sbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; //magma_int_t NB = 96;//magma_bulge_get_nb(N); //magma_int_t sizvblg = magma_sbulge_get_lq2(N, threads); //magma_int_t siz = max(sizvblg,n2)+2*(N*NB+N)+24*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, float, n2 ); TESTING_MALLOC_PIN( h_B, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, float, lrwork); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slarnv( &ione, ISEED, &n2, h_B ); magma_smake_hpd( N, h_B, N ); magma_smake_symmetric( N, h_A, N ); if ( opts.warmup || opts.check ) { TESTING_MALLOC_CPU( h_Ainit, float, n2 ); TESTING_MALLOC_CPU( h_Binit, float, n2 ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } if ( opts.warmup ) { // ================================================================== // Warmup using MAGMA. I prefer to use smalltest to warmup A- // ================================================================== magma_ssygvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== mgpu_time = magma_wtime(); magma_ssygvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); mgpu_time = magma_wtime() - mgpu_time; if ( opts.check ) { // =================================================================== // Check the results following the LAPACK's [zc]hegvdx routine. // A x = lambda B x is solved // and the following 3 tests computed: // (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) // | A B Z - Z D | / ( |A||Z| N ) (itype = 2) // | B A Z - Z D | / ( |A||Z| N ) (itype = 3) // =================================================================== #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_slansy("1", lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, rwork); result /= lapackf77_slange("1", &N , &m1, h_A, &N, rwork); if (opts.itype == 1) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_slange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_slange("1", &N, &m1, h_A, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_slange("1", &N, &m1, h_A, &N, rwork)/N; } } // =================================================================== // Print execution time // =================================================================== printf("%5d %5d %4d %7.2f\n", (int) N, (int) m1, (int) opts.ngpu, mgpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } printf("\n"); status += ! (result < tol); } TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( iwork ); if ( opts.warmup || opts.check ) { TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t gpu_time, gpu_perf, gflops; magmaFloatComplex *h_A, *h_R, *h_work, *dT1; magmaFloatComplex *tau; float *D, *E; /* Matrix size */ magma_int_t N, n2, lda, lwork, ldt, lwork0; magma_int_t info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) magma_int_t WANTZ=0; magma_int_t THREADS=1; #endif #endif magma_int_t NE = 0; magma_int_t NB = 0; magma_int_t ngpu = 1; magma_opts opts; parse_opts( argc, argv, &opts ); NB = opts.nb; if (NB < 1) NB = 64; //64; //magma_get_chetrd_he2hb_nb(N); // what is NE ? if (NE < 1) NE = 64; //N; //magma_get_chetrd_he2hb_nb(N); // N not yet initialized printf(" N GPU GFlop/s \n"); printf("=====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; n2 = N*lda; gflops = FLOPS_CHETRD( N ) / 1e9; /* We suppose the magma NB is bigger than lapack NB */ lwork0 = N*NB; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, N-1 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork0 ); TESTING_MALLOC_PIN( D, float, N ); TESTING_MALLOC_PIN( E, float, N ); //TESTING_MALLOC_DEV( dT1, magmaFloatComplex, (2*min(N,N)+(N+31)/32*32)*NB ); TESTING_MALLOC_DEV( dT1, magmaFloatComplex, (N*NB) ); // if (WANTZ) gflops = 2.0*gflops; /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hermitian( N, h_A, lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_device_t cdev; magma_getdevice( &cdev ); gpu_time = magma_wtime(); /* magma_chetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork0, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_chetrd_bhe2trc_v5(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ /* magma_chetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_chetrd_bhe2trc(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ magma_range_t range = MagmaRangeAll; magma_int_t fraction_ev = 100; magma_int_t il, iu, m1; float vl=0., vu=0.; if (fraction_ev == 0) { il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int)(fraction_ev*N); if (iu < 1) iu = 1; } magmaFloatComplex *hh_work; magma_int_t *iwork; magma_int_t nb, /*lwork,*/ liwork; magma_int_t threads = magma_get_parallel_numthreads(); #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; lwork = magma_cbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; TESTING_MALLOC_PIN( rwork, float, lrwork ); #else lwork = magma_cbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; nb = magma_get_chetrd_nb(N); TESTING_MALLOC_PIN( hh_work, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); if (ngpu == 1) { printf("calling cheevdx_2stage 1 GPU\n"); magma_cheevdx_2stage( opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } else { printf("calling cheevdx_2stage_m %d GPU\n", (int) ngpu); magma_cheevdx_2stage_m(ngpu, opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } magma_setdevice( cdev ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ /* if ( opts.check ) { FILE *fp ; printf("Writing input matrix in matlab_i_mat.txt ...\n"); fp = fopen ("matlab_i_mat.txt", "w") ; if ( fp == NULL ) { printf("Couldn't open output file\n"); exit(1); } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #if defined(PRECISION_z) || defined(PRECISION_c) fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_A[k+j*lda].x, h_A[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_A[k+j*lda]); #endif } } fclose( fp ) ; printf("Writing output matrix in matlab_o_mat.txt ...\n"); fp = fopen ("matlab_o_mat.txt", "w") ; if ( fp == NULL ) { printf("Couldn't open output file\n"); exit(1); } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #if defined(PRECISION_z) || defined(PRECISION_c) fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_R[k+j*lda].x, h_R[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_R[k+j*lda]); #endif } } fclose( fp ) ; } */ /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); char JOBZ; if (WANTZ == 0) JOBZ = 'N'; else JOBZ = 'V'; float nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaFloatComplex *work2, *AINIT; float *rwork2, *D2; // TODO free this memory ! magma_cmalloc_cpu( &work2, lwork2 ); magma_smalloc_cpu( &rwork2, N ); magma_smalloc_cpu( &D2, N ); magma_cmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaFloatComplex)); /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */ cpu_time = magma_wtime(); i= min(12, THREADS); #if defined(USEMKL) mkl_set_num_threads( i ); #endif #if defined(USEACML) omp_set_num_threads(i); #endif lapackf77_cheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #if defined(PRECISION_z) || defined (PRECISION_c) rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////ssterf_( &N, D, E, &info); //// cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, i); /* for (i=0; i < 10; i++) printf(" voici lpk D[%d] %8.2e\n", i, D2[i]); */ //magmaFloatComplex mydz=0.0, mydo=1.0; //magmaFloatComplex *Z; // magma_cmalloc_cpu( &Z, N*lda ); // dgemm_("N", "N", &N, &N, &N, &mydo, h_R, &lda, h_A, &lda, &mydz, Z, &lda); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaFloatComplex *WORKAJETER; float *RWORKAJETER, *RESU; // TODO free this memory ! magma_cmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_smalloc_cpu( &RWORKAJETER, N ); magma_smalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(float)); MATYPE=3; float NOTHING=0.0; cpu_time = magma_wtime(); // check results ccheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); #if defined(USEMKL) mkl_set_num_threads( 1 ); #endif #if defined(USEACML) omp_set_num_threads(1); #endif printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif #endif printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); printf("============================================================================\n\n\n"); /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); TESTING_FREE_DEV( dT1 ); /* TODO - not all memory has been freed inside loop */ fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE_MGPU(); return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvd */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); magmaFloatComplex *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; #endif float *w1, *w2, result; magma_int_t *iwork; real_Double_t mgpu_time, gpu_time, cpu_time; /* Matrix size */ magma_int_t N, n2, nb; magma_int_t info; magma_int_t ione = 1; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: ngpu = %d, itype = %d, jobz = %s, uplo = %s, check = %d\n", (int) opts.ngpu, (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), (int) opts.check); printf(" N CPU Time (sec) GPU Time (sec) MGPU Time (sec)\n"); printf("=========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // TODO define lda N = opts.nsize[itest]; n2 = N*N; nb = magma_get_chetrd_nb(N); #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t lwork = max( N + N*nb, 2*N + N*N ); magma_int_t lrwork = 1 + 5*N +2*N*N; #else magma_int_t lwork = max( 2*N + N*nb, 1 + 6*N + 2*N*N ); #endif magma_int_t liwork = 3 + 5*N; TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, float, lrwork ); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, N ); magma_cmake_hermitian( N, h_A, N ); if ( opts.warmup || opts.check ) { TESTING_MALLOC_CPU( h_Ainit, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_Binit, magmaFloatComplex, n2 ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } if (opts.warmup) { // ================================================================== // Warmup using MAGMA. // ================================================================== magma_chegvd_m( opts.ngpu, opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== mgpu_time = magma_wtime(); magma_chegvd_m( opts.ngpu, opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); mgpu_time = magma_wtime() - mgpu_time; if (info != 0) printf("magma_chegvd_m returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, rwork); result /= lapackf77_clange("1", &N, &N, h_A, &N, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_clange("1", &N, &N, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &N, h_A, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_csscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_clange("1", &N, &N, h_A, &N, rwork)/N; } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_chegvd(opts.itype, opts.jobz, opts.uplo, N, h_A, N, h_B, N, w2, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_chegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_chegvd(&opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, h_Binit, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_chegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); float temp1 = 0; float temp2 = 0; for(int j=0; j<N; j++) { temp1 = max(temp1, fabs(w1[j])); temp1 = max(temp1, fabs(w2[j])); temp2 = max(temp2, fabs(w1[j]-w2[j])); } float result2 = temp2 / (((float)N)*temp1); /* ===================================================================== Print execution time =================================================================== */ printf("%5d %7.2f %7.2f %7.2f\n", (int) N, cpu_time, gpu_time, mgpu_time); printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } printf( "(3) | D(MGPU)-D(LAPACK) |/ |D| = %8.2e %s\n\n", result2, (result2 < tolulp ? "ok" : "failed") ); status += ! (result < tol && result2 < tolulp); } else { printf("%5d --- --- %7.2f\n", (int) N, mgpu_time); } /* Memory clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); if ( opts.warmup || opts.check ) { TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssygvd */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); float *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; #endif float *w1, *w2, result; magma_int_t *iwork; float mgpu_time, gpu_time, cpu_time; /* Matrix size */ magma_int_t N=0, n2; magma_int_t info; magma_int_t ione = 1; float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ISEED[4] = {0,0,0,1}; magma_timestr_t start, end; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); char jobz = opts.jobz; int checkres = opts.check; char uplo = opts.uplo; magma_int_t itype = opts.itype; if ( checkres && jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); jobz = MagmaVec; } printf("using: nrgpu = %d, itype = %d, jobz = %c, uplo = %c, checkres = %d\n", (int) opts.ngpu, (int) itype, jobz, uplo, (int) checkres); printf(" N M nr GPU MGPU Time(s) \n"); printf("====================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) magma_int_t lwork = 2*N + N*N; magma_int_t lrwork = 1 + 5*N +2*N*N; // MKL's ssygvd has a bug for small N - it looks like what is returned by a // query (consistent with LAPACK's number above) is different from a the memory // requirement ckeck (that returns info -11). The lwork increase below is needed // to pass this check. if (N<32) lwork = 34*32; #else magma_int_t lwork = 1 + 6*N + 2*N*N; #endif magma_int_t liwork = 3 + 5*N; TESTING_MALLOC_PIN( h_A, float, n2 ); TESTING_MALLOC_PIN( h_B, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, float, lrwork ); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); printf(" N CPU Time(s) GPU Time(s) MGPU Time(s) \n"); printf("==================================================\n"); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slarnv( &ione, ISEED, &n2, h_B ); magma_smake_hpd( N, h_B, N ); magma_smake_symmetric( N, h_A, N ); if((opts.warmup)||( checkres )){ TESTING_MALLOC_CPU( h_Ainit, float, n2 ); TESTING_MALLOC_CPU( h_Binit, float, n2 ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } if(opts.warmup){ // ================================================================== // Warmup using MAGMA. // ================================================================== magma_ssygvd_m( opts.ngpu, itype, jobz, uplo, N, h_A, N, h_B, N, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== start = get_current_time(); magma_ssygvd_m( opts.ngpu, itype, jobz, uplo, N, h_A, N, h_B, N, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); end = get_current_time(); if(info != 0) printf("magma_ssygvd_m returned error %d: %s.\n", (int) info, magma_strerror( info )); mgpu_time = GetTimerValue(start,end)/1000.; if ( checkres ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_slansy("1",&uplo, &N, h_Ainit, &N, rwork); result /= lapackf77_slange("1",&N , &N, h_A, &N, rwork); if (itype == 1){ blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", &uplo, &N, &N, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_slange("1", &N, &N, h_work, &N, rwork)/N; } else if (itype == 2){ blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_slange("1", &N, &N, h_A, &N, rwork)/N; } else if (itype == 3){ blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_slange("1", &N, &N, h_A, &N, rwork)/N; } lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_ssygvd(itype, jobz, uplo, N, h_A, N, h_B, N, w2, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); end = get_current_time(); if(info != 0) printf("magma_ssygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); gpu_time = GetTimerValue(start,end)/1000.; /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_ssygvd(&itype, &jobz, &uplo, &N, h_Ainit, &N, h_Binit, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); end = get_current_time(); if (info != 0) printf("lapackf77_ssygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); cpu_time = GetTimerValue(start,end)/1000.; float temp1 = 0; float temp2 = 0; for(int j=0; j<N; j++){ temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } float result2 = temp2 / (((float)N)*temp1); /* ===================================================================== Print execution time =================================================================== */ printf("%5d %6.2f %6.2f %6.2f\n", (int) N, cpu_time, gpu_time, mgpu_time); printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if(itype==1) printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); else if(itype==2) printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); else if(itype==3) printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") ); printf( "(3) | D(MGPU)-D(LAPACK) |/ |D| = %8.2e%s\n\n", result2, (result2 < tolulp ? "" : " failed") ); } else { printf("%5d ------ ------ %6.2f\n", (int) N, mgpu_time); } /* Memory clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); if((opts.warmup)||( checkres )){ TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t gflops, gpu_time, gpu_perf; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; magma_int_t N, n2, lda, ldda, lwork, ldt, info, nstream; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; // TODO add these options to parse_opts magma_int_t NE = 0; magma_int_t distblk = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t WANTZ = (opts.jobz == MagmaVec); double tol = opts.tolerance * lapackf77_dlamch("E"); if (opts.nb == 0) opts.nb = 64; //magma_get_zhetrd_he2hb_nb(N); if (NE < 1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); nstream = max(3, opts.ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex *da[MagmaMaxGPUs], *dT1[MagmaMaxGPUs]; if ((distblk == 0) || (distblk < opts.nb)) distblk = max(256, opts.nb); printf("voici ngpu %d distblk %d NB %d nstream %d\n ", (int) opts.ngpu, (int) distblk, (int) opts.nb, (int) nstream); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; ldda = ((N+31)/32)*32; n2 = N*lda; /* We suppose the magma NB is bigger than lapack NB */ lwork = N*opts.nb; //gflops = ....? /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / opts.ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*opts.nb ); } /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, opts.ngpu, distblk); //magmaDoubleComplex *dabis; //TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); //magma_zsetmatrix(N, N, h_R, lda, dabis, ldda); for (int count=0; count < 1; ++count) { magma_setdevice(0); gpu_time = magma_wtime(); if (opts.version == 30) { magma_zhetrd_he2hb_mgpu_spec( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } else { nstream = 3; magma_zhetrd_he2hb_mgpu( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } // magma_zhetrd_he2hb(opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, dT1[0], &info); gpu_time = magma_wtime() - gpu_time; printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, opts.nb, distblk, opts.ngpu, opts.version, gpu_time); } magma_setdevice(0); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice(dev); cudaDeviceSynchronize(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); // todo neither of these is declared in headers // magma_zhetrd_bhe2trc_v5(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // todo where is this timer started? // gpu_time = magma_wtime() - gpu_time; // todo what are the gflops? gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); char JOBZ; if (WANTZ == 0) JOBZ = 'N'; else JOBZ = 'V'; double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); int nt = min(12, opts.nthread); #if defined(USEMKL) mkl_set_num_threads(nt); #endif #if defined(USEACML) omp_set_num_threads(nt); #endif #if defined(PRECISION_z) || defined (PRECISION_c) lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, rwork2, &info ); #else lapackf77_dsyev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, &info ); #endif ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, nt); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( lapack_vec_const(opts.jobz), &MATYPE, &N, &opts.nb, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); #if defined(USEMKL) mkl_set_num_threads(1); #endif #if defined(USEACML) omp_set_num_threads(1); #endif printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) opts.nthread, (int) N, (int) opts.nb, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif // PRECISION_z || PRECISION_d #endif // CHECKEIG printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("============================================================================\n\n\n"); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); } magma_setdevice( 0 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } TESTING_FINALIZE_MGPU(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); magma_timestr_t start, end; double eps, flops, gpu_perf, gpu_time; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; /* Matrix size */ magma_int_t N = 0, n2, lda, lwork,ldt; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, checkres, once = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; char *uplo = (char *)MagmaLowerStr; magma_int_t ngpu = magma_num_gpus(); magma_int_t nstream = max(3,ngpu+1); magma_int_t WANTZ=0; magma_int_t THREADS=1; magma_int_t NE = 0; magma_int_t NB = 0; magma_int_t distblk =0; magma_int_t ver =0; checkres = 0; //getenv("MAGMA_TESTINGS_CHECK") != NULL; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); once = 1; } else if (strcmp("-NB", argv[i])==0) { NB = atoi(argv[++i]); } else if (strcmp("-D", argv[i])==0) { distblk = atoi(argv[++i]); } else if (strcmp("-threads", argv[i])==0) { THREADS = atoi(argv[++i]); } else if (strcmp("-wantz", argv[i])==0) { WANTZ = atoi(argv[++i]); } else if (strcmp("-NE", argv[i])==0) { NE = atoi(argv[++i]); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = 1; } else if ( strcmp("-v", argv[i]) == 0 && i+1 < argc ) { ver = atoi( argv[++i] ); } else if ( strcmp("-nstream", argv[i]) == 0 && i+1 < argc ) { nstream = atoi( argv[++i] ); magma_assert( nstream > 0 && nstream <= 20, "error: -nstream %s is invalid; must be > 0 and <= 20.\n", argv[i] ); } else if ( strcmp("-ngpu", argv[i]) == 0 && i+1 < argc ) { ngpu = atoi( argv[++i] ); magma_assert( ngpu > 0 || ngpu > MagmaMaxGPUs, "error: -ngpu %s is invalid; must be > 0.\n", argv[i] ); } else if (strcmp("-U", argv[i])==0) uplo = (char *)MagmaUpperStr; else if (strcmp("-L", argv[i])==0) uplo = (char *)MagmaLowerStr; } if ( N > 0 ) printf(" testing_zhetrd_he2hb -L|U -N %d -NB %d -wantz %d -threads %d check %d \n\n", N, NB, WANTZ, THREADS, checkres); else { printf("\nUsage: \n"); printf(" testing_zhetrd_he2hb -L|U -N %d -NB -wantz -threads \n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zhetrd_he2hb -L|U -N %d\n\n", 1024); N = size[9]; } eps = lapackf77_dlamch( "E" ); lda = N; ldt = N; n2 = lda * N; if(NB<1) NB = 64; //64; //magma_get_zhetrd_he2hb_nb(N); if(NE<1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); /* We suppose the magma NB is bigger than lapack NB */ lwork = N*NB; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); nstream = max(3,ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex *da[MagmaMaxGPUs],*dT1[MagmaMaxGPUs]; magma_int_t ldda = ((N+31)/32)*32; if((distblk==0)||(distblk<NB)) distblk = max(256,NB); printf("voici ngpu %d distblk %d NB %d nstream %d\n ",ngpu,distblk,NB,nstream); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*NB ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = N*lda; /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, ngpu, distblk); //magmaDoubleComplex *dabis; // TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); // magma_zsetmatrix(N,N,h_R,lda,dabis,ldda); for (int count=0; count<1;++count){ magma_setdevice(0); start = get_current_time(); if(ver==30){ magma_zhetrd_he2hb_mgpu_spec(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, da, ldda, dT1, NB, ngpu, distblk, streams, nstream, THREADS, &info); }else{ nstream =3; magma_zhetrd_he2hb_mgpu(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, da, ldda, dT1, NB, ngpu, distblk, streams, nstream, THREADS, &info); } // magma_zhetrd_he2hb(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, dT1[0], &info); end = get_current_time(); printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, NB, distblk, ngpu, ver, GetTimerValue(start,end) / 1000.); } magma_setdevice(0); //goto fin; //return 0; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice(dev); cudaDeviceSynchronize(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); magma_zhetrd_bhe2trc_v5(THREADS, WANTZ, uplo[0], NE, N, NB, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(THREADS, WANTZ, uplo[0], NE, N, NB, h_R, lda, D, E, dT1[0], ldt); end = get_current_time(); if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); gpu_perf = flops / GetTimerValue(start,end); gpu_time = GetTimerValue(start,end) / 1000.; /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( checkres ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); char JOBZ; if(WANTZ==0) JOBZ='N'; else JOBZ = 'V'; double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2 = (magmaDoubleComplex *) malloc (lwork2*sizeof(magmaDoubleComplex)); double *rwork2 = (double *) malloc (N*sizeof(double)); double *D2 = (double *) malloc (N*sizeof(double)); magmaDoubleComplex *AINIT = (magmaDoubleComplex *) malloc (N*lda*sizeof(magmaDoubleComplex)); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */ start = get_current_time(); i= min(12,THREADS); #if defined(USEMKL) mkl_set_num_threads( i ); #endif #if defined(USEACML) omp_set_num_threads(i); #endif #if defined(PRECISION_z) || defined (PRECISION_c) lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, rwork2, &info ); #else lapackf77_dsyev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, &info ); #endif ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); //// end = get_current_time(); printf(" Finish CHECK - EIGEN timing= %f threads %d\n", GetTimerValue(start,end) / 1000., i); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; WORKAJETER = (magmaDoubleComplex *) malloc( (2* N * N + N) * sizeof(magmaDoubleComplex) ); RWORKAJETER = (double *) malloc( N * sizeof(double) ); RESU = (double *) malloc(10*sizeof(double)); int MATYPE; memset(RESU,0,10*sizeof(double)); MATYPE=3; double NOTHING=0.0; start = get_current_time(); // check results zcheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2 , D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); end = get_current_time(); printf(" Finish CHECK - results timing= %f\n", GetTimerValue(start,end) / 1000.); #if defined(USEMKL) mkl_set_num_threads( 1 ); #endif #if defined(USEACML) omp_set_num_threads(1); #endif printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if(WANTZ>0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n",RESU[0]); if(WANTZ>0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif #endif printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("============================================================================\n\n\n"); if ( once ) break; } //fin: /* Memory clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } magma_setdevice( 0 ); /* Shutdown */ TESTING_FINALIZE_MGPU(); return EXIT_SUCCESS; }