// Initialize matrix to random & symmetrize. // Having this in separate function ensures the same ISEED is always used, // so we can re-generate the identical matrix. void init_matrix( int m, int n, magmaDoubleComplex *h_A, magma_int_t lda ) { assert( m == n ); magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t n2 = lda*n; lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( n, h_A, lda ); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; magmaDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif /* Matrix size */ double *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_zbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_zbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); magma_zmake_hpd( N, h_B, N ); magma_zmake_hermitian( N, h_A, N ); magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } // ================================================================== // Warmup using MAGMA // ================================================================== if (opts.warmup) { lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_zhegvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_zhegvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if ( opts.check && opts.jobz != MagmaNoVec ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif result[0] = 1.; result[0] /= lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_int_t m2 = m1; lapackf77_zhegvd(&opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &N, h_S, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); double maxw=0, diff=0; for(int j=0; j<m2; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m2*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check && opts.jobz != MagmaNoVec ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf(" | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==2) { printf(" | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==3) { printf(" | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( " | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_zhemm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex calpha = MAGMA_Z_MAKE( 3.456, 5.678 ); magmaDoubleComplex cbeta = MAGMA_Z_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double error=0., errorbis=0., work[1]; magmaDoubleComplex *hA, *hX, *hB, *hR; magmaDoubleComplex_ptr dA[MagmaMaxGPUs], dX[MagmaMaxGPUs], dB[MagmaMaxGPUs], dwork[MagmaMaxGPUs], hwork[MagmaMaxGPUs+1]; magmaDoubleComplex_ptr dA2; magma_int_t M, N, size, lda, ldda, msize, nb, nstream; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx = 0; magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu); printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx); for (int i=0; i < nbcmplx; ++i) { int myngpu = gnode[i][MagmaMaxGPUs]; printf("cmplx %d has %d gpu ", i, myngpu); for(int j=0; j < myngpu; ++j) printf(" %d", (int) gnode[i][j]); printf("\n"); } magma_int_t nbevents = 2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t redevents[MagmaMaxGPUs][20]; magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; for( int d = 0; d < opts.ngpu; ++d ) { for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming); cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming); } } printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version ); printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; ldda = ((M + 31)/32)*32; size = lda*M; gflops = FLOPS_ZHEMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = ldda*N*3; magma_int_t hworksiz = lda*N; TESTING_MALLOC_CPU( hA, magmaDoubleComplex, lda*M ); TESTING_MALLOC_CPU( hX, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( hB, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( hR, magmaDoubleComplex, lda*N ); for( int d = 0; d < opts.ngpu; ++d ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dX[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dB[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork[d], magmaDoubleComplex, dworksiz ); TESTING_MALLOC_PIN( hwork[d], magmaDoubleComplex, hworksiz ); } TESTING_MALLOC_PIN( hwork[opts.ngpu], magmaDoubleComplex, lda*N ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, magmaDoubleComplex, ldda*M ); } lapackf77_zlarnv( &ione, iseed, &size, hA ); magma_zmake_hermitian( M, hA, lda ); size = lda*N; lapackf77_zlarnv( &ione, iseed, &size, hX ); lapackf77_zlarnv( &ione, iseed, &size, hB ); lapackf77_zlacpy( "Full", &M, &N, hB, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); //magmablasSetKernelStream( streams[ d ][ 0 ] ); magma_zsetmatrix( M, N, hX, lda, dX[d], ldda ); //if (d == 0) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); } //memset(hR, 0, lda*N*sizeof(magmaDoubleComplex)); trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams ); //magma_int_t offset = 0; //nb; gpu_time = magma_sync_wtime(0); magmablas_zhemm_mgpu_com( MagmaLeft, MagmaLower, msize, N, calpha, dA, ldda, offset, dX, ldda, cbeta, dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz, opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "zhemm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magmablasSetKernelStream( 0 ); magma_zsetmatrix( M, M, hA, lda, dA2, ldda ); magma_zsetmatrix( M, N, hX, lda, dX[0], ldda ); magma_zsetmatrix( M, N, hB, lda, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0); magma_zhemm( MagmaLeft, MagmaLower, msize, N, calpha, dA2+offset*ldda+offset, ldda, dX[0], ldda, cbeta, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||X|| errorbis = lapackf77_zlange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work ); errorbis *= lapackf77_zlange("fro", &msize, &N, hX, &lda, work ); //printf( "A =" ); magma_zprint( M, M, hA, lda ); //printf( "X =" ); magma_zprint( M, N, hX, lda ); //printf( "B =" ); magma_zprint( M, N, hB, lda ); cpu_time = magma_wtime(); blasf77_zhemm( "Left", "Lower", &msize, &N, &calpha, hA+offset*lda+offset, &lda, hX, &lda, &cbeta, hB, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* trace_file = fopen("AJETE/C", "w"); for (int j = 0; j < N; j++) for (int i = 0; i < siz; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]); fclose(trace_file); */ magma_int_t firstprint=0; for(magma_int_t dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_zgetmatrix( M, N, dB[dev], ldda, hR, lda ); // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B size = lda*N; blasf77_zaxpy( &size, &c_neg_one, hB, &ione, hR, &ione ); error = lapackf77_zlange("fro", &msize, &N, hR, &lda, work) / errorbis; //printf( "R =" ); magma_zprint( M, N, hR, lda ); if (firstprint == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (error < tol ? "ok" : "failed") ); } else { printf( "%89s %8.2e %s\n", " ", error, (error < tol ? "ok" : "failed") ); } status += ! (error < tol); firstprint =1; } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hX ); TESTING_FREE_CPU( hB ); TESTING_FREE_PIN( hR ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dX[d] ); TESTING_FREE_DEV( dB[d] ); TESTING_FREE_DEV( dwork[d] ); TESTING_FREE_PIN( hwork[d] ); } TESTING_FREE_PIN( hwork[opts.ngpu] ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( redevents[d][i] ); magma_event_destroy( redevents2[d][i] ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY, *dC_work; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC( Ymagma, magmaDoubleComplex, sizeY ); TESTING_DEVALLOC( dA, magmaDoubleComplex, sizeA ); TESTING_DEVALLOC( dX, magmaDoubleComplex, sizeX ); TESTING_DEVALLOC( dY, magmaDoubleComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = lda * (blocks + 1); TESTING_DEVALLOC( dC_work, magmaDoubleComplex, ldwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, lda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZhemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMA BLAS =================================================================== */ magma_zsetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); #if (GPUSHMEM >= 200) magmablas_zhemv2( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy, dC_work, ldwork ); #else magmablas_zhemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); #endif magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( &opts.uplo, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); TESTING_DEVFREE( dC_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
// -------------------- int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_time=0, cpu_perf=0, gpu_time, gpu_perf, mgpu_time, mgpu_perf, cuda_time, cuda_perf; double Ynorm, error=0, error2=0, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t n_local[MagmaMaxGPUs]; magma_int_t N, Noffset, lda, ldda, blocks, lhwork, ldwork, matsize, vecsize; magma_int_t incx = 1; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ylapack, *Ycublas, *Ymagma, *Ymagma1, *hwork; magmaDoubleComplex_ptr dA, dX, dY; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magma_device_t dev; magma_queue_t queues[MagmaMaxGPUs]; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); magma_int_t nb = 64; // required by magmablas_zhemv_mgpu implementation for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_create( dev, &queues[dev] ); } // currently, tests all offsets in the offsets array; // comment out loop below to test a specific offset. magma_int_t offset = opts.offset; magma_int_t offsets[] = { 0, 1, 31, 32, 33, 63, 64, 65, 100, 200 }; magma_int_t noffsets = sizeof(offsets) / sizeof(*offsets); printf("%% uplo = %s, ngpu %d, block size = %d, offset %d\n", lapack_uplo_const(opts.uplo), (int) opts.ngpu, (int) nb, (int) offset ); printf( "%% BLAS CUBLAS MAGMA 1 GPU MAGMA MGPU Error rel Error rel\n" "%% N offset Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) to CUBLAS to LAPACK\n" "%%==================================================================================================================\n" ); for( int itest = 0; itest < opts.ntest; ++itest ) { // comment out these two lines & end of loop to test a specific offset for( int ioffset=0; ioffset < noffsets; ioffset += 1 ) { offset = offsets[ioffset]; for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; Noffset = N + offset; lda = Noffset; ldda = magma_roundup( Noffset, opts.align ); // multiple of 32 by default matsize = Noffset*ldda; vecsize = (Noffset-1)*incx + 1; gflops = FLOPS_ZHEMV( N ) / 1e9; blocks = magma_ceildiv( N + (offset % nb), nb ); lhwork = N*opts.ngpu; ldwork = ldda*(blocks + 1); TESTING_MALLOC_CPU( A, magmaDoubleComplex, matsize ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma1, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ylapack, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( X, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( hwork, magmaDoubleComplex, lhwork ); magma_setdevice( opts.device ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, matsize ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, vecsize ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, vecsize ); // TODO make magma_zmalloc_bcyclic helper function? for( dev=0; dev < opts.ngpu; dev++ ) { n_local[dev] = ((Noffset/nb)/opts.ngpu)*nb; if (dev < (Noffset/nb) % opts.ngpu) n_local[dev] += nb; else if (dev == (Noffset/nb) % opts.ngpu) n_local[dev] += Noffset % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*n_local[dev] ); TESTING_MALLOC_DEV( dwork[dev], magmaDoubleComplex, ldwork ); } ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &matsize, A ); magma_zmake_hermitian( Noffset, A, lda ); lapackf77_zlarnv( &ione, ISEED, &vecsize, X ); lapackf77_zlarnv( &ione, ISEED, &vecsize, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_setdevice( opts.device ); magma_zsetmatrix( Noffset, Noffset, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Noffset, X, incx, dX, incx, opts.queue ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); cuda_time = magma_sync_wtime(0); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, &beta, dY + offset, incx ); cuda_time = magma_sync_wtime(0) - cuda_time; cuda_perf = gflops / cuda_time; magma_zgetvector( Noffset, dY, incx, Ycublas, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (1 GPU) =================================================================== */ magma_setdevice( opts.device ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_zhemv_work( opts.uplo, N, alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, beta, dY + offset, incx, dwork[ opts.device ], ldwork, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; magma_zgetvector( Noffset, dY, incx, Ymagma1, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (multi-GPU) =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( Noffset, Noffset, A, lda, d_lA, ldda, opts.ngpu, nb, queues ); blasf77_zcopy( &Noffset, Y, &incx, Ymagma, &incx ); // workspaces do NOT need to be zero -- set to NAN to prove for( dev=0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork[dev], ldwork, opts.queue ); } lapackf77_zlaset( "Full", &lhwork, &ione, &MAGMA_Z_NAN, &MAGMA_Z_NAN, hwork, &lhwork ); mgpu_time = magma_sync_wtime(0); magma_int_t info; info = magmablas_zhemv_mgpu( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } info = magmablas_zhemv_mgpu_sync( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_sync returned error %d: %s.\n", (int) info, magma_strerror( info )); } mgpu_time = magma_sync_wtime(0) - mgpu_time; mgpu_perf = gflops / mgpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { blasf77_zcopy( &Noffset, Y, &incx, Ylapack, &incx ); cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A + offset + offset*lda, &lda, X + offset, &incx, &beta, Ylapack + offset, &incx ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Compute the Difference LAPACK vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ylapack, &incx ); error2 = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ) / Ynorm; } /* ===================================================================== Compute the Difference Cublas vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ycublas, &incx ); error = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ) / Ynorm; bool okay = (error < tol && error2 < tol); status += ! okay; if ( opts.lapack ) { printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) N, (int) offset, cpu_perf, cpu_time*1000., cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, error2, (okay ? "ok" : "failed") ); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e --- %s\n", (int) N, (int) offset, cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, (okay ? "ok" : "failed") ); } /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( Ymagma1 ); TESTING_FREE_CPU( Ylapack ); TESTING_FREE_PIN( X ); TESTING_FREE_PIN( hwork ); magma_setdevice( opts.device ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); for( dev=0; dev < opts.ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); TESTING_FREE_DEV( dwork[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } // comment out these two lines line & top of loop test a specific offset } // end for ioffset printf( "\n" ); } for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_destroy( queues[dev] ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time; double magma_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ymagma; magmaDoubleComplex *dA, *dX, *dY, *dwork; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n"); printf("=========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork ); magmablas_zlaset( MagmaFull, ldda, N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* Note: CUBLAS does not implement zsymv */ /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, ldda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); //magma_zprint_gpu( ldda, blocks, dwork, ldda ); magma_time = magma_sync_wtime( 0 ); magmablas_zsymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork ); // TODO provide option to test non-work interface //magmablas_zsymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); //magma_zprint_gpu( ldda, blocks, dwork, ldda ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); lapackf77_zsymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, cpu_perf, 1000.*cpu_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_time, gpu_perf; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; magma_int_t N, n2, lda, ldda, lwork, ldt, info, nstream; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; // TODO add these options to parse_opts magma_int_t NE = 0; magma_int_t distblk = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t WANTZ = (opts.jobz == MagmaVec); double tol = opts.tolerance * lapackf77_dlamch("E"); if (opts.nb == 0) opts.nb = 64; //magma_get_zhetrd_he2hb_nb(N); if (NE < 1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); nstream = max(3, opts.ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex_ptr da[MagmaMaxGPUs], dT1[MagmaMaxGPUs]; if ((distblk == 0) || (distblk < opts.nb)) distblk = max(256, opts.nb); printf("%% ngpu %d, distblk %d, NB %d, nstream %d\n", (int) opts.ngpu, (int) distblk, (int) opts.nb, (int) nstream); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default n2 = lda*N; /* We suppose the magma NB is bigger than lapack NB */ lwork = N*opts.nb; //gflops = ....? /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / opts.ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*opts.nb ); } /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, opts.ngpu, distblk); //magmaDoubleComplex_ptr dabis; //TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); //magma_zsetmatrix(N, N, h_R, lda, dabis, ldda); for (int count=0; count < 1; ++count) { magma_setdevice(0); gpu_time = magma_wtime(); if (opts.version == 30) { // see src/obsolete and magmablas/obsolete printf( "magma_zhetrd_he2hb_mgpu_spec not compiled\n" ); //magma_zhetrd_he2hb_mgpu_spec( // opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, // da, ldda, dT1, opts.nb, opts.ngpu, distblk, // streams, nstream, opts.nthread, &info); } else { nstream = 3; magma_zhetrd_he2hb_mgpu( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } // magma_zhetrd_he2hb(opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, dT1[0], &info); gpu_time = magma_wtime() - gpu_time; printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, opts.nb, distblk, opts.ngpu, opts.version, gpu_time); } magma_setdevice(0); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice(dev); magma_device_sync(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); // todo neither of these is declared in headers // magma_zhetrd_bhe2trc_v5(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // todo where is this timer started? // gpu_time = magma_wtime() - gpu_time; // todo what are the gflops? gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); int nt = min(12, opts.nthread); magma_set_lapack_numthreads(nt); lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #ifdef COMPLEX rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, nt); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( lapack_vec_const(opts.jobz), &MATYPE, &N, &opts.nb, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); magma_set_lapack_numthreads(1); printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) opts.nthread, (int) N, (int) opts.nb, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif // PRECISION_z || PRECISION_d #endif // CHECKEIG printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("%%===========================================================================\n\n\n"); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); } magma_setdevice( 0 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magma_int_t ione = 1; real_Double_t atomics_perf=0, atomics_time=0; real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error=0, atomics_error=0, cublas_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n"); printf("%%=====================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Yatomics, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); blocks = magma_ceildiv( N, nb ); ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork, ldwork ); magmablas_zlaset( MagmaFull, ldda, N, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_zlaset( "Lower", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[1], &lda ); } else { lapackf77_zlaset( "Upper", &N1, &N1, &MAGMA_Z_NAN, &MAGMA_Z_NAN, &A[lda], &lda ); } lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, ldda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); magmablasSetKernelStream( opts.queue ); // opts.handle also uses opts.queue cublas_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zhemv( opts.uplo, N, alpha, dA, 0, ldda, dX, 0, incx, beta, dY, 0, incy, opts.queue ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using CUBLAS - using atomics =================================================================== */ #ifdef HAVE_CUBLAS cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED ); magma_zsetvector( N, Y, incy, dY, incy ); // sync on queue doesn't work -- need device sync or use NULL stream -- bug in CUBLAS? atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ) - atomics_time; atomics_perf = gflops / atomics_time; magma_zgetvector( N, dY, incy, Yatomics, incy ); cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED ); #endif /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( opts.queue ); if ( opts.version == 1 ) { magmablas_zhemv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue ); } else { // non-work interface (has added overhead) magmablas_zhemv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; #ifdef HAVE_CUBLAS blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy ); atomics_error = lapackf77_zlange( "M", &N, &ione, Yatomics, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; #endif bool okay = (magma_error < tol && cublas_error < tol && atomics_error < tol); status += ! okay; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, atomics_perf, 1000.*atomics_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, atomics_error, (okay ? "ok" : "failed")); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Yatomics ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, gpu_perf, gflops; magmaDoubleComplex *h_A, *h_R, *h_work, *dT1; magmaDoubleComplex *tau; double *D, *E; /* Matrix size */ magma_int_t N, n2, lda, lwork, lwork0; //ldt magma_int_t info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) magma_int_t WANTZ=0; magma_int_t THREADS=1; #endif #endif magma_int_t NE = 0; magma_int_t NB = 0; magma_int_t ngpu = 1; magma_opts opts; opts.parse_opts( argc, argv ); NB = opts.nb; if (NB < 1) NB = 64; //64; //magma_get_zhetrd_he2hb_nb(N); // what is NE ? if (NE < 1) NE = 64; //N; //magma_get_zhetrd_he2hb_nb(N); // N not yet initialized printf("%% N GPU GFlop/s \n"); printf("%%====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; //ldt = N; n2 = lda*N; gflops = FLOPS_ZHETRD( N ) / 1e9; /* We suppose the magma NB is bigger than lapack NB */ lwork0 = N*NB; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork0 ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); //TESTING_MALLOC_DEV( dT1, magmaDoubleComplex, (2*min(N,N) + roundup( N, 32 ))*NB ); TESTING_MALLOC_DEV( dT1, magmaDoubleComplex, (N*NB) ); // if (WANTZ) gflops = 2.0*gflops; /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_device_t cdev; magma_getdevice( &cdev ); gpu_time = magma_wtime(); /* magma_zhetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork0, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_zhetrd_bhe2trc_v5(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ /* magma_zhetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_zhetrd_bhe2trc(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ magma_range_t range = MagmaRangeAll; magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (opts.fraction == 0) { il = max( 1, magma_int_t(0.1*N) ); iu = max( 1, magma_int_t(0.3*N) ); } else { il = 1; iu = max( 1, magma_int_t(opts.fraction*N) ); } magmaDoubleComplex *hh_work; magma_int_t *iwork; magma_int_t /*nb,*/ /*lwork,*/ liwork; magma_int_t threads = magma_get_parallel_numthreads(); #ifdef COMPLEX double *rwork; magma_int_t lrwork; #endif magma_zheevdx_getworksize(N, threads, (opts.jobz == MagmaVec), &lwork, #ifdef COMPLEX &lrwork, #endif &liwork); TESTING_MALLOC_PIN( hh_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, double, lrwork ); #endif if (ngpu == 1) { printf("calling zheevdx_2stage 1 GPU\n"); magma_zheevdx_2stage( opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info); } else { printf("calling zheevdx_2stage_m %d GPU\n", (int) ngpu); magma_zheevdx_2stage_m(ngpu, opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info); } magma_setdevice( cdev ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ /* if ( opts.check ) { FILE *fp; printf("Writing input matrix in matlab_i_mat.txt ...\n"); fp = fopen ("matlab_i_mat.txt", "w"); if ( fp == NULL ) { printf("Couldn't open output file\n"); return -1; } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #ifdef COMPLEX fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_A[k+j*lda].x, h_A[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_A[k+j*lda]); #endif } } fclose( fp ); printf("Writing output matrix in matlab_o_mat.txt ...\n"); fp = fopen ("matlab_o_mat.txt", "w"); if ( fp == NULL ) { printf("Couldn't open output file\n"); return -1; } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #ifdef COMPLEX fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_R[k+j*lda].x, h_R[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_R[k+j*lda]); #endif } } fclose( fp ); } */ /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); const char* JOBZ; if (WANTZ == 0) JOBZ = MagmaNoVecStr; else JOBZ = MagmaVecStr; double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */ cpu_time = magma_wtime(); i= min(12, THREADS); magma_set_lapack_numthreads( i ); lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #ifdef COMPLEX rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); //// cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, i); /* for (i=0; i < 10; i++) printf(" voici lpk D[%d] %8.2e\n", i, D2[i]); */ //magmaDoubleComplex mydz=0.0, mydo=1.0; //magmaDoubleComplex *Z; // magma_zmalloc_cpu( &Z, N*lda ); // dgemm_("N", "N", &N, &N, &N, &mydo, h_R, &lda, h_A, &lda, &mydz, Z, &lda); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); magma_set_lapack_numthreads( 1 ); printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif #endif printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); printf("%%===========================================================================\n\n\n"); /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); TESTING_FREE_DEV( dT1 ); /* TODO - not all memory has been freed inside loop */ fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zheevd_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; magmaDoubleComplex *h_A, *h_R, *d_R, *h_work, aux_work[1]; #ifdef COMPLEX double *rwork, aux_rwork[1]; magma_int_t lrwork; #endif double *w1, *w2, result[4]={0, 0, 0, 0}, eps; magma_int_t *iwork, aux_iwork[1]; magma_int_t N, n2, info, lwork, liwork, lda, ldda; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; eps = lapackf77_dlamch( "E" ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); // checking NoVec requires LAPACK opts.lapack |= (opts.check && opts.jobz == MagmaNoVec); printf("using: jobz = %s, uplo = %s\n", lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; lda = N; ldda = roundup( N, opts.roundup ); // by default, round to multiple of 32 // query for workspace sizes magma_zheevd_gpu( opts.jobz, opts.uplo, N, NULL, ldda, NULL, NULL, lda, aux_work, -1, #ifdef COMPLEX aux_rwork, -1, #endif aux_iwork, -1, &info ); lwork = (magma_int_t) MAGMA_Z_REAL( aux_work[0] ); #ifdef COMPLEX lrwork = (magma_int_t) aux_rwork[0]; #endif liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, N*lda ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); #ifdef COMPLEX TESTING_MALLOC_CPU( rwork, double, lrwork ); #endif TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, N*lda ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_DEV( d_R, magmaDoubleComplex, N*ldda ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, N ); magma_zsetmatrix( N, N, h_A, lda, d_R, ldda ); /* warm up run */ if ( opts.warmup ) { magma_zheevd_gpu( opts.jobz, opts.uplo, N, d_R, ldda, w1, h_R, lda, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_zheevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zsetmatrix( N, N, h_A, lda, d_R, ldda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zheevd_gpu( opts.jobz, opts.uplo, N, d_R, ldda, w1, h_R, lda, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_zheevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check && opts.jobz != MagmaNoVec ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ magma_zgetmatrix( N, N, d_R, ldda, h_R, lda ); magmaDoubleComplex *work; TESTING_MALLOC_CPU( work, magmaDoubleComplex, 2*N*N ); // e=NULL is unused since kband=0; tau=NULL is unused since itype=1 lapackf77_zhet21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, NULL, h_R, &lda, h_R, &lda, NULL, work, #ifdef COMPLEX rwork, #endif &result[0] ); result[0] *= eps; result[1] *= eps; TESTING_FREE_CPU( work ); work=NULL; // Disable eigenvalue check which calls routine again -- // it obscures whether error occurs in first call above or in this call. // But see comparison to LAPACK below. // //magma_zsetmatrix( N, N, h_A, lda, d_R, ldda ); //magma_zheevd_gpu( MagmaNoVec, opts.uplo, // N, d_R, ldda, w2, // h_R, lda, // h_work, lwork, // #ifdef COMPLEX // rwork, lrwork, // #endif // iwork, liwork, // &info); //if (info != 0) // printf("magma_zheevd_gpu returned error %d: %s.\n", // (int) info, magma_strerror( info )); // //double maxw=0, diff=0; //for( int j=0; j < N; j++ ) { // maxw = max(maxw, fabs(w1[j])); // maxw = max(maxw, fabs(w2[j])); // diff = max(diff, fabs(w1[j] - w2[j])); //} //result[2] = diff / (N*maxw); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zheevd( lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_zheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); // compare eigenvalues double maxw=0, diff=0; for( int j=0; j < N; j++ ) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[3] = diff / (N*maxw); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check && opts.jobz != MagmaNoVec ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf(" | A - U S U' | / (|A| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); printf(" | I - U'U | / N = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); //printf(" | S(w/ U) - S(w/o U) | / |S| = %8.2e %s\n\n", result[2], (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0] < tol && result[1] < tol); // && result[2] < tolulp) } if ( opts.lapack ) { printf(" | S_magma - S_lapack | / |S| = %8.2e %s\n\n", result[3], (result[3] < tolulp ? "ok" : "failed") ); status += ! (result[3] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); #ifdef COMPLEX TESTING_FREE_CPU( rwork ); #endif TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_DEV( d_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time /*cpu_time*/; magmaDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; double *w1, *w2, vl=0, vu=0; double result[2] = {0}; magma_int_t *iwork; magma_int_t N, n2, info, il, iu, m1, m2, nb, lwork, liwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif //double d_one = 1.; //double d_ten = 10.; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, uplo = %s, check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; nb = magma_get_zhetrd_nb(N); #if defined(PRECISION_z) || defined(PRECISION_c) lwork = 2*N*nb + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = 1 + 6*N*nb + 2* N*N; #endif liwork = 3 + 5*N; if ( opts.fraction == 0 ) { il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int) (opts.fraction*N); if (iu < 1) iu = 1; } TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); magma_zmake_hpd( N, h_B, N ); magma_zmake_hermitian( N, h_A, N ); // ================================================================== // Warmup using MAGMA // ================================================================== if(opts.warmup){ lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_zhegvdx( opts.itype, opts.jobz, MagmaRangeI, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_zhegvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_zhegvdx( opts.itype, opts.jobz, MagmaRangeI, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_zhegvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif double temp1, temp2; result[0] = 1.; result[0] /= lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_zhegvdx( opts.itype, MagmaNoVec, MagmaRangeI, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m2, w2, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_zhegvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for(int j=0; j < m2; j++) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[1] = temp2 / (((double)m2)*temp1); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype == 1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype == 2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype == 3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( "(2) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); magma_timestr_t start, end; double eps, flops, gpu_perf, gpu_time; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; /* Matrix size */ magma_int_t N = 0, n2, lda, lwork,ldt; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, checkres, once = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; char *uplo = (char *)MagmaLowerStr; magma_int_t ngpu = magma_num_gpus(); magma_int_t nstream = max(3,ngpu+1); magma_int_t WANTZ=0; magma_int_t THREADS=1; magma_int_t NE = 0; magma_int_t NB = 0; magma_int_t distblk =0; magma_int_t ver =0; checkres = 0; //getenv("MAGMA_TESTINGS_CHECK") != NULL; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); once = 1; } else if (strcmp("-NB", argv[i])==0) { NB = atoi(argv[++i]); } else if (strcmp("-D", argv[i])==0) { distblk = atoi(argv[++i]); } else if (strcmp("-threads", argv[i])==0) { THREADS = atoi(argv[++i]); } else if (strcmp("-wantz", argv[i])==0) { WANTZ = atoi(argv[++i]); } else if (strcmp("-NE", argv[i])==0) { NE = atoi(argv[++i]); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = 1; } else if ( strcmp("-v", argv[i]) == 0 && i+1 < argc ) { ver = atoi( argv[++i] ); } else if ( strcmp("-nstream", argv[i]) == 0 && i+1 < argc ) { nstream = atoi( argv[++i] ); magma_assert( nstream > 0 && nstream <= 20, "error: -nstream %s is invalid; must be > 0 and <= 20.\n", argv[i] ); } else if ( strcmp("-ngpu", argv[i]) == 0 && i+1 < argc ) { ngpu = atoi( argv[++i] ); magma_assert( ngpu > 0 || ngpu > MagmaMaxGPUs, "error: -ngpu %s is invalid; must be > 0.\n", argv[i] ); } else if (strcmp("-U", argv[i])==0) uplo = (char *)MagmaUpperStr; else if (strcmp("-L", argv[i])==0) uplo = (char *)MagmaLowerStr; } if ( N > 0 ) printf(" testing_zhetrd_he2hb -L|U -N %d -NB %d -wantz %d -threads %d check %d \n\n", N, NB, WANTZ, THREADS, checkres); else { printf("\nUsage: \n"); printf(" testing_zhetrd_he2hb -L|U -N %d -NB -wantz -threads \n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zhetrd_he2hb -L|U -N %d\n\n", 1024); N = size[9]; } eps = lapackf77_dlamch( "E" ); lda = N; ldt = N; n2 = lda * N; if(NB<1) NB = 64; //64; //magma_get_zhetrd_he2hb_nb(N); if(NE<1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); /* We suppose the magma NB is bigger than lapack NB */ lwork = N*NB; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); nstream = max(3,ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex *da[MagmaMaxGPUs],*dT1[MagmaMaxGPUs]; magma_int_t ldda = ((N+31)/32)*32; if((distblk==0)||(distblk<NB)) distblk = max(256,NB); printf("voici ngpu %d distblk %d NB %d nstream %d\n ",ngpu,distblk,NB,nstream); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*NB ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = N*lda; /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, ngpu, distblk); //magmaDoubleComplex *dabis; // TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); // magma_zsetmatrix(N,N,h_R,lda,dabis,ldda); for (int count=0; count<1;++count){ magma_setdevice(0); start = get_current_time(); if(ver==30){ magma_zhetrd_he2hb_mgpu_spec(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, da, ldda, dT1, NB, ngpu, distblk, streams, nstream, THREADS, &info); }else{ nstream =3; magma_zhetrd_he2hb_mgpu(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, da, ldda, dT1, NB, ngpu, distblk, streams, nstream, THREADS, &info); } // magma_zhetrd_he2hb(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, dT1[0], &info); end = get_current_time(); printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, NB, distblk, ngpu, ver, GetTimerValue(start,end) / 1000.); } magma_setdevice(0); //goto fin; //return 0; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice(dev); cudaDeviceSynchronize(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); magma_zhetrd_bhe2trc_v5(THREADS, WANTZ, uplo[0], NE, N, NB, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(THREADS, WANTZ, uplo[0], NE, N, NB, h_R, lda, D, E, dT1[0], ldt); end = get_current_time(); if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); gpu_perf = flops / GetTimerValue(start,end); gpu_time = GetTimerValue(start,end) / 1000.; /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( checkres ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); char JOBZ; if(WANTZ==0) JOBZ='N'; else JOBZ = 'V'; double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2 = (magmaDoubleComplex *) malloc (lwork2*sizeof(magmaDoubleComplex)); double *rwork2 = (double *) malloc (N*sizeof(double)); double *D2 = (double *) malloc (N*sizeof(double)); magmaDoubleComplex *AINIT = (magmaDoubleComplex *) malloc (N*lda*sizeof(magmaDoubleComplex)); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */ start = get_current_time(); i= min(12,THREADS); #if defined(USEMKL) mkl_set_num_threads( i ); #endif #if defined(USEACML) omp_set_num_threads(i); #endif #if defined(PRECISION_z) || defined (PRECISION_c) lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, rwork2, &info ); #else lapackf77_dsyev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, &info ); #endif ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); //// end = get_current_time(); printf(" Finish CHECK - EIGEN timing= %f threads %d\n", GetTimerValue(start,end) / 1000., i); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; WORKAJETER = (magmaDoubleComplex *) malloc( (2* N * N + N) * sizeof(magmaDoubleComplex) ); RWORKAJETER = (double *) malloc( N * sizeof(double) ); RESU = (double *) malloc(10*sizeof(double)); int MATYPE; memset(RESU,0,10*sizeof(double)); MATYPE=3; double NOTHING=0.0; start = get_current_time(); // check results zcheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2 , D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); end = get_current_time(); printf(" Finish CHECK - results timing= %f\n", GetTimerValue(start,end) / 1000.); #if defined(USEMKL) mkl_set_num_threads( 1 ); #endif #if defined(USEACML) omp_set_num_threads(1); #endif printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if(WANTZ>0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n",RESU[0]); if(WANTZ>0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif #endif printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("============================================================================\n\n\n"); if ( once ) break; } //fin: /* Memory clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } magma_setdevice( 0 ); /* Shutdown */ TESTING_FINALIZE_MGPU(); return EXIT_SUCCESS; }