/* //////////////////////////////////////////////////////////////////////////// -- Testing dsygvd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *h_B, *h_S, *h_work; double *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, nb, lwork, liwork, lda; double result[4] = {0}; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double d_zero = 0.; double d_one = 1.; double d_neg_one = -1.; //magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, uplo = %s\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time(sec)\n"); printf("======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = N*lda; nb = magma_get_dsytrd_nb(N); lwork = 1 + 6*N*nb + 2* N*N; liwork = 3 + 5*N; TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_B, double, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( h_S, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); magma_dmake_hpd( N, h_B, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); /* warmup */ if ( opts.warmup ) { magma_dsygvd( opts.itype, opts.jobz, opts.uplo, N, h_R, lda, h_S, lda, w1, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_dsygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dsygvd( opts.itype, opts.jobz, opts.uplo, N, h_R, lda, h_S, lda, w1, h_work, lwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dsygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | I - V V' B | / ( N ) (itype = 1,2) | B - V V' | / ( |B| N ) (itype = 3) (3) | S(with V) - S(w/o V) | / | S | =================================================================== */ double temp1, temp2; //double *tau; if ( opts.itype == 1 || opts.itype == 2 ) { lapackf77_dlaset( "A", &N, &N, &d_zero, &c_one, h_S, &lda); blasf77_dgemm("N", "C", &N, &N, &N, &c_one, h_R, &lda, h_R, &lda, &d_zero, h_work, &N); blasf77_dsymm("R", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_B, &lda, h_work, &N, &c_one, h_S, &lda); result[1] = lapackf77_dlange("1", &N, &N, h_S, &lda, h_work) / N; } else if ( opts.itype == 3 ) { lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda); blasf77_dsyrk(lapack_uplo_const(opts.uplo), "N", &N, &N, &d_neg_one, h_R, &lda, &d_one, h_S, &lda); result[1] = lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_S, &lda, h_work) / N / lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_B, &lda, h_work); } result[0] = 1.; result[0] /= lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_work); result[0] /= lapackf77_dlange("1", &N, &N, h_R, &lda, h_work); if ( opts.itype == 1 ) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_R, &lda, &d_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_dlange("1", &N, &N, h_work, &N, &temp1)/N; } else if ( opts.itype == 2 ) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_B, &lda, h_R, &lda, &d_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_dlange("1", &N, &N, h_R, &lda, &temp1)/N; } else if ( opts.itype == 3 ) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_R, &lda, &d_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_dlange("1", &N, &N, h_R, &lda, &temp1)/N; } /* lapackf77_dsyt21(&ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, w1, h_R, &lda, h_R, &lda, tau, h_work, rwork, &result[0]); */ lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); magma_dsygvd( opts.itype, MagmaNoVec, opts.uplo, N, h_R, lda, h_S, lda, w2, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_dsygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for(int j=0; j<N; j++) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / temp1; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dsygvd( &opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_B, &lda, w2, h_work, &lwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_dsygvd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if ( opts.itype==1 ) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if ( opts.itype==2 ) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if ( opts.itype==3 ) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } if ( opts.itype==1 || opts.itype==2 ) { printf("(2) | I - Z Z' B | / N = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); } else { printf("(2) | B - Z Z' | / (|B| N) = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); } printf( "(3) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[2], (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0] < tol && result[1] < tol && result[2] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsygvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; double *h_A, *h_R, *h_B, *h_S, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif /* Matrix size */ double *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_dbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_dbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_B, double, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( h_S, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); magma_dmake_hpd( N, h_B, N ); magma_dmake_symmetric( N, h_A, N ); magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } // ================================================================== // Warmup using MAGMA // ================================================================== if (opts.warmup) { lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_dsygvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_dsygvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif double temp1, temp2; result[0] = 1.; result[0] /= lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_int_t m2 = m1; lapackf77_dsygvd(&opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &N, h_S, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); temp1 = temp2 = 0; for(int j=0; j<m2; j++) { temp1 = max(temp1, fabs(w1[j])); temp1 = max(temp1, fabs(w2[j])); temp2 = max(temp2, fabs(w1[j]-w2[j])); } result[1] = temp2 / (((double)m2)*temp1); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( "(2) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_dsymm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 3.456, 5.678 ); double beta = MAGMA_D_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double Anorm, error, work[1]; double *hA, *hB, *hC, *hR; magmaDouble_ptr dA[MagmaMaxGPUs], dB[MagmaMaxGPUs], dC[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magmaDouble_ptr dA2; magma_int_t i, j, dev, M, N, size, lda, ldb, ldc, ldda, lddb, lddc, msize, nb; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t ncmplx = 0; magma_buildconnection_mgpu( gnode, &ncmplx, opts.ngpu ); printf("%% Initializing communication pattern... GPU-ncmplx %d\n", (int) ncmplx); for (i=0; i < ncmplx; ++i) { magma_int_t myngpu = gnode[i][MagmaMaxGPUs]; printf("%% cmplx %d has %d GPUs:", i, myngpu); for (j=0; j < myngpu; ++j) { printf(" %d", (int) gnode[i][j]); if (j < myngpu-1) { printf(","); } } printf("\n"); } // number of queues per GPU. Requires ngpu. magma_int_t nqueue = opts.ngpu; // number of events per GPU. Require ngpu*ngpu. magma_int_t nevents = opts.ngpu*opts.ngpu; magma_queue_t queues[MagmaMaxGPUs][20], queues0[MagmaMaxGPUs]; magma_event_t events[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs + 10]; for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( i = 0; i < nqueue; ++i ) { magma_queue_create( dev, &queues[dev][i] ); } queues0[dev] = queues[dev][0]; for( i = 0; i < nevents; ++i ) { cudaEventCreateWithFlags( &events[dev][i], cudaEventDisableTiming ); } } printf("%% nb %d, ngpu %d, version %d\n", (int) nb, (int) opts.ngpu, (int) opts.version ); printf("%% M N nb offset CPU Gflop/s (sec) GPU Gflop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||B||\n"); printf("%%========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; // TODO depends on side ldb = M; ldc = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default lddc = magma_roundup( ldc, opts.align ); // multiple of 32 by default gflops = FLOPS_DSYMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = lddc*N + (M*N)*opts.ngpu; TESTING_MALLOC_CPU( hA, double, lda*M ); TESTING_MALLOC_CPU( hB, double, ldb*N ); TESTING_MALLOC_CPU( hC, double, ldc*N ); TESTING_MALLOC_PIN( hR, double, ldc*N ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( dA[dev], double, ldda*mlocal ); TESTING_MALLOC_DEV( dB[dev], double, lddb*N ); TESTING_MALLOC_DEV( dC[dev], double, lddc*N ); TESTING_MALLOC_DEV( dwork[dev], double, dworksiz ); } if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, double, ldda*M ); } size = lda*M; lapackf77_dlarnv( &ione, iseed, &size, hA ); magma_dmake_symmetric( M, hA, lda ); size = ldb*N; lapackf77_dlarnv( &ione, iseed, &size, hB ); size = ldc*N; lapackf77_dlarnv( &ione, iseed, &size, hC ); lapackf77_dlacpy( "Full", &M, &N, hC, &ldc, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb, queues0 ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magma_dsetmatrix( M, N, hB, lda, dB[dev], ldda, opts.queue ); // since when offset != 0, the GPU that does beta*C may not be 0, // send initial hC to all GPUs. magma_dsetmatrix( M, N, hC, lda, dC[dev], ldda, opts.queue ); } trace_init( 1, opts.ngpu, nqueue, (magma_queue_t*) queues ); gpu_time = magma_sync_wtime(0); magmablas_dsymm_mgpu( MagmaLeft, MagmaLower, msize, N, alpha, dA, ldda, offset, dB, ldda, beta, dC, ldda, dwork, dworksiz, opts.ngpu, nb, queues, nqueue, events, nevents, gnode, ncmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "dsymm-m%d-n%d-nb%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magma_dsetmatrix( M, M, hA, lda, dA2, ldda, opts.queue ); magma_dsetmatrix( M, N, hB, lda, dB[0], ldda, opts.queue ); magma_dsetmatrix( M, N, hC, lda, dwork[0], ldda, opts.queue ); gpu_time2 = magma_sync_wtime(0); magma_dsymm( MagmaLeft, MagmaLower, msize, N, alpha, dA2 + offset + offset*ldda, ldda, dB[0], ldda, beta, dwork[0], ldda, opts.queue ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||B|| Anorm = lapackf77_dlange("fro", &msize, &msize, hA + offset + offset*lda, &lda, work ); Anorm *= lapackf77_dlange("fro", &msize, &N, hB, &lda, work ); //printf( "A =" ); magma_dprint( M, M, hA, lda ); //printf( "B =" ); magma_dprint( M, N, hB, lda ); //printf( "C =" ); magma_dprint( M, N, hC, lda ); cpu_time = magma_wtime(); blasf77_dsymm( "Left", "Lower", &msize, &N, &alpha, hA + offset + offset*lda, &lda, hB, &lda, &beta, hC, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; for (dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_dgetmatrix( M, N, dC[dev], ldda, hR, lda, opts.queue ); // compute relative error ||R||/||A||*||B||, where R := C_magma - C_lapack = R - C size = ldc*N; blasf77_daxpy( &size, &c_neg_one, hC, &ione, hR, &ione ); error = lapackf77_dlange("fro", &msize, &N, hR, &lda, work) / Anorm; //printf( "R =" ); magma_dprint( M, N, hR, lda ); bool okay = (error < tol); status += ! okay; if (dev == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (okay ? "ok" : "failed") ); } else { printf( " dev %d %74s %8.2e %s\n", dev, "", error, (okay ? "ok" : "failed") ); } } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hB ); TESTING_FREE_CPU( hC ); TESTING_FREE_PIN( hR ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( dA[dev] ); TESTING_FREE_DEV( dB[dev] ); TESTING_FREE_DEV( dC[dev] ); TESTING_FREE_DEV( dwork[dev] ); } if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( i = 0; i < nqueue; ++i ) { magma_queue_destroy( queues[dev][i] ); } for( i = 0; i < nevents; ++i ) { magma_event_destroy( events[dev][i] ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }