// Initialize matrix to random & symmetrize. // Having this in separate function ensures the same ISEED is always used, // so we can re-generate the identical matrix. void init_matrix( int m, int n, double *h_A, magma_int_t lda ) { assert( m == n ); magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t n2 = lda*n; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_symmetric( n, h_A, lda ); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsygvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; double *h_A, *h_R, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif /* Matrix size */ double *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1};; magma_int_t info_ortho = 0; magma_int_t info_solution = 0; magma_int_t info_reduction = 0; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, range = %s, uplo = %s, check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec) ||I-Q'Q||/. ||A-QDQ'||/. ||D-D_magma||/.\n"); printf("=======================================================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_dbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_dbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork ); #endif /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_symmetric( N, h_A, N ); magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } if (opts.warmup) { // ================================================================== // Warmup using MAGMA // ================================================================== lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); if (opts.ngpu == 1) { //printf("calling dsyevdx_2stage 1 GPU\n"); magma_dsyevdx_2stage(opts.jobz, range, opts.uplo, N, h_R, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } else { //printf("calling dsyevdx_2stage_m %d GPU\n", (int) opts.ngpu); magma_dsyevdx_2stage_m(opts.ngpu, opts.jobz, range, opts.uplo, N, h_R, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); gpu_time = magma_wtime(); if (opts.ngpu == 1) { //printf("calling dsyevdx_2stage 1 GPU\n"); magma_dsyevdx_2stage(opts.jobz, range, opts.uplo, N, h_R, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } else { //printf("calling dsyevdx_2stage_m %d GPU\n", (int) opts.ngpu); magma_dsyevdx_2stage_m(opts.ngpu, opts.jobz, range, opts.uplo, N, h_R, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } gpu_time = magma_wtime() - gpu_time; printf("%5d %5d %7.2f ", (int) N, (int) m1, gpu_time ); if ( opts.check ) { double eps = lapackf77_dlamch("E"); //printf("\n"); //printf("------ TESTS FOR MAGMA DSYEVD ROUTINE ------- \n"); //printf(" Size of the Matrix %d by %d\n", (int) N, (int) N); //printf("\n"); //printf(" The matrix A is randomly generated for each test.\n"); //printf("============\n"); //printf(" The relative machine precision (eps) is %8.2e\n",eps); //printf(" Computational tests pass if scaled residuals are less than 60.\n"); /* Check the orthogonality, reduction and the eigen solutions */ if (opts.jobz == MagmaVec) { info_ortho = check_orthogonality(N, N, h_R, N, eps); info_reduction = check_reduction(opts.uplo, N, 1, h_A, w1, N, h_R, eps); } //printf("------ CALLING LAPACK DSYEVD TO COMPUTE only eigenvalue and verify elementswise ------- \n"); lapackf77_dsyevd("N", "L", &N, h_A, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); info_solution = check_solution(N, w2, w1, eps); if ( (info_solution == 0) && (info_ortho == 0) && (info_reduction == 0) ) { printf(" ok\n"); //printf("***************************************************\n"); //printf(" ---- TESTING DSYEVD ...................... PASSED !\n"); //printf("***************************************************\n"); } else { printf(" failed\n"); status += 1; //printf("************************************************\n"); //printf(" - TESTING DSYEVD ... FAILED !\n"); //printf("************************************************\n"); } } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsysv_nopiv_gpu */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_B, *h_X, temp, *hwork; magmaDouble_ptr d_A, d_B; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB, lwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_DGETRF( N, N ) + FLOPS_DGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); bool nopiv = true; if ( nopiv ) { magma_dmake_hpd( N, h_A, lda ); // SPD / HPD does not require pivoting } else { magma_dmake_symmetric( N, h_A, lda ); // symmetric/symmetric generally requires pivoting } magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( N, nrhs, h_B, ldb, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dsysv_nopiv_gpu( opts.uplo, N, nrhs, d_A, ldda, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgesv_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); Anorm = lapackf77_dlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_dlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_dlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_dsysv( lapack_uplo_const(opts.uplo), &N,&nrhs, h_A, &lda, ipiv, h_B, &ldb, &temp, &lwork, &info ); lwork = (magma_int_t) MAGMA_D_REAL( temp ); TESTING_MALLOC_PIN( hwork, double, lwork ); cpu_time = magma_wtime(); lapackf77_dsysv( lapack_uplo_const(opts.uplo), &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, hwork, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dsysv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); TESTING_FREE_CPU( hwork ); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsygvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; double *h_A, *h_R, *h_B, *h_S, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif /* Matrix size */ double *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_dbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_dbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_B, double, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( h_S, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); magma_dmake_hpd( N, h_B, N ); magma_dmake_symmetric( N, h_A, N ); magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } // ================================================================== // Warmup using MAGMA // ================================================================== if (opts.warmup) { lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_dsygvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_dsygvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif double temp1, temp2; result[0] = 1.; result[0] /= lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_int_t m2 = m1; lapackf77_dsygvd(&opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &N, h_S, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); temp1 = temp2 = 0; for(int j=0; j<m2; j++) { temp1 = max(temp1, fabs(w1[j])); temp1 = max(temp1, fabs(w2[j])); temp2 = max(temp2, fabs(w1[j]-w2[j])); } result[1] = temp2 / (((double)m2)*temp1); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( "(2) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_dsymm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 3.456, 5.678 ); double beta = MAGMA_D_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double Anorm, error, work[1]; double *hA, *hB, *hC, *hR; magmaDouble_ptr dA[MagmaMaxGPUs], dB[MagmaMaxGPUs], dC[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magmaDouble_ptr dA2; magma_int_t i, j, dev, M, N, size, lda, ldb, ldc, ldda, lddb, lddc, msize, nb; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t ncmplx = 0; magma_buildconnection_mgpu( gnode, &ncmplx, opts.ngpu ); printf("%% Initializing communication pattern... GPU-ncmplx %d\n", (int) ncmplx); for (i=0; i < ncmplx; ++i) { magma_int_t myngpu = gnode[i][MagmaMaxGPUs]; printf("%% cmplx %d has %d GPUs:", i, myngpu); for (j=0; j < myngpu; ++j) { printf(" %d", (int) gnode[i][j]); if (j < myngpu-1) { printf(","); } } printf("\n"); } // number of queues per GPU. Requires ngpu. magma_int_t nqueue = opts.ngpu; // number of events per GPU. Require ngpu*ngpu. magma_int_t nevents = opts.ngpu*opts.ngpu; magma_queue_t queues[MagmaMaxGPUs][20], queues0[MagmaMaxGPUs]; magma_event_t events[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs + 10]; for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( i = 0; i < nqueue; ++i ) { magma_queue_create( dev, &queues[dev][i] ); } queues0[dev] = queues[dev][0]; for( i = 0; i < nevents; ++i ) { cudaEventCreateWithFlags( &events[dev][i], cudaEventDisableTiming ); } } printf("%% nb %d, ngpu %d, version %d\n", (int) nb, (int) opts.ngpu, (int) opts.version ); printf("%% M N nb offset CPU Gflop/s (sec) GPU Gflop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||B||\n"); printf("%%========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; // TODO depends on side ldb = M; ldc = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default lddc = magma_roundup( ldc, opts.align ); // multiple of 32 by default gflops = FLOPS_DSYMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = lddc*N + (M*N)*opts.ngpu; TESTING_MALLOC_CPU( hA, double, lda*M ); TESTING_MALLOC_CPU( hB, double, ldb*N ); TESTING_MALLOC_CPU( hC, double, ldc*N ); TESTING_MALLOC_PIN( hR, double, ldc*N ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( dA[dev], double, ldda*mlocal ); TESTING_MALLOC_DEV( dB[dev], double, lddb*N ); TESTING_MALLOC_DEV( dC[dev], double, lddc*N ); TESTING_MALLOC_DEV( dwork[dev], double, dworksiz ); } if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, double, ldda*M ); } size = lda*M; lapackf77_dlarnv( &ione, iseed, &size, hA ); magma_dmake_symmetric( M, hA, lda ); size = ldb*N; lapackf77_dlarnv( &ione, iseed, &size, hB ); size = ldc*N; lapackf77_dlarnv( &ione, iseed, &size, hC ); lapackf77_dlacpy( "Full", &M, &N, hC, &ldc, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb, queues0 ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magma_dsetmatrix( M, N, hB, lda, dB[dev], ldda, opts.queue ); // since when offset != 0, the GPU that does beta*C may not be 0, // send initial hC to all GPUs. magma_dsetmatrix( M, N, hC, lda, dC[dev], ldda, opts.queue ); } trace_init( 1, opts.ngpu, nqueue, (magma_queue_t*) queues ); gpu_time = magma_sync_wtime(0); magmablas_dsymm_mgpu( MagmaLeft, MagmaLower, msize, N, alpha, dA, ldda, offset, dB, ldda, beta, dC, ldda, dwork, dworksiz, opts.ngpu, nb, queues, nqueue, events, nevents, gnode, ncmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "dsymm-m%d-n%d-nb%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magma_dsetmatrix( M, M, hA, lda, dA2, ldda, opts.queue ); magma_dsetmatrix( M, N, hB, lda, dB[0], ldda, opts.queue ); magma_dsetmatrix( M, N, hC, lda, dwork[0], ldda, opts.queue ); gpu_time2 = magma_sync_wtime(0); magma_dsymm( MagmaLeft, MagmaLower, msize, N, alpha, dA2 + offset + offset*ldda, ldda, dB[0], ldda, beta, dwork[0], ldda, opts.queue ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||B|| Anorm = lapackf77_dlange("fro", &msize, &msize, hA + offset + offset*lda, &lda, work ); Anorm *= lapackf77_dlange("fro", &msize, &N, hB, &lda, work ); //printf( "A =" ); magma_dprint( M, M, hA, lda ); //printf( "B =" ); magma_dprint( M, N, hB, lda ); //printf( "C =" ); magma_dprint( M, N, hC, lda ); cpu_time = magma_wtime(); blasf77_dsymm( "Left", "Lower", &msize, &N, &alpha, hA + offset + offset*lda, &lda, hB, &lda, &beta, hC, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; for (dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_dgetmatrix( M, N, dC[dev], ldda, hR, lda, opts.queue ); // compute relative error ||R||/||A||*||B||, where R := C_magma - C_lapack = R - C size = ldc*N; blasf77_daxpy( &size, &c_neg_one, hC, &ione, hR, &ione ); error = lapackf77_dlange("fro", &msize, &N, hR, &lda, work) / Anorm; //printf( "R =" ); magma_dprint( M, N, hR, lda ); bool okay = (error < tol); status += ! okay; if (dev == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (okay ? "ok" : "failed") ); } else { printf( " dev %d %74s %8.2e %s\n", dev, "", error, (okay ? "ok" : "failed") ); } } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hB ); TESTING_FREE_CPU( hC ); TESTING_FREE_PIN( hR ); for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( dA[dev] ); TESTING_FREE_DEV( dB[dev] ); TESTING_FREE_DEV( dC[dev] ); TESTING_FREE_DEV( dwork[dev] ); } if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( i = 0; i < nqueue; ++i ) { magma_queue_destroy( queues[dev][i] ); } for( i = 0; i < nevents; ++i ) { magma_event_destroy( events[dev][i] ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }