/* //////////////////////////////////////////////////////////////////////////// -- Testing zher2k */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An, Bk, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_B, *h_C, *h_Ccublas; magmaDoubleComplex *d_A, *d_B, *d_C; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.msize[itest]; K = opts.ksize[itest]; gflops = FLOPS_ZHER2K(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; ldb = Bn = N; Bk = K; } else { lda = An = K; Ak = N; ldb = Bn = K; Bk = N; } ldc = N; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*Bk ); TESTING_MALLOC_CPU( h_C, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*Bk ); TESTING_MALLOC_DEV( d_C, magmaDoubleComplex, lddc*N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_zsetmatrix( Bn, Bk, h_B, ldb, d_B, lddb ); magma_zsetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasZher2k( handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_zher2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &N, h_C, &ldc, work ); blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_zher2k_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.2345, 4.321 ); double beta = 3.14159; real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double error, work[1]; magmaDoubleComplex *hA, *hR, *hR2, *hV, *hW; magmaDoubleComplex_ptr dV[MagmaMaxGPUs], dW[MagmaMaxGPUs], dA[MagmaMaxGPUs]; magma_int_t n, k, size, lda, ldda, nb, ngpu, nstream; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_queue_t streams[MagmaMaxGPUs][20]; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); ngpu = opts.ngpu; nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); printf( "version 1: magmablas_zher2k_mgpu2 %s\n", (opts.version==1 ? "(enabled)" : "")); printf( "version 2: magmablas_zher2k_mgpu_spec %s\n", (opts.version==2 ? "(enabled)" : "")); #ifdef ICHI printf( "version 3: magma_zher2k_mgpu (Ichi) %s\n", (opts.version==3 ? "(enabled)" : "")); #endif printf( "\n" ); printf( "nb %d, ngpu %d, nstream %d\n", (int) nb, (int) ngpu, (int) nstream ); printf(" n k nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) |R|/(|V|*|W|+|A|)\n"); printf("===================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { n = opts.nsize[itest]; k = opts.ksize[itest]; for( int offset = 0; offset < n; offset += min(k,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { lda = n; ldda = ((n + 31)/32)*32; gflops = FLOPS_ZHER2K( k, n-offset ) / 1e9; TESTING_MALLOC_CPU( hA, magmaDoubleComplex, lda*n ); TESTING_MALLOC_CPU( hR, magmaDoubleComplex, lda*n ); TESTING_MALLOC_CPU( hR2, magmaDoubleComplex, lda*n ); TESTING_MALLOC_CPU( hV, magmaDoubleComplex, lda*k*2 ); //TESTING_MALLOC_CPU( hW, magmaDoubleComplex, lda*k ); for( int d = 0; d < ngpu; ++d ) { magma_int_t nlocal = ((n / nb) / ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], magmaDoubleComplex, ldda*nlocal ); TESTING_MALLOC_DEV( dV[d], magmaDoubleComplex, ldda*k*2 ); //TESTING_MALLOC_DEV( dW[d], magmaDoubleComplex, ldda*k ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } } size = lda*n; lapackf77_zlarnv( &ione, ISEED, &size, hA ); size = lda*k*2; lapackf77_zlarnv( &ione, ISEED, &size, hV ); hW = hV + lda*k; //lapackf77_zlarnv( &ione, ISEED, &size, hW ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( n, n, hA, lda, dA, ldda, ngpu, nb ); for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); dW[d] = dV[d] + ldda*k; magma_zsetmatrix( n, k, hV, lda, dV[d], ldda ); magma_zsetmatrix( n, k, hW, lda, dW[d], ldda ); } gpu_time = magma_sync_wtime(0); if ( opts.version == 1 ) { magmablas_zher2k_mgpu2( MagmaLower, MagmaNoTrans, n-offset, k, alpha, dV, ldda, 0, dW, ldda, 0, beta, dA, ldda, offset, ngpu, nb, streams, nstream ); } else if ( opts.version == 2 ) { magmablas_zher2k_mgpu_spec( MagmaLower, MagmaNoTrans, n-offset, k, alpha, dV, ldda, 0, dW, ldda, 0, beta, dA, ldda, offset, ngpu, nb, streams, nstream ); } else { #ifdef ICHI magma_zher2k_mgpu( ngpu, MagmaLower, MagmaNoTrans, nb, n-offset, k, alpha, dV, ldda, //dW, ldda, beta, dA, ldda, offset, nstream, streams ); #endif } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; // Get dA back to the CPU to compare with the CPU result. magma_zgetmatrix_1D_col_bcyclic( n, n, dA, ldda, hR, lda, ngpu, nb ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack || opts.check ) { // store ||V||*||W|| + ||A|| magma_int_t n_offset = n - offset; error = lapackf77_zlange("f", &n_offset, &k, hV, &lda, work ); error *= lapackf77_zlange("f", &n_offset, &k, hW, &lda, work ); error += lapackf77_zlange("f", &n_offset, &n_offset, &hA[offset + offset*lda], &lda, work ); cpu_time = magma_wtime(); blasf77_zher2k( "Lower", "NoTrans", &n_offset, &k, &alpha, hV, &lda, hW, &lda, &beta, &hA[offset + offset*lda], &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; // compute relative error ||R||/||A||, where R := A_magma - A_lapack = R - A size = lda*n; blasf77_zaxpy( &size, &c_neg_one, hA, &ione, hR, &ione ); error = lapackf77_zlanhe("fro", "Lower", &n_offset, &hR[offset + offset*lda], &lda, work) / error; printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) n, (int) k, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); //, gpu_perf2, gpu_time2, error, error2 ); status += ! (error < tol); } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) ---\n", (int) n, (int) k, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hR ); TESTING_FREE_CPU( hR2 ); TESTING_FREE_CPU( hV ); //TESTING_FREE_CPU( hW ); for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dV[d] ); //TESTING_FREE_DEV( dW[d] ); for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } TESTING_FINALIZE(); return status; }