/* //////////////////////////////////////////////////////////////////////////// -- Testing clanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A; float *h_work; magmaFloatComplex_ptr d_A; magmaFloat_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; float error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); float tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_clanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_clanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_clanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded clanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL clanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaFloatComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_work, float, N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, float, N ); /* Initialize the matrix */ lapackf77_clarnv( &idist, ISEED, &n2, h_A ); magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_clanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_clanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded clanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_clanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_clanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabsf. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_C_NAN; magma_csetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_clanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_clanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_C_INF; magma_csetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_clanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_clanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? " " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in clanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; double cublas_error, normA, normx, normr, work[1]; magma_int_t N, info; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaDoubleComplex *h_A, *h_b, *h_x, *h_xcublas; magmaDoubleComplex_ptr d_A, d_x; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("uplo = %s, transA = %s, diag = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("============================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; gflops = FLOPS_ZTRSM(opts.side, N, 1) / 1e9; lda = N; ldda = ((lda+31)/32)*32; sizeA = lda*N; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_b, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( h_x, magmaDoubleComplex, N ); TESTING_MALLOC_CPU( h_xcublas, magmaDoubleComplex, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_x, magmaDoubleComplex, N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_zlarnv( &ione, ISEED, &N, h_b ); blasf77_zcopy( &N, h_b, &ione, h_x, &ione ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); magma_zsetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasZtrsv( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrsv( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) // error for CUBLAS normA = lapackf77_zlange( "F", &N, &N, h_A, &lda, work ); normx = lapackf77_zlange( "F", &N, &ione, h_xcublas, &ione, work ); blasf77_ztrmv( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &N, h_A, &lda, h_xcublas, &ione ); blasf77_zaxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione ); normr = lapackf77_zlange( "F", &N, &ione, h_xcublas, &N, work ); cublas_error = normr / (normA*normx); if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) %8.2e %s\n", (int) N, cublas_perf, 1000.*cublas_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_b ); TESTING_FREE_CPU( h_x ); TESTING_FREE_CPU( h_xcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_x ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(void) { int N; // A[N][N], B[N][N], C[N][N] int sz; // number of elements in each matrix float tmp; N = ORDER; sz = N * N; std::vector<float> h_A(sz); // Matrix A on the host std::vector<float> h_B(sz); // Matrix B on the host std::vector<float> h_C(sz); // Matrix C on the host cl::Buffer d_A; // matrix A on the device cl::Buffer d_B; // matrix B on the device cl::Buffer d_C; // matrix C on the device initmat(N, N, N, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER); zero_mat(N, N, h_C); util::Timer timer; for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { tmp = 0.0f; for (int k = 0; k < N; k++) { tmp += h_A[i*N+k] * h_B[k*N+j]; } h_C[i*N+j] = tmp; } } double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, h_C, rtime); printf("\n===== Parallel matrix mult (dot prod), order %d on device ======\n",ORDER); switch (DEVICE) { case CL_DEVICE_TYPE_DEFAULT: printf("DEVICE=DEFAULT\n"); break; case CL_DEVICE_TYPE_CPU: printf("DEVICE=CPU\n"); break; case CL_DEVICE_TYPE_GPU: printf("DEVICE=GPU\n"); break; default: printf("DEVICE=%d\n", DEVICE); break; } zero_mat(N, N, h_C); try { cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context. // Build program explicitly so I can catch errors and display // compiler error messages (should any be generated) cl::Program program(context, util::loadProgram("matmul_kernel.cl")); try { program.build(); } catch (cl::Error error) { // If it was a build error then show the error if (error.err() == CL_BUILD_PROGRAM_FAILURE) { std::vector<cl::Device> devices; devices = context.getInfo<CL_CONTEXT_DEVICES>(); std::string built = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); std::cerr << built << "\n"; } throw error; } // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto mmul = cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg> (program, "mmul"); util::Timer timer; d_A = cl::Buffer(context, begin(h_A), end(h_A), true); d_B = cl::Buffer(context, begin(h_B), end(h_B), true); d_C = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * sz); // Work-group computes a block of C. This size is also set // in a #define inside the kernel function. Note this blocksize // must evenly divide the matrix order int blocksize = 16; cl::LocalSpaceArg A_block = cl::Local(sizeof(float) * blocksize*blocksize); cl::LocalSpaceArg B_block = cl::Local(sizeof(float) * blocksize*blocksize); mmul( cl::EnqueueArgs( queue, cl::NDRange(N,N), cl::NDRange(blocksize,blocksize)), N, d_A, d_B, d_C, A_block, B_block); cl::copy(queue, d_C, begin(h_C), end(h_C)); double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, h_C, rtime); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << std::endl; } }
/* //////////////////////////////////////////////////////////////////////////// -- Testing strsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float cublas_error, normA, normx, normr, work[1]; magma_int_t N, info; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; float *h_A, *h_b, *h_x, *h_xcublas; float *d_A, *d_x; float c_neg_one = MAGMA_S_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("uplo = %c, transA = %c, diag = %c\n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("============================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_STRSM(opts.side, N, 1) / 1e9; lda = N; ldda = ((lda+31)/32)*32; sizeA = lda*N; TESTING_MALLOC( ipiv, magma_int_t, N ); TESTING_MALLOC( h_A, float, lda*N ); TESTING_MALLOC( h_b, float, N ); TESTING_MALLOC( h_x, float, N ); TESTING_MALLOC( h_xcublas, float, N ); TESTING_DEVALLOC( d_A, float, ldda*N ); TESTING_DEVALLOC( d_x, float, N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_sgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_slarnv( &ione, ISEED, &N, h_b ); blasf77_scopy( &N, h_b, &ione, h_x, &ione ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); magma_ssetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasStrsv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_strsv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) // error for CUBLAS normA = lapackf77_slange( "F", &N, &N, h_A, &lda, work ); normx = lapackf77_slange( "F", &N, &ione, h_xcublas, &ione, work ); blasf77_strmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_xcublas, &ione ); blasf77_saxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione ); normr = lapackf77_slange( "F", &N, &ione, h_xcublas, &N, work ); cublas_error = normr / (normA*normx); if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cublas_error ); } TESTING_FREE( h_A ); TESTING_FREE( h_x ); TESTING_FREE( h_xcublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; double magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaDoubleComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2; magmaDoubleComplex *d_A, *d_B; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_ZTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_B1, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_X1, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_X2, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaDoubleComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_ztrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_zgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasZtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaDoubleComplex)); magmaDoubleComplex alpha2 = MAGMA_Z_DIV( c_one, alpha ); blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); double norm1 = lapackf77_zlange( "M", &M, &N, h_X1, &ldb, work ); double normx = lapackf77_zlange( "M", &M, &N, h_Bmagma, &ldb, work ); double normA = lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaDoubleComplex)); blasf77_ztrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_zaxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_zlange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_zlange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_zlange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_B1 ); TESTING_FREE_CPU( h_X1 ); TESTING_FREE_CPU( h_X2 ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argc, char *argv[]) { int N; // A[N][N], B[N][N], C[N][N] int size; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing N = ORDER; size = N * N; std::vector<float> h_A(size); // Host memory for Matrix A std::vector<float> h_B(size); // Host memory for Matrix B std::vector<float> h_C(size); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory //-------------------------------------------------------------------------------- // Create a context and queue //-------------------------------------------------------------------------------- try { cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices std::vector<cl::Device> devices; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { std::cout << "Invalid device index (try '--list')\n"; return EXIT_FAILURE; } cl::Device device = devices[deviceIndex]; std::string name; getDeviceName(device, name); std::cout << "\nUsing OpenCL device: " << name << "\n"; std::vector<cl::Device> chosen_device; chosen_device.push_back(device); cl::Context context(chosen_device); cl::CommandQueue queue(context, device); //-------------------------------------------------------------------------------- // Run sequential matmul //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true); d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(N, N); naive_mmul(cl::EnqueueArgs(queue, global), N, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, h_C.begin(), h_C.end()); results(N, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeadd */ int main( int argc, char** argv) { #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define h_B(i_, j_) (h_B + (i_) + (j_)*lda) // B uses lda TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Bnorm, error, work[1]; float *h_A, *h_B, *d_A, *d_B; float alpha = MAGMA_S_MAKE( 3.1415, 2.71828 ); float beta = MAGMA_S_MAKE( 6.0221, 6.67408 ); float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); /* Uncomment these lines to check parameters. * magma_xerbla calls lapack's xerbla to print out error. */ //magmablas_sgeadd( -1, N, alpha, d_A, ldda, d_B, ldda, opts.queue ); //magmablas_sgeadd( M, -1, alpha, d_A, ldda, d_B, ldda, opts.queue ); //magmablas_sgeadd( M, N, alpha, d_A, M-1, d_B, ldda, opts.queue ); //magmablas_sgeadd( M, N, alpha, d_A, ldda, d_B, N-1, opts.queue ); printf("%% M N CPU Gflop/s (ms) GPU Gflop/s (ms) |Bl-Bm|/|Bl|\n"); printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default size = lda*N; gflops = 2.*M*N / 1e9; TESTING_MALLOC_CPU( h_A, float, lda *N ); TESTING_MALLOC_CPU( h_B, float, lda *N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, ldda*N ); lapackf77_slarnv( &ione, ISEED, &size, h_A ); lapackf77_slarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda, opts.queue ); magma_ssetmatrix( M, N, h_B, lda, d_B, ldda, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); if ( opts.version == 1 ) { magmablas_sgeadd( M, N, alpha, d_A, ldda, d_B, ldda, opts.queue ); } else { magmablas_sgeadd2( M, N, alpha, d_A, ldda, beta, d_B, ldda, opts.queue ); } gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if ( opts.version == 1 ) { for( int j = 0; j < N; ++j ) { blasf77_saxpy( &M, &alpha, &h_A[j*lda], &ione, &h_B[j*lda], &ione ); } } else { for( int j = 0; j < N; ++j ) { // daxpby for( int i=0; i < M; ++i ) { *h_B(i,j) = alpha * (*h_A(i,j)) + beta * (*h_B(i,j)); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check result =================================================================== */ magma_sgetmatrix( M, N, d_B, ldda, h_A, lda, opts.queue ); blasf77_saxpy( &size, &c_neg_one, h_B, &ione, h_A, &ione ); Bnorm = lapackf77_slange( "F", &M, &N, h_B, &lda, work ); error = lapackf77_slange( "F", &M, &N, h_A, &lda, work ) / Bnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error=0, cublas_error, lapack_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_Blapack, *h_X; magmaFloatComplex_ptr d_A, d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% side = %s, uplo = %s, transA = %s, diag = %s, ngpu = %d\n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), int(abs_ngpu) ); printf("%% M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA CUBLAS LAPACK error\n"); printf("%%============================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Blapack, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy( h_Blapack, h_B, sizeB*sizeof(magmaFloatComplex) ); magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #if defined(HAVE_CUBLAS) magma_csetmatrix( M, N, h_B, ldb, d_B, lddb, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); if (opts.ngpu == 1) { magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb, opts.queue ); } else { magma_ctrsm_m( abs_ngpu, opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb, opts.queue ); #endif /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb, opts.queue ); cublas_time = magma_sync_wtime( opts.queue ); #if defined(HAVE_CUBLAS) // opts.handle also uses opts.queue cublasCtrsm( opts.handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); #elif defined(HAVE_clBLAS) clblasCtrsm( clblasColumnMajor, clblas_side_const(opts.side), clblas_uplo_const(opts.uplo), clblas_trans_const(opts.transA), clblas_diag_const(opts.diag), M, N, alpha, d_A, 0, ldda, d_B, 0, lddb, 1, &opts.queue, 0, NULL, NULL ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb, opts.queue ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_Blapack, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - 1/alpha*A*x|| / (||A||*||x||) magmaFloatComplex inv_alpha = MAGMA_C_DIV( c_one, alpha ); float normR, normX, normA; normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); #if defined(HAVE_CUBLAS) // check magma memcpy( h_X, h_Bmagma, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); magma_error = normR/(normX*normA); #endif // check cublas memcpy( h_X, h_Bcublas, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); cublas_error = normR/(normX*normA); if ( opts.lapack ) { // check lapack // this verifies that the matrix wasn't so bad that it couldn't be solved accurately. memcpy( h_X, h_Blapack, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Blapack, &ldb, work ); lapack_error = normR/(normX*normA); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, lapack_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e --- %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_Blapack ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double *h_A, *h_R; magmaDouble_ptr d_lA[MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, info; double mz_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; magma_int_t num_gpus0 = 1, num_gpus, flag = 0; int nb, mb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0) { size[0] = size[9] = N; flag = 1; }else exit(1); } if(strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i])==0){ if(strcmp("L", argv[++i])==0){ uplo = MagmaLower; }else{ uplo = MagmaUpper; } } } } else { printf("\nUsage: \n"); printf(" testing_dpotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0); } /* looking for max. ldda */ ldda = 0; n2 = 0; for(i=0;i<10;i++){ N = size[i]; nb = magma_get_dpotrf_nb(N); mb = nb; if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; }else{ num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb); if(n_local > ldda) ldda = n_local; if(n2 < N*N) n2 = N*N; if(flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; //magma_queue_t queues[MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } printf("each buffer size: %d\n", ldda); /* allocate local matrix on Buffers */ for(i=0; i<num_gpus0; i++){ TESTING_MALLOC_DEV( d_lA[i], double, ldda ); } printf("\n\n"); printf("Using GPUs: %d\n", num_gpus0); if(uplo == MagmaUpper){ printf("\n testing_dpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0); }else{ printf("\n testing_dpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0); } printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { h_A(i,i) = MAGMA_D_MAKE( MAGMA_D_REAL(h_A(i,i)) + N, 0 ); for( int j = 0; j < i; ++j ) { h_A(i, j) = MAGMA_D_CNJG( h_A(j,i) ); } } lapackf77_dlacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* Warm up to measure the performance */ nb = magma_get_dpotrf_nb(N); if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); }else{ num_gpus = num_gpus0; } /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dsetmatrix( N, nk, &h_A[j*lda], lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dsetmatrix( nk, N, &h_A[j], lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dsetmatrix( N, nk, &h_A[j*lda], lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dsetmatrix( nk, N, &h_A[j], lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } gpu_time = magma_wtime(); magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_dpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* gather matrix from gpus */ if(uplo==MagmaUpper){ // Upper for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dgetmatrix( N, nk, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, &h_R[j*lda], lda, queues[2*k]); } }else{ // Lower for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dgetmatrix( nk, N, d_lA[k], (j/(nb*num_gpus)*nb), ldda, &h_R[j], lda, queues[2*k] ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if(uplo == MagmaLower){ lapackf77_dpotrf( MagmaLowerStr, &N, h_A, &lda, &info ); }else{ lapackf77_dpotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_dpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_dlange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (flag != 0) break; } /* clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); for(i=0;i<num_gpus;i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); }
int main(int argc, char* argv[]) { int N = -1 ; // number of rows 2^12 int M = -1 ; // number of columns 2^10 int S = -1 ; // total size 2^22 int nrepeat = 100 ; // number of repeats of the test // Read command line arguments for(int i=0; i<argc; i++) { if( (strcmp(argv[i], "-N") == 0) || (strcmp(argv[i], "-Rows") == 0) ) { N = pow( 2, atoi(argv[++i]) ); printf(" User N is %d\n",N); } else if( (strcmp(argv[i], "-M") == 0) || (strcmp(argv[i], "-Columns") == 0)) { M = pow( 2, atof(argv[++i]) ); printf(" User M is %d\n",M); } else if( (strcmp(argv[i], "-S") == 0) || (strcmp(argv[i], "-Size") == 0)) { S = pow( 2, atof(argv[++i]) ); printf(" User S is %d\n",S); } else if( strcmp(argv[i], "-nrepeat") == 0) { nrepeat = atoi(argv[++i]); } else if( (strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "-help") == 0) ) { printf(" y^T*A*x Options:\n"); printf(" -Rows (-N) <int>: exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n"); printf(" -Columns (-M) <int>: exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n"); printf(" -Size (-S) <int>: exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n"); printf(" -nrepeat <int>: number of repetitions (default: 100)\n"); printf(" -help (-h): print this message\n\n"); exit(1); } } //Check Sizes checkSizes( N, M, S, nrepeat ); Kokkos::initialize(argc,argv); // typedef Kokkos::DefaultExecutionSpace::array_layout Layout; // typedef Kokkos::LayoutLeft Layout ; typedef Kokkos::LayoutRight Layout ; // Allocate y, x vectors and Matrix A: // Device typedef Kokkos::View<double*, Layout> ViewVectorType; typedef Kokkos::View<double**, Layout> ViewMatrixType; ViewVectorType y("y", N); ViewVectorType x("x", M); ViewMatrixType A("A", N, M); //Host mirror ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view(y); ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view(x); ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view(A); // Initialize y vector on host for (int i = 0; i < N; ++i) { h_y( i ) = 1; } // Initialize x vector on host for (int i = 0; i < M; ++i) { h_x( i ) = 1; } // Initialize A matrix, note 2D indexing computation on host for (int j = 0; j < N; ++j) { for ( int i = 0 ; i < M ; ++i ) { h_A( j , i ) = 1; } } //Deep copy host view to device views Kokkos::deep_copy(y, h_y); Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(A, h_A); typedef Kokkos::TeamPolicy<> team_policy ; typedef Kokkos::TeamPolicy<>::member_type member_type ; // Timer products struct timeval begin,end; gettimeofday(&begin,NULL); for ( int repeat = 0; repeat < nrepeat; repeat++) { //Application: <y,Ax> = y^T*A*x double result = 0; Kokkos::parallel_reduce( team_policy( N , Kokkos::AUTO ), KOKKOS_LAMBDA ( const member_type& teamMember, double &update ) { const int j = teamMember.league_rank(); double temp2 = 0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange( teamMember, M ), [&] (const int i, double &innerUpdate ) { innerUpdate += A( j , i ) * x( i ); }, temp2); if ( teamMember.team_rank() == 0 ) update += y( j ) * temp2; }, result ); //Output result if ( repeat == (nrepeat - 1) ) printf(" Computed result for %d x %d is %lf\n", N, M, result); const double solution = (double)N *(double)M; if ( result != solution ) { printf(" Error: result( %lf ) != solution( %lf )\n",result,solution); } }