/**
 * Compares the sparse matrix-matrix product of viennacl::compressed_matrix with a
 * product computed on STL containers.
 *
 * Two random sparse matrices A (N x K) and B (K x M) are assembled as
 * std::vector<std::map<unsigned int, NumericT> >, copied to ViennaCL and multiplied
 * via viennacl::linalg::prod(). The result is obtained in three ways (assignment to
 * an existing matrix, copy-initialization, direct construction) and each result is
 * checked against stl_C.
 *
 * NOTE(review): NumericT, Epsilon, prod() and diff() are not declared in this block.
 * prod(stl_A, stl_B, stl_C) presumably writes the reference product into stl_C and
 * diff() presumably returns an error measure between its two arguments - confirm
 * against their definitions.
 *
 * @param epsilon  Largest value of std::fabs(diff(...)) that still counts as a pass.
 * @return EXIT_SUCCESS if all three results are within epsilon, EXIT_FAILURE otherwise.
 */
int test(Epsilon const& epsilon)
{
  int retval = EXIT_SUCCESS;

  viennacl::tools::uniform_random_numbers<NumericT> randomNumber;

  // A is N x K, B is K x M.
  std::size_t N = 210;
  std::size_t K = 300;
  std::size_t M = 420;
  std::size_t nnz_row = 40;  // number of insertion attempts per row

  // --------------------------------------------------------------------------
  std::vector<std::map<unsigned int, NumericT> > stl_A(N);
  std::vector<std::map<unsigned int, NumericT> > stl_B(K);
  std::vector<std::map<unsigned int, NumericT> > stl_C(N);

  // Random column indices may repeat; a repeated index overwrites the existing map
  // entry, so a row ends up with at most nnz_row entries.
  // NumericT() is a value-initialized NumericT (zero for arithmetic types), so the
  // stored value is NumericT(1.0) in that case.
  for (std::size_t i = 0; i < stl_A.size(); ++i)
  {
    for (std::size_t j = 0; j < nnz_row; ++j)
      stl_A[i][static_cast<unsigned int>(randomNumber() * NumericT(K))] = NumericT(1.0) + NumericT();
  }

  for (std::size_t i = 0; i < stl_B.size(); ++i)
  {
    for (std::size_t j = 0; j < nnz_row; ++j)
      stl_B[i][static_cast<unsigned int>(randomNumber() * NumericT(M))] = NumericT(1.0) + NumericT();
  }

  viennacl::compressed_matrix<NumericT> vcl_A(N, K);
  viennacl::compressed_matrix<NumericT> vcl_B(K, M);
  viennacl::compressed_matrix<NumericT> vcl_C;

  viennacl::tools::sparse_matrix_adapter<NumericT> adapted_stl_A(stl_A, N, K);
  viennacl::tools::sparse_matrix_adapter<NumericT> adapted_stl_B(stl_B, K, M);
  viennacl::copy(adapted_stl_A, vcl_A);
  viennacl::copy(adapted_stl_B, vcl_B);

  // --------------------------------------------------------------------------
  std::cout << "Testing products: STL" << std::endl;
  prod(stl_A, stl_B, stl_C);

  std::cout << "Testing products: compressed_matrix" << std::endl;

  // Variant 1: assignment to an already constructed matrix.
  vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);

  if ( std::fabs(diff(stl_C, vcl_C)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_C)" << std::endl;
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_C)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // Variant 2: copy-initialization from the product expression.
  viennacl::compressed_matrix<NumericT> vcl_D = viennacl::linalg::prod(vcl_A, vcl_B);

  if ( std::fabs(diff(stl_C, vcl_D)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_D)" << std::endl;
    // Report the diff of the matrix that actually failed (vcl_D, not vcl_C).
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_D)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // Variant 3: direct construction from the product expression.
  viennacl::compressed_matrix<NumericT> vcl_E(viennacl::linalg::prod(vcl_A, vcl_B));

  if ( std::fabs(diff(stl_C, vcl_E)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_E)" << std::endl;
    // Report the diff of the matrix that actually failed (vcl_E, not vcl_C).
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_E)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // --------------------------------------------------------------------------
  return retval;
}
int run_benchmark() { Timer timer; double exec_time; // // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory) // std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE); std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE); std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE); // // Fill the matrix // for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i) for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j) stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>(); for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i) for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j) stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>(); // // Set up some ViennaCL objects // #ifdef VIENNACL_WITH_OPENCL viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag()); #endif //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math"); //uncomment for additional optimizations //viennacl::ocl::current_context().build_options("-cl-opt-disable"); //uncomment to get poor performance viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE); viennacl::matrix<ScalarType> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE); viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE); ///////////////////////////////////////////////// //////////// Matrix-matrix products ///////////// ///////////////////////////////////////////////// // // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product // std::cout << " ------ Benchmark 1: Matrix-Matrix product ------ " << std::endl; #ifdef VIENNACL_WITH_OPENCL std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices(); #else std::vector<long> devices(1); #endif for (std::size_t i=0; i<devices.size(); ++i) { #ifdef VIENNACL_WITH_OPENCL viennacl::ocl::current_context().switch_device(devices[i]); std::cout << " - Device 
Name: " << viennacl::ocl::current_device().name() << std::endl; #endif viennacl::fast_copy(&(stl_A[0]), &(stl_A[0]) + stl_A.size(), vcl_A); viennacl::fast_copy(&(stl_B[0]), &(stl_B[0]) + stl_B.size(), vcl_B); vcl_C = viennacl::linalg::prod(vcl_A, vcl_B); viennacl::backend::finish(); timer.start(); vcl_C = viennacl::linalg::prod(vcl_A, vcl_B); viennacl::backend::finish(); exec_time = timer.get(); std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl; std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl; std::cout << std::endl; } std::cout << " ------ Benchmark 2: Matrix-Matrix product using ranges ------ " << std::endl; viennacl::range r(BLAS3_MATRIX_SIZE/4, 3 * BLAS3_MATRIX_SIZE/4); for (std::size_t i=0; i<devices.size(); ++i) { #ifdef VIENNACL_WITH_OPENCL viennacl::ocl::current_context().switch_device(devices[i]); std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl; #endif viennacl::fast_copy(&(stl_A[0]), &(stl_A[0]) + stl_A.size(), vcl_A); viennacl::fast_copy(&(stl_B[0]), &(stl_B[0]) + stl_B.size(), vcl_B); viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r)); viennacl::backend::finish(); timer.start(); viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r)); viennacl::backend::finish(); exec_time = timer.get(); std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl; std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl; std::cout << std::endl; } std::cout << " ------ Benchmark 3: Matrix-Matrix product using slices ------ " << std::endl; viennacl::slice 
s(0, 2, BLAS3_MATRIX_SIZE/2); for (std::size_t i=0; i<devices.size(); ++i) { #ifdef VIENNACL_WITH_OPENCL viennacl::ocl::current_context().switch_device(devices[i]); std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl; #endif viennacl::fast_copy(&(stl_A[0]), &(stl_A[0]) + stl_A.size(), vcl_A); viennacl::fast_copy(&(stl_B[0]), &(stl_B[0]) + stl_B.size(), vcl_B); viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s)); viennacl::backend::finish(); timer.start(); viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s)); viennacl::backend::finish(); exec_time = timer.get(); std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl; std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl; std::cout << std::endl; } std::cout << " ------ Benchmark 4: LU factorization ------ " << std::endl; for (std::size_t i=0; i<devices.size(); ++i) { #ifdef VIENNACL_WITH_OPENCL viennacl::ocl::current_context().switch_device(devices[i]); std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl; #endif viennacl::fast_copy(&(stl_A[0]), &(stl_A[0]) + stl_A.size(), vcl_A); viennacl::linalg::lu_factorize(vcl_A); viennacl::backend::finish(); timer.start(); viennacl::linalg::lu_factorize(vcl_A); viennacl::backend::finish(); exec_time = timer.get(); std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl; std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_A.size2() / 1000.0) / exec_time << std::endl; std::cout << std::endl; } return EXIT_SUCCESS; }