コード例 #1
0
double run_benchmark(size_t matrix_size)
{

    //
    // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //
    std::vector<ScalarType> stl_B(matrix_size * matrix_size);
    std::vector<ScalarType> stl_C(matrix_size * matrix_size);

    //
    // Fill the matrix
    //
    for (unsigned int i = 0; i < matrix_size; ++i)
        for (unsigned int j = 0; j < matrix_size; ++j)
            stl_B[i*matrix_size + j] = random<ScalarType>();

    for (unsigned int i = 0; i < matrix_size; ++i)
        for (unsigned int j = 0; j < matrix_size; ++j)
            stl_C[i + j*matrix_size] = random<ScalarType>();



    //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
    //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
    viennacl::matrix<ScalarType> vcl_A(matrix_size, matrix_size);
    viennacl::matrix<ScalarType,FB> vcl_B(matrix_size, matrix_size);
    viennacl::matrix<ScalarType,FC> vcl_C(matrix_size, matrix_size);

    typedef viennacl::generator::matrix< viennacl::matrix<ScalarType> > dma_t;
    typedef viennacl::generator::matrix< viennacl::matrix<ScalarType,FB> > dmb_t;
    typedef viennacl::generator::matrix< viennacl::matrix<ScalarType,FC> > dmc_t;

    viennacl::fast_copy(&(stl_B[0]),
                        &(stl_B[0]) + stl_B.size(),
                        vcl_B);
    viennacl::fast_copy(&(stl_C[0]),
                        &(stl_C[0]) + stl_C.size(),
                        vcl_C);

    viennacl::generator::custom_operation op;
    op.add(dma_t(vcl_A) = viennacl::generator::prod(dmb_t(vcl_B), dmc_t(vcl_C)));
    op.program();
    op.execute();
    viennacl::backend::finish();

    double res = 0;
    Timer timer;
    timer.start();
    for(unsigned int r = 0 ; r < N_RUNS ; ++r){
        op.execute();
    }
    viennacl::backend::finish();
    res = timer.get();

    return res/N_RUNS;
}
コード例 #2
0
ファイル: sparse_prod.cpp プロジェクト: Rombur/viennacl-dev
/**
 * @brief Checks viennacl::linalg::prod() for two compressed_matrix operands against a host-side result.
 *
 * Two random sparse matrices A (N x K) and B (K x M) are assembled as vectors of maps, copied to
 * viennacl::compressed_matrix objects, and multiplied in three ways: assignment to an existing
 * matrix (vcl_C), copy-initialization (vcl_D), and direct construction (vcl_E). Each result is
 * compared with stl_C, which is filled by the prod(stl_A, stl_B, stl_C) helper defined elsewhere
 * in the file (presumably a plain host reference implementation).
 *
 * @param epsilon  Largest absolute value of diff(...) that is still accepted.
 * @return EXIT_SUCCESS if all three results are within epsilon, EXIT_FAILURE otherwise.
 */
int test(Epsilon const& epsilon)
{
  int retval = EXIT_SUCCESS;

  viennacl::tools::uniform_random_numbers<NumericT> randomNumber;

  std::size_t N = 210;       // rows of A and C
  std::size_t K = 300;       // columns of A, rows of B
  std::size_t M = 420;       // columns of B and C
  std::size_t nnz_row = 40;  // insertion attempts per row (colliding column indices overwrite each other)
  // --------------------------------------------------------------------------
  std::vector<std::map<unsigned int, NumericT> > stl_A(N);
  std::vector<std::map<unsigned int, NumericT> > stl_B(K);
  std::vector<std::map<unsigned int, NumericT> > stl_C(N);

  // Column index is randomNumber() scaled by the column count (assumes randomNumber() yields values in [0,1) -- TODO confirm).
  for (std::size_t i=0; i<stl_A.size(); ++i)
    for (std::size_t j=0; j<nnz_row; ++j)
      stl_A[i][static_cast<unsigned int>(randomNumber() * NumericT(K))] = NumericT(1.0) + NumericT();

  for (std::size_t i=0; i<stl_B.size(); ++i)
    for (std::size_t j=0; j<nnz_row; ++j)
      stl_B[i][static_cast<unsigned int>(randomNumber() * NumericT(M))] = NumericT(1.0) + NumericT();


  viennacl::compressed_matrix<NumericT>  vcl_A(N, K);
  viennacl::compressed_matrix<NumericT>  vcl_B(K, M);
  viennacl::compressed_matrix<NumericT>  vcl_C;

  // Transfer the host matrices through the sparse_matrix_adapter.
  viennacl::tools::sparse_matrix_adapter<NumericT> adapted_stl_A(stl_A, N, K);
  viennacl::tools::sparse_matrix_adapter<NumericT> adapted_stl_B(stl_B, K, M);
  viennacl::copy(adapted_stl_A, vcl_A);
  viennacl::copy(adapted_stl_B, vcl_B);

  // --------------------------------------------------------------------------
  std::cout << "Testing products: STL" << std::endl;
  prod(stl_A, stl_B, stl_C);

  // Variant 1: assignment to an already constructed (empty) matrix.
  std::cout << "Testing products: compressed_matrix" << std::endl;
  vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);

  if ( std::fabs(diff(stl_C, vcl_C)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_C)" << std::endl;
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_C)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // Variant 2: copy-initialization from the product expression.
  viennacl::compressed_matrix<NumericT> vcl_D = viennacl::linalg::prod(vcl_A, vcl_B);
  if ( std::fabs(diff(stl_C, vcl_D)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_D)" << std::endl;
    // Report the deviation of the matrix that failed (vcl_D), not of vcl_C.
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_D)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // Variant 3: direct construction from the product expression.
  viennacl::compressed_matrix<NumericT> vcl_E(viennacl::linalg::prod(vcl_A, vcl_B));
  if ( std::fabs(diff(stl_C, vcl_E)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_E)" << std::endl;
    // Report the deviation of the matrix that failed (vcl_E), not of vcl_C.
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_E)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // --------------------------------------------------------------------------
  return retval;
}
コード例 #3
0
ファイル: blas3.cpp プロジェクト: nlukash/viennacl-dev
/**
 * @brief Runs four dense BLAS level 3 style benchmarks and prints timings and GFLOP rates to std::cout.
 *
 * Benchmarks: (1) full matrix-matrix product, (2) product of sub-matrices selected by a range,
 * (3) product of sub-matrices selected by a stride-2 slice, (4) LU factorization.
 * With VIENNACL_WITH_OPENCL defined, each benchmark is repeated for every device of the current
 * OpenCL context; otherwise each benchmark runs exactly once.
 *
 * Every measurement is preceded by an untimed run of the same operation plus
 * viennacl::backend::finish(), and followed by finish() before the timer is read.
 * The GFLOP formulas divide by Timer::get() directly, i.e. they assume it reports seconds.
 *
 * @return Always EXIT_SUCCESS.
 */
int run_benchmark()
{
  Timer timer;
  double exec_time;

  //
  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
  //
  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);

  //
  // Fill the matrix
  //
  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
      stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>();

  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
      stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>();

  //
  // Set up some ViennaCL objects
  //
#ifdef VIENNACL_WITH_OPENCL
  viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
#endif

  //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
  //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
  viennacl::matrix<ScalarType> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);


  /////////////////////////////////////////////////
  //////////// Matrix-matrix products /////////////
  /////////////////////////////////////////////////

  //
  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
  //

  std::cout << " ------ Benchmark 1: Matrix-Matrix product ------ " << std::endl;


#ifdef VIENNACL_WITH_OPENCL
  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
#else
  // Without OpenCL there is nothing to switch between: a one-element dummy makes each loop run once.
  std::vector<long> devices(1);
#endif
  for (std::size_t i=0; i<devices.size(); ++i)
  {
#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[i]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    viennacl::fast_copy(&(stl_B[0]),
                        &(stl_B[0]) + stl_B.size(),
                        vcl_B);
    // Untimed first run, then the measured run.
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    viennacl::backend::finish();
    timer.start();
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    viennacl::backend::finish();
    exec_time = timer.get();
    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
    // 2 * size1 * size2 * size2 floating point operations, scaled to 1e9 via three divisions by 1000.
    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
    std::cout << std::endl;
  }

  std::cout << " ------ Benchmark 2: Matrix-Matrix product using ranges ------ " << std::endl;

  // Index range [size/4, 3*size/4): selects sub-matrices of half the full extent.
  viennacl::range r(BLAS3_MATRIX_SIZE/4, 3 * BLAS3_MATRIX_SIZE/4);
  for (std::size_t i=0; i<devices.size(); ++i)
  {
#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[i]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    viennacl::fast_copy(&(stl_B[0]),
                        &(stl_B[0]) + stl_B.size(),
                        vcl_B);
    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
    viennacl::backend::finish();
    timer.start();
    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
    viennacl::backend::finish();
    exec_time = timer.get();
    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
    // Dividing by 2000 instead of 1000 accounts for the halved extent of the projected matrices.
    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
    std::cout << std::endl;
  }

  std::cout << " ------ Benchmark 3: Matrix-Matrix product using slices ------ " << std::endl;

  // Slice starting at 0 with stride 2 and size/2 entries: every second row/column.
  viennacl::slice s(0, 2, BLAS3_MATRIX_SIZE/2);
  for (std::size_t i=0; i<devices.size(); ++i)
  {
#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[i]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    viennacl::fast_copy(&(stl_B[0]),
                        &(stl_B[0]) + stl_B.size(),
                        vcl_B);
    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
    viennacl::backend::finish();
    timer.start();
    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
    viennacl::backend::finish();
    exec_time = timer.get();
    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
    // Same halved extent as in the range benchmark.
    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
    std::cout << std::endl;
  }


  std::cout << " ------ Benchmark 4: LU factorization ------ " << std::endl;

  for (std::size_t i=0; i<devices.size(); ++i)
  {
#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[i]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    // Untimed first run.
    viennacl::linalg::lu_factorize(vcl_A);
    viennacl::backend::finish();

    // lu_factorize() overwrites its argument, so upload the original matrix again;
    // otherwise the measured run would factorize the factors of the previous run.
    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    viennacl::backend::finish();
    timer.start();
    viennacl::linalg::lu_factorize(vcl_A);
    viennacl::backend::finish();
    exec_time = timer.get();
    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
    // LU factorization of an n x n matrix needs about (2/3) * n^3 floating point operations
    // (n^3 / 3 multiply-add pairs), not the 2 * n^3 of a matrix-matrix product.
    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << (2.0 / 3.0) * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_A.size2() / 1000.0) / exec_time << std::endl;
    std::cout << std::endl;
  }

  return EXIT_SUCCESS;
}