void run_benchmark()
{
  std::size_t matrix_size = 1500;  //some odd number, not too large
  std::size_t rhs_num = 153;

  viennacl::matrix<NumericT, F_A> vcl_A(matrix_size, matrix_size);
  viennacl::matrix<NumericT, F_B> vcl_B(matrix_size, rhs_num);
  viennacl::matrix<NumericT, F_B> result(matrix_size, rhs_num);

  viennacl::vector<NumericT> vcl_vec_B(matrix_size);
  viennacl::vector<NumericT> vcl_vec_result(matrix_size);

  fill_matrix(vcl_A);
  fill_matrix(vcl_B);

  fill_vector(vcl_vec_B);
  std::cout << "------- Solve Matrix-Matrix: ----------\n" << std::endl;
  run_solver_matrix<NumericT>(vcl_A,vcl_B,result,viennacl::linalg::lower_tag());
  run_solver_matrix<NumericT>(vcl_A,vcl_B,result,viennacl::linalg::unit_lower_tag());
  run_solver_matrix<NumericT>(vcl_A,vcl_B,result,viennacl::linalg::upper_tag());
  run_solver_matrix<NumericT>(vcl_A,vcl_B,result,viennacl::linalg::unit_upper_tag());
  std::cout << "------- End Matrix-Matrix: ----------\n" << std::endl;

  std::cout << "------- Solve Matrix-Vector: ----------\n" << std::endl;
  run_solver_vector<NumericT>(vcl_A,vcl_vec_B,vcl_vec_result,viennacl::linalg::lower_tag());
  run_solver_vector<NumericT>(vcl_A,vcl_vec_B,vcl_vec_result,viennacl::linalg::unit_lower_tag());
  run_solver_vector<NumericT>(vcl_A,vcl_vec_B,vcl_vec_result,viennacl::linalg::upper_tag());
  run_solver_vector<NumericT>(vcl_A,vcl_vec_B,vcl_vec_result,viennacl::linalg::unit_upper_tag());
  std::cout << "------- End Matrix-Vector: ----------\n" << std::endl;
}
Example #2
0
double run_benchmark(size_t matrix_size)
{

    //
    // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //
    std::vector<ScalarType> stl_B(matrix_size * matrix_size);
    std::vector<ScalarType> stl_C(matrix_size * matrix_size);

    //
    // Fill the matrix
    //
    for (unsigned int i = 0; i < matrix_size; ++i)
        for (unsigned int j = 0; j < matrix_size; ++j)
            stl_B[i*matrix_size + j] = random<ScalarType>();

    for (unsigned int i = 0; i < matrix_size; ++i)
        for (unsigned int j = 0; j < matrix_size; ++j)
            stl_C[i + j*matrix_size] = random<ScalarType>();



    //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
    //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
    viennacl::matrix<ScalarType> vcl_A(matrix_size, matrix_size);
    viennacl::matrix<ScalarType,FB> vcl_B(matrix_size, matrix_size);
    viennacl::matrix<ScalarType,FC> vcl_C(matrix_size, matrix_size);

    typedef viennacl::generator::matrix< viennacl::matrix<ScalarType> > dma_t;
    typedef viennacl::generator::matrix< viennacl::matrix<ScalarType,FB> > dmb_t;
    typedef viennacl::generator::matrix< viennacl::matrix<ScalarType,FC> > dmc_t;

    viennacl::fast_copy(&(stl_B[0]),
                        &(stl_B[0]) + stl_B.size(),
                        vcl_B);
    viennacl::fast_copy(&(stl_C[0]),
                        &(stl_C[0]) + stl_C.size(),
                        vcl_C);

    viennacl::generator::custom_operation op;
    op.add(dma_t(vcl_A) = viennacl::generator::prod(dmb_t(vcl_B), dmc_t(vcl_C)));
    op.program();
    op.execute();
    viennacl::backend::finish();

    double res = 0;
    Timer timer;
    timer.start();
    for(unsigned int r = 0 ; r < N_RUNS ; ++r){
        op.execute();
    }
    viennacl::backend::finish();
    res = timer.get();

    return res/N_RUNS;
}
Example #3
0
int main (int, const char **)
{
  typedef float                                           ScalarType;    //feel free to change this to 'double' if supported by your hardware
  typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
  
  typedef viennacl::matrix<ScalarType, viennacl::row_major>    VCLMatrixType;
  
  std::size_t dim_large = 5;
  std::size_t dim_small = 3;
  
  //
  // Setup ublas objects and fill with data:
  //
  MatrixType ublas_A(dim_large, dim_large);
  MatrixType ublas_B(dim_small, dim_small);
  MatrixType ublas_C(dim_large, dim_small);
  MatrixType ublas_D(dim_small, dim_large);
  
  
  for (std::size_t i=0; i<ublas_A.size1(); ++i)
    for (std::size_t j=0; j<ublas_A.size2(); ++j)
      ublas_A(i,j) = static_cast<ScalarType>((i+1) + (j+1)*(i+1));

  for (std::size_t i=0; i<ublas_B.size1(); ++i)
    for (std::size_t j=0; j<ublas_B.size2(); ++j)
      ublas_B(i,j) = static_cast<ScalarType>((i+1) + (j+1)*(i+1));

  for (std::size_t i=0; i<ublas_C.size1(); ++i)
    for (std::size_t j=0; j<ublas_C.size2(); ++j)
      ublas_C(i,j) = static_cast<ScalarType>((j+2) + (j+1)*(i+1));

  for (std::size_t i=0; i<ublas_D.size1(); ++i)
    for (std::size_t j=0; j<ublas_D.size2(); ++j)
      ublas_D(i,j) = static_cast<ScalarType>((j+2) + (j+1)*(i+1));
  
  //
  // Extract submatrices using the ranges in ublas
  //
  boost::numeric::ublas::range ublas_r1(0, dim_small);                      //the first 'dim_small' entries
  boost::numeric::ublas::range ublas_r2(dim_large - dim_small, dim_large);  //the last 'dim_small' entries
  boost::numeric::ublas::matrix_range<MatrixType> ublas_A_sub1(ublas_A, ublas_r1, ublas_r1); //upper left part of A
  boost::numeric::ublas::matrix_range<MatrixType> ublas_A_sub2(ublas_A, ublas_r2, ublas_r2); //lower right part of A

  boost::numeric::ublas::matrix_range<MatrixType> ublas_C_sub(ublas_C, ublas_r1, ublas_r1); //upper left part of C
  boost::numeric::ublas::matrix_range<MatrixType> ublas_D_sub(ublas_D, ublas_r1, ublas_r1); //upper left part of D

  //
  // Setup ViennaCL objects
  //
  VCLMatrixType vcl_A(dim_large, dim_large);
  VCLMatrixType vcl_B(dim_small, dim_small);
  VCLMatrixType vcl_C(dim_large, dim_small);
  VCLMatrixType vcl_D(dim_small, dim_large);
  
  viennacl::copy(ublas_A, vcl_A);
  viennacl::copy(ublas_B, vcl_B);
  viennacl::copy(ublas_C, vcl_C);
  viennacl::copy(ublas_D, vcl_D);
  
  //
  // Extract submatrices using the ranges in ViennaCL
  //
  viennacl::range vcl_r1(0, dim_small);   //the first 'dim_small' entries
  viennacl::range vcl_r2(dim_large - dim_small, dim_large); //the last 'dim_small' entries
  viennacl::matrix_range<VCLMatrixType>   vcl_A_sub1(vcl_A, vcl_r1, vcl_r1); //upper left part of A
  viennacl::matrix_range<VCLMatrixType>   vcl_A_sub2(vcl_A, vcl_r2, vcl_r2); //lower right part of A
  
  viennacl::matrix_range<VCLMatrixType>   vcl_C_sub(vcl_C, vcl_r1, vcl_r1); //upper left part of C
  viennacl::matrix_range<VCLMatrixType>   vcl_D_sub(vcl_D, vcl_r1, vcl_r1); //upper left part of D

  //
  // Copy from ublas to submatrices and back:
  //
  
  ublas_A_sub1 = ublas_B;
  viennacl::copy(ublas_B, vcl_A_sub1);
  viennacl::copy(vcl_A_sub1, ublas_B);
  
  //
  // Addition:
  //
  
  // range to range:
  ublas_A_sub2 += ublas_A_sub2;
  vcl_A_sub2 += vcl_A_sub2;

  // range to matrix:
  ublas_B += ublas_A_sub2;
  vcl_B += vcl_A_sub2;

  
  //
  // use matrix range with matrix-matrix product:
  //
  ublas_A_sub1 += prod(ublas_C_sub, ublas_D_sub);
  vcl_A_sub1 += viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);

  //
  // Print result matrices:
  //
  std::cout << "Result ublas:    " << ublas_A << std::endl;
  std::cout << "Result ViennaCL: " << vcl_A << std::endl;
  
  //
  //  That's it.
  //
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;

  return EXIT_SUCCESS;
}
Example #4
0
int test(Epsilon const& epsilon)
{
  // Compares viennacl::linalg::prod() for two compressed_matrix operands against the STL-based
  // reference computed by prod(stl_A, stl_B, stl_C). The ViennaCL product is obtained in three ways
  // (assignment, copy-initialization, direct construction); each result is checked separately.
  //
  // epsilon: largest accepted value of |diff(reference, result)|
  // returns: EXIT_SUCCESS if all three checks pass, EXIT_FAILURE otherwise
  int retval = EXIT_SUCCESS;

  viennacl::tools::uniform_random_numbers<NumericT> randomNumber;

  std::size_t N = 210;        // rows of A and C
  std::size_t K = 300;        // columns of A, rows of B
  std::size_t M = 420;        // columns of B
  std::size_t nnz_row = 40;   // insertion attempts per row (duplicated column indices overwrite each other)
  // --------------------------------------------------------------------------
  std::vector<std::map<unsigned int, NumericT> > stl_A(N);
  std::vector<std::map<unsigned int, NumericT> > stl_B(K);
  std::vector<std::map<unsigned int, NumericT> > stl_C(N);

  // NOTE(review): if randomNumber() can return exactly 1, the column index computed below equals K
  // (resp. M), which is one past the last valid column -- confirm the generator's range.
  for (std::size_t i=0; i<stl_A.size(); ++i)
    for (std::size_t j=0; j<nnz_row; ++j)
      stl_A[i][static_cast<unsigned int>(randomNumber() * NumericT(K))] = NumericT(1.0) + NumericT();

  for (std::size_t i=0; i<stl_B.size(); ++i)
    for (std::size_t j=0; j<nnz_row; ++j)
      stl_B[i][static_cast<unsigned int>(randomNumber() * NumericT(M))] = NumericT(1.0) + NumericT();


  viennacl::compressed_matrix<NumericT>  vcl_A(N, K);
  viennacl::compressed_matrix<NumericT>  vcl_B(K, M);
  viennacl::compressed_matrix<NumericT>  vcl_C;

  // Transfer the STL data into the ViennaCL matrices
  viennacl::tools::sparse_matrix_adapter<NumericT> adapted_stl_A(stl_A, N, K);
  viennacl::tools::sparse_matrix_adapter<NumericT> adapted_stl_B(stl_B, K, M);
  viennacl::copy(adapted_stl_A, vcl_A);
  viennacl::copy(adapted_stl_B, vcl_B);

  // --------------------------------------------------------------------------
  std::cout << "Testing products: STL" << std::endl;
  prod(stl_A, stl_B, stl_C);

  // Variant 1: assignment to an existing (default-constructed) matrix
  std::cout << "Testing products: compressed_matrix" << std::endl;
  vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);

  if ( std::fabs(diff(stl_C, vcl_C)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_C)" << std::endl;
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_C)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // Variant 2: copy-initialization from the product expression
  viennacl::compressed_matrix<NumericT> vcl_D = viennacl::linalg::prod(vcl_A, vcl_B);
  if ( std::fabs(diff(stl_C, vcl_D)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_D)" << std::endl;
    // report the deviation of the matrix that actually failed (was: vcl_C)
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_D)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // Variant 3: direct construction from the product expression
  viennacl::compressed_matrix<NumericT> vcl_E(viennacl::linalg::prod(vcl_A, vcl_B));
  if ( std::fabs(diff(stl_C, vcl_E)) > epsilon )
  {
    std::cout << "# Error at operation: matrix-matrix product with compressed_matrix (vcl_E)" << std::endl;
    // report the deviation of the matrix that actually failed (was: vcl_C)
    std::cout << "  diff: " << std::fabs(diff(stl_C, vcl_E)) << std::endl;
    retval = EXIT_FAILURE;
  }

  // --------------------------------------------------------------------------
  return retval;
}
Example #5
0
int run_benchmark()
{
  // Runs four dense benchmarks (matrix-matrix product on full matrices, on ranges, on slices,
  // and an LU factorization) of dimension BLAS3_MATRIX_SIZE. With OpenCL enabled, every benchmark
  // is repeated for each device of the current context. Each operation is executed once untimed
  // before the timed execution. Always returns EXIT_SUCCESS.
  Timer timer;
  double exec_time;

  //
  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
  //
  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);

  //
  // Fill the matrices (A via index i*N + j, B via the transposed index i + j*N)
  //
  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
      stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>();

  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
      stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>();

  //
  // Set up some ViennaCL objects
  //
#ifdef VIENNACL_WITH_OPENCL
  viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
#endif

  //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
  //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
  viennacl::matrix<ScalarType> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);


  /////////////////////////////////////////////////
  //////////// Matrix-matrix products /////////////
  /////////////////////////////////////////////////

  //
  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
  //

  std::cout << " ------ Benchmark 1: Matrix-Matrix product ------ " << std::endl;


#ifdef VIENNACL_WITH_OPENCL
  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
#else
  // Without OpenCL there is nothing to switch between: a single dummy entry makes each loop run once.
  std::vector<long> devices(1);
#endif
  for (std::size_t i=0; i<devices.size(); ++i)
  {
#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[i]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    viennacl::fast_copy(&(stl_B[0]),
                        &(stl_B[0]) + stl_B.size(),
                        vcl_B);
    // untimed first execution
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    viennacl::backend::finish();
    // timed execution
    timer.start();
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    viennacl::backend::finish();
    exec_time = timer.get();
    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
    std::cout << std::endl;
  }

  std::cout << " ------ Benchmark 2: Matrix-Matrix product using ranges ------ " << std::endl;

  // Central block [N/4, 3N/4) in each dimension, i.e. half the matrix dimension.
  viennacl::range r(BLAS3_MATRIX_SIZE/4, 3 * BLAS3_MATRIX_SIZE/4);
  for (std::size_t i=0; i<devices.size(); ++i)
  {
#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[i]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    viennacl::fast_copy(&(stl_B[0]),
                        &(stl_B[0]) + stl_B.size(),
                        vcl_B);
    // untimed first execution
    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
    viennacl::backend::finish();
    // timed execution
    timer.start();
    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
    viennacl::backend::finish();
    exec_time = timer.get();
    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
    // operands have half the full dimension, hence the division by 2000 instead of 1000
    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
    std::cout << std::endl;
  }

  std::cout << " ------ Benchmark 3: Matrix-Matrix product using slices ------ " << std::endl;

  // Every second index starting at 0, N/2 entries per dimension.
  viennacl::slice s(0, 2, BLAS3_MATRIX_SIZE/2);
  for (std::size_t i=0; i<devices.size(); ++i)
  {
#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[i]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    viennacl::fast_copy(&(stl_B[0]),
                        &(stl_B[0]) + stl_B.size(),
                        vcl_B);
    // untimed first execution
    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
    viennacl::backend::finish();
    // timed execution
    timer.start();
    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
    viennacl::backend::finish();
    exec_time = timer.get();
    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
    // operands have half the full dimension, hence the division by 2000 instead of 1000
    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
    std::cout << std::endl;
  }


  std::cout << " ------ Benchmark 4: LU factorization ------ " << std::endl;

  for (std::size_t i=0; i<devices.size(); ++i)
  {
#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[i]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    // untimed first execution
    viennacl::linalg::lu_factorize(vcl_A);
    viennacl::backend::finish();

    // lu_factorize() is called only for its effect on vcl_A (presumably an in-place factorization),
    // so the original entries are uploaded again: the timed run should factorize the input matrix,
    // not the output of the previous factorization.
    viennacl::fast_copy(&(stl_A[0]),
                        &(stl_A[0]) + stl_A.size(),
                        vcl_A);
    viennacl::backend::finish();

    // timed execution
    timer.start();
    viennacl::linalg::lu_factorize(vcl_A);
    viennacl::backend::finish();
    exec_time = timer.get();
    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
    // An LU factorization of an n x n matrix costs about (2/3) n^3 floating point operations
    // when multiplications and additions are counted separately (2 n^3 is the matrix-matrix product count).
    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << (2.0 / 3.0) * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_A.size2() / 1000.0) / exec_time << std::endl;
    std::cout << std::endl;
  }

  return EXIT_SUCCESS;
}
int test_prod(Epsilon const& epsilon)
{
  // Driver for the dense matrix-matrix product tests. It builds host reference matrices
  // A (size1 x size2), B (size2 x size3), C (size1 x size3) plus the transposes of A and B,
  // mirrors A, A^T, B, B^T and C in ViennaCL as a plain matrix, a matrix_range and a matrix_slice each,
  // and then calls the test_prod() overload taking explicit operands for all 3 x 3 x 3 combinations
  // of {matrix, range, slice} for A, B and C.
  //
  // epsilon: tolerance forwarded unchanged to the per-combination test
  // returns: the first result different from EXIT_SUCCESS, or EXIT_SUCCESS if every combination passes
  int ret = EXIT_SUCCESS;

  viennacl::tools::uniform_random_numbers<NumericT> randomNumber;

  std::size_t matrix_size1 = 29;  //some odd number, not too large
  std::size_t matrix_size2 = 47;  //some odd number, not too large
  std::size_t matrix_size3 = 33;  //some odd number, not too large
  //std::size_t matrix_size1 = 128;  // alternative: power-of-two sizes
  //std::size_t matrix_size2 = 64;
  //std::size_t matrix_size3 = 128;
  //std::size_t matrix_size1 = 256;  // for testing AMD kernels
  //std::size_t matrix_size2 = 256;  // for testing AMD kernels
  //std::size_t matrix_size3 = 256;  // for testing AMD kernels

  // --------------------------------------------------------------------------

  // Host reference data (nested std::vector, one inner vector per row):
  std::vector<std::vector<NumericT> > A(matrix_size1, std::vector<NumericT>(matrix_size2));
  std::vector<std::vector<NumericT> > B(matrix_size2, std::vector<NumericT>(matrix_size3));
  std::vector<std::vector<NumericT> > C(matrix_size1, std::vector<NumericT>(matrix_size3));

  //fill A and B:
  for (std::size_t i = 0; i < A.size(); ++i)
    for (std::size_t j = 0; j < A[0].size(); ++j)
      A[i][j] = static_cast<NumericT>(0.1) * randomNumber();
  for (std::size_t i = 0; i < B.size(); ++i)
    for (std::size_t j = 0; j < B[0].size(); ++j)
      B[i][j] = static_cast<NumericT>(0.1) * randomNumber();

  // host-side transposes of A and B
  std::vector<std::vector<NumericT> >     A_trans(A[0].size(), std::vector<NumericT>(A.size()));
  for (std::size_t i = 0; i < A.size(); ++i)
    for (std::size_t j = 0; j < A[0].size(); ++j)
      A_trans[j][i] = A[i][j];

  std::vector<std::vector<NumericT> >     B_trans(B[0].size(), std::vector<NumericT>(B.size()));
  for (std::size_t i = 0; i < B.size(); ++i)
    for (std::size_t j = 0; j < B[0].size(); ++j)
      B_trans[j][i] = B[i][j];

  //
  // ViennaCL objects
  //
  // Ranges and slices are views into matrices with four times the dimensions of the operand.
  // Only the viewed entries are written from the host in this function; the remaining entries
  // of the enclosing matrices are left as constructed.
  //

  // A
  viennacl::range range1_A(matrix_size1, 2*matrix_size1);
  viennacl::range range2_A(matrix_size2, 2*matrix_size2);
  viennacl::slice slice1_A(matrix_size1, 2, matrix_size1);
  viennacl::slice slice2_A(matrix_size2, 3, matrix_size2);

  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size1, matrix_size2);
  viennacl::copy(A, vcl_A);

  viennacl::matrix<NumericT, F_A>    vcl_big_range_A(4*matrix_size1, 4*matrix_size2);
  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A(vcl_big_range_A, range1_A, range2_A);
  viennacl::copy(A, vcl_range_A);

  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A(4*matrix_size1, 4*matrix_size2);
  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A(vcl_big_slice_A, slice1_A, slice2_A);
  viennacl::copy(A, vcl_slice_A);


  // A^T
  viennacl::matrix<NumericT, F_A>    vcl_A_trans(matrix_size2, matrix_size1);
  viennacl::copy(A_trans, vcl_A_trans);

  viennacl::matrix<NumericT, F_A>    vcl_big_range_A_trans(4*matrix_size2, 4*matrix_size1);
  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A_trans(vcl_big_range_A_trans, range2_A, range1_A);
  viennacl::copy(A_trans, vcl_range_A_trans);

  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A_trans(4*matrix_size2, 4*matrix_size1);
  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A_trans(vcl_big_slice_A_trans, slice2_A, slice1_A);
  viennacl::copy(A_trans, vcl_slice_A_trans);


  // B
  viennacl::range range1_B(2*matrix_size2, 3*matrix_size2);
  viennacl::range range2_B(2*matrix_size3, 3*matrix_size3);
  viennacl::slice slice1_B(matrix_size2, 3, matrix_size2);
  viennacl::slice slice2_B(matrix_size3, 2, matrix_size3);

  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size2, matrix_size3);
  viennacl::copy(B, vcl_B);

  viennacl::matrix<NumericT, F_B>    vcl_big_range_B(4*matrix_size2, 4*matrix_size3);
  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B(vcl_big_range_B, range1_B, range2_B);
  viennacl::copy(B, vcl_range_B);

  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B(4*matrix_size2, 4*matrix_size3);
  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B(vcl_big_slice_B, slice1_B, slice2_B);
  viennacl::copy(B, vcl_slice_B);


  // B^T
  viennacl::matrix<NumericT, F_B>    vcl_B_trans(matrix_size3, matrix_size2);
  viennacl::copy(B_trans, vcl_B_trans);

  viennacl::matrix<NumericT, F_B>    vcl_big_range_B_trans(4*matrix_size3, 4*matrix_size2);
  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B_trans(vcl_big_range_B_trans, range2_B, range1_B);
  viennacl::copy(B_trans, vcl_range_B_trans);

  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B_trans(4*matrix_size3, 4*matrix_size2);
  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B_trans(vcl_big_slice_B_trans, slice2_B, slice1_B);
  viennacl::copy(B_trans, vcl_slice_B_trans);


  // C (result operands; nothing is copied into them here)
  viennacl::range range1_C(matrix_size1-1, 2*matrix_size1-1);
  viennacl::range range2_C(matrix_size3-1, 2*matrix_size3-1);
  viennacl::slice slice1_C(matrix_size1-1, 3, matrix_size1);
  viennacl::slice slice2_C(matrix_size3-1, 3, matrix_size3);

  viennacl::matrix<NumericT, F_C>    vcl_C(matrix_size1, matrix_size3);

  viennacl::matrix<NumericT, F_C>    vcl_big_range_C(4*matrix_size1, 4*matrix_size3);
  viennacl::matrix_range<viennacl::matrix<NumericT, F_C> > vcl_range_C(vcl_big_range_C, range1_C, range2_C);

  viennacl::matrix<NumericT, F_C>    vcl_big_slice_C(4*matrix_size1, 4*matrix_size3);
  viennacl::matrix_slice<viennacl::matrix<NumericT, F_C> > vcl_slice_C(vcl_big_slice_C, slice1_C, slice2_C);


  std::cout << "--- Part 1: Testing matrix-matrix products ---" << std::endl;

  //////
  //////  A: matrix
  //////

  std::cout << "Now using A=matrix, B=matrix, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_B, vcl_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=matrix, B=matrix, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_B, vcl_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=matrix, B=matrix, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_B, vcl_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  std::cout << "Now using A=matrix, B=range, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_range_B, vcl_range_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=matrix, B=range, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_range_B, vcl_range_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=matrix, B=range, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_range_B, vcl_range_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  std::cout << "Now using A=matrix, B=slice, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=matrix, B=slice, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=matrix, B=slice, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_A, vcl_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  //////
  //////  A: range
  //////

  std::cout << "Now using A=range, B=matrix, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_B, vcl_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=range, B=matrix, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_B, vcl_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=range, B=matrix, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_B, vcl_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  std::cout << "Now using A=range, B=range, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_range_B, vcl_range_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=range, B=range, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_range_B, vcl_range_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=range, B=range, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_range_B, vcl_range_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  std::cout << "Now using A=range, B=slice, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=range, B=slice, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=range, B=slice, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_range_A, vcl_range_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  //////
  //////  A: slice
  //////

  std::cout << "Now using A=slice, B=matrix, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_B, vcl_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=slice, B=matrix, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_B, vcl_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=slice, B=matrix, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_B, vcl_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  std::cout << "Now using A=slice, B=range, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_range_B, vcl_range_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=slice, B=range, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_range_B, vcl_range_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=slice, B=range, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_range_B, vcl_range_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  std::cout << "Now using A=slice, B=slice, C=matrix" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=slice, B=slice, C=range" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_range_C);
  if (ret != EXIT_SUCCESS)
    return ret;

  std::cout << "Now using A=slice, B=slice, C=slice" << std::endl;
  ret = test_prod<NumericT>(epsilon,
                            A, A_trans, B, B_trans, C,
                            vcl_slice_A, vcl_slice_A_trans, vcl_slice_B, vcl_slice_B_trans, vcl_slice_C);
  if (ret != EXIT_SUCCESS)
    return ret;


  return ret;

}
//
// Exercises viennacl::matrix_range (and, in the last section, viennacl::vector_range)
// against boost::numeric::ublas reference objects.
//
// Every operation is carried out twice: once on the ublas objects and once on the
// ViennaCL objects. After each step the two sides are compared through
// check_for_equality() / check_for_equality_vector(), which are defined elsewhere in this file.
//
// NOTE(review): 'T' and 'ScalarType' are not declared inside this block. Presumably they are
// template parameters of run_test(), with T being the second template argument handed to
// viennacl::matrix -- confirm against the declaration preceding this function.
//
// Returns EXIT_SUCCESS if all comparisons succeed, EXIT_FAILURE as soon as one of them fails.
//
int run_test()
{
    //typedef float               ScalarType;
    typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
    typedef boost::numeric::ublas::vector<ScalarType>       VectorType;

    typedef viennacl::matrix<ScalarType, T>    VCLMatrixType;
    typedef viennacl::vector<ScalarType>       VCLVectorType;

    viennacl::scalar<ScalarType> gpu_pi = ScalarType(3.1415);

    std::size_t dim_large = 151;
    std::size_t dim_small = 37;
    //std::size_t dim_large = 35;
    //std::size_t dim_small = 11;

    //
    // Setup ublas objects (reference data):
    //   A: dim_large x dim_large,  B: dim_small x dim_small,
    //   C: dim_large x dim_small,  D: dim_small x dim_large
    //
    MatrixType ublas_A(dim_large, dim_large);
    for (std::size_t i=0; i<ublas_A.size1(); ++i)
      for (std::size_t j=0; j<ublas_A.size2(); ++j)
        ublas_A(i,j) = ScalarType((i+1) + (j+1)*(i+1));

    MatrixType ublas_B(dim_small, dim_small);
    for (std::size_t i=0; i<ublas_B.size1(); ++i)
      for (std::size_t j=0; j<ublas_B.size2(); ++j)
        ublas_B(i,j) = ScalarType((i+1) + (j+1)*(i+1));

    MatrixType ublas_C(dim_large, dim_small);
    for (std::size_t i=0; i<ublas_C.size1(); ++i)
      for (std::size_t j=0; j<ublas_C.size2(); ++j)
        ublas_C(i,j) = ScalarType((j+2) + (j+1)*(i+1));

    MatrixType ublas_D(dim_small, dim_large);
    for (std::size_t i=0; i<ublas_D.size1(); ++i)
      for (std::size_t j=0; j<ublas_D.size2(); ++j)
        ublas_D(i,j) = ScalarType((j+2) + (j+1)*(i+1));

    // r1 selects the first dim_small indices, r2 the last dim_small indices of a dim_large extent.
    // All submatrices below are therefore dim_small x dim_small.
    boost::numeric::ublas::range ublas_r1(0, dim_small);
    boost::numeric::ublas::range ublas_r2(dim_large - dim_small, dim_large);
    boost::numeric::ublas::matrix_range<MatrixType> ublas_A_sub1(ublas_A, ublas_r1, ublas_r1);
    boost::numeric::ublas::matrix_range<MatrixType> ublas_A_sub2(ublas_A, ublas_r2, ublas_r2);

    boost::numeric::ublas::matrix_range<MatrixType> ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
    boost::numeric::ublas::matrix_range<MatrixType> ublas_D_sub(ublas_D, ublas_r1, ublas_r1);

    //
    // Setup ViennaCL objects mirroring the ublas objects above
    //
    VCLMatrixType vcl_A(dim_large, dim_large);
    viennacl::copy(ublas_A, vcl_A);
    VCLMatrixType vcl_B(dim_small, dim_small);
    viennacl::copy(ublas_B, vcl_B);
    VCLMatrixType vcl_C(dim_large, dim_small);
    viennacl::copy(ublas_C, vcl_C);
    VCLMatrixType vcl_D(dim_small, dim_large);
    viennacl::copy(ublas_D, vcl_D);

    viennacl::range vcl_r1(0, dim_small);
    viennacl::range vcl_r2(dim_large - dim_small, dim_large);
    viennacl::matrix_range<VCLMatrixType>   vcl_A_sub1(vcl_A, vcl_r1, vcl_r1);
    viennacl::matrix_range<VCLMatrixType>   vcl_A_sub2(vcl_A, vcl_r2, vcl_r2);

    viennacl::matrix_range<VCLMatrixType>   vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
    viennacl::matrix_range<VCLMatrixType>   vcl_D_sub(vcl_D, vcl_r1, vcl_r1);

    std::cout << std::endl;
    std::cout << "//" << std::endl;
    std::cout << "////////// Test: Copy CTOR //////////" << std::endl;
    std::cout << "//" << std::endl;

    {
      std::cout << "Testing matrix created from range... ";
      ublas_B = ublas_A_sub1;
      VCLMatrixType vcl_temp = vcl_A_sub1;
      if (check_for_equality(ublas_B, vcl_temp))
        std::cout << "PASSED!" << std::endl;
      else
      {
        std::cout << std::endl << "TEST failed!";
        return EXIT_FAILURE;
      }

      std::cout << "Testing range created from range... ";
      //ublas_A_sub1 = ublas_A_sub1;
      VCLMatrixType vcl_ctor_sub1 = vcl_A_sub1;  //Note: This is mostly a compilation test only
      // The comparison below only verifies that the source matrices are still in sync.
      if (check_for_equality(ublas_A, vcl_A))
        std::cout << "PASSED!" << std::endl;
      else
      {
        std::cout << std::endl << "TEST failed!";
        return EXIT_FAILURE;
      }
    }


    std::cout << std::endl;
    std::cout << "//" << std::endl;
    std::cout << "////////// Test: Assignments //////////" << std::endl;
    std::cout << "//" << std::endl;

    std::cout << "Testing matrix assigned to range... ";
    ublas_A_sub1 = ublas_B;
    vcl_A_sub1 = vcl_B;
    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Testing range assigned to matrix... ";
    ublas_B = ublas_A_sub2;
    vcl_B = vcl_A_sub2;
    if (check_for_equality(ublas_B, vcl_B))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Testing range assigned to range... ";
    ublas_A_sub1 = ublas_C_sub;
    vcl_A_sub1 = vcl_C_sub;
    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }



    std::cout << std::endl;
    std::cout << "//" << std::endl;
    std::cout << "////////// Test 1: Copy to GPU //////////" << std::endl;
    std::cout << "//" << std::endl;

    // Write ublas_B into a range on both sides, then compare the full enclosing matrices.
    ublas_A_sub1 = ublas_B;
    viennacl::copy(ublas_B, vcl_A_sub1);
    std::cout << "Testing upper left copy to A... ";
    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }


    ublas_A_sub2 = ublas_B;
    viennacl::copy(ublas_B, vcl_A_sub2);
    std::cout << "Testing lower right copy to A... ";
    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }



    ublas_C_sub = ublas_B;
    viennacl::copy(ublas_B, vcl_C_sub);
    std::cout << "Testing upper copy to C... ";
    if (check_for_equality(ublas_C, vcl_C))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }


    ublas_D_sub = ublas_B;
    viennacl::copy(ublas_B, vcl_D_sub);
    std::cout << "Testing left copy to D... ";
    if (check_for_equality(ublas_D, vcl_D))
      std::cout << "PASSED!" << std::endl;
    else
    {
      // A mismatch here has to abort the run just like every other check does.
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << std::endl;
    std::cout << "//" << std::endl;
    std::cout << "////////// Test 2: Copy from GPU //////////" << std::endl;
    std::cout << "//" << std::endl;

    // Same data as in Test 1, but now the ranges themselves are compared instead of the full matrices.
    std::cout << "Testing upper left copy to A... ";
    if (check_for_equality(ublas_A_sub1, vcl_A_sub1))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Testing lower right copy to A... ";
    if (check_for_equality(ublas_A_sub2, vcl_A_sub2))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Testing upper copy to C... ";
    if (check_for_equality(ublas_C_sub, vcl_C_sub))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Testing left copy to D... ";
    if (check_for_equality(ublas_D_sub, vcl_D_sub))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }


    std::cout << "//" << std::endl;
    std::cout << "////////// Test 3: Addition //////////" << std::endl;
    std::cout << "//" << std::endl;
    // Bring the ViennaCL range back in sync with the reference before starting.
    viennacl::copy(ublas_A_sub2, vcl_A_sub2);

    std::cout << "Inplace add to submatrix: ";
    ublas_A_sub2 += ublas_A_sub2;
    vcl_A_sub2 += vcl_A_sub2;

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Inplace add to matrix: ";
    ublas_B += ublas_A_sub2;
    vcl_B += vcl_A_sub2;

    if (check_for_equality(ublas_B, vcl_B))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Inplace add of matrix: ";
    ublas_A_sub2 += ublas_B;
    vcl_A_sub2 += vcl_B;

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Add to submatrix: ";
    ublas_A_sub2 = ublas_A_sub2 + ublas_A_sub2;
    vcl_A_sub2 = vcl_A_sub2 + vcl_A_sub2;

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Add to matrix: ";
    ublas_B = ublas_A_sub2 + ublas_A_sub2;
    vcl_B = vcl_A_sub2 + vcl_A_sub2;

    if (check_for_equality(ublas_B, vcl_B))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }



    std::cout << "//" << std::endl;
    std::cout << "////////// Test 4: Subtraction //////////" << std::endl;
    std::cout << "//" << std::endl;
    // Bring the ViennaCL range back in sync with the reference before starting.
    viennacl::copy(ublas_A_sub2, vcl_A_sub2);

    std::cout << "Inplace sub to submatrix: ";
    ublas_A_sub2 -= ublas_A_sub2;
    vcl_A_sub2 -= vcl_A_sub2;

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Inplace sub to matrix: ";
    ublas_B -= ublas_A_sub2;
    vcl_B -= vcl_A_sub2;

    if (check_for_equality(ublas_B, vcl_B))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Inplace sub of matrix: ";
    ublas_A_sub2 -= ublas_B;
    vcl_A_sub2 -= vcl_B;

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Sub from submatrix: ";
    ublas_A_sub2 = ublas_A_sub2 - ublas_A_sub2;
    vcl_A_sub2 = vcl_A_sub2 - vcl_A_sub2;

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Sub from matrix: ";
    ublas_B = ublas_A_sub2 - ublas_A_sub2;
    vcl_B = vcl_A_sub2 - vcl_A_sub2;

    if (check_for_equality(ublas_B, vcl_B))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }


    std::cout << "//" << std::endl;
    std::cout << "////////// Test 5: Scaling //////////" << std::endl;
    std::cout << "//" << std::endl;
    // Re-synchronize the full matrix A before scaling its lower right block.
    viennacl::copy(ublas_A, vcl_A);

    std::cout << "Multiplication with CPU scalar: ";
    ublas_A_sub2 *= ScalarType(3.1415);
    vcl_A_sub2 *= ScalarType(3.1415);

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Multiplication with GPU scalar: ";
    ublas_A_sub2 *= gpu_pi;
    vcl_A_sub2 *= gpu_pi;

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }


    std::cout << "Division with CPU scalar: ";
    ublas_A_sub2 /= ScalarType(3.1415);
    vcl_A_sub2 /= ScalarType(3.1415);

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Division with GPU scalar: ";
    ublas_A_sub2 /= gpu_pi;
    vcl_A_sub2 /= gpu_pi;

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }



    std::cout << "//" << std::endl;
    std::cout << "////////// Test 6: Matrix-Matrix Products //////////" << std::endl;
    std::cout << "//" << std::endl;

    // Operands are the square ranges of C and D; the result is written to the upper left block of A.
    std::cout << "Assigned C = A * B: ";
    ublas_A_sub1 = prod(ublas_C_sub, ublas_D_sub);
    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Assigned C = A^T * B: ";
    ublas_A_sub1 = prod(trans(ublas_C_sub), ublas_D_sub);
    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), vcl_D_sub);

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Assigned C = A * B^T: ";
    ublas_A_sub1 = prod(ublas_C_sub, trans(ublas_D_sub));
    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, trans(vcl_D_sub));

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Assigned C = A^T * B^T: ";
    ublas_A_sub1 = prod(trans(ublas_C_sub), trans(ublas_D_sub));
    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), trans(vcl_D_sub));

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << "Inplace add of prod(): ";
    ublas_A_sub1 += prod(ublas_C_sub, ublas_D_sub);
    vcl_A_sub1 += viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);

    if (check_for_equality(ublas_A, vcl_A))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }


    std::cout << "//" << std::endl;
    std::cout << "////////// Test 7: Matrix-Vector Products //////////" << std::endl;
    std::cout << "//" << std::endl;

    VectorType ublas_v1(dim_large);
    for (std::size_t i=0; i<ublas_v1.size(); ++i)
      ublas_v1(i) = ScalarType(i);
    boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub(ublas_v1, ublas_r1);

    VectorType ublas_v2(dim_large);
    for (std::size_t i=0; i<ublas_v2.size(); ++i)
      ublas_v2(i) = ScalarType(i) - ScalarType(5);  // subtract in ScalarType: 'i - 5' on std::size_t wraps around for i < 5
    boost::numeric::ublas::vector_range<VectorType> ublas_v2_sub(ublas_v2, ublas_r1);


    VCLVectorType vcl_v1(ublas_v1.size());
    viennacl::vector_range<VCLVectorType> vcl_v1_sub(vcl_v1, vcl_r1);
    VCLVectorType vcl_v2(ublas_v2.size());
    viennacl::vector_range<VCLVectorType> vcl_v2_sub(vcl_v2, vcl_r1);
    viennacl::copy(ublas_v1, vcl_v1);
    viennacl::copy(ublas_v2, vcl_v2);
    viennacl::copy(ublas_A_sub1, vcl_A_sub1);


    // Product of the upper left block of A with the leading range of v1, stored in the leading range of v2.
    std::cout << "Matrix-vector product with ranges: ";
    ublas_v2_sub = prod(ublas_A_sub1, ublas_v1_sub);
    vcl_v2_sub = viennacl::linalg::prod(vcl_A_sub1, vcl_v1_sub);

    if (check_for_equality_vector(ublas_v2, vcl_v2))
      std::cout << "PASSED!" << std::endl;
    else
    {
      std::cout << std::endl << "TEST failed!";
      return EXIT_FAILURE;
    }

    std::cout << std::endl;
    std::cout << "----------------------------------------------" << std::endl;
    std::cout << std::endl;


    return EXIT_SUCCESS;
}