int main()
{
  std::size_t size  = 10; // at least 7
  float  eps_float  = 1e-5;
  double eps_double = 1e-12;

  float  ref_float_alpha;
  double ref_double_alpha;

  std::vector<float> ref_float_x(size, 1.0f);
  std::vector<float> ref_float_y(size, 2.0f);

  std::vector<double> ref_double_x(size, 1.0);
  std::vector<double> ref_double_y(size, 2.0);

  // Host setup
  ViennaCLHostBackend my_host_backend = NULL;
  float host_float_alpha = 0;
  viennacl::vector<float> host_float_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::MAIN_MEMORY));
  viennacl::vector<float> host_float_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::MAIN_MEMORY));

  double host_double_alpha = 0;
  viennacl::vector<double> host_double_x = viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::MAIN_MEMORY));
  viennacl::vector<double> host_double_y = viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::MAIN_MEMORY));

  // CUDA setup
#ifdef VIENNACL_WITH_CUDA
  ViennaCLCUDABackend my_cuda_backend = NULL;
  float cuda_float_alpha = 0;
  viennacl::vector<float> cuda_float_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::CUDA_MEMORY));
  viennacl::vector<float> cuda_float_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::CUDA_MEMORY));

  double cuda_double_alpha = 0;
  viennacl::vector<double> cuda_double_x = viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::CUDA_MEMORY));
  viennacl::vector<double> cuda_double_y = viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::CUDA_MEMORY));
#endif

  // OpenCL setup
#ifdef VIENNACL_WITH_OPENCL
  std::size_t context_id = 0;
  float opencl_float_alpha = 0;
  viennacl::vector<float> opencl_float_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::ocl::get_context(context_id)));
  viennacl::vector<float> opencl_float_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::ocl::get_context(context_id)));

  double opencl_double_alpha = 0;
  viennacl::vector<double> *opencl_double_x = NULL;
  viennacl::vector<double> *opencl_double_y = NULL;
  if( viennacl::ocl::current_device().double_support() )
  {
    *opencl_double_x = viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::ocl::get_context(context_id)));
    *opencl_double_y = viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::ocl::get_context(context_id)));
  }

  ViennaCLOpenCLBackend_impl my_opencl_backend_impl;
  my_opencl_backend_impl.context_id = context_id;
  ViennaCLOpenCLBackend my_opencl_backend = &my_opencl_backend_impl;
#endif

  // consistency checks:
  check(ref_float_x, host_float_x, eps_float);
  check(ref_float_y, host_float_y, eps_float);
  check(ref_double_x, host_double_x, eps_double);
  check(ref_double_y, host_double_y, eps_double);
#ifdef VIENNACL_WITH_CUDA
  check(ref_float_x, cuda_float_x, eps_float);
  check(ref_float_y, cuda_float_y, eps_float);
  check(ref_double_x, cuda_double_x, eps_double);
  check(ref_double_y, cuda_double_y, eps_double);
#endif
#ifdef VIENNACL_WITH_OPENCL
  check(ref_float_x, opencl_float_x, eps_float);
  check(ref_float_y, opencl_float_y, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    check(ref_double_x, *opencl_double_x, eps_double);
    check(ref_double_y, *opencl_double_y, eps_double);
  }
#endif

  // ASUM
  std::cout << std::endl << "-- Testing xASUM...";
  ref_float_alpha  = 0;
  ref_double_alpha = 0;
  for (std::size_t i=0; i<size/4; ++i)
  {
    ref_float_alpha  += std::fabs(ref_float_x[2 + 3*i]);
    ref_double_alpha += std::fabs(ref_double_x[2 + 3*i]);
  }

  std::cout << std::endl << "Host: ";
  ViennaCLHostSasum(my_host_backend, size/4,
                    &host_float_alpha,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 3);
  check(ref_float_alpha, host_float_alpha, eps_float);
  ViennaCLHostDasum(my_host_backend, size/4,
                    &host_double_alpha,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 3);
  check(ref_double_alpha, host_double_alpha, eps_double);


#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  ViennaCLCUDASasum(my_cuda_backend, size/4,
                    &cuda_float_alpha,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 3);
  check(ref_float_alpha, cuda_float_alpha, eps_float);
  ViennaCLCUDADasum(my_cuda_backend, size/4,
                    &cuda_double_alpha,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 3);
  check(ref_double_alpha, cuda_double_alpha, eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  ViennaCLOpenCLSasum(my_opencl_backend, size/4,
                      &opencl_float_alpha,
                      viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 3);
  check(ref_float_alpha, opencl_float_alpha, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLDasum(my_opencl_backend, size/4,
                        &opencl_double_alpha,
                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 3);
    check(ref_double_alpha, opencl_double_alpha, eps_double);
  }
#endif



  // AXPY
  std::cout << std::endl << "-- Testing xAXPY...";
  for (std::size_t i=0; i<size/3; ++i)
  {
    ref_float_y[1 + 2*i]  += 2.0f * ref_float_x[0 + 2*i];
    ref_double_y[1 + 2*i] += 2.0  * ref_double_x[0 + 2*i];
  }

  std::cout << std::endl << "Host: ";
  ViennaCLHostSaxpy(my_host_backend, size/3,
                    2.0f,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 0, 2,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2);
  check(ref_float_x, host_float_x, eps_float);
  check(ref_float_y, host_float_y, eps_float);
  ViennaCLHostDaxpy(my_host_backend, size/3,
                    2.0,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 0, 2,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2);
  check(ref_double_x, host_double_x, eps_double);
  check(ref_double_y, host_double_y, eps_double);


#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  ViennaCLCUDASaxpy(my_cuda_backend, size/3,
                    2.0f,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 0, 2,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2);
  check(ref_float_x, cuda_float_x, eps_float);
  check(ref_float_y, cuda_float_y, eps_float);
  ViennaCLCUDADaxpy(my_cuda_backend, size/3,
                    2.0,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 0, 2,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2);
  check(ref_double_x, cuda_double_x, eps_double);
  check(ref_double_y, cuda_double_y, eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  ViennaCLOpenCLSaxpy(my_opencl_backend, size/3,
                      2.0f,
                      viennacl::traits::opencl_handle(opencl_float_x).get(), 0, 2,
                      viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2);
  check(ref_float_x, opencl_float_x, eps_float);
  check(ref_float_y, opencl_float_y, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLDaxpy(my_opencl_backend, size/3,
                        2.0,
                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 0, 2,
                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2);
    check(ref_double_x, *opencl_double_x, eps_double);
    check(ref_double_y, *opencl_double_y, eps_double);
  }
#endif



  // COPY
  std::cout << std::endl << "-- Testing xCOPY...";
  for (std::size_t i=0; i<size/3; ++i)
  {
    ref_float_y[0 + 2*i]  = ref_float_x[1 + 2*i];
    ref_double_y[0 + 2*i] = ref_double_x[1 + 2*i];
  }

  std::cout << std::endl << "Host: ";
  ViennaCLHostScopy(my_host_backend, size/3,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 2,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 0, 2);
  check(ref_float_x, host_float_x, eps_float);
  check(ref_float_y, host_float_y, eps_float);
  ViennaCLHostDcopy(my_host_backend, size/3,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 2,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 0, 2);
  check(ref_double_x, host_double_x, eps_double);
  check(ref_double_y, host_double_y, eps_double);


#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  ViennaCLCUDAScopy(my_cuda_backend, size/3,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 2,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 0, 2);
  check(ref_float_x, cuda_float_x, eps_float);
  check(ref_float_y, cuda_float_y, eps_float);
  ViennaCLCUDADcopy(my_cuda_backend, size/3,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 2,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 0, 2);
  check(ref_double_x, cuda_double_x, eps_double);
  check(ref_double_y, cuda_double_y, eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  ViennaCLOpenCLScopy(my_opencl_backend, size/3,
                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 2,
                      viennacl::traits::opencl_handle(opencl_float_y).get(), 0, 2);
  check(ref_float_x, opencl_float_x, eps_float);
  check(ref_float_y, opencl_float_y, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLDcopy(my_opencl_backend, size/3,
                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 2,
                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 0, 2);
    check(ref_double_x, *opencl_double_x, eps_double);
    check(ref_double_y, *opencl_double_y, eps_double);
  }
#endif



  // DOT
  std::cout << std::endl << "-- Testing xDOT...";
  ref_float_alpha  = 0;
  ref_double_alpha = 0;
  for (std::size_t i=0; i<size/2; ++i)
  {
    ref_float_alpha  += ref_float_y[3 + 2*i]  * ref_float_x[2 + 2*i];
    ref_double_alpha += ref_double_y[3 + 2*i] * ref_double_x[2 + 2*i];
  }

  std::cout << std::endl << "Host: ";
  ViennaCLHostSdot(my_host_backend, size/2,
                   &host_float_alpha,
                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 1,
                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 3, 1);
  check(ref_float_alpha, host_float_alpha, eps_float);
  ViennaCLHostDdot(my_host_backend, size/2,
                   &host_double_alpha,
                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 1,
                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 3, 1);
  check(ref_double_alpha, host_double_alpha, eps_double);


#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  ViennaCLCUDASdot(my_cuda_backend, size/2,
                   &cuda_float_alpha,
                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 1,
                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 3, 1);
  check(ref_float_alpha, cuda_float_alpha, eps_float);
  ViennaCLCUDADdot(my_cuda_backend, size/2,
                   &cuda_double_alpha,
                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 1,
                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 3, 1);
  check(ref_double_alpha, cuda_double_alpha, eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  ViennaCLOpenCLSdot(my_opencl_backend, size/2,
                     &opencl_float_alpha,
                     viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 1,
                     viennacl::traits::opencl_handle(opencl_float_y).get(), 3, 1);
  check(ref_float_alpha, opencl_float_alpha, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLDdot(my_opencl_backend, size/2,
                       &opencl_double_alpha,
                       viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 1,
                       viennacl::traits::opencl_handle(*opencl_double_y).get(), 3, 1);
    check(ref_double_alpha, opencl_double_alpha, eps_double);
  }
#endif



  // NRM2
  std::cout << std::endl << "-- Testing xNRM2...";
  ref_float_alpha  = 0;
  ref_double_alpha = 0;
  for (std::size_t i=0; i<size/3; ++i)
  {
    ref_float_alpha  += ref_float_x[1 + 2*i]  * ref_float_x[1 + 2*i];
    ref_double_alpha += ref_double_x[1 + 2*i] * ref_double_x[1 + 2*i];
  }
  ref_float_alpha = std::sqrt(ref_float_alpha);
  ref_double_alpha = std::sqrt(ref_double_alpha);

  std::cout << std::endl << "Host: ";
  ViennaCLHostSnrm2(my_host_backend, size/3,
                    &host_float_alpha,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 2);
  check(ref_float_alpha, host_float_alpha, eps_float);
  ViennaCLHostDnrm2(my_host_backend, size/3,
                    &host_double_alpha,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 2);
  check(ref_double_alpha, host_double_alpha, eps_double);


#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  ViennaCLCUDASnrm2(my_cuda_backend, size/3,
                    &cuda_float_alpha,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 2);
  check(ref_float_alpha, cuda_float_alpha, eps_float);
  ViennaCLCUDADnrm2(my_cuda_backend, size/3,
                    &cuda_double_alpha,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 2);
  check(ref_double_alpha, cuda_double_alpha, eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  ViennaCLOpenCLSnrm2(my_opencl_backend, size/3,
                      &opencl_float_alpha,
                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 2);
  check(ref_float_alpha, opencl_float_alpha, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLDnrm2(my_opencl_backend, size/3,
                        &opencl_double_alpha,
                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 2);
    check(ref_double_alpha, opencl_double_alpha, eps_double);
  }
#endif




  // ROT
  std::cout << std::endl << "-- Testing xROT...";
  for (std::size_t i=0; i<size/4; ++i)
  {
    float tmp            =  0.6 * ref_float_x[2 + 3*i] + 0.8 * ref_float_y[1 + 2*i];
    ref_float_y[1 + 2*i] = -0.8 * ref_float_x[2 + 3*i] + 0.6 * ref_float_y[1 + 2*i];;
    ref_float_x[2 + 3*i] = tmp;

    double tmp2           =  0.6 * ref_double_x[2 + 3*i] + 0.8 * ref_double_y[1 + 2*i];
    ref_double_y[1 + 2*i] = -0.8 * ref_double_x[2 + 3*i] + 0.6 * ref_double_y[1 + 2*i];;
    ref_double_x[2 + 3*i] = tmp2;
  }

  std::cout << std::endl << "Host: ";
  ViennaCLHostSrot(my_host_backend, size/4,
                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 3,
                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2,
                   0.6f, 0.8f);
  check(ref_float_x, host_float_x, eps_float);
  check(ref_float_y, host_float_y, eps_float);
  ViennaCLHostDrot(my_host_backend, size/4,
                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 3,
                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2,
                   0.6, 0.8);
  check(ref_double_x, host_double_x, eps_double);
  check(ref_double_y, host_double_y, eps_double);


#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  ViennaCLCUDASrot(my_cuda_backend, size/4,
                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 3,
                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2,
                   0.6f, 0.8f);
  check(ref_float_x, cuda_float_x, eps_float);
  check(ref_float_y, cuda_float_y, eps_float);
  ViennaCLCUDADrot(my_cuda_backend, size/4,
                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 3,
                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2,
                   0.6, 0.8);
  check(ref_double_x, cuda_double_x, eps_double);
  check(ref_double_y, cuda_double_y, eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  ViennaCLOpenCLSrot(my_opencl_backend, size/4,
                     viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 3,
                     viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2,
                     0.6f, 0.8f);
  check(ref_float_x, opencl_float_x, eps_float);
  check(ref_float_y, opencl_float_y, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLDrot(my_opencl_backend, size/4,
                       viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 3,
                       viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2,
                       0.6, 0.8);
    check(ref_double_x, *opencl_double_x, eps_double);
    check(ref_double_y, *opencl_double_y, eps_double);
  }
#endif



  // SCAL
  std::cout << std::endl << "-- Testing xSCAL...";
  for (std::size_t i=0; i<size/4; ++i)
  {
    ref_float_x[1 + 3*i]  *= 2.0f;
    ref_double_x[1 + 3*i] *= 2.0;
  }

  std::cout << std::endl << "Host: ";
  ViennaCLHostSscal(my_host_backend, size/4,
                    2.0f,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 3);
  check(ref_float_x, host_float_x, eps_float);
  ViennaCLHostDscal(my_host_backend, size/4,
                    2.0,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 3);
  check(ref_double_x, host_double_x, eps_double);

#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  ViennaCLCUDASscal(my_cuda_backend, size/4,
                    2.0f,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 3);
  check(ref_float_x, cuda_float_x, eps_float);
  ViennaCLCUDADscal(my_cuda_backend, size/4,
                    2.0,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 3);
  check(ref_double_x, cuda_double_x, eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  ViennaCLOpenCLSscal(my_opencl_backend, size/4,
                      2.0f,
                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 3);
  check(ref_float_x, opencl_float_x, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLDscal(my_opencl_backend, size/4,
                        2.0,
                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 3);
    check(ref_double_x, *opencl_double_x, eps_double);
  }
#endif


  // SWAP
  std::cout << std::endl << "-- Testing xSWAP...";
  for (std::size_t i=0; i<size/3; ++i)
  {
    float tmp = ref_float_x[2 + 2*i];
    ref_float_x[2 + 2*i] = ref_float_y[1 + 2*i];
    ref_float_y[1 + 2*i] = tmp;

    double tmp2 = ref_double_x[2 + 2*i];
    ref_double_x[2 + 2*i] = ref_double_y[1 + 2*i];
    ref_double_y[1 + 2*i] = tmp2;
  }

  std::cout << std::endl << "Host: ";
  ViennaCLHostSswap(my_host_backend, size/3,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 2,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2);
  check(ref_float_y, host_float_y, eps_float);
  ViennaCLHostDswap(my_host_backend, size/3,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 2,
                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2);
  check(ref_double_y, host_double_y, eps_double);


#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  ViennaCLCUDASswap(my_cuda_backend, size/3,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 2,
                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2);
  check(ref_float_y, cuda_float_y, eps_float);
  ViennaCLCUDADswap(my_cuda_backend, size/3,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 2,
                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2);
  check(ref_double_y, cuda_double_y, eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  ViennaCLOpenCLSswap(my_opencl_backend, size/3,
                      viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 2,
                      viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2);
  check(ref_float_y, opencl_float_y, eps_float);
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLDswap(my_opencl_backend, size/3,
                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 2,
                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2);
    check(ref_double_y, *opencl_double_y, eps_double);
  }
#endif


  // IAMAX
  std::cout << std::endl << "-- Testing IxASUM...";
  size_t ref_index = 0;
  ref_float_alpha = 0;
  for (std::size_t i=0; i<size/3; ++i)
  {
    if (ref_float_x[0 + 2*i] > std::fabs(ref_float_alpha))
    {
      ref_index = i;
      ref_float_alpha = std::fabs(ref_float_x[0 + 2*i]);
    }
  }

  std::cout << std::endl << "Host: ";
  size_t idx = 0;
  ViennaCLHostiSamax(my_host_backend, size/3,
                     &idx,
                     viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 0, 2);
  check(ref_index, idx, eps_float);
  idx = 0;
  ViennaCLHostiDamax(my_host_backend, size/3,
                     &idx,
                     viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 0, 2);
  check(ref_index, idx, eps_double);

#ifdef VIENNACL_WITH_CUDA
  std::cout << std::endl << "CUDA: ";
  idx = 0;
  ViennaCLCUDAiSamax(my_cuda_backend, size/3,
                     &idx,
                     viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 0, 2);
  check(ref_float_x[2*ref_index], ref_float_x[2*idx], eps_float);
  idx = 0;
  ViennaCLCUDAiDamax(my_cuda_backend, size/3,
                     &idx,
                     viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 0, 2);
  check(ref_double_x[2*ref_index], ref_double_x[2*idx], eps_double);
#endif

#ifdef VIENNACL_WITH_OPENCL
  std::cout << std::endl << "OpenCL: ";
  idx = 0;
  ViennaCLOpenCLiSamax(my_opencl_backend, size/3,
                       &idx,
                       viennacl::traits::opencl_handle(opencl_float_x).get(), 0, 2);
  check(ref_float_x[2*ref_index], ref_float_x[2*idx], eps_float);
  idx = 0;
  if( viennacl::ocl::current_device().double_support() )
  {
    ViennaCLOpenCLiDamax(my_opencl_backend, size/3,
                         &idx,
                         viennacl::traits::opencl_handle(*opencl_double_x).get(), 0, 2);
    check(ref_double_x[2*ref_index], ref_double_x[2*idx], eps_double);
  }
#endif



  //
  //  That's it.
  //
  std::cout << std::endl << "!!!! TEST COMPLETED SUCCESSFULLY !!!!" << std::endl;

  return EXIT_SUCCESS;
}
// Example #2
// 0
/**
*  In this example we only create two vectors and swap values between them.
**/
int main()
{
    std::size_t size = 10;

    ViennaCLInt half_size = static_cast<ViennaCLInt>(size / 2);


    /**
    * Before we start we need to create a backend.
    * This allows one later to specify OpenCL command queues, CPU threads, or CUDA streams while preserving common interfaces.
    **/
    ViennaCLBackend my_backend;
    ViennaCLBackendCreate(&my_backend);


    /**
    *  <h2>Host-based Execution</h2>
    *
    *  We use the host to swap all odd entries of x (all ones) with all even entries in y (all twos):
    **/

    viennacl::vector<float> host_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::MAIN_MEMORY));
    viennacl::vector<float> host_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::MAIN_MEMORY));

    ViennaCLHostSswap(my_backend, half_size,
                      viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_x), 1, 2,
                      viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_y), 0, 2);

    std::cout << " --- Host ---" << std::endl;
    std::cout << "host_x: " << host_x << std::endl;
    std::cout << "host_y: " << host_y << std::endl;

    /**
    *   <h2>CUDA-based Execution</h2>
    *
    *  We use CUDA to swap all even entries in x (all ones) with all odd entries in y (all twos)
    **/

#ifdef VIENNACL_WITH_CUDA
    viennacl::vector<float> cuda_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::CUDA_MEMORY));
    viennacl::vector<float> cuda_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::CUDA_MEMORY));

    ViennaCLCUDASswap(my_backend, half_size,
                      viennacl::cuda_arg(cuda_x), 0, 2,
                      viennacl::cuda_arg(cuda_y), 1, 2);

    std::cout << " --- CUDA ---" << std::endl;
    std::cout << "cuda_x: " << cuda_x << std::endl;
    std::cout << "cuda_y: " << cuda_y << std::endl;
#endif

    /**
    *  <h2>OpenCL-based Execution</h2>
    *
    *  Use OpenCL to swap all odd entries in x (all ones) with all odd entries in y (all twos)
    **/

#ifdef VIENNACL_WITH_OPENCL
    long context_id = 0;
    viennacl::vector<float> opencl_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::ocl::get_context(context_id)));
    viennacl::vector<float> opencl_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::ocl::get_context(context_id)));

    ViennaCLBackendSetOpenCLContextID(my_backend, static_cast<ViennaCLInt>(context_id));

    ViennaCLOpenCLSswap(my_backend, half_size,
                        viennacl::traits::opencl_handle(opencl_x).get(), 1, 2,
                        viennacl::traits::opencl_handle(opencl_y).get(), 1, 2);

    std::cout << " --- OpenCL ---" << std::endl;
    std::cout << "opencl_x: " << opencl_x << std::endl;
    std::cout << "opencl_y: " << opencl_y << std::endl;
#endif

    /**
    *  The last step is to clean up by destroying the backend:
    **/
    ViennaCLBackendDestroy(&my_backend);

    std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;

    return EXIT_SUCCESS;
}