void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
  {
    viennacl::kernel & kernel = programs[0].program().kernel(kernel_prefix);

    kernel.local_work_size(0, p_.local_size_0);
    kernel.local_work_size(1, p_.local_size_1);
    kernel.global_work_size(0,p_.local_size_0*p_.num_groups_0);
    kernel.global_work_size(1,p_.local_size_1*p_.num_groups_1);

    scheduler::statement_node const & root = statements.data().front().array()[statements.data().front().root()];
    unsigned int current_arg = 0;
    if (up_to_internal_size_)
    {
      kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::internal_size1_fun())));
      kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::internal_size2_fun())));
    }
    else
    {
      kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::size1_fun())));
      kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::size2_fun())));
    }

    set_arguments(statements, kernel, current_arg);

    kernel.enqueue();
  }
Example #2
        void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg)  const{
          configure_local_sizes(k, kernel_id);

          k.global_work_size(0,local_size_1_*num_groups_row_);
          k.global_work_size(1,local_size_2_*num_groups_col_);

          scheduler::statement_node const & first_node = statements.front().second;
          k.arg(n_arg++, cl_uint(utils::call_on_matrix(first_node.lhs, utils::internal_size1_fun())));
          k.arg(n_arg++, cl_uint(utils::call_on_matrix(first_node.lhs, utils::internal_size2_fun())));
        }
Example #3
File: Event.cpp Project: k0zmo/clw
 void EventList::waitForFinished()
 {
     if(_events.empty())
         return;
     cl_int error;
     if((error = clWaitForEvents(cl_uint(_events.size()),
             operator const cl_event*())) != CL_SUCCESS)
         detail::reportError("EventList::waitForFinished() ", error);
 }
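The explicit operator const cl_event*() call above relies on a conversion operator defined by EventList. A minimal sketch of such an operator, assuming the events are stored contiguously in a std::vector<cl_event> (the actual clw class may differ):

#include <vector>
#include <CL/cl.h>

// Hypothetical sketch only: expose the underlying event storage as a raw
// cl_event array for OpenCL calls such as clWaitForEvents().
class EventListSketch
{
public:
    operator const cl_event*() const
    {
        // an empty list yields a null pointer; callers check for emptiness anyway
        return _events.empty() ? 0 : &_events[0];
    }

private:
    std::vector<cl_event> _events;  // assumed contiguous storage
};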
Example #4
        void configure_range_enqueue_arguments(std::size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg)  const{
          configure_local_sizes(k, kernel_id);

          k.global_work_size(0,group_size_*num_groups_);
          k.global_work_size(1,1);

          scheduler::statement_node const & first_node = statements.front().second;
          viennacl::vcl_size_t N = utils::call_on_vector(first_node.lhs, utils::internal_size_fun());
          k.arg(n_arg++, cl_uint(N/vectorization_));
        }
/**
 * Reference CPU implementation of Simple Convolution 
 * for performance comparison
 */
void 
SimpleConvolution::simpleConvolutionCPUReference(cl_uint  *output,
                                                 const cl_uint  *input,
                                                 const cl_float *mask,
                                                 const cl_uint  width,
                                                 const cl_uint height,
                                                 const cl_uint maskWidth,
                                                 const cl_uint maskHeight)
{
    cl_uint vstep = (maskWidth  - 1) / 2;
    cl_uint hstep = (maskHeight - 1) / 2;

    // for each pixel in the input
    for(cl_uint x = 0; x < width; x++)
        for(cl_uint y = 0; y < height; y++)
        {
            /*
             * find the left, right, top and bottom indices such that
             * the indices do not go beyond image boundaries
             */
            cl_uint left    = (x           <  vstep) ? 0         : (x - vstep);
            cl_uint right   = ((x + vstep) >= width) ? width - 1 : (x + vstep); 
            cl_uint top     = (y           <  hstep) ? 0         : (y - hstep);
            cl_uint bottom  = ((y + hstep) >= height)? height - 1: (y + hstep); 

            /*
             * initializing the weighted sum value
             */
            cl_float sumFX = 0;

            for(cl_uint i = left; i <= right; ++i)
                for(cl_uint j = top ; j <= bottom; ++j)
                {
                    /*
                     * performing the weighted sum within the mask boundaries
                     */
                    cl_uint maskIndex = (j - (y - hstep)) * maskWidth  + (i - (x - vstep));
                    cl_uint index     = j                 * width      + i;
                    
                    sumFX += ((float)input[index] * mask[maskIndex]);
                }

            /*
             * add 0.5f so that the conversion to cl_uint below rounds to the nearest integer
             */
            sumFX += 0.5f;
            output[y*width + x] = cl_uint(sumFX);
        }
}
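A minimal usage sketch for the reference routine above; the image size and the 3x3 box filter are illustrative only, and the surrounding SimpleConvolution sample class is assumed to be available:

#include <vector>
#include <CL/cl.h>

// Illustrative driver: convolve an 8x8 image of ones with a 3x3 box filter
// using the CPU reference above. Only the calling convention is taken from
// the sample; sizes and values are made up.
void runReferenceConvolution(SimpleConvolution & sample)
{
    const cl_uint width = 8, height = 8;
    const cl_uint maskWidth = 3, maskHeight = 3;

    std::vector<cl_uint>  input(width * height, 1u);
    std::vector<cl_uint>  output(width * height, 0u);
    std::vector<cl_float> mask(maskWidth * maskHeight, 1.0f / 9.0f);

    sample.simpleConvolutionCPUReference(&output[0], &input[0], &mask[0],
                                         width, height, maskWidth, maskHeight);
}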
Example #6
void nmf(viennacl::matrix_base<NumericT> const & V,
         viennacl::matrix_base<NumericT>       & W,
         viennacl::matrix_base<NumericT>       & H,
         viennacl::linalg::nmf_config const & conf)
{
  viennacl::hsa::context & ctx = const_cast<viennacl::hsa::context &>(viennacl::traits::hsa_context(V));

  const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";

  viennacl::linalg::opencl::kernels::nmf<NumericT, viennacl::hsa::context>::init(ctx);

  vcl_size_t k = W.size2();
  conf.iters_ = 0;

  if (viennacl::linalg::norm_frobenius(W) <= 0)
    W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1), ctx);

  if (viennacl::linalg::norm_frobenius(H) <= 0)
    H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1), ctx);

  viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major(), ctx);
  viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major(), ctx);
  viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major(), ctx);

  viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major(), ctx);
  viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major(), ctx);
  viennacl::matrix_base<NumericT> htmp(k, k, H.row_major(), ctx);

  viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major(), ctx);

  NumericT last_diff = 0;
  NumericT diff_init = 0;
  bool stagnation_flag = false;

  for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
  {
    conf.iters_ = i + 1;
    {
      hn = viennacl::linalg::prod(trans(W), V);
      htmp = viennacl::linalg::prod(trans(W), W);
      hd = viennacl::linalg::prod(htmp, H);

      viennacl::hsa::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL);
      viennacl::hsa::enqueue(mul_div_kernel(H, hn, hd, cl_uint(H.internal_size1() * H.internal_size2())));
    }
    {
      wn = viennacl::linalg::prod(V, trans(H));
      wtmp = viennacl::linalg::prod(W, H);
      wd = viennacl::linalg::prod(wtmp, trans(H));

      viennacl::hsa::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL);

      viennacl::hsa::enqueue(mul_div_kernel(W, wn, wd, cl_uint(W.internal_size1() * W.internal_size2())));
    }

    if (i % conf.check_after_steps() == 0)  //check for convergence
    {
      appr = viennacl::linalg::prod(W, H);

      appr -= V;
      NumericT diff_val = viennacl::linalg::norm_frobenius(appr);

      if (i == 0)
        diff_init = diff_val;

      if (conf.print_relative_error())
        std::cout << diff_val / diff_init << std::endl;

      // Approximation check
      if (diff_val / diff_init < conf.tolerance())
        break;

      // Stagnation check
      if (std::fabs(diff_val - last_diff) / (diff_val * NumericT(conf.check_after_steps())) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
      {
        if (stagnation_flag)    // iteration stagnates (two iterates with no notable progress)
          break;
        else
          // record stagnation in this iteration
          stagnation_flag = true;
      } else
        // good progress in this iteration, so unset stagnation flag
        stagnation_flag = false;

      // prepare for next iterate:
      last_diff = diff_val;
    }
  }
}
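The el_wise_mul_div kernel source is not part of this excerpt. Judging from the call sites above (matrix, numerator, denominator, element count), it presumably applies the elementwise multiplicative NMF update; a hedged OpenCL sketch, using float in place of the generic NumericT:

// Sketch only: H[i] *= hn[i] / hd[i] for all elements, matching the call
// mul_div_kernel(H, hn, hd, size). The real ViennaCL kernel may differ,
// e.g. in how it guards against division by zero.
__kernel void el_wise_mul_div(__global float * matrix,
                              __global const float * numerator,
                              __global const float * denominator,
                              unsigned int size)
{
  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
  {
    float denom = denominator[i];
    if (denom > 0.0f)
      matrix[i] *= numerator[i] / denom;
  }
}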
Example #7
 inline cl_uint make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
 {
   return static_cast<cl_uint>( ((length > 1) ? (cl_uint(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
 }
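make_options() packs three values into one cl_uint: bit 0 carries flip_sign, bit 1 carries reciprocal, and bits 2 and above carry the length (with length <= 1 encoded as 0). A small decoding sketch; the names are illustrative and not part of the ViennaCL API:

#include <CL/cl.h>

// Hypothetical inverse of make_options():
//   bit 0      -> flip_sign
//   bit 1      -> reciprocal
//   bits 2..31 -> length (0 means length == 1)
struct unpacked_options
{
  bool    flip_sign;
  bool    reciprocal;
  cl_uint length;
};

inline unpacked_options unpack_options(cl_uint options)
{
  unpacked_options result;
  result.flip_sign  = (options & 1u) != 0;
  result.reciprocal = (options & 2u) != 0;
  cl_uint len       = options >> 2;
  result.length     = (len == 0) ? 1u : len;
  return result;
}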
Example #8
    void nmf(viennacl::matrix<ScalarType> const & v,
             viennacl::matrix<ScalarType> & w,
             viennacl::matrix<ScalarType> & h,
             std::size_t k,
             ScalarType eps = 0.000001,
             std::size_t max_iter = 10000,
             std::size_t check_diff_every_step = 100)
    {
      viennacl::linalg::kernels::nmf<ScalarType, 1>::init();
      
      w.resize(v.size1(), k);
      h.resize(k, v.size2());

      std::vector<ScalarType> stl_w(w.internal_size1() * w.internal_size2());
      std::vector<ScalarType> stl_h(h.internal_size1() * h.internal_size2());

      for (std::size_t j = 0; j < stl_w.size(); j++)
          stl_w[j] = static_cast<ScalarType>(rand()) / RAND_MAX;

      for (std::size_t j = 0; j < stl_h.size(); j++)
          stl_h[j] = static_cast<ScalarType>(rand()) / RAND_MAX;

      viennacl::matrix<ScalarType> wn(v.size1(), k);
      viennacl::matrix<ScalarType> wd(v.size1(), k);
      viennacl::matrix<ScalarType> wtmp(v.size1(), v.size2());

      viennacl::matrix<ScalarType> hn(k, v.size2());
      viennacl::matrix<ScalarType> hd(k, v.size2());
      viennacl::matrix<ScalarType> htmp(k, k);

      viennacl::matrix<ScalarType> appr(v.size1(), v.size2());
      viennacl::vector<ScalarType> diff(v.size1() * v.size2());

      viennacl::fast_copy(&stl_w[0], &stl_w[0] + stl_w.size(), w);
      viennacl::fast_copy(&stl_h[0], &stl_h[0] + stl_h.size(), h);

      ScalarType last_diff = 0.0f;


      
      for (std::size_t i = 0; i < max_iter; i++)
      {
        {
          hn = viennacl::linalg::prod(trans(w), v);
          htmp = viennacl::linalg::prod(trans(w), w);
          hd = viennacl::linalg::prod(htmp, h);

          viennacl::ocl::kernel & mul_div_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), 
                                                                             NMF_MUL_DIV_KERNEL);
          viennacl::ocl::enqueue(mul_div_kernel(h, hn, hd, cl_uint(stl_h.size())));
        }
        {
          wn = viennacl::linalg::prod(v, trans(h));
          wtmp = viennacl::linalg::prod(w, h);
          wd = viennacl::linalg::prod(wtmp, trans(h));

          viennacl::ocl::kernel & mul_div_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), 
                                                                             NMF_MUL_DIV_KERNEL);
          
          viennacl::ocl::enqueue(mul_div_kernel(w, wn, wd, cl_uint(stl_w.size())));
        }

        if (i % check_diff_every_step == 0)
        {
          appr = viennacl::linalg::prod(w, h);

          viennacl::ocl::kernel & sub_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(),
                                                                         NMF_SUB_KERNEL);
          // workaround: store the elementwise difference of the two matrices in a vector so that norm_2 can be applied
          viennacl::ocl::enqueue(sub_kernel(appr, v, diff, cl_uint(v.size1() * v.size2())));
          ScalarType diff_val = viennacl::linalg::norm_2(diff);

          if((diff_val < eps) || (fabs(diff_val - last_diff) < eps))
          {
              //std::cout << "Breaked at diff - " << diff_val << "\n";
              break;
          }

          last_diff = diff_val;

          //printf("Iteration #%lu - %.5f \n", i, diff_val);
        }
      }
      
      
    }
Example #9
    VectorType solve(const MatrixType & matrix, VectorType const & rhs, mixed_precision_cg_tag const & tag)
    {
      //typedef typename VectorType::value_type      ScalarType;
      typedef typename viennacl::result_of::value_type<VectorType>::type        ScalarType;
      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;

      //TODO: Assert CPU_ScalarType == double

      //std::cout << "Starting CG" << std::endl;
      vcl_size_t problem_size = viennacl::traits::size(rhs);
      VectorType result(rhs);
      viennacl::traits::clear(result);

      VectorType residual = rhs;

      CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(rhs, rhs);
      CPU_ScalarType new_ip_rr = 0;
      CPU_ScalarType norm_rhs_squared = ip_rr;

      if (norm_rhs_squared == 0) //solution is zero if RHS norm is zero
        return result;

      static bool first = true;

      viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
      if (first)
      {
        first = false;  // ensure the conversion program is only added once
        ctx.add_program(double_float_conversion_program, "double_float_conversion_program");
      }

      viennacl::vector<float> residual_low_precision(problem_size, viennacl::traits::context(rhs));
      viennacl::vector<float> result_low_precision(problem_size, viennacl::traits::context(rhs));
      viennacl::vector<float> p_low_precision(problem_size, viennacl::traits::context(rhs));
      viennacl::vector<float> tmp_low_precision(problem_size, viennacl::traits::context(rhs));
      float inner_ip_rr = static_cast<float>(ip_rr);
      float new_inner_ip_rr = 0;
      float initial_inner_rhs_norm_squared = static_cast<float>(ip_rr);
      float alpha;
      float beta;

      viennacl::ocl::kernel & assign_double_to_float      = ctx.get_kernel("double_float_conversion_program", "assign_double_to_float");
      viennacl::ocl::kernel & inplace_add_float_to_double = ctx.get_kernel("double_float_conversion_program", "inplace_add_float_to_double");

      // transfer rhs to single precision:
      viennacl::ocl::enqueue( assign_double_to_float(p_low_precision.handle().opencl_handle(),
                                                     rhs.handle().opencl_handle(),
                                                     cl_uint(rhs.size())
                                                    ) );
      //std::cout << "copying p_low_precision..." << std::endl;
      //assign_double_to_float(p_low_precision.handle(), residual.handle(), residual.size());
      residual_low_precision = p_low_precision;

      // transfer matrix to single precision:
      viennacl::compressed_matrix<float> matrix_low_precision(matrix.size1(), matrix.size2(), matrix.nnz(), viennacl::traits::context(rhs));
      viennacl::backend::memory_copy(matrix.handle1(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle1()), 0, 0, sizeof(cl_uint) * (matrix.size1() + 1) );
      viennacl::backend::memory_copy(matrix.handle2(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle2()), 0, 0, sizeof(cl_uint) * (matrix.nnz()) );

      viennacl::ocl::enqueue( assign_double_to_float(matrix_low_precision.handle().opencl_handle(),
                                                     matrix.handle().opencl_handle(),
                                                     cl_uint(matrix.nnz())
                                                    ) );
      //std::cout << "copying matrix_low_precision..." << std::endl;
      //assign_double_to_float(const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle()), matrix.handle(), matrix.nnz());

      //std::cout << "Starting CG solver iterations... " << std::endl;


      for (unsigned int i = 0; i < tag.max_iterations(); ++i)
      {
        tag.iters(i+1);

        // lower precision 'inner iteration'
        tmp_low_precision = viennacl::linalg::prod(matrix_low_precision, p_low_precision);

        alpha = inner_ip_rr / viennacl::linalg::inner_prod(tmp_low_precision, p_low_precision);
        result_low_precision += alpha * p_low_precision;
        residual_low_precision -= alpha * tmp_low_precision;

        new_inner_ip_rr = viennacl::linalg::inner_prod(residual_low_precision, residual_low_precision);

        beta = new_inner_ip_rr / inner_ip_rr;
        inner_ip_rr = new_inner_ip_rr;

        p_low_precision = residual_low_precision + beta * p_low_precision;



        if (new_inner_ip_rr < tag.inner_tolerance() * initial_inner_rhs_norm_squared || i == tag.max_iterations()-1)
        {
          //std::cout << "outer correction at i=" << i << std::endl;
          //result += result_low_precision;
          viennacl::ocl::enqueue( inplace_add_float_to_double(result.handle().opencl_handle(),
                                                              result_low_precision.handle().opencl_handle(),
                                                              cl_uint(result.size())
                                                             ) );

          // residual = b - Ax  (without introducing a temporary)
          residual = viennacl::linalg::prod(matrix, result);
          residual = rhs - residual;

          new_ip_rr = viennacl::linalg::inner_prod(residual, residual);
          if (new_ip_rr / norm_rhs_squared < tag.tolerance() *  tag.tolerance())//squared norms involved here
            break;

          // p_low_precision = residual;
          viennacl::ocl::enqueue( assign_double_to_float(p_low_precision.handle().opencl_handle(),
                                                         residual.handle().opencl_handle(),
                                                         cl_uint(residual.size())
                                                        ) );
          result_low_precision.clear();
          residual_low_precision = p_low_precision;
          initial_inner_rhs_norm_squared = static_cast<float>(new_ip_rr);
          inner_ip_rr = static_cast<float>(new_ip_rr);
        }
      }

      //store last error estimate:
      tag.error(std::sqrt(new_ip_rr / norm_rhs_squared));

      return result;
    }
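The double_float_conversion_program source referenced above is not shown in this excerpt. Based on the call sites (destination, source, element count), its two kernels presumably look roughly as follows; this is a sketch, not the actual ViennaCL source:

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

// Sketch of assign_double_to_float(dst, src, size): downcast a double buffer
// into a float buffer of the same length.
__kernel void assign_double_to_float(__global float * dst,
                                     __global const double * src,
                                     unsigned int size)
{
  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
    dst[i] = (float)src[i];
}

// Sketch of inplace_add_float_to_double(dst, src, size): accumulate the
// single-precision correction into the double-precision result.
__kernel void inplace_add_float_to_double(__global double * dst,
                                          __global const float * src,
                                          unsigned int size)
{
  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
    dst[i] += (double)src[i];
}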
Example #10
 static void dump(viennacl::backend::mem_handle const & buff, uniform_tag tag, cl_uint start, cl_uint size){
   viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::rand<ScalarType,1>::program_name(),"dump_uniform");
   k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(size,k.local_work_size(0)));
   viennacl::ocl::enqueue(k(buff.opencl_handle(), start, size, cl_float(tag.a), cl_float(tag.b) , cl_uint(time(0))));
 }
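The dump_uniform kernel itself is not shown. From the enqueue call (buffer, start, size, a, b, seed) it presumably fills buffer[start .. start+size) with uniform values in [a, b); a heavily simplified sketch with a throwaway per-thread generator, assuming a float buffer:

// Sketch only: the actual generator in the ViennaCL rand extension is
// certainly different; this just illustrates the argument layout.
__kernel void dump_uniform(__global float * buffer,
                           unsigned int start,
                           unsigned int size,
                           float a,
                           float b,
                           unsigned int seed)
{
  unsigned int i = get_global_id(0);
  if (i < size)
  {
    unsigned int state = seed ^ (i * 2654435761u);   // per-thread seed
    state = state * 1664525u + 1013904223u;          // one LCG step
    float u = (state >> 8) * (1.0f / 16777216.0f);   // map to [0, 1)
    buffer[start + i] = a + (b - a) * u;
  }
}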
Example #11
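/**
* The kernel source string 'my_compute_program' and the 'elementwise_prod' kernel used
* below are defined outside this excerpt. A minimal sketch consistent with the
* clSetKernelArg() calls further down (two input buffers, one output buffer, one
* element count) could look as follows; the tutorial's actual source may differ.
**/
static const char * my_compute_program =
  "__kernel void elementwise_prod(__global const float * vec1,\n"
  "                               __global const float * vec2,\n"
  "                               __global float * result,\n"
  "                               unsigned int size)\n"
  "{\n"
  "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  "    result[i] = vec1[i] * vec2[i];\n"
  "}\n";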
/**
* With this let us go right to main():
**/
int main()
{
  typedef float       ScalarType;


  /**
  * <h2>Part 1: Set up a custom context</h2>
  *
  * The following is rather lengthy because OpenCL is a fairly low-level framework.
  * For comparison, the subsequent code explicitly performs the OpenCL setup that is done
  * in the background within the 'custom_kernels'-tutorial
  **/

  //manually set up a custom OpenCL context:
  std::vector<cl_device_id> device_id_array;

  //get all available devices
  viennacl::ocl::platform pf;
  std::cout << "Platform info: " << pf.info() << std::endl;
  std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT);
  std::cout << devices[0].name() << std::endl;
  std::cout << "Number of devices for custom context: " << devices.size() << std::endl;

  //set up context using all found devices:
  for (std::size_t i=0; i<devices.size(); ++i)
  {
      device_id_array.push_back(devices[i].id());
  }

  std::cout << "Creating context..." << std::endl;
  cl_int err;
  cl_context my_context = clCreateContext(0, cl_uint(device_id_array.size()), &(device_id_array[0]), NULL, NULL, &err);
  VIENNACL_ERR_CHECK(err);


  //create two Vectors:
  unsigned int vector_size = 10;
  std::vector<ScalarType> vec1(vector_size);
  std::vector<ScalarType> vec2(vector_size);
  std::vector<ScalarType> result(vector_size);

  //
  // fill the operands vec1 and vec2:
  //
  for (unsigned int i=0; i<vector_size; ++i)
  {
    vec1[i] = static_cast<ScalarType>(i);
    vec2[i] = static_cast<ScalarType>(vector_size-i);
  }

  //
  // create memory in OpenCL context:
  //
  cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
  VIENNACL_ERR_CHECK(err);
  cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
  VIENNACL_ERR_CHECK(err);
  cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);
  VIENNACL_ERR_CHECK(err);

  //
  // create a command queue for each device:
  //

  std::vector<cl_command_queue> queues(devices.size());
  for (std::size_t i=0; i<devices.size(); ++i)
  {
    queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);
    VIENNACL_ERR_CHECK(err);
  }

  //
  // create and build a program in the context:
  //
  std::size_t source_len = std::string(my_compute_program).length();
  cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
  err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);

/*            char buffer[1024];
            cl_build_status status;
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL);
            std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl;
            std::cout << "Log: " << buffer << std::endl;*/

  VIENNACL_ERR_CHECK(err);

  //
  // create a kernel from the program:
  //
  const char * kernel_name = "elementwise_prod";
  cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);
  VIENNACL_ERR_CHECK(err);


  //
  // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2;
  //
  err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
  VIENNACL_ERR_CHECK(err);
  err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
  VIENNACL_ERR_CHECK(err);
  std::size_t global_size = vector_size;
  std::size_t local_size = vector_size;
  err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);


  //
  // Read and output result:
  //
  err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);
  err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(err);

  std::cout << "vec1  : ";
  for (std::size_t i=0; i<vec1.size(); ++i)
    std::cout << vec1[i] << " ";
  std::cout << std::endl;

  std::cout << "vec2  : ";
  for (std::size_t i=0; i<vec2.size(); ++i)
    std::cout << vec2[i] << " ";
  std::cout << std::endl;

  std::cout << "result: ";
  for (std::size_t i=0; i<result.size(); ++i)
    std::cout << result[i] << " ";
  std::cout << std::endl;

  /**
  * <h2>Part 2: Reuse Custom OpenCL Context with ViennaCL</h2>
  *
  * To let ViennaCL reuse the previously created context, we need to make it known to ViennaCL \em before any ViennaCL objects are created.
  * We inject the custom context as the context with default id '0' via viennacl::ocl::setup_context() and then activate it with viennacl::ocl::switch_context().
  **/
  viennacl::ocl::setup_context(0, my_context, device_id_array, queues);
  viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero)

  /**
  * Check that ViennaCL really uses the new context:
  **/
  std::cout << "Existing context: " << my_context << std::endl;
  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl;

  /**
  * Wrap existing OpenCL objects into ViennaCL:
  **/
  viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size);
  viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size);
  viennacl::vector<ScalarType> vcl_result(mem_result, vector_size);
  viennacl::scalar<ScalarType> vcl_s = 2.0;

  std::cout << "Standard vector operations within ViennaCL:" << std::endl;
  vcl_result = vcl_s * vcl_vec1 + vcl_vec2;

  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;

  /**
  * We can also reuse the existing elementwise_prod kernel.
  * To do so, we first have to make the existing program known to ViennaCL.
  * For more details on the following three lines, see the tutorial 'custom-kernels'.
  **/
  std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
  viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program");
  viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel(my_kernel, "elementwise_prod");
  viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size())));  //Note that std::size_t might differ between host and device. Thus, a cast to cl_uint is necessary here.

  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;


  /**
  * Since a linear piece of memory can be interpreted in several ways,
  * we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1.
  * The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products:
  **/
  viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3);

  vcl_vec2.resize(3);   //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
  vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_result is now at a different memory location (values are copied)
  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2);

  std::cout << "result of matrix-vector product: ";
  std::cout << vcl_result << std::endl;

  /**
  *  Any further operations can be carried out in the same way.
  *  Just keep in mind that any resizing of vectors or matrices leads to a reallocation of the underlying memory buffer, so the 'wrapper' around the original OpenCL buffer is lost.
  **/
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;

  return EXIT_SUCCESS;
}