コード例 #1
0
    /**
     * Dispatches inner_prod_impl(x, y_tuple, result) to the backend that matches the
     * memory handle currently active for x.
     *
     * @param x        Vector whose active memory handle selects the backend.
     * @param y_tuple  Tuple of vectors forwarded unchanged to the backend.
     * @param result   Output vector forwarded to the backend; asserted to hold exactly
     *                 y_tuple.const_size() entries.
     *
     * Throws the string literal "not implemented" (a const char*) when the active handle
     * id matches none of the backends compiled into this build.
     */
    void inner_prod_impl(vector_base<T> const & x,
                         vector_tuple<T> const & y_tuple,
                         vector_base<T> & result)
    {
      // Sanity checks (compiled out under NDEBUG). Note that only the first tuple entry
      // is compared against the size of x.
      assert( x.size() == y_tuple.const_at(0).size() && bool("Size mismatch") );
      assert( result.size() == y_tuple.const_size() && bool("Number of elements does not match result size") );

      // Each recognized memory domain hands off to its backend and leaves immediately.
      switch (viennacl::traits::handle(x).get_active_handle_id())
      {
      case viennacl::MAIN_MEMORY:
        viennacl::linalg::host_based::inner_prod_impl(x, y_tuple, result);
        return;

#ifdef VIENNACL_WITH_OPENCL
      // Only available when the build enables OpenCL support.
      case viennacl::OPENCL_MEMORY:
        viennacl::linalg::opencl::inner_prod_impl(x, y_tuple, result);
        return;
#endif

#ifdef VIENNACL_WITH_CUDA
      // Only available when the build enables CUDA support.
      case viennacl::CUDA_MEMORY:
        viennacl::linalg::cuda::inner_prod_impl(x, y_tuple, result);
        return;
#endif

      default:
        // Unknown handle id, or a backend that was not compiled in.
        throw "not implemented";
      }
    }
コード例 #2
0
/**
 * Computes, for every vector y_j in vec_tuple, the sum over i of x[i] * y_j[i] and
 * stores it in result[j].
 *
 * All vectors are accessed through raw element pointers using their individual start
 * offset and stride. The number of summed elements is taken from x alone; the sizes
 * of the tuple entries are not checked inside this function.
 *
 * @param x          Left-hand operand shared by all inner products.
 * @param vec_tuple  Right-hand operands; one inner product is formed per entry.
 * @param result     Output; entries 0 .. vec_tuple.const_size()-1 are overwritten.
 */
void inner_prod_impl(vector_base<NumericT> const & x,
                     vector_tuple<NumericT> const & vec_tuple,
                     vector_base<NumericT> & result)
{
  // Raw buffer and layout (offset, stride, length) of x.
  NumericT const * const x_buf = detail::extract_raw_pointer<NumericT>(x);

  vcl_size_t const x_offset = viennacl::traits::start(x);
  vcl_size_t const x_stride = viennacl::traits::stride(x);
  vcl_size_t const x_len    = viennacl::traits::size(x);

  // The tuple size is used by every loop below, so query it once.
  vcl_size_t const num_vecs = vec_tuple.const_size();

  // Per-vector accumulators (value-initialized by std::vector) and cached layouts.
  std::vector<NumericT>         partial(num_vecs);
  std::vector<NumericT const *> y_buf(num_vecs);
  std::vector<vcl_size_t>       y_offset(num_vecs);
  std::vector<vcl_size_t>       y_stride(num_vecs);

  for (vcl_size_t k = 0; k < num_vecs; ++k)
  {
    y_buf[k]    = detail::extract_raw_pointer<NumericT>(vec_tuple.const_at(k));
    y_offset[k] = viennacl::traits::start(vec_tuple.const_at(k));
    y_stride[k] = viennacl::traits::stride(vec_tuple.const_at(k));
  }

  // Note: No OpenMP here because it cannot perform a reduction on the accumulator array.
  // Each entry of x is loaded once and reused for all tuple vectors; the savings in
  // memory bandwidth are expected to still justify this approach.
  for (vcl_size_t row = 0; row < x_len; ++row)
  {
    NumericT x_val = x_buf[row*x_stride + x_offset];
    for (vcl_size_t k = 0; k < num_vecs; ++k)
      partial[k] += x_val * y_buf[k][row*y_stride[k] + y_offset[k]];
  }

  // Assignment to result might be expensive, hence accumulation happened in 'partial'
  // and each result entry is written exactly once.
  for (vcl_size_t k = 0; k < num_vecs; ++k)
    result[k] = partial[k];
}