void inner_prod_impl(vector_base<NumericT> const & x, vector_tuple<NumericT> const & vec_tuple, vector_base<NumericT> & result) { typedef NumericT value_type; value_type const * data_x = detail::extract_raw_pointer<value_type>(x); vcl_size_t start_x = viennacl::traits::start(x); vcl_size_t inc_x = viennacl::traits::stride(x); vcl_size_t size_x = viennacl::traits::size(x); std::vector<value_type> temp(vec_tuple.const_size()); std::vector<value_type const *> data_y(vec_tuple.const_size()); std::vector<vcl_size_t> start_y(vec_tuple.const_size()); std::vector<vcl_size_t> stride_y(vec_tuple.const_size()); for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j) { data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j)); start_y[j] = viennacl::traits::start(vec_tuple.const_at(j)); stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j)); } // Note: No OpenMP here because it cannot perform a reduction on temp-array. Savings in memory bandwidth are expected to still justify this approach... for (vcl_size_t i = 0; i < size_x; ++i) { value_type entry_x = data_x[i*inc_x+start_x]; for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j) temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]]; } for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j) result[j] = temp[j]; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation }
void inner_prod_impl(vector_base<T> const & x, vector_tuple<T> const & y_tuple, vector_base<T> & result) { assert( x.size() == y_tuple.const_at(0).size() && bool("Size mismatch") ); assert( result.size() == y_tuple.const_size() && bool("Number of elements does not match result size") ); switch (viennacl::traits::handle(x).get_active_handle_id()) { case viennacl::MAIN_MEMORY: viennacl::linalg::host_based::inner_prod_impl(x, y_tuple, result); break; #ifdef VIENNACL_WITH_OPENCL case viennacl::OPENCL_MEMORY: viennacl::linalg::opencl::inner_prod_impl(x, y_tuple, result); break; #endif #ifdef VIENNACL_WITH_CUDA case viennacl::CUDA_MEMORY: viennacl::linalg::cuda::inner_prod_impl(x, y_tuple, result); break; #endif default: throw "not implemented"; } }