// Functor body for work item i: for every j in [0, a.dimension_1()),
// stores a.dimension_0()*i + j (computed in double precision) into a(i,j).
//
// Layout note: on CPUs the inner loop over j can be vectorized, so j
// should be the stride-1 index of a for best performance, i.e. a should
// be LayoutRight.  On GPUs threads should issue coalesced loads and
// stores, which means i should be the stride-1 index instead.
KOKKOS_INLINE_FUNCTION
void operator() (const typename ViewType::size_type i) const
{
  // The row-dependent term does not change with j, so evaluate it once.
  const double row_base = 1.0 * a.dimension_0 () * i;

  for (typename ViewType::size_type col = 0; col < a.dimension_1 (); ++col) {
    a(i,col) = row_base + 1.0 * col;
  }
}
// Stores the view v and the scalar s in m_v / m_s, then immediately
// launches this functor with Kokkos::parallel_for using
// m_v.dimension_0() as the work count.  The launch happens inside the
// constructor, so constructing the object is what runs the kernel.
MPVectorAtomicFunctor( const ViewType & v , const scalar_type & s )
  : m_v( v )
  , m_s( s )
{
  Kokkos::parallel_for( this->m_v.dimension_0() , (*this) );
}
GetMeanValsFunc(const ViewType& vals) { mean_vals = ViewType("mean-values", vals.dimension_0()); Kokkos::deep_copy( mean_vals, vals ); }
// Kernel launch static void apply(const ViewType& v, const ScalarType& s) { const size_type nrow = v.dimension_0(); Kokkos::parallel_for( nrow, ScalarAssignKernel(v,s) ); }
// Returns a stride value for the view A.
//
// A.stride() is asked to fill a local array; when A.dimension_1() is
// greater than 1 the entry at index 1 of that array is returned.
// Otherwise (at most one entry along dimension 1) A.dimension_0() is
// returned instead.
// NOTE(review): presumably A is a rank-2 view and index 1 is its column
// stride -- confirm against the callers.
size_t getStride2DView (ViewType A)
{
  size_t strides[8];
  A.stride (strides);

  if (A.dimension_1 () > 1) {
    return strides[1];
  }
  return A.dimension_0 ();
}