OutputIterator transform(ExecutionPolicy &sep, Iterator b, Iterator e, OutputIterator out, UnaryOperation op) { { cl::sycl::queue q(sep.get_queue()); auto device = q.get_device(); size_t local = device.get_info<cl::sycl::info::device::max_work_group_size>(); auto bufI = sycl::helpers::make_const_buffer(b, e); auto bufO = sycl::helpers::make_buffer(out, out + bufI.get_count()); auto vectorSize = bufI.get_count(); size_t global = sep.calculateGlobalSize(vectorSize, local); auto f = [vectorSize, local, global, &bufI, &bufO, op]( cl::sycl::handler &h) mutable { cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1}, cl::sycl::range<3>{local, 1, 1}}; auto aI = bufI.template get_access<cl::sycl::access::mode::read>(h); auto aO = bufO.template get_access<cl::sycl::access::mode::write>(h); h.parallel_for<typename ExecutionPolicy::kernelName>( r, [aI, aO, op, vectorSize](cl::sycl::nd_item<3> id) { if ((id.get_global(0) < vectorSize)) { aO[id.get_global(0)] = op(aI[id.get_global(0)]); } }); }; q.submit(f); } return out; }
OutputIterator transform(ExecutionPolicy &sep, InputIterator first1, InputIterator last1, InputIterator first2, OutputIterator result, BinaryOperation op) { cl::sycl::queue q(sep.get_queue()); auto device = q.get_device(); size_t local = device.get_info<cl::sycl::info::device::max_work_group_size>(); auto buf1 = sycl::helpers::make_const_buffer(first1, last1); auto n = buf1.get_count(); auto buf2 = sycl::helpers::make_const_buffer(first2, first2 + n); auto res = sycl::helpers::make_buffer(result, result + n); size_t global = sep.calculateGlobalSize(n, local); auto f = [n, local, global, &buf1, &buf2, &res, op](cl::sycl::handler &h) mutable { cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1}, cl::sycl::range<3>{local, 1, 1}}; auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h); auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h); auto aO = res.template get_access<cl::sycl::access::mode::write>(h); h.parallel_for<typename ExecutionPolicy::kernelName>( r, [a1, a2, aO, op, n](cl::sycl::nd_item<3> id) { if (id.get_global(0) < n) { aO[id.get_global(0)] = op(a1[id.get_global(0)], a2[id.get_global(0)]); } }); }; q.submit(f); return first2 + n; }
typename std::iterator_traits<Iterator>::value_type reduce( ExecutionPolicy &sep, Iterator b, Iterator e, T init, BinaryOperation bop) { cl::sycl::queue q(sep.get_queue()); auto vectorSize = std::distance(b, e); if (vectorSize < 1) { return init; } auto device = q.get_device(); auto local = device.get_info<cl::sycl::info::device::max_work_group_size>(); typedef typename std::iterator_traits<Iterator>::value_type type_; auto bufI = sycl::helpers::make_const_buffer(b, e); size_t length = vectorSize; size_t global = sep.calculateGlobalSize(length, local); do { auto f = [length, local, global, &bufI, bop](cl::sycl::handler &h) mutable { cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1}, cl::sycl::range<3>{local, 1, 1}}; auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h); cl::sycl::accessor<type_, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> scratch(cl::sycl::range<1>(local), h); h.parallel_for<typename ExecutionPolicy::kernelName>( r, [aI, scratch, local, length, bop](cl::sycl::nd_item<3> id) { int globalid = id.get_global(0); int localid = id.get_local(0); auto r = ReductionStrategy<T>(local, length, id, scratch); r.workitem_get_from(aI); r.combine_threads(bop); r.workgroup_write_to(aI); }); }; q.submit(f); length = length / local; } while (length > 1); q.wait_and_throw(); auto hI = bufI.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>(); return hI[0] + init; }
typename std::enable_if< std::is_same< agency::detail::execution_policy_execution_category_t<ExecutionPolicy>, parallel_execution_tag >::value, cuda::parallel_execution_policy >::type replace_executor(const ExecutionPolicy& policy, const cuda::parallel_executor& exec) { return cuda::parallel_execution_policy(policy.param(), exec); }
bulk_async_execution_policy_result_t< ExecutionPolicy, Function, Args... > bulk_async_execution_policy(index_sequence<UserArgIndices...>, index_sequence<SharedArgIndices...>, ExecutionPolicy& policy, Function f, Args&&... args) { using agent_type = typename ExecutionPolicy::execution_agent_type; using agent_traits = execution_agent_traits<agent_type>; using execution_category = typename agent_traits::execution_category; // get the parameters of the agent auto param = policy.param(); auto agent_shape = agent_traits::domain(param).shape(); // this is a tuple of factories // each factory in the tuple creates the execution agent's shared parameter at the corresponding hierarchy level auto agent_shared_parameter_factory_tuple = detail::make_agent_shared_parameter_factory_tuple<agent_type>(param); using executor_type = typename ExecutionPolicy::executor_type; // convert the shape of the agent into the type of the executor's shape using executor_shape_type = executor_shape_t<executor_type>; executor_shape_type executor_shape = detail::shape_cast<executor_shape_type>(agent_shape); // create the function that will marshal parameters received from bulk_invoke(executor) and execute the agent auto lambda = execute_agent_functor<executor_type,agent_traits,Function,UserArgIndices...>{param, agent_shape, executor_shape, f}; return detail::bulk_async_executor( policy.executor(), executor_shape, lambda, std::forward<Args>(args)..., agency::share_at_scope_from_factory<SharedArgIndices>(detail::get<SharedArgIndices>(agent_shared_parameter_factory_tuple))... ); }
basic_execution_policy< typename ExecutionPolicy::execution_agent_type, Executor > replace_executor(const ExecutionPolicy& policy, const Executor& exec) { using policy_category = detail::execution_policy_execution_category_t<ExecutionPolicy>; using executor_category = executor_execution_category_t<Executor>; static_assert(detail::is_weaker_than<policy_category, executor_category>::value, "replace_executor(): Execution policy's forward progress requirements cannot be satisfied by executor's guarantees."); using result_type = basic_execution_policy< typename ExecutionPolicy::execution_agent_type, Executor >; return result_type(policy.param(), exec); }
InputIterator for_each_n(ExecutionPolicy &exec, InputIterator first, Size n, Function f) { cl::sycl::queue q(exec.get_queue()); if (n > 0) { auto last(first + n); auto bufI = sycl::helpers::make_buffer(first, last); auto vectorSize = bufI.get_count(); auto cg = [vectorSize, &bufI, f](cl::sycl::handler &h) mutable { cl::sycl::range<3> r{vectorSize, 1, 1}; auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h); h.parallel_for<typename ExecutionPolicy::kernelName>( r, [aI, f](cl::sycl::id<3> id) { f(aI[id.get(0)]); }); }; q.submit(cg); return last; } else { return first; } }
void transform(ExecutionPolicy &sep, cl::sycl::queue &q, Buffer &buf1, Buffer &buf2, Buffer &res, BinaryOperation op) { auto device = q.get_device(); size_t local = device.get_info<cl::sycl::info::device::max_work_group_size>(); auto n = buf1.get_count(); size_t global = sep.calculateGlobalSize(n, local); auto f = [n, local, global, &buf1, &buf2, &res, op](cl::sycl::handler &h) mutable { cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1}, cl::sycl::range<3>{local, 1, 1}}; auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h); auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h); auto aO = res.template get_access<cl::sycl::access::mode::write>(h); h.parallel_for<class TransformAlgorithm>( r, [a1, a2, aO, op, n](cl::sycl::nd_item<3> id) { if (id.get_global(0) < n) { aO[id.get_global(0)] = op(a1[id.get_global(0)], a2[id.get_global(0)]); } }); }; q.submit(f); }
void test(ExecutionPolicy policy) { using agent = typename ExecutionPolicy::execution_agent_type; using agent_traits = agency::execution_agent_traits<agent>; { // bulk_async with no parameters auto f = agency::bulk_async(policy, [](agent& self) { return 7; }); auto result = f.get(); using executor_type = typename ExecutionPolicy::executor_type; using container_type = agency::executor_container_t<executor_type,int>; auto shape = agent_traits::domain(policy.param()).shape(); assert(container_type(shape,7) == result); } { // bulk_async with one parameter int val = 13; auto f = agency::bulk_async(policy, [](agent& self, int val) { return val; }, val ); auto result = f.get(); using executor_type = typename ExecutionPolicy::executor_type; using container_type = agency::executor_container_t<executor_type,int>; auto shape = agent_traits::domain(policy.param()).shape(); assert(container_type(shape,val) == result); } { // bulk_async with one shared parameter int val = 13; auto f = agency::bulk_async(policy, [](agent& self, int& val) { return val; }, agency::share(val) ); auto result = f.get(); using executor_type = typename ExecutionPolicy::executor_type; using container_type = agency::executor_container_t<executor_type,int>; auto shape = agent_traits::domain(policy.param()).shape(); assert(container_type(shape,val) == result); } }
bool equal(ExecutionPolicy& exec, ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, ForwardIt2 last2, BinaryPredicate p) { cl::sycl::queue q(exec.get_queue()); auto size1 = sycl::helpers::distance(first1, last1); auto size2 = sycl::helpers::distance(first2, last2); if (size1 != size2) { return false; } if (size1 < 1) { return true; } auto device = q.get_device(); auto length = size1; auto ndRange = exec.calculateNdRange(size1); const auto local = ndRange.get_local_range()[0]; auto buf1 = sycl::helpers::make_const_buffer(first1, last1); auto buf2 = sycl::helpers::make_const_buffer(first2, last2); auto bufR = cl::sycl::buffer<bool, 1>(cl::sycl::range<1>(size1)); do { int passes = 0; auto f = [passes, length, ndRange, local, &buf1, &buf2, &bufR, p](cl::sycl::handler& h) mutable { auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h); auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h); auto aR = bufR.template get_access<cl::sycl::access::mode::read_write>(h); cl::sycl::accessor<bool, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> scratch(ndRange.get_local_range(), h); h.parallel_for<typename ExecutionPolicy::kernelName>( ndRange, [a1, a2, aR, scratch, passes, local, length, p](cl::sycl::nd_item<1> id) { auto r = ReductionStrategy<bool>(local, length, id, scratch); if (passes == 0) { r.workitem_get_from(p, a1, a2); } else { r.workitem_get_from(aR); } r.combine_threads(std::logical_and<bool>{}); r.workgroup_write_to(aR); }); // end kernel }; // end command group q.submit(f); length = length / local; ndRange = cl::sycl::nd_range<1>{cl::sycl::range<1>(std::max(length, local)), ndRange.get_local_range()}; ++passes; } while (length > 1); q.wait_and_throw(); auto hr = bufR.template get_access<cl::sycl::access::mode::read>( cl::sycl::range<1>{1}, cl::sycl::id<1>{0}); return hr[0]; }
// Ensure we are setting appropriate level of configuration // Automatic to Full Manual spectrum TEST(ExecutionPolicyTest, StateReflectsConfiguration) { // Defaults: fully automatic ExecutionPolicy p; int configState = p.getConfigState(); EXPECT_EQ (ExecutionPolicy::Automatic, configState); EXPECT_NE (ExecutionPolicy::FullManual, configState); EXPECT_EQ (0, configState & ExecutionPolicy::SharedMem); EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize); EXPECT_EQ (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (0, p.getGridSize()); EXPECT_EQ (0, p.getBlockSize()); EXPECT_EQ (0, p.getSharedMemBytes()); // shared memory only p = ExecutionPolicy(); // reset p.setSharedMemBytes( 1024 ); configState = p.getConfigState(); EXPECT_NE (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_NE (0, configState & ExecutionPolicy::SharedMem); EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize); EXPECT_EQ (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (0, p.getGridSize()); EXPECT_EQ (0, p.getBlockSize()); EXPECT_EQ (1024, p.getSharedMemBytes()); // Block Size Only p = ExecutionPolicy(); // reset p.setBlockSize( 512 ); configState = p.getConfigState(); EXPECT_NE (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_EQ (0, configState & ExecutionPolicy::SharedMem); EXPECT_NE (0, configState & ExecutionPolicy::BlockSize); EXPECT_EQ (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (0, p.getGridSize()); EXPECT_EQ (512, p.getBlockSize()); EXPECT_EQ (0, p.getSharedMemBytes()); // Block Size and Shared Memory Only p = ExecutionPolicy(); // reset p.setBlockSize( 512 ); p.setSharedMemBytes( 1024 ); configState = p.getConfigState(); EXPECT_NE (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_NE (0, configState & ExecutionPolicy::SharedMem); EXPECT_NE (0, configState & ExecutionPolicy::BlockSize); EXPECT_EQ (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (0, p.getGridSize()); EXPECT_EQ (512, p.getBlockSize()); EXPECT_EQ (1024, p.getSharedMemBytes()); // Grid Size Only p = ExecutionPolicy(); // reset p.setGridSize( 100 ); configState = p.getConfigState(); EXPECT_NE (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_EQ (0, configState & ExecutionPolicy::SharedMem); EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize); EXPECT_NE (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (100, p.getGridSize()); EXPECT_EQ (0, p.getBlockSize()); EXPECT_EQ (0, p.getSharedMemBytes()); // Grid Size and Shared Memory Only p = ExecutionPolicy(); // reset p.setGridSize( 100 ); p.setSharedMemBytes( 1024 ); configState = p.getConfigState(); EXPECT_NE (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_NE (0, configState & ExecutionPolicy::SharedMem); EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize); EXPECT_NE (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (100, p.getGridSize()); EXPECT_EQ (0, p.getBlockSize()); EXPECT_EQ (1024, p.getSharedMemBytes()); // Grid Size and Block Size Only p = ExecutionPolicy(); // reset p.setGridSize( 100 ); p.setBlockSize( 512 ); configState = p.getConfigState(); EXPECT_NE (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_EQ (0, configState & ExecutionPolicy::SharedMem); EXPECT_NE (0, configState & ExecutionPolicy::BlockSize); EXPECT_NE (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (100, p.getGridSize()); EXPECT_EQ (512, p.getBlockSize()); EXPECT_EQ (0, p.getSharedMemBytes()); // Full Manual Configuration p = ExecutionPolicy{1, 256, 10}; configState = p.getConfigState(); EXPECT_EQ (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_NE (0, configState & ExecutionPolicy::SharedMem); EXPECT_NE (0, configState & ExecutionPolicy::BlockSize); EXPECT_NE (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (1, p.getGridSize()); EXPECT_EQ (256, p.getBlockSize()); EXPECT_EQ (10, p.getSharedMemBytes()); // Full Manual Configuration With Separate Calls p = ExecutionPolicy(); // reset p.setGridSize( 100 ); p.setBlockSize( 512 ); p.setSharedMemBytes( 1024); configState = p.getConfigState(); EXPECT_EQ (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_NE (0, configState & ExecutionPolicy::SharedMem); EXPECT_NE (0, configState & ExecutionPolicy::BlockSize); EXPECT_NE (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (100, p.getGridSize()); EXPECT_EQ (512, p.getBlockSize()); EXPECT_EQ (1024, p.getSharedMemBytes()); p = ExecutionPolicy(); // reset p.setGridSize( 0 ); p.setBlockSize( 0 ); p.setSharedMemBytes( 0 ); configState = p.getConfigState(); // Setting Zero Shared Memory makes it *Manual* EXPECT_NE (ExecutionPolicy::FullManual, configState); EXPECT_NE (ExecutionPolicy::Automatic, configState); EXPECT_NE (0, configState & ExecutionPolicy::SharedMem); // Setting 0 grid or block size makes them *Automatic* EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize); EXPECT_EQ (0, configState & ExecutionPolicy::GridSize); EXPECT_EQ (0, p.getGridSize()); EXPECT_EQ (0, p.getBlockSize()); EXPECT_EQ (0, p.getSharedMemBytes()); // Re-setting block or grid size to >0 should set them to manual p.setGridSize( 100 ); p.setBlockSize( 512 ); configState = p.getConfigState(); EXPECT_NE(0, configState & ExecutionPolicy::GridSize); EXPECT_NE(0, configState & ExecutionPolicy::BlockSize); // Re-setting block or grid size to zero should set them to automatic p.setGridSize( 0 ); p.setBlockSize( 0 ); configState = p.getConfigState(); EXPECT_EQ(0, configState & ExecutionPolicy::GridSize); EXPECT_EQ(0, configState & ExecutionPolicy::BlockSize); // Setting stream (trivial) p.setStream(1); EXPECT_EQ(1, p.getStream()); }