/**
 * @brief Applies a unary operation to every element of [b, e) through a SYCL
 *        kernel submitted to the policy's queue, storing the results at out.
 *
 * @param sep Execution policy; provides the queue (get_queue), the global
 *            size computation (calculateGlobalSize) and the kernel name.
 * @param b   Start of the input range.
 * @param e   End of the input range.
 * @param out Start of the output range; it is wrapped in a buffer of the same
 *            element count as the input range.
 * @param op  Unary operation called on each input element inside the kernel.
 * @return Iterator one past the last element written (out + number of input
 *         elements), following the std::transform contract. For an empty
 *         input range this is out itself.
 */
OutputIterator transform(ExecutionPolicy &sep, Iterator b, Iterator e,
                         OutputIterator out, UnaryOperation op) {
  // Nothing to transform: skip creating buffers and submitting a kernel.
  if (b == e) {
    return out;
  }

  OutputIterator outEnd = out;
  {
    // The queue and buffers live in this inner scope so that they are
    // destroyed before the function returns.
    // NOTE(review): presumably the helper buffers copy results back to the
    // host range on destruction -- confirm against sycl::helpers.
    cl::sycl::queue q(sep.get_queue());
    auto device = q.get_device();
    size_t local =
        device.get_info<cl::sycl::info::device::max_work_group_size>();

    auto bufI = sycl::helpers::make_const_buffer(b, e);
    auto bufO = sycl::helpers::make_buffer(out, out + bufI.get_count());
    auto vectorSize = bufI.get_count();
    outEnd = out + vectorSize;

    size_t global = sep.calculateGlobalSize(vectorSize, local);

    auto f = [vectorSize, local, global, &bufI, &bufO,
              op](cl::sycl::handler &h) mutable {
      // Launch at least one full work-group even if the computed global size
      // is smaller than the work-group size.
      cl::sycl::nd_range<3> r{
          cl::sycl::range<3>{std::max(global, local), 1, 1},
          cl::sycl::range<3>{local, 1, 1}};
      auto aI = bufI.template get_access<cl::sycl::access::mode::read>(h);
      auto aO = bufO.template get_access<cl::sycl::access::mode::write>(h);
      h.parallel_for<typename ExecutionPolicy::kernelName>(
          r, [aI, aO, op, vectorSize](cl::sycl::nd_item<3> id) {
            // Work-items beyond the element count do nothing.
            if ((id.get_global(0) < vectorSize)) {
              aO[id.get_global(0)] = op(aI[id.get_global(0)]);
            }
          });
    };
    q.submit(f);
  }
  return outEnd;
}
/**
 * @brief Applies a binary operation pairwise to [first1, last1) and the
 *        equally long range starting at first2, through a SYCL kernel
 *        submitted to the policy's queue, storing the results at result.
 *
 * @param sep    Execution policy; provides the queue (get_queue), the global
 *               size computation (calculateGlobalSize) and the kernel name.
 * @param first1 Start of the first input range.
 * @param last1  End of the first input range.
 * @param first2 Start of the second input range; as many elements are read
 *               from it as the first range contains.
 * @param result Start of the output range; it is wrapped in a buffer of the
 *               same element count as the first input range.
 * @param op     Binary operation called as op(x, y), x from the first range
 *               and y from the second.
 * @return Iterator one past the last element written (result + number of
 *         input elements), following the std::transform contract. For an
 *         empty input range this is result itself.
 */
OutputIterator transform(ExecutionPolicy &sep, InputIterator first1,
                         InputIterator last1, InputIterator first2,
                         OutputIterator result, BinaryOperation op) {
  // Nothing to transform: skip creating buffers and submitting a kernel.
  if (first1 == last1) {
    return result;
  }

  cl::sycl::queue q(sep.get_queue());
  auto device = q.get_device();
  size_t local =
      device.get_info<cl::sycl::info::device::max_work_group_size>();

  auto buf1 = sycl::helpers::make_const_buffer(first1, last1);
  auto n = buf1.get_count();
  auto buf2 = sycl::helpers::make_const_buffer(first2, first2 + n);
  auto res = sycl::helpers::make_buffer(result, result + n);

  size_t global = sep.calculateGlobalSize(n, local);

  auto f = [n, local, global, &buf1, &buf2, &res,
            op](cl::sycl::handler &h) mutable {
    // Launch at least one full work-group even if the computed global size
    // is smaller than the work-group size.
    cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1},
                            cl::sycl::range<3>{local, 1, 1}};
    auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h);
    auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h);
    auto aO = res.template get_access<cl::sycl::access::mode::write>(h);
    h.parallel_for<typename ExecutionPolicy::kernelName>(
        r, [a1, a2, aO, op, n](cl::sycl::nd_item<3> id) {
          // Work-items beyond the element count do nothing.
          if (id.get_global(0) < n) {
            aO[id.get_global(0)] =
                op(a1[id.get_global(0)], a2[id.get_global(0)]);
          }
        });
  };
  q.submit(f);

  // The end of the OUTPUT range, not of the second input range.
  return result + n;
}
typename std::iterator_traits<Iterator>::value_type reduce( ExecutionPolicy &sep, Iterator b, Iterator e, T init, BinaryOperation bop) { cl::sycl::queue q(sep.get_queue()); auto vectorSize = std::distance(b, e); if (vectorSize < 1) { return init; } auto device = q.get_device(); auto local = device.get_info<cl::sycl::info::device::max_work_group_size>(); typedef typename std::iterator_traits<Iterator>::value_type type_; auto bufI = sycl::helpers::make_const_buffer(b, e); size_t length = vectorSize; size_t global = sep.calculateGlobalSize(length, local); do { auto f = [length, local, global, &bufI, bop](cl::sycl::handler &h) mutable { cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1}, cl::sycl::range<3>{local, 1, 1}}; auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h); cl::sycl::accessor<type_, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> scratch(cl::sycl::range<1>(local), h); h.parallel_for<typename ExecutionPolicy::kernelName>( r, [aI, scratch, local, length, bop](cl::sycl::nd_item<3> id) { int globalid = id.get_global(0); int localid = id.get_local(0); auto r = ReductionStrategy<T>(local, length, id, scratch); r.workitem_get_from(aI); r.combine_threads(bop); r.workgroup_write_to(aI); }); }; q.submit(f); length = length / local; } while (length > 1); q.wait_and_throw(); auto hI = bufI.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>(); return hI[0] + init; }
void transform(ExecutionPolicy &sep, cl::sycl::queue &q, Buffer &buf1, Buffer &buf2, Buffer &res, BinaryOperation op) { auto device = q.get_device(); size_t local = device.get_info<cl::sycl::info::device::max_work_group_size>(); auto n = buf1.get_count(); size_t global = sep.calculateGlobalSize(n, local); auto f = [n, local, global, &buf1, &buf2, &res, op](cl::sycl::handler &h) mutable { cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1}, cl::sycl::range<3>{local, 1, 1}}; auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h); auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h); auto aO = res.template get_access<cl::sycl::access::mode::write>(h); h.parallel_for<class TransformAlgorithm>( r, [a1, a2, aO, op, n](cl::sycl::nd_item<3> id) { if (id.get_global(0) < n) { aO[id.get_global(0)] = op(a1[id.get_global(0)], a2[id.get_global(0)]); } }); }; q.submit(f); }