Esempio n. 1
0
/** transform (unary).
 * @brief Applies op to every element of [b, e) inside a SYCL kernel and
 *        writes the results to the range starting at out.
 * @param sep Execution policy; provides the queue (get_queue), the global
 *            size computation (calculateGlobalSize) and the kernel name
 * @param b   Start of the input range
 * @param e   End of the input range
 * @param out Start of the output range; must support `out + n`
 * @param op  Unary operation invoked once per input element in the kernel
 * @return Iterator one past the last element written, i.e. out + (e - b),
 *         matching the contract of std::transform
 */
OutputIterator transform(ExecutionPolicy &sep, Iterator b, Iterator e,
                         OutputIterator out, UnaryOperation op) {
  // Nothing to do for an empty range: avoid creating zero-sized buffers and
  // launching a kernel in which no work-item would pass the bounds check.
  if (b == e) {
    return out;
  }

  OutputIterator outEnd = out;
  // The inner scope is kept from the original: presumably it makes sure the
  // buffers are released before the function returns — confirm against
  // sycl::helpers::make_buffer.
  {
    cl::sycl::queue q(sep.get_queue());
    auto device = q.get_device();
    size_t local =
        device.get_info<cl::sycl::info::device::max_work_group_size>();
    auto bufI = sycl::helpers::make_const_buffer(b, e);
    auto vectorSize = bufI.get_count();
    auto bufO = sycl::helpers::make_buffer(out, out + vectorSize);
    outEnd = out + vectorSize;
    size_t global = sep.calculateGlobalSize(vectorSize, local);
    auto f = [vectorSize, local, global, &bufI, &bufO, op](
        cl::sycl::handler &h) mutable {
      // Launch at least one full work-group, even for tiny inputs.
      cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1},
                              cl::sycl::range<3>{local, 1, 1}};
      auto aI = bufI.template get_access<cl::sycl::access::mode::read>(h);
      auto aO = bufO.template get_access<cl::sycl::access::mode::write>(h);
      h.parallel_for<typename ExecutionPolicy::kernelName>(
          r, [aI, aO, op, vectorSize](cl::sycl::nd_item<3> id) {
            // The global range may exceed the element count, so surplus
            // work-items must not touch the accessors.
            if ((id.get_global(0) < vectorSize)) {
              aO[id.get_global(0)] = op(aI[id.get_global(0)]);
            }
          });
    };
    q.submit(f);
  }
  return outEnd;
}
Esempio n. 2
0
/** transform (binary).
 * @brief Applies op pairwise to the elements of [first1, last1) and the
 *        range starting at first2 inside a SYCL kernel, writing the results
 *        to the range starting at result.
 * @param sep    Execution policy; provides the queue (get_queue), the global
 *               size computation (calculateGlobalSize) and the kernel name
 * @param first1 Start of the first input range
 * @param last1  End of the first input range
 * @param first2 Start of the second input range; must hold at least as many
 *               elements as [first1, last1)
 * @param result Start of the output range; must support `result + n`
 * @param op     Binary operation invoked once per element pair in the kernel
 * @return Iterator one past the last element written (result + n), matching
 *         the contract of std::transform
 */
OutputIterator transform(ExecutionPolicy &sep, InputIterator first1,
                         InputIterator last1, InputIterator first2,
                         OutputIterator result, BinaryOperation op) {
  // Nothing to do for an empty range: avoid creating zero-sized buffers and
  // launching a kernel in which no work-item would pass the bounds check.
  if (first1 == last1) {
    return result;
  }

  cl::sycl::queue q(sep.get_queue());
  auto device = q.get_device();
  size_t local = device.get_info<cl::sycl::info::device::max_work_group_size>();
  auto buf1 = sycl::helpers::make_const_buffer(first1, last1);
  auto n = buf1.get_count();
  auto buf2 = sycl::helpers::make_const_buffer(first2, first2 + n);
  auto res = sycl::helpers::make_buffer(result, result + n);
  size_t global = sep.calculateGlobalSize(n, local);
  auto f =
      [n, local, global, &buf1, &buf2, &res, op](cl::sycl::handler &h) mutable {
    // Launch at least one full work-group, even for tiny inputs.
    cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1},
                            cl::sycl::range<3>{local, 1, 1}};
    auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h);
    auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h);
    auto aO = res.template get_access<cl::sycl::access::mode::write>(h);
    h.parallel_for<typename ExecutionPolicy::kernelName>(
        r, [a1, a2, aO, op, n](cl::sycl::nd_item<3> id) {
          // The global range may exceed n, so surplus work-items must not
          // touch the accessors.
          if (id.get_global(0) < n) {
            aO[id.get_global(0)] =
                op(a1[id.get_global(0)], a2[id.get_global(0)]);
          }
        });
  };
  q.submit(f);
  // Return the end of the OUTPUT range; the original returned first2 + n,
  // which is an iterator into the second input range.
  return result + n;
}
Esempio n. 3
0
/** reduce.
 * @brief Reduces the range [b, e) with bop by submitting repeated SYCL
 *        kernel passes, then combines the device result with init.
 * @param sep  Execution policy; provides the queue (get_queue), the global
 *             size computation (calculateGlobalSize) and the kernel name
 * @param b    Start of the input range
 * @param e    End of the input range
 * @param init Initial value; returned unchanged when the range is empty
 * @param bop  Binary operation used for the reduction; as with std::reduce
 *             it is expected to be associative and commutative
 * @return bop applied to the reduced range and init, converted to the
 *         iterator's value_type
 */
typename std::iterator_traits<Iterator>::value_type reduce(
    ExecutionPolicy &sep, Iterator b, Iterator e, T init, BinaryOperation bop) {
  cl::sycl::queue q(sep.get_queue());
  auto vectorSize = std::distance(b, e);

  // Empty (or inverted) range: there is nothing to send to the device.
  if (vectorSize < 1) {
    return init;
  }

  auto device = q.get_device();
  auto local = device.get_info<cl::sycl::info::device::max_work_group_size>();
  using type_ = typename std::iterator_traits<Iterator>::value_type;
  auto bufI = sycl::helpers::make_const_buffer(b, e);
  size_t length = vectorSize;
  size_t global = sep.calculateGlobalSize(length, local);

  // Each pass reads and writes bufI in place through ReductionStrategy;
  // passes repeat until at most one value is left.
  // NOTE(review): `length / local` truncates, so when length is not a
  // multiple of local the trailing partial results may be ignored — confirm
  // against ReductionStrategy and calculateGlobalSize.
  // NOTE(review): `global` is not recomputed as length shrinks, so later
  // passes launch as many work-items as the first one.
  do {
    auto f = [length, local, global, &bufI, bop](cl::sycl::handler &h) mutable {
      cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1},
                              cl::sycl::range<3>{local, 1, 1}};
      auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      // Local-memory accessor with one slot per work-item of a work-group.
      cl::sycl::accessor<type_, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);

      h.parallel_for<typename ExecutionPolicy::kernelName>(
          r, [aI, scratch, local, length, bop](cl::sycl::nd_item<3> id) {
            auto strategy = ReductionStrategy<T>(local, length, id, scratch);
            strategy.workitem_get_from(aI);
            strategy.combine_threads(bop);
            strategy.workgroup_write_to(aI);
          });
    };
    q.submit(f);
    length = length / local;
  } while (length > 1);
  q.wait_and_throw();
  auto hI = bufI.template get_access<cl::sycl::access::mode::read,
                                     cl::sycl::access::target::host_buffer>();
  // Fold init in with the user-supplied operation rather than a hard-coded
  // `+`, which gave wrong results for non-additive reductions.
  return bop(hI[0], init);
}
Esempio n. 4
0
/** transform (binary, buffer overload).
 * @brief Submits one kernel to q that applies op pairwise to the elements
 *        of buf1 and buf2 and writes each result into res.
 * @param sep  Execution policy; only calculateGlobalSize is used here
 * @param q    Queue the command group is submitted to
 * @param buf1 First input buffer; its get_count() fixes the element count
 * @param buf2 Second input buffer
 * @param res  Output buffer
 * @param op   Binary operation invoked once per element pair in the kernel
 *
 * NOTE(review): the kernel name is the fixed type `TransformAlgorithm`
 * rather than ExecutionPolicy::kernelName — confirm that this function is
 * never instantiated with more than one distinct kernel per program.
 */
void transform(ExecutionPolicy &sep, cl::sycl::queue &q, Buffer &buf1,
               Buffer &buf2, Buffer &res, BinaryOperation op) {
  auto n = buf1.get_count();
  size_t local =
      q.get_device()
          .template get_info<cl::sycl::info::device::max_work_group_size>();
  size_t global = sep.calculateGlobalSize(n, local);

  auto commandGroup =
      [n, local, global, &buf1, &buf2, &res, op](cl::sycl::handler &h) mutable {
    // Always launch at least one full work-group.
    const cl::sycl::range<3> globalRange{std::max(global, local), 1, 1};
    const cl::sycl::range<3> localRange{local, 1, 1};
    cl::sycl::nd_range<3> launchRange{globalRange, localRange};

    auto lhs = buf1.template get_access<cl::sycl::access::mode::read>(h);
    auto rhs = buf2.template get_access<cl::sycl::access::mode::read>(h);
    auto dst = res.template get_access<cl::sycl::access::mode::write>(h);

    h.parallel_for<class TransformAlgorithm>(
        launchRange, [lhs, rhs, dst, op, n](cl::sycl::nd_item<3> item) {
          // Work-items beyond the element count do nothing.
          if (!(item.get_global(0) < n)) {
            return;
          }
          dst[item.get_global(0)] =
              op(lhs[item.get_global(0)], rhs[item.get_global(0)]);
        });
  };
  q.submit(commandGroup);
}