Esempio n. 1
0
OutputIterator transform(ExecutionPolicy &sep, Iterator b, Iterator e,
                         OutputIterator out, UnaryOperation op) {
  {
    cl::sycl::queue q(sep.get_queue());
    auto device = q.get_device();
    size_t local =
        device.get_info<cl::sycl::info::device::max_work_group_size>();
    auto bufI = sycl::helpers::make_const_buffer(b, e);
    auto bufO = sycl::helpers::make_buffer(out, out + bufI.get_count());
    auto vectorSize = bufI.get_count();
    size_t global = sep.calculateGlobalSize(vectorSize, local);
    auto f = [vectorSize, local, global, &bufI, &bufO, op](
        cl::sycl::handler &h) mutable {
      cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1},
                              cl::sycl::range<3>{local, 1, 1}};
      auto aI = bufI.template get_access<cl::sycl::access::mode::read>(h);
      auto aO = bufO.template get_access<cl::sycl::access::mode::write>(h);
      h.parallel_for<typename ExecutionPolicy::kernelName>(
          r, [aI, aO, op, vectorSize](cl::sycl::nd_item<3> id) {
            if ((id.get_global(0) < vectorSize)) {
              aO[id.get_global(0)] = op(aI[id.get_global(0)]);
            }
          });
    };
    q.submit(f);
  }
  return out;
}
Esempio n. 2
0
OutputIterator transform(ExecutionPolicy &sep, InputIterator first1,
                         InputIterator last1, InputIterator first2,
                         OutputIterator result, BinaryOperation op) {
  cl::sycl::queue q(sep.get_queue());
  auto device = q.get_device();
  size_t local = device.get_info<cl::sycl::info::device::max_work_group_size>();
  auto buf1 = sycl::helpers::make_const_buffer(first1, last1);
  auto n = buf1.get_count();
  auto buf2 = sycl::helpers::make_const_buffer(first2, first2 + n);
  auto res = sycl::helpers::make_buffer(result, result + n);
  size_t global = sep.calculateGlobalSize(n, local);
  auto f =
      [n, local, global, &buf1, &buf2, &res, op](cl::sycl::handler &h) mutable {
    cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1},
                            cl::sycl::range<3>{local, 1, 1}};
    auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h);
    auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h);
    auto aO = res.template get_access<cl::sycl::access::mode::write>(h);
    h.parallel_for<typename ExecutionPolicy::kernelName>(
        r, [a1, a2, aO, op, n](cl::sycl::nd_item<3> id) {
          if (id.get_global(0) < n) {
            aO[id.get_global(0)] =
                op(a1[id.get_global(0)], a2[id.get_global(0)]);
          }
        });
  };
  q.submit(f);
  return first2 + n;
}
Esempio n. 3
0
typename std::iterator_traits<Iterator>::value_type reduce(
    ExecutionPolicy &sep, Iterator b, Iterator e, T init, BinaryOperation bop) {
  cl::sycl::queue q(sep.get_queue());
  auto vectorSize = std::distance(b, e);

  if (vectorSize < 1) {
    return init;
  }

  auto device = q.get_device();
  auto local = device.get_info<cl::sycl::info::device::max_work_group_size>();
  typedef typename std::iterator_traits<Iterator>::value_type type_;
  auto bufI = sycl::helpers::make_const_buffer(b, e);
  size_t length = vectorSize;
  size_t global = sep.calculateGlobalSize(length, local);

  do {
    auto f = [length, local, global, &bufI, bop](cl::sycl::handler &h) mutable {
      cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1},
                              cl::sycl::range<3>{local, 1, 1}};
      auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      cl::sycl::accessor<type_, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);

      h.parallel_for<typename ExecutionPolicy::kernelName>(
          r, [aI, scratch, local, length, bop](cl::sycl::nd_item<3> id) {
            int globalid = id.get_global(0);
            int localid = id.get_local(0);

            auto r = ReductionStrategy<T>(local, length, id, scratch);
            r.workitem_get_from(aI);
            r.combine_threads(bop);
            r.workgroup_write_to(aI);
          });
    };
    q.submit(f);
    length = length / local;
  } while (length > 1);
  q.wait_and_throw();
  auto hI = bufI.template get_access<cl::sycl::access::mode::read,
                                     cl::sycl::access::target::host_buffer>();
  return hI[0] + init;
}
Esempio n. 4
0
typename std::enable_if<
  std::is_same<
    agency::detail::execution_policy_execution_category_t<ExecutionPolicy>,
    parallel_execution_tag
  >::value,
  cuda::parallel_execution_policy
>::type
  replace_executor(const ExecutionPolicy& policy, const cuda::parallel_executor& exec)
{
  return cuda::parallel_execution_policy(policy.param(), exec);
}
bulk_async_execution_policy_result_t<
  ExecutionPolicy, Function, Args...
>
  bulk_async_execution_policy(index_sequence<UserArgIndices...>,
                              index_sequence<SharedArgIndices...>,
                              ExecutionPolicy& policy, Function f, Args&&... args)
{
  using agent_type = typename ExecutionPolicy::execution_agent_type;
  using agent_traits = execution_agent_traits<agent_type>;
  using execution_category = typename agent_traits::execution_category;

  // get the parameters of the agent
  auto param = policy.param();
  auto agent_shape = agent_traits::domain(param).shape();

  // this is a tuple of factories
  // each factory in the tuple creates the execution agent's shared parameter at the corresponding hierarchy level
  auto agent_shared_parameter_factory_tuple = detail::make_agent_shared_parameter_factory_tuple<agent_type>(param);

  using executor_type = typename ExecutionPolicy::executor_type;

  // convert the shape of the agent into the type of the executor's shape
  using executor_shape_type = executor_shape_t<executor_type>;
  executor_shape_type executor_shape = detail::shape_cast<executor_shape_type>(agent_shape);

  // create the function that will marshal parameters received from bulk_invoke(executor) and execute the agent
  auto lambda = execute_agent_functor<executor_type,agent_traits,Function,UserArgIndices...>{param, agent_shape, executor_shape, f};

  return detail::bulk_async_executor(
    policy.executor(),
    executor_shape,
    lambda,
    std::forward<Args>(args)...,
    agency::share_at_scope_from_factory<SharedArgIndices>(detail::get<SharedArgIndices>(agent_shared_parameter_factory_tuple))...
  );
}
Esempio n. 6
0
basic_execution_policy<
  typename ExecutionPolicy::execution_agent_type,
  Executor
>
replace_executor(const ExecutionPolicy& policy, const Executor& exec)
{
  using policy_category = detail::execution_policy_execution_category_t<ExecutionPolicy>;
  using executor_category = executor_execution_category_t<Executor>;

  static_assert(detail::is_weaker_than<policy_category, executor_category>::value, "replace_executor(): Execution policy's forward progress requirements cannot be satisfied by executor's guarantees.");

  using result_type = basic_execution_policy<
    typename ExecutionPolicy::execution_agent_type,
    Executor
  >;

  return result_type(policy.param(), exec);
}
InputIterator for_each_n(ExecutionPolicy &exec, InputIterator first, Size n,
                         Function f) {
    cl::sycl::queue q(exec.get_queue());
    if (n > 0) {
        auto last(first + n);
        auto bufI = sycl::helpers::make_buffer(first, last);
        auto vectorSize = bufI.get_count();
        auto cg = [vectorSize, &bufI, f](cl::sycl::handler &h) mutable {
            cl::sycl::range<3> r{vectorSize, 1, 1};
            auto aI = bufI.template get_access<cl::sycl::access::mode::read_write>(h);
            h.parallel_for<typename ExecutionPolicy::kernelName>(
            r, [aI, f](cl::sycl::id<3> id) {
                f(aI[id.get(0)]);
            });
        };
        q.submit(cg);
        return last;
    } else {
        return first;
    }
}
Esempio n. 8
0
void transform(ExecutionPolicy &sep, cl::sycl::queue &q, Buffer &buf1,
               Buffer &buf2, Buffer &res, BinaryOperation op) {
  auto device = q.get_device();
  size_t local = device.get_info<cl::sycl::info::device::max_work_group_size>();
  auto n = buf1.get_count();
  size_t global = sep.calculateGlobalSize(n, local);
  auto f =
      [n, local, global, &buf1, &buf2, &res, op](cl::sycl::handler &h) mutable {
    cl::sycl::nd_range<3> r{cl::sycl::range<3>{std::max(global, local), 1, 1},
                            cl::sycl::range<3>{local, 1, 1}};
    auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h);
    auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h);
    auto aO = res.template get_access<cl::sycl::access::mode::write>(h);
    h.parallel_for<class TransformAlgorithm>(
        r, [a1, a2, aO, op, n](cl::sycl::nd_item<3> id) {
          if (id.get_global(0) < n) {
            aO[id.get_global(0)] =
                op(a1[id.get_global(0)], a2[id.get_global(0)]);
          }
        });
  };
  q.submit(f);
}
Esempio n. 9
0
void test(ExecutionPolicy policy)
{
  using agent = typename ExecutionPolicy::execution_agent_type;
  using agent_traits = agency::execution_agent_traits<agent>;

  {
    // bulk_async with no parameters

    auto f = agency::bulk_async(policy, [](agent& self)
    {
      return 7;
    });

    auto result = f.get();

    using executor_type = typename ExecutionPolicy::executor_type;
    using container_type = agency::executor_container_t<executor_type,int>;

    auto shape = agent_traits::domain(policy.param()).shape();

    assert(container_type(shape,7) == result);
  }

  {
    // bulk_async with one parameter

    int val = 13;

    auto f = agency::bulk_async(policy,
      [](agent& self, int val)
      {
        return val;
      },
      val
    );

    auto result = f.get();

    using executor_type = typename ExecutionPolicy::executor_type;
    using container_type = agency::executor_container_t<executor_type,int>;

    auto shape = agent_traits::domain(policy.param()).shape();

    assert(container_type(shape,val) == result);
  }

  {
    // bulk_async with one shared parameter

    int val = 13;

    auto f = agency::bulk_async(policy,
      [](agent& self, int& val)
      {
        return val;
      },
      agency::share(val)
    );

    auto result = f.get();

    using executor_type = typename ExecutionPolicy::executor_type;
    using container_type = agency::executor_container_t<executor_type,int>;

    auto shape = agent_traits::domain(policy.param()).shape();

    assert(container_type(shape,val) == result);
  }
}
Esempio n. 10
0
bool equal(ExecutionPolicy& exec, ForwardIt1 first1, ForwardIt1 last1,
           ForwardIt2 first2, ForwardIt2 last2, BinaryPredicate p) {
  cl::sycl::queue q(exec.get_queue());

  auto size1 = sycl::helpers::distance(first1, last1);
  auto size2 = sycl::helpers::distance(first2, last2);

  if (size1 != size2) {
    return false;
  }

  if (size1 < 1) {
    return true;
  }

  auto device = q.get_device();

  auto length = size1;
  auto ndRange = exec.calculateNdRange(size1);
  const auto local = ndRange.get_local_range()[0];

  auto buf1 = sycl::helpers::make_const_buffer(first1, last1);
  auto buf2 = sycl::helpers::make_const_buffer(first2, last2);
  auto bufR = cl::sycl::buffer<bool, 1>(cl::sycl::range<1>(size1));

  do {
    int passes = 0;

    auto f = [passes, length, ndRange, local, &buf1, &buf2, &bufR,
              p](cl::sycl::handler& h) mutable {
      auto a1 = buf1.template get_access<cl::sycl::access::mode::read>(h);
      auto a2 = buf2.template get_access<cl::sycl::access::mode::read>(h);
      auto aR = bufR.template get_access<cl::sycl::access::mode::read_write>(h);
      cl::sycl::accessor<bool, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(ndRange.get_local_range(), h);

      h.parallel_for<typename ExecutionPolicy::kernelName>(
          ndRange, [a1, a2, aR, scratch, passes, local, length,
              p](cl::sycl::nd_item<1> id) {
            auto r =
                ReductionStrategy<bool>(local, length, id, scratch);
            if (passes == 0) {
              r.workitem_get_from(p, a1, a2);
            } else {
              r.workitem_get_from(aR);
            }
            r.combine_threads(std::logical_and<bool>{});
            r.workgroup_write_to(aR);
          });  // end kernel
    };         // end command group

    q.submit(f);
    length = length / local;
    ndRange = cl::sycl::nd_range<1>{cl::sycl::range<1>(std::max(length, local)),
                                    ndRange.get_local_range()};
    ++passes;
  } while (length > 1);
  q.wait_and_throw();
  auto hr = bufR.template get_access<cl::sycl::access::mode::read>(
      cl::sycl::range<1>{1}, cl::sycl::id<1>{0});
  return hr[0];
}
// Ensure we are setting appropriate level of configuration
// Automatic to Full Manual spectrum
TEST(ExecutionPolicyTest, StateReflectsConfiguration) {
	// Defaults: fully automatic
	ExecutionPolicy p; 
    int configState = p.getConfigState();
    EXPECT_EQ (ExecutionPolicy::Automatic,  configState);
    EXPECT_NE (ExecutionPolicy::FullManual, configState);
    EXPECT_EQ (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_EQ (0, configState & ExecutionPolicy::GridSize);
	EXPECT_EQ (0, p.getGridSize());
	EXPECT_EQ (0, p.getBlockSize());
	EXPECT_EQ (0, p.getSharedMemBytes());

	// shared memory only
    p = ExecutionPolicy(); // reset
    p.setSharedMemBytes( 1024 );
    configState = p.getConfigState();
    EXPECT_NE (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic,  configState);
    EXPECT_NE (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_EQ (0, configState & ExecutionPolicy::GridSize);
	EXPECT_EQ (0, p.getGridSize());
	EXPECT_EQ (0, p.getBlockSize());
	EXPECT_EQ (1024, p.getSharedMemBytes());

    // Block Size Only
    p = ExecutionPolicy(); // reset
    p.setBlockSize( 512 );
    configState = p.getConfigState();
    EXPECT_NE (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic,  configState);
    EXPECT_EQ (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_NE (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_EQ (0, configState & ExecutionPolicy::GridSize);
    EXPECT_EQ (0, p.getGridSize());
	EXPECT_EQ (512, p.getBlockSize());
	EXPECT_EQ (0, p.getSharedMemBytes());

    // Block Size and Shared Memory Only
    p = ExecutionPolicy(); // reset
    p.setBlockSize( 512 );
    p.setSharedMemBytes( 1024 );
    configState = p.getConfigState();
    EXPECT_NE (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic,  configState);
    EXPECT_NE (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_NE (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_EQ (0, configState & ExecutionPolicy::GridSize);
    EXPECT_EQ (0, p.getGridSize());
	EXPECT_EQ (512, p.getBlockSize());
	EXPECT_EQ (1024, p.getSharedMemBytes());

    // Grid Size Only
    p = ExecutionPolicy(); // reset
    p.setGridSize( 100 );
    configState = p.getConfigState();
    EXPECT_NE (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic, configState);
    EXPECT_EQ (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_NE (0, configState & ExecutionPolicy::GridSize);
    EXPECT_EQ (100, p.getGridSize());
	EXPECT_EQ (0, p.getBlockSize());
	EXPECT_EQ (0, p.getSharedMemBytes());

    // Grid Size and Shared Memory Only
    p = ExecutionPolicy(); // reset
    p.setGridSize( 100 );
    p.setSharedMemBytes( 1024 );
    configState = p.getConfigState();
    EXPECT_NE (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic, configState);
    EXPECT_NE (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_NE (0, configState & ExecutionPolicy::GridSize);
	EXPECT_EQ (100, p.getGridSize());
	EXPECT_EQ (0, p.getBlockSize());
	EXPECT_EQ (1024, p.getSharedMemBytes());

    // Grid Size and Block Size Only
    p = ExecutionPolicy(); // reset
    p.setGridSize( 100 );
    p.setBlockSize( 512 );
    configState = p.getConfigState();
    EXPECT_NE (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic, configState);
    EXPECT_EQ (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_NE (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_NE (0, configState & ExecutionPolicy::GridSize);
	EXPECT_EQ (100, p.getGridSize());
	EXPECT_EQ (512, p.getBlockSize());
	EXPECT_EQ (0, p.getSharedMemBytes());

    // Full Manual Configuration
    p = ExecutionPolicy{1, 256, 10};
    configState = p.getConfigState();
    EXPECT_EQ (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic, configState);
    EXPECT_NE (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_NE (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_NE (0, configState & ExecutionPolicy::GridSize);
   	EXPECT_EQ (1, p.getGridSize());
	EXPECT_EQ (256, p.getBlockSize());
	EXPECT_EQ (10, p.getSharedMemBytes());

	// Full Manual Configuration With Separate Calls
    p = ExecutionPolicy(); // reset
    p.setGridSize( 100 );
    p.setBlockSize( 512 );
    p.setSharedMemBytes( 1024);
    configState = p.getConfigState();
    EXPECT_EQ (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic, configState);
    EXPECT_NE (0, configState & ExecutionPolicy::SharedMem);
    EXPECT_NE (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_NE (0, configState & ExecutionPolicy::GridSize);
   	EXPECT_EQ (100, p.getGridSize());
	EXPECT_EQ (512, p.getBlockSize());
	EXPECT_EQ (1024, p.getSharedMemBytes());

	p = ExecutionPolicy(); // reset
    p.setGridSize( 0 );
    p.setBlockSize( 0 );
    p.setSharedMemBytes( 0 );
    configState = p.getConfigState();

	// Setting Zero Shared Memory makes it *Manual*
    EXPECT_NE (ExecutionPolicy::FullManual, configState);
    EXPECT_NE (ExecutionPolicy::Automatic, configState);
    EXPECT_NE (0, configState & ExecutionPolicy::SharedMem);
    
    // Setting 0 grid or block size makes them *Automatic*
    EXPECT_EQ (0, configState & ExecutionPolicy::BlockSize);
    EXPECT_EQ (0, configState & ExecutionPolicy::GridSize);
   	EXPECT_EQ (0, p.getGridSize());
	EXPECT_EQ (0, p.getBlockSize());
	EXPECT_EQ (0, p.getSharedMemBytes());

	// Re-setting block or grid size to >0 should set them to manual
	p.setGridSize( 100 );
	p.setBlockSize( 512 );
	configState = p.getConfigState();
	EXPECT_NE(0, configState & ExecutionPolicy::GridSize);
	EXPECT_NE(0, configState & ExecutionPolicy::BlockSize);

	// Re-setting block or grid size to zero should set them to automatic
	p.setGridSize( 0 );
	p.setBlockSize( 0 );
	configState = p.getConfigState();
	EXPECT_EQ(0, configState & ExecutionPolicy::GridSize);
	EXPECT_EQ(0, configState & ExecutionPolicy::BlockSize);

    // Setting stream (trivial)
    p.setStream(1);
    EXPECT_EQ(1, p.getStream());
}