void generate(OutputIterator first_ctr, OutputIterator last_ctr,
              OutputIterator first_key, OutputIterator last_key,
              command_queue &queue)
{
    const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
    const size_t size_key = detail::iterator_range_size(first_key, last_key);
    if(!size_ctr || !size_key || (size_ctr != size_key)){
        return;
    }

    kernel rng_kernel = m_program.create_kernel("generate_rng");
    rng_kernel.set_arg(0, first_ctr.get_buffer());
    rng_kernel.set_arg(1, first_key.get_buffer());

    // each work-item consumes a pair of counter values
    const size_t size = size_ctr / 2;

    size_t offset = 0;
    for(;;){
        // launch at most `threads` work-items per batch, clamped to the
        // remaining work so the final batch does not run past the range
        const size_t count = (std::min)(static_cast<size_t>(threads), size - offset);

        rng_kernel.set_arg(2, static_cast<uint_>(offset));
        queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

        offset += count;
        if(offset >= size){
            break;
        }
    }
}
inline future<OutputIterator> dispatch_copy_async(InputIterator first,
                                                  InputIterator last,
                                                  OutputIterator result,
                                                  command_queue &queue,
                                                  typename boost::enable_if<
                                                      mpl::and_<
                                                          is_device_iterator<InputIterator>,
                                                          is_device_iterator<OutputIterator>,
                                                          can_copy_with_copy_buffer<
                                                              InputIterator, OutputIterator
                                                          >
                                                      >
                                                  >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy
        return make_future(result, event());
    }

    event event_ =
        queue.enqueue_copy_buffer(first.get_buffer(),
                                  result.get_buffer(),
                                  first.get_index() * sizeof(value_type),
                                  result.get_index() * sizeof(value_type),
                                  static_cast<size_t>(n) * sizeof(value_type));

    return make_future(result + n, event_);
}
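// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original source). Assuming the
// public Boost.Compute entry point boost::compute::copy_async(), a call like
// the one below is dispatched to a device-to-device overload such as the
// function above and returns a future tied to the enqueue_copy_buffer() event.
// ---------------------------------------------------------------------------
#include <boost/compute.hpp>

void copy_async_usage_sketch()
{
    namespace compute = boost::compute;

    compute::device gpu = compute::system::default_device();
    compute::context ctx(gpu);
    compute::command_queue queue(ctx, gpu);

    compute::vector<float> src(1024, ctx);
    compute::vector<float> dst(1024, ctx);

    // enqueue an asynchronous device-to-device copy
    compute::future<compute::vector<float>::iterator> done =
        compute::copy_async(src.begin(), src.end(), dst.begin(), queue);

    done.wait();    // block until the copy has completed
}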
inline static bool center_generate(OutputIterator& sink, Context& ctx,
                                   Delimiter const& d, Parameter const& param,
                                   Embedded const& e, unsigned int const width,
                                   Padding const& p)
{
    // make sure all generator parameters are valid
    BOOST_MPL_ASSERT_MSG(
        (spirit::traits::is_component<karma::domain, Embedded>::value),
        embedded_is_not_convertible_to_a_generator, (Context, Embedded));
    BOOST_MPL_ASSERT_MSG(
        (spirit::traits::is_component<karma::domain, Padding>::value),
        padding_is_not_convertible_to_a_generator, (Context, Padding));

    typedef typename
        result_of::as_component<karma::domain, Embedded>::type
    embedded;
    typedef typename
        result_of::as_component<karma::domain, Padding>::type
    padding;

    // wrap the given output iterator to allow left padding
    detail::enable_buffering<OutputIterator> buffering(sink, width);

    // first generate the embedded output
    embedded ec = spirit::as_component(karma::domain(), e);
    typedef typename embedded::director director;
    bool r = director::generate(ec, sink, ctx, d, param);

    buffering.disable();    // do not perform buffering any more

    // generate the left padding
    detail::enable_counting<OutputIterator> counting(
        sink, (sink.buffer_size() + width) / 2);

    padding pc = spirit::as_component(karma::domain(), p);
    typedef typename padding::director padding_director;
    while (r && sink.count() < width)
        r = padding_director::generate(pc, sink, ctx, unused, unused);

    if (r) {
        // copy the embedded output to the target output iterator
        sink.buffer_copy();

        // generate the right padding
        std::size_t const max_count = width + (width - sink.buffer_size()) / 2;
        while (r && sink.count() < max_count)
            r = padding_director::generate(pc, sink, ctx, unused, unused);
    }
    return r;
}
void fill(OutputIterator first, OutputIterator last, command_queue &queue)
{
    const buffer &buffer = first.get_buffer();
    const size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(0, m_state_buffer);
    fill_kernel.set_arg(1, buffer);

    size_t p = 0;
    for(;;){
        // write at most n values per batch (one per state element)
        size_t count = 0;
        if(size - p >= n){
            count = n;
        }
        else {
            count = size - p;
        }

        fill_kernel.set_arg(2, static_cast<uint_>(p));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        p += n;
        if(p >= size){
            break;
        }

        // refresh the state buffer before generating the next batch
        generate_state(queue);
    }
}
void generate(OutputIterator first, OutputIterator last, command_queue &queue)
{
    const size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(0, m_state_buffer);
    fill_kernel.set_arg(2, first.get_buffer());

    size_t offset = 0;
    size_t &p = m_state_index;

    for(;;){
        // consume at most the values remaining in the state buffer, and never
        // write past the end of the output range
        const size_t count = (std::min)(n - p, size - offset);

        fill_kernel.set_arg(1, static_cast<uint_>(p));
        fill_kernel.set_arg(3, static_cast<uint_>(offset));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        p += count;
        offset += count;

        if(offset >= size){
            break;
        }

        // the state buffer is exhausted; advance the engine and start over
        generate_state(queue);
        p = 0;
    }
}
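// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original source). Assuming the
// public boost::compute::default_random_engine, which exposes a generate()
// member like the one above, filling a device vector with random values looks
// roughly as follows.
// ---------------------------------------------------------------------------
#include <boost/compute.hpp>

void random_generate_usage_sketch()
{
    namespace compute = boost::compute;

    compute::device gpu = compute::system::default_device();
    compute::context ctx(gpu);
    compute::command_queue queue(ctx, gpu);

    // vector of 32-bit unsigned integers on the device
    compute::vector<compute::uint_> values(1 << 20, ctx);

    // seed the engine and fill the vector with random values
    compute::default_random_engine engine(queue);
    engine.generate(values.begin(), values.end(), queue);

    queue.finish();
}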
void generate(OutputIterator first, OutputIterator last, command_queue &queue)
{
    const size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(1, m_multiplicands);
    fill_kernel.set_arg(2, first.get_buffer());

    size_t offset = 0;

    for(;;){
        // launch at most `threads` work-items per batch, clamped to the
        // remaining output so the last batch does not overrun the range
        const size_t count = (std::min)(static_cast<size_t>(threads), size - offset);

        fill_kernel.set_arg(0, static_cast<uint_>(m_seed));
        fill_kernel.set_arg(3, static_cast<uint_>(offset));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        offset += count;

        if(offset >= size){
            break;
        }

        // re-seed before generating the next batch
        update_seed(queue);
    }
}
void generate(OutputIterator first_ctr, OutputIterator last_ctr, command_queue &queue)
{
    const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
    if(!size_ctr){
        return;
    }

    // use an all-zero key vector when the caller does not supply one
    boost::compute::vector<uint_> vector_key(size_ctr, m_context);
    vector_key.assign(size_ctr, 0, queue);

    kernel rng_kernel = m_program.create_kernel("generate_rng");
    rng_kernel.set_arg(0, first_ctr.get_buffer());
    rng_kernel.set_arg(1, vector_key.get_buffer());

    // each work-item consumes a pair of counter values
    const size_t size = size_ctr / 2;

    size_t offset = 0;
    for(;;){
        // clamp the batch size to the work remaining
        const size_t count = (std::min)(static_cast<size_t>(threads), size - offset);

        rng_kernel.set_arg(2, static_cast<uint_>(offset));
        queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

        offset += count;
        if(offset >= size){
            break;
        }
    }
}
inline OutputIterator dispatch_copy(InputIterator first,
                                    InputIterator last,
                                    OutputIterator result,
                                    command_queue &queue,
                                    typename boost::enable_if_c<
                                        boost::is_same<
                                            InputIterator,
                                            buffer_iterator<typename InputIterator::value_type>
                                        >::value &&
                                        boost::is_same<
                                            OutputIterator,
                                            buffer_iterator<typename OutputIterator::value_type>
                                        >::value &&
                                        boost::is_same<
                                            typename InputIterator::value_type,
                                            typename OutputIterator::value_type
                                        >::value
                                    >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy; return the (unchanged) destination iterator
        return result;
    }

    queue.enqueue_copy_buffer(first.get_buffer(),
                              result.get_buffer(),
                              first.get_index() * sizeof(value_type),
                              result.get_index() * sizeof(value_type),
                              static_cast<size_t>(n) * sizeof(value_type));

    return result + n;
}
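// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original source). Assuming the
// public boost::compute::copy() algorithm, copying between two buffer-backed
// vectors of the same element type reaches a buffer-to-buffer overload like
// the one above, which maps onto a single clEnqueueCopyBuffer() call.
// ---------------------------------------------------------------------------
#include <vector>
#include <boost/compute.hpp>

void copy_usage_sketch()
{
    namespace compute = boost::compute;

    compute::device gpu = compute::system::default_device();
    compute::context ctx(gpu);
    compute::command_queue queue(ctx, gpu);

    std::vector<int> host(256, 7);
    compute::vector<int> a(host.size(), ctx);
    compute::vector<int> b(host.size(), ctx);

    // host -> device
    compute::copy(host.begin(), host.end(), a.begin(), queue);

    // device -> device; same value_type on both sides, so a single
    // buffer-to-buffer copy suffices
    compute::copy(a.begin(), a.end(), b.begin(), queue);
}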
inline command_queue default_queue_for_copy(InputIterator first,
                                            OutputIterator result,
                                            typename boost::enable_if<
                                                typename boost::mpl::and_<
                                                    boost::mpl::not_<is_buffer_iterator<InputIterator> >,
                                                    is_buffer_iterator<OutputIterator>
                                                >
                                            >::type* = 0)
{
    (void) first;

    const buffer &buffer = result.get_buffer();
    const context &context = buffer.get_context();
    const device &device = context.get_device();
    return command_queue(context, device);
}
inline static bool left_align_generate(OutputIterator& sink, Context& ctx,
                                       Delimiter const& d, Parameter const& param,
                                       Embedded const& e, unsigned int const width,
                                       Padding const& p)
{
    // make sure all generator parameters are valid
    BOOST_MPL_ASSERT_MSG(
        (spirit::traits::is_component<karma::domain, Embedded>::value),
        embedded_is_not_convertible_to_a_generator, (Context, Embedded));
    BOOST_MPL_ASSERT_MSG(
        (spirit::traits::is_component<karma::domain, Padding>::value),
        padding_is_not_convertible_to_a_generator, (Context, Padding));

    typedef typename
        result_of::as_component<karma::domain, Embedded>::type
    embedded;
    typedef typename
        result_of::as_component<karma::domain, Padding>::type
    padding;

    // wrap the given output iterator to allow counting
    detail::enable_counting<OutputIterator> counting(sink);

    // first generate the underlying output
    embedded ec = spirit::as_component(karma::domain(), e);
    typedef typename embedded::director director;
    bool r = director::generate(ec, sink, ctx, d, param);

    // pad the output until the max width is reached
    padding pc = spirit::as_component(karma::domain(), p);
    typedef typename padding::director padding_director;
    while (r && sink.count() < width)
        r = padding_director::generate(pc, sink, ctx, unused, unused);

    return r;
}
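// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original source). Assuming the
// public Spirit.Karma directives karma::left_align and karma::center, the
// generators above are what ultimately pad the embedded output to the given
// field width.
// ---------------------------------------------------------------------------
#include <iostream>
#include <iterator>
#include <string>
#include <boost/spirit/include/karma.hpp>

void alignment_usage_sketch()
{
    namespace karma = boost::spirit::karma;

    std::string out;
    std::back_insert_iterator<std::string> sink(out);

    // pad "42" with spaces on the right up to a field width of 10
    karma::generate(sink, karma::left_align(10)[karma::int_], 42);

    // pad "42" on both sides, centering it in a field of width 10
    karma::generate(sink, karma::center(10)[karma::int_], 42);

    std::cout << '"' << out << '"' << std::endl;
}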
size_t reduce(InputIterator first, size_t count, OutputIterator result,
              size_t block_size, BinaryFunction function, command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();

    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                         function(k.expr<input_type>("block[lid]"),
                                  k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                     function(k.var<result_type>("result"),
                              first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
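// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original source). Assuming the
// public boost::compute::reduce() algorithm, which drives block-wise helpers
// like the one above, reducing a device vector to a single host value looks
// roughly as follows.
// ---------------------------------------------------------------------------
#include <boost/compute.hpp>

void reduce_usage_sketch()
{
    namespace compute = boost::compute;

    compute::device gpu = compute::system::default_device();
    compute::context ctx(gpu);
    compute::command_queue queue(ctx, gpu);

    // 4096 ones on the device
    compute::vector<float> values(4096, 1.0f, queue);

    float sum = 0.0f;
    compute::reduce(values.begin(), values.end(), &sum, compute::plus<float>(), queue);

    queue.finish();
    // sum == 4096.0f
}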