void generate(OutputIterator first_ctr, OutputIterator last_ctr,
              OutputIterator first_key, OutputIterator last_key,
              command_queue &queue)
{
    const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
    const size_t size_key = detail::iterator_range_size(first_key, last_key);
    if(!size_ctr || !size_key || (size_ctr != size_key)) {
        return;
    }

    kernel rng_kernel = m_program.create_kernel("generate_rng");
    rng_kernel.set_arg(0, first_ctr.get_buffer());
    rng_kernel.set_arg(1, first_key.get_buffer());

    size_t offset = 0;

    for(;;){
        size_t count = 0;
        // one work-item per pair of 32-bit counter words
        size_t size = size_ctr/2;
        if(size > threads){
            count = threads;
        }
        else {
            count = size;
        }
        rng_kernel.set_arg(2, static_cast<const uint_>(offset));
        queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

        offset += count;
        if(offset >= size){
            break;
        }
    }
}
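// Usage sketch (not from the original source): assuming this is a member of a
// Threefry-style counter-based engine, referred to below as "engine", and that
// "ctx" and "queue" are an existing context and command_queue. The caller
// supplies a counter range and a key range of equal length; the generated
// values are presumably written back through the counter buffer, two 32-bit
// words per work-item:
//
//     boost::compute::vector<boost::compute::uint_> ctr(1024, ctx);
//     boost::compute::vector<boost::compute::uint_> key(1024, ctx);
//     // ... initialise ctr with per-element counters and key with key words ...
//     engine.generate(ctr.begin(), ctr.end(), key.begin(), key.end(), queue);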
void fill(OutputIterator first, OutputIterator last, command_queue &queue)
{
    const buffer &buffer = first.get_buffer();
    const size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(0, m_state_buffer);
    fill_kernel.set_arg(1, buffer);

    size_t p = 0;

    for(;;){
        size_t count = 0;
        if(size - p >= n)
            count = n;
        else
            count = size - p;

        fill_kernel.set_arg(2, static_cast<uint_>(p));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        p += n;
        if(p >= size)
            break;

        generate_state(queue);
    }
}
void generate(OutputIterator first, OutputIterator last, command_queue &queue)
{
    const size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(0, m_state_buffer);
    fill_kernel.set_arg(2, first.get_buffer());

    size_t offset = 0;
    size_t &p = m_state_index; // position within the current state buffer

    for(;;){
        size_t count = 0;
        if(size > n){
            count = n;
        }
        else {
            count = size;
        }
        fill_kernel.set_arg(1, static_cast<const uint_>(p));
        fill_kernel.set_arg(3, static_cast<const uint_>(offset));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        p += count;
        offset += count;

        if(offset >= size){
            break;
        }

        // current state exhausted; regenerate it and start over
        generate_state(queue);
        p = 0;
    }
}
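// Usage sketch: assuming the two member functions above belong to
// boost::compute::mersenne_twister_engine (public header
// <boost/compute/random/mersenne_twister_engine.hpp>, with the mt19937
// typedef), a minimal host program that fills a device vector with random
// 32-bit integers could look like this:

#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/mersenne_twister_engine.hpp>

int example_mt19937()
{
    namespace compute = boost::compute;

    compute::device gpu = compute::system::default_device();
    compute::context ctx(gpu);
    compute::command_queue queue(ctx, gpu);

    // 10 000 random 32-bit values generated on the device
    compute::vector<compute::uint_> vec(10000, ctx);

    compute::mt19937 rng(queue);                 // seeds and builds the initial state
    rng.generate(vec.begin(), vec.end(), queue); // drives the chunked "fill" kernel above

    queue.finish();
    return 0;
}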
void generate(OutputIterator first, OutputIterator last, command_queue &queue)
{
    size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(1, m_multiplicands);
    fill_kernel.set_arg(2, first.get_buffer());

    size_t offset = 0;

    for(;;){
        size_t count = 0;
        if(size > threads){
            count = threads;
        }
        else {
            count = size;
        }
        fill_kernel.set_arg(0, static_cast<const uint_>(m_seed));
        fill_kernel.set_arg(3, static_cast<const uint_>(offset));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        offset += count;
        if(offset >= size){
            break;
        }

        update_seed(queue);
    }
}
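// Usage sketch: engines like the ones above are normally driven through a
// distribution rather than called directly. Assuming the documented
// Boost.Compute distribution interface (uniform_real_distribution::generate
// takes an engine and a queue) and a context/queue set up as in the earlier
// sketch:

#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/default_random_engine.hpp>
#include <boost/compute/random/uniform_real_distribution.hpp>

void example_uniform_floats(boost::compute::context &ctx,
                            boost::compute::command_queue &queue)
{
    namespace compute = boost::compute;

    compute::vector<float> values(1000, ctx);

    compute::default_random_engine engine(queue);
    compute::uniform_real_distribution<float> dist(0.0f, 1.0f);

    // the distribution calls engine.generate(...) internally and maps the
    // raw integers into the requested range [0, 1)
    dist.generate(values.begin(), values.end(), engine, queue);
}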
void generate(OutputIterator first_ctr, OutputIterator last_ctr, command_queue &queue)
{
    const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
    if(!size_ctr) {
        return;
    }

    // when no key range is supplied, use an all-zero key of the same length
    boost::compute::vector<uint_> vector_key(size_ctr, m_context);
    vector_key.assign(size_ctr, 0, queue);

    kernel rng_kernel = m_program.create_kernel("generate_rng");
    rng_kernel.set_arg(0, first_ctr.get_buffer());
    rng_kernel.set_arg(1, vector_key.get_buffer());

    size_t offset = 0;

    for(;;){
        size_t count = 0;
        size_t size = size_ctr/2;
        if(size > threads){
            count = threads;
        }
        else {
            count = size;
        }
        rng_kernel.set_arg(2, static_cast<const uint_>(offset));
        queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

        offset += count;
        if(offset >= size){
            break;
        }
    }
}
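// Usage sketch (not from the original source): this overload zero-initialises
// the key and otherwise behaves like the two-range form shown earlier. The
// exact public class name and constructor are not shown in this listing, so
// the engine object is left abstract; a caller might seed the counters with
// boost::compute::iota and then generate in place:
//
//     boost::compute::vector<boost::compute::uint_> ctr(2 * 512, ctx);
//     boost::compute::iota(ctr.begin(), ctr.end(), 0, queue);
//     engine.generate(ctr.begin(), ctr.end(), queue); // key defaults to zero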
inline future<OutputIterator> dispatch_copy_async(InputIterator first,
                                                  InputIterator last,
                                                  OutputIterator result,
                                                  command_queue &queue,
                                                  typename boost::enable_if<
                                                      mpl::and_<
                                                          is_device_iterator<InputIterator>,
                                                          is_device_iterator<OutputIterator>,
                                                          can_copy_with_copy_buffer<
                                                              InputIterator, OutputIterator
                                                          >
                                                      >
                                                  >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy
        return make_future(result, event());
    }

    event event_ = queue.enqueue_copy_buffer(
        first.get_buffer(),
        result.get_buffer(),
        first.get_index() * sizeof(value_type),
        result.get_index() * sizeof(value_type),
        static_cast<size_t>(n) * sizeof(value_type)
    );

    return make_future(result + n, event_);
}
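// Usage sketch: the buffer-to-buffer path above appears to be what the public
// boost::compute::copy_async() dispatches to for device iterators of the same
// value type. Assuming two device vectors of matching size:

#include <boost/compute/core.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/async/future.hpp>
#include <boost/compute/container/vector.hpp>

void example_copy_async(boost::compute::command_queue &queue,
                        boost::compute::vector<int> &src,
                        boost::compute::vector<int> &dst)
{
    namespace compute = boost::compute;

    // enqueue the copy without blocking; the returned future wraps the event
    compute::future<compute::vector<int>::iterator> f =
        compute::copy_async(src.begin(), src.end(), dst.begin(), queue);

    // ... other host work can overlap with the transfer here ...

    f.wait(); // block until the copy has finished
}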
inline command_queue default_queue_for_copy(InputIterator first,
                                            OutputIterator result,
                                            typename boost::enable_if<
                                                typename boost::mpl::and_<
                                                    boost::mpl::not_<is_buffer_iterator<InputIterator> >,
                                                    is_buffer_iterator<OutputIterator>
                                                >
                                            >::type* = 0)
{
    (void) first; // unused; only the destination buffer determines the queue

    const buffer &buffer = result.get_buffer();
    const context &context = buffer.get_context();
    const device &device = context.get_device();

    return command_queue(context, device);
}
inline OutputIterator dispatch_copy(InputIterator first,
                                    InputIterator last,
                                    OutputIterator result,
                                    command_queue &queue,
                                    typename boost::enable_if_c<
                                        boost::is_same<
                                            InputIterator,
                                            buffer_iterator<typename InputIterator::value_type>
                                        >::value &&
                                        boost::is_same<
                                            OutputIterator,
                                            buffer_iterator<typename OutputIterator::value_type>
                                        >::value &&
                                        boost::is_same<
                                            typename InputIterator::value_type,
                                            typename OutputIterator::value_type
                                        >::value
                                    >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy
        return result;
    }

    queue.enqueue_copy_buffer(first.get_buffer(),
                              result.get_buffer(),
                              first.get_index() * sizeof(value_type),
                              result.get_index() * sizeof(value_type),
                              static_cast<size_t>(n) * sizeof(value_type));

    return result + n;
}
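// Usage sketch: the same enqueue_copy_buffer() path backs the synchronous
// boost::compute::copy() when both iterators are buffer iterators with
// matching value types. A typical host/device round trip plus a
// device-to-device copy:

#include <vector>
#include <boost/compute/core.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>

void example_copy(boost::compute::context &ctx,
                  boost::compute::command_queue &queue)
{
    namespace compute = boost::compute;

    std::vector<int> host(256, 7);
    compute::vector<int> a(host.size(), ctx);
    compute::vector<int> b(host.size(), ctx);

    // host -> device
    compute::copy(host.begin(), host.end(), a.begin(), queue);

    // device -> device: a single buffer-to-buffer copy as shown above
    compute::copy(a.begin(), a.end(), b.begin(), queue);

    // device -> host
    compute::copy(b.begin(), b.end(), host.begin(), queue);
}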
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename boost::compute::result_of<BinaryFunction(input_type, input_type)>::type result_type;

    const context &context = queue.get_context();

    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                         function(k.expr<input_type>("block[lid]"),
                                  k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                    function(k.var<result_type>("result"),
                             first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
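// Usage sketch: the block-wise kernel above is an internal building block; a
// caller would normally go through the public boost::compute::reduce(), which
// (roughly) applies such passes until a single value remains and copies it to
// the destination:

#include <boost/compute/core.hpp>
#include <boost/compute/algorithm/reduce.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/functional.hpp>

void example_reduce(boost::compute::vector<int> &data,
                    boost::compute::command_queue &queue)
{
    namespace compute = boost::compute;

    int sum = 0;

    // reduce the device vector with operator+ and copy the result back to "sum"
    compute::reduce(data.begin(), data.end(), &sum, compute::plus<int>(), queue);

    // the binary function can also be omitted, in which case plus<T> is used
    compute::reduce(data.begin(), data.end(), &sum, queue);
}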