void generate_state(command_queue &queue) { kernel generate_state_kernel = m_program.create_kernel("generate_state"); generate_state_kernel.set_arg(0, m_state_buffer); queue.enqueue_task(generate_state_kernel); }
inline void sort2(const buffer &buffer, command_queue &queue) { const context &context = queue.get_context(); boost::shared_ptr<detail::program_cache> cache = detail::get_program_cache(context); std::string cache_key = std::string("fixed_sort2_") + type_name<T>(); program sort2_program = cache->get(cache_key); if(!sort2_program.get()){ const char source[] = "__kernel void sort2(__global T *input)\n" "{\n" " const T x = input[0];\n" " const T y = input[1];\n" " if(y < x){\n" " input[0] = y;\n" " input[1] = x;\n" " }\n" "}\n"; sort2_program = program::build_with_source( source, context, std::string("-DT=") + type_name<T>() ); cache->insert(cache_key, sort2_program); } kernel sort2_kernel = sort2_program.create_kernel("sort2"); sort2_kernel.set_arg(0, buffer); queue.enqueue_task(sort2_kernel); }
inline void serial_reduce(InputIterator first, InputIterator last, OutputIterator result, BinaryFunction function, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type T; typedef typename ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type; const context &context = queue.get_context(); size_t count = detail::iterator_range_size(first, last); if(count == 0){ return; } meta_kernel k("serial_reduce"); size_t count_arg = k.add_arg<cl_uint>("count"); k << k.decl<result_type>("result") << " = " << first[0] << ";\n" << "for(uint i = 1; i < count; i++)\n" << " result = " << function(k.var<T>("result"), first[k.var<uint_>("i")]) << ";\n" << result[0] << " = result;\n"; kernel kernel = k.compile(context); kernel.set_arg(count_arg, static_cast<uint_>(count)); queue.enqueue_task(kernel); }
/// \internal_ /// Generates the multiplicands for each thread void generate_multiplicands(command_queue &queue) { kernel multiplicand_kernel = m_program.create_kernel("multiplicand"); multiplicand_kernel.set_arg(0, m_multiplicands); queue.enqueue_task(multiplicand_kernel); }
inline void serial_insertion_sort_by_key(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } meta_kernel k("serial_insertion_sort_by_key"); size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "keys"); size_t local_data_arg = k.add_arg<value_type *>(memory_object::local_memory, "data"); size_t count_arg = k.add_arg<uint_>("n"); k << // copy data to local memory "for(uint i = 0; i < n; i++){\n" << " keys[i] = " << keys_first[k.var<uint_>("i")] << ";\n" " data[i] = " << values_first[k.var<uint_>("i")] << ";\n" "}\n" // sort data in local memory "for(uint i = 1; i < n; i++){\n" << " " << k.decl<const key_type>("key") << " = keys[i];\n" << " " << k.decl<const value_type>("value") << " = data[i];\n" << " uint pos = i;\n" << " while(pos > 0 && " << compare(k.var<const key_type>("key"), k.var<const key_type>("keys[pos-1]")) << "){\n" << " keys[pos] = keys[pos-1];\n" << " data[pos] = data[pos-1];\n" << " pos--;\n" << " }\n" << " keys[pos] = key;\n" << " data[pos] = value;\n" << "}\n" << // copy sorted data to output "for(uint i = 0; i < n; i++){\n" << " " << keys_first[k.var<uint_>("i")] << " = keys[i];\n" " " << values_first[k.var<uint_>("i")] << " = data[i];\n" "}\n"; const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); kernel.set_arg(local_keys_arg, static_cast<uint_>(count * sizeof(key_type)), 0); kernel.set_arg(local_data_arg, static_cast<uint_>(count * sizeof(value_type)), 0); kernel.set_arg(count_arg, static_cast<uint_>(count)); queue.enqueue_task(kernel); }
/// Seeds the random number generator with \p value. void seed(command_queue &queue, result_type value = default_seed) { kernel seed_kernel = m_program.create_kernel("seed"); seed_kernel.set_arg(0, value); seed_kernel.set_arg(1, m_state_buffer); queue.enqueue_task(seed_kernel); m_state_index = 0; }
inline void serial_insertion_sort(Iterator first, Iterator last, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<Iterator>::value_type T; size_t count = iterator_range_size(first, last); if(count < 2){ return; } meta_kernel k("serial_insertion_sort"); size_t local_data_arg = k.add_arg<T *>(memory_object::local_memory, "data"); size_t count_arg = k.add_arg<uint_>("n"); k << // copy data to local memory "for(uint i = 0; i < n; i++){\n" << " data[i] = " << first[k.var<uint_>("i")] << ";\n" "}\n" // sort data in local memory "for(uint i = 1; i < n; i++){\n" << " " << k.decl<const T>("value") << " = data[i];\n" << " uint pos = i;\n" << " while(pos > 0 && " << compare(k.var<const T>("value"), k.var<const T>("data[pos-1]")) << "){\n" << " data[pos] = data[pos-1];\n" << " pos--;\n" << " }\n" << " data[pos] = value;\n" << "}\n" << // copy sorted data to output "for(uint i = 0; i < n; i++){\n" << " " << first[k.var<uint_>("i")] << " = data[i];\n" "}\n"; const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); kernel.set_arg(local_data_arg, local_buffer<T>(count)); kernel.set_arg(count_arg, static_cast<uint_>(count)); queue.enqueue_task(kernel); }
inline size_t serial_count_if(InputIterator first, InputIterator last, Predicate predicate, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; const context &context = queue.get_context(); size_t size = iterator_range_size(first, last); meta_kernel k("serial_count_if"); k.add_arg<const uint_>("size", size); size_t result_arg = k.add_arg<uint_ *>("__global", "result"); k << "uint count = 0;\n" << "for(uint i = 0; i < size; i++){\n" << k.decl<const value_type>("value") << "=" << first[k.var<uint_>("i")] << ";\n" << "if(" << predicate(k.var<const value_type>("value")) << "){\n" << "count++;\n" << "}\n" "}\n" "*result = count;\n"; kernel kernel = k.compile(context); // setup result buffer buffer result_buffer(context, sizeof(uint_)); kernel.set_arg(result_arg, result_buffer); // run kernel queue.enqueue_task(kernel); // read index return detail::read_single_value<uint_>(result_buffer, queue); }
inline T serial_reduce(InputIterator first, InputIterator last, T init, BinaryFunction function, command_queue &queue) { size_t count = detail::iterator_range_size(first, last); if(count == 0){ return init; } const context &context = queue.get_context(); meta_kernel k("serial_reduce"); size_t init_arg = k.add_arg<T>("init"); size_t count_arg = k.add_arg<cl_uint>("count"); size_t output_arg = k.add_arg<T *>("__global", "output"); k << k.decl<T>("result") << " = init;\n" << "for(uint i = 0; i < count; i++)\n" << " result = " << function(k.var<T>("result"), first[k.var<cl_uint>("i")]) << ";\n" << "*output = result;\n"; kernel kernel = k.compile(context); scalar<T> output(context); kernel.set_arg(init_arg, init); kernel.set_arg(count_arg, static_cast<cl_uint>(count)); kernel.set_arg(output_arg, output.get_buffer()); queue.enqueue_task(kernel); return output.read(queue); }
inline size_t serial_count_if(InputIterator first, InputIterator last, Predicate predicate, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; const context &context = queue.get_context(); size_t size = iterator_range_size(first, last); meta_kernel k("serial_count_if"); k.add_set_arg("size", static_cast<uint_>(size)); size_t result_arg = k.add_arg<uint_ *>(memory_object::global_memory, "result"); k << "uint count = 0;\n" << "for(uint i = 0; i < size; i++){\n" << k.decl<const value_type>("value") << "=" << first[k.var<uint_>("i")] << ";\n" << "if(" << predicate(k.var<const value_type>("value")) << "){\n" << "count++;\n" << "}\n" "}\n" "*result = count;\n"; kernel kernel = k.compile(context); // setup result buffer scalar<uint_> result(context); kernel.set_arg(result_arg, result.get_buffer()); // run kernel queue.enqueue_task(kernel); // read index return result.read(queue); }
size_t reduce(InputIterator first, size_t count, OutputIterator result, size_t block_size, BinaryFunction function, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename boost::compute::result_of<BinaryFunction(input_type, input_type)>::type result_type; const context &context = queue.get_context(); size_t block_count = count / 2 / block_size; size_t total_block_count = static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); if(block_count != 0){ meta_kernel k("block_reduce"); size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); k << "const uint gid = get_global_id(0);\n" << "const uint lid = get_local_id(0);\n" << // copy values to local memory "block[lid] = " << function(first[k.make_var<uint_>("gid*2+0")], first[k.make_var<uint_>("gid*2+1")]) << ";\n" << // perform reduction "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" << " barrier(CLK_LOCAL_MEM_FENCE);\n" << " uint mask = (i << 1) - 1;\n" << " if((lid & mask) == 0){\n" << " block[lid] = " << function(k.expr<input_type>("block[lid]"), k.expr<input_type>("block[lid+i]")) << ";\n" << " }\n" << "}\n" << // write block result to global output "if(lid == 0)\n" << " output[get_group_id(0)] = block[0];\n"; kernel kernel = k.compile(context); kernel.set_arg(output_arg, result.get_buffer()); kernel.set_arg(block_arg, local_buffer<input_type>(block_size)); queue.enqueue_1d_range_kernel(kernel, 0, block_count * block_size, block_size); } // serially reduce any leftovers if(block_count * block_size * 2 < count){ size_t last_block_start = block_count * block_size * 2; meta_kernel k("extra_serial_reduce"); size_t count_arg = k.add_arg<uint_>("count"); size_t offset_arg = k.add_arg<uint_>("offset"); size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); size_t output_offset_arg = k.add_arg<uint_>("output_offset"); k << k.decl<result_type>("result") << " = \n" << first[k.expr<uint_>("offset")] << ";\n" << "for(uint i = offset + 1; i < count; i++)\n" << " result = " << function(k.var<result_type>("result"), first[k.var<uint_>("i")]) << ";\n" << "output[output_offset] = result;\n"; kernel kernel = k.compile(context); kernel.set_arg(count_arg, static_cast<uint_>(count)); kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start)); kernel.set_arg(output_arg, result.get_buffer()); kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count)); queue.enqueue_task(kernel); } return total_block_count; }
inline size_t serial_reduce_by_key(InputKeyIterator keys_first, InputKeyIterator keys_last, InputValueIterator values_first, OutputKeyIterator keys_result, OutputValueIterator values_result, BinaryFunction function, BinaryPredicate predicate, command_queue &queue) { typedef typename std::iterator_traits<InputValueIterator>::value_type value_type; typedef typename std::iterator_traits<InputKeyIterator>::value_type key_type; typedef typename ::boost::compute::result_of<BinaryFunction(value_type, value_type)>::type result_type; const context &context = queue.get_context(); size_t count = detail::iterator_range_size(keys_first, keys_last); if(count < 1){ return count; } meta_kernel k("serial_reduce_by_key"); size_t count_arg = k.add_arg<uint_>("count"); size_t result_size_arg = k.add_arg<uint_ *>(memory_object::global_memory, "result_size"); k << k.decl<result_type>("result") << " = " << values_first[0] << ";\n" << k.decl<key_type>("previous_key") << " = " << keys_first[0] << ";\n" << k.decl<result_type>("value") << ";\n" << k.decl<key_type>("key") << ";\n" << k.decl<uint_>("size") << " = 1;\n" << keys_result[0] << " = previous_key;\n" << values_result[0] << " = result;\n" << "for(ulong i = 1; i < count; i++) {\n" << " value = " << values_first[k.var<uint_>("i")] << ";\n" << " key = " << keys_first[k.var<uint_>("i")] << ";\n" << " if (" << predicate(k.var<key_type>("previous_key"), k.var<key_type>("key")) << ") {\n" << " result = " << function(k.var<result_type>("result"), k.var<result_type>("value")) << ";\n" << " }\n " << " else { \n" << keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" << values_result[k.var<uint_>("size - 1")] << " = result;\n" << " result = value;\n" << " size++;\n" << " } \n" << " previous_key = key;\n" << "}\n" << keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" << values_result[k.var<uint_>("size - 1")] << " = result;\n" << "*result_size = size;"; kernel kernel = k.compile(context); scalar<uint_> result_size(context); kernel.set_arg(result_size_arg, result_size.get_buffer()); kernel.set_arg(count_arg, static_cast<uint_>(count)); queue.enqueue_task(kernel); return static_cast<size_t>(result_size.read(queue)); }
inline void radix_sort_impl(const buffer_iterator<T> first, const buffer_iterator<T> last, const buffer_iterator<T2> values_first, const bool ascending, command_queue &queue) { typedef T value_type; typedef typename radix_sort_value_type<sizeof(T)>::type sort_type; const device &device = queue.get_device(); const context &context = queue.get_context(); // if we have a valid values iterator then we are doing a // sort by key and have to set up the values buffer bool sort_by_key = (values_first.get_buffer().get() != 0); // load (or create) radix sort program std::string cache_key = std::string("__boost_radix_sort_") + type_name<value_type>(); if(sort_by_key){ cache_key += std::string("_with_") + type_name<T2>(); } boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // sort parameters const uint_ k = parameters->get(cache_key, "k", 4); const uint_ k2 = 1 << k; const uint_ block_size = parameters->get(cache_key, "tpb", 128); // sort program compiler options std::stringstream options; options << "-DK_BITS=" << k; options << " -DT=" << type_name<sort_type>(); options << " -DBLOCK_SIZE=" << block_size; if(boost::is_floating_point<value_type>::value){ options << " -DIS_FLOATING_POINT"; } if(boost::is_signed<value_type>::value){ options << " -DIS_SIGNED"; } if(sort_by_key){ options << " -DSORT_BY_KEY"; options << " -DT2=" << type_name<T2>(); options << enable_double<T2>(); } if(ascending){ options << " -DASC"; } // load radix sort program program radix_sort_program = cache->get_or_build( cache_key, options.str(), radix_sort_source, context ); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); size_t count = detail::iterator_range_size(first, last); uint_ block_count = static_cast<uint_>(count / block_size); if(block_count * block_size != count){ block_count++; } // setup temporary buffers vector<value_type> output(count, context); vector<T2> values_output(sort_by_key ? count : 0, context); vector<uint_> offsets(k2, context); vector<uint_> counts(block_count * k2, context); const buffer *input_buffer = &first.get_buffer(); uint_ input_offset = static_cast<uint_>(first.get_index()); const buffer *output_buffer = &output.get_buffer(); uint_ output_offset = 0; const buffer *values_input_buffer = &values_first.get_buffer(); uint_ values_input_offset = static_cast<uint_>(values_first.get_index()); const buffer *values_output_buffer = &values_output.get_buffer(); uint_ values_output_offset = 0; for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ // write counts count_kernel.set_arg(0, *input_buffer); count_kernel.set_arg(1, input_offset); count_kernel.set_arg(2, static_cast<uint_>(count)); count_kernel.set_arg(3, counts); count_kernel.set_arg(4, offsets); count_kernel.set_arg(5, block_size * sizeof(uint_), 0); count_kernel.set_arg(6, i * k); queue.enqueue_1d_range_kernel(count_kernel, 0, block_count * block_size, block_size); // scan counts if(k == 1){ typedef uint2_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 2){ typedef uint4_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 4){ typedef uint16_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else { BOOST_ASSERT(false && "unknown k"); break; } // scan global offsets scan_kernel.set_arg(0, counts); scan_kernel.set_arg(1, offsets); scan_kernel.set_arg(2, block_count); queue.enqueue_task(scan_kernel); // scatter values scatter_kernel.set_arg(0, *input_buffer); scatter_kernel.set_arg(1, input_offset); scatter_kernel.set_arg(2, static_cast<uint_>(count)); scatter_kernel.set_arg(3, i * k); scatter_kernel.set_arg(4, counts); scatter_kernel.set_arg(5, offsets); scatter_kernel.set_arg(6, *output_buffer); scatter_kernel.set_arg(7, output_offset); if(sort_by_key){ scatter_kernel.set_arg(8, *values_input_buffer); scatter_kernel.set_arg(9, values_input_offset); scatter_kernel.set_arg(10, *values_output_buffer); scatter_kernel.set_arg(11, values_output_offset); } queue.enqueue_1d_range_kernel(scatter_kernel, 0, block_count * block_size, block_size); // swap buffers std::swap(input_buffer, output_buffer); std::swap(values_input_buffer, values_output_buffer); std::swap(input_offset, output_offset); std::swap(values_input_offset, values_output_offset); } }
inline void radix_sort(Iterator first, Iterator last, command_queue &queue) { typedef typename std::iterator_traits<Iterator>::value_type value_type; typedef typename radix_sort_value_type<sizeof(value_type)>::type sort_type; const context &context = queue.get_context(); size_t count = detail::iterator_range_size(first, last); // sort parameters const uint_ k = 4; const uint_ k2 = 1 << k; const uint_ block_size = 128; uint_ block_count = count / block_size; if(block_count * block_size != count){ block_count++; } // setup kernels program radix_sort_program = program::create_with_source(radix_sort_source, context); std::stringstream options; options << "-DK=" << k; options << " -DT=" << type_name<sort_type>(); options << " -DBLOCK_SIZE=" << block_size; if(boost::is_floating_point<value_type>::value){ options << " -DIS_FLOATING_POINT"; } if(boost::is_signed<value_type>::value){ options << " -DIS_SIGNED"; } radix_sort_program.build(options.str()); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); // setup temporary buffers vector<value_type> output(count, context); vector<uint_> offsets(k2, context); vector<uint_> counts(block_count * k2, context); const buffer *input_buffer = &first.get_buffer(); const buffer *output_buffer = &output.get_buffer(); for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ // write counts count_kernel.set_arg(0, *input_buffer); count_kernel.set_arg(1, static_cast<uint_>(count)); count_kernel.set_arg(2, counts.get_buffer()); count_kernel.set_arg(3, offsets.get_buffer()); count_kernel.set_arg(4, block_size * sizeof(uint_), 0); count_kernel.set_arg(5, i * k); queue.enqueue_1d_range_kernel(count_kernel, 0, block_count * block_size, block_size); // scan counts if(k == 1){ typedef uint2_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 2){ typedef uint4_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 4){ typedef uint16_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else { BOOST_ASSERT(false && "unknown k"); break; } // scan global offsets scan_kernel.set_arg(0, counts.get_buffer()); scan_kernel.set_arg(1, offsets.get_buffer()); scan_kernel.set_arg(2, block_count); queue.enqueue_task(scan_kernel); // scatter values scatter_kernel.set_arg(0, *input_buffer); scatter_kernel.set_arg(1, static_cast<uint_>(count)); scatter_kernel.set_arg(2, i * k); scatter_kernel.set_arg(3, counts.get_buffer()); scatter_kernel.set_arg(4, offsets.get_buffer()); scatter_kernel.set_arg(5, *output_buffer); queue.enqueue_1d_range_kernel(scatter_kernel, 0, block_count * block_size, block_size); // swap buffers std::swap(input_buffer, output_buffer); } }