inline size_t bitonic_block_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("bitonic_block_sort");
    size_t count_arg = k.add_arg<const uint_>("count");

    size_t local_keys_arg =
        k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = 0;
    if(sort_by_key) {
        local_vals_arg =
            k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
    }

    k <<
        // work-item global and local ids
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n";

    // declare my_key
    k << k.decl<key_type>("my_key") << ";\n";
    // Instead of copying values (my_value) into local memory along with the
    // keys, we save only the local index (uchar) and copy my_value once, at
    // the end, to its final position. This saves local memory.
    if(sort_by_key) {
        k << k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
    }

    // load the key
    k <<
        "if(gid < count) {\n" <<
            k.var<key_type>("my_key") << " = " <<
                keys_first[k.var<const uint_>("gid")] << ";\n" <<
        "}\n";

    // store the key and the index in local memory
    k << "lkeys[lid] = my_key;\n";
    if(sort_by_key) {
        k << "lidx[lid] = my_index;\n";
    }
    k <<
        k.decl<const uint_>("offset") <<
            " = get_group_id(0) * get_local_size(0);\n" <<
        k.decl<const uint_>("n") <<
            " = min((uint)(get_local_size(0)),(count - offset));\n";

    // When the work-group size is a power of two the bitonic sorter can be
    // used; otherwise, the slower odd-even transposition sort is used.
    k <<
        // check if n is a power of two
        "if(((n != 0) && ((n & (~n + 1)) == n))) {\n";

    // bitonic sort, not stable
    k <<
        // wait for keys and indices to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "#pragma unroll\n" <<
        "for(" << k.decl<uint_>("length") << " = 1; " <<
            "length < n; length <<= 1) {\n" <<
            // direction of sort: false -> ascending, true -> descending
            k.decl<bool>("direction") << " = ((lid & (length<<1)) != 0);\n" <<
            "for(" << k.decl<uint_>("k") << " = length; k > 0; k >>= 1) {\n" <<
                // sibling to compare my key with
                k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
                k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
                k.decl<bool>("compare") << " = " <<
                    compare(k.var<key_type>("sibling_key"),
                            k.var<key_type>("my_key")) << ";\n" <<
                k.decl<bool>("equal") << " = !(compare || " <<
                    compare(k.var<key_type>("my_key"),
                            k.var<key_type>("sibling_key")) << ");\n" <<
                k.decl<bool>("swap") <<
                    " = compare ^ (sibling_idx < lid) ^ direction;\n" <<
                "swap = equal ? false : swap;\n" <<
                "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key) {
        k << "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
                "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
                "lkeys[lid] = my_key;\n";
    if(sort_by_key) {
        k << "lidx[lid] = my_index;\n";
    }
    k <<
                "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "}\n" <<
        "}\n";
    // end of bitonic sort

    // odd-even transposition sort, not stable
    k <<
        "}\n" <<
        "else { \n";
    k <<
        k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" <<
        k.decl<uint_>("oddsibling_idx") << " = " <<
            "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" <<
        k.decl<uint_>("evensibling_idx") << " = " <<
            "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" <<
        // wait for keys and indices to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "#pragma unroll\n" <<
        "for(" << k.decl<uint_>("i") << " = 0; i < n; i++) {\n" <<
            k.decl<uint_>("sibling_idx") <<
                " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" <<
            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
            k.decl<bool>("compare") << " = " <<
                compare(k.var<key_type>("sibling_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            k.decl<bool>("equal") << " = !(compare || " <<
                compare(k.var<key_type>("my_key"),
                        k.var<key_type>("sibling_key")) << ");\n" <<
            k.decl<bool>("swap") << " = compare ^ (sibling_idx < lid);\n" <<
            "swap = equal ? false : swap;\n" <<
            "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key) {
        k << "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "lkeys[lid] = my_key;\n";
    if(sort_by_key) {
        k << "lidx[lid] = my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "}\n" << // for
        "}\n";   // else
    // end of odd-even transposition sort

    // save the key and the value
    k <<
        "if(gid < count) {\n" <<
        keys_first[k.var<const uint_>("gid")] << " = " <<
            k.var<key_type>("my_key") << ";\n";
    if(sort_by_key) {
        k <<
            k.decl<value_type>("my_value") << " = " <<
                values_first[k.var<const uint_>("offset + my_index")] << ";\n" <<
            "barrier(CLK_GLOBAL_MEM_FENCE);\n" <<
            values_first[k.var<const uint_>("gid")] << " = my_value;\n";
    }
    k <<
        // end if
        "}\n";

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size =
        pick_bitonic_block_sort_block_size<key_type, uchar_>(
            kernel.get_work_group_info<size_t>(
                device, CL_KERNEL_WORK_GROUP_SIZE
            ),
            device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE),
            sort_by_key
        );

    const size_t global_size =
        work_group_size * static_cast<size_t>(
            std::ceil(float(count) / work_group_size)
        );

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size));
    if(sort_by_key) {
        kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size));
    }

    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
    // return the size of a sorted block
    return work_group_size;
}
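// Illustration (not part of the library): the kernel selects the bitonic path
// with the lowest-set-bit test -- (n & (~n + 1)) == n holds exactly when n is
// a nonzero power of two -- and pairs work items via lid ^ k. A host-side
// sketch of one compare-exchange pass, for intuition only:
#include <cstdint>
#include <utility>
#include <vector>

void bitonic_pass_sketch(std::vector<int> &a, uint32_t length, uint32_t k)
{
    // a.size() is assumed to be a power of two, mirroring the kernel's check
    for(uint32_t lid = 0; lid < a.size(); lid++){
        uint32_t sibling = lid ^ k;   // partner index, as in the kernel
        if(sibling <= lid) continue;  // visit each pair once
        bool descending = (lid & (length << 1)) != 0; // sort direction
        if((a[lid] > a[sibling]) != descending){
            std::swap(a[lid], a[sibling]);
        }
    }
}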
InputIterator find_extrema_with_reduce(InputIterator first,
                                       InputIterator last,
                                       ::boost::compute::less<
                                           typename std::iterator_traits<
                                               InputIterator
                                           >::value_type
                                       > compare,
                                       const bool find_minimum,
                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    // get information about the device associated with the queue
    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();

    const size_t count = detail::iterator_range_size(first, last);

    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
        + type_name<input_type>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // get the preferred work-group size and the preferred number
    // of work groups per compute unit
    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);

    // calculate the work-group size and the number of work groups
    work_group_size = (std::min)(max_work_group_size, work_group_size);
    size_t work_groups_no = compute_units_no * work_groups_per_cu;
    work_groups_no = (std::min)(
        work_groups_no,
        static_cast<size_t>(std::ceil(float(count) / work_group_size))
    );

    // phase I: find candidates for the extremum

    // device buffers for extremum candidates and their indices;
    // each work group computes one candidate. Zero-copy buffers are
    // used to avoid copying the data back to the host.
    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
        candidates(work_groups_no, context);
    vector<uint_, ::boost::compute::pinned_allocator<uint_> >
        candidates_idx(work_groups_no, context);

    // find the extremum candidates and their indices
    find_extrema_with_reduce(
        first, count, candidates.begin(), candidates_idx.begin(),
        work_groups_no, work_group_size, compare, find_minimum, queue
    );

    // phase II: find the extremum among the candidates

    // map the candidates and their indices into host memory
    input_type* candidates_host_ptr =
        static_cast<input_type*>(
            queue.enqueue_map_buffer(
                candidates.get_buffer(), command_queue::map_read,
                0, work_groups_no * sizeof(input_type)
            )
        );
    uint_* candidates_idx_host_ptr =
        static_cast<uint_*>(
            queue.enqueue_map_buffer(
                candidates_idx.get_buffer(), command_queue::map_read,
                0, work_groups_no * sizeof(uint_)
            )
        );

    input_type* i = candidates_host_ptr;
    uint_* idx = candidates_idx_host_ptr;
    uint_* extremum_idx = idx;
    input_type extremum = *candidates_host_ptr;
    i++; idx++;

    // find the extremum among the candidates serially on the host;
    // ties are broken in favor of the smaller original index
    if(!find_minimum) {
        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
            input_type next = *i;
            bool compare_result = next > extremum;
            bool equal = next == extremum;
            extremum = compare_result ? next : extremum;
            extremum_idx = compare_result ? idx : extremum_idx;
            extremum_idx = equal
                ? ((*extremum_idx < *idx) ? extremum_idx : idx)
                : extremum_idx;
            idx++, i++;
        }
    }
    else {
        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
            input_type next = *i;
            bool compare_result = next < extremum;
            bool equal = next == extremum;
            extremum = compare_result ? next : extremum;
            extremum_idx = compare_result ? idx : extremum_idx;
            extremum_idx = equal
                ? ((*extremum_idx < *idx) ? extremum_idx : idx)
                : extremum_idx;
            idx++, i++;
        }
    }

    return first + static_cast<difference_type>(*extremum_idx);
}
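// Illustration (not part of the library): the host-side loop above breaks
// ties by preferring the smaller original index. The same rule, condensed,
// assuming plain host arrays (the find_minimum == false case):
#include <cstddef>

std::size_t pick_max_candidate_sketch(const float *vals, const unsigned *idx,
                                      std::size_t n)
{
    std::size_t best = 0;
    for(std::size_t i = 1; i < n; i++){
        if(vals[i] > vals[best] ||
           (vals[i] == vals[best] && idx[i] < idx[best])){
            best = i;
        }
    }
    return best; // the caller would return first + idx[best]
}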
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename boost::compute::result_of<
        BinaryFunction(input_type, input_type)
    >::type result_type;

    const context &context = queue.get_context();
    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg =
            k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg =
            k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform the reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                         function(k.expr<input_type>("block[lid]"),
                                  k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write the block result to the global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg =
            k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                     function(k.var<result_type>("result"),
                              first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
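// Worked example (values assumed, not from the source): each work item of
// block_reduce folds two inputs, so one pass covers 2 * block_size elements
// per block. With count = 1000 and block_size = 64, block_count = 1000 / 128
// = 7 full blocks cover 896 elements, total_block_count = ceil(1000 / 128.0)
// = 8, and extra_serial_reduce folds elements 896..999 into output slot 7
// (output_offset == block_count).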
/// Returns raw context id for the given queue.
inline context_id get_context_id(const command_queue &q) {
    return q.get_context().get();
}
inline void inplace_reduce(Iterator first,
                           Iterator last,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename std::iterator_traits<Iterator>::value_type value_type;

    size_t input_size = iterator_range_size(first, last);
    if(input_size < 2){
        return;
    }

    const context &context = queue.get_context();

    size_t block_size = 64;
    size_t values_per_thread = 8;
    size_t block_count = input_size / (block_size * values_per_thread);
    if(block_count * block_size * values_per_thread != input_size)
        block_count++;

    vector<value_type> output(block_count, context);

    meta_kernel k("inplace_reduce");
    size_t input_arg =
        k.add_arg<value_type *>(memory_object::global_memory, "input");
    size_t input_size_arg = k.add_arg<const uint_>("input_size");
    size_t output_arg =
        k.add_arg<value_type *>(memory_object::global_memory, "output");
    size_t scratch_arg =
        k.add_arg<value_type *>(memory_object::local_memory, "scratch");
    k <<
        "const uint gid = get_global_id(0);\n" <<
        "const uint lid = get_local_id(0);\n" <<
        "const uint values_per_thread =\n" <<
            uint_(values_per_thread) << ";\n" <<

        // per-thread reduction
        "const uint index = gid * values_per_thread;\n" <<
        "if(index < input_size){\n" <<
            k.decl<value_type>("sum") << " = input[index];\n" <<
            "for(uint i = 1;\n" <<
                "i < values_per_thread && (index + i) < input_size;\n" <<
                "i++){\n" <<
            "    sum = " <<
                     function(k.var<value_type>("sum"),
                              k.var<value_type>("input[index+i]")) << ";\n" <<
            "}\n" <<
            "scratch[lid] = sum;\n" <<
        "}\n" <<

        // local reduction
        "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "    uint mask = (i << 1) - 1;\n" <<
        "    uint next_index = (gid + i) * values_per_thread;\n" <<
        "    if((lid & mask) == 0 && next_index < input_size){\n" <<
        "        scratch[lid] = " <<
                     function(k.var<value_type>("scratch[lid]"),
                              k.var<value_type>("scratch[lid+i]")) << ";\n" <<
        "    }\n" <<
        "}\n" <<

        // write the output for this block
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    kernel kernel = k.compile(context);

    while(input_size > 1){
        kernel.set_arg(input_arg, *input_buffer);
        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
        kernel.set_arg(output_arg, *output_buffer);
        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        input_size = static_cast<size_t>(
            std::ceil(float(input_size) / (block_size * values_per_thread))
        );

        block_count = input_size / (block_size * values_per_thread);
        if(block_count * block_size * values_per_thread != input_size)
            block_count++;

        std::swap(input_buffer, output_buffer);
    }

    if(input_buffer != &first.get_buffer()){
        ::boost::compute::copy(output.begin(),
                               output.begin() + 1,
                               first,
                               queue);
    }
}
/// Create command queue on the same context and device as the given one.
inline command_queue duplicate_queue(const command_queue &q) {
    return command_queue(q.get_context(), q.get_device(), q.get_properties());
}
/// Returns device associated with the given queue.
inline device get_device(const command_queue &q) {
    return q.get_device();
}
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    if(first == last){
        return result;
    }

    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();

    // create the scan kernel
    meta_kernel k("scan_on_cpu");

    // arguments
    size_t n_arg = k.add_arg<ulong_>("n");
    size_t init_arg = k.add_arg<output_type>("initial_value");

    if(!exclusive){
        k <<
            k.decl<const ulong_>("start_idx") << " = 1;\n" <<
            k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
            result[0] << " = sum;\n";
    }
    else {
        k <<
            k.decl<const ulong_>("start_idx") << " = 0;\n" <<
            k.decl<output_type>("sum") << " = initial_value;\n";
    }

    k <<
        "for(ulong i = start_idx; i < n; i++){\n" <<
        k.decl<const input_type>("x") << " = " <<
            first[k.var<ulong_>("i")] << ";\n";

    if(exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "    sum = " << op(k.var<output_type>("sum"),
                            k.var<output_type>("x")) << ";\n";

    if(!exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "}\n";

    // compile the scan kernel
    kernel scan_kernel = k.compile(context);

    // set up the kernel arguments
    size_t n = detail::iterator_range_size(first, last);
    scan_kernel.set_arg<ulong_>(n_arg, n);
    scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));

    // execute the kernel as a single serial work item
    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);

    // return an iterator pointing to the end of the result range
    return result + n;
}
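// Illustration (not part of the library): the generated kernel is a textbook
// serial scan; exclusive writes the running sum before folding in the next
// element, inclusive writes it after. A host-side sketch of the same loop:
#include <cstddef>
#include <vector>

template<class T, class Op>
void scan_sketch(const std::vector<T> &in, std::vector<T> &out,
                 bool exclusive, T init, Op op)
{
    if(in.empty()) return;
    T sum = exclusive ? init : in[0];
    std::size_t start = exclusive ? 0 : 1;
    if(!exclusive) out[0] = sum;
    for(std::size_t i = start; i < in.size(); i++){
        if(exclusive) out[i] = sum;   // write, then accumulate
        sum = op(sum, in[i]);
        if(!exclusive) out[i] = sum;  // accumulate, then write
    }
}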
inline void radix_sort_impl(const buffer_iterator<T> first,
                            const buffer_iterator<T> last,
                            const buffer_iterator<T2> values_first,
                            const bool ascending,
                            command_queue &queue)
{
    typedef T value_type;
    typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;

    const device &device = queue.get_device();
    const context &context = queue.get_context();

    // if we have a valid values iterator then we are doing a
    // sort by key and have to set up the values buffer
    bool sort_by_key = (values_first.get_buffer().get() != 0);

    // load (or create) the radix sort program
    std::string cache_key =
        std::string("__boost_radix_sort_") + type_name<value_type>();
    if(sort_by_key){
        cache_key += std::string("_with_") + type_name<T2>();
    }

    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // sort parameters
    const uint_ k = parameters->get(cache_key, "k", 4);
    const uint_ k2 = 1 << k;
    const uint_ block_size = parameters->get(cache_key, "tpb", 128);

    // sort program compiler options
    std::stringstream options;
    options << "-DK_BITS=" << k;
    options << " -DT=" << type_name<sort_type>();
    options << " -DBLOCK_SIZE=" << block_size;
    if(boost::is_floating_point<value_type>::value){
        options << " -DIS_FLOATING_POINT";
    }
    if(boost::is_signed<value_type>::value){
        options << " -DIS_SIGNED";
    }
    if(sort_by_key){
        options << " -DSORT_BY_KEY";
        options << " -DT2=" << type_name<T2>();
        options << enable_double<T2>();
    }
    if(ascending){
        options << " -DASC";
    }

    // load the radix sort program
    program radix_sort_program = cache->get_or_build(
        cache_key, options.str(), radix_sort_source, context
    );

    kernel count_kernel(radix_sort_program, "count");
    kernel scan_kernel(radix_sort_program, "scan");
    kernel scatter_kernel(radix_sort_program, "scatter");

    size_t count = detail::iterator_range_size(first, last);

    uint_ block_count = static_cast<uint_>(count / block_size);
    if(block_count * block_size != count){
        block_count++;
    }

    // set up temporary buffers
    vector<value_type> output(count, context);
    vector<T2> values_output(sort_by_key ? count : 0, context);
    vector<uint_> offsets(k2, context);
    vector<uint_> counts(block_count * k2, context);

    const buffer *input_buffer = &first.get_buffer();
    uint_ input_offset = static_cast<uint_>(first.get_index());
    const buffer *output_buffer = &output.get_buffer();
    uint_ output_offset = 0;
    const buffer *values_input_buffer = &values_first.get_buffer();
    uint_ values_input_offset = static_cast<uint_>(values_first.get_index());
    const buffer *values_output_buffer = &values_output.get_buffer();
    uint_ values_output_offset = 0;

    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
        // write counts
        count_kernel.set_arg(0, *input_buffer);
        count_kernel.set_arg(1, input_offset);
        count_kernel.set_arg(2, static_cast<uint_>(count));
        count_kernel.set_arg(3, counts);
        count_kernel.set_arg(4, offsets);
        count_kernel.set_arg(5, block_size * sizeof(uint_), 0);
        count_kernel.set_arg(6, i * k);
        queue.enqueue_1d_range_kernel(count_kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // scan counts
        if(k == 1){
            typedef uint2_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(),
                                                   counts.size() / 2),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 2){
            typedef uint4_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(),
                                                   counts.size() / 4),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 4){
            typedef uint16_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(),
                                                   counts.size() / 16),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else {
            BOOST_ASSERT(false && "unknown k");
            break;
        }

        // scan global offsets
        scan_kernel.set_arg(0, counts);
        scan_kernel.set_arg(1, offsets);
        scan_kernel.set_arg(2, block_count);
        queue.enqueue_task(scan_kernel);

        // scatter values
        scatter_kernel.set_arg(0, *input_buffer);
        scatter_kernel.set_arg(1, input_offset);
        scatter_kernel.set_arg(2, static_cast<uint_>(count));
        scatter_kernel.set_arg(3, i * k);
        scatter_kernel.set_arg(4, counts);
        scatter_kernel.set_arg(5, offsets);
        scatter_kernel.set_arg(6, *output_buffer);
        scatter_kernel.set_arg(7, output_offset);
        if(sort_by_key){
            scatter_kernel.set_arg(8, *values_input_buffer);
            scatter_kernel.set_arg(9, values_input_offset);
            scatter_kernel.set_arg(10, *values_output_buffer);
            scatter_kernel.set_arg(11, values_output_offset);
        }
        queue.enqueue_1d_range_kernel(scatter_kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // swap buffers
        std::swap(input_buffer, output_buffer);
        std::swap(values_input_buffer, values_output_buffer);
        std::swap(input_offset, output_offset);
        std::swap(values_input_offset, values_output_offset);
    }
}
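// Illustration (not part of the library): each pass of the loop above sorts
// by one k-bit digit, so a 32-bit key with the default k = 4 takes eight
// count/scan/scatter rounds. The digit pass i inspects is simply:
#include <cstdint>

inline uint32_t radix_digit_sketch(uint32_t key, uint32_t pass, uint32_t k)
{
    return (key >> (pass * k)) & ((1u << k) - 1u); // k2 - 1 mask
}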
void start() {
    _actorStarted = true;
    _actorQueue.start();
    _actorThread = std::thread(
        &actor<COMMAND, RETURN_VALUE>::_entry_point, this
    );
}
void stop() {
    _actorStarted = false;
    _actorQueue.stop();
    _actorThread.join();
}
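// For context, a generic worker-loop sketch matching this start/stop pair;
// the pop/_process names are assumptions, only _actorStarted, _actorQueue
// and _actorThread come from the snippets above. stop() unblocks the queue
// before join(), so the worker cannot deadlock waiting for a command.
template<typename COMMAND, typename RETURN_VALUE>
void actor<COMMAND, RETURN_VALUE>::_entry_point()
{
    COMMAND cmd;
    // pop() is assumed to block until the next command arrives and to
    // return false once the queue has been stopped
    while(_actorStarted && _actorQueue.pop(cmd)){
        _process(cmd); // assumed per-command handler
    }
}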
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    const size_t compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    std::string cache_key =
        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));

    // for inputs smaller than serial_scan_threshold
    // the serial_scan algorithm is used
    uint_ serial_scan_threshold =
        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
    serial_scan_threshold =
        (std::max)(serial_scan_threshold, uint_(compute_units));

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return result;
    }
    else if(count < serial_scan_threshold) {
        return serial_scan(first, last, result, exclusive, init, op, queue);
    }

    buffer block_partial_sums(context, sizeof(output_type) * compute_units);

    // create the block scan kernel
    meta_kernel k("scan_on_cpu_block_scan");

    // arguments
    size_t count_arg = k.add_arg<uint_>("count");
    size_t init_arg = k.add_arg<output_type>("initial_value");
    size_t block_partial_sums_arg =
        k.add_arg<output_type *>(memory_object::global_memory,
                                 "block_partial_sums");

    k <<
        "uint block = " <<
            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n";

    if(!exclusive){
        k <<
            k.decl<output_type>("sum") << " = " <<
                first[k.var<uint_>("index")] << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n" <<
            "index++;\n";
    }
    else {
        k <<
            k.decl<output_type>("sum") << ";\n" <<
            "if(index == 0){\n" <<
            "    sum = initial_value;\n" <<
            "}\n" <<
            "else {\n" <<
            "    sum = " << first[k.var<uint_>("index")] << ";\n" <<
            "    index++;\n" <<
            "}\n";
    }

    k <<
        "while(index < end){\n" <<
            // load the next value
            k.decl<const input_type>("value") << " = " <<
                first[k.var<uint_>("index")] << ";\n";

    if(exclusive){
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }
    k <<
        "sum = " << op(k.var<output_type>("sum"),
                       k.var<output_type>("value")) << ";\n";

    if(!exclusive){
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }

    k <<
            "index++;\n" <<
        "}\n" << // end while
        "block_partial_sums[get_global_id(0)] = sum;\n";

    // compile the block scan kernel
    kernel block_scan_kernel = k.compile(context);

    // set up the kernel arguments
    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    size_t global_work_size = compute_units;
    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);

    // with a single compute unit the scan is already done
    if(compute_units < 2) {
        return result + count;
    }

    // final scan kernel
    meta_kernel l("scan_on_cpu_final_scan");

    // arguments
    count_arg = l.add_arg<uint_>("count");
    block_partial_sums_arg =
        l.add_arg<output_type *>(memory_object::global_memory,
                                 "block_partial_sums");

    l <<
        "uint block = " <<
            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
        "uint index = block + get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n" <<
        l.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
        "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
        "    sum = " << op(l.var<output_type>("sum"),
                           l.var<output_type>("block_partial_sums[i + 1]")) <<
            ";\n" <<
        "}\n" <<
        "while(index < end){\n";

    if(exclusive){
        l <<
            l.decl<output_type>("value") << " = " <<
                first[l.var<uint_>("index")] << ";\n" <<
            result[l.var<uint_>("index")] << " = sum;\n" <<
            "sum = " << op(l.var<output_type>("sum"),
                           l.var<output_type>("value")) << ";\n";
    }
    else {
        l <<
            "sum = " << op(l.var<output_type>("sum"),
                           first[l.var<uint_>("index")]) << ";\n" <<
            result[l.var<uint_>("index")] << " = sum;\n";
    }
    l <<
            "index++;\n" <<
        "}\n";

    // compile the final scan kernel
    kernel final_scan_kernel = l.compile(context);

    // set up the kernel arguments
    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    global_work_size = compute_units;
    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);

    // return an iterator pointing to the end of the result range
    return result + count;
}
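// Worked example (numbers assumed, not from the source): with compute_units
// = 4 and count = 1000, the range splits into five blocks of ceil(1000 / 5)
// = 200 elements. In the first kernel, work item g scans block g (only g = 0
// writes its results, the others record partial sums); in the second kernel,
// work item g scans block g + 1 seeded with the combined partial sums of
// blocks 0..g, so every element is written exactly once.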
/// Create command queue on the same context and device as the given one.
inline command_queue duplicate_queue(const command_queue &q) {
    return command_queue(q.context(), q.device(), q.flags());
}
/// Binds the specified CUDA context to the calling CPU thread.
inline void select_context(const command_queue &q) {
    q.context().set_current();
}
static inline std::vector<uint> pad_vector(command_queue &q, const V &v, uint x) {
    std::vector<uint> w{v.begin(), v.end()};
    w.resize(q.device().max_block_size().size(), x);
    return w;
}
/// Create and build a program from source string.
inline vex::backend::program build_sources(
        const command_queue &queue, const std::string &source,
        const std::string &options = ""
        )
{
#ifdef VEXCL_SHOW_KERNELS
    std::cout << source << std::endl;
#else
    if (getenv("VEXCL_SHOW_KERNELS"))
        std::cout << source << std::endl;
#endif

    std::string compile_options = options + " " + get_compile_options(queue);

    queue.context().set_current();

    auto cc = queue.device().compute_capability();
    std::ostringstream ccstr;
    ccstr << std::get<0>(cc) << std::get<1>(cc);

    sha1_hasher sha1;
    sha1.process(source)
        .process(queue.device().name())
        .process(compile_options)
        .process(ccstr.str())
        ;

    std::string hash = static_cast<std::string>(sha1);

    // Write the source to a .cu file.
    std::string basename = program_binaries_path(hash, true) + "kernel";
    std::string ptxfile  = basename + ".ptx";

    if (!boost::filesystem::exists(ptxfile)) {
        std::string cufile = basename + ".cu";
        {
            std::ofstream f(cufile);
            f << source;
        }

        // Compile the source to PTX.
        std::ostringstream cmdline;
        cmdline << "nvcc -ptx -O3"
                << " -arch=sm_" << std::get<0>(cc) << std::get<1>(cc)
                << " " << compile_options
                << " -o " << ptxfile
                << " " << cufile;
        if (0 != system(cmdline.str().c_str())) {
#ifndef VEXCL_SHOW_KERNELS
            std::cerr << source << std::endl;
#endif
            vex::detail::print_backtrace();
            throw std::runtime_error("nvcc invocation failed");
        }
    }

    // Load the compiled PTX.
    CUmodule prg;
    cuda_check( cuModuleLoad(&prg, ptxfile.c_str()) );

    return program(queue.context(), prg);
}
bool operator()(const command_queue &a, const command_queue &b) const {
    return a.get() < b.get();
}
/// Standard number of workgroups to launch on a device.
static inline size_t num_workgroups(const command_queue &q) {
    return 8 * q.device().multiprocessor_count();
}
/// Checks if the compute device is CPU.
inline bool is_cpu(const command_queue &q) {
    return q.get_device().get_info<cl_device_type>(CL_DEVICE_TYPE)
        & CL_DEVICE_TYPE_CPU;
}
/// The size in bytes of shared memory per block available for this kernel.
size_t max_shared_memory_per_block(const command_queue &q) const {
    return q.device().max_shared_memory_per_block() - shared_size_bytes();
}
/// Returns id of the device associated with the given queue.
inline device_id get_device_id(const command_queue &q) {
    return q.get_device().get();
}
inline size_t serial_reduce_by_key(InputKeyIterator keys_first,
                                   InputKeyIterator keys_last,
                                   InputValueIterator values_first,
                                   OutputKeyIterator keys_result,
                                   OutputValueIterator values_result,
                                   BinaryFunction function,
                                   BinaryPredicate predicate,
                                   command_queue &queue)
{
    typedef typename std::iterator_traits<InputValueIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputKeyIterator>::value_type key_type;
    typedef typename ::boost::compute::result_of<
        BinaryFunction(value_type, value_type)
    >::type result_type;

    const context &context = queue.get_context();
    size_t count = detail::iterator_range_size(keys_first, keys_last);
    if(count < 1){
        return count;
    }

    meta_kernel k("serial_reduce_by_key");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t result_size_arg =
        k.add_arg<uint_ *>(memory_object::global_memory, "result_size");

    k <<
        k.decl<result_type>("result") << " = " << values_first[0] << ";\n" <<
        k.decl<key_type>("previous_key") << " = " << keys_first[0] << ";\n" <<
        k.decl<result_type>("value") << ";\n" <<
        k.decl<key_type>("key") << ";\n" <<
        k.decl<uint_>("size") << " = 1;\n" <<
        keys_result[0] << " = previous_key;\n" <<
        values_result[0] << " = result;\n" <<
        "for(ulong i = 1; i < count; i++) {\n" <<
        "    value = " << values_first[k.var<uint_>("i")] << ";\n" <<
        "    key = " << keys_first[k.var<uint_>("i")] << ";\n" <<
        "    if (" << predicate(k.var<key_type>("previous_key"),
                                k.var<key_type>("key")) << ") {\n" <<
        "        result = " << function(k.var<result_type>("result"),
                                        k.var<result_type>("value")) << ";\n" <<
        "    }\n" <<
        "    else {\n" <<
                 keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" <<
                 values_result[k.var<uint_>("size - 1")] << " = result;\n" <<
        "        result = value;\n" <<
        "        size++;\n" <<
        "    }\n" <<
        "    previous_key = key;\n" <<
        "}\n" <<
        keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" <<
        values_result[k.var<uint_>("size - 1")] << " = result;\n" <<
        "*result_size = size;";

    kernel kernel = k.compile(context);

    scalar<uint_> result_size(context);
    kernel.set_arg(result_size_arg, result_size.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));

    queue.enqueue_task(kernel);

    return static_cast<size_t>(result_size.read(queue));
}
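// Illustration (not part of the library): the single-task kernel above
// collapses each run of predicate-equal keys to one (key, reduced value)
// pair. Host-side sketch with function = plus and predicate = equality:
// keys {1,1,2,2,2,3}, values {1,2,3,4,5,6} -> keys {1,2,3}, values {3,12,6}.
#include <cstddef>
#include <vector>

template<class K, class V>
std::size_t reduce_by_key_sketch(const std::vector<K> &keys,
                                 const std::vector<V> &vals,
                                 std::vector<K> &out_keys,
                                 std::vector<V> &out_vals)
{
    if(keys.empty()) return 0;
    K prev = keys[0];
    V acc = vals[0];
    for(std::size_t i = 1; i < keys.size(); i++){
        if(keys[i] == prev){
            acc += vals[i];            // same run: accumulate
        } else {
            out_keys.push_back(prev);  // run ended: emit the pair
            out_vals.push_back(acc);
            acc = vals[i];
        }
        prev = keys[i];
    }
    out_keys.push_back(prev);
    out_vals.push_back(acc);
    return out_keys.size();
}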
/// Returns context for the given queue.
inline context get_context(const command_queue &q) {
    return q.get_context();
}
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") <<
            " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = " <<
            "input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private (per work item) reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB];\n" <<
        "    }\n" <<
        "}\n" <<
        "scratch[lid] = sum;\n";

    // select a local reduction body specialized for the device vendor
    if(is_nvidia_device(device))
        k << ReduceBody<T, true>::body();
    else
        k << ReduceBody<T, false>::body();

    k <<
        // write the group's sum to the output
        "if(lid == 0){\n" <<
        "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    std::string cache_key =
        std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);
    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load the program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);
    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create the reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from the input into the ping buffer
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update the count after the initial reduction
    count = std::ceil(float(count) / vpt / tpb);

    // middle pass(es), reduce between the ping and pong buffers
    const buffer *input_buffer = &ping;
    buffer pong(context, count / vpt / tpb * sizeof(T));
    const buffer *output_buffer = &pong;

    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            size_t work_size = std::ceil(float(count) / vpt);
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = std::ceil(float(count) / vpt / tpb);
        }
    }

    // final pass, reduce from ping/pong into the result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));
    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}
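// Worked example (numbers assumed, not from the source): with the default
// vpt = 8 and tpb = 128 each pass shrinks the input by a factor of vpt * tpb
// = 1024. For 4,000,000 elements: the initial pass leaves ceil(4000000 /
// 1024) = 3907 partial sums in ping, one ping/pong pass leaves 4, and the
// final single-work-group launch reduces those into result.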
InputIterator find_extrema_with_reduce(InputIterator first,
                                       InputIterator last,
                                       Compare compare,
                                       const bool find_minimum,
                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    // get information about the device associated with the queue
    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();

    const size_t count = detail::iterator_range_size(first, last);

    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
        + type_name<input_type>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // get the preferred work-group size and the preferred number
    // of work groups per compute unit
    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);

    // calculate the work-group size and the number of work groups
    work_group_size = (std::min)(max_work_group_size, work_group_size);
    size_t work_groups_no = compute_units_no * work_groups_per_cu;
    work_groups_no = (std::min)(
        work_groups_no,
        static_cast<size_t>(std::ceil(float(count) / work_group_size))
    );

    // phase I: find candidates for the extremum

    // device buffers for extremum candidates and their indices;
    // each work group computes one candidate
    vector<input_type> candidates(work_groups_no, context);
    vector<uint_> candidates_idx(work_groups_no, context);

    // find the extremum candidates and their indices
    find_extrema_with_reduce(
        first, count, candidates.begin(), candidates_idx.begin(),
        work_groups_no, work_group_size, compare, find_minimum, queue
    );

    // phase II: find the extremum among the candidates

    // zero-copy buffers for the final result (value and index)
    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
        result(1, context);
    vector<uint_, ::boost::compute::pinned_allocator<uint_> >
        result_idx(1, context);

    // reduce the candidates to the final extremum
    find_extrema_with_reduce(
        candidates.begin(), candidates_idx.begin(), work_groups_no,
        result.begin(), result_idx.begin(), 1, work_group_size, compare,
        find_minimum, true, queue
    );

    // map the extremum index into host memory
    uint_* result_idx_host_ptr =
        static_cast<uint_*>(
            queue.enqueue_map_buffer(
                result_idx.get_buffer(), command_queue::map_read,
                0, sizeof(uint_)
            )
        );

    return first + static_cast<difference_type>(*result_idx_host_ptr);
}
inline InputIterator find_extrema_on_cpu(InputIterator first,
                                         InputIterator last,
                                         Compare compare,
                                         const bool find_minimum,
                                         command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    size_t count = iterator_range_size(first, last);

    const device &device = queue.get_device();
    const uint_ compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);
    std::string cache_key =
        "__boost_find_extrema_cpu_"
        + boost::lexical_cast<std::string>(sizeof(input_type));

    // for inputs smaller than serial_find_extrema_threshold
    // the serial_find_extrema algorithm is used
    uint_ serial_find_extrema_threshold = parameters->get(
        cache_key, "serial_find_extrema_threshold",
        16384 * sizeof(input_type)
    );
    serial_find_extrema_threshold =
        (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units));

    const context &context = queue.get_context();
    if(count < serial_find_extrema_threshold) {
        return serial_find_extrema(first, last, compare, find_minimum, queue);
    }

    meta_kernel k("find_extrema_on_cpu");
    buffer output(context, sizeof(input_type) * compute_units);
    buffer output_idx(
        context, sizeof(uint_) * compute_units,
        buffer::read_write | buffer::alloc_host_ptr
    );

    size_t count_arg = k.add_arg<uint_>("count");
    size_t output_arg =
        k.add_arg<input_type *>(memory_object::global_memory, "output");
    size_t output_idx_arg =
        k.add_arg<uint_ *>(memory_object::global_memory, "output_idx");

    k <<
        "uint block = " <<
            "(uint)ceil(((float)count)/get_global_size(0));\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n" <<
        "uint value_index = index;\n" <<
        k.decl<input_type>("value") << " = " <<
            first[k.var<uint_>("index")] << ";\n" <<
        "index++;\n" <<
        "while(index < end){\n" <<
            k.decl<input_type>("candidate") << " = " <<
                first[k.var<uint_>("index")] << ";\n" <<
        "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "bool compare = " << compare(k.var<input_type>("candidate"),
                                         k.var<input_type>("value")) << ";\n" <<
        "#else\n" <<
            "bool compare = " << compare(k.var<input_type>("value"),
                                         k.var<input_type>("candidate")) << ";\n" <<
        "#endif\n" <<
            "value = compare ? candidate : value;\n" <<
            "value_index = compare ? index : value_index;\n" <<
            "index++;\n" <<
        "}\n" <<
        "output[get_global_id(0)] = value;\n" <<
        "output_idx[get_global_id(0)] = value_index;\n";

    size_t global_work_size = compute_units;
    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }

    kernel kernel = k.compile(context, options);
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(output_arg, output);
    kernel.set_arg(output_idx_arg, output_idx);
    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);

    // find the extremum among the per-work-item results
    buffer_iterator<input_type> result = serial_find_extrema(
        make_buffer_iterator<input_type>(output),
        make_buffer_iterator<input_type>(output, global_work_size),
        compare, find_minimum, queue
    );

    // map the indices into host memory and translate the winner's
    // position back to an index into the input range
    uint_* output_idx_host_ptr =
        static_cast<uint_*>(
            queue.enqueue_map_buffer(
                output_idx, command_queue::map_read,
                0, global_work_size * sizeof(uint_)
            )
        );

    difference_type extremum_idx =
        static_cast<difference_type>(*(output_idx_host_ptr + result.get_index()));
    return first + extremum_idx;
}
inline void find_extrema_with_reduce(InputIterator input,
                                     vector<uint_>::iterator input_idx,
                                     size_t count,
                                     ResultIterator result,
                                     vector<uint_>::iterator result_idx,
                                     size_t work_groups_no,
                                     size_t work_group_size,
                                     Compare compare,
                                     const bool find_minimum,
                                     const bool use_input_idx,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();

    meta_kernel k("find_extrema_reduce");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t block_arg =
        k.add_arg<input_type *>(memory_object::local_memory, "block");
    size_t block_idx_arg =
        k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");

    k <<
        // work-item global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<

        // index of the element that will be read from the input buffer
        k.decl<uint_>("idx") << " = gid;\n" <<

        k.decl<input_type>("acc") << ";\n" <<
        k.decl<uint_>("acc_idx") << ";\n" <<
        "if(gid < count) {\n" <<
            // real index of the currently best element
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.var<uint_>("acc_idx") << " = " <<
                input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#else\n" <<
            k.var<uint_>("acc_idx") << " = idx;\n" <<
            "#endif\n" <<

            // initialize the accumulator with first[get_global_id(0)]
            "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n" <<

        k.decl<bool>("compare_result") << ";\n" <<
        k.decl<bool>("equal") << ";\n\n" <<
        "while( idx < count ){\n" <<
            // next element
            k.decl<input_type>("next") << " = " <<
                input[k.var<uint_>("idx")] << ";\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.decl<uint_>("next_idx") << " = " <<
                input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#endif\n" <<

            // compare the currently best element (acc) with the next one
            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "compare_result = " << compare(k.var<input_type>("next"),
                                           k.var<input_type>("acc")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" <<
                compare(k.var<input_type>("acc"),
                        k.var<input_type>("next")) << ";\n" <<
            "# endif\n" <<
            "#else\n" <<
            "compare_result = " << compare(k.var<input_type>("acc"),
                                           k.var<input_type>("next")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" <<
                compare(k.var<input_type>("next"),
                        k.var<input_type>("acc")) << ";\n" <<
            "# endif\n" <<
            "#endif\n" <<

            // save the winner
            "acc = compare_result ? acc : next;\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "acc_idx = compare_result ? " <<
                "acc_idx : " <<
                "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
            "#else\n" <<
            "acc_idx = compare_result ? acc_idx : idx;\n" <<
            "#endif\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n\n" <<

        // work-item local id
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "block[lid] = acc;\n" <<
        "block_idx[lid] = acc_idx;\n" <<
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        k.decl<uint_>("group_offset") <<
            " = count - (get_local_size(0) * get_group_id(0));\n\n";

    k <<
        "#pragma unroll\n" <<
        "for(" << k.decl<uint_>("offset") << " = " <<
            uint_(work_group_size) << " / 2; offset > 0; " <<
            "offset = offset / 2) {\n" <<
            "if((lid < offset) && ((lid + offset) < group_offset)) {\n" <<
                k.decl<input_type>("mine") << " = block[lid];\n" <<
                k.decl<input_type>("other") << " = block[lid+offset];\n" <<
                "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
                "compare_result = " << compare(k.var<input_type>("other"),
                                               k.var<input_type>("mine")) << ";\n" <<
                "equal = !compare_result && !" <<
                    compare(k.var<input_type>("mine"),
                            k.var<input_type>("other")) << ";\n" <<
                "#else\n" <<
                "compare_result = " << compare(k.var<input_type>("mine"),
                                               k.var<input_type>("other")) << ";\n" <<
                "equal = !compare_result && !" <<
                    compare(k.var<input_type>("other"),
                            k.var<input_type>("mine")) << ";\n" <<
                "#endif\n" <<
                "block[lid] = compare_result ? mine : other;\n" <<
                k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
                k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
                "block_idx[lid] = compare_result ? " <<
                    "mine_idx : " <<
                    "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
            "}\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "}\n\n" <<

        // write the block's result to the global output
        "if(lid == 0){\n" <<
            result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
            result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
        "}";

    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    if(use_input_idx){
        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
    }

    kernel kernel = k.compile(context, options);
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
    kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));

    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  work_groups_no * work_group_size,
                                  work_group_size);
}
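// Illustration (not part of the library): the unrolled loop above is a
// standard halving tree reduction in local memory; the (lid + offset) <
// group_offset guard masks the ragged last group. Host-side sketch of the
// maximum case, assuming a power-of-two work_group_size:
#include <cstddef>

template<class T>
void tree_reduce_max_sketch(T *block, std::size_t valid_n,
                            std::size_t work_group_size)
{
    for(std::size_t offset = work_group_size / 2; offset > 0; offset /= 2){
        for(std::size_t lid = 0; lid < offset; lid++){
            // only fold in partners that hold valid data
            if(lid + offset < valid_n && block[lid] < block[lid + offset]){
                block[lid] = block[lid + offset];
            }
        }
        // the kernel places barrier(CLK_LOCAL_MEM_FENCE) here
    }
    // block[0] now holds the group's maximum
}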
/// Creates a new threefry_engine for \p queue.
explicit threefry_engine(command_queue &queue)
    : m_context(queue.get_context())
{
    // set up the program
    load_program();
}
void generate(OutputIterator first, OutputIterator last, Function op,
              command_queue &queue)
{
    vector<T> tmp(std::distance(first, last), queue.get_context());
    generate(tmp.begin(), tmp.end(), queue);
    transform(tmp.begin(), tmp.end(), first, op, queue);
}
inline void merge_blocks_on_gpu(KeyIterator keys_first,
                                ValueIterator values_first,
                                KeyIterator out_keys_first,
                                ValueIterator out_values_first,
                                Compare compare,
                                const size_t count,
                                const size_t block_size,
                                const bool sort_by_key,
                                command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("merge_blocks");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t block_size_arg = k.add_arg<const uint_>("block_size");

    k <<
        // get the global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        "if(gid >= count) {\n" <<
            "return;\n" <<
        "}\n" <<

        k.decl<const key_type>("my_key") << " = " <<
            keys_first[k.var<const uint_>("gid")] << ";\n";

    if(sort_by_key) {
        k <<
            k.decl<const value_type>("my_value") << " = " <<
                values_first[k.var<const uint_>("gid")] << ";\n";
    }

    k <<
        // get the index of my block and of the block to merge with
        k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" <<
        k.decl<const bool>("my_block_idx_is_odd") << " = " <<
            "my_block_idx & 0x1;\n" <<
        k.decl<const uint_>("other_block_idx") << " = " <<
            // if(my_block_idx is odd) {} else {}
            "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" <<

        // get the ranges of my block and the other block:
        // [my_block_start; my_block_end)
        // [other_block_start; other_block_end)
        k.decl<const uint_>("my_block_start") << " = " <<
            "min(my_block_idx * block_size, count);\n" << // inclusive
        k.decl<const uint_>("my_block_end") << " = " <<
            "min((my_block_idx + 1) * block_size, count);\n" << // exclusive
        k.decl<const uint_>("other_block_start") << " = " <<
            "min(other_block_idx * block_size, count);\n" << // inclusive
        k.decl<const uint_>("other_block_end") << " = " <<
            "min((other_block_idx + 1) * block_size, count);\n" << // exclusive

        // the other block is empty, nothing to merge here
        "if(other_block_start == count){\n" <<
            out_keys_first[k.var<uint_>("gid")] << " = my_key;\n";
    if(sort_by_key) {
        k << out_values_first[k.var<uint_>("gid")] << " = my_value;\n";
    }
    k <<
            "return;\n" <<
        "}\n" <<

        // binary search for the lower bound of my_key in the other block
        k.decl<uint_>("left_idx") << " = other_block_start;\n" <<
        k.decl<uint_>("right_idx") << " = other_block_end;\n" <<
        "while(left_idx < right_idx) {\n" <<
            k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
            k.decl<key_type>("mid_key") << " = " <<
                keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
            k.decl<bool>("smaller") << " = " <<
                compare(k.var<key_type>("mid_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            "left_idx = smaller ? mid_idx + 1 : left_idx;\n" <<
            "right_idx = smaller ? right_idx : mid_idx;\n" <<
        "}\n" <<

        // left_idx is now the found position in the other block;
        // if my block is odd we need the upper bound instead
        "right_idx = other_block_end;\n" <<
        "if(my_block_idx_is_odd && left_idx != right_idx) {\n" <<
            k.decl<key_type>("upper_key") << " = " <<
                keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "while(" <<
                "!(" << compare(k.var<key_type>("upper_key"),
                                k.var<key_type>("my_key")) <<
                ") && " <<
                "!(" << compare(k.var<key_type>("my_key"),
                                k.var<key_type>("upper_key")) <<
                ") && " <<
                "left_idx < right_idx" <<
            "){\n" <<
                k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
                k.decl<key_type>("mid_key") << " = " <<
                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
                k.decl<bool>("equal") << " = " <<
                    "!(" << compare(k.var<key_type>("mid_key"),
                                    k.var<key_type>("my_key")) <<
                    ") && " <<
                    "!(" << compare(k.var<key_type>("my_key"),
                                    k.var<key_type>("mid_key")) <<
                    ");\n" <<
                "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" <<
                "right_idx = equal ? right_idx : mid_idx;\n" <<
                "upper_key = " <<
                    keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "}\n" <<
        "}\n" <<

        // compute the final position of my key in the merged output
        k.decl<uint_>("offset") << " = 0;\n" <<
        "offset += gid - my_block_start;\n" <<
        "offset += left_idx - other_block_start;\n" <<
        "offset += min(my_block_start, other_block_start);\n" <<
        out_keys_first[k.var<uint_>("offset")] << " = my_key;\n";
    if(sort_by_key) {
        k << out_values_first[k.var<uint_>("offset")] << " = my_value;\n";
    }

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size = (std::min)(
        size_t(256),
        kernel.get_work_group_info<size_t>(
            queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE
        )
    );
    const size_t global_size =
        work_group_size * static_cast<size_t>(
            std::ceil(float(count) / work_group_size)
        );

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
}
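// Illustration (not part of the library): the scatter offset above is the
// classic merge rank -- my position inside my own block, plus the number of
// partner-block elements that must precede me, plus the start of the earlier
// block. Even blocks take the lower bound and odd blocks the upper bound in
// their partner, so equal keys keep block order and the merge stays stable.
// Host-side sketch using the standard library:
#include <algorithm>
#include <cstddef>

template<class It, class T>
std::size_t merge_rank_sketch(std::size_t my_offset,         // gid - my_block_start
                              It other_first, It other_last, // partner block
                              const T &key,
                              bool my_block_is_odd,
                              std::size_t base)               // min of block starts
{
    It pos = my_block_is_odd
        ? std::upper_bound(other_first, other_last, key)
        : std::lower_bound(other_first, other_last, key);
    return base + my_offset + static_cast<std::size_t>(pos - other_first);
}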