inline size_t count_if(InputIterator first, InputIterator last, Predicate predicate, command_queue &queue = system::default_queue()) { const device &device = queue.get_device(); size_t input_size = detail::iterator_range_size(first, last); if(input_size == 0){ return 0; } if(device.type() & device::cpu){ if(input_size < 1024){ return detail::serial_count_if(first, last, predicate, queue); } else { return detail::count_if_with_threads(first, last, predicate, queue); } } else { if(input_size < 32){ return detail::serial_count_if(first, last, predicate, queue); } else { return detail::count_if_with_reduce(first, last, predicate, queue); } } }
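// Usage sketch (illustrative, not part of the original source): the dispatcher above is
// what the public boost::compute::count_if() entry point eventually calls; the 1024/32
// element thresholds are internal tuning values. Assumes standard Boost.Compute headers.
#include <vector>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/count_if.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>

inline size_t count_positive_example(const std::vector<int> &host_data)
{
    namespace compute = boost::compute;
    compute::command_queue queue = compute::system::default_queue();

    // copy the input to the device
    compute::vector<int> device_data(host_data.size(), queue.get_context());
    compute::copy(host_data.begin(), host_data.end(), device_data.begin(), queue);

    // count elements greater than zero; the serial / threaded / reduce-based
    // dispatch happens inside count_if() as shown above
    using compute::lambda::_1;
    return compute::count_if(device_data.begin(), device_data.end(), _1 > 0, queue);
}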
inline InputIterator binary_find(InputIterator first, InputIterator last, UnaryPredicate predicate, command_queue &queue = system::default_queue()) { const device &device = queue.get_device(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); const std::string cache_key = "__boost_binary_find"; size_t find_if_limit = 128; size_t threads = parameters->get(cache_key, "tpb", 128); size_t count = iterator_range_size(first, last); InputIterator search_first = first; InputIterator search_last = last; scalar<uint_> index(queue.get_context()); // construct and compile binary_find kernel binary_find_kernel<InputIterator, UnaryPredicate> binary_find_kernel(search_first, search_last, predicate); ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context()); // set buffer for index kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer()); while(count > find_if_limit) { index.write(static_cast<uint_>(count), queue); // set block and run binary_find kernel uint_ block = static_cast<uint_>((count - 1)/(threads - 1)); kernel.set_arg(binary_find_kernel.m_block_arg, block); queue.enqueue_1d_range_kernel(kernel, 0, threads, 0); size_t i = index.read(queue); if(i == count) { search_first = search_last - ((count - 1)%(threads - 1)); break; } else { search_last = search_first + i; search_first = search_last - ((count - 1)/(threads - 1)); } // Make sure that first and last stay within the input range search_last = (std::min)(search_last, last); search_last = (std::max)(search_last, first); search_first = (std::max)(search_first, first); search_first = (std::min)(search_first, last); count = iterator_range_size(search_first, search_last); } return find_if(search_first, search_last, predicate, queue); }
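// Usage sketch (illustrative): binary_find() above is an internal routine; public
// algorithms such as boost::compute::lower_bound() and partition_point() are built
// on top of it, so typical user code looks like this.
#include <boost/compute/algorithm/lower_bound.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>

inline boost::compute::vector<int>::iterator
lower_bound_example(boost::compute::vector<int> &sorted_data,
                    int value,
                    boost::compute::command_queue &queue)
{
    // returns an iterator to the first element that is not less than value
    return boost::compute::lower_bound(sorted_data.begin(), sorted_data.end(),
                                       value, queue);
}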
inline void merge_sort_by_key_on_cpu(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } // for small input size only insertion sort is performed else if(count <= 512){ block_insertion_sort(keys_first, values_first, compare, count, count, true, queue); return; } const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters std::string cache_key = std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>() + "_with_" + type_name<key_type>(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); const size_t block_size = parameters->get(cache_key, "insertion_sort_by_key_block_size", 64); block_insertion_sort(keys_first, values_first, compare, count, block_size, true, queue); // temporary buffer for merge results vector<value_type> values_temp(count, context); vector<key_type> keys_temp(count, context); bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { merge_blocks(keys_first, values_first, keys_temp.begin(), values_temp.begin(), compare, count, i, true, queue); } else { merge_blocks(keys_temp.begin(), values_temp.begin(), keys_first, values_first, compare, count, i, true, queue); } } if(result_in_temporary_buffer) { copy(keys_temp.begin(), keys_temp.end(), keys_first, queue); copy(values_temp.begin(), values_temp.end(), values_first, queue); } }
inline void dispatch_sort(Iterator first, Iterator last, Compare compare, command_queue &queue, typename boost::enable_if< is_device_iterator<Iterator> >::type* = 0) { if(queue.get_device().type() & device::gpu) { dispatch_gpu_sort(first, last, compare, queue); return; } ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue); }
inline void dispatch_sort_by_key(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, command_queue &queue) { if(queue.get_device().type() & device::gpu) { dispatch_gpu_sort_by_key(keys_first, keys_last, values_first, compare, queue); return; } ::boost::compute::detail::merge_sort_by_key_on_cpu( keys_first, keys_last, values_first, compare, queue ); }
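// Usage sketch (illustrative): dispatch_sort() and dispatch_sort_by_key() above sit
// behind the public boost::compute::sort() and sort_by_key() calls, which route GPU
// devices to the GPU paths and CPU devices to the merge sorts shown earlier.
#include <boost/compute/algorithm/sort.hpp>
#include <boost/compute/algorithm/sort_by_key.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>

inline void sort_example(boost::compute::vector<float> &keys,
                         boost::compute::vector<int> &values,
                         boost::compute::command_queue &queue)
{
    // sort keys only
    boost::compute::sort(keys.begin(), keys.end(), queue);

    // sort values by key (both ranges must have the same length)
    boost::compute::sort_by_key(keys.begin(), keys.end(), values.begin(), queue);
}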
inline OutputIterator copy_on_device(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue) { const device &device = queue.get_device(); copy_kernel<InputIterator, OutputIterator> kernel(device); kernel.set_range(first, last, result); kernel.exec(queue); return result + std::distance(first, last); }
inline future<OutputIterator> copy_on_device_async(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue) { const device &device = queue.get_device(); copy_kernel<InputIterator, OutputIterator> kernel(device); kernel.set_range(first, last, result); event event_ = kernel.exec(queue); return make_future(result + std::distance(first, last), event_); }
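// Usage sketch (illustrative): copy_on_device_async() backs the public
// boost::compute::copy_async(), which returns a future wrapping the end of the
// destination range so host work can overlap with the device copy.
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/async/future.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>

inline void copy_async_example(const boost::compute::vector<int> &src,
                               boost::compute::vector<int> &dst,
                               boost::compute::command_queue &queue)
{
    boost::compute::future<boost::compute::vector<int>::iterator> f =
        boost::compute::copy_async(src.begin(), src.end(), dst.begin(), queue);

    // ... other host-side work can run here ...

    f.wait(); // block until the device-to-device copy has finished
}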
inline OutputIterator scan(InputIterator first, InputIterator last, OutputIterator result, bool exclusive, command_queue &queue) { const device &device = queue.get_device(); if(device.type() & device::cpu){ return scan_on_cpu(first, last, result, exclusive, queue); } else { return scan_on_gpu(first, last, result, exclusive, queue); } }
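// Usage sketch (illustrative): the scan() dispatcher above is the internal entry point
// shared by the public inclusive_scan() and exclusive_scan() algorithms; `exclusive`
// selects which variant the generated kernel computes.
#include <boost/compute/algorithm/exclusive_scan.hpp>
#include <boost/compute/algorithm/inclusive_scan.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>

inline void scan_example(const boost::compute::vector<int> &input,
                         boost::compute::vector<int> &output,
                         boost::compute::command_queue &queue)
{
    // inclusive prefix sum: output[i] = input[0] + ... + input[i]
    boost::compute::inclusive_scan(input.begin(), input.end(), output.begin(), queue);

    // exclusive prefix sum: output[0] = 0, output[i] = input[0] + ... + input[i-1]
    boost::compute::exclusive_scan(input.begin(), input.end(), output.begin(), queue);
}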
inline OutputIterator merge(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp, command_queue &queue = system::default_queue()) { BOOST_STATIC_ASSERT(is_device_iterator<InputIterator1>::value); BOOST_STATIC_ASSERT(is_device_iterator<InputIterator2>::value); BOOST_STATIC_ASSERT(is_device_iterator<OutputIterator>::value); typedef typename std::iterator_traits<InputIterator1>::value_type input1_type; typedef typename std::iterator_traits<InputIterator2>::value_type input2_type; typedef typename std::iterator_traits<OutputIterator>::value_type output_type; const device &device = queue.get_device(); std::string cache_key = std::string("__boost_merge_") + type_name<input1_type>() + "_" + type_name<input2_type>() + "_" + type_name<output_type>(); boost::shared_ptr<detail::parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // default serial merge threshold depends on device type size_t default_serial_merge_threshold = 32768; if(device.type() & device::gpu) { default_serial_merge_threshold = 2048; } // loading serial merge threshold parameter const size_t serial_merge_threshold = parameters->get(cache_key, "serial_merge_threshold", static_cast<uint_>(default_serial_merge_threshold)); // choosing merge algorithm const size_t total_count = detail::iterator_range_size(first1, last1) + detail::iterator_range_size(first2, last2); // for small inputs serial merge turns out to outperform // merge with merge path algorithm if(total_count <= serial_merge_threshold){ return detail::serial_merge(first1, last1, first2, last2, result, comp, queue); } return detail::merge_with_merge_path(first1, last1, first2, last2, result, comp, queue); }
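// Usage sketch (illustrative): merge() above is the public entry point; whether the
// serial merge or the merge-path kernel runs is decided by the tuned threshold.
// Both input ranges must already be sorted and the output must be large enough.
#include <boost/compute/algorithm/merge.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>

inline void merge_example(const boost::compute::vector<int> &a,   // sorted
                          const boost::compute::vector<int> &b,   // sorted
                          boost::compute::vector<int> &out,       // size >= a.size() + b.size()
                          boost::compute::command_queue &queue)
{
    boost::compute::merge(a.begin(), a.end(), b.begin(), b.end(), out.begin(), queue);
}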
inline void generic_reduce(InputIterator first, InputIterator last, OutputIterator result, BinaryFunction function, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename boost::compute::result_of<BinaryFunction(input_type, input_type)>::type result_type; const device &device = queue.get_device(); const context &context = queue.get_context(); size_t count = detail::iterator_range_size(first, last); if(device.type() & device::cpu){ boost::compute::vector<result_type> value(1, context); detail::serial_reduce(first, last, value.begin(), function, queue); boost::compute::copy_n(value.begin(), 1, result, queue); } else { size_t block_size = 256; // first pass vector<result_type> results = detail::block_reduce(first, count, block_size, function, queue); if(results.size() > 1){ detail::inplace_reduce(results.begin(), results.end(), function, queue); } boost::compute::copy_n(results.begin(), 1, result, queue); } }
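// Usage sketch (illustrative): generic_reduce() handles arbitrary binary functions.
// For example, reducing with multiplies<int>() computes a product; plus<T> instead
// takes the specialized dispatch_reduce() path shown below.
#include <boost/compute/algorithm/reduce.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/functional.hpp>

inline int product_example(const boost::compute::vector<int> &data,
                           boost::compute::command_queue &queue)
{
    int product = 1;
    if(!data.empty()){
        boost::compute::reduce(data.begin(), data.end(), &product,
                               boost::compute::multiplies<int>(), queue);
    }
    return product;
}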
bool find_extrema_with_reduce_requirements_met(InputIterator first, InputIterator last, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; const device &device = queue.get_device(); // device must have dedicated local memory storage // otherwise reduction would be highly inefficient if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL) { return false; } const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); // local memory size in bytes (per compute unit) const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>(); std::string cache_key = std::string("__boost_find_extrema_reduce_") + type_name<input_type>(); // load parameters boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // Get preferred work group size size_t work_group_size = parameters->get(cache_key, "wgsize", 256); work_group_size = (std::min)(max_work_group_size, work_group_size); // local memory size needed to perform parallel reduction size_t required_local_mem_size = 0; // indices size required_local_mem_size += sizeof(uint_) * work_group_size; // values size required_local_mem_size += sizeof(input_type) * work_group_size; // at least 4 work groups per compute unit otherwise reduction // would be highly inefficient return ((required_local_mem_size * 4) <= local_mem_size); }
inline void dispatch_reduce(InputIterator first, InputIterator last, OutputIterator result, const plus<T> &function, command_queue &queue) { const context &context = queue.get_context(); const device &device = queue.get_device(); // reduce to temporary buffer on device array<T, 1> value(context); if(device.type() & device::cpu){ detail::reduce_on_cpu(first, last, value.begin(), function, queue); } else { reduce_on_gpu(first, last, value.begin(), function, queue); } // copy to result iterator copy_n(value.begin(), 1, result, queue); }
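// Usage sketch (illustrative): the plus<T> specialization above is what a plain sum
// with boost::compute::reduce() ends up calling.
#include <boost/compute/algorithm/reduce.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/functional.hpp>

inline int sum_example(const boost::compute::vector<int> &data,
                       boost::compute::command_queue &queue)
{
    int sum = 0;
    if(!data.empty()){
        boost::compute::reduce(data.begin(), data.end(), &sum,
                               boost::compute::plus<int>(), queue);
    }
    return sum;
}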
inline InputIterator binary_find(InputIterator first, InputIterator last, UnaryPredicate predicate, command_queue &queue = system::default_queue()) { const device &device = queue.get_device(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); const std::string cache_key = "__boost_binary_find"; size_t find_if_limit = 128; size_t threads = parameters->get(cache_key, "tpb", 128); size_t count = iterator_range_size(first, last); while(count > find_if_limit) { scalar<uint_> index(queue.get_context()); index.write(static_cast<uint_>(count), queue); binary_find_kernel kernel(threads); kernel.set_range(first, last, predicate); kernel.exec(queue, index); size_t i = index.read(queue); if(i == count) { first = last - count%threads; break; } else { last = first + i; first = last - count/threads; } count = iterator_range_size(first, last); } return find_if(first, last, predicate, queue); }
/// Checks if the compute device is CPU. inline bool is_cpu(const command_queue &q) { return q.get_device().get_info<cl_device_type>(CL_DEVICE_TYPE) & CL_DEVICE_TYPE_CPU; }
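// Usage sketch (illustrative): the helper above lets host code branch on the device
// class of an existing queue, e.g. to pick different tuning values.
inline size_t pick_work_group_size_example(const boost::compute::command_queue &q)
{
    // the 64/256 values are assumed tuning choices, not taken from the original source
    return is_cpu(q) ? 64 : 256;
}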
inline void merge_sort_on_cpu(Iterator first, Iterator last, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<Iterator>::value_type value_type; size_t count = iterator_range_size(first, last); if(count < 2){ return; } // for small input size only insertion sort is performed else if(count <= 512){ block_insertion_sort(first, compare, count, count, queue); return; } const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters std::string cache_key = std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // When there is merge_with_path_blocks_no_threshold or less blocks left to // merge AND input size is merge_with_merge_path_input_size_threshold or more // merge_with_merge_path() algorithm is used to merge sorted blocks; // otherwise merge_blocks() is used. const size_t merge_with_path_blocks_no_threshold = parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8); const size_t merge_with_path_input_size_threshold = parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152); const size_t block_size = parameters->get(cache_key, "insertion_sort_block_size", 64); block_insertion_sort(first, compare, count, block_size, queue); // temporary buffer for merge result vector<value_type> temp(count, context); bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { dispatch_merge_blocks(first, temp.begin(), compare, count, i, merge_with_path_input_size_threshold, merge_with_path_blocks_no_threshold, queue); } else { dispatch_merge_blocks(temp.begin(), first, compare, count, i, merge_with_path_input_size_threshold, merge_with_path_blocks_no_threshold, queue); } } if(result_in_temporary_buffer) { copy(temp.begin(), temp.end(), first, queue); } }
inline void merge_blocks_on_gpu(KeyIterator keys_first, ValueIterator values_first, KeyIterator out_keys_first, ValueIterator out_values_first, Compare compare, const size_t count, const size_t block_size, const bool sort_by_key, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; meta_kernel k("merge_blocks"); size_t count_arg = k.add_arg<const uint_>("count"); size_t block_size_arg = k.add_arg<const uint_>("block_size"); k << // get global id k.decl<const uint_>("gid") << " = get_global_id(0);\n" << "if(gid >= count) {\n" << "return;\n" << "}\n" << k.decl<const key_type>("my_key") << " = " << keys_first[k.var<const uint_>("gid")] << ";\n"; if(sort_by_key) { k << k.decl<const value_type>("my_value") << " = " << values_first[k.var<const uint_>("gid")] << ";\n"; } k << // get my block idx k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" << k.decl<const bool>("my_block_idx_is_odd") << " = " << "my_block_idx & 0x1;\n" << k.decl<const uint_>("other_block_idx") << " = " << // if(my_block_idx is odd) {} else {} "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" << // get ranges of my block and the other block // [my_block_start; my_block_end) // [other_block_start; other_block_end) k.decl<const uint_>("my_block_start") << " = " << "min(my_block_idx * block_size, count);\n" << // including k.decl<const uint_>("my_block_end") << " = " << "min((my_block_idx + 1) * block_size, count);\n" << // excluding k.decl<const uint_>("other_block_start") << " = " << "min(other_block_idx * block_size, count);\n" << // including k.decl<const uint_>("other_block_end") << " = " << "min((other_block_idx + 1) * block_size, count);\n" << // excluding // other block is empty, nothing to merge here "if(other_block_start == count){\n" << out_keys_first[k.var<uint_>("gid")] << " = my_key;\n"; if(sort_by_key) { k << out_values_first[k.var<uint_>("gid")] << " = my_value;\n"; } k << "return;\n" << "}\n" << // lower bound // left_idx - lower bound k.decl<uint_>("left_idx") << " = other_block_start;\n" << k.decl<uint_>("right_idx") << " = other_block_end;\n" << "while(left_idx < right_idx) {\n" << k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" << k.decl<key_type>("mid_key") << " = " << keys_first[k.var<const uint_>("mid_idx")] << ";\n" << k.decl<bool>("smaller") << " = " << compare(k.var<key_type>("mid_key"), k.var<key_type>("my_key")) << ";\n" << "left_idx = smaller ? mid_idx + 1 : left_idx;\n" << "right_idx = smaller ? right_idx : mid_idx;\n" << "}\n" << // left_idx is found position in other block // if my_block is odd we need to get the upper bound "right_idx = other_block_end;\n" << "if(my_block_idx_is_odd && left_idx != right_idx) {\n" << k.decl<key_type>("upper_key") << " = " << keys_first[k.var<const uint_>("left_idx")] << ";\n" << "while(" << "!(" << compare(k.var<key_type>("upper_key"), k.var<key_type>("my_key")) << ") && " << "!(" << compare(k.var<key_type>("my_key"), k.var<key_type>("upper_key")) << ") && " << "left_idx < right_idx" << ")" << "{\n" << k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" << k.decl<key_type>("mid_key") << " = " << keys_first[k.var<const uint_>("mid_idx")] << ";\n" << k.decl<bool>("equal") << " = " << "!(" << compare(k.var<key_type>("mid_key"), k.var<key_type>("my_key")) << ") && " << "!(" << compare(k.var<key_type>("my_key"), k.var<key_type>("mid_key")) << ");\n" << "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" << "right_idx = equal ? right_idx : mid_idx;\n" << "upper_key = " << keys_first[k.var<const uint_>("left_idx")] << ";\n" << "}\n" << "}\n" << k.decl<uint_>("offset") << " = 0;\n" << "offset += gid - my_block_start;\n" << "offset += left_idx - other_block_start;\n" << "offset += min(my_block_start, other_block_start);\n" << out_keys_first[k.var<uint_>("offset")] << " = my_key;\n"; if(sort_by_key) { k << out_values_first[k.var<uint_>("offset")] << " = my_value;\n"; } const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); const size_t work_group_size = (std::min)( size_t(256), kernel.get_work_group_info<size_t>( queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE ) ); const size_t global_size = work_group_size * static_cast<size_t>( std::ceil(float(count) / work_group_size) ); kernel.set_arg(count_arg, static_cast<uint_>(count)); kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size); }
inline size_t bitonic_block_sort(KeyIterator keys_first, ValueIterator values_first, Compare compare, const size_t count, const bool sort_by_key, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; meta_kernel k("bitonic_block_sort"); size_t count_arg = k.add_arg<const uint_>("count"); size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys"); size_t local_vals_arg = 0; if(sort_by_key) { local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx"); } k << // Work item global and local ids k.decl<const uint_>("gid") << " = get_global_id(0);\n" << k.decl<const uint_>("lid") << " = get_local_id(0);\n"; // declare my_key and my_value k << k.decl<key_type>("my_key") << ";\n"; // Instead of copying values (my_value) in local memory with keys // we save local index (uchar) and copy my_value at the end at // final index. This saves local memory. if(sort_by_key) { k << k.decl<uchar_>("my_index") << " = (uchar)(lid);\n"; } // load key k << "if(gid < count) {\n" << k.var<key_type>("my_key") << " = " << keys_first[k.var<const uint_>("gid")] << ";\n" << "}\n"; // load key and index to local memory k << "lkeys[lid] = my_key;\n"; if(sort_by_key) { k << "lidx[lid] = my_index;\n"; } k << k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" << k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n"; // When work group size is a power of 2 bitonic sorter can be used; // otherwise, slower odd-even sort is used. k << // check if n is power of 2 "if(((n != 0) && ((n & (~n + 1)) == n))) {\n"; // bitonic sort, not stable k << // wait for keys and vals to be stored in local memory "barrier(CLK_LOCAL_MEM_FENCE);\n" << "#pragma unroll\n" << "for(" << k.decl<uint_>("length") << " = 1; " << "length < n; " << "length <<= 1" << ") {\n" << // direction of sort: false -> asc, true -> desc k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" << "for(" << k.decl<uint_>("k") << " = length; " << "k > 0; " << "k >>= 1" << ") {\n" << // sibling to compare with my key k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" << k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" << k.decl<bool>("compare") << " = " << compare(k.var<key_type>("sibling_key"), k.var<key_type>("my_key")) << ";\n" << k.decl<bool>("equal") << " = !(compare || " << compare(k.var<key_type>("my_key"), k.var<key_type>("sibling_key")) << ");\n" << k.decl<bool>("swap") << " = compare ^ (sibling_idx < lid) ^ direction;\n" << "swap = equal ? false : swap;\n" << "my_key = swap ? sibling_key : my_key;\n"; if(sort_by_key) { k << "my_index = swap ? lidx[sibling_idx] : my_index;\n"; } k << "barrier(CLK_LOCAL_MEM_FENCE);\n" << "lkeys[lid] = my_key;\n"; if(sort_by_key) { k << "lidx[lid] = my_index;\n"; } k << "barrier(CLK_LOCAL_MEM_FENCE);\n" << "}\n" << "}\n"; // end of bitonic sort // odd-even sort, not stable k << "}\n" << "else { \n"; k << k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" << k.decl<uint_>("oddsibling_idx") << " = " << "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" << k.decl<uint_>("evensibling_idx") << " = " << "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" << // wait for keys and vals to be stored in local memory "barrier(CLK_LOCAL_MEM_FENCE);\n" << "#pragma unroll\n" << "for(" << k.decl<uint_>("i") << " = 0; " << "i < n; " << "i++" << ") {\n" << k.decl<uint_>("sibling_idx") << " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" << k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" << k.decl<bool>("compare") << " = " << compare(k.var<key_type>("sibling_key"), k.var<key_type>("my_key")) << ";\n" << k.decl<bool>("equal") << " = !(compare || " << compare(k.var<key_type>("my_key"), k.var<key_type>("sibling_key")) << ");\n" << k.decl<bool>("swap") << " = compare ^ (sibling_idx < lid);\n" << "swap = equal ? false : swap;\n" << "my_key = swap ? sibling_key : my_key;\n"; if(sort_by_key) { k << "my_index = swap ? lidx[sibling_idx] : my_index;\n"; } k << "barrier(CLK_LOCAL_MEM_FENCE);\n" << "lkeys[lid] = my_key;\n"; if(sort_by_key) { k << "lidx[lid] = my_index;\n"; } k << "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" << // for "}\n"; // else // end of odd-even sort // save key and value k << "if(gid < count) {\n" << keys_first[k.var<const uint_>("gid")] << " = " << k.var<key_type>("my_key") << ";\n"; if(sort_by_key) { k << k.decl<value_type>("my_value") << " = " << values_first[k.var<const uint_>("offset + my_index")] << ";\n" << "barrier(CLK_GLOBAL_MEM_FENCE);\n" << values_first[k.var<const uint_>("gid")] << " = my_value;\n"; } k << // end if "}\n"; const context &context = queue.get_context(); const device &device = queue.get_device(); ::boost::compute::kernel kernel = k.compile(context); const size_t work_group_size = pick_bitonic_block_sort_block_size<key_type, uchar_>( kernel.get_work_group_info<size_t>( device, CL_KERNEL_WORK_GROUP_SIZE ), device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE), sort_by_key ); const size_t global_size = work_group_size * static_cast<size_t>( std::ceil(float(count) / work_group_size) ); kernel.set_arg(count_arg, static_cast<uint_>(count)); kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size)); if(sort_by_key) { kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size)); } queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size); // return size of the block return work_group_size; }
inline OutputIterator scan_on_cpu(InputIterator first, InputIterator last, OutputIterator result, bool exclusive, T init, BinaryOperator op, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename std::iterator_traits<OutputIterator>::value_type output_type; const context &context = queue.get_context(); const device &device = queue.get_device(); const size_t compute_units = queue.get_device().compute_units(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); std::string cache_key = "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T)); // for inputs smaller than serial_scan_threshold // serial_scan algorithm is used uint_ serial_scan_threshold = parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T)); serial_scan_threshold = (std::max)(serial_scan_threshold, uint_(compute_units)); size_t count = detail::iterator_range_size(first, last); if(count == 0){ return result; } else if(count < serial_scan_threshold) { return serial_scan(first, last, result, exclusive, init, op, queue); } buffer block_partial_sums(context, sizeof(output_type) * compute_units ); // create scan kernel meta_kernel k("scan_on_cpu_block_scan"); // Arguments size_t count_arg = k.add_arg<uint_>("count"); size_t init_arg = k.add_arg<output_type>("initial_value"); size_t block_partial_sums_arg = k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums"); k << "uint block = " << "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" << "uint index = get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n"; if(!exclusive){ k << k.decl<output_type>("sum") << " = " << first[k.var<uint_>("index")] << ";\n" << result[k.var<uint_>("index")] << " = sum;\n" << "index++;\n"; } else { k << k.decl<output_type>("sum") << ";\n" << "if(index == 0){\n" << "sum = initial_value;\n" << "}\n" << "else {\n" << "sum = " << first[k.var<uint_>("index")] << ";\n" << "index++;\n" << "}\n"; } k << "while(index < end){\n" << // load next value k.decl<const input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n"; if(exclusive){ k << "if(get_global_id(0) == 0){\n" << result[k.var<uint_>("index")] << " = sum;\n" << "}\n"; } k << "sum = " << op(k.var<output_type>("sum"), k.var<output_type>("value")) << ";\n"; if(!exclusive){ k << "if(get_global_id(0) == 0){\n" << result[k.var<uint_>("index")] << " = sum;\n" << "}\n"; } k << "index++;\n" << "}\n" << // end while "block_partial_sums[get_global_id(0)] = sum;\n"; // compile scan kernel kernel block_scan_kernel = k.compile(context); // setup kernel arguments block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count)); block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init)); block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums); // execute the kernel size_t global_work_size = compute_units; queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0); // scan is done if(compute_units < 2) { return result + count; } // final scan kernel meta_kernel l("scan_on_cpu_final_scan"); // Arguments count_arg = l.add_arg<uint_>("count"); block_partial_sums_arg = l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums"); l << "uint block = " << "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" << "uint index = block + get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n" << k.decl<output_type>("sum") << " = block_partial_sums[0];\n" << "for(uint i = 0; i < get_global_id(0); i++) {\n" << "sum = " << op(k.var<output_type>("sum"), k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" << "}\n" << "while(index < end){\n"; if(exclusive){ l << l.decl<output_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" << result[k.var<uint_>("index")] << " = sum;\n" << "sum = " << op(k.var<output_type>("sum"), k.var<output_type>("value")) << ";\n"; } else { l << "sum = " << op(k.var<output_type>("sum"), first[k.var<uint_>("index")]) << ";\n" << result[k.var<uint_>("index")] << " = sum;\n"; } l << "index++;\n" << "}\n"; // compile scan kernel kernel final_scan_kernel = l.compile(context); // setup kernel arguments final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count)); final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums); // execute the kernel global_work_size = compute_units; queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0); // return iterator pointing to the end of the result range return result + count; }
inline void radix_sort_impl(const buffer_iterator<T> first, const buffer_iterator<T> last, const buffer_iterator<T2> values_first, const bool ascending, command_queue &queue) { typedef T value_type; typedef typename radix_sort_value_type<sizeof(T)>::type sort_type; const device &device = queue.get_device(); const context &context = queue.get_context(); // if we have a valid values iterator then we are doing a // sort by key and have to set up the values buffer bool sort_by_key = (values_first.get_buffer().get() != 0); // load (or create) radix sort program std::string cache_key = std::string("__boost_radix_sort_") + type_name<value_type>(); if(sort_by_key){ cache_key += std::string("_with_") + type_name<T2>(); } boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // sort parameters const uint_ k = parameters->get(cache_key, "k", 4); const uint_ k2 = 1 << k; const uint_ block_size = parameters->get(cache_key, "tpb", 128); // sort program compiler options std::stringstream options; options << "-DK_BITS=" << k; options << " -DT=" << type_name<sort_type>(); options << " -DBLOCK_SIZE=" << block_size; if(boost::is_floating_point<value_type>::value){ options << " -DIS_FLOATING_POINT"; } if(boost::is_signed<value_type>::value){ options << " -DIS_SIGNED"; } if(sort_by_key){ options << " -DSORT_BY_KEY"; options << " -DT2=" << type_name<T2>(); options << enable_double<T2>(); } if(ascending){ options << " -DASC"; } // load radix sort program program radix_sort_program = cache->get_or_build( cache_key, options.str(), radix_sort_source, context ); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); size_t count = detail::iterator_range_size(first, last); uint_ block_count = static_cast<uint_>(count / block_size); if(block_count * block_size != count){ block_count++; } // setup temporary buffers vector<value_type> output(count, context); vector<T2> values_output(sort_by_key ? count : 0, context); vector<uint_> offsets(k2, context); vector<uint_> counts(block_count * k2, context); const buffer *input_buffer = &first.get_buffer(); uint_ input_offset = static_cast<uint_>(first.get_index()); const buffer *output_buffer = &output.get_buffer(); uint_ output_offset = 0; const buffer *values_input_buffer = &values_first.get_buffer(); uint_ values_input_offset = static_cast<uint_>(values_first.get_index()); const buffer *values_output_buffer = &values_output.get_buffer(); uint_ values_output_offset = 0; for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ // write counts count_kernel.set_arg(0, *input_buffer); count_kernel.set_arg(1, input_offset); count_kernel.set_arg(2, static_cast<uint_>(count)); count_kernel.set_arg(3, counts); count_kernel.set_arg(4, offsets); count_kernel.set_arg(5, block_size * sizeof(uint_), 0); count_kernel.set_arg(6, i * k); queue.enqueue_1d_range_kernel(count_kernel, 0, block_count * block_size, block_size); // scan counts if(k == 1){ typedef uint2_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 2){ typedef uint4_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 4){ typedef uint16_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else { BOOST_ASSERT(false && "unknown k"); break; } // scan global offsets scan_kernel.set_arg(0, counts); scan_kernel.set_arg(1, offsets); scan_kernel.set_arg(2, block_count); queue.enqueue_task(scan_kernel); // scatter values scatter_kernel.set_arg(0, *input_buffer); scatter_kernel.set_arg(1, input_offset); scatter_kernel.set_arg(2, static_cast<uint_>(count)); scatter_kernel.set_arg(3, i * k); scatter_kernel.set_arg(4, counts); scatter_kernel.set_arg(5, offsets); scatter_kernel.set_arg(6, *output_buffer); scatter_kernel.set_arg(7, output_offset); if(sort_by_key){ scatter_kernel.set_arg(8, *values_input_buffer); scatter_kernel.set_arg(9, values_input_offset); scatter_kernel.set_arg(10, *values_output_buffer); scatter_kernel.set_arg(11, values_output_offset); } queue.enqueue_1d_range_kernel(scatter_kernel, 0, block_count * block_size, block_size); // swap buffers std::swap(input_buffer, output_buffer); std::swap(values_input_buffer, values_output_buffer); std::swap(input_offset, output_offset); std::swap(values_input_offset, values_output_offset); } }
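// Usage sketch (illustrative): radix_sort_impl() above is an internal building block;
// for fundamental key types sorted with the default ascending order, the public
// sort()/sort_by_key() calls typically reach it on GPU devices (the exact dispatch
// is an internal detail and may differ between versions).
#include <boost/compute/algorithm/sort_by_key.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>

inline void radix_sort_by_key_example(boost::compute::vector<unsigned int> &keys,
                                      boost::compute::vector<float> &values,
                                      boost::compute::command_queue &queue)
{
    // ascending sort of keys, with values permuted alongside them
    boost::compute::sort_by_key(keys.begin(), keys.end(), values.begin(), queue);
}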
InputIterator find_extrema_with_reduce(InputIterator first, InputIterator last, ::boost::compute::less< typename std::iterator_traits< InputIterator >::value_type > compare, const bool find_minimum, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; typedef typename std::iterator_traits<InputIterator>::value_type input_type; const context &context = queue.get_context(); const device &device = queue.get_device(); // Getting information about used queue and device const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); const size_t count = detail::iterator_range_size(first, last); std::string cache_key = std::string("__boost_find_extrema_with_reduce_") + type_name<input_type>(); // load parameters boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // get preferred work group size and preferred number // of work groups per compute unit size_t work_group_size = parameters->get(cache_key, "wgsize", 256); size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64); // calculate work group size and number of work groups work_group_size = (std::min)(max_work_group_size, work_group_size); size_t work_groups_no = compute_units_no * work_groups_per_cu; work_groups_no = (std::min)( work_groups_no, static_cast<size_t>(std::ceil(float(count) / work_group_size)) ); // phase I: finding candidates for extremum // device buffers for extremum candidates and their indices // each work-group computes its candidate // zero-copy buffers are used to eliminate copying data back to host vector<input_type, ::boost::compute::pinned_allocator<input_type> > candidates(work_groups_no, context); vector<uint_, ::boost::compute::pinned_allocator <uint_> > candidates_idx(work_groups_no, context); // finding candidates for first extremum and their indices find_extrema_with_reduce( first, count, candidates.begin(), candidates_idx.begin(), work_groups_no, work_group_size, compare, find_minimum, queue ); // phase II: finding extremum from among the candidates // mapping candidates and their indices to host input_type* candidates_host_ptr = static_cast<input_type*>( queue.enqueue_map_buffer( candidates.get_buffer(), command_queue::map_read, 0, work_groups_no * sizeof(input_type) ) ); uint_* candidates_idx_host_ptr = static_cast<uint_*>( queue.enqueue_map_buffer( candidates_idx.get_buffer(), command_queue::map_read, 0, work_groups_no * sizeof(uint_) ) ); input_type* i = candidates_host_ptr; uint_* idx = candidates_idx_host_ptr; uint_* extremum_idx = idx; input_type extremum = *candidates_host_ptr; i++; idx++; // find extremum (serial) from among the candidates on host if(!find_minimum) { while(idx != (candidates_idx_host_ptr + work_groups_no)) { input_type next = *i; bool compare_result = next > extremum; bool equal = next == extremum; extremum = compare_result ? next : extremum; extremum_idx = compare_result ? idx : extremum_idx; extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; idx++, i++; } } else { while(idx != (candidates_idx_host_ptr + work_groups_no)) { input_type next = *i; bool compare_result = next < extremum; bool equal = next == extremum; extremum = compare_result ? next : extremum; extremum_idx = compare_result ? idx : extremum_idx; extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; idx++, i++; } } return first + static_cast<difference_type>(*extremum_idx); }
InputIterator find_extrema_with_reduce(InputIterator first, InputIterator last, Compare compare, const bool find_minimum, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; typedef typename std::iterator_traits<InputIterator>::value_type input_type; const context &context = queue.get_context(); const device &device = queue.get_device(); // Getting information about used queue and device const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); const size_t count = detail::iterator_range_size(first, last); std::string cache_key = std::string("__boost_find_extrema_with_reduce_") + type_name<input_type>(); // load parameters boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // get preferred work group size and preferred number // of work groups per compute unit size_t work_group_size = parameters->get(cache_key, "wgsize", 256); size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100); // calculate work group size and number of work groups work_group_size = (std::min)(max_work_group_size, work_group_size); size_t work_groups_no = compute_units_no * work_groups_per_cu; work_groups_no = (std::min)( work_groups_no, static_cast<size_t>(std::ceil(float(count) / work_group_size)) ); // phase I: finding candidates for extremum // device buffers for extremum candidates and their indices // each work-group computes its candidate vector<input_type> candidates(work_groups_no, context); vector<uint_> candidates_idx(work_groups_no, context); // finding candidates for first extremum and their indices find_extrema_with_reduce( first, count, candidates.begin(), candidates_idx.begin(), work_groups_no, work_group_size, compare, find_minimum, queue ); // phase II: finding extremum from among the candidates // zero-copy buffers for final result (value and index) vector<input_type, ::boost::compute::pinned_allocator<input_type> > result(1, context); vector<uint_, ::boost::compute::pinned_allocator<uint_> > result_idx(1, context); // get extremum from among the candidates find_extrema_with_reduce( candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(), result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue ); // mapping extremum index to host uint_* result_idx_host_ptr = static_cast<uint_*>( queue.enqueue_map_buffer( result_idx.get_buffer(), command_queue::map_read, 0, sizeof(uint_) ) ); return first + static_cast<difference_type>(*result_idx_host_ptr); }
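// Usage sketch (illustrative): the find_extrema_with_reduce() overloads above are
// internals behind boost::compute::min_element()/max_element(), which return an
// iterator to the extremum; one element can then be read back to the host.
#include <boost/compute/algorithm/copy_n.hpp>
#include <boost/compute/algorithm/min_element.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>

inline float min_value_example(const boost::compute::vector<float> &data,
                               boost::compute::command_queue &queue)
{
    // data must not be empty
    boost::compute::vector<float>::const_iterator it =
        boost::compute::min_element(data.begin(), data.end(), queue);

    float value = 0.0f;
    boost::compute::copy_n(it, 1, &value, queue); // read the extremum back to the host
    return value;
}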
inline void reduce_on_gpu(InputIterator first, InputIterator last, buffer_iterator<T> result, Function function, command_queue &queue) { const device &device = queue.get_device(); const context &context = queue.get_context(); detail::meta_kernel k("reduce"); k.add_arg<const T*>(memory_object::global_memory, "input"); k.add_arg<const uint_>("offset"); k.add_arg<const uint_>("count"); k.add_arg<T*>(memory_object::global_memory, "output"); k.add_arg<const uint_>("output_offset"); k << k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" << "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" << k.decl<const uint_>("lid") << " = get_local_id(0);\n" << "__local " << type_name<T>() << " scratch[TPB];\n" << // private reduction k.decl<T>("sum") << " = 0;\n" << "for(uint i = 0; i < VPT; i++){\n" << " if(block_offset + lid + i*TPB < count){\n" << " sum = sum + block[lid+i*TPB]; \n" << " }\n" << "}\n" << "scratch[lid] = sum;\n"; // discrimination on vendor name if(is_nvidia_device(device)) k << ReduceBody<T,true>::body(); else k << ReduceBody<T,false>::body(); k << // write sum to output "if(lid == 0){\n" << " output[output_offset + get_group_id(0)] = scratch[0];\n" << "}\n"; std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>(); // load parameters boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); uint_ vpt = parameters->get(cache_key, "vpt", 8); uint_ tpb = parameters->get(cache_key, "tpb", 128); // reduce program compiler flags std::stringstream options; options << "-DT=" << type_name<T>() << " -DVPT=" << vpt << " -DTPB=" << tpb; // load program boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); program reduce_program = cache->get_or_build( cache_key, options.str(), k.source(), context ); // create reduce kernel kernel reduce_kernel(reduce_program, "reduce"); size_t count = std::distance(first, last); // first pass, reduce from input to ping buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T)); initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue); // update count after initial reduce count = std::ceil(float(count) / vpt / tpb); // middle pass(es), reduce between ping and pong const buffer *input_buffer = &ping; buffer pong(context, count / vpt / tpb * sizeof(T)); const buffer *output_buffer = &pong; if(count > vpt * tpb){ while(count > vpt * tpb){ reduce_kernel.set_arg(0, *input_buffer); reduce_kernel.set_arg(1, uint_(0)); reduce_kernel.set_arg(2, uint_(count)); reduce_kernel.set_arg(3, *output_buffer); reduce_kernel.set_arg(4, uint_(0)); size_t work_size = std::ceil(float(count) / vpt); if(work_size % tpb != 0){ work_size += tpb - work_size % tpb; } queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb); std::swap(input_buffer, output_buffer); count = std::ceil(float(count) / vpt / tpb); } } // final pass, reduce from ping/pong to result reduce_kernel.set_arg(0, *input_buffer); reduce_kernel.set_arg(1, uint_(0)); reduce_kernel.set_arg(2, uint_(count)); reduce_kernel.set_arg(3, result.get_buffer()); reduce_kernel.set_arg(4, uint_(result.get_index())); queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb); }
/// Returns id of the device associated with the given queue. inline device_id get_device_id(const command_queue &q) { return q.get_device().get(); }
/// Returns device associated with the given queue. inline device get_device(const command_queue &q) { return q.get_device(); }
inline InputIterator find_extrema_on_cpu(InputIterator first, InputIterator last, Compare compare, const bool find_minimum, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; size_t count = iterator_range_size(first, last); const device &device = queue.get_device(); const uint_ compute_units = queue.get_device().compute_units(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); std::string cache_key = "__boost_find_extrema_cpu_" + boost::lexical_cast<std::string>(sizeof(input_type)); // for inputs smaller than serial_find_extrema_threshold // serial_find_extrema algorithm is used uint_ serial_find_extrema_threshold = parameters->get( cache_key, "serial_find_extrema_threshold", 16384 * sizeof(input_type) ); serial_find_extrema_threshold = (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units)); const context &context = queue.get_context(); if(count < serial_find_extrema_threshold) { return serial_find_extrema(first, last, compare, find_minimum, queue); } meta_kernel k("find_extrema_on_cpu"); buffer output(context, sizeof(input_type) * compute_units); buffer output_idx( context, sizeof(uint_) * compute_units, buffer::read_write | buffer::alloc_host_ptr ); size_t count_arg = k.add_arg<uint_>("count"); size_t output_arg = k.add_arg<input_type *>(memory_object::global_memory, "output"); size_t output_idx_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output_idx"); k << "uint block = " << "(uint)ceil(((float)count)/get_global_size(0));\n" << "uint index = get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n" << "uint value_index = index;\n" << k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" << "index++;\n" << "while(index < end){\n" << k.decl<input_type>("candidate") << " = " << first[k.var<uint_>("index")] << ";\n" << "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << "bool compare = " << compare(k.var<input_type>("candidate"), k.var<input_type>("value")) << ";\n" << "#else\n" << "bool compare = " << compare(k.var<input_type>("value"), k.var<input_type>("candidate")) << ";\n" << "#endif\n" << "value = compare ? candidate : value;\n" << "value_index = compare ? index : value_index;\n" << "index++;\n" << "}\n" << "output[get_global_id(0)] = value;\n" << "output_idx[get_global_id(0)] = value_index;\n"; size_t global_work_size = compute_units; std::string options; if(!find_minimum){ options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; } kernel kernel = k.compile(context, options); kernel.set_arg(count_arg, static_cast<uint_>(count)); kernel.set_arg(output_arg, output); kernel.set_arg(output_idx_arg, output_idx); queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0); buffer_iterator<input_type> result = serial_find_extrema( make_buffer_iterator<input_type>(output), make_buffer_iterator<input_type>(output, global_work_size), compare, find_minimum, queue ); uint_* output_idx_host_ptr = static_cast<uint_*>( queue.enqueue_map_buffer( output_idx, command_queue::map_read, 0, global_work_size * sizeof(uint_) ) ); difference_type extremum_idx = static_cast<difference_type>(*(output_idx_host_ptr + result.get_index())); return first + extremum_idx; }
/// Create command queue on the same context and device as the given one. inline command_queue duplicate_queue(const command_queue &q) { return command_queue(q.get_context(), q.get_device(), q.get_properties()); }
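// Usage sketch (illustrative): these helpers wrap an existing boost::compute queue so
// callers can recover its device or spawn an independent queue on the same context
// and device, e.g. to overlap data transfers with kernel execution.
inline boost::compute::command_queue
make_transfer_queue_example(const boost::compute::command_queue &main_queue)
{
    // commands enqueued on the duplicate do not serialize with main_queue
    return duplicate_queue(main_queue);
}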