// Generates random numbers for the counters in [first_ctr, last_ctr) using an
// all-zero key vector.
void generate(OutputIterator first_ctr, OutputIterator last_ctr, command_queue &queue)
{
    const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
    if(!size_ctr) {
        return;
    }

    boost::compute::vector<uint_> vector_key(size_ctr, m_context);
    vector_key.assign(size_ctr, 0, queue);

    kernel rng_kernel = m_program.create_kernel("generate_rng");
    rng_kernel.set_arg(0, first_ctr.get_buffer());
    rng_kernel.set_arg(1, vector_key.get_buffer());

    size_t offset = 0;
    for(;;){
        size_t count = 0;
        size_t size = size_ctr / 2;
        if(size > threads){
            count = threads;
        }
        else {
            count = size;
        }
        rng_kernel.set_arg(2, static_cast<const uint_>(offset));
        queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

        offset += count;
        if(offset >= size){
            break;
        }
    }
}
void fill(OutputIterator first, OutputIterator last, command_queue &queue)
{
    const buffer &buffer = first.get_buffer();
    const size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(0, m_state_buffer);
    fill_kernel.set_arg(1, buffer);

    size_t p = 0;
    for(;;){
        size_t count = 0;
        if(size - p >= n)
            count = n;
        else
            count = size - p;

        fill_kernel.set_arg(2, static_cast<uint_>(p));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        p += n;
        if(p >= size)
            break;

        generate_state(queue);
    }
}
void generate(OutputIterator first, OutputIterator last, command_queue &queue)
{
    const size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(0, m_state_buffer);
    fill_kernel.set_arg(2, first.get_buffer());

    size_t offset = 0;
    size_t &p = m_state_index;

    for(;;){
        size_t count = 0;
        if(size > n){
            count = n;
        }
        else {
            count = size;
        }
        fill_kernel.set_arg(1, static_cast<const uint_>(p));
        fill_kernel.set_arg(3, static_cast<const uint_>(offset));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        p += count;
        offset += count;
        if(offset >= size){
            break;
        }

        generate_state(queue);
        p = 0;
    }
}
// Generates random numbers for the counters in [first_ctr, last_ctr) using the
// caller-supplied keys in [first_key, last_key); both ranges must have the same size.
void generate(OutputIterator first_ctr, OutputIterator last_ctr,
              OutputIterator first_key, OutputIterator last_key,
              command_queue &queue)
{
    const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
    const size_t size_key = detail::iterator_range_size(first_key, last_key);
    if(!size_ctr || !size_key || (size_ctr != size_key)) {
        return;
    }

    kernel rng_kernel = m_program.create_kernel("generate_rng");
    rng_kernel.set_arg(0, first_ctr.get_buffer());
    rng_kernel.set_arg(1, first_key.get_buffer());

    size_t offset = 0;
    for(;;){
        size_t count = 0;
        size_t size = size_ctr / 2;
        if(size > threads){
            count = threads;
        }
        else {
            count = size;
        }
        rng_kernel.set_arg(2, static_cast<const uint_>(offset));
        queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

        offset += count;
        if(offset >= size){
            break;
        }
    }
}
void generate(OutputIterator first, OutputIterator last, command_queue &queue)
{
    size_t size = detail::iterator_range_size(first, last);

    kernel fill_kernel(m_program, "fill");
    fill_kernel.set_arg(1, m_multiplicands);
    fill_kernel.set_arg(2, first.get_buffer());

    size_t offset = 0;
    for(;;){
        size_t count = 0;
        if(size > threads){
            count = threads;
        }
        else {
            count = size;
        }
        fill_kernel.set_arg(0, static_cast<const uint_>(m_seed));
        fill_kernel.set_arg(3, static_cast<const uint_>(offset));
        queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

        offset += count;
        if(offset >= size){
            break;
        }

        update_seed(queue);
    }
}
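// Illustrative usage sketch (an assumption added here, not part of the engine
// code above): generate()/fill() members like these are normally driven
// through Boost.Compute's public random API, e.g. boost::compute::mt19937.
// The vector size below is arbitrary.
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/mersenne_twister_engine.hpp>

void example_fill_with_random(boost::compute::command_queue &queue)
{
    using namespace boost::compute;
    vector<uint_> numbers(1024, queue.get_context());
    mt19937 engine(queue);                                   // seeds and builds the state buffer
    engine.generate(numbers.begin(), numbers.end(), queue);  // calls a generate() like the ones above
}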
inline InputIterator binary_find(InputIterator first,
                                 InputIterator last,
                                 UnaryPredicate predicate,
                                 command_queue &queue = system::default_queue())
{
    const device &device = queue.get_device();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    const std::string cache_key = "__boost_binary_find";

    size_t find_if_limit = 128;
    size_t threads = parameters->get(cache_key, "tpb", 128);
    size_t count = iterator_range_size(first, last);

    InputIterator search_first = first;
    InputIterator search_last = last;

    scalar<uint_> index(queue.get_context());

    // construct and compile binary_find kernel
    binary_find_kernel<InputIterator, UnaryPredicate>
        binary_find_kernel(search_first, search_last, predicate);
    ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context());

    // set buffer for index
    kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer());

    while(count > find_if_limit) {
        index.write(static_cast<uint_>(count), queue);

        // set block and run binary_find kernel
        uint_ block = static_cast<uint_>((count - 1) / (threads - 1));
        kernel.set_arg(binary_find_kernel.m_block_arg, block);
        queue.enqueue_1d_range_kernel(kernel, 0, threads, 0);

        size_t i = index.read(queue);

        if(i == count) {
            search_first = search_last - ((count - 1) % (threads - 1));
            break;
        } else {
            search_last = search_first + i;
            search_first = search_last - ((count - 1) / (threads - 1));
        }

        // Make sure that first and last stay within the input range
        search_last = (std::min)(search_last, last);
        search_last = (std::max)(search_last, first);
        search_first = (std::max)(search_first, first);
        search_first = (std::min)(search_first, last);

        count = iterator_range_size(search_first, search_last);
    }

    return find_if(search_first, search_last, predicate, queue);
}
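// Illustrative usage sketch (assumption, not taken from the code above):
// binary_find() lives in boost::compute::detail and expects a range that is
// already partitioned with respect to the predicate; it returns an iterator
// to the first element for which the predicate is true.
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/function.hpp>
#include <boost/compute/algorithm/detail/binary_find.hpp>

BOOST_COMPUTE_FUNCTION(bool, greater_than_five, (int x),
{
    return x > 5;
});

int example_binary_find(boost::compute::command_queue &queue)
{
    using namespace boost::compute;
    int host[] = { 1, 2, 3, 4, 5, 6, 7, 8 };   // already partitioned by x > 5
    vector<int> data(host, host + 8, queue);
    vector<int>::iterator it =
        detail::binary_find(data.begin(), data.end(), greater_than_five, queue);
    return static_cast<int>(it - data.begin()); // expected index: 5
}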
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);

    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<
        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T, false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}
void exec_1d(command_queue &queue, size_t global_work_offset, size_t global_work_size)
{
    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = compile(context);
    queue.enqueue_1d_range_kernel(kernel, global_work_offset, global_work_size);
}
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  command_queue &queue)
{
    if(first == last){
        return result;
    }

    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();

    // create scan kernel
    meta_kernel k("scan_on_cpu");
    k.add_arg<ulong_>("n");

    k << k.decl<input_type>("sum") << " = 0;\n" <<
         "for(ulong i = 0; i < n; i++){\n" <<
         k.decl<const input_type>("x") << " = " << first[k.var<ulong_>("i")] << ";\n";
    if(exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }
    k << "    sum = sum + x;\n";
    if(!exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }
    k << "}\n";

    // compile scan kernel
    kernel scan_kernel = k.compile(context);

    // setup kernel arguments
    size_t n = detail::iterator_range_size(first, last);
    scan_kernel.set_arg<ulong_>(0, n);

    // execute the kernel
    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);

    // return iterator pointing to the end of the result range
    return result + n;
}
inline InputIterator pp_floor(InputIterator first,
                              InputIterator last,
                              ValueType value,
                              command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }

    const context &context = queue.get_context();

    detail::meta_kernel k("pp_floor");
    size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
    size_t value_arg = k.add_arg<value_type>(memory_object::private_memory, "value");
    atomic_max<int_> atomic_max_int;

    k << k.decl<const int_>("i") << " = get_global_id(0);\n" <<
         k.decl<const value_type>("cur_value") << "=" <<
             first[k.var<const int_>("i")] << ";\n" <<
         "if(cur_value >= " << first[k.expr<int_>("*index")] <<
             " && cur_value < value){\n" <<
         "    " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n" <<
         "}\n";

    kernel kernel = k.compile(context);

    scalar<int_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    index.write(static_cast<int_>(0), queue);

    kernel.set_arg(value_arg, value);

    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    int result = static_cast<int>(index.read(queue));
    return first + result;
}
inline InputIterator prev_permutation_helper(InputIterator first,
                                             InputIterator last,
                                             command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0 || count == 1){
        return last;
    }
    count = count - 1;

    const context &context = queue.get_context();

    detail::meta_kernel k("prev_permutation");
    size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
    atomic_max<int_> atomic_max_int;

    k << k.decl<const int_>("i") << " = get_global_id(0);\n" <<
         k.decl<const value_type>("cur_value") << "=" <<
             first[k.var<const int_>("i")] << ";\n" <<
         k.decl<const value_type>("next_value") << "=" <<
             first[k.expr<const int_>("i+1")] << ";\n" <<
         "if(cur_value > next_value){\n" <<
         "    " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n" <<
         "}\n";

    kernel kernel = k.compile(context);

    scalar<int_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    index.write(static_cast<int_>(-1), queue);

    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    int result = static_cast<int>(index.read(queue));
    if(result == -1) return last;
    else return first + result;
}
// Returns an iterator to the last element in [first, last) that satisfies the
// predicate (or last if none does), using atomic_max on a global index.
inline InputIterator find_end_helper(InputIterator first,
                                     InputIterator last,
                                     UnaryPredicate predicate,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }

    const context &context = queue.get_context();

    detail::meta_kernel k("find_end");
    size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
    atomic_max<int_> atomic_max_int;

    k << k.decl<const int_>("i") << " = get_global_id(0);\n" <<
         k.decl<const value_type>("value") << "=" <<
             first[k.var<const int_>("i")] << ";\n" <<
         "if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
         "    " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n" <<
         "}\n";

    kernel kernel = k.compile(context);

    scalar<int_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    index.write(static_cast<int_>(-1), queue);

    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    int result = static_cast<int>(index.read(queue));
    if(result == -1) return last;
    else return first + result;
}
inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    size_t count = std::distance(first, last);

    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
}
inline void block_insertion_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const size_t block_size,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    (void) values_first;

    typedef typename std::iterator_traits<KeyIterator>::value_type K;
    typedef typename std::iterator_traits<ValueIterator>::value_type T;

    meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t block_size_arg = k.add_arg<uint_>("block_size");

    k << k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
         k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<

         // block insertion sort (stable)
         "for(uint i = start+1; i < end; i++){\n" <<
         " " << k.decl<const K>("key") << " = " << keys_first[k.var<uint_>("i")] << ";\n";
    if(sort_by_key){
        k << " " << k.decl<const T>("value") << " = " << values_first[k.var<uint_>("i")] << ";\n";
    }
    k << " uint pos = i;\n" <<
         " while(pos > start && " <<
             compare(k.var<const K>("key"), keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
         "     " << keys_first[k.var<uint_>("pos")] << " = " <<
                     keys_first[k.var<uint_>("pos-1")] << ";\n";
    if(sort_by_key){
        k << "     " << values_first[k.var<uint_>("pos")] << " = " <<
                         values_first[k.var<uint_>("pos-1")] << ";\n";
    }
    k << "     pos--;\n" <<
         " }\n" <<
         " " << keys_first[k.var<uint_>("pos")] << " = key;\n";
    if(sort_by_key) {
        k << " " << values_first[k.var<uint_>("pos")] << " = value;\n";
    }
    k << "}\n"; // block insertion sort

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));

    const size_t global_size = static_cast<size_t>(
        std::ceil(float(count) / block_size)
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
}
// Serial scan: a single work item walks the whole range and writes the running
// result, applying the supplied binary operator and initial value.
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    if(first == last){
        return result;
    }

    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();

    // create scan kernel
    meta_kernel k("scan_on_cpu");

    // Arguments
    size_t n_arg = k.add_arg<ulong_>("n");
    size_t init_arg = k.add_arg<output_type>("initial_value");

    if(!exclusive){
        k << k.decl<const ulong_>("start_idx") << " = 1;\n" <<
             k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
             result[0] << " = sum;\n";
    }
    else {
        k << k.decl<const ulong_>("start_idx") << " = 0;\n" <<
             k.decl<output_type>("sum") << " = initial_value;\n";
    }

    k << "for(ulong i = start_idx; i < n; i++){\n" <<
         k.decl<const input_type>("x") << " = " << first[k.var<ulong_>("i")] << ";\n";
    if(exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }
    k << "    sum = " << op(k.var<output_type>("sum"), k.var<output_type>("x")) << ";\n";
    if(!exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }
    k << "}\n";

    // compile scan kernel
    kernel scan_kernel = k.compile(context);

    // setup kernel arguments
    size_t n = detail::iterator_range_size(first, last);
    scan_kernel.set_arg<ulong_>(n_arg, n);
    scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));

    // execute the kernel
    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);

    // return iterator pointing to the end of the result range
    return result + n;
}
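// Illustrative usage sketch (assumption, not part of the detail code above):
// these serial scan kernels sit behind the public scan algorithms, which is
// the interface one would normally call.
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/inclusive_scan.hpp>
#include <boost/compute/algorithm/exclusive_scan.hpp>

void example_scan(boost::compute::command_queue &queue)
{
    using namespace boost::compute;
    int host[] = { 1, 2, 3, 4 };
    vector<int> in(host, host + 4, queue);
    vector<int> out(4, queue.get_context());

    // inclusive: 1 3 6 10
    inclusive_scan(in.begin(), in.end(), out.begin(), queue);

    // exclusive (default initial value 0): 0 1 3 6
    exclusive_scan(in.begin(), in.end(), out.begin(), queue);
}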
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename boost::compute::result_of<BinaryFunction(input_type, input_type)>::type result_type;

    const context &context = queue.get_context();
    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k << "const uint gid = get_global_id(0);\n" <<
             "const uint lid = get_local_id(0);\n" <<

             // copy values to local memory
             "block[lid] = " <<
                 function(first[k.make_var<uint_>("gid*2+0")],
                          first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

             // perform reduction
             "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
             "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
             "    uint mask = (i << 1) - 1;\n" <<
             "    if((lid & mask) == 0){\n" <<
             "        block[lid] = " << function(k.expr<input_type>("block[lid]"),
                                                 k.expr<input_type>("block[lid+i]")) << ";\n" <<
             "    }\n" <<
             "}\n" <<

             // write block result to global output
             "if(lid == 0)\n" <<
             "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel, 0, block_count * block_size, block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k << k.decl<result_type>("result") << " = \n" <<
                 first[k.expr<uint_>("offset")] << ";\n" <<
             "for(uint i = offset + 1; i < count; i++)\n" <<
             "    result = " << function(k.var<result_type>("result"),
                                         first[k.var<uint_>("i")]) << ";\n" <<
             "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
inline void find_extrema_with_reduce(InputIterator input,
                                     vector<uint_>::iterator input_idx,
                                     size_t count,
                                     ResultIterator result,
                                     vector<uint_>::iterator result_idx,
                                     size_t work_groups_no,
                                     size_t work_group_size,
                                     Compare compare,
                                     const bool find_minimum,
                                     const bool use_input_idx,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();

    meta_kernel k("find_extrema_reduce");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
    size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");

    k <<
        // Work item global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<

        // Index of element that will be read from input buffer
        k.decl<uint_>("idx") << " = gid;\n" <<

        k.decl<input_type>("acc") << ";\n" <<
        k.decl<uint_>("acc_idx") << ";\n" <<
        "if(gid < count) {\n" <<
            // Real index of currently best element
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#else\n" <<
            k.var<uint_>("acc_idx") << " = idx;\n" <<
            "#endif\n" <<

            // Init accumulator with first[get_global_id(0)]
            "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n" <<

        k.decl<bool>("compare_result") << ";\n" <<
        k.decl<bool>("equal") << ";\n\n" <<
        "while( idx < count ){\n" <<
            // Next element
            k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#endif\n" <<

            // Comparison between currently best element (acc) and next element
            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "compare_result = " << compare(k.var<input_type>("next"),
                                           k.var<input_type>("acc")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" << compare(k.var<input_type>("acc"),
                                                      k.var<input_type>("next")) << ";\n" <<
            "# endif\n" <<
            "#else\n" <<
            "compare_result = " << compare(k.var<input_type>("acc"),
                                           k.var<input_type>("next")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" << compare(k.var<input_type>("next"),
                                                      k.var<input_type>("acc")) << ";\n" <<
            "# endif\n" <<
            "#endif\n" <<

            // save the winner
            "acc = compare_result ? acc : next;\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "acc_idx = compare_result ? " <<
                "acc_idx : " <<
                "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
            "#else\n" <<
            "acc_idx = compare_result ? acc_idx : idx;\n" <<
            "#endif\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n\n" <<

        // Work item local id
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "block[lid] = acc;\n" <<
        "block_idx[lid] = acc_idx;\n" <<
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        k.decl<uint_>("group_offset") <<
            " = count - (get_local_size(0) * get_group_id(0));\n\n";

    k <<
        "#pragma unroll\n"
        "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; "
            "offset > 0; offset = offset / 2) {\n" <<
        "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
            k.decl<input_type>("mine") << " = block[lid];\n" <<
            k.decl<input_type>("other") << " = block[lid+offset];\n" <<
            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "compare_result = " << compare(k.var<input_type>("other"),
                                           k.var<input_type>("mine")) << ";\n" <<
            "equal = !compare_result && !" << compare(k.var<input_type>("mine"),
                                                      k.var<input_type>("other")) << ";\n" <<
            "#else\n" <<
            "compare_result = " << compare(k.var<input_type>("mine"),
                                           k.var<input_type>("other")) << ";\n" <<
            "equal = !compare_result && !" << compare(k.var<input_type>("other"),
                                                      k.var<input_type>("mine")) << ";\n" <<
            "#endif\n" <<
            "block[lid] = compare_result ? mine : other;\n" <<
            k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
            k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
            "block_idx[lid] = compare_result ? " <<
                "mine_idx : " <<
                "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
        "}\n"
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "}\n\n" <<

        // write block result to global output
        "if(lid == 0){\n" <<
            result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
            result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
        "}";

    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    if(use_input_idx){
        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
    }
    kernel kernel = k.compile(context, options);

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
    kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));

    queue.enqueue_1d_range_kernel(kernel, 0, work_groups_no * work_group_size,
                                  work_group_size);
}
inline void inplace_reduce(Iterator first,
                           Iterator last,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename std::iterator_traits<Iterator>::value_type value_type;

    size_t input_size = iterator_range_size(first, last);
    if(input_size < 2){
        return;
    }

    const context &context = queue.get_context();

    size_t block_size = 64;
    size_t values_per_thread = 8;
    size_t block_count = input_size / (block_size * values_per_thread);
    if(block_count * block_size * values_per_thread != input_size)
        block_count++;

    vector<value_type> output(block_count, context);

    meta_kernel k("inplace_reduce");
    size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
    size_t input_size_arg = k.add_arg<const uint_>("input_size");
    size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
    size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");

    k << "const uint gid = get_global_id(0);\n" <<
         "const uint lid = get_local_id(0);\n" <<
         "const uint values_per_thread =\n" << uint_(values_per_thread) << ";\n" <<

         // thread reduce
         "const uint index = gid * values_per_thread;\n" <<
         "if(index < input_size){\n" <<
         k.decl<value_type>("sum") << " = input[index];\n" <<
         "for(uint i = 1;\n" <<
         "    i < values_per_thread && (index + i) < input_size;\n" <<
         "    i++){\n" <<
         "    sum = " << function(k.var<value_type>("sum"),
                                  k.var<value_type>("input[index+i]")) << ";\n" <<
         "}\n" <<
         "scratch[lid] = sum;\n" <<
         "}\n" <<

         // local reduce
         "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
         "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
         "    uint mask = (i << 1) - 1;\n" <<
         "    uint next_index = (gid + i) * values_per_thread;\n"
         "    if((lid & mask) == 0 && next_index < input_size){\n" <<
         "        scratch[lid] = " << function(k.var<value_type>("scratch[lid]"),
                                               k.var<value_type>("scratch[lid+i]")) << ";\n" <<
         "    }\n" <<
         "}\n" <<

         // write output for block
         "if(lid == 0){\n" <<
         "    output[get_group_id(0)] = scratch[0];\n" <<
         "}\n";

    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    kernel kernel = k.compile(context);

    while(input_size > 1){
        kernel.set_arg(input_arg, *input_buffer);
        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
        kernel.set_arg(output_arg, *output_buffer);
        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel, 0, block_count * block_size, block_size);

        input_size = static_cast<size_t>(
            std::ceil(float(input_size) / (block_size * values_per_thread))
        );

        block_count = input_size / (block_size * values_per_thread);
        if(block_count * block_size * values_per_thread != input_size)
            block_count++;

        std::swap(input_buffer, output_buffer);
    }

    if(input_buffer != &first.get_buffer()){
        ::boost::compute::copy(output.begin(), output.begin() + 1, first, queue);
    }
}
inline void radix_sort_impl(const buffer_iterator<T> first,
                            const buffer_iterator<T> last,
                            const buffer_iterator<T2> values_first,
                            const bool ascending,
                            command_queue &queue)
{
    typedef T value_type;
    typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;

    const device &device = queue.get_device();
    const context &context = queue.get_context();

    // if we have a valid values iterator then we are doing a
    // sort by key and have to set up the values buffer
    bool sort_by_key = (values_first.get_buffer().get() != 0);

    // load (or create) radix sort program
    std::string cache_key =
        std::string("__boost_radix_sort_") + type_name<value_type>();
    if(sort_by_key){
        cache_key += std::string("_with_") + type_name<T2>();
    }

    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // sort parameters
    const uint_ k = parameters->get(cache_key, "k", 4);
    const uint_ k2 = 1 << k;
    const uint_ block_size = parameters->get(cache_key, "tpb", 128);

    // sort program compiler options
    std::stringstream options;
    options << "-DK_BITS=" << k;
    options << " -DT=" << type_name<sort_type>();
    options << " -DBLOCK_SIZE=" << block_size;
    if(boost::is_floating_point<value_type>::value){
        options << " -DIS_FLOATING_POINT";
    }
    if(boost::is_signed<value_type>::value){
        options << " -DIS_SIGNED";
    }
    if(sort_by_key){
        options << " -DSORT_BY_KEY";
        options << " -DT2=" << type_name<T2>();
        options << enable_double<T2>();
    }
    if(ascending){
        options << " -DASC";
    }

    // load radix sort program
    program radix_sort_program = cache->get_or_build(
        cache_key, options.str(), radix_sort_source, context
    );

    kernel count_kernel(radix_sort_program, "count");
    kernel scan_kernel(radix_sort_program, "scan");
    kernel scatter_kernel(radix_sort_program, "scatter");

    size_t count = detail::iterator_range_size(first, last);

    uint_ block_count = static_cast<uint_>(count / block_size);
    if(block_count * block_size != count){
        block_count++;
    }

    // setup temporary buffers
    vector<value_type> output(count, context);
    vector<T2> values_output(sort_by_key ? count : 0, context);
    vector<uint_> offsets(k2, context);
    vector<uint_> counts(block_count * k2, context);

    const buffer *input_buffer = &first.get_buffer();
    uint_ input_offset = static_cast<uint_>(first.get_index());
    const buffer *output_buffer = &output.get_buffer();
    uint_ output_offset = 0;
    const buffer *values_input_buffer = &values_first.get_buffer();
    uint_ values_input_offset = static_cast<uint_>(values_first.get_index());
    const buffer *values_output_buffer = &values_output.get_buffer();
    uint_ values_output_offset = 0;

    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
        // write counts
        count_kernel.set_arg(0, *input_buffer);
        count_kernel.set_arg(1, input_offset);
        count_kernel.set_arg(2, static_cast<uint_>(count));
        count_kernel.set_arg(3, counts);
        count_kernel.set_arg(4, offsets);
        count_kernel.set_arg(5, block_size * sizeof(uint_), 0);
        count_kernel.set_arg(6, i * k);
        queue.enqueue_1d_range_kernel(count_kernel, 0,
                                      block_count * block_size, block_size);

        // scan counts
        if(k == 1){
            typedef uint2_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 2){
            typedef uint4_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 4){
            typedef uint16_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else {
            BOOST_ASSERT(false && "unknown k");
            break;
        }

        // scan global offsets
        scan_kernel.set_arg(0, counts);
        scan_kernel.set_arg(1, offsets);
        scan_kernel.set_arg(2, block_count);
        queue.enqueue_task(scan_kernel);

        // scatter values
        scatter_kernel.set_arg(0, *input_buffer);
        scatter_kernel.set_arg(1, input_offset);
        scatter_kernel.set_arg(2, static_cast<uint_>(count));
        scatter_kernel.set_arg(3, i * k);
        scatter_kernel.set_arg(4, counts);
        scatter_kernel.set_arg(5, offsets);
        scatter_kernel.set_arg(6, *output_buffer);
        scatter_kernel.set_arg(7, output_offset);
        if(sort_by_key){
            scatter_kernel.set_arg(8, *values_input_buffer);
            scatter_kernel.set_arg(9, values_input_offset);
            scatter_kernel.set_arg(10, *values_output_buffer);
            scatter_kernel.set_arg(11, values_output_offset);
        }
        queue.enqueue_1d_range_kernel(scatter_kernel, 0,
                                      block_count * block_size, block_size);

        // swap buffers
        std::swap(input_buffer, output_buffer);
        std::swap(values_input_buffer, values_output_buffer);
        std::swap(input_offset, output_offset);
        std::swap(values_input_offset, values_output_offset);
    }
}
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    const size_t compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    std::string cache_key =
        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));

    // for inputs smaller than serial_scan_threshold
    // serial_scan algorithm is used
    uint_ serial_scan_threshold =
        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
    serial_scan_threshold =
        (std::max)(serial_scan_threshold, uint_(compute_units));

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return result;
    }
    else if(count < serial_scan_threshold) {
        return serial_scan(first, last, result, exclusive, init, op, queue);
    }

    buffer block_partial_sums(context, sizeof(output_type) * compute_units);

    // create scan kernel
    meta_kernel k("scan_on_cpu_block_scan");

    // Arguments
    size_t count_arg = k.add_arg<uint_>("count");
    size_t init_arg = k.add_arg<output_type>("initial_value");
    size_t block_partial_sums_arg =
        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    k << "uint block = " <<
             "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
         "uint index = get_global_id(0) * block;\n" <<
         "uint end = min(count, index + block);\n";

    if(!exclusive){
        k << k.decl<output_type>("sum") << " = " <<
                 first[k.var<uint_>("index")] << ";\n" <<
             result[k.var<uint_>("index")] << " = sum;\n" <<
             "index++;\n";
    }
    else {
        k << k.decl<output_type>("sum") << ";\n" <<
             "if(index == 0){\n" <<
                 "sum = initial_value;\n" <<
             "}\n" <<
             "else {\n" <<
                 "sum = " << first[k.var<uint_>("index")] << ";\n" <<
                 "index++;\n" <<
             "}\n";
    }

    k << "while(index < end){\n" <<
             // load next value
             k.decl<const input_type>("value") << " = " <<
                 first[k.var<uint_>("index")] << ";\n";
    if(exclusive){
        k << "if(get_global_id(0) == 0){\n" <<
                 result[k.var<uint_>("index")] << " = sum;\n" <<
             "}\n";
    }
    k << "sum = " << op(k.var<output_type>("sum"),
                        k.var<output_type>("value")) << ";\n";
    if(!exclusive){
        k << "if(get_global_id(0) == 0){\n" <<
                 result[k.var<uint_>("index")] << " = sum;\n" <<
             "}\n";
    }
    k << "index++;\n" <<
         "}\n" << // end while
         "block_partial_sums[get_global_id(0)] = sum;\n";

    // compile scan kernel
    kernel block_scan_kernel = k.compile(context);

    // setup kernel arguments
    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    size_t global_work_size = compute_units;
    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);

    // scan is done
    if(compute_units < 2) {
        return result + count;
    }

    // final scan kernel
    meta_kernel l("scan_on_cpu_final_scan");

    // Arguments
    count_arg = l.add_arg<uint_>("count");
    block_partial_sums_arg =
        l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    l << "uint block = " <<
             "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
         "uint index = block + get_global_id(0) * block;\n" <<
         "uint end = min(count, index + block);\n" <<
         k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
         "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
             "sum = " << op(k.var<output_type>("sum"),
                            k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
         "}\n" <<
         "while(index < end){\n";

    if(exclusive){
        l << l.decl<output_type>("value") << " = " <<
                 first[k.var<uint_>("index")] << ";\n" <<
             result[k.var<uint_>("index")] << " = sum;\n" <<
             "sum = " << op(k.var<output_type>("sum"),
                            k.var<output_type>("value")) << ";\n";
    }
    else {
        l << "sum = " << op(k.var<output_type>("sum"),
                            first[k.var<uint_>("index")]) << ";\n" <<
             result[k.var<uint_>("index")] << " = sum;\n";
    }
    l << "index++;\n" <<
         "}\n";

    // compile scan kernel
    kernel final_scan_kernel = l.compile(context);

    // setup kernel arguments
    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    global_work_size = compute_units;
    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);

    // return iterator pointing to the end of the result range
    return result + count;
}
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB]; \n" <<
        "    }\n" <<
        "}\n" <<
        "scratch[lid] = sum;\n";

    // discrimination on vendor name
    if(is_nvidia_device(device))
        k << ReduceBody<T, true>::body();
    else
        k << ReduceBody<T, false>::body();

    k <<
        // write sum to output
        "if(lid == 0){\n" <<
        "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);
    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);
    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = std::ceil(float(count) / vpt / tpb);

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, count / vpt / tpb * sizeof(T));
    const buffer *output_buffer = &pong;

    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            size_t work_size = std::ceil(float(count) / vpt);
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = std::ceil(float(count) / vpt / tpb);
        }
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));
    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}
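// Illustrative usage sketch (assumption, not part of the code above): the
// reduce_on_gpu()/initial_reduce() path is normally reached through the
// public boost::compute::reduce() algorithm.
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/reduce.hpp>

int example_reduce(boost::compute::command_queue &queue)
{
    using namespace boost::compute;
    int host[] = { 1, 2, 3, 4, 5 };
    vector<int> data(host, host + 5, queue);

    int sum = 0;
    // reduces on the device (plus<int> by default) and copies the result back to `sum`
    reduce(data.begin(), data.end(), &sum, queue);
    return sum; // 15
}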
inline InputIterator find_extrema_on_cpu(InputIterator first,
                                         InputIterator last,
                                         Compare compare,
                                         const bool find_minimum,
                                         command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    size_t count = iterator_range_size(first, last);

    const device &device = queue.get_device();
    const uint_ compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);
    std::string cache_key =
        "__boost_find_extrema_cpu_" + boost::lexical_cast<std::string>(sizeof(input_type));

    // for inputs smaller than serial_find_extrema_threshold
    // serial_find_extrema algorithm is used
    uint_ serial_find_extrema_threshold = parameters->get(
        cache_key, "serial_find_extrema_threshold", 16384 * sizeof(input_type)
    );
    serial_find_extrema_threshold =
        (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units));

    const context &context = queue.get_context();
    if(count < serial_find_extrema_threshold) {
        return serial_find_extrema(first, last, compare, find_minimum, queue);
    }

    meta_kernel k("find_extrema_on_cpu");
    buffer output(context, sizeof(input_type) * compute_units);
    buffer output_idx(
        context, sizeof(uint_) * compute_units,
        buffer::read_write | buffer::alloc_host_ptr
    );

    size_t count_arg = k.add_arg<uint_>("count");
    size_t output_arg = k.add_arg<input_type *>(memory_object::global_memory, "output");
    size_t output_idx_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output_idx");

    k << "uint block = " <<
             "(uint)ceil(((float)count)/get_global_size(0));\n" <<
         "uint index = get_global_id(0) * block;\n" <<
         "uint end = min(count, index + block);\n" <<
         "uint value_index = index;\n" <<
         k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" <<

         "index++;\n" <<
         "while(index < end){\n" <<
             k.decl<input_type>("candidate") << " = " <<
                 first[k.var<uint_>("index")] << ";\n" <<
             "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
             "bool compare = " << compare(k.var<input_type>("candidate"),
                                          k.var<input_type>("value")) << ";\n" <<
             "#else\n" <<
             "bool compare = " << compare(k.var<input_type>("value"),
                                          k.var<input_type>("candidate")) << ";\n" <<
             "#endif\n" <<
             "value = compare ? candidate : value;\n" <<
             "value_index = compare ? index : value_index;\n" <<
             "index++;\n" <<
         "}\n" <<
         "output[get_global_id(0)] = value;\n" <<
         "output_idx[get_global_id(0)] = value_index;\n";

    size_t global_work_size = compute_units;
    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    kernel kernel = k.compile(context, options);

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(output_arg, output);
    kernel.set_arg(output_idx_arg, output_idx);
    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);

    buffer_iterator<input_type> result = serial_find_extrema(
        make_buffer_iterator<input_type>(output),
        make_buffer_iterator<input_type>(output, global_work_size),
        compare,
        find_minimum,
        queue
    );

    uint_* output_idx_host_ptr = static_cast<uint_*>(
        queue.enqueue_map_buffer(
            output_idx, command_queue::map_read,
            0, global_work_size * sizeof(uint_)
        )
    );

    difference_type extremum_idx =
        static_cast<difference_type>(*(output_idx_host_ptr + result.get_index()));
    return first + extremum_idx;
}
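// Illustrative usage sketch (assumption, not part of the code above): the
// find_extrema_* helpers back the public min_element()/max_element()
// algorithms, which is the interface one would normally call.
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/max_element.hpp>

int example_max_element(boost::compute::command_queue &queue)
{
    using namespace boost::compute;
    int host[] = { 3, 7, 1, 9, 4 };
    vector<int> data(host, host + 5, queue);

    vector<int>::iterator max_it =
        max_element(data.begin(), data.end(), queue);
    return static_cast<int>(max_it - data.begin()); // expected index: 3
}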
inline void merge_blocks_on_gpu(KeyIterator keys_first,
                                ValueIterator values_first,
                                KeyIterator out_keys_first,
                                ValueIterator out_values_first,
                                Compare compare,
                                const size_t count,
                                const size_t block_size,
                                const bool sort_by_key,
                                command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("merge_blocks");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t block_size_arg = k.add_arg<const uint_>("block_size");

    k <<
        // get global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        "if(gid >= count) {\n" <<
            "return;\n" <<
        "}\n" <<

        k.decl<const key_type>("my_key") << " = " <<
            keys_first[k.var<const uint_>("gid")] << ";\n";
    if(sort_by_key) {
        k << k.decl<const value_type>("my_value") << " = " <<
                 values_first[k.var<const uint_>("gid")] << ";\n";
    }

    k <<
        // get my block idx
        k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" <<
        k.decl<const bool>("my_block_idx_is_odd") << " = " <<
            "my_block_idx & 0x1;\n" <<

        k.decl<const uint_>("other_block_idx") << " = " <<
            // if(my_block_idx is odd) {} else {}
            "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" <<

        // get ranges of my block and the other block
        // [my_block_start; my_block_end)
        // [other_block_start; other_block_end)
        k.decl<const uint_>("my_block_start") << " = " <<
            "min(my_block_idx * block_size, count);\n" << // including
        k.decl<const uint_>("my_block_end") << " = " <<
            "min((my_block_idx + 1) * block_size, count);\n" << // excluding

        k.decl<const uint_>("other_block_start") << " = " <<
            "min(other_block_idx * block_size, count);\n" << // including
        k.decl<const uint_>("other_block_end") << " = " <<
            "min((other_block_idx + 1) * block_size, count);\n" << // excluding

        // other block is empty, nothing to merge here
        "if(other_block_start == count){\n" <<
            out_keys_first[k.var<uint_>("gid")] << " = my_key;\n";
    if(sort_by_key) {
        k << out_values_first[k.var<uint_>("gid")] << " = my_value;\n";
    }
    k <<
            "return;\n" <<
        "}\n" <<

        // lower bound
        // left_idx - lower bound
        k.decl<uint_>("left_idx") << " = other_block_start;\n" <<
        k.decl<uint_>("right_idx") << " = other_block_end;\n" <<
        "while(left_idx < right_idx) {\n" <<
            k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
            k.decl<key_type>("mid_key") << " = " <<
                keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
            k.decl<bool>("smaller") << " = " <<
                compare(k.var<key_type>("mid_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            "left_idx = smaller ? mid_idx + 1 : left_idx;\n" <<
            "right_idx = smaller ? right_idx : mid_idx;\n" <<
        "}\n" <<

        // left_idx is found position in other block

        // if my_block is odd we need to get the upper bound
        "right_idx = other_block_end;\n" <<
        "if(my_block_idx_is_odd && left_idx != right_idx) {\n" <<
            k.decl<key_type>("upper_key") << " = " <<
                keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "while(" <<
                "!(" << compare(k.var<key_type>("upper_key"),
                                k.var<key_type>("my_key")) << ") && " <<
                "!(" << compare(k.var<key_type>("my_key"),
                                k.var<key_type>("upper_key")) << ") && " <<
                "left_idx < right_idx" <<
            ")" <<
            "{\n" <<
                k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
                k.decl<key_type>("mid_key") << " = " <<
                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
                k.decl<bool>("equal") << " = " <<
                    "!(" << compare(k.var<key_type>("mid_key"),
                                    k.var<key_type>("my_key")) << ") && " <<
                    "!(" << compare(k.var<key_type>("my_key"),
                                    k.var<key_type>("mid_key")) << ");\n" <<
                "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" <<
                "right_idx = equal ? right_idx : mid_idx;\n" <<
                "upper_key = " <<
                    keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "}\n" <<
        "}\n" <<

        k.decl<uint_>("offset") << " = 0;\n" <<
        "offset += gid - my_block_start;\n" <<
        "offset += left_idx - other_block_start;\n" <<
        "offset += min(my_block_start, other_block_start);\n" <<
        out_keys_first[k.var<uint_>("offset")] << " = my_key;\n";
    if(sort_by_key) {
        k << out_values_first[k.var<uint_>("offset")] << " = my_value;\n";
    }

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size = (std::min)(
        size_t(256),
        kernel.get_work_group_info<size_t>(
            queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE
        )
    );
    const size_t global_size =
        work_group_size * static_cast<size_t>(
            std::ceil(float(count) / work_group_size)
        );

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));

    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
}
inline size_t bitonic_block_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("bitonic_block_sort");
    size_t count_arg = k.add_arg<const uint_>("count");

    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = 0;
    if(sort_by_key) {
        local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
    }

    k <<
        // Work item global and local ids
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n";

    // declare my_key and my_value
    k << k.decl<key_type>("my_key") << ";\n";
    // Instead of copying values (my_value) in local memory with keys
    // we save local index (uchar) and copy my_value at the end at
    // final index. This saves local memory.
    if(sort_by_key) {
        k << k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
    }

    // load key
    k << "if(gid < count) {\n" <<
             k.var<key_type>("my_key") << " = " <<
                 keys_first[k.var<const uint_>("gid")] << ";\n" <<
         "}\n";

    // load key and index to local memory
    k << "lkeys[lid] = my_key;\n";
    if(sort_by_key) {
        k << "lidx[lid] = my_index;\n";
    }

    k << k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<
         k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";

    // When work group size is a power of 2 bitonic sorter can be used;
    // otherwise, slower odd-even sort is used.
    k <<
        // check if n is power of 2
        "if(((n != 0) && ((n & (~n + 1)) == n))) {\n";

    // bitonic sort, not stable
    k <<
        // wait for keys and vals to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "#pragma unroll\n" <<
        "for(" <<
            k.decl<uint_>("length") << " = 1; " <<
            "length < n; " <<
            "length <<= 1" <<
        ") {\n" <<
            // direction of sort: false -> asc, true -> desc
            k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" <<
            "for(" <<
                k.decl<uint_>("k") << " = length; " <<
                "k > 0; " <<
                "k >>= 1" <<
            ") {\n" <<

                // sibling to compare with my key
                k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
                k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
                k.decl<bool>("compare") << " = " <<
                    compare(k.var<key_type>("sibling_key"),
                            k.var<key_type>("my_key")) << ";\n" <<
                k.decl<bool>("equal") << " = !(compare || " <<
                    compare(k.var<key_type>("my_key"),
                            k.var<key_type>("sibling_key")) << ");\n" <<
                k.decl<bool>("swap") <<
                    " = compare ^ (sibling_idx < lid) ^ direction;\n" <<
                "swap = equal ? false : swap;\n" <<
                "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key) {
        k << "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
         "lkeys[lid] = my_key;\n";
    if(sort_by_key) {
        k << "lidx[lid] = my_index;\n";
    }
    k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
         "}\n" <<
         "}\n"; // end of bitonic sort

    // odd-even sort, not stable
    k << "}\n" <<
         "else { \n";

    k << k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" <<
         k.decl<uint_>("oddsibling_idx") << " = " <<
             "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" <<
         k.decl<uint_>("evensibling_idx") << " = " <<
             "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" <<

         // wait for keys and vals to be stored in local memory
         "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

         "#pragma unroll\n" <<
         "for(" <<
             k.decl<uint_>("i") << " = 0; " <<
             "i < n; " <<
             "i++" <<
         ") {\n" <<
             k.decl<uint_>("sibling_idx") <<
                 " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" <<
             k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
             k.decl<bool>("compare") << " = " <<
                 compare(k.var<key_type>("sibling_key"),
                         k.var<key_type>("my_key")) << ";\n" <<
             k.decl<bool>("equal") << " = !(compare || " <<
                 compare(k.var<key_type>("my_key"),
                         k.var<key_type>("sibling_key")) << ");\n" <<
             k.decl<bool>("swap") <<
                 " = compare ^ (sibling_idx < lid);\n" <<
             "swap = equal ? false : swap;\n" <<
             "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key) {
        k << "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
         "lkeys[lid] = my_key;\n";
    if(sort_by_key) {
        k << "lidx[lid] = my_index;\n";
    }
    k << "barrier(CLK_LOCAL_MEM_FENCE);\n"
         "}\n" << // for
         "}\n"; // else
    // end of odd-even sort

    // save key and value
    k << "if(gid < count) {\n" <<
         keys_first[k.var<const uint_>("gid")] << " = " <<
             k.var<key_type>("my_key") << ";\n";
    if(sort_by_key) {
        k << k.decl<value_type>("my_value") << " = " <<
                 values_first[k.var<const uint_>("offset + my_index")] << ";\n" <<
             "barrier(CLK_GLOBAL_MEM_FENCE);\n" <<
             values_first[k.var<const uint_>("gid")] << " = my_value;\n";
    }
    k << // end if
         "}\n";

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size =
        pick_bitonic_block_sort_block_size<key_type, uchar_>(
            kernel.get_work_group_info<size_t>(
                device, CL_KERNEL_WORK_GROUP_SIZE
            ),
            device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE),
            sort_by_key
        );

    const size_t global_size =
        work_group_size * static_cast<size_t>(
            std::ceil(float(count) / work_group_size)
        );

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size));
    if(sort_by_key) {
        kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size));
    }

    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);

    // return size of the block
    return work_group_size;
}
inline void merge_blocks(KeyIterator keys_first,
                         ValueIterator values_first,
                         KeyIterator keys_result,
                         ValueIterator values_result,
                         Compare compare,
                         size_t count,
                         const size_t block_size,
                         const bool sort_by_key,
                         command_queue &queue)
{
    (void) values_result;
    (void) values_first;

    meta_kernel k("merge_sort_on_cpu_merge_blocks");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t block_size_arg = k.add_arg<uint_>("block_size");

    k << k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
         k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
         k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
         k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
         k.decl<uint_>("result_idx") << " = b1_start;\n" <<

         // merging block 1 and block 2 (stable)
         "while(b1_start < b1_end && b2_start < b2_end){\n" <<
         " if( " << compare(keys_first[k.var<uint_>("b2_start")],
                            keys_first[k.var<uint_>("b1_start")]) << "){\n" <<
         "     " << keys_result[k.var<uint_>("result_idx")] << " = " <<
                     keys_first[k.var<uint_>("b2_start")] << ";\n";
    if(sort_by_key){
        k << "     " << values_result[k.var<uint_>("result_idx")] << " = " <<
                         values_first[k.var<uint_>("b2_start")] << ";\n";
    }
    k << "     b2_start++;\n" <<
         " }\n" <<
         " else {\n" <<
         "     " << keys_result[k.var<uint_>("result_idx")] << " = " <<
                     keys_first[k.var<uint_>("b1_start")] << ";\n";
    if(sort_by_key){
        k << "     " << values_result[k.var<uint_>("result_idx")] << " = " <<
                         values_first[k.var<uint_>("b1_start")] << ";\n";
    }
    k << "     b1_start++;\n" <<
         " }\n" <<
         " result_idx++;\n" <<
         "}\n" <<
         "while(b1_start < b1_end){\n" <<
         " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
                 keys_first[k.var<uint_>("b1_start")] << ";\n";
    if(sort_by_key){
        k << " " << values_result[k.var<uint_>("result_idx")] << " = " <<
                     values_first[k.var<uint_>("b1_start")] << ";\n";
    }
    k << " b1_start++;\n" <<
         " result_idx++;\n" <<
         "}\n" <<
         "while(b2_start < b2_end){\n" <<
         " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
                 keys_first[k.var<uint_>("b2_start")] << ";\n";
    if(sort_by_key){
        k << " " << values_result[k.var<uint_>("result_idx")] << " = " <<
                     values_first[k.var<uint_>("b2_start")] << ";\n";
    }
    k << " b2_start++;\n" <<
         " result_idx++;\n" <<
         "}\n";

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);
    kernel.set_arg(count_arg, static_cast<const uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));

    const size_t global_size = static_cast<size_t>(
        std::ceil(float(count) / (2 * block_size))
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
}
inline void radix_sort(Iterator first, Iterator last, command_queue &queue)
{
    typedef typename std::iterator_traits<Iterator>::value_type value_type;
    typedef typename radix_sort_value_type<sizeof(value_type)>::type sort_type;

    const context &context = queue.get_context();
    size_t count = detail::iterator_range_size(first, last);

    // sort parameters
    const uint_ k = 4;
    const uint_ k2 = 1 << k;
    const uint_ block_size = 128;

    uint_ block_count = count / block_size;
    if(block_count * block_size != count){
        block_count++;
    }

    // setup kernels
    program radix_sort_program =
        program::create_with_source(radix_sort_source, context);

    std::stringstream options;
    options << "-DK=" << k;
    options << " -DT=" << type_name<sort_type>();
    options << " -DBLOCK_SIZE=" << block_size;
    if(boost::is_floating_point<value_type>::value){
        options << " -DIS_FLOATING_POINT";
    }
    if(boost::is_signed<value_type>::value){
        options << " -DIS_SIGNED";
    }
    radix_sort_program.build(options.str());

    kernel count_kernel(radix_sort_program, "count");
    kernel scan_kernel(radix_sort_program, "scan");
    kernel scatter_kernel(radix_sort_program, "scatter");

    // setup temporary buffers
    vector<value_type> output(count, context);
    vector<uint_> offsets(k2, context);
    vector<uint_> counts(block_count * k2, context);

    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
        // write counts
        count_kernel.set_arg(0, *input_buffer);
        count_kernel.set_arg(1, static_cast<uint_>(count));
        count_kernel.set_arg(2, counts.get_buffer());
        count_kernel.set_arg(3, offsets.get_buffer());
        count_kernel.set_arg(4, block_size * sizeof(uint_), 0);
        count_kernel.set_arg(5, i * k);
        queue.enqueue_1d_range_kernel(count_kernel, 0,
                                      block_count * block_size, block_size);

        // scan counts
        if(k == 1){
            typedef uint2_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 2){
            typedef uint4_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 4){
            typedef uint16_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else {
            BOOST_ASSERT(false && "unknown k");
            break;
        }

        // scan global offsets
        scan_kernel.set_arg(0, counts.get_buffer());
        scan_kernel.set_arg(1, offsets.get_buffer());
        scan_kernel.set_arg(2, block_count);
        queue.enqueue_task(scan_kernel);

        // scatter values
        scatter_kernel.set_arg(0, *input_buffer);
        scatter_kernel.set_arg(1, static_cast<uint_>(count));
        scatter_kernel.set_arg(2, i * k);
        scatter_kernel.set_arg(3, counts.get_buffer());
        scatter_kernel.set_arg(4, offsets.get_buffer());
        scatter_kernel.set_arg(5, *output_buffer);
        queue.enqueue_1d_range_kernel(scatter_kernel, 0,
                                      block_count * block_size, block_size);

        // swap buffers
        std::swap(input_buffer, output_buffer);
    }
}
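// Illustrative usage sketch (assumption, not part of the code above): the
// radix sort shown here is a detail of the public boost::compute::sort()
// algorithm, which typically dispatches to it for integral key types.
#include <vector>
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/sort.hpp>
#include <boost/compute/algorithm/copy.hpp>

void example_sort(boost::compute::command_queue &queue)
{
    using namespace boost::compute;
    std::vector<unsigned int> host(1000);
    for(size_t i = 0; i < host.size(); i++) host[i] = static_cast<unsigned int>(host.size() - i);

    vector<unsigned int> device_vec(host.begin(), host.end(), queue);
    sort(device_vec.begin(), device_vec.end(), queue);               // ascending sort on the device
    copy(device_vec.begin(), device_vec.end(), host.begin(), queue); // copy results back to the host
}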