inline void radix_sort_impl(const buffer_iterator<T> first, const buffer_iterator<T> last, const buffer_iterator<T2> values_first, const bool ascending, command_queue &queue) { typedef T value_type; typedef typename radix_sort_value_type<sizeof(T)>::type sort_type; const device &device = queue.get_device(); const context &context = queue.get_context(); // if we have a valid values iterator then we are doing a // sort by key and have to set up the values buffer bool sort_by_key = (values_first.get_buffer().get() != 0); // load (or create) radix sort program std::string cache_key = std::string("__boost_radix_sort_") + type_name<value_type>(); if(sort_by_key){ cache_key += std::string("_with_") + type_name<T2>(); } boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // sort parameters const uint_ k = parameters->get(cache_key, "k", 4); const uint_ k2 = 1 << k; const uint_ block_size = parameters->get(cache_key, "tpb", 128); // sort program compiler options std::stringstream options; options << "-DK_BITS=" << k; options << " -DT=" << type_name<sort_type>(); options << " -DBLOCK_SIZE=" << block_size; if(boost::is_floating_point<value_type>::value){ options << " -DIS_FLOATING_POINT"; } if(boost::is_signed<value_type>::value){ options << " -DIS_SIGNED"; } if(sort_by_key){ options << " -DSORT_BY_KEY"; options << " -DT2=" << type_name<T2>(); options << enable_double<T2>(); } if(ascending){ options << " -DASC"; } // load radix sort program program radix_sort_program = cache->get_or_build( cache_key, options.str(), radix_sort_source, context ); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); size_t count = detail::iterator_range_size(first, last); uint_ block_count = static_cast<uint_>(count / block_size); if(block_count * block_size != count){ 
block_count++; } // setup temporary buffers vector<value_type> output(count, context); vector<T2> values_output(sort_by_key ? count : 0, context); vector<uint_> offsets(k2, context); vector<uint_> counts(block_count * k2, context); const buffer *input_buffer = &first.get_buffer(); uint_ input_offset = static_cast<uint_>(first.get_index()); const buffer *output_buffer = &output.get_buffer(); uint_ output_offset = 0; const buffer *values_input_buffer = &values_first.get_buffer(); uint_ values_input_offset = static_cast<uint_>(values_first.get_index()); const buffer *values_output_buffer = &values_output.get_buffer(); uint_ values_output_offset = 0; for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ // write counts count_kernel.set_arg(0, *input_buffer); count_kernel.set_arg(1, input_offset); count_kernel.set_arg(2, static_cast<uint_>(count)); count_kernel.set_arg(3, counts); count_kernel.set_arg(4, offsets); count_kernel.set_arg(5, block_size * sizeof(uint_), 0); count_kernel.set_arg(6, i * k); queue.enqueue_1d_range_kernel(count_kernel, 0, block_count * block_size, block_size); // scan counts if(k == 1){ typedef uint2_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 2){ typedef uint4_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 4){ typedef uint16_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else { BOOST_ASSERT(false && 
"unknown k"); break; } // scan global offsets scan_kernel.set_arg(0, counts); scan_kernel.set_arg(1, offsets); scan_kernel.set_arg(2, block_count); queue.enqueue_task(scan_kernel); // scatter values scatter_kernel.set_arg(0, *input_buffer); scatter_kernel.set_arg(1, input_offset); scatter_kernel.set_arg(2, static_cast<uint_>(count)); scatter_kernel.set_arg(3, i * k); scatter_kernel.set_arg(4, counts); scatter_kernel.set_arg(5, offsets); scatter_kernel.set_arg(6, *output_buffer); scatter_kernel.set_arg(7, output_offset); if(sort_by_key){ scatter_kernel.set_arg(8, *values_input_buffer); scatter_kernel.set_arg(9, values_input_offset); scatter_kernel.set_arg(10, *values_output_buffer); scatter_kernel.set_arg(11, values_output_offset); } queue.enqueue_1d_range_kernel(scatter_kernel, 0, block_count * block_size, block_size); // swap buffers std::swap(input_buffer, output_buffer); std::swap(values_input_buffer, values_output_buffer); std::swap(input_offset, output_offset); std::swap(values_input_offset, values_output_offset); } }
// Selects N entries on the OpenCL device by repeatedly partitioning the
// candidates into `num_buckets` value ranges and narrowing down to the bucket
// that straddles position N.
//
//   N    number of entries to return; also the size of the returned vector
//   in   device vector with the input values
//
// Returns a device vector of N unsigned ints gathered from the scatter
// kernel's output.
// NOTE(review): the kernels are not visible here; presumably the entries are
// indices into `in` of the N smallest values - confirm against
// bucket_select.cl. N is presumably expected to satisfy 0 <= N <= in.size().
//
// On the first call the kernel source is read from "bucket_select.cl"
// (relative to the current working directory) and registered under the
// program name "test". Throws std::ios_base::failure if that file cannot be
// opened or read.
viennacl::vector<unsigned int> bucket_select(int N, const viennacl::vector<basic_type>& in)
{
    viennacl::vector<unsigned int> src(in.size(), viennacl::traits::context(in));
    viennacl::vector<unsigned int> dst(in.size(), viennacl::traits::context(in));

    // load kernels (once per process)
    static bool init = false;
    static int num_groups = 1;
    static int wg_size = 128;

    if (!init)
    {
        FILE* source_file = fopen("bucket_select.cl", "rb");
        if (!source_file)
            throw std::ios_base::failure("bucket_select: cannot open bucket_select.cl");

        // determine the file size by seeking to the end
        long source_length = -1;
        if (fseek(source_file, 0, SEEK_END) == 0)
            source_length = ftell(source_file);
        if (source_length < 0)
        {
            fclose(source_file);
            throw std::ios_base::failure("bucket_select: cannot determine the size of bucket_select.cl");
        }
        rewind(source_file);

        std::vector<char> binary;
        binary.resize(static_cast<std::vector<char>::size_type>(source_length));

        // an empty file needs no read (and &binary[0] would be invalid)
        bool read_ok = true;
        if (!binary.empty())
            read_ok = (fread(&binary[0], binary.size(), 1, source_file) == 1);
        fclose(source_file);
        if (!read_ok)
            throw std::ios_base::failure("bucket_select: cannot read bucket_select.cl");

        binary.push_back(0); // NUL-terminate so the buffer can be used as a C string

        viennacl::context g_context = viennacl::ocl::current_context();
        viennacl::ocl::context* ctx = g_context.opencl_pcontext();
        std::cout << "Device " << ctx->current_device().name() << std::endl;
        ctx->build_options("-cl-std=CL2.0 -D CL_VERSION_2_0");
        std::string program_text(&binary[0]);
        ctx->add_program(program_text, std::string("test"));

        // the original redeclared `init` inside this scope, so the outer flag
        // was never set and the program was re-added on every call
        init = true;
    }

    viennacl::ocl::kernel scan_kernel = viennacl::ocl::current_context().get_kernel("test", "scan_buckets");
    viennacl::ocl::kernel scatter_kernel = viennacl::ocl::current_context().get_kernel("test", "scatter_buckets");
    viennacl::ocl::kernel init_offsets_kernel = viennacl::ocl::current_context().get_kernel("test", "init_offsets");

    scan_kernel.local_work_size(0, wg_size);
    scan_kernel.global_work_size(0, wg_size * num_groups);

    scatter_kernel.local_work_size(0, wg_size);
    scatter_kernel.global_work_size(0, wg_size * num_groups);

    init_offsets_kernel.local_work_size(0, wg_size);
    init_offsets_kernel.global_work_size(0, wg_size * num_groups);

    cl_uint size = static_cast<cl_uint>(src.size());
    viennacl::ocl::enqueue(init_offsets_kernel(size, src));

    int position = 0; // number of entries of `result` filled so far
    viennacl::vector<unsigned int> result(N, viennacl::traits::context(in));

    int num_buckets = 10;
    viennacl::vector<unsigned int> global_histogram((num_buckets + 1) * num_groups, viennacl::traits::context(in));
    viennacl::vector<unsigned int> global_histogram_prefix((num_buckets + 1) * num_groups + 1, viennacl::traits::context(in));
    std::vector<unsigned int> global_histogram_cpu((num_buckets + 1) * num_groups + 1);

    int scan_start = 0;
    int scan_end = in.size(); // number of candidates currently held in `src`
    int split_bucket = 0;

    // value range [base_value, base_value + num_buckets * pivot) is split into
    // buckets of width `pivot`; both are refined on every iteration
    basic_type base_value = 0;
    basic_type pivot = std::numeric_limits<basic_type>::max() / num_buckets;
    assert(pivot > 0);

    while (position < N)
    {
        // round scan_end up to a multiple of the work-group size
        int full_groups_end = (scan_end / wg_size) * wg_size;
        int loop_end = (full_groups_end == scan_end) ? full_groups_end : full_groups_end + wg_size;

        viennacl::ocl::enqueue(scan_kernel(in,
                                           src,
                                           scan_end,
                                           loop_end,
                                           viennacl::ocl::local_mem(sizeof(cl_uint) * wg_size),
                                           base_value,
                                           pivot,
                                           num_buckets,
                                           global_histogram));

        viennacl::linalg::exclusive_scan(global_histogram, global_histogram_prefix);
        viennacl::copy(global_histogram_prefix, global_histogram_cpu);
        // fix last element
        global_histogram_cpu[global_histogram_cpu.size() - 1] = global_histogram_cpu[global_histogram_cpu.size() - 2];

        // find the first bucket whose prefix count reaches N
        for (split_bucket = 1; split_bucket < num_buckets; ++split_bucket)
        {
            int offset = global_histogram_cpu[num_groups * split_bucket];
            if (offset >= N)
                break;
        }

        viennacl::ocl::enqueue(scatter_kernel(in,
                                              src,
                                              scan_end,
                                              loop_end,
                                              viennacl::ocl::local_mem(sizeof(cl_uint) * wg_size),
                                              base_value,
                                              pivot,
                                              num_buckets,
                                              split_bucket,
                                              global_histogram_prefix,
                                              dst));

        int hist_max = global_histogram_cpu[num_groups * split_bucket];
        int hist_min = global_histogram_cpu[num_groups * (split_bucket - 1)];

#ifdef DEBUG_RADIX_SELECT
        // host-side snapshots for inspection in a debugger; kept out of
        // regular builds because they transfer both full vectors every pass
        std::vector<unsigned int> dst_cpu(in.size());
        std::vector<unsigned int> src_cpu(in.size());
        viennacl::copy(dst, dst_cpu);
        viennacl::copy(src, src_cpu);
#endif

        if (hist_max == N)
            break;

        if (hist_max > N && hist_min < N)
        {
            scan_start = global_histogram_cpu[num_groups * (split_bucket - 1)];
            scan_end = global_histogram_cpu[num_groups * split_bucket];

            // everything in front of the split bucket is part of the answer
            if (scan_start > 0)
            {
                viennacl::copy(dst.begin(), dst.begin() + scan_start, result.begin() + position);
                position += scan_start;
            }

#ifdef DEBUG_RADIX_SELECT
            std::vector<unsigned int> result_cpu(in.size());
            viennacl::copy(result, result_cpu);
#endif

            if (position >= N)
                break;

            // the split bucket becomes the candidate set of the next pass
            if (scan_end == dst.size() && scan_start == 0)
                dst.fast_swap(src);
            else
                viennacl::copy(dst.begin() + scan_start, dst.begin() + scan_end, src.begin());
            scan_end -= scan_start;
        }

        // update pivot
        base_value += pivot * (split_bucket - 1);
        pivot = pivot / num_buckets;
        if (pivot == 0)
            break;
    }

    // fill the remainder from the front of the last scatter output
    if (position < N)
        viennacl::copy(dst.begin(), dst.begin() + (N - position), result.begin() + position);

    return result;
}
inline void radix_sort(Iterator first, Iterator last, command_queue &queue) { typedef typename std::iterator_traits<Iterator>::value_type value_type; typedef typename radix_sort_value_type<sizeof(value_type)>::type sort_type; const context &context = queue.get_context(); size_t count = detail::iterator_range_size(first, last); // sort parameters const uint_ k = 4; const uint_ k2 = 1 << k; const uint_ block_size = 128; uint_ block_count = count / block_size; if(block_count * block_size != count){ block_count++; } // setup kernels program radix_sort_program = program::create_with_source(radix_sort_source, context); std::stringstream options; options << "-DK=" << k; options << " -DT=" << type_name<sort_type>(); options << " -DBLOCK_SIZE=" << block_size; if(boost::is_floating_point<value_type>::value){ options << " -DIS_FLOATING_POINT"; } if(boost::is_signed<value_type>::value){ options << " -DIS_SIGNED"; } radix_sort_program.build(options.str()); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); // setup temporary buffers vector<value_type> output(count, context); vector<uint_> offsets(k2, context); vector<uint_> counts(block_count * k2, context); const buffer *input_buffer = &first.get_buffer(); const buffer *output_buffer = &output.get_buffer(); for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ // write counts count_kernel.set_arg(0, *input_buffer); count_kernel.set_arg(1, static_cast<uint_>(count)); count_kernel.set_arg(2, counts.get_buffer()); count_kernel.set_arg(3, offsets.get_buffer()); count_kernel.set_arg(4, block_size * sizeof(uint_), 0); count_kernel.set_arg(5, i * k); queue.enqueue_1d_range_kernel(count_kernel, 0, block_count * block_size, block_size); // scan counts if(k == 1){ typedef uint2_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), 
make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 2){ typedef uint4_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 4){ typedef uint16_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else { BOOST_ASSERT(false && "unknown k"); break; } // scan global offsets scan_kernel.set_arg(0, counts.get_buffer()); scan_kernel.set_arg(1, offsets.get_buffer()); scan_kernel.set_arg(2, block_count); queue.enqueue_task(scan_kernel); // scatter values scatter_kernel.set_arg(0, *input_buffer); scatter_kernel.set_arg(1, static_cast<uint_>(count)); scatter_kernel.set_arg(2, i * k); scatter_kernel.set_arg(3, counts.get_buffer()); scatter_kernel.set_arg(4, offsets.get_buffer()); scatter_kernel.set_arg(5, *output_buffer); queue.enqueue_1d_range_kernel(scatter_kernel, 0, block_count * block_size, block_size); // swap buffers std::swap(input_buffer, output_buffer); } }