inline void radix_sort_impl(const buffer_iterator<T> first, const buffer_iterator<T> last, const buffer_iterator<T2> values_first, const bool ascending, command_queue &queue) { typedef T value_type; typedef typename radix_sort_value_type<sizeof(T)>::type sort_type; const device &device = queue.get_device(); const context &context = queue.get_context(); // if we have a valid values iterator then we are doing a // sort by key and have to set up the values buffer bool sort_by_key = (values_first.get_buffer().get() != 0); // load (or create) radix sort program std::string cache_key = std::string("__boost_radix_sort_") + type_name<value_type>(); if(sort_by_key){ cache_key += std::string("_with_") + type_name<T2>(); } boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // sort parameters const uint_ k = parameters->get(cache_key, "k", 4); const uint_ k2 = 1 << k; const uint_ block_size = parameters->get(cache_key, "tpb", 128); // sort program compiler options std::stringstream options; options << "-DK_BITS=" << k; options << " -DT=" << type_name<sort_type>(); options << " -DBLOCK_SIZE=" << block_size; if(boost::is_floating_point<value_type>::value){ options << " -DIS_FLOATING_POINT"; } if(boost::is_signed<value_type>::value){ options << " -DIS_SIGNED"; } if(sort_by_key){ options << " -DSORT_BY_KEY"; options << " -DT2=" << type_name<T2>(); options << enable_double<T2>(); } if(ascending){ options << " -DASC"; } // load radix sort program program radix_sort_program = cache->get_or_build( cache_key, options.str(), radix_sort_source, context ); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); size_t count = detail::iterator_range_size(first, last); uint_ block_count = static_cast<uint_>(count / block_size); if(block_count * block_size != count){ block_count++; } // setup temporary buffers vector<value_type> output(count, context); vector<T2> values_output(sort_by_key ? count : 0, context); vector<uint_> offsets(k2, context); vector<uint_> counts(block_count * k2, context); const buffer *input_buffer = &first.get_buffer(); uint_ input_offset = static_cast<uint_>(first.get_index()); const buffer *output_buffer = &output.get_buffer(); uint_ output_offset = 0; const buffer *values_input_buffer = &values_first.get_buffer(); uint_ values_input_offset = static_cast<uint_>(values_first.get_index()); const buffer *values_output_buffer = &values_output.get_buffer(); uint_ values_output_offset = 0; for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ // write counts count_kernel.set_arg(0, *input_buffer); count_kernel.set_arg(1, input_offset); count_kernel.set_arg(2, static_cast<uint_>(count)); count_kernel.set_arg(3, counts); count_kernel.set_arg(4, offsets); count_kernel.set_arg(5, block_size * sizeof(uint_), 0); count_kernel.set_arg(6, i * k); queue.enqueue_1d_range_kernel(count_kernel, 0, block_count * block_size, block_size); // scan counts if(k == 1){ typedef uint2_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 2){ typedef uint4_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 4){ typedef uint16_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else { BOOST_ASSERT(false && "unknown k"); break; } // scan global offsets scan_kernel.set_arg(0, counts); scan_kernel.set_arg(1, offsets); scan_kernel.set_arg(2, block_count); queue.enqueue_task(scan_kernel); // scatter values scatter_kernel.set_arg(0, *input_buffer); scatter_kernel.set_arg(1, input_offset); scatter_kernel.set_arg(2, static_cast<uint_>(count)); scatter_kernel.set_arg(3, i * k); scatter_kernel.set_arg(4, counts); scatter_kernel.set_arg(5, offsets); scatter_kernel.set_arg(6, *output_buffer); scatter_kernel.set_arg(7, output_offset); if(sort_by_key){ scatter_kernel.set_arg(8, *values_input_buffer); scatter_kernel.set_arg(9, values_input_offset); scatter_kernel.set_arg(10, *values_output_buffer); scatter_kernel.set_arg(11, values_output_offset); } queue.enqueue_1d_range_kernel(scatter_kernel, 0, block_count * block_size, block_size); // swap buffers std::swap(input_buffer, output_buffer); std::swap(values_input_buffer, values_output_buffer); std::swap(input_offset, output_offset); std::swap(values_input_offset, values_output_offset); } }
inline void radix_sort(Iterator first, Iterator last, command_queue &queue) { typedef typename std::iterator_traits<Iterator>::value_type value_type; typedef typename radix_sort_value_type<sizeof(value_type)>::type sort_type; const context &context = queue.get_context(); size_t count = detail::iterator_range_size(first, last); // sort parameters const uint_ k = 4; const uint_ k2 = 1 << k; const uint_ block_size = 128; uint_ block_count = count / block_size; if(block_count * block_size != count){ block_count++; } // setup kernels program radix_sort_program = program::create_with_source(radix_sort_source, context); std::stringstream options; options << "-DK=" << k; options << " -DT=" << type_name<sort_type>(); options << " -DBLOCK_SIZE=" << block_size; if(boost::is_floating_point<value_type>::value){ options << " -DIS_FLOATING_POINT"; } if(boost::is_signed<value_type>::value){ options << " -DIS_SIGNED"; } radix_sort_program.build(options.str()); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); // setup temporary buffers vector<value_type> output(count, context); vector<uint_> offsets(k2, context); vector<uint_> counts(block_count * k2, context); const buffer *input_buffer = &first.get_buffer(); const buffer *output_buffer = &output.get_buffer(); for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ // write counts count_kernel.set_arg(0, *input_buffer); count_kernel.set_arg(1, static_cast<uint_>(count)); count_kernel.set_arg(2, counts.get_buffer()); count_kernel.set_arg(3, offsets.get_buffer()); count_kernel.set_arg(4, block_size * sizeof(uint_), 0); count_kernel.set_arg(5, i * k); queue.enqueue_1d_range_kernel(count_kernel, 0, block_count * block_size, block_size); // scan counts if(k == 1){ typedef uint2_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 2){ typedef uint4_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 4){ typedef uint16_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else { BOOST_ASSERT(false && "unknown k"); break; } // scan global offsets scan_kernel.set_arg(0, counts.get_buffer()); scan_kernel.set_arg(1, offsets.get_buffer()); scan_kernel.set_arg(2, block_count); queue.enqueue_task(scan_kernel); // scatter values scatter_kernel.set_arg(0, *input_buffer); scatter_kernel.set_arg(1, static_cast<uint_>(count)); scatter_kernel.set_arg(2, i * k); scatter_kernel.set_arg(3, counts.get_buffer()); scatter_kernel.set_arg(4, offsets.get_buffer()); scatter_kernel.set_arg(5, *output_buffer); queue.enqueue_1d_range_kernel(scatter_kernel, 0, block_count * block_size, block_size); // swap buffers std::swap(input_buffer, output_buffer); } }