Пример #1
0
inline void radix_sort_impl(const buffer_iterator<T> first,
                            const buffer_iterator<T> last,
                            const buffer_iterator<T2> values_first,
                            const bool ascending,
                            command_queue &queue)
{

    typedef T value_type;
    typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;

    const device &device = queue.get_device();
    const context &context = queue.get_context();


    // if we have a valid values iterator then we are doing a
    // sort by key and have to set up the values buffer
    bool sort_by_key = (values_first.get_buffer().get() != 0);

    // load (or create) radix sort program
    std::string cache_key =
        std::string("__boost_radix_sort_") + type_name<value_type>();

    if(sort_by_key){
        cache_key += std::string("_with_") + type_name<T2>();
    }

    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // sort parameters
    const uint_ k = parameters->get(cache_key, "k", 4);
    const uint_ k2 = 1 << k;
    const uint_ block_size = parameters->get(cache_key, "tpb", 128);

    // sort program compiler options
    std::stringstream options;
    options << "-DK_BITS=" << k;
    options << " -DT=" << type_name<sort_type>();
    options << " -DBLOCK_SIZE=" << block_size;

    if(boost::is_floating_point<value_type>::value){
        options << " -DIS_FLOATING_POINT";
    }

    if(boost::is_signed<value_type>::value){
        options << " -DIS_SIGNED";
    }

    if(sort_by_key){
        options << " -DSORT_BY_KEY";
        options << " -DT2=" << type_name<T2>();
        options << enable_double<T2>();
    }

    if(ascending){
        options << " -DASC";
    }

    // load radix sort program
    program radix_sort_program = cache->get_or_build(
        cache_key, options.str(), radix_sort_source, context
    );

    kernel count_kernel(radix_sort_program, "count");
    kernel scan_kernel(radix_sort_program, "scan");
    kernel scatter_kernel(radix_sort_program, "scatter");

    size_t count = detail::iterator_range_size(first, last);

    uint_ block_count = static_cast<uint_>(count / block_size);
    if(block_count * block_size != count){
        block_count++;
    }

    // setup temporary buffers
    vector<value_type> output(count, context);
    vector<T2> values_output(sort_by_key ? count : 0, context);
    vector<uint_> offsets(k2, context);
    vector<uint_> counts(block_count * k2, context);

    const buffer *input_buffer = &first.get_buffer();
    uint_ input_offset = static_cast<uint_>(first.get_index());
    const buffer *output_buffer = &output.get_buffer();
    uint_ output_offset = 0;
    const buffer *values_input_buffer = &values_first.get_buffer();
    uint_ values_input_offset = static_cast<uint_>(values_first.get_index());
    const buffer *values_output_buffer = &values_output.get_buffer();
    uint_ values_output_offset = 0;

    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
        // write counts
        count_kernel.set_arg(0, *input_buffer);
        count_kernel.set_arg(1, input_offset);
        count_kernel.set_arg(2, static_cast<uint_>(count));
        count_kernel.set_arg(3, counts);
        count_kernel.set_arg(4, offsets);
        count_kernel.set_arg(5, block_size * sizeof(uint_), 0);
        count_kernel.set_arg(6, i * k);
        queue.enqueue_1d_range_kernel(count_kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // scan counts
        if(k == 1){
            typedef uint2_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 2){
            typedef uint4_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 4){
            typedef uint16_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else {
            BOOST_ASSERT(false && "unknown k");
            break;
        }

        // scan global offsets
        scan_kernel.set_arg(0, counts);
        scan_kernel.set_arg(1, offsets);
        scan_kernel.set_arg(2, block_count);
        queue.enqueue_task(scan_kernel);

        // scatter values
        scatter_kernel.set_arg(0, *input_buffer);
        scatter_kernel.set_arg(1, input_offset);
        scatter_kernel.set_arg(2, static_cast<uint_>(count));
        scatter_kernel.set_arg(3, i * k);
        scatter_kernel.set_arg(4, counts);
        scatter_kernel.set_arg(5, offsets);
        scatter_kernel.set_arg(6, *output_buffer);
        scatter_kernel.set_arg(7, output_offset);
        if(sort_by_key){
            scatter_kernel.set_arg(8, *values_input_buffer);
            scatter_kernel.set_arg(9, values_input_offset);
            scatter_kernel.set_arg(10, *values_output_buffer);
            scatter_kernel.set_arg(11, values_output_offset);
        }
        queue.enqueue_1d_range_kernel(scatter_kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // swap buffers
        std::swap(input_buffer, output_buffer);
        std::swap(values_input_buffer, values_output_buffer);
        std::swap(input_offset, output_offset);
        std::swap(values_input_offset, values_output_offset);
    }
}
Пример #2
0
// Runs an iterative bucket-select over `in` on the OpenCL device and returns
// a device vector with N entries.
//
// The kernels ("init_offsets", "scan_buckets", "scatter_buckets") are loaded
// once from the file "bucket_select.cl" in the current working directory and
// registered under the program name "test".
//
// Host-side flow of each iteration:
//   1. scan_buckets fills global_histogram for the value window
//      [base_value, base_value + num_buckets * pivot) -- presumably; the
//      kernel source is not visible here, confirm against bucket_select.cl.
//   2. The histogram is exclusive-scanned and copied to the host.
//   3. split_bucket is the first bucket whose prefix offset reaches N.
//   4. scatter_buckets writes into dst; the entries before the split bucket
//      are appended to the result and the split bucket becomes the next
//      (smaller) input range with a pivot that is num_buckets times finer.
//
// NOTE(review): src is initialised by init_offsets and result is filled from
// dst, so the returned values look like element indices of the N smallest
// inputs rather than the values themselves -- confirm.
//
// \param N   number of entries to select
// \param in  input values on the device
// \return    device vector of N unsigned ints
// \throws std::ios_base::failure if bucket_select.cl cannot be opened or read
viennacl::vector<unsigned int> bucket_select(int N, const viennacl::vector<basic_type>& in)
{
	viennacl::vector<unsigned int> src(in.size(), viennacl::traits::context(in));
	viennacl::vector<unsigned int> dst(in.size() , viennacl::traits::context(in));

	// load kernels
	static bool init = false;
	static int num_groups = 1;
	static int wg_size = 128;


	if (!init)
	{
		FILE * tmp = fopen("bucket_select.cl", "rb");
		if (!tmp)
			throw std::ios_base::failure("bucket_select: cannot open bucket_select.cl");

		// determine the file size; -1 marks a failed seek or tell
		const long source_size = (fseek(tmp, 0, SEEK_END) == 0) ? ftell(tmp) : -1L;
		if (source_size < 0)
		{
			fclose(tmp);
			throw std::ios_base::failure("bucket_select: cannot determine size of bucket_select.cl");
		}

		// one extra zero byte so the buffer is a valid C string
		const std::size_t source_bytes = static_cast<std::size_t>(source_size);
		std::vector<char> binary(source_bytes + 1, 0);
		rewind(tmp);
		const std::size_t bytes_read = fread(&binary[0], 1, source_bytes, tmp);
		fclose(tmp);
		if (bytes_read != source_bytes)
			throw std::ios_base::failure("bucket_select: cannot read bucket_select.cl");

		static viennacl::context g_context = viennacl::ocl::current_context();
		viennacl::ocl::context* ctx = g_context.opencl_pcontext();
		std::cout << "Device " << ctx->current_device().name() << std::endl;
		ctx->build_options("-cl-std=CL2.0 -D CL_VERSION_2_0");
		std::string program_text(&binary[0]);
		ctx->add_program(program_text, std::string("test"));
		// must set the function-level flag (no shadowing local), otherwise
		// the program would be re-read and re-added on every call
		init = true;
	}


	viennacl::ocl::kernel scan_kernel = viennacl::ocl::current_context().get_kernel("test", "scan_buckets");
	viennacl::ocl::kernel scatter_kernel = viennacl::ocl::current_context().get_kernel("test", "scatter_buckets");
	viennacl::ocl::kernel init_offsets_kernel = viennacl::ocl::current_context().get_kernel("test", "init_offsets");

	scan_kernel.local_work_size(0, wg_size);
	scan_kernel.global_work_size(0, wg_size * num_groups);

	scatter_kernel.local_work_size(0, wg_size);
	scatter_kernel.global_work_size(0, wg_size * num_groups);

	init_offsets_kernel.local_work_size(0, wg_size);
	init_offsets_kernel.global_work_size(0, wg_size* num_groups);
	cl_uint size = src.size();
	viennacl::ocl::enqueue(init_offsets_kernel(size, src));

	// number of result entries written so far
	int position = 0;
	viennacl::vector<unsigned int> result(N, viennacl::traits::context(in));

	int num_buckets = 10;
	viennacl::vector<unsigned int> global_histogram((num_buckets + 1) * num_groups, viennacl::traits::context(in)); // -wg_size
	viennacl::vector<unsigned int> global_histogram_prefix((num_buckets + 1) * num_groups  + 1, viennacl::traits::context(in));
	std::vector< unsigned int > global_histogram_cpu((num_buckets + 1) * num_groups + 1);
	int scan_start = 0;
	int scan_end = in.size();
	basic_type pivot;
	basic_type base_value;
	int split_bucket = 0;
	base_value = 0;
	// initial bucket width: the full value range split into num_buckets parts
	pivot = std::numeric_limits<basic_type>::max() / num_buckets;
	assert(pivot > 0);
	while (position < N)
	{
		int main = (scan_end / wg_size) * wg_size; // floor to multiple wg size
		int loop_end = main == scan_end ? main : main + wg_size; // add wg size if needed

		viennacl::ocl::enqueue(scan_kernel(in,
			src,
			scan_end,
			loop_end,
			viennacl::ocl::local_mem(sizeof(cl_uint) *wg_size),
			base_value,
			pivot,
			num_buckets,
			global_histogram));

		viennacl::linalg::exclusive_scan(global_histogram, global_histogram_prefix);
		viennacl::copy(global_histogram_prefix, global_histogram_cpu);
		global_histogram_cpu[global_histogram_cpu.size() - 1] = global_histogram_cpu[global_histogram_cpu.size() - 2]; // fix last element

		// find the first bucket whose prefix offset reaches N
		for (split_bucket = 1; split_bucket < num_buckets; ++split_bucket)
		{
			int offset = global_histogram_cpu[num_groups * split_bucket];
			if (offset >= N)
				break;
		}
		viennacl::ocl::enqueue(scatter_kernel(
			in,
			src,
			scan_end,
			loop_end,
			viennacl::ocl::local_mem(sizeof(cl_uint) *wg_size),
			base_value,
			pivot,
			num_buckets,
			split_bucket,
			global_histogram_prefix,
			dst
			));

		int hist_max = global_histogram_cpu[num_groups * split_bucket];
		int hist_min = global_histogram_cpu[num_groups * (split_bucket - 1)];
#ifdef DEBUG_RADIX_SELECT
		// host-side snapshots, only useful when inspecting state in a debugger
		std::vector<unsigned int> dst_cpu(in.size());
		std::vector<unsigned int> src_cpu(in.size());
		viennacl::copy(dst, dst_cpu);
		viennacl::copy(src, src_cpu);
#endif

		// exactly N entries precede the end of the split bucket: done
		if (hist_max == N)
			break;
		if (hist_max> N && hist_min < N)
		{
			scan_start = global_histogram_cpu[num_groups * (split_bucket - 1)];
			scan_end = global_histogram_cpu[num_groups * split_bucket];
			if (scan_start > 0)
			{
				// everything before the split bucket belongs to the result
				viennacl::copy(dst.begin(), dst.begin() + scan_start, result.begin() + position);
				position += scan_start;
			}
#ifdef DEBUG_RADIX_SELECT
			std::vector<unsigned int> result_cpu(in.size());
			viennacl::copy(result, result_cpu);
#endif
			if (position >= N)
				break;
			// the split bucket becomes the input of the next iteration
			if (scan_end == dst.size() && scan_start == 0)
				dst.fast_swap(src);
			else
				viennacl::copy(dst.begin() + scan_start, dst.begin() + scan_end, src.begin());
			scan_end -= scan_start;
		}

		base_value += pivot * (split_bucket-1);
		// update pivot

		pivot = pivot / num_buckets;
		// bucket width cannot be refined any further
		if (pivot == 0)
			break;


	}
	// fill the remaining slots from the front of dst
	if (position <N)
		viennacl::copy(dst.begin(), dst.begin() + (N - position), result.begin() + position);

	return result;
}
Пример #3
0
inline void radix_sort(Iterator first,
                       Iterator last,
                       command_queue &queue)
{
    typedef typename
        std::iterator_traits<Iterator>::value_type
        value_type;
    typedef typename
        radix_sort_value_type<sizeof(value_type)>::type
        sort_type;

    const context &context = queue.get_context();
    size_t count = detail::iterator_range_size(first, last);

    // sort parameters
    const uint_ k = 4;
    const uint_ k2 = 1 << k;
    const uint_ block_size = 128;

    uint_ block_count = count / block_size;
    if(block_count * block_size != count){
        block_count++;
    }

    // setup kernels
    program radix_sort_program =
        program::create_with_source(radix_sort_source, context);
    std::stringstream options;
    options << "-DK=" << k;
    options << " -DT=" << type_name<sort_type>();
    options << " -DBLOCK_SIZE=" << block_size;

    if(boost::is_floating_point<value_type>::value){
        options << " -DIS_FLOATING_POINT";
    }

    if(boost::is_signed<value_type>::value){
        options << " -DIS_SIGNED";
    }

    radix_sort_program.build(options.str());

    kernel count_kernel(radix_sort_program, "count");
    kernel scan_kernel(radix_sort_program, "scan");
    kernel scatter_kernel(radix_sort_program, "scatter");

    // setup temporary buffers
    vector<value_type> output(count, context);
    vector<uint_> offsets(k2, context);
    vector<uint_> counts(block_count * k2, context);

    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
        // write counts
        count_kernel.set_arg(0, *input_buffer);
        count_kernel.set_arg(1, static_cast<uint_>(count));
        count_kernel.set_arg(2, counts.get_buffer());
        count_kernel.set_arg(3, offsets.get_buffer());
        count_kernel.set_arg(4, block_size * sizeof(uint_), 0);
        count_kernel.set_arg(5, i * k);
        queue.enqueue_1d_range_kernel(count_kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // scan counts
        if(k == 1){
            typedef uint2_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 2){
            typedef uint4_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 4){
            typedef uint16_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else {
            BOOST_ASSERT(false && "unknown k");
            break;
        }

        // scan global offsets
        scan_kernel.set_arg(0, counts.get_buffer());
        scan_kernel.set_arg(1, offsets.get_buffer());
        scan_kernel.set_arg(2, block_count);
        queue.enqueue_task(scan_kernel);

        // scatter values
        scatter_kernel.set_arg(0, *input_buffer);
        scatter_kernel.set_arg(1, static_cast<uint_>(count));
        scatter_kernel.set_arg(2, i * k);
        scatter_kernel.set_arg(3, counts.get_buffer());
        scatter_kernel.set_arg(4, offsets.get_buffer());
        scatter_kernel.set_arg(5, *output_buffer);
        queue.enqueue_1d_range_kernel(scatter_kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // swap buffers
        std::swap(input_buffer, output_buffer);
    }
}