clsparseStatus indices_to_offsets(clsparse::vector<T>& offsets, const clsparse::vector<T>& indices, const clsparseControl control) { typedef typename clsparse::vector<T> IndicesArray; typedef typename clsparse::vector<T>::size_type SizeType; //if (std::is_integral<T>) if (!clsparseInitialized) { return clsparseNotInitialized; } //check opencl elements if (control == nullptr) { return clsparseInvalidControlObject; } SizeType size = indices.size() > offsets.size() ? indices.size() : offsets.size(); IndicesArray values (control, indices.size(), 1, CL_MEM_READ_WRITE, true); IndicesArray keys_output (control, indices.size(), 0, CL_MEM_READ_WRITE, false); IndicesArray values_output (control, size, 0, CL_MEM_READ_WRITE, false); clsparseStatus status = internal::reduce_by_key(keys_output, values_output, indices, values, control); CLSPARSE_V(status, "Error: reduce_by_key"); if (status != clsparseSuccess) return status; assert(values_output.size() >= offsets.size()); cl_event clEvent; cl_int cl_status = clEnqueueCopyBuffer(control->queue(), values_output.data()(), offsets.data()(), 0, 0, offsets.size() * sizeof(T), 0, nullptr, &clEvent); CLSPARSE_V(cl_status, "Error: Enqueue copy buffer values to offsets"); cl_status = clWaitForEvents(1, &clEvent); CLSPARSE_V(cl_status, "Error: clWaitForEvents"); cl_status = clReleaseEvent(clEvent); CLSPARSE_V(cl_status, "Error: clReleaseEvent"); // Dunno why but this throws CL_INVALID_CONTEXT erro; // cl::Event event; // cl::enqueueCopyBuffer(values_output.data(), offsets.data(), // 0, 0, offsets.size(), // nullptr, &event); // CLSPARSE_V(cl_status, "Error: enqueueCopyBuffer"); // CLSPARSE_V(event.wait(), "Error: event wait"); status = exclusive_scan<EW_PLUS>(offsets, offsets, control); return status; }
inline void radix_sort_impl(const buffer_iterator<T> first, const buffer_iterator<T> last, const buffer_iterator<T2> values_first, const bool ascending, command_queue &queue) { typedef T value_type; typedef typename radix_sort_value_type<sizeof(T)>::type sort_type; const device &device = queue.get_device(); const context &context = queue.get_context(); // if we have a valid values iterator then we are doing a // sort by key and have to set up the values buffer bool sort_by_key = (values_first.get_buffer().get() != 0); // load (or create) radix sort program std::string cache_key = std::string("__boost_radix_sort_") + type_name<value_type>(); if(sort_by_key){ cache_key += std::string("_with_") + type_name<T2>(); } boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // sort parameters const uint_ k = parameters->get(cache_key, "k", 4); const uint_ k2 = 1 << k; const uint_ block_size = parameters->get(cache_key, "tpb", 128); // sort program compiler options std::stringstream options; options << "-DK_BITS=" << k; options << " -DT=" << type_name<sort_type>(); options << " -DBLOCK_SIZE=" << block_size; if(boost::is_floating_point<value_type>::value){ options << " -DIS_FLOATING_POINT"; } if(boost::is_signed<value_type>::value){ options << " -DIS_SIGNED"; } if(sort_by_key){ options << " -DSORT_BY_KEY"; options << " -DT2=" << type_name<T2>(); options << enable_double<T2>(); } if(ascending){ options << " -DASC"; } // load radix sort program program radix_sort_program = cache->get_or_build( cache_key, options.str(), radix_sort_source, context ); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); size_t count = detail::iterator_range_size(first, last); uint_ block_count = static_cast<uint_>(count / block_size); if(block_count * block_size != count){ block_count++; } // setup temporary buffers vector<value_type> output(count, context); vector<T2> values_output(sort_by_key ? count : 0, context); vector<uint_> offsets(k2, context); vector<uint_> counts(block_count * k2, context); const buffer *input_buffer = &first.get_buffer(); uint_ input_offset = static_cast<uint_>(first.get_index()); const buffer *output_buffer = &output.get_buffer(); uint_ output_offset = 0; const buffer *values_input_buffer = &values_first.get_buffer(); uint_ values_input_offset = static_cast<uint_>(values_first.get_index()); const buffer *values_output_buffer = &values_output.get_buffer(); uint_ values_output_offset = 0; for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ // write counts count_kernel.set_arg(0, *input_buffer); count_kernel.set_arg(1, input_offset); count_kernel.set_arg(2, static_cast<uint_>(count)); count_kernel.set_arg(3, counts); count_kernel.set_arg(4, offsets); count_kernel.set_arg(5, block_size * sizeof(uint_), 0); count_kernel.set_arg(6, i * k); queue.enqueue_1d_range_kernel(count_kernel, 0, block_count * block_size, block_size); // scan counts if(k == 1){ typedef uint2_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 2){ typedef uint4_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else if(k == 4){ typedef uint16_ counter_type; ::boost::compute::exclusive_scan( make_buffer_iterator<counter_type>(counts.get_buffer(), 0), make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), make_buffer_iterator<counter_type>(counts.get_buffer()), queue ); } else { BOOST_ASSERT(false && "unknown k"); break; } // scan global offsets scan_kernel.set_arg(0, counts); scan_kernel.set_arg(1, offsets); scan_kernel.set_arg(2, block_count); queue.enqueue_task(scan_kernel); // scatter values scatter_kernel.set_arg(0, *input_buffer); scatter_kernel.set_arg(1, input_offset); scatter_kernel.set_arg(2, static_cast<uint_>(count)); scatter_kernel.set_arg(3, i * k); scatter_kernel.set_arg(4, counts); scatter_kernel.set_arg(5, offsets); scatter_kernel.set_arg(6, *output_buffer); scatter_kernel.set_arg(7, output_offset); if(sort_by_key){ scatter_kernel.set_arg(8, *values_input_buffer); scatter_kernel.set_arg(9, values_input_offset); scatter_kernel.set_arg(10, *values_output_buffer); scatter_kernel.set_arg(11, values_output_offset); } queue.enqueue_1d_range_kernel(scatter_kernel, 0, block_count * block_size, block_size); // swap buffers std::swap(input_buffer, output_buffer); std::swap(values_input_buffer, values_output_buffer); std::swap(input_offset, output_offset); std::swap(values_input_offset, values_output_offset); } }