inline InputIterator binary_find(InputIterator first, InputIterator last, UnaryPredicate predicate, command_queue &queue = system::default_queue()) { size_t find_if_limit = 128; size_t threads = 128; size_t count = iterator_range_size(first, last); while(count > find_if_limit) { scalar<uint_> index(queue.get_context()); index.write(static_cast<uint_>(count), queue); binary_find_kernel kernel; kernel.set_range(first, last, predicate); kernel.exec(queue, index); size_t i = index.read(queue); if(i == count) { first = last - count%threads; break; } else { last = first + i; first = last - count/threads; } count = iterator_range_size(first, last); } return find_if(first, last, predicate, queue); }
void set_range(PatternIterator p_first, PatternIterator p_last, TextIterator t_first, TextIterator t_last, OutputIterator result) { m_p_count = iterator_range_size(p_first, p_last); m_p_count_arg = add_arg<uint_>("p_count"); m_count = iterator_range_size(t_first, t_last); m_count = m_count + 1 - m_p_count; *this << "uint i = get_global_id(0);\n" << "const uint i1 = i;\n" << "uint j;\n" << "for(j = 0; j<p_count; j++,i++)\n" << "{\n" << " if(" << p_first[expr<uint_>("j")] << " != " << t_first[expr<uint_>("i")] << ")\n" << " j = p_count + 1;\n" << "}\n" << "if(j == p_count)\n" << result[expr<uint_>("i1")] << " = 1;\n" << "else\n" << result[expr<uint_>("i1")] << " = 0;\n"; }
inline InputIterator binary_find(InputIterator first, InputIterator last, UnaryPredicate predicate, command_queue &queue = system::default_queue()) { const device &device = queue.get_device(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); const std::string cache_key = "__boost_binary_find"; size_t find_if_limit = 128; size_t threads = parameters->get(cache_key, "tpb", 128); size_t count = iterator_range_size(first, last); InputIterator search_first = first; InputIterator search_last = last; scalar<uint_> index(queue.get_context()); // construct and compile binary_find kernel binary_find_kernel<InputIterator, UnaryPredicate> binary_find_kernel(search_first, search_last, predicate); ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context()); // set buffer for index kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer()); while(count > find_if_limit) { index.write(static_cast<uint_>(count), queue); // set block and run binary_find kernel uint_ block = static_cast<uint_>((count - 1)/(threads - 1)); kernel.set_arg(binary_find_kernel.m_block_arg, block); queue.enqueue_1d_range_kernel(kernel, 0, threads, 0); size_t i = index.read(queue); if(i == count) { search_first = search_last - ((count - 1)%(threads - 1)); break; } else { search_last = search_first + i; search_first = search_last - ((count - 1)/(threads - 1)); } // Make sure that first and last stay within the input range search_last = (std::min)(search_last, last); search_last = (std::max)(search_last, first); search_first = (std::max)(search_first, first); search_first = (std::min)(search_first, last); count = iterator_range_size(search_first, search_last); } return find_if(search_first, search_last, predicate, queue); }
inline future<HostIterator> copy_to_host_async(DeviceIterator first, DeviceIterator last, HostIterator result, command_queue &queue) { typedef typename std::iterator_traits<DeviceIterator>::value_type value_type; size_t count = iterator_range_size(first, last); if(count == 0){ return future<HostIterator>(); } const buffer &buffer = first.get_buffer(); size_t offset = first.get_index(); event event_ = queue.enqueue_read_buffer_async(buffer, offset * sizeof(value_type), count * sizeof(value_type), ::boost::addressof(*result)); return make_future(iterator_plus_distance(result, count), event_); }
void set_range(InputIterator1 first1, InputIterator2 first2, InputIterator3 tile_first1, InputIterator3 tile_last1, InputIterator4 tile_first2, OutputIterator1 result, OutputIterator2 counts) { m_count = iterator_range_size(tile_first1, tile_last1) - 1; *this << "uint i = get_global_id(0);\n" << "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << "uint index = i*" << tile_size << ";\n" << "uint count = 0;\n" << "while(start1<end1 && start2<end2)\n" << "{\n" << " if(" << first1[expr<uint_>("start1")] << " == " << first2[expr<uint_>("start2")] << ")\n" << " {\n" << result[expr<uint_>("index")] << " = " << first1[expr<uint_>("start1")] << ";\n" << " index++; count++;\n" << " start1++; start2++;\n" << " }\n" << " else if(" << first1[expr<uint_>("start1")] << " < " << first2[expr<uint_>("start2")] << ")\n" << " start1++;\n" << " else start2++;\n" << "}\n" << counts[expr<uint_>("i")] << " = count;\n"; }
inline DeviceIterator copy_to_device(HostIterator first, HostIterator last, DeviceIterator result, command_queue &queue, const wait_list &events) { typedef typename std::iterator_traits<DeviceIterator>::value_type value_type; typedef typename std::iterator_traits<DeviceIterator>::difference_type difference_type; size_t count = iterator_range_size(first, last); if(count == 0){ return result; } size_t offset = result.get_index(); queue.enqueue_write_buffer(result.get_buffer(), offset * sizeof(value_type), count * sizeof(value_type), ::boost::addressof(*first), events); return result + static_cast<difference_type>(count); }
void set_range(TextIterator t_first, TextIterator t_last, value_type value, size_t n, OutputIterator result) { m_n = n; m_n_arg = add_arg<uint_>("n"); m_value = value; m_value_arg = add_arg<value_type>("value"); m_count = iterator_range_size(t_first, t_last); m_count = m_count + 1 - m_n; *this << "uint i = get_global_id(0);\n" << "uint i1 = i;\n" << "uint j;\n" << "for(j = 0; j<n; j++,i++)\n" << "{\n" << " if(value != " << t_first[expr<uint_>("i")] << ")\n" << " j = n + 1;\n" << "}\n" << "if(j == n)\n" << result[expr<uint_>("i1")] << " = 1;\n" << "else\n" << result[expr<uint_>("i1")] << " = 0;\n"; }
inline void merge_sort_by_key_on_cpu(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } // for small input size only insertion sort is performed else if(count <= 512){ block_insertion_sort(keys_first, values_first, compare, count, count, true, queue); return; } const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters std::string cache_key = std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>() + "_with_" + type_name<key_type>(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); const size_t block_size = parameters->get(cache_key, "insertion_sort_by_key_block_size", 64); block_insertion_sort(keys_first, values_first, compare, count, block_size, true, queue); // temporary buffer for merge results vector<value_type> values_temp(count, context); vector<key_type> keys_temp(count, context); bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { merge_blocks(keys_first, values_first, keys_temp.begin(), values_temp.begin(), compare, count, i, true, queue); } else { merge_blocks(keys_temp.begin(), values_temp.begin(), keys_first, values_first, compare, count, i, true, queue); } } if(result_in_temporary_buffer) { copy(keys_temp.begin(), keys_temp.end(), keys_first, queue); copy(values_temp.begin(), values_temp.end(), values_first, queue); } }
inline void serial_insertion_sort_by_key(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } meta_kernel k("serial_insertion_sort_by_key"); size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "keys"); size_t local_data_arg = k.add_arg<value_type *>(memory_object::local_memory, "data"); size_t count_arg = k.add_arg<uint_>("n"); k << // copy data to local memory "for(uint i = 0; i < n; i++){\n" << " keys[i] = " << keys_first[k.var<uint_>("i")] << ";\n" " data[i] = " << values_first[k.var<uint_>("i")] << ";\n" "}\n" // sort data in local memory "for(uint i = 1; i < n; i++){\n" << " " << k.decl<const key_type>("key") << " = keys[i];\n" << " " << k.decl<const value_type>("value") << " = data[i];\n" << " uint pos = i;\n" << " while(pos > 0 && " << compare(k.var<const key_type>("key"), k.var<const key_type>("keys[pos-1]")) << "){\n" << " keys[pos] = keys[pos-1];\n" << " data[pos] = data[pos-1];\n" << " pos--;\n" << " }\n" << " keys[pos] = key;\n" << " data[pos] = value;\n" << "}\n" << // copy sorted data to output "for(uint i = 0; i < n; i++){\n" << " " << keys_first[k.var<uint_>("i")] << " = keys[i];\n" " " << values_first[k.var<uint_>("i")] << " = data[i];\n" "}\n"; const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); kernel.set_arg(local_keys_arg, static_cast<uint_>(count * sizeof(key_type)), 0); kernel.set_arg(local_data_arg, static_cast<uint_>(count * sizeof(value_type)), 0); kernel.set_arg(count_arg, static_cast<uint_>(count)); queue.enqueue_task(kernel); }
inline OutputIterator merge_with_merge_path(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<OutputIterator>::difference_type result_difference_type; int tile_size = 1024; int count1 = iterator_range_size(first1, last1); int count2 = iterator_range_size(first2, last2); vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); // Tile the sets merge_path_kernel tiling_kernel; tiling_kernel.tile_size = 1024; tiling_kernel.set_range(first1, last1, first2, last2, tile_a.begin()+1, tile_b.begin()+1, comp); fill_n(tile_a.begin(), 1, 0, queue); fill_n(tile_b.begin(), 1, 0, queue); tiling_kernel.exec(queue); fill_n(tile_a.end()-1, 1, count1, queue); fill_n(tile_b.end()-1, 1, count2, queue); // Merge serial_merge_kernel merge_kernel; merge_kernel.tile_size = 1024; merge_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(), tile_b.begin(), result, comp); merge_kernel.exec(queue); return result + static_cast<result_difference_type>(count1 + count2); }
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, bool stable, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } size_t block_size = block_sort( keys_first, values_first, compare, count, true /* sort_by_key */, stable /* stable */, queue ); // for small input size only block sort is performed if(count <= block_size) { return; } const context &context = queue.get_context(); bool result_in_temporary_buffer = false; ::boost::compute::vector<key_type> temp_keys(count, context); ::boost::compute::vector<value_type> temp_values(count, context); for(; block_size < count; block_size *= 2) { result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { merge_blocks_on_gpu(keys_first, values_first, temp_keys.begin(), temp_values.begin(), compare, count, block_size, true /* sort_by_key */, queue); } else { merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(), keys_first, values_first, compare, count, block_size, true /* sort_by_key */, queue); } } if(result_in_temporary_buffer) { copy_async(temp_keys.begin(), temp_keys.end(), keys_first, queue); copy_async(temp_values.begin(), temp_values.end(), values_first, queue); } }
void set_range(MapIterator first, MapIterator last, InputIterator input, OutputIterator result) { m_count = iterator_range_size(first, last); *this << "const uint i = get_global_id(0);\n" << result[expr<uint_>("i")] << "=" << input[first[expr<uint_>("i")]] << ";\n"; }
inline InputIterator binary_find(InputIterator first, InputIterator last, UnaryPredicate predicate, command_queue &queue = system::default_queue()) { const device &device = queue.get_device(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); const std::string cache_key = "__boost_binary_find"; size_t find_if_limit = 128; size_t threads = parameters->get(cache_key, "tpb", 128); size_t count = iterator_range_size(first, last); while(count > find_if_limit) { scalar<uint_> index(queue.get_context()); index.write(static_cast<uint_>(count), queue); binary_find_kernel kernel(threads); kernel.set_range(first, last, predicate); kernel.exec(queue, index); size_t i = index.read(queue); if(i == count) { first = last - count%threads; break; } else { last = first + i; first = last - count/threads; } count = iterator_range_size(first, last); } return find_if(first, last, predicate, queue); }
void set_range(InputIterator1 first1, InputIterator2 first2, InputIterator3 tile_first1, InputIterator3 tile_last1, InputIterator4 tile_first2, OutputIterator result, Compare comp) { m_count = iterator_range_size(tile_first1, tile_last1) - 1; *this << "uint i = get_global_id(0);\n" << "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << "uint index = i*" << tile_size << ";\n" << "while(start1<end1 && start2<end2)\n" << "{\n" << " if(!(" << comp(first2[expr<uint_>("start2")], first1[expr<uint_>("start1")]) << "))\n" << " {\n" << result[expr<uint_>("index")] << " = " << first1[expr<uint_>("start1")] << ";\n" << " index++;\n" << " start1++;\n" << " }\n" << " else\n" << " {\n" << result[expr<uint_>("index")] << " = " << first2[expr<uint_>("start2")] << ";\n" << " index++;\n" << " start2++;\n" << " }\n" << "}\n" << "while(start1<end1)\n" << "{\n" << result[expr<uint_>("index")] << " = " << first1[expr<uint_>("start1")] << ";\n" << " index++;\n" << " start1++;\n" << "}\n" << "while(start2<end2)\n" << "{\n" << result[expr<uint_>("index")] << " = " << first2[expr<uint_>("start2")] << ";\n" << " index++;\n" << " start2++;\n" << "}\n"; }
inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first, svm_ptr<T> last, svm_ptr<T> result, command_queue &queue) { size_t count = iterator_range_size(first, last); if(count == 0){ return result; } event event_ = queue.enqueue_svm_memcpy_async( result.get(), first.get(), count * sizeof(T) ); return make_future(result + count, event_); }
inline HostIterator copy_to_host(svm_ptr<T> first, svm_ptr<T> last, HostIterator result, command_queue &queue) { size_t count = iterator_range_size(first, last); if(count == 0){ return result; } queue.enqueue_svm_memcpy( ::boost::addressof(*result), first.get(), count * sizeof(T) ); return result + count; }
inline svm_ptr<T> copy_on_device(svm_ptr<T> first, svm_ptr<T> last, svm_ptr<T> result, command_queue &queue) { size_t count = iterator_range_size(first, last); if(count == 0){ return result; } queue.enqueue_svm_memcpy( result.get(), first.get(), count * sizeof(T) ); return result + count; }
inline future<HostIterator> copy_to_host_async(svm_ptr<T> first, svm_ptr<T> last, HostIterator result, command_queue &queue) { size_t count = iterator_range_size(first, last); if(count == 0){ return result; } event event_ = queue.enqueue_svm_memcpy_async( ::boost::addressof(*result), first.get(), count * sizeof(T) ); return make_future(iterator_plus_distance(result, count), event_); }
inline void serial_insertion_sort(Iterator first, Iterator last, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<Iterator>::value_type T; size_t count = iterator_range_size(first, last); if(count < 2){ return; } meta_kernel k("serial_insertion_sort"); size_t local_data_arg = k.add_arg<T *>(memory_object::local_memory, "data"); size_t count_arg = k.add_arg<uint_>("n"); k << // copy data to local memory "for(uint i = 0; i < n; i++){\n" << " data[i] = " << first[k.var<uint_>("i")] << ";\n" "}\n" // sort data in local memory "for(uint i = 1; i < n; i++){\n" << " " << k.decl<const T>("value") << " = data[i];\n" << " uint pos = i;\n" << " while(pos > 0 && " << compare(k.var<const T>("value"), k.var<const T>("data[pos-1]")) << "){\n" << " data[pos] = data[pos-1];\n" << " pos--;\n" << " }\n" << " data[pos] = value;\n" << "}\n" << // copy sorted data to output "for(uint i = 0; i < n; i++){\n" << " " << first[k.var<uint_>("i")] << " = data[i];\n" "}\n"; const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); kernel.set_arg(local_data_arg, local_buffer<T>(count)); kernel.set_arg(count_arg, static_cast<uint_>(count)); queue.enqueue_task(kernel); }
inline svm_ptr<T> copy_to_device(HostIterator first, HostIterator last, svm_ptr<T> result, command_queue &queue, const wait_list &events) { size_t count = iterator_range_size(first, last); if(count == 0){ return result; } queue.enqueue_svm_memcpy( result.get(), ::boost::addressof(*first), count * sizeof(T), events ); return result + count; }
inline future<svm_ptr<T> > copy_to_device_async(HostIterator first, HostIterator last, svm_ptr<T> result, command_queue &queue, const wait_list &events) { size_t count = iterator_range_size(first, last); if(count == 0){ return future<svm_ptr<T> >(); } event event_ = queue.enqueue_svm_memcpy_async( result.get(), ::boost::addressof(*first), count * sizeof(T), events ); return make_future(result + count, event_); }
inline size_t count_if_with_ballot(InputIterator first, InputIterator last, Predicate predicate, command_queue &queue) { size_t count = iterator_range_size(first, last); size_t block_size = 32; size_t block_count = count / block_size; if(block_count * block_size != count){ block_count++; } const ::boost::compute::context &context = queue.get_context(); ::boost::compute::vector<uint_> counts(block_count, context); ::boost::compute::detail::nvidia_popcount<uint_> popc; ::boost::compute::detail::nvidia_ballot<uint_> ballot; meta_kernel k("count_if_with_ballot"); k << "const uint gid = get_global_id(0);\n" << "bool value = false;\n" << "if(gid < count)\n" << " value = " << predicate(first[k.var<const uint_>("gid")]) << ";\n" << "uint bits = " << ballot(k.var<const uint_>("value")) << ";\n" << "if(get_local_id(0) == 0)\n" << counts.begin()[k.var<uint_>("get_group_id(0)") ] << " = " << popc(k.var<uint_>("bits")) << ";\n"; k.add_set_arg<const uint_>("count", count); k.exec_1d(queue, 0, block_size * block_count, block_size); uint_ result; ::boost::compute::reduce( counts.begin(), counts.end(), &result, queue ); return result; }
void set_range(InputIterator first, InputIterator last, UnaryPredicate predicate) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; int block = (iterator_range_size(first, last)-1)/(m_threads-1); m_index_arg = add_arg<uint_ *>(memory_object::global_memory, "index"); atomic_min<uint_> atomic_min_uint; *this << "uint i = get_global_id(0) * " << block << ";\n" << decl<value_type>("value") << "=" << first[var<uint_>("i")] << ";\n" << "if(" << predicate(var<value_type>("value")) << ") {\n" << atomic_min_uint(var<uint_ *>("index"), var<uint_>("i")) << ";\n" << "}\n"; }
inline DeviceIterator copy_to_device_map(HostIterator first, HostIterator last, DeviceIterator result, command_queue &queue, const wait_list &events) { typedef typename std::iterator_traits<DeviceIterator>::value_type value_type; typedef typename std::iterator_traits<DeviceIterator>::difference_type difference_type; size_t count = iterator_range_size(first, last); if(count == 0){ return result; } size_t offset = result.get_index(); // map result buffer to host value_type *pointer = static_cast<value_type*>( queue.enqueue_map_buffer( result.get_buffer(), CL_MAP_WRITE, offset * sizeof(value_type), count * sizeof(value_type), events ) ); // copy [first; last) to result buffer std::copy(first, last, pointer); // unmap result buffer boost::compute::event unmap_event = queue.enqueue_unmap_buffer( result.get_buffer(), static_cast<void*>(pointer) ); unmap_event.wait(); return result + static_cast<difference_type>(count); }
inline OutputIterator dispatch_copy(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, typename boost::enable_if_c< is_device_iterator<InputIterator>::value && !is_device_iterator<OutputIterator>::value >::type* = 0) { if(is_contiguous_iterator<OutputIterator>::value){ return copy_to_host(first, last, result, queue); } else { // for non-contiguous input we first copy the values to // a temporary std::vector and then copy from there typedef typename std::iterator_traits<InputIterator>::value_type T; std::vector<T> vector(iterator_range_size(first, last)); copy_to_host(first, last, vector.begin(), queue); return std::copy(vector.begin(), vector.end(), result); } }
inline size_t serial_count_if(InputIterator first, InputIterator last, Predicate predicate, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; const context &context = queue.get_context(); size_t size = iterator_range_size(first, last); meta_kernel k("serial_count_if"); k.add_arg<const uint_>("size", size); size_t result_arg = k.add_arg<uint_ *>("__global", "result"); k << "uint count = 0;\n" << "for(uint i = 0; i < size; i++){\n" << k.decl<const value_type>("value") << "=" << first[k.var<uint_>("i")] << ";\n" << "if(" << predicate(k.var<const value_type>("value")) << "){\n" << "count++;\n" << "}\n" "}\n" "*result = count;\n"; kernel kernel = k.compile(context); // setup result buffer buffer result_buffer(context, sizeof(uint_)); kernel.set_arg(result_arg, result_buffer); // run kernel queue.enqueue_task(kernel); // read index return detail::read_single_value<uint_>(result_buffer, queue); }
inline size_t serial_count_if(InputIterator first, InputIterator last, Predicate predicate, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; const context &context = queue.get_context(); size_t size = iterator_range_size(first, last); meta_kernel k("serial_count_if"); k.add_set_arg("size", static_cast<uint_>(size)); size_t result_arg = k.add_arg<uint_ *>(memory_object::global_memory, "result"); k << "uint count = 0;\n" << "for(uint i = 0; i < size; i++){\n" << k.decl<const value_type>("value") << "=" << first[k.var<uint_>("i")] << ";\n" << "if(" << predicate(k.var<const value_type>("value")) << "){\n" << "count++;\n" << "}\n" "}\n" "*result = count;\n"; kernel kernel = k.compile(context); // setup result buffer scalar<uint_> result(context); kernel.set_arg(result_arg, result.get_buffer()); // run kernel queue.enqueue_task(kernel); // read index return result.read(queue); }
inline svm_ptr<T> copy_to_device_map(HostIterator first, HostIterator last, svm_ptr<T> result, command_queue &queue, const wait_list &events) { size_t count = iterator_range_size(first, last); if(count == 0){ return result; } // map queue.enqueue_svm_map( result.get(), count * sizeof(T), CL_MAP_WRITE, events ); // copy [first; last) to result buffer std::copy(first, last, static_cast<T*>(result.get())); // unmap result queue.enqueue_svm_unmap(result.get()).wait(); return result + count; }
void set_range(InputIterator1 first1, InputIterator2 first2, InputIterator3 tile_first1, InputIterator3 tile_last1, InputIterator4 tile_first2, OutputIterator result) { m_count = iterator_range_size(tile_first1, tile_last1) - 1; *this << "uint i = get_global_id(0);\n" << "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << "uint includes = 1;\n" << "while(start1<end1 && start2<end2)\n" << "{\n" << " if(" << first1[expr<uint_>("start1")] << " == " << first2[expr<uint_>("start2")] << ")\n" << " {\n" << " start1++; start2++;\n" << " }\n" << " else if(" << first1[expr<uint_>("start1")] << " < " << first2[expr<uint_>("start2")] << ")\n" << " start1++;\n" << " else\n" << " {\n" << " includes = 0;\n" << " break;\n" << " }\n" << "}\n" << "if(start2<end2)\n" << " includes = 0;\n" << result[expr<uint_>("i")] << " = includes;\n"; }
inline void merge_sort_on_cpu(Iterator first, Iterator last, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<Iterator>::value_type value_type; size_t count = iterator_range_size(first, last); if(count < 2){ return; } // for small input size only insertion sort is performed else if(count <= 512){ block_insertion_sort(first, compare, count, count, queue); return; } const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters std::string cache_key = std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // When there is merge_with_path_blocks_no_threshold or less blocks left to // merge AND input size is merge_with_merge_path_input_size_threshold or more // merge_with_merge_path() algorithm is used to merge sorted blocks; // otherwise merge_blocks() is used. const size_t merge_with_path_blocks_no_threshold = parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8); const size_t merge_with_path_input_size_threshold = parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152); const size_t block_size = parameters->get(cache_key, "insertion_sort_block_size", 64); block_insertion_sort(first, compare, count, block_size, queue); // temporary buffer for merge result vector<value_type> temp(count, context); bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { dispatch_merge_blocks(first, temp.begin(), compare, count, i, merge_with_path_input_size_threshold, merge_with_path_blocks_no_threshold, queue); } else { dispatch_merge_blocks(temp.begin(), first, compare, count, i, merge_with_path_input_size_threshold, merge_with_path_blocks_no_threshold, queue); } } if(result_in_temporary_buffer) { copy(temp.begin(), temp.end(), first, queue); } }