void generate(OutputIterator first, OutputIterator last, Generator &generator, command_queue &queue) { size_t size = std::distance(first, last); typedef typename Generator::result_type g_result_type; vector<g_result_type> tmp(size, queue.get_context()); vector<g_result_type> tmp2(size, queue.get_context()); uint_ bound = ((uint_(-1))/(m_b-m_a+1))*(m_b-m_a+1); buffer_iterator<g_result_type> tmp2_iter; while(size>0) { generator.generate(tmp.begin(), tmp.begin() + size, queue); tmp2_iter = copy_if(tmp.begin(), tmp.begin() + size, tmp2.begin(), _1 <= bound, queue); size = std::distance(tmp2_iter, tmp2.end()); } BOOST_COMPUTE_FUNCTION(IntType, scale_random, (const g_result_type x), { return LO + (x % (HI-LO+1)); });
inline InputIterator binary_find(InputIterator first, InputIterator last, UnaryPredicate predicate, command_queue &queue = system::default_queue()) { const device &device = queue.get_device(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); const std::string cache_key = "__boost_binary_find"; size_t find_if_limit = 128; size_t threads = parameters->get(cache_key, "tpb", 128); size_t count = iterator_range_size(first, last); InputIterator search_first = first; InputIterator search_last = last; scalar<uint_> index(queue.get_context()); // construct and compile binary_find kernel binary_find_kernel<InputIterator, UnaryPredicate> binary_find_kernel(search_first, search_last, predicate); ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context()); // set buffer for index kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer()); while(count > find_if_limit) { index.write(static_cast<uint_>(count), queue); // set block and run binary_find kernel uint_ block = static_cast<uint_>((count - 1)/(threads - 1)); kernel.set_arg(binary_find_kernel.m_block_arg, block); queue.enqueue_1d_range_kernel(kernel, 0, threads, 0); size_t i = index.read(queue); if(i == count) { search_first = search_last - ((count - 1)%(threads - 1)); break; } else { search_last = search_first + i; search_first = search_last - ((count - 1)/(threads - 1)); } // Make sure that first and last stay within the input range search_last = (std::min)(search_last, last); search_last = (std::max)(search_last, first); search_first = (std::max)(search_first, first); search_first = (std::min)(search_first, last); count = iterator_range_size(search_first, search_last); } return find_if(search_first, search_last, predicate, queue); }
inline OutputIterator set_symmetric_difference(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<InputIterator1>::value_type value_type; int tile_size = 1024; int count1 = detail::iterator_range_size(first1, last1); int count2 = detail::iterator_range_size(first2, last2); vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); // Tile the sets detail::balanced_path_kernel tiling_kernel; tiling_kernel.tile_size = tile_size; tiling_kernel.set_range(first1, last1, first2, last2, tile_a.begin()+1, tile_b.begin()+1); fill_n(tile_a.begin(), 1, 0, queue); fill_n(tile_b.begin(), 1, 0, queue); tiling_kernel.exec(queue); fill_n(tile_a.end()-1, 1, count1, queue); fill_n(tile_b.end()-1, 1, count2, queue); vector<value_type> temp_result(count1+count2, queue.get_context()); vector<uint_> counts((count1+count2+tile_size-1)/tile_size + 1, queue.get_context()); fill_n(counts.end()-1, 1, 0, queue); // Find individual symmetric differences detail::serial_set_symmetric_difference_kernel symmetric_difference_kernel; symmetric_difference_kernel.tile_size = tile_size; symmetric_difference_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(), tile_b.begin(), temp_result.begin(), counts.begin()); symmetric_difference_kernel.exec(queue); exclusive_scan(counts.begin(), counts.end(), counts.begin(), queue); // Compact the results detail::compact_kernel compact_kernel; compact_kernel.tile_size = tile_size; compact_kernel.set_range(temp_result.begin(), counts.begin(), counts.end(), result); compact_kernel.exec(queue); return result + (counts.end() - 1).read(queue); }
inline InputIterator binary_find(InputIterator first, InputIterator last, UnaryPredicate predicate, command_queue &queue = system::default_queue()) { size_t find_if_limit = 128; size_t threads = 128; size_t count = iterator_range_size(first, last); while(count > find_if_limit) { scalar<uint_> index(queue.get_context()); index.write(static_cast<uint_>(count), queue); binary_find_kernel kernel; kernel.set_range(first, last, predicate); kernel.exec(queue, index); size_t i = index.read(queue); if(i == count) { first = last - count%threads; break; } else { last = first + i; first = last - count/threads; } count = iterator_range_size(first, last); } return find_if(first, last, predicate, queue); }
inline void sort2(const buffer &buffer, command_queue &queue) { const context &context = queue.get_context(); boost::shared_ptr<detail::program_cache> cache = detail::get_program_cache(context); std::string cache_key = std::string("fixed_sort2_") + type_name<T>(); program sort2_program = cache->get(cache_key); if(!sort2_program.get()){ const char source[] = "__kernel void sort2(__global T *input)\n" "{\n" " const T x = input[0];\n" " const T y = input[1];\n" " if(y < x){\n" " input[0] = y;\n" " input[1] = x;\n" " }\n" "}\n"; sort2_program = program::build_with_source( source, context, std::string("-DT=") + type_name<T>() ); cache->insert(cache_key, sort2_program); } kernel sort2_kernel = sort2_program.create_kernel("sort2"); sort2_kernel.set_arg(0, buffer); queue.enqueue_task(sort2_kernel); }
inline void opengl_enqueue_release_buffer(const opengl_buffer &buffer, command_queue &queue) { BOOST_ASSERT(buffer.get_context() == queue.get_context()); opengl_enqueue_release_gl_objects(1, &buffer.get(), queue); }
inline vector< typename boost::compute::result_of< BinaryFunction( typename std::iterator_traits<InputIterator>::value_type, typename std::iterator_traits<InputIterator>::value_type ) >::type > block_reduce(InputIterator first, size_t count, size_t block_size, BinaryFunction function, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename boost::compute::result_of<BinaryFunction(input_type, input_type)>::type result_type; const context &context = queue.get_context(); size_t total_block_count = static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); vector<result_type> result_vector(total_block_count, context); reduce(first, count, result_vector.begin(), block_size, function, queue); return result_vector; }
inline void gemm(const matrix_order order, const matrix_transpose trans_a, const matrix_transpose trans_b, const int M, const int N, const int K, const Scalar alpha, device_ptr<Scalar> A, const int lda, device_ptr<Scalar> B, const int ldb, const Scalar beta, device_ptr<Scalar> C, const int ldc, command_queue &queue) { (void) order; (void) trans_a; (void) trans_b; ::boost::compute::detail::meta_kernel k("gemm"); k.add_set_arg<Scalar>("alpha", alpha); k.add_set_arg<Scalar>("beta", beta); k.add_set_arg<const cl_uint>("M", static_cast<const cl_uint>(M)); k.add_set_arg<const cl_uint>("N", static_cast<const cl_uint>(N)); k.add_set_arg<const cl_uint>("K", static_cast<const cl_uint>(K)); k.add_set_arg<const cl_uint>("lda", static_cast<const cl_uint>(lda)); k.add_set_arg<const cl_uint>("ldb", static_cast<const cl_uint>(ldb)); k.add_set_arg<const cl_uint>("ldc", static_cast<const cl_uint>(ldc)); size_t a_index = k.add_arg<const Scalar *>("__global", "A"); size_t b_index = k.add_arg<const Scalar *>("__global", "B"); size_t c_index = k.add_arg<Scalar *>("__global", "C"); k << k.decl<cl_uint>("i") << " = get_global_id(0);\n" << k.decl<cl_uint>("j") << " = get_global_id(1);\n" << k.decl<Scalar>("sum") << " = 0;\n" << "for(uint k = 0; k < K; k++){\n" << " sum += " << A[k.expr<cl_uint>("i*lda+k")] << " * " << B[k.expr<cl_uint>("k*ldb+j")] << ";\n" << "};\n" << C[k.expr<cl_uint>("i*ldc+j")] << "=" << "alpha * sum + beta *" << C[k.expr<cl_uint>("i*ldc+j")] << ";\n"; const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); kernel.set_arg(a_index, A.get_buffer()); kernel.set_arg(b_index, B.get_buffer()); kernel.set_arg(c_index, C.get_buffer()); size_t work_group_offsets[] = { 0, 0 }; size_t work_group_sizes[] = { static_cast<size_t>(N), static_cast<size_t>(M) }; queue.enqueue_nd_range_kernel(kernel, 2, work_group_offsets, work_group_sizes, 0); }
inline TextIterator search_n(TextIterator t_first, TextIterator t_last, size_t n, ValueType value, command_queue &queue = system::default_queue()) { // there is no need to check if pattern starts at last n - 1 indices vector<uint_> matching_indices( detail::iterator_range_size(t_first, t_last) + 1 - n, queue.get_context() ); // search_n_kernel puts value 1 at every index in vector where pattern // of n values starts at detail::search_n_kernel<TextIterator, vector<uint_>::iterator> kernel; kernel.set_range(t_first, t_last, value, n, matching_indices.begin()); kernel.exec(queue); vector<uint_>::iterator index = ::boost::compute::find( matching_indices.begin(), matching_indices.end(), uint_(1), queue ); // pattern was not found if(index == matching_indices.end()) return t_last; return t_first + detail::iterator_range_size(matching_indices.begin(), index); }
inline OutputIterator adjacent_difference(InputIterator first, InputIterator last, OutputIterator result, BinaryFunction op, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; if(first == last) { return result; } if (first == result) { vector<value_type> temp(detail::iterator_range_size(first, last), queue.get_context()); copy(first, last, temp.begin(), queue); return ::boost::compute::detail::dispatch_adjacent_difference( temp.begin(), temp.end(), result, op, queue ); } else { return ::boost::compute::detail::dispatch_adjacent_difference( first, last, result, op, queue ); } }
inline T inner_product(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, T init, BinaryAccumulateFunction accumulate_function, BinaryTransformFunction transform_function, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<InputIterator1>::value_type value_type; size_t count = detail::iterator_range_size(first1, last1); vector<value_type> result(count, queue.get_context()); transform(first1, last1, first2, result.begin(), transform_function, queue); return ::boost::compute::accumulate(result.begin(), result.end(), init, accumulate_function, queue); }
inline void serial_reduce(InputIterator first, InputIterator last, OutputIterator result, BinaryFunction function, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type T; typedef typename ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type; const context &context = queue.get_context(); size_t count = detail::iterator_range_size(first, last); if(count == 0){ return; } meta_kernel k("serial_reduce"); size_t count_arg = k.add_arg<cl_uint>("count"); k << k.decl<result_type>("result") << " = " << first[0] << ";\n" << "for(uint i = 1; i < count; i++)\n" << " result = " << function(k.var<T>("result"), first[k.var<uint_>("i")]) << ";\n" << result[0] << " = result;\n"; kernel kernel = k.compile(context); kernel.set_arg(count_arg, static_cast<uint_>(count)); queue.enqueue_task(kernel); }
inline void sort_by_transform(Iterator first, Iterator last, Transform transform, Compare compare, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<Iterator>::value_type value_type; typedef typename boost::compute::result_of<Transform(value_type)>::type key_type; size_t n = detail::iterator_range_size(first, last); if(n < 2){ return; } const context &context = queue.get_context(); ::boost::compute::vector<key_type> keys(n, context); ::boost::compute::transform( first, last, keys.begin(), transform, queue ); ::boost::compute::sort_by_key( keys.begin(), keys.end(), first, compare, queue ); }
inline InputIterator adjacent_find_with_atomics(InputIterator first, InputIterator last, Compare compare, command_queue &queue) { if(first == last){ return last; } const context &context = queue.get_context(); size_t count = detail::iterator_range_size(first, last); // initialize output to the last index detail::scalar<uint_> output(context); output.write(static_cast<uint_>(count), queue); detail::meta_kernel k("adjacent_find_with_atomics"); size_t output_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output"); k << "const uint i = get_global_id(0);\n" << "if(" << compare(first[k.expr<uint_>("i")], first[k.expr<uint_>("i+1")]) << "){\n" << " atomic_min(output, i);\n" << "}\n"; k.set_arg(output_arg, output.get_buffer()); k.exec_1d(queue, 0, count - 1, 1); return first + output.read(queue); }
inline TextIterator find_end(PatternIterator p_first, PatternIterator p_last, TextIterator t_first, TextIterator t_last, command_queue &queue = system::default_queue()) { const context &context = queue.get_context(); vector<uint_> matching_indices(detail::iterator_range_size(t_first, t_last), context); detail::search_kernel<PatternIterator, TextIterator, vector<uint_>::iterator> kernel; kernel.set_range(p_first, p_last, t_first, t_last, matching_indices.begin()); kernel.exec(queue); using boost::compute::_1; vector<uint_>::iterator index = detail::find_end_helper(matching_indices.begin(), matching_indices.end(), _1 == 1, queue); return t_first + detail::iterator_range_size(matching_indices.begin(), index); }
inline cv::Mat opencv_create_mat_with_image2d(const image2d &image, command_queue &queue = system::default_queue()) { BOOST_ASSERT(image.get_context() == queue.get_context()); cv::Mat mat; image_format format = image.get_format(); const cl_image_format *cl_image_format = format.get_format_ptr(); if(cl_image_format->image_channel_data_type == CL_UNORM_INT8 && cl_image_format->image_channel_order == CL_BGRA) { mat = cv::Mat(image.height(), image.width(), CV_8UC4); } else if(cl_image_format->image_channel_data_type == CL_UNORM_INT16 && cl_image_format->image_channel_order == CL_BGRA) { mat = cv::Mat(image.height(), image.width(), CV_16UC4); } else if(cl_image_format->image_channel_data_type == CL_FLOAT && cl_image_format->image_channel_order == CL_INTENSITY) { mat = cv::Mat(image.height(), image.width(), CV_32FC1); } else { mat = cv::Mat(image.height(), image.width(), CV_8UC1); } opencv_copy_image_to_mat(image, mat, queue); return mat; }
/// Enqueues a command to release the specified OpenGL buffer. /// /// \see_opencl_ref{clEnqueueReleaseGLObjects} inline event opengl_enqueue_release_buffer(const opengl_buffer &buffer, command_queue &queue, const wait_list &events = wait_list()) { BOOST_ASSERT(buffer.get_context() == queue.get_context()); return opengl_enqueue_release_gl_objects(1, &buffer.get(), queue, events); }
inline void merge_sort_by_key_on_cpu(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } // for small input size only insertion sort is performed else if(count <= 512){ block_insertion_sort(keys_first, values_first, compare, count, count, true, queue); return; } const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters std::string cache_key = std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>() + "_with_" + type_name<key_type>(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); const size_t block_size = parameters->get(cache_key, "insertion_sort_by_key_block_size", 64); block_insertion_sort(keys_first, values_first, compare, count, block_size, true, queue); // temporary buffer for merge results vector<value_type> values_temp(count, context); vector<key_type> keys_temp(count, context); bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { merge_blocks(keys_first, values_first, keys_temp.begin(), values_temp.begin(), compare, count, i, true, queue); } else { merge_blocks(keys_temp.begin(), values_temp.begin(), keys_first, values_first, compare, count, i, true, queue); } } if(result_in_temporary_buffer) { copy(keys_temp.begin(), keys_temp.end(), keys_first, queue); copy(values_temp.begin(), values_temp.end(), values_first, queue); } }
inline void opencv_copy_image_to_mat(const image2d &image, cv::Mat &mat, command_queue &queue = system::default_queue()) { BOOST_ASSERT(mat.isContinuous()); BOOST_ASSERT(image.get_context() == queue.get_context()); queue.enqueue_read_image(image, image.origin(), image.size(), mat.data); }
inline void serial_insertion_sort_by_key(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } meta_kernel k("serial_insertion_sort_by_key"); size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "keys"); size_t local_data_arg = k.add_arg<value_type *>(memory_object::local_memory, "data"); size_t count_arg = k.add_arg<uint_>("n"); k << // copy data to local memory "for(uint i = 0; i < n; i++){\n" << " keys[i] = " << keys_first[k.var<uint_>("i")] << ";\n" " data[i] = " << values_first[k.var<uint_>("i")] << ";\n" "}\n" // sort data in local memory "for(uint i = 1; i < n; i++){\n" << " " << k.decl<const key_type>("key") << " = keys[i];\n" << " " << k.decl<const value_type>("value") << " = data[i];\n" << " uint pos = i;\n" << " while(pos > 0 && " << compare(k.var<const key_type>("key"), k.var<const key_type>("keys[pos-1]")) << "){\n" << " keys[pos] = keys[pos-1];\n" << " data[pos] = data[pos-1];\n" << " pos--;\n" << " }\n" << " keys[pos] = key;\n" << " data[pos] = value;\n" << "}\n" << // copy sorted data to output "for(uint i = 0; i < n; i++){\n" << " " << keys_first[k.var<uint_>("i")] << " = keys[i];\n" " " << values_first[k.var<uint_>("i")] << " = data[i];\n" "}\n"; const context &context = queue.get_context(); ::boost::compute::kernel kernel = k.compile(context); kernel.set_arg(local_keys_arg, static_cast<uint_>(count * sizeof(key_type)), 0); kernel.set_arg(local_data_arg, static_cast<uint_>(count * sizeof(value_type)), 0); kernel.set_arg(count_arg, static_cast<uint_>(count)); queue.enqueue_task(kernel); }
/// Creates a new mersenne_twister_engine and seeds it with \p value. explicit mersenne_twister_engine(command_queue &queue, result_type value = default_seed) : m_context(queue.get_context()), m_state_buffer(m_context, n * sizeof(result_type)) { // setup program load_program(); // seed state seed(queue, value); }
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first, KeyIterator keys_last, ValueIterator values_first, Compare compare, bool stable, command_queue &queue) { typedef typename std::iterator_traits<KeyIterator>::value_type key_type; typedef typename std::iterator_traits<ValueIterator>::value_type value_type; size_t count = iterator_range_size(keys_first, keys_last); if(count < 2){ return; } size_t block_size = block_sort( keys_first, values_first, compare, count, true /* sort_by_key */, stable /* stable */, queue ); // for small input size only block sort is performed if(count <= block_size) { return; } const context &context = queue.get_context(); bool result_in_temporary_buffer = false; ::boost::compute::vector<key_type> temp_keys(count, context); ::boost::compute::vector<value_type> temp_values(count, context); for(; block_size < count; block_size *= 2) { result_in_temporary_buffer = !result_in_temporary_buffer; if(result_in_temporary_buffer) { merge_blocks_on_gpu(keys_first, values_first, temp_keys.begin(), temp_values.begin(), compare, count, block_size, true /* sort_by_key */, queue); } else { merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(), keys_first, values_first, compare, count, block_size, true /* sort_by_key */, queue); } } if(result_in_temporary_buffer) { copy_async(temp_keys.begin(), temp_keys.end(), keys_first, queue); copy_async(temp_values.begin(), temp_values.end(), values_first, queue); } }
inline void initial_reduce(InputIterator first, InputIterator last, buffer result, const Function &function, kernel &reduce_kernel, const uint_ vpt, const uint_ tpb, command_queue &queue) { (void) function; (void) reduce_kernel; typedef typename std::iterator_traits<InputIterator>::value_type Arg; typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T; size_t count = std::distance(first, last); detail::meta_kernel k("initial_reduce"); k.add_set_arg<const uint_>("count", uint_(count)); size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output"); k << k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" << k.decl<const uint_>("lid") << " = get_local_id(0);\n" << "__local " << type_name<T>() << " scratch[TPB];\n" << // private reduction k.decl<T>("sum") << " = 0;\n" << "for(uint i = 0; i < VPT; i++){\n" << " if(offset + lid + i*TPB < count){\n" << " sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" << " }\n" << "}\n" << "scratch[lid] = sum;\n" << // local reduction ReduceBody<T,false>::body() << // write sum to output "if(lid == 0){\n" << " output[get_group_id(0)] = scratch[0];\n" << "}\n"; const context &context = queue.get_context(); std::stringstream options; options << "-DVPT=" << vpt << " -DTPB=" << tpb; kernel generic_reduce_kernel = k.compile(context, options.str()); generic_reduce_kernel.set_arg(output_arg, result); size_t work_size = calculate_work_size(count, vpt, tpb); queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb); }
void exec_1d(command_queue &queue, size_t global_work_offset, size_t global_work_size) { const context &context = queue.get_context(); ::boost::compute::kernel kernel = compile(context); queue.enqueue_1d_range_kernel(kernel, global_work_offset, global_work_size); }
inline void opencv_copy_image_to_mat(const image2d &image, cv::Mat &mat, command_queue &queue = system::default_queue()) { BOOST_ASSERT(mat.isContinuous()); BOOST_ASSERT(image.get_context() == queue.get_context()); size_t origin[2] = { 0, 0 }; size_t region[2] = { image.width(), image.height() }; queue.enqueue_read_image(image, origin, region, 0, mat.data); }
inline bool includes(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<InputIterator1>::value_type value_type; int tile_size = 1024; int count1 = detail::iterator_range_size(first1, last1); int count2 = detail::iterator_range_size(first2, last2); vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); // Tile the sets detail::tile_sets_kernel tiling_kernel; tiling_kernel.tile_size = tile_size; tiling_kernel.set_range(first1, last1, first2, last2, tile_a.begin()+1, tile_b.begin()+1); fill_n(tile_a.begin(), 1, 0, queue); fill_n(tile_b.begin(), 1, 0, queue); tiling_kernel.exec(queue); fill_n(tile_a.end()-1, 1, count1, queue); fill_n(tile_b.end()-1, 1, count2, queue); vector<uint_> result((count1+count2+tile_size-1)/tile_size, queue.get_context()); // Find individually detail::serial_includes_kernel includes_kernel; includes_kernel.tile_size = tile_size; includes_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(), tile_b.begin(), result.begin()); includes_kernel.exec(queue); return find(result.begin(), result.end(), 0, queue) == result.end(); }
inline event write_single_value(const T &value, const buffer &buffer, size_t index, command_queue &queue) { BOOST_ASSERT(index < buffer.size() / sizeof(T)); BOOST_ASSERT(buffer.get_context() == queue.get_context()); return queue.enqueue_write_buffer(buffer, index * sizeof(T), sizeof(T), &value); }
inline image2d opencv_create_image2d_with_mat(const cv::Mat &mat, cl_mem_flags flags, command_queue &queue = system::default_queue()) { const context &context = queue.get_context(); const image_format format = opencv_get_mat_image_format(mat); image2d image(context, mat.cols, mat.rows, format, flags); opencv_copy_mat_to_image(mat, image, queue); return image; }
inline OutputIterator scan_on_cpu(InputIterator first, InputIterator last, OutputIterator result, bool exclusive, command_queue &queue) { if(first == last){ return result; } typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename std::iterator_traits<OutputIterator>::value_type output_type; const context &context = queue.get_context(); // create scan kernel meta_kernel k("scan_on_cpu"); k.add_arg<ulong_>("n"); k << k.decl<input_type>("sum") << " = 0;\n" << "for(ulong i = 0; i < n; i++){\n" << k.decl<const input_type>("x") << " = " << first[k.var<ulong_>("i")] << ";\n"; if(exclusive){ k << result[k.var<ulong_>("i")] << " = sum;\n"; } k << " sum = sum + x;\n"; if(!exclusive){ k << result[k.var<ulong_>("i")] << " = sum;\n"; } k << "}\n"; // compile scan kernel kernel scan_kernel = k.compile(context); // setup kernel arguments size_t n = detail::iterator_range_size(first, last); scan_kernel.set_arg<ulong_>(0, n); // execute the kernel queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1); // return iterator pointing to the end of the result range return result + n; }
inline OutputIterator copy_if_impl(InputIterator first, InputIterator last, OutputIterator result, Predicate predicate, bool copyIndex, command_queue &queue) { typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type; size_t count = detail::iterator_range_size(first, last); if(count == 0){ return result; } const context &context = queue.get_context(); // storage for destination indices ::boost::compute::vector<cl_uint> indices(count, context); // write counts ::boost::compute::detail::meta_kernel k1("copy_if_write_counts"); k1 << indices.begin()[k1.get_global_id(0)] << " = " << predicate(first[k1.get_global_id(0)]) << " ? 1 : 0;\n"; k1.exec_1d(queue, 0, count); // count number of elements to be copied size_t copied_element_count = ::boost::compute::count(indices.begin(), indices.end(), 1, queue); // scan indices ::boost::compute::exclusive_scan(indices.begin(), indices.end(), indices.begin(), queue); // copy values ::boost::compute::detail::meta_kernel k2("copy_if_do_copy"); k2 << "if(" << predicate(first[k2.get_global_id(0)]) << ")" << " " << result[indices.begin()[k2.get_global_id(0)]] << "="; if(copyIndex){ k2 << k2.get_global_id(0) << ";\n"; } else { k2 << first[k2.get_global_id(0)] << ";\n"; } k2.exec_1d(queue, 0, count); return result + static_cast<difference_type>(copied_element_count); }