void generate(OutputIterator first,
                  OutputIterator last,
                  Generator &generator,
                  command_queue &queue)
    {
        size_t size = std::distance(first, last);
        typedef typename Generator::result_type g_result_type;

        vector<g_result_type> tmp(size, queue.get_context());
        vector<g_result_type> tmp2(size, queue.get_context());

        uint_ bound = ((uint_(-1))/(m_b-m_a+1))*(m_b-m_a+1);

        buffer_iterator<g_result_type> tmp2_iter;

        while(size>0)
        {
            generator.generate(tmp.begin(), tmp.begin() + size, queue);
            tmp2_iter = copy_if(tmp.begin(), tmp.begin() + size, tmp2.begin(),
                                _1 <= bound, queue);
            size = std::distance(tmp2_iter, tmp2.end());
        }

        BOOST_COMPUTE_FUNCTION(IntType, scale_random, (const g_result_type x),
        {
            return LO + (x % (HI-LO+1));
        });
Beispiel #2
0
inline InputIterator binary_find(InputIterator first,
                                 InputIterator last,
                                 UnaryPredicate predicate,
                                 command_queue &queue = system::default_queue())
{
    const device &device = queue.get_device();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    const std::string cache_key = "__boost_binary_find";

    size_t find_if_limit = 128;
    size_t threads = parameters->get(cache_key, "tpb", 128);
    size_t count = iterator_range_size(first, last);

    InputIterator search_first = first;
    InputIterator search_last = last;

    scalar<uint_> index(queue.get_context());

    // construct and compile binary_find kernel
    binary_find_kernel<InputIterator, UnaryPredicate>
        binary_find_kernel(search_first, search_last, predicate);
    ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context());

    // set buffer for index
    kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer());

    while(count > find_if_limit) {
        index.write(static_cast<uint_>(count), queue);

        // set block and run binary_find kernel
        uint_ block = static_cast<uint_>((count - 1)/(threads - 1));
        kernel.set_arg(binary_find_kernel.m_block_arg, block);
        queue.enqueue_1d_range_kernel(kernel, 0, threads, 0);

        size_t i = index.read(queue);

        if(i == count) {
            search_first = search_last - ((count - 1)%(threads - 1));
            break;
        } else {
            search_last = search_first + i;
            search_first = search_last - ((count - 1)/(threads - 1));
        }

        // Make sure that first and last stay within the input range
        search_last = (std::min)(search_last, last);
        search_last = (std::max)(search_last, first);

        search_first = (std::max)(search_first, first);
        search_first = (std::min)(search_first, last);

        count = iterator_range_size(search_first, search_last);
    }

    return find_if(search_first, search_last, predicate, queue);
}
inline OutputIterator
set_symmetric_difference(InputIterator1 first1,
                         InputIterator1 last1,
                         InputIterator2 first2,
                         InputIterator2 last2,
                         OutputIterator result,
                         command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;

    int tile_size = 1024;

    int count1 = detail::iterator_range_size(first1, last1);
    int count2 = detail::iterator_range_size(first2, last2);

    vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context());
    vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context());

    // Tile the sets
    detail::balanced_path_kernel tiling_kernel;
    tiling_kernel.tile_size = tile_size;
    tiling_kernel.set_range(first1, last1, first2, last2,
                            tile_a.begin()+1, tile_b.begin()+1);
    fill_n(tile_a.begin(), 1, 0, queue);
    fill_n(tile_b.begin(), 1, 0, queue);
    tiling_kernel.exec(queue);

    fill_n(tile_a.end()-1, 1, count1, queue);
    fill_n(tile_b.end()-1, 1, count2, queue);

    vector<value_type> temp_result(count1+count2, queue.get_context());
    vector<uint_> counts((count1+count2+tile_size-1)/tile_size + 1, queue.get_context());
    fill_n(counts.end()-1, 1, 0, queue);

    // Find individual symmetric differences
    detail::serial_set_symmetric_difference_kernel symmetric_difference_kernel;
    symmetric_difference_kernel.tile_size = tile_size;
    symmetric_difference_kernel.set_range(first1, first2, tile_a.begin(),
                                            tile_a.end(), tile_b.begin(),
                                            temp_result.begin(), counts.begin());

    symmetric_difference_kernel.exec(queue);

    exclusive_scan(counts.begin(), counts.end(), counts.begin(), queue);

    // Compact the results
    detail::compact_kernel compact_kernel;
    compact_kernel.tile_size = tile_size;
    compact_kernel.set_range(temp_result.begin(), counts.begin(), counts.end(), result);

    compact_kernel.exec(queue);

    return result + (counts.end() - 1).read(queue);
}
Beispiel #4
0
inline InputIterator binary_find(InputIterator first,
                                 InputIterator last,
                                 UnaryPredicate predicate,
                                 command_queue &queue = system::default_queue())
{
    size_t find_if_limit = 128;
    size_t threads = 128;
    size_t count = iterator_range_size(first, last);

    while(count > find_if_limit) {

        scalar<uint_> index(queue.get_context());
        index.write(static_cast<uint_>(count), queue);

        binary_find_kernel kernel;
        kernel.set_range(first, last, predicate);
        kernel.exec(queue, index);

        size_t i = index.read(queue);

        if(i == count) {
            first = last - count%threads;
            break;
        } else {
            last = first + i;
            first = last - count/threads;
        }

        count = iterator_range_size(first, last);
    }

    return find_if(first, last, predicate, queue);
}
Beispiel #5
0
inline void sort2(const buffer &buffer, command_queue &queue)
{
    const context &context = queue.get_context();

    boost::shared_ptr<detail::program_cache> cache =
        detail::get_program_cache(context);
    std::string cache_key =
        std::string("fixed_sort2_") + type_name<T>();

    program sort2_program = cache->get(cache_key);
    if(!sort2_program.get()){
        const char source[] =
            "__kernel void sort2(__global T *input)\n"
            "{\n"
            "    const T x = input[0];\n"
            "    const T y = input[1];\n"
            "    if(y < x){\n"
            "        input[0] = y;\n"
            "        input[1] = x;\n"
            "    }\n"
            "}\n";

        sort2_program = program::build_with_source(
                source, context, std::string("-DT=") + type_name<T>()
                );

        cache->insert(cache_key, sort2_program);
    }

    kernel sort2_kernel = sort2_program.create_kernel("sort2");
    sort2_kernel.set_arg(0, buffer);
    queue.enqueue_task(sort2_kernel);
}
Beispiel #6
0
inline void opengl_enqueue_release_buffer(const opengl_buffer &buffer,
                                          command_queue &queue)
{
    BOOST_ASSERT(buffer.get_context() == queue.get_context());

    opengl_enqueue_release_gl_objects(1, &buffer.get(), queue);
}
Beispiel #7
0
inline vector<
    typename boost::compute::result_of<
        BinaryFunction(
            typename std::iterator_traits<InputIterator>::value_type,
            typename std::iterator_traits<InputIterator>::value_type
        )
    >::type
>
block_reduce(InputIterator first,
             size_t count,
             size_t block_size,
             BinaryFunction function,
             command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
    vector<result_type> result_vector(total_block_count, context);

    reduce(first, count, result_vector.begin(), block_size, function, queue);

    return result_vector;
}
Beispiel #8
0
inline void gemm(const matrix_order order,
                 const matrix_transpose trans_a,
                 const matrix_transpose trans_b,
                 const int M,
                 const int N,
                 const int K,
                 const Scalar alpha,
                 device_ptr<Scalar> A,
                 const int lda,
                 device_ptr<Scalar> B,
                 const int ldb,
                 const Scalar beta,
                 device_ptr<Scalar> C,
                 const int ldc,
                 command_queue &queue)
{
    (void) order;
    (void) trans_a;
    (void) trans_b;

    ::boost::compute::detail::meta_kernel k("gemm");
    k.add_set_arg<Scalar>("alpha", alpha);
    k.add_set_arg<Scalar>("beta", beta);
    k.add_set_arg<const cl_uint>("M", static_cast<const cl_uint>(M));
    k.add_set_arg<const cl_uint>("N", static_cast<const cl_uint>(N));
    k.add_set_arg<const cl_uint>("K", static_cast<const cl_uint>(K));
    k.add_set_arg<const cl_uint>("lda", static_cast<const cl_uint>(lda));
    k.add_set_arg<const cl_uint>("ldb", static_cast<const cl_uint>(ldb));
    k.add_set_arg<const cl_uint>("ldc", static_cast<const cl_uint>(ldc));
    size_t a_index = k.add_arg<const Scalar *>("__global", "A");
    size_t b_index = k.add_arg<const Scalar *>("__global", "B");
    size_t c_index = k.add_arg<Scalar *>("__global", "C");

    k <<
        k.decl<cl_uint>("i") << " = get_global_id(0);\n" <<
        k.decl<cl_uint>("j") << " = get_global_id(1);\n" <<
        k.decl<Scalar>("sum") << " = 0;\n" <<
        "for(uint k = 0; k < K; k++){\n" <<
        "    sum += " << A[k.expr<cl_uint>("i*lda+k")] << " * "
                      << B[k.expr<cl_uint>("k*ldb+j")] << ";\n" <<
        "};\n" <<
        C[k.expr<cl_uint>("i*ldc+j")] << "=" <<
            "alpha * sum + beta *" << C[k.expr<cl_uint>("i*ldc+j")] << ";\n";

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);

    kernel.set_arg(a_index, A.get_buffer());
    kernel.set_arg(b_index, B.get_buffer());
    kernel.set_arg(c_index, C.get_buffer());

    size_t work_group_offsets[] = { 0, 0 };
    size_t work_group_sizes[] = { static_cast<size_t>(N),
                                  static_cast<size_t>(M) };
    queue.enqueue_nd_range_kernel(kernel,
                                  2,
                                  work_group_offsets,
                                  work_group_sizes,
                                  0);
}
inline TextIterator search_n(TextIterator t_first,
                             TextIterator t_last,
                             size_t n,
                             ValueType value,
                             command_queue &queue = system::default_queue())
{
    // there is no need to check if pattern starts at last n - 1 indices
    vector<uint_> matching_indices(
        detail::iterator_range_size(t_first, t_last) + 1 - n,
        queue.get_context()
    );

    // search_n_kernel puts value 1 at every index in vector where pattern
    // of n values starts at
    detail::search_n_kernel<TextIterator,
                            vector<uint_>::iterator> kernel;

    kernel.set_range(t_first, t_last, value, n, matching_indices.begin());
    kernel.exec(queue);

    vector<uint_>::iterator index = ::boost::compute::find(
        matching_indices.begin(), matching_indices.end(), uint_(1), queue
    );

    // pattern was not found
    if(index == matching_indices.end())
        return t_last;

    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
}
inline OutputIterator
adjacent_difference(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    BinaryFunction op,
                    command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    if(first == last) {
        return result;
    }

    if (first == result) {
        vector<value_type> temp(detail::iterator_range_size(first, last),
                                queue.get_context());
        copy(first, last, temp.begin(), queue);

        return ::boost::compute::detail::dispatch_adjacent_difference(
            temp.begin(), temp.end(), result, op, queue
        );
    }
    else {
        return ::boost::compute::detail::dispatch_adjacent_difference(
            first, last, result, op, queue
        );
    }
}
Beispiel #11
0
inline T inner_product(InputIterator1 first1,
                       InputIterator1 last1,
                       InputIterator2 first2,
                       T init,
                       BinaryAccumulateFunction accumulate_function,
                       BinaryTransformFunction transform_function,
                       command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;

    size_t count = detail::iterator_range_size(first1, last1);
    vector<value_type> result(count, queue.get_context());
    transform(first1,
              last1,
              first2,
              result.begin(),
              transform_function,
              queue);

    return ::boost::compute::accumulate(result.begin(),
                                        result.end(),
                                        init,
                                        accumulate_function,
                                        queue);
}
Beispiel #12
0
inline void serial_reduce(InputIterator first,
                          InputIterator last,
                          OutputIterator result,
                          BinaryFunction function,
                          command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type T;
    typedef typename
        ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;

    const context &context = queue.get_context();
    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return;
    }

    meta_kernel k("serial_reduce");
    size_t count_arg = k.add_arg<cl_uint>("count");

    k <<
        k.decl<result_type>("result") << " = " << first[0] << ";\n" <<
        "for(uint i = 1; i < count; i++)\n" <<
        "    result = " << function(k.var<T>("result"),
                                    first[k.var<uint_>("i")]) << ";\n" <<
        result[0] << " = result;\n";

    kernel kernel = k.compile(context);

    kernel.set_arg(count_arg, static_cast<uint_>(count));

    queue.enqueue_task(kernel);
}
Beispiel #13
0
inline void sort_by_transform(Iterator first,
                              Iterator last,
                              Transform transform,
                              Compare compare,
                              command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<Iterator>::value_type value_type;
    typedef typename boost::compute::result_of<Transform(value_type)>::type key_type;

    size_t n = detail::iterator_range_size(first, last);
    if(n < 2){
        return;
    }

    const context &context = queue.get_context();

    ::boost::compute::vector<key_type> keys(n, context);

    ::boost::compute::transform(
        first,
        last,
        keys.begin(),
        transform,
        queue
    );

    ::boost::compute::sort_by_key(
        keys.begin(),
        keys.end(),
        first,
        compare,
        queue
    );
}
inline InputIterator
adjacent_find_with_atomics(InputIterator first,
                           InputIterator last,
                           Compare compare,
                           command_queue &queue)
{
    if(first == last){
        return last;
    }

    const context &context = queue.get_context();
    size_t count = detail::iterator_range_size(first, last);

    // initialize output to the last index
    detail::scalar<uint_> output(context);
    output.write(static_cast<uint_>(count), queue);

    detail::meta_kernel k("adjacent_find_with_atomics");

    size_t output_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output");

    k << "const uint i = get_global_id(0);\n"
      << "if(" << compare(first[k.expr<uint_>("i")],
                          first[k.expr<uint_>("i+1")]) << "){\n"
      << "    atomic_min(output, i);\n"
      << "}\n";

    k.set_arg(output_arg, output.get_buffer());

    k.exec_1d(queue, 0, count - 1, 1);

    return first + output.read(queue);
}
Beispiel #15
0
inline TextIterator find_end(PatternIterator p_first,
                             PatternIterator p_last,
                             TextIterator t_first,
                             TextIterator t_last,
                             command_queue &queue = system::default_queue())
{
    const context &context = queue.get_context();
    vector<uint_> matching_indices(detail::iterator_range_size(t_first, t_last),
                                    context);

    detail::search_kernel<PatternIterator,
                          TextIterator,
                          vector<uint_>::iterator> kernel;

    kernel.set_range(p_first, p_last, t_first, t_last, matching_indices.begin());
    kernel.exec(queue);

    using boost::compute::_1;

    vector<uint_>::iterator index =
        detail::find_end_helper(matching_indices.begin(),
                                matching_indices.end(),
                                _1 == 1,
                                queue);

    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
}
Beispiel #16
0
inline cv::Mat opencv_create_mat_with_image2d(const image2d &image,
                                              command_queue &queue = system::default_queue())
{
    BOOST_ASSERT(image.get_context() == queue.get_context());

    cv::Mat mat;
    image_format format = image.get_format();
    const cl_image_format *cl_image_format = format.get_format_ptr();

    if(cl_image_format->image_channel_data_type == CL_UNORM_INT8 &&
            cl_image_format->image_channel_order == CL_BGRA)
    {
        mat = cv::Mat(image.height(), image.width(), CV_8UC4);
    }
    else if(cl_image_format->image_channel_data_type == CL_UNORM_INT16 &&
            cl_image_format->image_channel_order == CL_BGRA)
    {
        mat = cv::Mat(image.height(), image.width(), CV_16UC4);
    }
    else if(cl_image_format->image_channel_data_type == CL_FLOAT &&
            cl_image_format->image_channel_order == CL_INTENSITY)
    {
        mat = cv::Mat(image.height(), image.width(), CV_32FC1);
    }
    else
    {
        mat = cv::Mat(image.height(), image.width(), CV_8UC1);
    }

    opencv_copy_image_to_mat(image, mat, queue);

    return mat;
}
Beispiel #17
0
/// Enqueues a command to release the specified OpenGL buffer.
///
/// \see_opencl_ref{clEnqueueReleaseGLObjects}
inline event opengl_enqueue_release_buffer(const opengl_buffer &buffer,
                                          command_queue &queue,
                                          const wait_list &events = wait_list())
{
    BOOST_ASSERT(buffer.get_context() == queue.get_context());

    return opengl_enqueue_release_gl_objects(1, &buffer.get(), queue, events);
}
Beispiel #18
0
inline void merge_sort_by_key_on_cpu(KeyIterator keys_first,
                                     KeyIterator keys_last,
                                     ValueIterator values_first,
                                     Compare compare,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    size_t count = iterator_range_size(keys_first, keys_last);
    if(count < 2){
        return;
    }
    // for small input size only insertion sort is performed
    else if(count <= 512){
        block_insertion_sort(keys_first, values_first, compare,
                             count, count, true, queue);
        return;
    }

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    // loading parameters
    std::string cache_key =
        std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>()
        + "_with_" + type_name<key_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    const size_t block_size =
        parameters->get(cache_key, "insertion_sort_by_key_block_size", 64);
    block_insertion_sort(keys_first, values_first, compare,
                         count, block_size, true, queue);

    // temporary buffer for merge results
    vector<value_type> values_temp(count, context);
    vector<key_type> keys_temp(count, context);
    bool result_in_temporary_buffer = false;

    for(size_t i = block_size; i < count; i *= 2){
        result_in_temporary_buffer = !result_in_temporary_buffer;
        if(result_in_temporary_buffer) {
            merge_blocks(keys_first, values_first,
                         keys_temp.begin(), values_temp.begin(),
                         compare, count, i, true, queue);
        } else {
            merge_blocks(keys_temp.begin(), values_temp.begin(),
                         keys_first, values_first,
                         compare, count, i, true, queue);
        }
    }

    if(result_in_temporary_buffer) {
        copy(keys_temp.begin(), keys_temp.end(), keys_first, queue);
        copy(values_temp.begin(), values_temp.end(), values_first, queue);
    }
}
Beispiel #19
0
inline void opencv_copy_image_to_mat(const image2d &image,
                                     cv::Mat &mat,
                                     command_queue &queue = system::default_queue())
{
    BOOST_ASSERT(mat.isContinuous());
    BOOST_ASSERT(image.get_context() == queue.get_context());

    queue.enqueue_read_image(image, image.origin(), image.size(), mat.data);
}
Beispiel #20
0
inline void serial_insertion_sort_by_key(KeyIterator keys_first,
                                         KeyIterator keys_last,
                                         ValueIterator values_first,
                                         Compare compare,
                                         command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    size_t count = iterator_range_size(keys_first, keys_last);
    if(count < 2){
        return;
    }

    meta_kernel k("serial_insertion_sort_by_key");
    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "keys");
    size_t local_data_arg = k.add_arg<value_type *>(memory_object::local_memory, "data");
    size_t count_arg = k.add_arg<uint_>("n");

    k <<
        // copy data to local memory
        "for(uint i = 0; i < n; i++){\n" <<
        "    keys[i] = " << keys_first[k.var<uint_>("i")] << ";\n"
        "    data[i] = " << values_first[k.var<uint_>("i")] << ";\n"
        "}\n"

        // sort data in local memory
        "for(uint i = 1; i < n; i++){\n" <<
        "    " << k.decl<const key_type>("key") << " = keys[i];\n" <<
        "    " << k.decl<const value_type>("value") << " = data[i];\n" <<
        "    uint pos = i;\n" <<
        "    while(pos > 0 && " <<
                   compare(k.var<const key_type>("key"),
                           k.var<const key_type>("keys[pos-1]")) << "){\n" <<
        "        keys[pos] = keys[pos-1];\n" <<
        "        data[pos] = data[pos-1];\n" <<
        "        pos--;\n" <<
        "    }\n" <<
        "    keys[pos] = key;\n" <<
        "    data[pos] = value;\n" <<
        "}\n" <<

        // copy sorted data to output
        "for(uint i = 0; i < n; i++){\n" <<
        "    " << keys_first[k.var<uint_>("i")] << " = keys[i];\n"
        "    " << values_first[k.var<uint_>("i")] << " = data[i];\n"
        "}\n";

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);
    kernel.set_arg(local_keys_arg, static_cast<uint_>(count * sizeof(key_type)), 0);
    kernel.set_arg(local_data_arg, static_cast<uint_>(count * sizeof(value_type)), 0);
    kernel.set_arg(count_arg, static_cast<uint_>(count));

    queue.enqueue_task(kernel);
}
    /// Creates a new mersenne_twister_engine and seeds it with \p value.
    explicit mersenne_twister_engine(command_queue &queue,
                                     result_type value = default_seed)
        : m_context(queue.get_context()),
          m_state_buffer(m_context, n * sizeof(result_type))
    {
        // setup program
        load_program();

        // seed state
        seed(queue, value);
    }
Beispiel #22
0
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
                                     KeyIterator keys_last,
                                     ValueIterator values_first,
                                     Compare compare,
                                     bool stable,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    size_t count = iterator_range_size(keys_first, keys_last);
    if(count < 2){
        return;
    }

    size_t block_size =
        block_sort(
            keys_first, values_first,
            compare, count,
            true /* sort_by_key */, stable /* stable */,
            queue
        );

    // for small input size only block sort is performed
    if(count <= block_size) {
        return;
    }

    const context &context = queue.get_context();

    bool result_in_temporary_buffer = false;
    ::boost::compute::vector<key_type> temp_keys(count, context);
    ::boost::compute::vector<value_type> temp_values(count, context);

    for(; block_size < count; block_size *= 2) {
        result_in_temporary_buffer = !result_in_temporary_buffer;
        if(result_in_temporary_buffer) {
            merge_blocks_on_gpu(keys_first, values_first,
                                temp_keys.begin(), temp_values.begin(),
                                compare, count, block_size,
                                true /* sort_by_key */, queue);
        } else {
            merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(),
                                keys_first, values_first,
                                compare, count, block_size,
                                true /* sort_by_key */, queue);
        }
    }

    if(result_in_temporary_buffer) {
        copy_async(temp_keys.begin(), temp_keys.end(), keys_first, queue);
        copy_async(temp_values.begin(), temp_values.end(), values_first, queue);
    }
}
Beispiel #23
0
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T,false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}
Beispiel #24
0
    void exec_1d(command_queue &queue,
                 size_t global_work_offset,
                 size_t global_work_size)
    {
        const context &context = queue.get_context();

        ::boost::compute::kernel kernel = compile(context);

        queue.enqueue_1d_range_kernel(kernel,
                                      global_work_offset,
                                      global_work_size);
    }
Beispiel #25
0
inline void opencv_copy_image_to_mat(const image2d &image,
                                     cv::Mat &mat,
                                     command_queue &queue = system::default_queue())
{
    BOOST_ASSERT(mat.isContinuous());
    BOOST_ASSERT(image.get_context() == queue.get_context());

    size_t origin[2] = { 0, 0 };
    size_t region[2] = { image.width(), image.height() };

    queue.enqueue_read_image(image, origin, region, 0, mat.data);
}
Beispiel #26
0
inline bool includes(InputIterator1 first1,
                    InputIterator1 last1,
                    InputIterator2 first2,
                    InputIterator2 last2,
                    command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;

    int tile_size = 1024;

    int count1 = detail::iterator_range_size(first1, last1);
    int count2 = detail::iterator_range_size(first2, last2);

    vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context());
    vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context());

    // Tile the sets
    detail::tile_sets_kernel tiling_kernel;
    tiling_kernel.tile_size = tile_size;
    tiling_kernel.set_range(first1, last1, first2, last2,
                            tile_a.begin()+1, tile_b.begin()+1);
    fill_n(tile_a.begin(), 1, 0, queue);
    fill_n(tile_b.begin(), 1, 0, queue);
    tiling_kernel.exec(queue);

    fill_n(tile_a.end()-1, 1, count1, queue);
    fill_n(tile_b.end()-1, 1, count2, queue);

    vector<uint_> result((count1+count2+tile_size-1)/tile_size, queue.get_context());

    // Find individually
    detail::serial_includes_kernel includes_kernel;
    includes_kernel.tile_size = tile_size;
    includes_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(),
                                tile_b.begin(), result.begin());

    includes_kernel.exec(queue);

    return find(result.begin(), result.end(), 0, queue) == result.end();
}
inline event write_single_value(const T &value,
                                const buffer &buffer,
                                size_t index,
                                command_queue &queue)
{
    BOOST_ASSERT(index < buffer.size() / sizeof(T));
    BOOST_ASSERT(buffer.get_context() == queue.get_context());

    return queue.enqueue_write_buffer(buffer,
                                      index * sizeof(T),
                                      sizeof(T),
                                      &value);
}
Beispiel #28
0
inline image2d opencv_create_image2d_with_mat(const cv::Mat &mat,
                                              cl_mem_flags flags,
                                              command_queue &queue = system::default_queue())
{
    const context &context = queue.get_context();
    const image_format format = opencv_get_mat_image_format(mat);

    image2d image(context, mat.cols, mat.rows, format, flags);

    opencv_copy_mat_to_image(mat, image, queue);

    return image;
}
Beispiel #29
0
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  command_queue &queue)
{
    if(first == last){
        return result;
    }

    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();

    // create scan kernel
    meta_kernel k("scan_on_cpu");
    k.add_arg<ulong_>("n");
    k <<
        k.decl<input_type>("sum") << " = 0;\n" <<
        "for(ulong i = 0; i < n; i++){\n" <<
        k.decl<const input_type>("x") << " = "
            << first[k.var<ulong_>("i")] << ";\n";

    if(exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "    sum = sum + x;\n";

    if(!exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "}\n";

    // compile scan kernel
    kernel scan_kernel = k.compile(context);

    // setup kernel arguments
    size_t n = detail::iterator_range_size(first, last);
    scan_kernel.set_arg<ulong_>(0, n);

    // execute the kernel
    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);

    // return iterator pointing to the end of the result range
    return result + n;
}
Beispiel #30
0
inline OutputIterator copy_if_impl(InputIterator first,
                                   InputIterator last,
                                   OutputIterator result,
                                   Predicate predicate,
                                   bool copyIndex,
                                   command_queue &queue)
{
    typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return result;
    }

    const context &context = queue.get_context();

    // storage for destination indices
    ::boost::compute::vector<cl_uint> indices(count, context);

    // write counts
    ::boost::compute::detail::meta_kernel k1("copy_if_write_counts");
    k1 << indices.begin()[k1.get_global_id(0)] << " = "
           << predicate(first[k1.get_global_id(0)]) << " ? 1 : 0;\n";
    k1.exec_1d(queue, 0, count);

    // count number of elements to be copied
    size_t copied_element_count =
        ::boost::compute::count(indices.begin(), indices.end(), 1, queue);

    // scan indices
    ::boost::compute::exclusive_scan(indices.begin(),
                                     indices.end(),
                                     indices.begin(),
                                     queue);

    // copy values
    ::boost::compute::detail::meta_kernel k2("copy_if_do_copy");
    k2 << "if(" << predicate(first[k2.get_global_id(0)]) << ")" <<
          "    " << result[indices.begin()[k2.get_global_id(0)]] << "=";

    if(copyIndex){
        k2 << k2.get_global_id(0) << ";\n";
    }
    else {
        k2 << first[k2.get_global_id(0)] << ";\n";
    }

    k2.exec_1d(queue, 0, count);

    return result + static_cast<difference_type>(copied_element_count);
}