Ejemplo n.º 1
0
    void generate(OutputIterator first_ctr, OutputIterator last_ctr, OutputIterator first_key, OutputIterator last_key, command_queue &queue) {
        const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
        const size_t size_key = detail::iterator_range_size(first_key, last_key);
        if(!size_ctr || !size_key || (size_ctr != size_key)) {
            return;
        }
        kernel rng_kernel = m_program.create_kernel("generate_rng");
       
        rng_kernel.set_arg(0, first_ctr.get_buffer());
        rng_kernel.set_arg(1, first_key.get_buffer());
        size_t offset = 0;

        for(;;){
            size_t count = 0;
            size_t size = size_ctr/2;
            if(size > threads){
                count = threads;
            }
            else {
                count = size;
            }
            rng_kernel.set_arg(2, static_cast<const uint_>(offset));
            queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

            offset += count;

            if(offset >= size){
                break;
            }

        }
    }
Ejemplo n.º 2
0
    void fill(OutputIterator first, OutputIterator last, command_queue &queue)
    {
        const buffer &buffer = first.get_buffer();
        const size_t size = detail::iterator_range_size(first, last);

        kernel fill_kernel(m_program, "fill");
        fill_kernel.set_arg(0, m_state_buffer);
        fill_kernel.set_arg(1, buffer);

        size_t p = 0;

        for(;;){
            size_t count = 0;
            if(size - p >= n)
                count = n;
            else
                count = size - p;

            fill_kernel.set_arg(2, static_cast<uint_>(p));
            queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

            p += n;

            if(p >= size)
                break;

            generate_state(queue);
        }
    }
Ejemplo n.º 3
0
    void generate(OutputIterator first, OutputIterator last, command_queue &queue)
    {
        const size_t size = detail::iterator_range_size(first, last);

        kernel fill_kernel(m_program, "fill");
        fill_kernel.set_arg(0, m_state_buffer);
        fill_kernel.set_arg(2, first.get_buffer());

        size_t offset = 0;
        size_t &p = m_state_index;

        for(;;){
            size_t count = 0;
            if(size > n){
                count = n;
            }
            else {
                count = size;
            }
            fill_kernel.set_arg(1, static_cast<const uint_>(p));
            fill_kernel.set_arg(3, static_cast<const uint_>(offset));
            queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

            p += count;
            offset += count;

            if(offset >= size){
                break;
            }

            generate_state(queue);
            p = 0;
        }
    }
    void generate(OutputIterator first, OutputIterator last, command_queue &queue)
    {
        size_t size = detail::iterator_range_size(first, last);

        kernel fill_kernel(m_program, "fill");
        fill_kernel.set_arg(1, m_multiplicands);
        fill_kernel.set_arg(2, first.get_buffer());

        size_t offset = 0;

        for(;;){
            size_t count = 0;
            if(size > threads){
                count = threads;
            }
            else {
                count = size;
            }
            fill_kernel.set_arg(0, static_cast<const uint_>(m_seed));
            fill_kernel.set_arg(3, static_cast<const uint_>(offset));
            queue.enqueue_1d_range_kernel(fill_kernel, 0, count, 0);

            offset += count;

            if(offset >= size){
                break;
            }

            update_seed(queue);
        }
    }
Ejemplo n.º 5
0
    void generate(OutputIterator first_ctr, OutputIterator last_ctr, command_queue &queue) {
        const size_t size_ctr = detail::iterator_range_size(first_ctr, last_ctr);
        if(!size_ctr) {
            return;
        }
        boost::compute::vector<uint_> vector_key(size_ctr, m_context);
        vector_key.assign(size_ctr, 0, queue);
        kernel rng_kernel = m_program.create_kernel("generate_rng");

        rng_kernel.set_arg(0, first_ctr.get_buffer());
        rng_kernel.set_arg(1, vector_key);
        size_t offset = 0;

        for(;;){
            size_t count = 0;
            size_t size = size_ctr/2;
            if(size > threads){
                count = threads;
            }
            else {
                count = size;
            }
            rng_kernel.set_arg(2, static_cast<const uint_>(offset));
            queue.enqueue_1d_range_kernel(rng_kernel, 0, count, 0);

            offset += count;

            if(offset >= size){
                break;
            }

        }
    }
Ejemplo n.º 6
0
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            is_device_iterator<OutputIterator>,
                            can_copy_with_copy_buffer<
                                InputIterator, OutputIterator
                            >
                        >
                    >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy
        return make_future(result, event());
    }

    event event_ =
        queue.enqueue_copy_buffer(
            first.get_buffer(),
            result.get_buffer(),
            first.get_index() * sizeof(value_type),
            result.get_index() * sizeof(value_type),
            static_cast<size_t>(n) * sizeof(value_type)
        );

    return make_future(result + n, event_);
}
Ejemplo n.º 7
0
inline command_queue
default_queue_for_copy(InputIterator first,
                       OutputIterator result,
                       typename boost::enable_if<
                           typename boost::mpl::and_<
                               boost::mpl::not_<is_buffer_iterator<InputIterator> >,
                               is_buffer_iterator<OutputIterator>
                           >
                       >::type* = 0)
{
    (void) first;

    const buffer &buffer = result.get_buffer();
    const context &context = buffer.get_context();
    const device &device = context.get_device();

    return command_queue(context, device);
}
Ejemplo n.º 8
0
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if_c<
                  boost::is_same<
                      InputIterator,
                      buffer_iterator<typename InputIterator::value_type>
                  >::value &&
                  boost::is_same<
                      OutputIterator,
                      buffer_iterator<typename OutputIterator::value_type>
                  >::value &&
                  boost::is_same<
                      typename InputIterator::value_type,
                      typename OutputIterator::value_type
                  >::value
              >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy
        return first;
    }

    queue.enqueue_copy_buffer(first.get_buffer(),
                              result.get_buffer(),
                              first.get_index() * sizeof(value_type),
                              result.get_index() * sizeof(value_type),
                              static_cast<size_t>(n) * sizeof(value_type));
    return result + n;
}
Ejemplo n.º 9
0
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                         function(k.expr<input_type>("block[lid]"),
                                  k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                     function(k.var<result_type>("result"),
                              first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}