Пример #1
0
    kernel compile(const context &context, const std::string &options = std::string())
    {
        // generate the program source
        std::string source = this->source();

        // generate cache key
        std::string cache_key = detail::sha1(source);

        // try to look the program up in the cache
        boost::shared_ptr<program_cache> cache = get_program_cache(context);
        ::boost::compute::program program = cache->get(cache_key);

        // build the program if it was not in the cache
        if(!program.get()){
            program = ::boost::compute::program::build_with_source(
                source, context, options
            );

            cache->insert(cache_key, program);
        }

        // create kernel
        ::boost::compute::kernel kernel = program.create_kernel(name());

        // bind stored args
        for(size_t i = 0; i < m_stored_args.size(); i++){
            const detail::meta_kernel_stored_arg &arg = m_stored_args[i];

            if(arg.m_size != 0){
                kernel.set_arg(i, arg.m_size, arg.m_value);
            }
        }

        // bind buffer args
        for(size_t i = 0; i < m_stored_buffers.size(); i++){
            const detail::meta_kernel_buffer_info &bi = m_stored_buffers[i];

            kernel.set_arg(bi.index, bi.m_mem);
        }

        return kernel;
    }
Пример #2
0
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();

    detail::meta_kernel k("reduce");
    k.add_arg<T*>("__global const","input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>("__global","output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB]; \n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // discrimination on vendor name
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
         "if(lid == 0){\n" <<
         "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
         "}\n";

    uint_ vpt = 8;
    uint_ tpb = 128;

    size_t count = std::distance(first, last);

    const context &context = queue.get_context();
    boost::shared_ptr<program_cache> cache = get_program_cache(context);
    std::string cache_key = std::string("boost_reduce_on_gpu_") + type_name<T>();
    program reduce_program = cache->get(cache_key);
    if(!reduce_program.get()){
        // create reduce program
        std::stringstream options;
        options << "-DT=" << type_name<T>()
                << " -DVPT=" << vpt
                << " -DTPB=" << tpb;
        reduce_program = program::build_with_source(k.source(), context, options.str());

        cache->insert(cache_key, reduce_program);
    }

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    // first pass, reduce from input to ping
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = std::ceil(float(count) / vpt / tpb);

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, count / vpt / tpb * sizeof(T));
    const buffer *output_buffer = &pong;
    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            size_t work_size = std::ceil(float(count) / vpt);
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = std::ceil(float(count) / vpt / tpb);
        }
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}