Esempio n. 1
0
inline void kernel_common(std::ostringstream &o, const cl::Device& device) {
    if(std::is_same<T, cl_double>::value) {
        o << standard_kernel_header(device)
          << "typedef double real_t;\n"
          << "typedef double2 real2_t;\n";
    } else {
        o << "typedef float real_t;\n"
          << "typedef float2 real2_t;\n";
    }
}
Esempio n. 2
0
        typename std::enable_if<
            !boost::proto::matches<
                typename boost::proto::result_of::as_expr<ExprTuple>::type,
                multivector_full_grammar
            >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
            && N == std::tuple_size<ExprTuple>::value
#endif
            , const multivector&
        >::type
        operator=(const ExprTuple &expr) {
#endif
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream source;

                    source << standard_kernel_header(device);

                    {
                        get_header f(source);
                        for_each<0>(expr, f);
                    }

                    source <<
                        "kernel void multi_expr_tuple(\n"
                        "\t" << type_name<size_t>() << " n";

                    for(uint i = 1; i <= N; i++)
                        source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

                    {
                        get_params f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n)\n{\n";

                    if ( is_cpu(device) ) {
                        source <<
                            "\tsize_t chunk_size  = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                            "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                            "\tsize_t chunk_end   = min(n, chunk_start + chunk_size);\n"
                            "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
                    } else {
                        source <<
                            "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
                    }

                    {
                        get_expressions f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n";

                    for(uint i = 1; i <= N; i++)
                        source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, "multi_expr_tuple");
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, (*vec[i])(d));

                    {
                        set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                        for_each<0>(expr, f);
                    }

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
Esempio n. 3
0
 source_generator(const command_queue &queue)
     : indent(0), first_prm(true), cpu( is_cpu(queue) )
 {
     src << standard_kernel_header(queue);
 }
Esempio n. 4
0
        typename std::enable_if<
            boost::proto::matches<
                typename boost::proto::result_of::as_expr<Expr>::type,
                multivector_expr_grammar
            >::value,
            const multivector&
        >::type
        operator=(const Expr& expr) {
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            // If any device in context is CPU, then do not fuse the kernel,
            // but assign components individually.
            if (std::any_of(queue.begin(), queue.end(), [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); })) {
                assign_subexpressions<0, N>(boost::proto::as_child(expr));
                return *this;
            }

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream kernel_name;
                    kernel_name << "multi_";
                    vector_name_context name_ctx(kernel_name);
                    boost::proto::eval(boost::proto::as_child(expr), name_ctx);

                    std::ostringstream source;
                    source << standard_kernel_header(device);

                    extract_user_functions()(
                            boost::proto::as_child(expr),
                            declare_user_function(source)
                            );

                    source << "kernel void " << kernel_name.str()
                           << "(\n\t" << type_name<size_t>() << " n";

                    for(size_t i = 0; i < N; )
                        source << ",\n\tglobal " << type_name<T>()
                               << " *res_" << ++i;

                    build_param_list<N>(boost::proto::as_child(expr), source);

                    source <<
                        "\n)\n{\n"
                        "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

                    build_expr_list(boost::proto::as_child(expr), source);

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, kernel_name.str().c_str());
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

                    set_kernel_args<N>(
                            boost::proto::as_child(expr),
                            kernel->second.kernel, d, pos, vec[0]->part_start(d)
                            );

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }