Example #1
0
        static backend::kernel& make_kernel(
                const backend::command_queue &q,
                const Expr &expr
                )
        {
            using namespace detail;

            // One compiled kernel per command queue, built lazily on first use.
            static kernel_cache cache;

            auto krn = cache.find(q);

            if (krn == cache.end()) {
                backend::source_generator src(q);

                // Global-scope preamble required by the expression terminals.
                output_terminal_preamble term_preamble(src, q, "prm", empty_state());
                boost::proto::eval(boost::proto::as_child(expr), term_preamble);

                src.begin_kernel("vexcl_any_of_kernel");
                src.begin_kernel_parameters();
                src.parameter<size_t>("n");

                // One kernel parameter per expression terminal.
                extract_terminals()(expr,
                        declare_expression_parameter(src, q, "prm", empty_state())
                        );

                // Output flag: set to 1 as soon as any element satisfies expr.
                src.template parameter< global_ptr<char> >("result");

                src.end_kernel_parameters();

                // Sequential scan over the whole range; the generated kernel
                // returns at the first match.
                src.new_line() << "for(ulong idx = 0; idx < n; ++idx)";
                src.open("{");

                output_local_preamble local_preamble(src, q, "prm", empty_state());
                boost::proto::eval(boost::proto::as_child(expr), local_preamble);

                src.new_line() << "if (";

                vector_expr_context body_ctx(src, q, "prm", empty_state());
                boost::proto::eval(boost::proto::as_child(expr), body_ctx);

                src << ")";
                src.open("{");
                src.new_line() << "result[0] = 1;";
                src.new_line() << "return;";
                src.close("}");

                src.close("}");
                src.new_line() << "result[0] = 0;";
                src.end_kernel();

                krn = cache.insert(q, backend::kernel(
                            q, src.str(), "vexcl_any_of_kernel"
                            ));

                // Intentionally serial: a single work-item in a single group.
                krn->second.config(1, 1);
            }

            return krn->second;
        }
Example #2
0
        // Assign a tuple of N expressions to the N components of the
        // multivector with a single fused kernel.  SFINAE: selected only when
        // the argument does NOT match the multivector grammar as a single
        // expression and (on compilers with a usable std::tuple_size,
        // MSVC >= 1700) the tuple arity equals N.
        typename std::enable_if<
            !boost::proto::matches<
                typename boost::proto::result_of::as_expr<ExprTuple>::type,
                multivector_full_grammar
            >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
            && N == std::tuple_size<ExprTuple>::value
#endif
            , const multivector&
        >::type
        operator=(const ExprTuple &expr) {
#endif
            // Compiled kernels are cached per OpenCL context.
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            // Build (once per context) and launch the kernel on each device.
            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream source;

                    source << standard_kernel_header(device);

                    {
                        // Header code (e.g. helper definitions) contributed by
                        // each component expression.
                        get_header f(source);
                        for_each<0>(expr, f);
                    }

                    source <<
                        "kernel void multi_expr_tuple(\n"
                        "\t" << type_name<size_t>() << " n";

                    // One output pointer per multivector component: res_1..res_N.
                    for(uint i = 1; i <= N; i++)
                        source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

                    {
                        // Kernel parameters for the terminals of each
                        // component expression.
                        get_params f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n)\n{\n";

                    if ( is_cpu(device) ) {
                        // CPU device: each work-item processes one contiguous
                        // chunk of the range.
                        source <<
                            "\tsize_t chunk_size  = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                            "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                            "\tsize_t chunk_end   = min(n, chunk_start + chunk_size);\n"
                            "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
                    } else {
                        // Other devices: grid-stride loop over the range.
                        source <<
                            "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
                    }

                    {
                        // Each component expression is evaluated into a
                        // temporary (buf_i) ...
                        get_expressions f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n";

                    // ... which is then stored into the matching output.
                    for(uint i = 1; i <= N; i++)
                        source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, "multi_expr_tuple");
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                // Launch only when this device holds a non-empty partition.
                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    // Device-local parts of each component as outputs.
                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, (*vec[i])(d));

                    {
                        // Remaining arguments come from the expression
                        // terminals, offset by this device's partition start.
                        set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                        for_each<0>(expr, f);
                    }

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
Example #3
0
        // Assign a single fused multivector expression to all components.
        // SFINAE: selected only when Expr matches the multivector expression
        // grammar.  Returns *this as const multivector&.
        typename std::enable_if<
            boost::proto::matches<
                typename boost::proto::result_of::as_expr<Expr>::type,
                multivector_expr_grammar
            >::value,
            const multivector&
        >::type
        operator=(const Expr& expr) {
            // Compiled kernels are cached per OpenCL context.
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            // If any device in context is CPU, then do not fuse the kernel,
            // but assign components individually.
            if (std::any_of(queue.begin(), queue.end(), [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); })) {
                assign_subexpressions<0, N>(boost::proto::as_child(expr));
                return *this;
            }

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    // Kernel name is derived from the expression structure.
                    std::ostringstream kernel_name;
                    kernel_name << "multi_";
                    vector_name_context name_ctx(kernel_name);
                    boost::proto::eval(boost::proto::as_child(expr), name_ctx);

                    std::ostringstream source;
                    source << standard_kernel_header(device);

                    // Declarations for any user-defined functions used in expr.
                    extract_user_functions()(
                            boost::proto::as_child(expr),
                            declare_user_function(source)
                            );

                    source << "kernel void " << kernel_name.str()
                           << "(\n\t" << type_name<size_t>() << " n";

                    // One output pointer per component (res_1 .. res_N).
                    // Loop rewritten for consistency with the tuple-assignment
                    // overload; previously the increment was embedded in the
                    // stream expression (`<< ++i`).
                    for(size_t i = 1; i <= N; ++i)
                        source << ",\n\tglobal " << type_name<T>()
                               << " *res_" << i;

                    // Kernel parameters for the expression terminals.
                    build_param_list<N>(boost::proto::as_child(expr), source);

                    source <<
                        "\n)\n{\n"
                        "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

                    // Kernel body: component-wise evaluation of expr.
                    build_expr_list(boost::proto::as_child(expr), source);

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, kernel_name.str().c_str());
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                // Launch only when this device holds a non-empty partition.
                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    // Device-local parts of each component as outputs.
                    // Spelled (*vec[i])(d) for consistency with the
                    // tuple-assignment overload (was vec[i]->operator()(d)).
                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, (*vec[i])(d));

                    // Remaining arguments come from the expression terminals,
                    // offset by this device's partition start.
                    set_kernel_args<N>(
                            boost::proto::as_child(expr),
                            kernel->second.kernel, d, pos, vec[0]->part_start(d)
                            );

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
Example #4
0
void eval(const Expr &expr,
        const std::vector<backend::command_queue> &queue,
        const std::vector<size_t> &part
        )
{
    using namespace vex::detail;

#if (VEXCL_CHECK_SIZES > 0)
    {
        get_expression_properties prop;
        extract_terminals()(boost::proto::as_child(expr), prop);

        precondition(
                prop.queue.empty() || prop.queue.size() == queue.size(),
                "Incompatible queue lists"
                );

        precondition(
                prop.size == 0 || prop.size == part.back(),
                "Incompatible expression sizes"
                );
    }
#endif
    static kernel_cache cache;

    for(unsigned d = 0; d < queue.size(); d++) {
        auto kernel = cache.find(queue[d]);

        backend::select_context(queue[d]);

        if (kernel == cache.end()) {
            backend::source_generator source(queue[d]);

            output_terminal_preamble termpream(source, queue[d], "prm", empty_state());

            boost::proto::eval(boost::proto::as_child(expr), termpream);

            source.begin_kernel("vexcl_eval_kernel");
            source.begin_kernel_parameters();
            source.parameter<size_t>("n");

            declare_expression_parameter declare(source, queue[d], "prm", empty_state());
            extract_terminals()(boost::proto::as_child(expr), declare);

            source.end_kernel_parameters();
            source.grid_stride_loop().open("{");

            output_local_preamble loc_init(source, queue[d], "prm", empty_state());
            boost::proto::eval(boost::proto::as_child(expr), loc_init);

            source.new_line();
            vector_expr_context expr_ctx(source, queue[d], "prm", empty_state());
            boost::proto::eval(boost::proto::as_child(expr), expr_ctx);
            source << ";";
            source.close("}");
            source.end_kernel();

            kernel = cache.insert(queue[d], backend::kernel(
                        queue[d], source.str(), "vexcl_eval_kernel"));
        }

        if (size_t psize = part[d + 1] - part[d]) {
            auto &K = kernel->second;
            K.push_arg(psize);
            set_expression_argument setarg(K, d, part[d], empty_state());
            extract_terminals()( boost::proto::as_child(expr), setarg);
            K(queue[d]);
        }
    }
}