/// Select best launch configuration for the given shared memory requirements.
void config(const command_queue &q, std::function<size_t(size_t)> smem) {
    // Select workgroup size that would fit into the device.
    size_t ws = q.device().max_threads_per_block() / 2;

    size_t max_ws   = max_threads_per_block(q);
    size_t max_smem = max_shared_memory_per_block(q);

    // Reduce workgroup size until it satisfies resource requirements:
    while( (ws > max_ws) || (smem(ws) > max_smem) )
        ws /= 2;

    config(num_workgroups(q), ws);
}
/// Select best launch configuration for the given shared memory requirements.
void config(const cl::CommandQueue &queue, std::function<size_t(size_t)> smem) {
    cl::Device dev = queue.getInfo<CL_QUEUE_DEVICE>();

    if ( is_cpu(queue) ) {
        w_size = 1;
    } else {
        // Select workgroup size that would fit into the device.
        w_size = dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0] / 2;

        size_t max_ws   = max_threads_per_block(queue);
        size_t max_smem = max_shared_memory_per_block(queue);

        // Reduce workgroup size until it satisfies resource requirements:
        while( (w_size > max_ws) || (smem(w_size) > max_smem) )
            w_size /= 2;
    }

    g_size = w_size * num_workgroups(queue);
}
/// Select best launch configuration for the given shared memory requirements.
void config(const boost::compute::command_queue &queue, std::function<size_t(size_t)> smem) {
    boost::compute::device dev = queue.get_device();

    size_t ws;

    if ( is_cpu(queue) ) {
        ws = 1;
    } else {
        // Select workgroup size that would fit into the device.
        ws = dev.get_info<std::vector<size_t>>(CL_DEVICE_MAX_WORK_ITEM_SIZES)[0] / 2;

        size_t max_ws   = max_threads_per_block(queue);
        size_t max_smem = max_shared_memory_per_block(queue);

        // Reduce workgroup size until it satisfies resource requirements:
        while( (ws > max_ws) || (smem(ws) > max_smem) )
            ws /= 2;
    }

    config(num_workgroups(queue), ws);
}
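In each of the three variants above, the smem functor maps a candidate workgroup size to the number of bytes of shared (local) memory the kernel would need at that size. A minimal sketch of a call site, assuming a reduction-style kernel that keeps one double of local memory per work-item (the launch_config holder and the surrounding setup are hypothetical, not taken from the listings):

    launch_config cfg; // hypothetical object exposing the config() methods above

    cfg.config(queue, [](size_t ws) {
        // Bytes of local memory required at workgroup size ws:
        // one partial sum per work-item.
        return ws * sizeof(double);
    });

The loop halving the workgroup size then shrinks the launch until both the device's thread limit and this per-workgroup memory budget are satisfied.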
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<ExprTuple>::type,
        multivector_full_grammar
    >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
    && N == std::tuple_size<ExprTuple>::value
#endif
    ,
    const multivector&
>::type
operator=(const ExprTuple &expr) {
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device  device  = qdev(queue[d]);

        // Generate, compile, and cache the fused kernel on first use
        // with this context.
        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            std::ostringstream source;

            source << standard_kernel_header(device);

            {
                get_header f(source);
                for_each<0>(expr, f);
            }

            source << "kernel void multi_expr_tuple(\n"
                      "\t" << type_name<size_t>() << " n";

            for(uint i = 1; i <= N; i++)
                source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

            {
                get_params f(source);
                for_each<0>(expr, f);
            }

            source << "\n)\n{\n";

            // On CPU devices each work-item processes a contiguous chunk;
            // elsewhere a grid-stride loop is generated.
            if ( is_cpu(device) ) {
                source <<
                    "\tsize_t chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                    "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                    "\tsize_t chunk_end = min(n, chunk_start + chunk_size);\n"
                    "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
            } else {
                source <<
                    "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
            }

            {
                get_expressions f(source);
                for_each<0>(expr, f);
            }

            source << "\n";

            for(uint i = 1; i <= N; i++)
                source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, "multi_expr_tuple");
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        // Launch the kernel on each device that holds a non-empty partition.
        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, (*vec[i])(d));

            {
                set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                for_each<0>(expr, f);
            }

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
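The cache types are visible here only through their use. A minimal sketch consistent with that use (the exact layout is an assumption) pairs each compiled kernel with its preferred workgroup size and keys the map on the raw OpenCL context handle, so source generation and compilation happen at most once per context:

    #include <map>

    struct kernel_cache_entry {
        cl::Kernel kernel; // compiled kernel object
        size_t     wgsize; // size reported by kernel_workgroup_size()

        kernel_cache_entry(const cl::Kernel &k, size_t w)
            : kernel(k), wgsize(w) {}
    };

    // cl::Context::operator() yields the underlying cl_context handle,
    // which is what cache.find( context() ) above looks up.
    typedef std::map<cl_context, kernel_cache_entry> kernel_cache;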
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        multivector_expr_grammar
    >::value,
    const multivector&
>::type
operator=(const Expr& expr) {
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    // If any device in context is CPU, then do not fuse the kernel,
    // but assign components individually.
    if (std::any_of(queue.begin(), queue.end(),
                [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); }))
    {
        assign_subexpressions<0, N>(boost::proto::as_child(expr));
        return *this;
    }

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device  device  = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            // Derive a unique kernel name from the expression structure.
            std::ostringstream kernel_name;
            kernel_name << "multi_";
            vector_name_context name_ctx(kernel_name);
            boost::proto::eval(boost::proto::as_child(expr), name_ctx);

            std::ostringstream source;
            source << standard_kernel_header(device);

            extract_user_functions()(
                    boost::proto::as_child(expr),
                    declare_user_function(source)
                    );

            source << "kernel void " << kernel_name.str() << "(\n\t"
                   << type_name<size_t>() << " n";

            for(size_t i = 0; i < N; )
                source << ",\n\tglobal " << type_name<T>() << " *res_" << ++i;

            build_param_list<N>(boost::proto::as_child(expr), source);

            source << "\n)\n{\n"
                      "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

            build_expr_list(boost::proto::as_child(expr), source);

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, kernel_name.str().c_str());
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

            set_kernel_args<N>(
                    boost::proto::as_child(expr),
                    kernel->second.kernel, d, pos, vec[0]->part_start(d)
                    );

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
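From the user's side, the two assignment operators cover one fused expression applied to all components, and a tuple of N distinct expressions assigned component-wise. A hedged usage sketch (the multivector front end, ctx, n, and the component accessor x(0) are assumptions, not taken from the listings):

    multivector<double, 2> x(ctx, n), y(ctx, n);

    // One expression broadcast to every component: handled by the
    // multivector_expr_grammar overload, one fused kernel per device.
    y = 2 * x + 1;

    // A tuple of distinct per-component expressions: handled by the
    // ExprTuple overload, still fused into a single generated kernel.
    y = std::tie(x(0) + x(1), x(0) - x(1));

In both cases the generated kernel is compiled once per context and re-launched afterwards; only when a CPU device is present does the second overload fall back to component-wise assignment.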