Example #1
backend::kernel key_value_mapping(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel    = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        src.kernel("key_value_mapping")
            .open("(")
                .template parameter< size_t >("n");

        boost::mpl::for_each<K>(pointer_param<global_ptr, true>(src, "ikeys"));
        boost::mpl::for_each<K>(pointer_param<global_ptr      >(src, "okeys"));

        src.template parameter< global_ptr<V>       >("ovals");
        src.template parameter< global_ptr<int>     >("offset");
        src.template parameter< global_ptr<const V> >("ivals");
        src.close(")").open("{");

        src.new_line().grid_stride_loop().open("{");

        src.new_line() << "int num_sections = offset[n - 1] + 1;";

        src.new_line() << "int off = offset[idx];";
        src.new_line() << "if (idx < (n - 1) && off != offset[idx + 1])";
        src.open("{");
        for(int p = 0; p < boost::mpl::size<K>::value; ++p)
            src.new_line() << "okeys" << p << "[off] = ikeys" << p << "[idx];";
        src.new_line() << "ovals[off] = ivals[idx];";
        src.close("}");

        src.new_line() << "if (idx == (n - 1))";
        src.open("{");
        for(int p = 0; p < boost::mpl::size<K>::value; ++p)
            src.new_line() << "okeys" << p << "[num_sections - 1] = ikeys" << p << "[idx];";
        src.new_line() << "ovals[num_sections - 1] = ivals[idx];";
        src.close("}");

        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "key_value_mapping");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
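
The generated kernel is a compaction pass: wherever the scanned offset changes
between neighboring elements, the key/value pair at that index is written to
the output position given by the offset, and the last element closes the final
segment. A minimal serial C++ sketch of the same logic, assuming a single key
component (the generator loops over every component of the key tuple K):

#include <cstddef>
#include <vector>

// Serial sketch of the generated key_value_mapping kernel for one key
// component; assumes n > 0. 'offset' is the inclusive scan of the
// segment-start flags produced by offset_calculation (Example #3).
template <typename K, typename V>
void key_value_mapping_ref(const std::vector<K> &ikeys,
                           const std::vector<V> &ivals,
                           const std::vector<int> &offset,
                           std::vector<K> &okeys, std::vector<V> &ovals)
{
    size_t n = ikeys.size();
    int num_sections = offset[n - 1] + 1;

    for (size_t idx = 0; idx < n; ++idx) {
        int off = offset[idx];
        if (idx < n - 1 && off != offset[idx + 1]) {
            okeys[off] = ikeys[idx];
            ovals[off] = ivals[idx];
        }
        if (idx == n - 1) {
            okeys[num_sections - 1] = ikeys[idx];
            ovals[num_sections - 1] = ivals[idx];
        }
    }
}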
Example #2
backend::kernel block_sum_by_key(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel    = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_sum_by_key")
            .open("(")
                .template parameter< size_t                >("n")
                .template parameter< global_ptr<const int> >("key_sum")
                .template parameter< global_ptr<const T>   >("post_sum")
                .template parameter< global_ptr<const int> >("keys")
                .template parameter< global_ptr<T>         >("output")
            .close(")").open("{");

        src.new_line() << "size_t g_id  = " << src.global_id(0)  << ";";
        src.new_line() << "size_t block = " << src.group_id(0)   << ";";

        src.new_line() << "if (g_id >= n) return;";

        // Compare against the previous block's last key and the next element's
        // key to detect a segment that spans the block boundary.
        src.new_line() << "int key2 = keys[ g_id ];";
        src.new_line() << "int key1 = (block > 0    ) ? key_sum[ block - 1 ] : key2 - 1;";
        src.new_line() << "int key3 = (g_id  < n - 1) ? keys   [ g_id  + 1 ] : key2 - 1;";

        src.new_line() << "if (block > 0 && key1 == key2 && key2 != key3)";
        src.open("{");
        src.new_line() << type_name<T>() << " scan_result    = output  [ g_id      ];";
        src.new_line() << type_name<T>() << " post_block_sum = post_sum[ block - 1 ];";
        src.new_line() << "output[ g_id ] = oper( scan_result, post_block_sum );";
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_sum_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
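
The kernel only touches elements that close a segment spanning a block
boundary: key_sum holds each block's last key, post_sum the scanned per-block
sums. A serial C++ sketch of the same condition, with wgsz standing in for the
work-group size:

#include <cstddef>
#include <functional>
#include <vector>

// Serial sketch of block_sum_by_key: when a segment spans a block boundary,
// fold the carry from the preceding blocks into the element that closes it.
template <typename T>
void block_sum_by_key_ref(size_t wgsz,
                          const std::vector<int> &key_sum,  // last key per block
                          const std::vector<T>   &post_sum, // scanned block sums
                          const std::vector<int> &keys,
                          std::vector<T> &output,
                          std::function<T(T, T)> oper)
{
    size_t n = keys.size();
    for (size_t g_id = 0; g_id < n; ++g_id) {
        size_t block = g_id / wgsz;
        int key2 = keys[g_id];
        int key1 = block > 0    ? key_sum[block - 1] : key2 - 1;
        int key3 = g_id < n - 1 ? keys[g_id + 1]     : key2 - 1;

        if (block > 0 && key1 == key2 && key2 != key3)
            output[g_id] = oper(output[g_id], post_sum[block - 1]);
    }
}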
Example #3
backend::kernel offset_calculation(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel    = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Comp::define(src, "comp");

        src.kernel("offset_calculation")
            .open("(")
            .template parameter< size_t >("n");

        boost::mpl::for_each<T>(pointer_param<global_ptr, true>(src, "keys"));

        src.template parameter< global_ptr<int> >("offsets");
        src.close(")").open("{");

        src.new_line().grid_stride_loop().open("{");
        // offsets[idx] = 1 where the key tuple differs from its predecessor.
        src.new_line()
            << "if (idx > 0)"
            << " offsets[idx] = !comp(";
        for(int p = 0; p < boost::mpl::size<T>::value; ++p)
            src << (p ? ", " : "") << "keys" << p << "[idx - 1]";
        for(int p = 0; p < boost::mpl::size<T>::value; ++p)
            src << ", keys" << p << "[idx]";
        src << ");";
        src.new_line() << "else offsets[idx] = 0;";
        src.close("}");
        src.close("}");

        backend::kernel krn(queue, src.str(), "offset_calculation");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
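
The result is a segment-start flag per element: offsets[idx] is 1 where the
key tuple at idx differs (under comp) from the one at idx - 1, and 0
otherwise; an inclusive scan of these flags then assigns each element its
segment index. A serial sketch for a single key component:

#include <cstddef>
#include <vector>

// Serial sketch of offset_calculation for one key component. comp(a, b) is
// the equivalence predicate the generator defines under the name "comp".
template <typename K, typename Comp>
void offset_calculation_ref(const std::vector<K> &keys,
                            std::vector<int> &offsets, Comp comp)
{
    for (size_t idx = 0; idx < keys.size(); ++idx)
        offsets[idx] = (idx > 0) ? !comp(keys[idx - 1], keys[idx]) : 0;
}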
Example #4
backend::kernel block_inclusive_scan_by_key(const backend::command_queue &queue)
{
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel    = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_inclusive_scan_by_key")
            .open("(")
                .template parameter< size_t                >("n")
                .template parameter< global_ptr<const int> >("key_sum")
                .template parameter< global_ptr<const T>   >("pre_sum")
                .template parameter< global_ptr<T>         >("post_sum")
                .template parameter< cl_uint               >("work_per_thread")
            .close(")").open("{");

        src.new_line() << "size_t l_id   = " << src.local_id(0)   << ";";
        src.new_line() << "size_t g_id   = " << src.global_id(0)  << ";";
        src.new_line() << "size_t wgsz   = " << src.local_size(0) << ";";
        src.new_line() << "size_t map_id = g_id * work_per_thread;";

        src.new_line() << "struct Shared";
        src.open("{");
            src.new_line() << "int keys[" << NT << "];";
            src.new_line() << type_name<T>() << " vals[" << NT << "];";
        src.close("};");

        src.smem_static_var("struct Shared", "shared");

        src.new_line() << "uint offset;";
        src.new_line() << "int  key;";
        src.new_line() << type_name<T>() << " work_sum;";

        src.new_line() << "if (map_id < n)";
        src.open("{");
        src.new_line() << "int prev_key;";

        // accumulate zeroth value manually
        src.new_line() << "offset   = 0;";
        src.new_line() << "key      = key_sum[map_id];";
        src.new_line() << "work_sum = pre_sum[map_id];";

        src.new_line() << "post_sum[map_id] = work_sum;";

        // Serial accumulation over this thread's work_per_thread elements.
        src.new_line() << "for( offset = offset + 1; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line() << "prev_key = key;";
        src.new_line() << "key      = key_sum[ map_id + offset ];";

        src.new_line() << "if ( map_id + offset < n )";
        src.open("{");
        src.new_line() << type_name<T>() << " y = pre_sum[ map_id + offset ];";

        src.new_line() << "if ( key == prev_key ) work_sum = oper( work_sum, y );";
        src.new_line() << "else work_sum = y;";

        src.new_line() << "post_sum[ map_id + offset ] = work_sum;";
        src.close("}");
        src.close("}");
        src.close("}");
        src.new_line().barrier();

        // load LDS with register sums
        src.new_line() << "shared.vals[ l_id ] = work_sum;";
        src.new_line() << "shared.keys[ l_id ] = key;";

        // Scan the per-thread partial sums in local memory.
        src.new_line() << type_name<T>() << " scan_sum = work_sum;";

        src.new_line() << "for( offset = 1; offset < wgsz; offset *= 2 )";
        src.open("{");
        src.new_line().barrier();

        src.new_line() << "if (map_id < n)";
        src.open("{");
        src.new_line() << "if (l_id >= offset)";
        src.open("{");
        src.new_line() << "int key1 = shared.keys[ l_id ];";
        src.new_line() << "int key2 = shared.keys[ l_id - offset ];";

        src.new_line() << "if ( key1 == key2 ) scan_sum = oper( scan_sum, shared.vals[ l_id - offset ] );";
        src.new_line() << "else scan_sum = shared.vals[ l_id ];";
        src.close("}");

        src.close("}");
        src.new_line().barrier();

        src.new_line() << "shared.vals[ l_id ] = scan_sum;";
        src.close("}");

        src.new_line().barrier();

        // write final scan from pre-scan and lds scan
        src.new_line() << "for( offset = 0; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line().barrier(true);

        src.new_line() << "if (map_id < n && l_id > 0)";
        src.open("{");
        src.new_line() << type_name<T>() << " y = post_sum[ map_id + offset ];";
        src.new_line() << "int key1 = key_sum    [ map_id + offset ];";
        src.new_line() << "int key2 = shared.keys[ l_id - 1 ];";

        src.new_line() << "if ( key1 == key2 ) y = oper( y, shared.vals[l_id - 1] );";

        src.new_line() << "post_sum[ map_id + offset ] = y;";
        src.close("}");
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_inclusive_scan_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
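
Despite the two-level structure (serial accumulation over work_per_thread
items, then a Hillis-Steele style scan in local memory), the net effect of the
kernel, assuming it is launched as a single work group covering all n items,
is an inclusive scan of pre_sum that restarts whenever key_sum changes. A
serial C++ equivalent of that net effect:

#include <cstddef>
#include <functional>
#include <vector>

// Serial equivalent of block_inclusive_scan_by_key's net result: an
// inclusive scan of pre_sum, segmented by runs of equal keys in key_sum.
template <typename T>
void inclusive_scan_by_key_ref(const std::vector<int> &key_sum,
                               const std::vector<T>   &pre_sum,
                               std::vector<T>         &post_sum,
                               std::function<T(T, T)> oper)
{
    for (size_t i = 0; i < pre_sum.size(); ++i)
        post_sum[i] = (i > 0 && key_sum[i] == key_sum[i - 1])
                    ? oper(post_sum[i - 1], pre_sum[i])
                    : pre_sum[i];
}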
Example #5
backend::kernel block_scan_by_key(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel    = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_scan_by_key")
            .open("(")
                .template parameter< size_t                >("n")
                .template parameter< global_ptr<const int> >("keys")
                .template parameter< global_ptr<const T>   >("vals")
                .template parameter< global_ptr<T>         >("output")
                .template parameter< global_ptr<int>       >("key_buf")
                .template parameter< global_ptr<T>         >("val_buf")
            .close(")").open("{");

        src.new_line() << "size_t l_id  = " << src.local_id(0)   << ";";
        src.new_line() << "size_t g_id  = " << src.global_id(0)  << ";";
        src.new_line() << "size_t block = " << src.group_id(0)   << ";";
        src.new_line() << "size_t wgsz  = " << src.local_size(0) << ";";

        src.new_line() << "struct Shared";
        src.open("{");
            src.new_line() << "int keys[" << NT << "];";
            src.new_line() << type_name<T>() << " vals[" << NT << "];";
        src.close("};");

        src.smem_static_var("struct Shared", "shared");

        src.new_line() << "int key;";
        src.new_line() << type_name<T>() << " val;";

        src.new_line() << "if (g_id < n)";
        src.open("{");
        src.new_line() << "key = keys[g_id];";
        src.new_line() << "val = vals[g_id];";
        src.new_line() << "shared.keys[l_id] = key;";
        src.new_line() << "shared.vals[l_id] = val;";
        src.close("}");

        // Compute a scan within the work group; this updates vals in LDS
        // but leaves keys unchanged.
        src.new_line() << type_name<T>() << " sum = val;";
        src.new_line() << "for(size_t offset = 1; offset < wgsz; offset *= 2)";
        src.open("{");
        src.new_line().barrier();
        src.new_line() << "if (l_id >= offset && shared.keys[l_id - offset] == key)";
        src.open("{");
        src.new_line() << "sum = oper(sum, shared.vals[l_id - offset]);";
        src.close("}");
        src.new_line().barrier();
        src.new_line() << "shared.vals[l_id] = sum;";
        src.close("}");
        src.new_line().barrier();

        src.new_line() << "if (g_id >= n) return;";

        // Each work item writes out its calculated scan result, relative to
        // the beginning of its work group.
        src.new_line() << "int key2 = -1;";
        src.new_line() << "if (g_id < n - 1) key2 = keys[g_id + 1];";
        src.new_line() << "if (key != key2) output[g_id] = sum;";

        src.new_line() << "if (l_id == 0)";
        src.open("{");
        src.new_line() << "key_buf[block] = shared.keys[wgsz - 1];";
        src.new_line() << "val_buf[block] = shared.vals[wgsz - 1];";
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_scan_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
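
Taken together, Examples #1 and #3-#5 look like the usual segmented-reduction
pipeline: flag segment starts, scan within blocks, scan the block carries,
apply the carries, then compact one result per segment. A hypothetical serial
reference of the end-to-end result these kernels appear to cooperate on (plain
key equality stands in for the comp predicate):

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Serial reduce-by-key: collapse each run of equal keys into a single
// output pair, folding the run's values with 'oper'.
template <typename K, typename V>
std::pair<std::vector<K>, std::vector<V>>
reduce_by_key_ref(const std::vector<K> &keys, const std::vector<V> &vals,
                  std::function<V(V, V)> oper)
{
    std::vector<K> okeys;
    std::vector<V> ovals;
    for (size_t i = 0; i < keys.size(); ++i) {
        if (i == 0 || keys[i] != keys[i - 1]) { // a new segment starts here
            okeys.push_back(keys[i]);
            ovals.push_back(vals[i]);
        } else {
            ovals.back() = oper(ovals.back(), vals[i]);
        }
    }
    return {okeys, ovals};
}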
Example #6
        typename std::enable_if<
            !boost::proto::matches<
                typename boost::proto::result_of::as_expr<ExprTuple>::type,
                multivector_full_grammar
            >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
            && N == std::tuple_size<ExprTuple>::value
#endif
            , const multivector&
        >::type
        operator=(const ExprTuple &expr) {
#endif
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream source;

                    source << standard_kernel_header(device);

                    {
                        get_header f(source);
                        for_each<0>(expr, f);
                    }

                    source <<
                        "kernel void multi_expr_tuple(\n"
                        "\t" << type_name<size_t>() << " n";

                    for(uint i = 1; i <= N; i++)
                        source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

                    {
                        get_params f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n)\n{\n";

                    if ( is_cpu(device) ) {
                        source <<
                            "\tsize_t chunk_size  = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                            "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                            "\tsize_t chunk_end   = min(n, chunk_start + chunk_size);\n"
                            "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
                    } else {
                        source <<
                            "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
                    }

                    {
                        get_expressions f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n";

                    for(uint i = 1; i <= N; i++)
                        source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, "multi_expr_tuple");
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, (*vec[i])(d));

                    {
                        set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                        for_each<0>(expr, f);
                    }

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
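
Both multivector assignment operators reuse the memoization idiom that the
kernel generators above follow: a function-local static cache keyed by the
OpenCL context, with the kernel compiled from generated source only on a
cache miss. A minimal self-contained sketch of the idiom (Kernel and
ContextId are hypothetical stand-ins for the backend types):

#include <map>
#include <string>

struct Kernel { std::string source; };  // hypothetical compiled-kernel handle
using ContextId = void*;                // hypothetical context key

// Build-once-per-context cache: generate and compile the source only the
// first time a given context asks for this kernel.
Kernel &cached_kernel(ContextId ctx) {
    static std::map<ContextId, Kernel> cache;

    auto kernel = cache.find(ctx);
    if (kernel == cache.end()) {
        Kernel krn{"kernel void f() {}"}; // source generation would go here
        kernel = cache.insert(std::make_pair(ctx, krn)).first;
    }
    return kernel->second;
}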
Example #7
        typename std::enable_if<
            boost::proto::matches<
                typename boost::proto::result_of::as_expr<Expr>::type,
                multivector_expr_grammar
            >::value,
            const multivector&
        >::type
        operator=(const Expr& expr) {
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            // If any device in the context is a CPU, do not fuse the kernel;
            // assign the components individually instead.
            if (std::any_of(queue.begin(), queue.end(), [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); })) {
                assign_subexpressions<0, N>(boost::proto::as_child(expr));
                return *this;
            }

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream kernel_name;
                    kernel_name << "multi_";
                    vector_name_context name_ctx(kernel_name);
                    boost::proto::eval(boost::proto::as_child(expr), name_ctx);

                    std::ostringstream source;
                    source << standard_kernel_header(device);

                    extract_user_functions()(
                            boost::proto::as_child(expr),
                            declare_user_function(source)
                            );

                    source << "kernel void " << kernel_name.str()
                           << "(\n\t" << type_name<size_t>() << " n";

                    for(size_t i = 0; i < N; )
                        source << ",\n\tglobal " << type_name<T>()
                               << " *res_" << ++i;

                    build_param_list<N>(boost::proto::as_child(expr), source);

                    source <<
                        "\n)\n{\n"
                        "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

                    build_expr_list(boost::proto::as_child(expr), source);

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, kernel_name.str().c_str());
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

                    set_kernel_args<N>(
                            boost::proto::as_child(expr),
                            kernel->second.kernel, d, pos, vec[0]->part_start(d)
                            );

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
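
For reference, the source this generator emits has a fixed shape: a size
parameter, one res_i pointer per component, the parameters collected by
build_param_list, and a grid-stride loop whose body assigns each component.
An illustrative C++ raw string of what it might look like for N = 2, assuming
type_name<size_t>() renders as ulong and an elementwise sum/difference
expression (the real kernel name is derived from the expression itself):

// Illustrative only; parameter list and loop body depend on the expression.
const char *generated_source = R"cl(
kernel void multi_expr(
    ulong n,
    global double *res_1,
    global double *res_2,
    global const double *prm_1,
    global const double *prm_2
)
{
    for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {
        res_1[idx] = prm_1[idx] + prm_2[idx];
        res_2[idx] = prm_1[idx] - prm_2[idx];
    }
}
)cl";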
Example #8
 /// Apply the kernel to obtain a similarity metric.
 double operator()(const FeatVec<K>& x) const {
   return krn(x, m, sigma);
 }
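
The snippet does not show what krn, m, or sigma are; one common reading is a
radial-basis-function similarity between the input features and a stored
reference vector m with bandwidth sigma. A purely hypothetical sketch of such
a krn:

#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical RBF similarity: exp(-||x - m||^2 / (2 sigma^2)).
// The actual krn in the source may differ; this is one plausible choice.
template <typename Vec>
double rbf_kernel(const Vec &x, const Vec &m, double sigma) {
    double d2 = 0;
    for (size_t i = 0; i < x.size(); ++i) {
        double diff = x[i] - m[i];
        d2 += diff * diff;
    }
    return std::exp(-d2 / (2 * sigma * sigma));
}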