Exemple #1
0
// Implementation entry point for reduce_by_key on a single compute device.
// Takes Boost.Fusion sequences of input/output key vectors plus the input and
// output value vectors; the two unnamed trailing parameters carry only the
// key-comparison (Comp) and reduction (Oper) functor types.
//
// NOTE(review): this listing stops right after the `plus` functor is defined;
// the remaining stages and the closing brace of the function are not present
// here, so only the setup and the first kernel launch are documented.
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V>       &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;
    // K is whatever extract_value_types yields for the input key tuple.
    typedef typename extract_value_types<IKTuple>::type K;

    // The output key tuple must yield exactly the same type.
    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    // Both the first key vector and the values must consist of a single part.
    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "reduce_by_key is only supported for single device contexts"
            );

    // Only the first key vector's size is compared against the values.
    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    // All work below is issued on the first queue of the first key vector.
    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    // NT is 1 when is_cpu() reports a CPU queue and 256 otherwise
    // (presumably the work-group size of later kernels -- not visible here).
    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    // num_blocks is count / NT rounded up; scan_buf_size is that value passed
    // through alignup(., NT) (presumably rounding up to a multiple of NT).
    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    // Scratch buffers: three sized by scan_buf_size, two sized by count.
    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    // offset_calculation<K, Comp> receives: element count, every input key
    // buffer (pushed via do_push_arg), and the `offset` scratch buffer.
    auto krn0 = offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    // Declares `plus`, an int(int, int) function whose body is `return x + y;`.
    VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
/// Implementation entry point for reduce_by_key on a single compute device.
///
/// Runs a fixed pipeline of five kernels (offset calculation, block scan by
/// key, inclusive scan of the per-block partials, block sum by key, key/value
/// mapping) on the first queue of the first input key vector, then resizes
/// and fills the outputs.
///
/// \param ikeys  Boost.Fusion sequence of input key vectors. Element 0
///               supplies the queue list and the element count.
/// \param ivals  Input values; must have the same size as the first key
///               vector and consist of a single part.
/// \param okeys  Boost.Fusion sequence of output key vectors; every element is
///               resized to the returned element count.
/// \param ovals  Output values; resized to the returned element count.
/// \param (Comp) Unnamed; only its type is used, as the comparison parameter
///               of the offset_calculation kernel.
/// \param (Oper) Unnamed; only its type is used, as the operation parameter
///               of the scan/sum kernels.
/// \return The last element of the scanned `offset` buffer plus one
///         (presumably the number of distinct key groups), or 0 when the
///         input is empty.
///
/// Fails the `precondition` checks when the inputs span more than one part or
/// when keys and values differ in size.
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V>       &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;
    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    // The message names this algorithm (it used to say "Sorting", a leftover
    // from another routine) so that a failed check points at the right call.
    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "reduce_by_key is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    size_t count = fusion::at_c<0>(ikeys).size();

    // Empty input: there is no last element to read back (count - 1 would
    // wrap around as size_t) and nothing for the kernels to process, so just
    // shrink the outputs to zero elements and report zero results.
    if (count == 0) {
        int out_elements = 0;

        boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements));
        ovals.resize(ivals.queue_list(), out_elements);

        return out_elements;
    }

    // Launch width: 1 on CPU queues, 256 otherwise. The same values are used
    // as compile-time parameters of the scan kernels below.
    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    // One block per NT input elements (rounded up); the per-block scratch
    // buffers are sized by alignup(num_blocks, NT).
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    // Arguments: element count, all input key buffers, `offset`.
    auto krn0 = detail::offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    // Scan `offset` in place with integer addition; its last element is read
    // back further down to size the outputs.
    VEX_FUNCTION(plus, int(int, int), "return prm1 + prm2;");
    detail::scan(queue[0], offset, offset, 0, false, plus);

    /***** Kernel 1 *****/
    // Arguments: count, offset, input values, offset_val, key_sum, pre_sum.
    auto krn1 = is_cpu(queue[0]) ?
        detail::block_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn1.push_arg(count);
    krn1.push_arg(offset);
    krn1.push_arg(ivals(0));
    krn1.push_arg(offset_val);
    krn1.push_arg(key_sum);
    krn1.push_arg(pre_sum);

    krn1.config(num_blocks, NT);
    krn1(queue[0]);

    /***** Kernel 2 *****/
    // Configured with a single block of NT threads; work_per_thread is
    // scan_buf_size / NT, but never less than 1.
    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));

    auto krn2 = is_cpu(queue[0]) ?
        detail::block_inclusive_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_inclusive_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn2.push_arg(num_blocks);
    krn2.push_arg(key_sum);
    krn2.push_arg(pre_sum);
    krn2.push_arg(post_sum);
    krn2.push_arg(work_per_thread);

    krn2.config(1, NT);
    krn2(queue[0]);

    /***** Kernel 3 *****/
    // Arguments: count, key_sum, post_sum, offset, offset_val.
    auto krn3 = detail::block_sum_by_key<V, Oper>(queue[0]);

    krn3.push_arg(count);
    krn3.push_arg(key_sum);
    krn3.push_arg(post_sum);
    krn3.push_arg(offset);
    krn3.push_arg(offset_val);

    krn3.config(num_blocks, NT);
    krn3(queue[0]);

    /***** resize okeys and ovals *****/
    // Blocking read of the last scanned offset; the output length is that
    // value plus one.
    int out_elements;
    offset.read(queue[0], count - 1, 1, &out_elements, true);
    ++out_elements;

    boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements));
    ovals.resize(ivals.queue_list(), out_elements);

    /***** Kernel 4 *****/
    // Arguments: count, input keys, output keys, output values, offset,
    // offset_val.
    auto krn4 = detail::key_value_mapping<K, V>(queue[0]);

    krn4.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn4));
    boost::fusion::for_each(okeys, do_push_arg(krn4));
    krn4.push_arg(ovals(0));
    krn4.push_arg(offset);
    krn4.push_arg(offset_val);

    krn4(queue[0]);

    return out_elements;
}