Example #1
0
// NOTE(review): this first copy of reduce_by_key_sink is truncated by the
// example extraction -- it stops mid-function right after the VEX_FUNCTION
// declaration (no closing brace); a complete copy of the same function appears
// further down in this file. Code left byte-identical; comments only.
//
// Single-device reduce-by-key: adjacent elements of ivals whose keys (a
// boost::fusion tuple of key vectors) compare equal under Comp are combined
// with Oper. The enclosing template header (IKTuple, OKTuple, V, Comp, Oper)
// was lost in extraction as well.
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V>       &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;
    // Value type of the input keys; must match the output key type.
    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    // The algorithm below drives a single command queue only.
    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "reduce_by_key is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    // Work-group size: trivial (1) on CPU devices, 256 work-items on GPUs.
    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    // Scratch buffers for the block-wise scan phases.
    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<V>   offset    (queue[0], count);

    /***** Kernel 0 *****/
    // Presumably marks key-run boundaries in `offset` (kernel body not
    // visible here) -- the complete copy below scans these flags into
    // per-element segment indices.
    auto krn0 = offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    // Old-style VEX_FUNCTION signature; the second copy below uses the newer
    // VEX_FUNCTION(plus, int(int, int), "...") form. Truncated after this line.
    VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
/// Single-device reduce-by-key: combines runs of consecutive equal keys.
///
/// Adjacent elements of ivals whose keys compare equal under Comp are folded
/// together with Oper; okeys/ovals are resized to the number of key runs and
/// receive one (key, reduced value) pair per run.
///
/// \param ikeys boost::fusion tuple of input key vectors (single partition).
/// \param ivals input values; must have the same size as the keys.
/// \param okeys boost::fusion tuple of output key vectors (resized here).
/// \param ovals output values (resized here).
/// \return number of output elements (distinct key runs).
///
/// Comp is the key equality predicate and Oper the binary reduction
/// operation; both are type parameters only (their values are unused).
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V>       &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;
    // Value type of the input keys; must match the output key type.
    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            // Fixed copy/paste from the sort implementation: this is
            // reduce_by_key, not sorting.
            "reduce_by_key is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    // Work-group size: trivial (1) on CPU devices, 256 work-items on GPUs.
    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    // Scratch buffers for the block-wise scan phases.
    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    // Presumably marks key-run boundaries in `offset` (kernel body not
    // visible here); the inclusive scan right after turns the flags into
    // per-element segment indices.
    auto krn0 = detail::offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    VEX_FUNCTION(plus, int(int, int), "return prm1 + prm2;");
    // In-place inclusive scan (init = 0, exclusive = false).
    detail::scan(queue[0], offset, offset, 0, false, plus);

    /***** Kernel 1 *****/
    // Per-block segmented scan of the values; emits per-block carries
    // (key_sum / pre_sum) for the next phase.
    auto krn1 = is_cpu(queue[0]) ?
        detail::block_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn1.push_arg(count);
    krn1.push_arg(offset);
    krn1.push_arg(ivals(0));
    krn1.push_arg(offset_val);
    krn1.push_arg(key_sum);
    krn1.push_arg(pre_sum);

    krn1.config(num_blocks, NT);
    krn1(queue[0]);

    /***** Kernel 2 *****/
    // Single work-group scan across the per-block partial sums.
    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));

    auto krn2 = is_cpu(queue[0]) ?
        detail::block_inclusive_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_inclusive_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn2.push_arg(num_blocks);
    krn2.push_arg(key_sum);
    krn2.push_arg(pre_sum);
    krn2.push_arg(post_sum);
    krn2.push_arg(work_per_thread);

    krn2.config(1, NT);
    krn2(queue[0]);

    /***** Kernel 3 *****/
    // Folds the scanned block sums back into the per-element partial results.
    auto krn3 = detail::block_sum_by_key<V, Oper>(queue[0]);

    krn3.push_arg(count);
    krn3.push_arg(key_sum);
    krn3.push_arg(post_sum);
    krn3.push_arg(offset);
    krn3.push_arg(offset_val);

    krn3.config(num_blocks, NT);
    krn3(queue[0]);

    /***** resize okeys and ovals *****/
    // After the inclusive scan, offset[count - 1] is the index of the last
    // segment, so the output holds that value + 1 elements.
    int out_elements;
    offset.read(queue[0], count - 1, 1, &out_elements, true); // blocking read
    ++out_elements;

    boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements));
    ovals.resize(ivals.queue_list(), out_elements);

    /***** Kernel 4 *****/
    // Gathers one key and one reduced value per segment into the outputs.
    auto krn4 = detail::key_value_mapping<K, V>(queue[0]);

    krn4.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn4));
    boost::fusion::for_each(okeys, do_push_arg(krn4));
    krn4.push_arg(ovals(0));
    krn4.push_arg(offset);
    krn4.push_arg(offset_val);

    krn4(queue[0]);

    return out_elements;
}
Example #3
0
/// Single-device scan (prefix sum) of a device vector, inclusive or
/// exclusive.
///
/// Three-phase block-wise scan (kernel bodies not visible here; phase roles
/// inferred from the kernel names):
///   1. block_inclusive_scan       -- per-block scan over chunks of 2*NT
///                                    elements, producing block partials;
///   2. intra_block_inclusive_scan -- one work-group scans the block
///                                    partials;
///   3. block_addition             -- combines the scanned block partials
///                                    with the input to write the output.
///
/// \param queue     command queue; selects the device (single device only).
/// \param input     values to scan.
/// \param output    result; must have the same size as input.
/// \param init      initial value fed into the scan.
/// \param exclusive when true, performs an exclusive scan.
///
/// T is the element type and Oper the binary associative operation; the Oper
/// argument is a type parameter only (its value is unused).
void scan(
        backend::command_queue    const &queue,
        backend::device_vector<T> const &input,
        backend::device_vector<T>       &output,
        T init,
        bool exclusive,
        Oper
        )
{
    precondition(
            input.size() == output.size(),
            // Message used to say "inclusive_scan"; this function handles
            // the exclusive case as well.
            "Wrong output size in scan"
            );

    backend::select_context(queue);

    // Work-group size: trivial (1) on CPU devices, 256 work-items on GPUs.
    // Each block of the first phase covers 2*NT elements.
    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue) ? NT_cpu : NT_gpu;
    const int NT2 = 2 * NT;

    // Kernels take the flag as an int.
    int do_exclusive = exclusive ? 1 : 0;

    const size_t count         = input.size();
    const size_t num_blocks    = (count + NT2 - 1) / NT2;
    const size_t scan_buf_size = alignup(num_blocks, NT2);

    // Scratch buffers holding per-block partial results.
    backend::device_vector<T> pre_sum1(queue, scan_buf_size);
    backend::device_vector<T> pre_sum2(queue, scan_buf_size);
    backend::device_vector<T> post_sum(queue, scan_buf_size);

    // Kernel0: per-block inclusive scan, emitting block partials.
    auto krn0 = is_cpu(queue) ?
        block_inclusive_scan<NT_cpu, T, Oper>(queue) :
        block_inclusive_scan<NT_gpu, T, Oper>(queue);

    krn0.push_arg(count);
    krn0.push_arg(input);
    krn0.push_arg(init);
    krn0.push_arg(pre_sum1);
    krn0.push_arg(pre_sum2);
    krn0.push_arg(do_exclusive);

    krn0.config(num_blocks, NT);

    krn0(queue);

    // Kernel1: scan of the block partials in a single work-group.
    auto krn1 = is_cpu(queue) ?
        intra_block_inclusive_scan<NT_cpu, T, Oper>(queue) :
        intra_block_inclusive_scan<NT_gpu, T, Oper>(queue);

    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));
    krn1.push_arg(num_blocks);
    krn1.push_arg(post_sum);
    krn1.push_arg(pre_sum1);
    krn1.push_arg(init);
    krn1.push_arg(work_per_thread);

    krn1.config(1, NT);

    krn1(queue);

    // Kernel2: combine scanned block partials with the input into output.
    auto krn2 = is_cpu(queue) ?
        block_addition<NT_cpu, T, Oper>(queue) :
        block_addition<NT_gpu, T, Oper>(queue);

    krn2.push_arg(count);
    krn2.push_arg(input);
    krn2.push_arg(output);
    krn2.push_arg(post_sum);
    krn2.push_arg(pre_sum2);
    krn2.push_arg(init);
    krn2.push_arg(do_exclusive);

    // Twice as many groups as phase 1: each phase-1 block spanned NT2
    // elements, here each group handles NT.
    krn2.config(num_blocks * 2, NT);

    krn2(queue);
}