Exemplos de uint_ em C++ (Cpp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: geometry_svg_generator_impl.hpp Projeto: cquest/mapnik

    svg_path_generator<OutputIterator,Path>::svg_path_generator()
        : svg_path_generator::base_type(svg)
    {
        boost::spirit::karma::uint_type uint_;
        boost::spirit::karma::_val_type _val;
        boost::spirit::karma::_1_type _1;
        boost::spirit::karma::lit_type lit;

        svg = point | linestring | polygon
            ;

        point = &uint_(mapnik::geometry::geometry_types::Point)[_1 = _type(_val)]
            << svg_point [_1 = _first(_val)]
            ;

        svg_point = &uint_
            << lit("cx=\"") << coordinate
            << lit("\" cy=\"") << coordinate
            << lit('\"')
            ;

        linestring = &uint_(mapnik::geometry::geometry_types::LineString)[_1 = _type(_val)]
            << lit("d=\"") << svg_path << lit("\"")
            ;

        polygon = &uint_(mapnik::geometry::geometry_types::Polygon)[_1 = _type(_val)]
            << lit("d=\"") << svg_path << lit("\"")
            ;

        svg_path %= ((&uint_(mapnik::SEG_MOVETO) << lit('M')
                      | &uint_(mapnik::SEG_LINETO) << lit('L'))
                     << coordinate << lit(' ') << coordinate) % lit(' ')
                     ;

    }

Exemplo n.º 2

0

Exibir arquivo

Arquivo: wkt_generator.cpp Projeto: Jiangyangyang/mapnik

wkt_generator<OutputIterator, Geometry>::wkt_generator(bool single)
    : wkt_generator::base_type(wkt)
{
    boost::spirit::karma::uint_type uint_;
    boost::spirit::karma::_val_type _val;
    boost::spirit::karma::_1_type _1;
    boost::spirit::karma::lit_type lit;
    boost::spirit::karma::_a_type _a;
    boost::spirit::karma::_b_type _b;
    boost::spirit::karma::_c_type _c;
    boost::spirit::karma::_r1_type _r1;
    boost::spirit::karma::eps_type eps;
    boost::spirit::karma::string_type kstring;

    wkt = point | linestring | polygon
        ;

    point = &uint_(mapnik::geometry_type::types::Point)[_1 = _type(_val)]
        << kstring[ phoenix::if_ (single) [_1 = "Point("]
                   .else_[_1 = "("]]
        << point_coord [_1 = _first(_val)] << lit(')')
        ;

    linestring = &uint_(mapnik::geometry_type::types::LineString)[_1 = _type(_val)]
        << kstring[ phoenix::if_ (single) [_1 = "LineString("]
                   .else_[_1 = "("]]
        << coords
        << lit(')')
        ;

    polygon = &uint_(mapnik::geometry_type::types::Polygon)[_1 = _type(_val)]
        << kstring[ phoenix::if_ (single) [_1 = "Polygon("]
                   .else_[_1 = "("]]
        << coords2
        << lit("))")
        ;

    point_coord = &uint_ << coordinate << lit(' ') << coordinate
        ;

    polygon_coord %= ( &uint_(mapnik::SEG_MOVETO)
                       << eps[_r1 += 1][_a  = _x(_val)][ _b = _y(_val)]
                       << kstring[ if_ (_r1 > 1) [_1 = "),("]
                                  .else_[_1 = "("]]
                       |
                       &uint_(mapnik::SEG_LINETO)
                       << lit(',') << eps[_a = _x(_val)][_b = _y(_val)]
        )
        << coordinate[_1 = _a]
        << lit(' ')
        << coordinate[_1 = _b]
        ;

    coords2 %= *polygon_coord(_a,_b,_c)
        ;

    coords = point_coord % lit(',')
        ;
}

Exemplo n.º 3

0

Exibir arquivo

Arquivo: geometry_generator_grammar.hpp Projeto: Jiangyangyang/mapnik

    geometry_generator_grammar()
        : geometry_generator_grammar::base_type(coordinates)
    {
        boost::spirit::karma::uint_type uint_;
        boost::spirit::bool_type bool_;
        boost::spirit::karma::_val_type _val;
        boost::spirit::karma::_1_type _1;
        boost::spirit::karma::lit_type lit;
        boost::spirit::karma::_a_type _a;
        boost::spirit::karma::_r1_type _r1;
        boost::spirit::karma::eps_type eps;
        boost::spirit::karma::string_type kstring;

        coordinates =  point | linestring | polygon
            ;

        point = &uint_(mapnik::geometry_type::types::Point)[_1 = _type(_val)]
            << point_coord [_1 = _first(_val)]
            ;

        linestring = &uint_(mapnik::geometry_type::types::LineString)[_1 = _type(_val)]
            << lit('[')
            << coords
            << lit(']')
            ;

        polygon = &uint_(mapnik::geometry_type::types::Polygon)[_1 = _type(_val)]
            << lit('[')
            << coords2
            << lit("]]")
            ;

        point_coord = &uint_
            << lit('[')
            << coord_type << lit(',') << coord_type
            << lit(']')
            ;

        polygon_coord %= ( &uint_(mapnik::SEG_MOVETO) << eps[_r1 += 1]
                           << kstring[ if_ (_r1 > 1) [_1 = "],["]
                                             .else_[_1 = '[' ]]
                           |
                           &uint_(mapnik::SEG_LINETO)
                           << lit(',')) << lit('[') << coord_type << lit(',') << coord_type << lit(']')
            ;

        coords2 %= *polygon_coord(_a)
            ;

        coords = point_coord % lit(',')
            ;

    }

Exemplo n.º 4

0

Exibir arquivo

Arquivo: search_n.hpp Projeto: alanwoolley/innoextract-android

inline TextIterator search_n(TextIterator t_first,
                             TextIterator t_last,
                             size_t n,
                             ValueType value,
                             command_queue &queue = system::default_queue())
{
    // there is no need to check if pattern starts at last n - 1 indices
    vector<uint_> matching_indices(
        detail::iterator_range_size(t_first, t_last) + 1 - n,
        queue.get_context()
    );

    // search_n_kernel puts value 1 at every index in vector where pattern
    // of n values starts at
    detail::search_n_kernel<TextIterator,
                            vector<uint_>::iterator> kernel;

    kernel.set_range(t_first, t_last, value, n, matching_indices.begin());
    kernel.exec(queue);

    vector<uint_>::iterator index = ::boost::compute::find(
        matching_indices.begin(), matching_indices.end(), uint_(1), queue
    );

    // pattern was not found
    if(index == matching_indices.end())
        return t_last;

    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
}

Exemplo n.º 5

0

Exibir arquivo

Arquivo: uniform_int_distribution.hpp Projeto: Soledad89/compute

    void generate(OutputIterator first,
                  OutputIterator last,
                  Generator &generator,
                  command_queue &queue)
    {
        size_t size = std::distance(first, last);
        typedef typename Generator::result_type g_result_type;

        vector<g_result_type> tmp(size, queue.get_context());
        vector<g_result_type> tmp2(size, queue.get_context());

        uint_ bound = ((uint_(-1))/(m_b-m_a+1))*(m_b-m_a+1);

        buffer_iterator<g_result_type> tmp2_iter;

        while(size>0)
        {
            generator.generate(tmp.begin(), tmp.begin() + size, queue);
            tmp2_iter = copy_if(tmp.begin(), tmp.begin() + size, tmp2.begin(),
                                _1 <= bound, queue);
            size = std::distance(tmp2_iter, tmp2.end());
        }

        BOOST_COMPUTE_FUNCTION(IntType, scale_random, (const g_result_type x),
        {
            return LO + (x % (HI-LO+1));
        });

Exemplo n.º 6

0

Exibir arquivo

Arquivo: program.hpp Projeto: Quanteek/compute

    /**
     * In case BOOST_COMPUTE_USE_OFFLINE_CACHE macro is defined,
     * the compiled binary is stored for reuse in the offline cache located in
     * $HOME/.boost_compute on UNIX-like systems and in %APPDATA%/boost_compute
     * on Windows.
     */
    static program build_with_source(
            const std::string &source,
            const context     &context,
            const std::string &options = std::string()
            )
    {
#ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE
        // Get hash string for the kernel.
        std::string hash;
        {
            device   d(context.get_device());
            platform p(d.get_info<cl_platform_id>(CL_DEVICE_PLATFORM));

            std::ostringstream src;
            src << "// " << p.name() << " v" << p.version() << "\n"
                << "// " << context.get_device().name() << "\n"
                << "// " << options << "\n\n"
                << source;

            hash = detail::sha1(src.str());
        }

        // Try to get cached program binaries:
        try {
            boost::optional<program> prog = load_program_binary(hash, context);

            if (prog) {
                prog->build(options);
                return *prog;
            }
        } catch (...) {
            // Something bad happened. Fallback to normal compilation.
        }

        // Cache is apparently not available. Just compile the sources.
#endif
        const char *source_string = source.c_str();

        cl_int error = 0;
        cl_program program_ = clCreateProgramWithSource(context,
                                                        uint_(1),
                                                        &source_string,
                                                        0,
                                                        &error);
        if(!program_){
            BOOST_THROW_EXCEPTION(runtime_exception(error));
        }

        program prog(program_, false);
        prog.build(options);

#ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE
        // Save program binaries for future reuse.
        save_program_binary(hash, prog);
#endif

        return prog;
    }

Exemplo n.º 7

0

Exibir arquivo

Arquivo: device_ptr.hpp Projeto: ASMlover/study

    detail::device_ptr_index_expr<T, Expr>
    operator[](const Expr &expr) const
    {
        BOOST_ASSERT(m_buffer.get());

        return detail::device_ptr_index_expr<T, Expr>(m_buffer,
                                                      uint_(m_index),
                                                      expr);
    }

Exemplo n.º 8

0

Exibir arquivo

Arquivo: program.hpp Projeto: ASMlover/study

    /**
     * In case BOOST_COMPUTE_USE_OFFLINE_CACHE macro is defined,
     * the compiled binary is stored for reuse in the offline cache located in
     * $HOME/.boost_compute on UNIX-like systems and in %APPDATA%/boost_compute
     * on Windows.
     */
    static program build_with_source(
            const std::string &source,
            const context     &context,
            const std::string &options = std::string()
            )
    {
#ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE
        // Get hash string for the kernel.
        device   d = context.get_device();
        platform p = d.platform();

        detail::sha1 hash;
        hash.process( p.name()    )
            .process( p.version() )
            .process( d.name()    )
            .process( options     )
            .process( source      )
            ;
        std::string hash_string = hash;

        // Try to get cached program binaries:
        try {
            boost::optional<program> prog = load_program_binary(hash_string, context);

            if (prog) {
                prog->build(options);
                return *prog;
            }
        } catch (...) {
            // Something bad happened. Fallback to normal compilation.
        }

        // Cache is apparently not available. Just compile the sources.
#endif
        const char *source_string = source.c_str();

        cl_int error = 0;
        cl_program program_ = clCreateProgramWithSource(context,
                                                        uint_(1),
                                                        &source_string,
                                                        0,
                                                        &error);
        if(!program_){
            BOOST_THROW_EXCEPTION(opencl_error(error));
        }

        program prog(program_, false);
        prog.build(options);

#ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE
        // Save program binaries for future reuse.
        save_program_binary(hash_string, prog);
#endif

        return prog;
    }

Exemplo n.º 9

0

Exibir arquivo

Arquivo: search_all.hpp Projeto: 2bbb/compute

    event exec(command_queue &queue)
    {
        if(m_count == 0) {
            return event();
        }

        set_arg(m_p_count_arg, uint_(m_p_count));

        return exec_1d(queue, 0, m_count);
    }

Exemplo n.º 10

0

Exibir arquivo

Arquivo: merge_with_merge_path.hpp Projeto: ASMlover/study

inline OutputIterator
merge_with_merge_path(InputIterator1 first1,
                      InputIterator1 last1,
                      InputIterator2 first2,
                      InputIterator2 last2,
                      OutputIterator result,
                      Compare comp,
                      command_queue &queue = system::default_queue())
{
    typedef typename
        std::iterator_traits<OutputIterator>::difference_type result_difference_type;

    size_t tile_size = 1024;

    size_t count1 = iterator_range_size(first1, last1);
    size_t count2 = iterator_range_size(first2, last2);

    vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context());
    vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context());

    // Tile the sets
    merge_path_kernel tiling_kernel;
    tiling_kernel.tile_size = static_cast<unsigned int>(tile_size);
    tiling_kernel.set_range(first1, last1, first2, last2,
                            tile_a.begin()+1, tile_b.begin()+1, comp);
    fill_n(tile_a.begin(), 1, uint_(0), queue);
    fill_n(tile_b.begin(), 1, uint_(0), queue);
    tiling_kernel.exec(queue);

    fill_n(tile_a.end()-1, 1, static_cast<uint_>(count1), queue);
    fill_n(tile_b.end()-1, 1, static_cast<uint_>(count2), queue);

    // Merge
    serial_merge_kernel merge_kernel;
    merge_kernel.tile_size = static_cast<unsigned int>(tile_size);
    merge_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(),
                           tile_b.begin(), result, comp);

    merge_kernel.exec(queue);

    return result + static_cast<result_difference_type>(count1 + count2);
}

Exemplo n.º 11

0

Exibir arquivo

Arquivo: reduce_on_gpu.hpp Projeto: Fantasticer/arangodb

inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T,false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}

Exemplo n.º 12

0

Exibir arquivo

Arquivo: copy_on_device.hpp Projeto: junmuz/compute

    event exec(command_queue &queue)
    {
        if(m_count == 0){
            // nothing to do
            return event();
        }

        size_t global_work_size = calculate_work_size(m_count, m_vpt, m_tpb);

        set_arg(m_count_arg, uint_(m_count));

        return exec_1d(queue, 0, global_work_size, m_tpb);
    }

Exemplo n.º 13

0

Exibir arquivo

Arquivo: reduce_on_gpu.hpp Projeto: Fantasticer/arangodb

inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    size_t count = std::distance(first, last);

    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
}

Exemplo n.º 14

0

Exibir arquivo

Arquivo: buffer_iterator.hpp Projeto: Mageswaran1989/compute

inline meta_kernel& operator<<(meta_kernel &kernel,
                               const buffer_iterator_index_expr<T, IndexExpr> &expr)
{
    if(expr.m_index == 0){
        return kernel <<
                   kernel.get_buffer_identifier<T>(expr.m_buffer, expr.m_address_space) <<
                   '[' << expr.m_expr << ']';
    }
    else {
        return kernel <<
                   kernel.get_buffer_identifier<T>(expr.m_buffer, expr.m_address_space) <<
                   '[' << uint_(expr.m_index) << "+(" << expr.m_expr << ")]";
    }
}

Exemplo n.º 15

0

Exibir arquivo

Arquivo: get.hpp Projeto: 2bbb/compute

        static std::string value()
        {
            BOOST_STATIC_ASSERT(N < 16);

            std::stringstream stream;

            if(N < 10){
                stream << ".s" << uint_(N);
            }
            else if(N < 16){
                stream << ".s" << char('a' + (N - 10));
            }

            return stream.str();
        }

Exemplo n.º 16

0

Exibir arquivo

Arquivo: program.hpp Projeto: 2bbb/compute

    /// Creates a new program with \p source in \p context.
    ///
    /// \see_opencl_ref{clCreateProgramWithSource}
    static program create_with_source(const std::string &source,
                                      const context &context)
    {
        const char *source_string = source.c_str();

        cl_int error = 0;
        cl_program program_ = clCreateProgramWithSource(context,
                                                        uint_(1),
                                                        &source_string,
                                                        0,
                                                        &error);
        if(!program_){
            BOOST_THROW_EXCEPTION(opencl_error(error));
        }

        return program(program_, false);
    }

Exemplo n.º 17

0

Exibir arquivo

Arquivo: program.hpp Projeto: 2bbb/compute

    /// Creates a new program with \p sources in \p context.
    ///
    /// \see_opencl_ref{clCreateProgramWithSource}
    static program create_with_source(const std::vector<std::string> &sources,
                                      const context &context)
    {
        std::vector<const char*> source_strings(sources.size());
        for(size_t i = 0; i < sources.size(); i++){
            source_strings[i] = sources[i].c_str();
        }

        cl_int error = 0;
        cl_program program_ = clCreateProgramWithSource(context,
                                                        uint_(sources.size()),
                                                        &source_strings[0],
                                                        0,
                                                        &error);
        if(!program_){
            BOOST_THROW_EXCEPTION(opencl_error(error));
        }

        return program(program_, false);
    }

Exemplo n.º 18

0

Exibir arquivo

Arquivo: program.hpp Projeto: Quanteek/compute

    /// Creates a new program with \p binary of \p binary_size in
    /// \p context.
    ///
    /// \see_opencl_ref{clCreateProgramWithBinary}
    static program create_with_binary(const unsigned char *binary,
                                      size_t binary_size,
                                      const context &context)
    {
        const cl_device_id device = context.get_device().id();

        cl_int error = 0;
        cl_int binary_status = 0;
        cl_program program_ = clCreateProgramWithBinary(context,
                                                        uint_(1),
                                                        &device,
                                                        &binary_size,
                                                        &binary,
                                                        &binary_status,
                                                        &error);
        if(!program_){
            BOOST_THROW_EXCEPTION(runtime_exception(error));
        }

        return program(program_, false);
    }

Exemplo n.º 19

0

Exibir arquivo

Arquivo: search_n.hpp Projeto: Soledad89/compute

inline TextIterator search_n(TextIterator t_first,
                             TextIterator t_last,
                             size_t n,
                             ValueType value,
                             command_queue &queue = system::default_queue())
{
    vector<uint_> matching_indices(detail::iterator_range_size(t_first, t_last),
                                    queue.get_context());

    detail::search_n_kernel<TextIterator,
                            vector<uint_>::iterator> kernel;

    kernel.set_range(t_first, t_last, value, n, matching_indices.begin());
    kernel.exec(queue);

    vector<uint_>::iterator index = ::boost::compute::find(
        matching_indices.begin(), matching_indices.end(), uint_(1), queue
    );

    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
}

Exemplo n.º 20

0

Exibir arquivo

Arquivo: find_extrema_on_cpu.hpp Projeto: AbhinavJain13/turicreate

inline InputIterator find_extrema_on_cpu(InputIterator first,
                                         InputIterator last,
                                         Compare compare,
                                         const bool find_minimum,
                                         command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
    size_t count = iterator_range_size(first, last);

    const device &device = queue.get_device();
    const uint_ compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);
    std::string cache_key =
        "__boost_find_extrema_cpu_"
            + boost::lexical_cast<std::string>(sizeof(input_type));

    // for inputs smaller than serial_find_extrema_threshold
    // serial_find_extrema algorithm is used
    uint_ serial_find_extrema_threshold = parameters->get(
        cache_key,
        "serial_find_extrema_threshold",
        16384 * sizeof(input_type)
    );
    serial_find_extrema_threshold =
        (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units));

    const context &context = queue.get_context();
    if(count < serial_find_extrema_threshold) {
        return serial_find_extrema(first, last, compare, find_minimum, queue);
    }

    meta_kernel k("find_extrema_on_cpu");
    buffer output(context, sizeof(input_type) * compute_units);
    buffer output_idx(
        context, sizeof(uint_) * compute_units,
        buffer::read_write | buffer::alloc_host_ptr
    );

    size_t count_arg = k.add_arg<uint_>("count");
    size_t output_arg =
        k.add_arg<input_type *>(memory_object::global_memory, "output");
    size_t output_idx_arg =
        k.add_arg<uint_ *>(memory_object::global_memory, "output_idx");

    k <<
        "uint block = " <<
            "(uint)ceil(((float)count)/get_global_size(0));\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n" <<

        "uint value_index = index;\n" <<
        k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" <<

        "index++;\n" <<
        "while(index < end){\n" <<
            k.decl<input_type>("candidate") <<
                " = " << first[k.var<uint_>("index")] << ";\n" <<
        "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "bool compare = " << compare(k.var<input_type>("candidate"),
                                         k.var<input_type>("value")) << ";\n" <<
        "#else\n" <<
            "bool compare = " << compare(k.var<input_type>("value"),
                                         k.var<input_type>("candidate")) << ";\n" <<
        "#endif\n" <<
            "value = compare ? candidate : value;\n" <<
            "value_index = compare ? index : value_index;\n" <<
            "index++;\n" <<
        "}\n" <<
        "output[get_global_id(0)] = value;\n" <<
        "output_idx[get_global_id(0)] = value_index;\n";

    size_t global_work_size = compute_units;
    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    kernel kernel = k.compile(context, options);

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(output_arg, output);
    kernel.set_arg(output_idx_arg, output_idx);
    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);

    buffer_iterator<input_type> result = serial_find_extrema(
        make_buffer_iterator<input_type>(output),
        make_buffer_iterator<input_type>(output, global_work_size),
        compare,
        find_minimum,
        queue
    );

    uint_* output_idx_host_ptr =
        static_cast<uint_*>(
            queue.enqueue_map_buffer(
                output_idx, command_queue::map_read,
                0, global_work_size * sizeof(uint_)
            )
        );

    difference_type extremum_idx =
        static_cast<difference_type>(*(output_idx_host_ptr + result.get_index()));
    return first + extremum_idx;
}

Exemplo n.º 21

0

Exibir arquivo

Arquivo: meta_kernel.hpp Projeto: OHDSI/RcppBoostCompute

 meta_kernel& operator<<(const meta_kernel_literal<unsigned char> &literal)
 {
     return *this << uint_(literal.value());
 }

Exemplo n.º 22

0

Exibir arquivo

Arquivo: geometry_generator_grammar_impl.hpp Projeto: joycepg/mapnik

geometry_generator_grammar<OutputIterator, Geometry>::geometry_generator_grammar()
    : geometry_generator_grammar::base_type(geometry)
{
    boost::spirit::karma::_val_type _val;
    boost::spirit::karma::_1_type _1;
    boost::spirit::karma::_a_type _a;
    boost::spirit::karma::lit_type lit;
    boost::spirit::karma::uint_type uint_;
    boost::spirit::karma::eps_type eps;

    geometry = geometry_dispatch.alias()
        ;

    geometry_dispatch = eps[_a = geometry_type(_val)] <<
        (&uint_(geometry::geometry_types::Point)[_1 = _a]
         << (point | lit("null")))
        |
        (&uint_(geometry::geometry_types::LineString)[_1 = _a]
         << (linestring | lit("null")))
        |
        (&uint_(geometry::geometry_types::Polygon)[_1 = _a]
         << (polygon | lit("null")))
        |
        (&uint_(geometry::geometry_types::MultiPoint)[_1 = _a]
         << (multi_point | lit("null")))
        |
        (&uint_(geometry::geometry_types::MultiLineString)[_1 = _a]
         << (multi_linestring | lit("null")))
        |
        (&uint_(geometry::geometry_types::MultiPolygon)[_1 = _a]
         << (multi_polygon | lit("null")))
        |
        (&uint_(geometry::geometry_types::GeometryCollection)[_1 = _a]
         << (geometry_collection | lit("null")))
        |
        lit("null")
        ;

    point = lit("{\"type\":\"Point\",\"coordinates\":") << point_coord << lit("}")
        ;
    linestring = lit("{\"type\":\"LineString\",\"coordinates\":[") << linestring_coord << lit("]}")
        ;
    polygon = lit("{\"type\":\"Polygon\",\"coordinates\":[") << polygon_coord << lit("]}")
        ;
    multi_point = lit("{\"type\":\"MultiPoint\",\"coordinates\":[") << multi_point_coord << lit("]}")
        ;
    multi_linestring = lit("{\"type\":\"MultiLineString\",\"coordinates\":[") << multi_linestring_coord << lit("]}")
        ;
    multi_polygon = lit("{\"type\":\"MultiPolygon\",\"coordinates\":[") << multi_polygon_coord << lit("]}")
        ;
    geometry_collection = lit("{\"type\":\"GeometryCollection\",\"geometries\":[") << geometries  << lit("]}")
        ;
    point_coord = lit('[') << coordinate << lit(',') << coordinate  << lit(']')
        ;
    linestring_coord = point_coord % lit(',')
        ;
    polygon_coord = lit('[') << exterior_ring_coord << lit(']') << interior_ring_coord
        ;
    exterior_ring_coord = linestring_coord.alias()
        ;
    interior_ring_coord =  *(lit(",[") << exterior_ring_coord << lit(']'))
        ;
    multi_point_coord = linestring_coord.alias()
        ;
    multi_linestring_coord = (lit('[') << linestring_coord << lit(']')) % lit(',')
        ;
    multi_polygon_coord = (lit('[') << polygon_coord << lit(']')) % lit(',')
        ;
    geometries =  geometry % lit(',')
        ;
}

Exemplo n.º 23

0

Exibir arquivo

Arquivo: reduce_on_gpu.hpp Projeto: Fantasticer/arangodb

inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB]; \n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // discrimination on vendor name
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
         "if(lid == 0){\n" <<
         "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
         "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = std::ceil(float(count) / vpt / tpb);

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, count / vpt / tpb * sizeof(T));
    const buffer *output_buffer = &pong;
    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            size_t work_size = std::ceil(float(count) / vpt);
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = std::ceil(float(count) / vpt / tpb);
        }
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}

Exemplo n.º 24

0

Exibir arquivo

Arquivo: scan_on_cpu.hpp Projeto: 3Jade/Sprawl

inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    const size_t compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    std::string cache_key =
        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));

    // for inputs smaller than serial_scan_threshold
    // serial_scan algorithm is used
    uint_ serial_scan_threshold =
        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
    serial_scan_threshold =
        (std::max)(serial_scan_threshold, uint_(compute_units));

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return result;
    }
    else if(count < serial_scan_threshold) {
        return serial_scan(first, last, result, exclusive, init, op, queue);
    }

    buffer block_partial_sums(context, sizeof(output_type) * compute_units );

    // create scan kernel
    meta_kernel k("scan_on_cpu_block_scan");

    // Arguments
    size_t count_arg = k.add_arg<uint_>("count");
    size_t init_arg = k.add_arg<output_type>("initial_value");
    size_t block_partial_sums_arg =
        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    k <<
        "uint block = " <<
            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n";

    if(!exclusive){
        k <<
            k.decl<output_type>("sum") << " = " <<
                first[k.var<uint_>("index")] << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n" <<
            "index++;\n";
    }
    else {
        k <<
            k.decl<output_type>("sum") << ";\n" <<
            "if(index == 0){\n" <<
                "sum = initial_value;\n" <<
            "}\n" <<
            "else {\n" <<
                "sum = " << first[k.var<uint_>("index")] << ";\n" <<
                "index++;\n" <<
            "}\n";
    }

    k <<
        "while(index < end){\n" <<
            // load next value
            k.decl<const input_type>("value") << " = "
                << first[k.var<uint_>("index")] << ";\n";

    if(exclusive){
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }
    k <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("value")) << ";\n";

    if(!exclusive){
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }

    k <<
            "index++;\n" <<
        "}\n" << // end while
        "block_partial_sums[get_global_id(0)] = sum;\n";

    // compile scan kernel
    kernel block_scan_kernel = k.compile(context);

    // setup kernel arguments
    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    size_t global_work_size = compute_units;
    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);

    // scan is done
    if(compute_units < 2) {
        return result + count;
    }

    // final scan kernel
    meta_kernel l("scan_on_cpu_final_scan");

    // Arguments
    count_arg = l.add_arg<uint_>("count");
    block_partial_sums_arg =
        l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    l <<
        "uint block = " <<
            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
        "uint index = block + get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n" <<

        k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
        "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
        "}\n" <<

        "while(index < end){\n";
    if(exclusive){
        l <<
            l.decl<output_type>("value") << " = "
                << first[k.var<uint_>("index")] << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n" <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("value")) << ";\n";
    }
    else {
        l <<
            "sum = " << op(k.var<output_type>("sum"),
                           first[k.var<uint_>("index")]) << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n";
    }
    l <<
            "index++;\n" <<
        "}\n";


    // compile scan kernel
    kernel final_scan_kernel = l.compile(context);

    // setup kernel arguments
    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    global_work_size = compute_units;
    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);

    // return iterator pointing to the end of the result range
    return result + count;
}

Exemplo n.º 25

0

Exibir arquivo

Arquivo: reduce.hpp Projeto: junmuz/compute

size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                         function(k.expr<input_type>("block[lid]"),
                                  k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                     function(k.var<result_type>("result"),
                              first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}

Exemplo n.º 26

0

Exibir arquivo

Arquivo: find_extrema_with_reduce.hpp Projeto: 2bbb/compute

inline void find_extrema_with_reduce(InputIterator input,
                                     vector<uint_>::iterator input_idx,
                                     size_t count,
                                     ResultIterator result,
                                     vector<uint_>::iterator result_idx,
                                     size_t work_groups_no,
                                     size_t work_group_size,
                                     Compare compare,
                                     const bool find_minimum,
                                     const bool use_input_idx,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();

    meta_kernel k("find_extrema_reduce");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
    size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");

    k <<
        // Work item global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<

        // Index of element that will be read from input buffer
        k.decl<uint_>("idx") << " = gid;\n" <<

        k.decl<input_type>("acc") << ";\n" <<
        k.decl<uint_>("acc_idx") << ";\n" <<
        "if(gid < count) {\n" <<
            // Real index of currently best element
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#else\n" <<
            k.var<uint_>("acc_idx") << " = idx;\n" <<
            "#endif\n" <<

            // Init accumulator with first[get_global_id(0)]
            "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n" <<

        k.decl<bool>("compare_result") << ";\n" <<
        k.decl<bool>("equal") << ";\n\n" <<
        "while( idx < count ){\n" <<
            // Next element
            k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#endif\n" <<

            // Comparison between currently best element (acc) and next element
            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "compare_result = " << compare(k.var<input_type>("next"),
                                           k.var<input_type>("acc")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" <<
                compare(k.var<input_type>("acc"),
                        k.var<input_type>("next")) << ";\n" <<
            "# endif\n" <<
            "#else\n" <<
            "compare_result = " << compare(k.var<input_type>("acc"),
                                           k.var<input_type>("next")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" <<
                compare(k.var<input_type>("next"),
                        k.var<input_type>("acc")) << ";\n" <<
            "# endif\n" <<
            "#endif\n" <<

            // save the winner
            "acc = compare_result ? acc : next;\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "acc_idx = compare_result ? " <<
                "acc_idx : " <<
                "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
            "#else\n" <<
            "acc_idx = compare_result ? acc_idx : idx;\n" <<
            "#endif\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n\n" <<

        // Work item local id
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "block[lid] = acc;\n" <<
        "block_idx[lid] = acc_idx;\n" <<
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        k.decl<uint_>("group_offset") <<
            " = count - (get_local_size(0) * get_group_id(0));\n\n";

    k <<
        "#pragma unroll\n"
        "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
             "offset = offset / 2) {\n" <<
             "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
                 k.decl<input_type>("mine") << " = block[lid];\n" <<
                 k.decl<input_type>("other") << " = block[lid+offset];\n" <<
                 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
                 "compare_result = " << compare(k.var<input_type>("other"),
                                                k.var<input_type>("mine")) << ";\n" <<
                 "equal = !compare_result && !" <<
                     compare(k.var<input_type>("mine"),
                             k.var<input_type>("other")) << ";\n" <<
                 "#else\n" <<
                 "compare_result = " << compare(k.var<input_type>("mine"),
                                                k.var<input_type>("other")) << ";\n" <<
                 "equal = !compare_result && !" <<
                     compare(k.var<input_type>("other"),
                             k.var<input_type>("mine")) << ";\n" <<
                 "#endif\n" <<
                 "block[lid] = compare_result ? mine : other;\n" <<
                 k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
                 k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
                 "block_idx[lid] = compare_result ? " <<
                     "mine_idx : " <<
                     "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
             "}\n"
             "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "}\n\n" <<

         // write block result to global output
        "if(lid == 0){\n" <<
            result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
            result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
        "}";

    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    if(use_input_idx){
        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
    }

    kernel kernel = k.compile(context, options);

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
    kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));

    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  work_groups_no * work_group_size,
                                  work_group_size);
}

Exemplo n.º 27

0

Exibir arquivo

Arquivo: inplace_reduce.hpp Projeto: junmuz/compute

inline void inplace_reduce(Iterator first,
                           Iterator last,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename
        std::iterator_traits<Iterator>::value_type
        value_type;

    size_t input_size = iterator_range_size(first, last);
    if(input_size < 2){
        return;
    }

    const context &context = queue.get_context();

    size_t block_size = 64;
    size_t values_per_thread = 8;
    size_t block_count = input_size / (block_size * values_per_thread);
    if(block_count * block_size * values_per_thread != input_size)
        block_count++;

    vector<value_type> output(block_count, context);

    meta_kernel k("inplace_reduce");
    size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
    size_t input_size_arg = k.add_arg<const uint_>("input_size");
    size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
    size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
    k <<
        "const uint gid = get_global_id(0);\n" <<
        "const uint lid = get_local_id(0);\n" <<
        "const uint values_per_thread =\n"
            << uint_(values_per_thread) << ";\n" <<

        // thread reduce
        "const uint index = gid * values_per_thread;\n" <<
        "if(index < input_size){\n" <<
            k.decl<value_type>("sum") << " = input[index];\n" <<
            "for(uint i = 1;\n" <<
                 "i < values_per_thread && (index + i) < input_size;\n" <<
                 "i++){\n" <<
            "    sum = " <<
                     function(k.var<value_type>("sum"),
                              k.var<value_type>("input[index+i]")) << ";\n" <<
            "}\n" <<
            "scratch[lid] = sum;\n" <<
        "}\n" <<

        // local reduce
        "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "    uint mask = (i << 1) - 1;\n" <<
        "    uint next_index = (gid + i) * values_per_thread;\n"
        "    if((lid & mask) == 0 && next_index < input_size){\n" <<
        "        scratch[lid] = " <<
                     function(k.var<value_type>("scratch[lid]"),
                              k.var<value_type>("scratch[lid+i]")) << ";\n" <<
        "    }\n" <<
        "}\n" <<

        // write output for block
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n"
        ;

    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    kernel kernel = k.compile(context);

    while(input_size > 1){
        kernel.set_arg(input_arg, *input_buffer);
        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
        kernel.set_arg(output_arg, *output_buffer);
        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        input_size =
            static_cast<size_t>(
                std::ceil(float(input_size) / (block_size * values_per_thread)
            )
        );

        block_count = input_size / (block_size * values_per_thread);
        if(block_count * block_size * values_per_thread != input_size)
            block_count++;

        std::swap(input_buffer, output_buffer);
    }

    if(input_buffer != &first.get_buffer()){
        ::boost::compute::copy(output.begin(),
                               output.begin() + 1,
                               first,
                               queue);
    }
}