svg_path_generator<OutputIterator,Path>::svg_path_generator() : svg_path_generator::base_type(svg) { boost::spirit::karma::uint_type uint_; boost::spirit::karma::_val_type _val; boost::spirit::karma::_1_type _1; boost::spirit::karma::lit_type lit; svg = point | linestring | polygon ; point = &uint_(mapnik::geometry::geometry_types::Point)[_1 = _type(_val)] << svg_point [_1 = _first(_val)] ; svg_point = &uint_ << lit("cx=\"") << coordinate << lit("\" cy=\"") << coordinate << lit('\"') ; linestring = &uint_(mapnik::geometry::geometry_types::LineString)[_1 = _type(_val)] << lit("d=\"") << svg_path << lit("\"") ; polygon = &uint_(mapnik::geometry::geometry_types::Polygon)[_1 = _type(_val)] << lit("d=\"") << svg_path << lit("\"") ; svg_path %= ((&uint_(mapnik::SEG_MOVETO) << lit('M') | &uint_(mapnik::SEG_LINETO) << lit('L')) << coordinate << lit(' ') << coordinate) % lit(' ') ; }
wkt_generator<OutputIterator, Geometry>::wkt_generator(bool single) : wkt_generator::base_type(wkt) { boost::spirit::karma::uint_type uint_; boost::spirit::karma::_val_type _val; boost::spirit::karma::_1_type _1; boost::spirit::karma::lit_type lit; boost::spirit::karma::_a_type _a; boost::spirit::karma::_b_type _b; boost::spirit::karma::_c_type _c; boost::spirit::karma::_r1_type _r1; boost::spirit::karma::eps_type eps; boost::spirit::karma::string_type kstring; wkt = point | linestring | polygon ; point = &uint_(mapnik::geometry_type::types::Point)[_1 = _type(_val)] << kstring[ phoenix::if_ (single) [_1 = "Point("] .else_[_1 = "("]] << point_coord [_1 = _first(_val)] << lit(')') ; linestring = &uint_(mapnik::geometry_type::types::LineString)[_1 = _type(_val)] << kstring[ phoenix::if_ (single) [_1 = "LineString("] .else_[_1 = "("]] << coords << lit(')') ; polygon = &uint_(mapnik::geometry_type::types::Polygon)[_1 = _type(_val)] << kstring[ phoenix::if_ (single) [_1 = "Polygon("] .else_[_1 = "("]] << coords2 << lit("))") ; point_coord = &uint_ << coordinate << lit(' ') << coordinate ; polygon_coord %= ( &uint_(mapnik::SEG_MOVETO) << eps[_r1 += 1][_a = _x(_val)][ _b = _y(_val)] << kstring[ if_ (_r1 > 1) [_1 = "),("] .else_[_1 = "("]] | &uint_(mapnik::SEG_LINETO) << lit(',') << eps[_a = _x(_val)][_b = _y(_val)] ) << coordinate[_1 = _a] << lit(' ') << coordinate[_1 = _b] ; coords2 %= *polygon_coord(_a,_b,_c) ; coords = point_coord % lit(',') ; }
geometry_generator_grammar() : geometry_generator_grammar::base_type(coordinates) { boost::spirit::karma::uint_type uint_; boost::spirit::bool_type bool_; boost::spirit::karma::_val_type _val; boost::spirit::karma::_1_type _1; boost::spirit::karma::lit_type lit; boost::spirit::karma::_a_type _a; boost::spirit::karma::_r1_type _r1; boost::spirit::karma::eps_type eps; boost::spirit::karma::string_type kstring; coordinates = point | linestring | polygon ; point = &uint_(mapnik::geometry_type::types::Point)[_1 = _type(_val)] << point_coord [_1 = _first(_val)] ; linestring = &uint_(mapnik::geometry_type::types::LineString)[_1 = _type(_val)] << lit('[') << coords << lit(']') ; polygon = &uint_(mapnik::geometry_type::types::Polygon)[_1 = _type(_val)] << lit('[') << coords2 << lit("]]") ; point_coord = &uint_ << lit('[') << coord_type << lit(',') << coord_type << lit(']') ; polygon_coord %= ( &uint_(mapnik::SEG_MOVETO) << eps[_r1 += 1] << kstring[ if_ (_r1 > 1) [_1 = "],["] .else_[_1 = '[' ]] | &uint_(mapnik::SEG_LINETO) << lit(',')) << lit('[') << coord_type << lit(',') << coord_type << lit(']') ; coords2 %= *polygon_coord(_a) ; coords = point_coord % lit(',') ; }
inline TextIterator search_n(TextIterator t_first, TextIterator t_last, size_t n, ValueType value, command_queue &queue = system::default_queue()) { // there is no need to check if pattern starts at last n - 1 indices vector<uint_> matching_indices( detail::iterator_range_size(t_first, t_last) + 1 - n, queue.get_context() ); // search_n_kernel puts value 1 at every index in vector where pattern // of n values starts at detail::search_n_kernel<TextIterator, vector<uint_>::iterator> kernel; kernel.set_range(t_first, t_last, value, n, matching_indices.begin()); kernel.exec(queue); vector<uint_>::iterator index = ::boost::compute::find( matching_indices.begin(), matching_indices.end(), uint_(1), queue ); // pattern was not found if(index == matching_indices.end()) return t_last; return t_first + detail::iterator_range_size(matching_indices.begin(), index); }
void generate(OutputIterator first, OutputIterator last, Generator &generator, command_queue &queue) { size_t size = std::distance(first, last); typedef typename Generator::result_type g_result_type; vector<g_result_type> tmp(size, queue.get_context()); vector<g_result_type> tmp2(size, queue.get_context()); uint_ bound = ((uint_(-1))/(m_b-m_a+1))*(m_b-m_a+1); buffer_iterator<g_result_type> tmp2_iter; while(size>0) { generator.generate(tmp.begin(), tmp.begin() + size, queue); tmp2_iter = copy_if(tmp.begin(), tmp.begin() + size, tmp2.begin(), _1 <= bound, queue); size = std::distance(tmp2_iter, tmp2.end()); } BOOST_COMPUTE_FUNCTION(IntType, scale_random, (const g_result_type x), { return LO + (x % (HI-LO+1)); });
/** * In case BOOST_COMPUTE_USE_OFFLINE_CACHE macro is defined, * the compiled binary is stored for reuse in the offline cache located in * $HOME/.boost_compute on UNIX-like systems and in %APPDATA%/boost_compute * on Windows. */ static program build_with_source( const std::string &source, const context &context, const std::string &options = std::string() ) { #ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE // Get hash string for the kernel. std::string hash; { device d(context.get_device()); platform p(d.get_info<cl_platform_id>(CL_DEVICE_PLATFORM)); std::ostringstream src; src << "// " << p.name() << " v" << p.version() << "\n" << "// " << context.get_device().name() << "\n" << "// " << options << "\n\n" << source; hash = detail::sha1(src.str()); } // Try to get cached program binaries: try { boost::optional<program> prog = load_program_binary(hash, context); if (prog) { prog->build(options); return *prog; } } catch (...) { // Something bad happened. Fallback to normal compilation. } // Cache is apparently not available. Just compile the sources. #endif const char *source_string = source.c_str(); cl_int error = 0; cl_program program_ = clCreateProgramWithSource(context, uint_(1), &source_string, 0, &error); if(!program_){ BOOST_THROW_EXCEPTION(runtime_exception(error)); } program prog(program_, false); prog.build(options); #ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE // Save program binaries for future reuse. save_program_binary(hash, prog); #endif return prog; }
detail::device_ptr_index_expr<T, Expr> operator[](const Expr &expr) const { BOOST_ASSERT(m_buffer.get()); return detail::device_ptr_index_expr<T, Expr>(m_buffer, uint_(m_index), expr); }
/** * In case BOOST_COMPUTE_USE_OFFLINE_CACHE macro is defined, * the compiled binary is stored for reuse in the offline cache located in * $HOME/.boost_compute on UNIX-like systems and in %APPDATA%/boost_compute * on Windows. */ static program build_with_source( const std::string &source, const context &context, const std::string &options = std::string() ) { #ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE // Get hash string for the kernel. device d = context.get_device(); platform p = d.platform(); detail::sha1 hash; hash.process( p.name() ) .process( p.version() ) .process( d.name() ) .process( options ) .process( source ) ; std::string hash_string = hash; // Try to get cached program binaries: try { boost::optional<program> prog = load_program_binary(hash_string, context); if (prog) { prog->build(options); return *prog; } } catch (...) { // Something bad happened. Fallback to normal compilation. } // Cache is apparently not available. Just compile the sources. #endif const char *source_string = source.c_str(); cl_int error = 0; cl_program program_ = clCreateProgramWithSource(context, uint_(1), &source_string, 0, &error); if(!program_){ BOOST_THROW_EXCEPTION(opencl_error(error)); } program prog(program_, false); prog.build(options); #ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE // Save program binaries for future reuse. save_program_binary(hash_string, prog); #endif return prog; }
event exec(command_queue &queue) { if(m_count == 0) { return event(); } set_arg(m_p_count_arg, uint_(m_p_count)); return exec_1d(queue, 0, m_count); }
inline OutputIterator merge_with_merge_path(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<OutputIterator>::difference_type result_difference_type; size_t tile_size = 1024; size_t count1 = iterator_range_size(first1, last1); size_t count2 = iterator_range_size(first2, last2); vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); // Tile the sets merge_path_kernel tiling_kernel; tiling_kernel.tile_size = static_cast<unsigned int>(tile_size); tiling_kernel.set_range(first1, last1, first2, last2, tile_a.begin()+1, tile_b.begin()+1, comp); fill_n(tile_a.begin(), 1, uint_(0), queue); fill_n(tile_b.begin(), 1, uint_(0), queue); tiling_kernel.exec(queue); fill_n(tile_a.end()-1, 1, static_cast<uint_>(count1), queue); fill_n(tile_b.end()-1, 1, static_cast<uint_>(count2), queue); // Merge serial_merge_kernel merge_kernel; merge_kernel.tile_size = static_cast<unsigned int>(tile_size); merge_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(), tile_b.begin(), result, comp); merge_kernel.exec(queue); return result + static_cast<result_difference_type>(count1 + count2); }
inline void initial_reduce(InputIterator first, InputIterator last, buffer result, const Function &function, kernel &reduce_kernel, const uint_ vpt, const uint_ tpb, command_queue &queue) { (void) function; (void) reduce_kernel; typedef typename std::iterator_traits<InputIterator>::value_type Arg; typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T; size_t count = std::distance(first, last); detail::meta_kernel k("initial_reduce"); k.add_set_arg<const uint_>("count", uint_(count)); size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output"); k << k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" << k.decl<const uint_>("lid") << " = get_local_id(0);\n" << "__local " << type_name<T>() << " scratch[TPB];\n" << // private reduction k.decl<T>("sum") << " = 0;\n" << "for(uint i = 0; i < VPT; i++){\n" << " if(offset + lid + i*TPB < count){\n" << " sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" << " }\n" << "}\n" << "scratch[lid] = sum;\n" << // local reduction ReduceBody<T,false>::body() << // write sum to output "if(lid == 0){\n" << " output[get_group_id(0)] = scratch[0];\n" << "}\n"; const context &context = queue.get_context(); std::stringstream options; options << "-DVPT=" << vpt << " -DTPB=" << tpb; kernel generic_reduce_kernel = k.compile(context, options.str()); generic_reduce_kernel.set_arg(output_arg, result); size_t work_size = calculate_work_size(count, vpt, tpb); queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb); }
event exec(command_queue &queue) { if(m_count == 0){ // nothing to do return event(); } size_t global_work_size = calculate_work_size(m_count, m_vpt, m_tpb); set_arg(m_count_arg, uint_(m_count)); return exec_1d(queue, 0, global_work_size, m_tpb); }
inline void initial_reduce(const buffer_iterator<T> &first, const buffer_iterator<T> &last, const buffer &result, const plus<T> &function, kernel &reduce_kernel, const uint_ vpt, const uint_ tpb, command_queue &queue) { (void) function; size_t count = std::distance(first, last); reduce_kernel.set_arg(0, first.get_buffer()); reduce_kernel.set_arg(1, uint_(first.get_index())); reduce_kernel.set_arg(2, uint_(count)); reduce_kernel.set_arg(3, result); reduce_kernel.set_arg(4, uint_(0)); size_t work_size = calculate_work_size(count, vpt, tpb); queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb); }
inline meta_kernel& operator<<(meta_kernel &kernel, const buffer_iterator_index_expr<T, IndexExpr> &expr) { if(expr.m_index == 0){ return kernel << kernel.get_buffer_identifier<T>(expr.m_buffer, expr.m_address_space) << '[' << expr.m_expr << ']'; } else { return kernel << kernel.get_buffer_identifier<T>(expr.m_buffer, expr.m_address_space) << '[' << uint_(expr.m_index) << "+(" << expr.m_expr << ")]"; } }
static std::string value() { BOOST_STATIC_ASSERT(N < 16); std::stringstream stream; if(N < 10){ stream << ".s" << uint_(N); } else if(N < 16){ stream << ".s" << char('a' + (N - 10)); } return stream.str(); }
/// Creates a new program with \p source in \p context. /// /// \see_opencl_ref{clCreateProgramWithSource} static program create_with_source(const std::string &source, const context &context) { const char *source_string = source.c_str(); cl_int error = 0; cl_program program_ = clCreateProgramWithSource(context, uint_(1), &source_string, 0, &error); if(!program_){ BOOST_THROW_EXCEPTION(opencl_error(error)); } return program(program_, false); }
/// Creates a new program with \p sources in \p context. /// /// \see_opencl_ref{clCreateProgramWithSource} static program create_with_source(const std::vector<std::string> &sources, const context &context) { std::vector<const char*> source_strings(sources.size()); for(size_t i = 0; i < sources.size(); i++){ source_strings[i] = sources[i].c_str(); } cl_int error = 0; cl_program program_ = clCreateProgramWithSource(context, uint_(sources.size()), &source_strings[0], 0, &error); if(!program_){ BOOST_THROW_EXCEPTION(opencl_error(error)); } return program(program_, false); }
/// Creates a new program with \p binary of \p binary_size in /// \p context. /// /// \see_opencl_ref{clCreateProgramWithBinary} static program create_with_binary(const unsigned char *binary, size_t binary_size, const context &context) { const cl_device_id device = context.get_device().id(); cl_int error = 0; cl_int binary_status = 0; cl_program program_ = clCreateProgramWithBinary(context, uint_(1), &device, &binary_size, &binary, &binary_status, &error); if(!program_){ BOOST_THROW_EXCEPTION(runtime_exception(error)); } return program(program_, false); }
inline TextIterator search_n(TextIterator t_first, TextIterator t_last, size_t n, ValueType value, command_queue &queue = system::default_queue()) { vector<uint_> matching_indices(detail::iterator_range_size(t_first, t_last), queue.get_context()); detail::search_n_kernel<TextIterator, vector<uint_>::iterator> kernel; kernel.set_range(t_first, t_last, value, n, matching_indices.begin()); kernel.exec(queue); vector<uint_>::iterator index = ::boost::compute::find( matching_indices.begin(), matching_indices.end(), uint_(1), queue ); return t_first + detail::iterator_range_size(matching_indices.begin(), index); }
inline InputIterator find_extrema_on_cpu(InputIterator first, InputIterator last, Compare compare, const bool find_minimum, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; size_t count = iterator_range_size(first, last); const device &device = queue.get_device(); const uint_ compute_units = queue.get_device().compute_units(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); std::string cache_key = "__boost_find_extrema_cpu_" + boost::lexical_cast<std::string>(sizeof(input_type)); // for inputs smaller than serial_find_extrema_threshold // serial_find_extrema algorithm is used uint_ serial_find_extrema_threshold = parameters->get( cache_key, "serial_find_extrema_threshold", 16384 * sizeof(input_type) ); serial_find_extrema_threshold = (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units)); const context &context = queue.get_context(); if(count < serial_find_extrema_threshold) { return serial_find_extrema(first, last, compare, find_minimum, queue); } meta_kernel k("find_extrema_on_cpu"); buffer output(context, sizeof(input_type) * compute_units); buffer output_idx( context, sizeof(uint_) * compute_units, buffer::read_write | buffer::alloc_host_ptr ); size_t count_arg = k.add_arg<uint_>("count"); size_t output_arg = k.add_arg<input_type *>(memory_object::global_memory, "output"); size_t output_idx_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output_idx"); k << "uint block = " << "(uint)ceil(((float)count)/get_global_size(0));\n" << "uint index = get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n" << "uint value_index = index;\n" << k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" << "index++;\n" << "while(index < end){\n" << k.decl<input_type>("candidate") << " = " << first[k.var<uint_>("index")] << ";\n" << "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << "bool compare = " << compare(k.var<input_type>("candidate"), k.var<input_type>("value")) << ";\n" << "#else\n" << "bool compare = " << compare(k.var<input_type>("value"), k.var<input_type>("candidate")) << ";\n" << "#endif\n" << "value = compare ? candidate : value;\n" << "value_index = compare ? index : value_index;\n" << "index++;\n" << "}\n" << "output[get_global_id(0)] = value;\n" << "output_idx[get_global_id(0)] = value_index;\n"; size_t global_work_size = compute_units; std::string options; if(!find_minimum){ options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; } kernel kernel = k.compile(context, options); kernel.set_arg(count_arg, static_cast<uint_>(count)); kernel.set_arg(output_arg, output); kernel.set_arg(output_idx_arg, output_idx); queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0); buffer_iterator<input_type> result = serial_find_extrema( make_buffer_iterator<input_type>(output), make_buffer_iterator<input_type>(output, global_work_size), compare, find_minimum, queue ); uint_* output_idx_host_ptr = static_cast<uint_*>( queue.enqueue_map_buffer( output_idx, command_queue::map_read, 0, global_work_size * sizeof(uint_) ) ); difference_type extremum_idx = static_cast<difference_type>(*(output_idx_host_ptr + result.get_index())); return first + extremum_idx; }
meta_kernel& operator<<(const meta_kernel_literal<unsigned char> &literal) { return *this << uint_(literal.value()); }
geometry_generator_grammar<OutputIterator, Geometry>::geometry_generator_grammar() : geometry_generator_grammar::base_type(geometry) { boost::spirit::karma::_val_type _val; boost::spirit::karma::_1_type _1; boost::spirit::karma::_a_type _a; boost::spirit::karma::lit_type lit; boost::spirit::karma::uint_type uint_; boost::spirit::karma::eps_type eps; geometry = geometry_dispatch.alias() ; geometry_dispatch = eps[_a = geometry_type(_val)] << (&uint_(geometry::geometry_types::Point)[_1 = _a] << (point | lit("null"))) | (&uint_(geometry::geometry_types::LineString)[_1 = _a] << (linestring | lit("null"))) | (&uint_(geometry::geometry_types::Polygon)[_1 = _a] << (polygon | lit("null"))) | (&uint_(geometry::geometry_types::MultiPoint)[_1 = _a] << (multi_point | lit("null"))) | (&uint_(geometry::geometry_types::MultiLineString)[_1 = _a] << (multi_linestring | lit("null"))) | (&uint_(geometry::geometry_types::MultiPolygon)[_1 = _a] << (multi_polygon | lit("null"))) | (&uint_(geometry::geometry_types::GeometryCollection)[_1 = _a] << (geometry_collection | lit("null"))) | lit("null") ; point = lit("{\"type\":\"Point\",\"coordinates\":") << point_coord << lit("}") ; linestring = lit("{\"type\":\"LineString\",\"coordinates\":[") << linestring_coord << lit("]}") ; polygon = lit("{\"type\":\"Polygon\",\"coordinates\":[") << polygon_coord << lit("]}") ; multi_point = lit("{\"type\":\"MultiPoint\",\"coordinates\":[") << multi_point_coord << lit("]}") ; multi_linestring = lit("{\"type\":\"MultiLineString\",\"coordinates\":[") << multi_linestring_coord << lit("]}") ; multi_polygon = lit("{\"type\":\"MultiPolygon\",\"coordinates\":[") << multi_polygon_coord << lit("]}") ; geometry_collection = lit("{\"type\":\"GeometryCollection\",\"geometries\":[") << geometries << lit("]}") ; point_coord = lit('[') << coordinate << lit(',') << coordinate << lit(']') ; linestring_coord = point_coord % lit(',') ; polygon_coord = lit('[') << exterior_ring_coord << lit(']') << interior_ring_coord ; exterior_ring_coord = linestring_coord.alias() ; interior_ring_coord = *(lit(",[") << exterior_ring_coord << lit(']')) ; multi_point_coord = linestring_coord.alias() ; multi_linestring_coord = (lit('[') << linestring_coord << lit(']')) % lit(',') ; multi_polygon_coord = (lit('[') << polygon_coord << lit(']')) % lit(',') ; geometries = geometry % lit(',') ; }
inline void reduce_on_gpu(InputIterator first, InputIterator last, buffer_iterator<T> result, Function function, command_queue &queue) { const device &device = queue.get_device(); const context &context = queue.get_context(); detail::meta_kernel k("reduce"); k.add_arg<const T*>(memory_object::global_memory, "input"); k.add_arg<const uint_>("offset"); k.add_arg<const uint_>("count"); k.add_arg<T*>(memory_object::global_memory, "output"); k.add_arg<const uint_>("output_offset"); k << k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" << "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" << k.decl<const uint_>("lid") << " = get_local_id(0);\n" << "__local " << type_name<T>() << " scratch[TPB];\n" << // private reduction k.decl<T>("sum") << " = 0;\n" << "for(uint i = 0; i < VPT; i++){\n" << " if(block_offset + lid + i*TPB < count){\n" << " sum = sum + block[lid+i*TPB]; \n" << " }\n" << "}\n" << "scratch[lid] = sum;\n"; // discrimination on vendor name if(is_nvidia_device(device)) k << ReduceBody<T,true>::body(); else k << ReduceBody<T,false>::body(); k << // write sum to output "if(lid == 0){\n" << " output[output_offset + get_group_id(0)] = scratch[0];\n" << "}\n"; std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>(); // load parameters boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); uint_ vpt = parameters->get(cache_key, "vpt", 8); uint_ tpb = parameters->get(cache_key, "tpb", 128); // reduce program compiler flags std::stringstream options; options << "-DT=" << type_name<T>() << " -DVPT=" << vpt << " -DTPB=" << tpb; // load program boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); program reduce_program = cache->get_or_build( cache_key, options.str(), k.source(), context ); // create reduce kernel kernel reduce_kernel(reduce_program, "reduce"); size_t count = std::distance(first, last); // first pass, reduce from input to ping buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T)); initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue); // update count after initial reduce count = std::ceil(float(count) / vpt / tpb); // middle pass(es), reduce between ping and pong const buffer *input_buffer = &ping; buffer pong(context, count / vpt / tpb * sizeof(T)); const buffer *output_buffer = &pong; if(count > vpt * tpb){ while(count > vpt * tpb){ reduce_kernel.set_arg(0, *input_buffer); reduce_kernel.set_arg(1, uint_(0)); reduce_kernel.set_arg(2, uint_(count)); reduce_kernel.set_arg(3, *output_buffer); reduce_kernel.set_arg(4, uint_(0)); size_t work_size = std::ceil(float(count) / vpt); if(work_size % tpb != 0){ work_size += tpb - work_size % tpb; } queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb); std::swap(input_buffer, output_buffer); count = std::ceil(float(count) / vpt / tpb); } } // final pass, reduce from ping/pong to result reduce_kernel.set_arg(0, *input_buffer); reduce_kernel.set_arg(1, uint_(0)); reduce_kernel.set_arg(2, uint_(count)); reduce_kernel.set_arg(3, result.get_buffer()); reduce_kernel.set_arg(4, uint_(result.get_index())); queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb); }
inline OutputIterator scan_on_cpu(InputIterator first, InputIterator last, OutputIterator result, bool exclusive, T init, BinaryOperator op, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename std::iterator_traits<OutputIterator>::value_type output_type; const context &context = queue.get_context(); const device &device = queue.get_device(); const size_t compute_units = queue.get_device().compute_units(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); std::string cache_key = "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T)); // for inputs smaller than serial_scan_threshold // serial_scan algorithm is used uint_ serial_scan_threshold = parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T)); serial_scan_threshold = (std::max)(serial_scan_threshold, uint_(compute_units)); size_t count = detail::iterator_range_size(first, last); if(count == 0){ return result; } else if(count < serial_scan_threshold) { return serial_scan(first, last, result, exclusive, init, op, queue); } buffer block_partial_sums(context, sizeof(output_type) * compute_units ); // create scan kernel meta_kernel k("scan_on_cpu_block_scan"); // Arguments size_t count_arg = k.add_arg<uint_>("count"); size_t init_arg = k.add_arg<output_type>("initial_value"); size_t block_partial_sums_arg = k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums"); k << "uint block = " << "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" << "uint index = get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n"; if(!exclusive){ k << k.decl<output_type>("sum") << " = " << first[k.var<uint_>("index")] << ";\n" << result[k.var<uint_>("index")] << " = sum;\n" << "index++;\n"; } else { k << k.decl<output_type>("sum") << ";\n" << "if(index == 0){\n" << "sum = initial_value;\n" << "}\n" << "else {\n" << "sum = " << first[k.var<uint_>("index")] << ";\n" << "index++;\n" << "}\n"; } k << "while(index < end){\n" << // load next value k.decl<const input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n"; if(exclusive){ k << "if(get_global_id(0) == 0){\n" << result[k.var<uint_>("index")] << " = sum;\n" << "}\n"; } k << "sum = " << op(k.var<output_type>("sum"), k.var<output_type>("value")) << ";\n"; if(!exclusive){ k << "if(get_global_id(0) == 0){\n" << result[k.var<uint_>("index")] << " = sum;\n" << "}\n"; } k << "index++;\n" << "}\n" << // end while "block_partial_sums[get_global_id(0)] = sum;\n"; // compile scan kernel kernel block_scan_kernel = k.compile(context); // setup kernel arguments block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count)); block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init)); block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums); // execute the kernel size_t global_work_size = compute_units; queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0); // scan is done if(compute_units < 2) { return result + count; } // final scan kernel meta_kernel l("scan_on_cpu_final_scan"); // Arguments count_arg = l.add_arg<uint_>("count"); block_partial_sums_arg = l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums"); l << "uint block = " << "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" << "uint index = block + get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n" << k.decl<output_type>("sum") << " = block_partial_sums[0];\n" << "for(uint i = 0; i < get_global_id(0); i++) {\n" << "sum = " << op(k.var<output_type>("sum"), k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" << "}\n" << "while(index < end){\n"; if(exclusive){ l << l.decl<output_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" << result[k.var<uint_>("index")] << " = sum;\n" << "sum = " << op(k.var<output_type>("sum"), k.var<output_type>("value")) << ";\n"; } else { l << "sum = " << op(k.var<output_type>("sum"), first[k.var<uint_>("index")]) << ";\n" << result[k.var<uint_>("index")] << " = sum;\n"; } l << "index++;\n" << "}\n"; // compile scan kernel kernel final_scan_kernel = l.compile(context); // setup kernel arguments final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count)); final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums); // execute the kernel global_work_size = compute_units; queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0); // return iterator pointing to the end of the result range return result + count; }
size_t reduce(InputIterator first, size_t count, OutputIterator result, size_t block_size, BinaryFunction function, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename boost::compute::result_of<BinaryFunction(input_type, input_type)>::type result_type; const context &context = queue.get_context(); size_t block_count = count / 2 / block_size; size_t total_block_count = static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); if(block_count != 0){ meta_kernel k("block_reduce"); size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); k << "const uint gid = get_global_id(0);\n" << "const uint lid = get_local_id(0);\n" << // copy values to local memory "block[lid] = " << function(first[k.make_var<uint_>("gid*2+0")], first[k.make_var<uint_>("gid*2+1")]) << ";\n" << // perform reduction "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" << " barrier(CLK_LOCAL_MEM_FENCE);\n" << " uint mask = (i << 1) - 1;\n" << " if((lid & mask) == 0){\n" << " block[lid] = " << function(k.expr<input_type>("block[lid]"), k.expr<input_type>("block[lid+i]")) << ";\n" << " }\n" << "}\n" << // write block result to global output "if(lid == 0)\n" << " output[get_group_id(0)] = block[0];\n"; kernel kernel = k.compile(context); kernel.set_arg(output_arg, result.get_buffer()); kernel.set_arg(block_arg, local_buffer<input_type>(block_size)); queue.enqueue_1d_range_kernel(kernel, 0, block_count * block_size, block_size); } // serially reduce any leftovers if(block_count * block_size * 2 < count){ size_t last_block_start = block_count * block_size * 2; meta_kernel k("extra_serial_reduce"); size_t count_arg = k.add_arg<uint_>("count"); size_t offset_arg = k.add_arg<uint_>("offset"); size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); size_t output_offset_arg = k.add_arg<uint_>("output_offset"); k << k.decl<result_type>("result") << " = \n" << first[k.expr<uint_>("offset")] << ";\n" << "for(uint i = offset + 1; i < count; i++)\n" << " result = " << function(k.var<result_type>("result"), first[k.var<uint_>("i")]) << ";\n" << "output[output_offset] = result;\n"; kernel kernel = k.compile(context); kernel.set_arg(count_arg, static_cast<uint_>(count)); kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start)); kernel.set_arg(output_arg, result.get_buffer()); kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count)); queue.enqueue_task(kernel); } return total_block_count; }
inline void find_extrema_with_reduce(InputIterator input, vector<uint_>::iterator input_idx, size_t count, ResultIterator result, vector<uint_>::iterator result_idx, size_t work_groups_no, size_t work_group_size, Compare compare, const bool find_minimum, const bool use_input_idx, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; const context &context = queue.get_context(); meta_kernel k("find_extrema_reduce"); size_t count_arg = k.add_arg<uint_>("count"); size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx"); k << // Work item global id k.decl<const uint_>("gid") << " = get_global_id(0);\n" << // Index of element that will be read from input buffer k.decl<uint_>("idx") << " = gid;\n" << k.decl<input_type>("acc") << ";\n" << k.decl<uint_>("acc_idx") << ";\n" << "if(gid < count) {\n" << // Real index of currently best element "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << "#else\n" << k.var<uint_>("acc_idx") << " = idx;\n" << "#endif\n" << // Init accumulator with first[get_global_id(0)] "acc = " << input[k.var<uint_>("idx")] << ";\n" << "idx += get_global_size(0);\n" << "}\n" << k.decl<bool>("compare_result") << ";\n" << k.decl<bool>("equal") << ";\n\n" << "while( idx < count ){\n" << // Next element k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" << "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << "#endif\n" << // Comparison between currently best element (acc) and next element "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << "compare_result = " << compare(k.var<input_type>("next"), k.var<input_type>("acc")) << ";\n" << "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << "equal = !compare_result && !" << compare(k.var<input_type>("acc"), k.var<input_type>("next")) << ";\n" << "# endif\n" << "#else\n" << "compare_result = " << compare(k.var<input_type>("acc"), k.var<input_type>("next")) << ";\n" << "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << "equal = !compare_result && !" << compare(k.var<input_type>("next"), k.var<input_type>("acc")) << ";\n" << "# endif\n" << "#endif\n" << // save the winner "acc = compare_result ? acc : next;\n" << "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << "acc_idx = compare_result ? " << "acc_idx : " << "(equal ? min(acc_idx, next_idx) : next_idx);\n" << "#else\n" << "acc_idx = compare_result ? acc_idx : idx;\n" << "#endif\n" << "idx += get_global_size(0);\n" << "}\n\n" << // Work item local id k.decl<const uint_>("lid") << " = get_local_id(0);\n" << "block[lid] = acc;\n" << "block_idx[lid] = acc_idx;\n" << "barrier(CLK_LOCAL_MEM_FENCE);\n" << k.decl<uint_>("group_offset") << " = count - (get_local_size(0) * get_group_id(0));\n\n"; k << "#pragma unroll\n" "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " << "offset = offset / 2) {\n" << "if((lid < offset) && ((lid + offset) < group_offset)) { \n" << k.decl<input_type>("mine") << " = block[lid];\n" << k.decl<input_type>("other") << " = block[lid+offset];\n" << "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << "compare_result = " << compare(k.var<input_type>("other"), k.var<input_type>("mine")) << ";\n" << "equal = !compare_result && !" << compare(k.var<input_type>("mine"), k.var<input_type>("other")) << ";\n" << "#else\n" << "compare_result = " << compare(k.var<input_type>("mine"), k.var<input_type>("other")) << ";\n" << "equal = !compare_result && !" << compare(k.var<input_type>("other"), k.var<input_type>("mine")) << ";\n" << "#endif\n" << "block[lid] = compare_result ? mine : other;\n" << k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" << k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" << "block_idx[lid] = compare_result ? " << "mine_idx : " << "(equal ? min(mine_idx, other_idx) : other_idx);\n" << "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" << "}\n\n" << // write block result to global output "if(lid == 0){\n" << result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" << result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" << "}"; std::string options; if(!find_minimum){ options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; } if(use_input_idx){ options += " -DBOOST_COMPUTE_USE_INPUT_IDX"; } kernel kernel = k.compile(context, options); kernel.set_arg(count_arg, static_cast<uint_>(count)); kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size)); kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size)); queue.enqueue_1d_range_kernel(kernel, 0, work_groups_no * work_group_size, work_group_size); }
inline void inplace_reduce(Iterator first, Iterator last, BinaryFunction function, command_queue &queue) { typedef typename std::iterator_traits<Iterator>::value_type value_type; size_t input_size = iterator_range_size(first, last); if(input_size < 2){ return; } const context &context = queue.get_context(); size_t block_size = 64; size_t values_per_thread = 8; size_t block_count = input_size / (block_size * values_per_thread); if(block_count * block_size * values_per_thread != input_size) block_count++; vector<value_type> output(block_count, context); meta_kernel k("inplace_reduce"); size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input"); size_t input_size_arg = k.add_arg<const uint_>("input_size"); size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output"); size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch"); k << "const uint gid = get_global_id(0);\n" << "const uint lid = get_local_id(0);\n" << "const uint values_per_thread =\n" << uint_(values_per_thread) << ";\n" << // thread reduce "const uint index = gid * values_per_thread;\n" << "if(index < input_size){\n" << k.decl<value_type>("sum") << " = input[index];\n" << "for(uint i = 1;\n" << "i < values_per_thread && (index + i) < input_size;\n" << "i++){\n" << " sum = " << function(k.var<value_type>("sum"), k.var<value_type>("input[index+i]")) << ";\n" << "}\n" << "scratch[lid] = sum;\n" << "}\n" << // local reduce "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" << " barrier(CLK_LOCAL_MEM_FENCE);\n" << " uint mask = (i << 1) - 1;\n" << " uint next_index = (gid + i) * values_per_thread;\n" " if((lid & mask) == 0 && next_index < input_size){\n" << " scratch[lid] = " << function(k.var<value_type>("scratch[lid]"), k.var<value_type>("scratch[lid+i]")) << ";\n" << " }\n" << "}\n" << // write output for block "if(lid == 0){\n" << " output[get_group_id(0)] = scratch[0];\n" << "}\n" ; const buffer *input_buffer = &first.get_buffer(); const buffer *output_buffer = &output.get_buffer(); kernel kernel = k.compile(context); while(input_size > 1){ kernel.set_arg(input_arg, *input_buffer); kernel.set_arg(input_size_arg, static_cast<uint_>(input_size)); kernel.set_arg(output_arg, *output_buffer); kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size)); queue.enqueue_1d_range_kernel(kernel, 0, block_count * block_size, block_size); input_size = static_cast<size_t>( std::ceil(float(input_size) / (block_size * values_per_thread) ) ); block_count = input_size / (block_size * values_per_thread); if(block_count * block_size * values_per_thread != input_size) block_count++; std::swap(input_buffer, output_buffer); } if(input_buffer != &first.get_buffer()){ ::boost::compute::copy(output.begin(), output.begin() + 1, first, queue); } }