// Builds (or fetches from cache) the single-work-item kernel that scans the
// expression over [0, n) and stores 1 into result[0] at the first index where
// the expression is true, or 0 if no index matches.
static backend::kernel& make_kernel( const backend::command_queue &q, const Expr &expr ) {
    using namespace detail;

    // One compiled kernel per queue, built lazily on first use.
    static kernel_cache cache;

    auto cached = cache.find(q);

    if (cached == cache.end()) {
        backend::source_generator src(q);

        // Preamble required by the expression terminals goes before the kernel.
        output_terminal_preamble term_preamble(src, q, "prm", empty_state());
        boost::proto::eval(boost::proto::as_child(expr), term_preamble);

        src.begin_kernel("vexcl_any_of_kernel");
        src.begin_kernel_parameters();
        src.parameter<size_t>("n");

        // One kernel parameter per terminal in the expression.
        extract_terminals()(expr,
                declare_expression_parameter(src, q, "prm", empty_state()));

        src.template parameter< global_ptr<char> >("result");
        src.end_kernel_parameters();

        // Sequential scan with early exit; the kernel is launched with a
        // single work item (see config(1, 1) below).
        src.new_line() << "for(ulong idx = 0; idx < n; ++idx)";
        src.open("{");

        output_local_preamble local_preamble(src, q, "prm", empty_state());
        boost::proto::eval(boost::proto::as_child(expr), local_preamble);

        src.new_line() << "if (";

        vector_expr_context ctx(src, q, "prm", empty_state());
        boost::proto::eval(boost::proto::as_child(expr), ctx);

        src << ")";
        src.open("{");
        src.new_line() << "result[0] = 1;";
        src.new_line() << "return;";
        src.close("}");
        src.close("}");
        src.new_line() << "result[0] = 0;";
        src.end_kernel();

        cached = cache.insert(q, backend::kernel(
                    q, src.str(), "vexcl_any_of_kernel"));

        cached->second.config(1, 1);
    }

    return cached->second;
}
// Assigns a tuple of N scalar expressions to the N components of the
// multivector with one fused kernel: a single pass over the data reads all
// inputs and writes res_1..res_N.
// SFINAE: enabled only when the argument does NOT match
// multivector_full_grammar; except on MSVC older than 1700 it also requires
// the tuple arity to equal N (presumably skipped there due to limited
// std::tuple support -- TODO confirm).
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<ExprTuple>::type,
        multivector_full_grammar
    >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
    && N == std::tuple_size<ExprTuple>::value
#endif
    , const multivector&
>::type
operator=(const ExprTuple &expr) {
#endif
// NOTE(review): the #endif above matches a preprocessor conditional opened
// before this block; it appears to select between compiler-specific
// declarations of this operator -- confirm against the full file.

    // One cached kernel per OpenCL context, built on first use.
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    // Generate/build/launch the fused kernel on every device in the list.
    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device device = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            std::ostringstream source;
            source << standard_kernel_header(device);

            // Headers (e.g. function definitions) required by the expressions.
            {
                get_header f(source);
                for_each<0>(expr, f);
            }

            // Kernel signature: element count plus one output pointer per
            // component...
            source << "kernel void multi_expr_tuple(\n"
                "\t" << type_name<size_t>() << " n";

            for(uint i = 1; i <= N; i++)
                source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

            // ...plus the parameters of every expression in the tuple.
            {
                get_params f(source);
                for_each<0>(expr, f);
            }

            source << "\n)\n{\n";

            // CPU devices get one contiguous chunk per work item; other
            // devices get a loop striding by the global size.
            if ( is_cpu(device) ) {
                source <<
                    "\tsize_t chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                    "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                    "\tsize_t chunk_end = min(n, chunk_start + chunk_size);\n"
                    "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
            } else {
                source <<
                    "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
            }

            // Expression bodies; presumably each defines its buf_<i>
            // temporary consumed by the stores below -- see get_expressions.
            {
                get_expressions f(source);
                for_each<0>(expr, f);
            }

            source << "\n";

            // Store all computed values after all expressions are evaluated.
            for(uint i = 1; i <= N; i++)
                source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, "multi_expr_tuple");
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        // Launch only if this device owns a non-empty partition.
        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            // Output buffers first...
            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, (*vec[i])(d));

            // ...then the arguments of the expression terminals.
            {
                set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                for_each<0>(expr, f);
            }

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
// Assigns a multivector expression to this multivector with one fused kernel
// per context; enabled only for expressions matching multivector_expr_grammar.
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        multivector_expr_grammar
    >::value,
    const multivector&
>::type
operator=(const Expr& expr) {
    // One compiled kernel per OpenCL context.
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    // If any device in context is CPU, then do not fuse the kernel,
    // but assign components individually.
    bool have_cpu = std::any_of(queue.begin(), queue.end(),
            [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); });

    if (have_cpu) {
        assign_subexpressions<0, N>(boost::proto::as_child(expr));
        return *this;
    }

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device  device  = qdev(queue[d]);

        auto cached = cache.find( context() );

        if (cached == cache.end()) {
            // Kernel name encodes the expression structure.
            std::ostringstream kernel_name;
            kernel_name << "multi_";
            vector_name_context name_ctx(kernel_name);
            boost::proto::eval(boost::proto::as_child(expr), name_ctx);

            std::ostringstream source;
            source << standard_kernel_header(device);

            // Declarations for any user functions in the expression.
            extract_user_functions()(
                    boost::proto::as_child(expr),
                    declare_user_function(source)
                    );

            // Signature: element count, one output pointer per component,
            // then the expression parameters.
            source << "kernel void " << kernel_name.str()
                   << "(\n\t" << type_name<size_t>() << " n";

            for(size_t i = 1; i <= N; ++i)
                source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

            build_param_list<N>(boost::proto::as_child(expr), source);

            source << "\n)\n{\n"
                "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

            build_expr_list(boost::proto::as_child(expr), source);

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, kernel_name.str().c_str());
            size_t wgs = kernel_workgroup_size(krn, device);

            cached = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        // Launch only when this device owns a non-empty partition.
        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = cached->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            cached->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                cached->second.kernel.setArg(pos++, (*vec[i])(d));

            set_kernel_args<N>(
                    boost::proto::as_child(expr),
                    cached->second.kernel, d, pos, vec[0]->part_start(d)
                    );

            queue[d].enqueueNDRangeKernel(
                    cached->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
void eval(const Expr &expr, const std::vector<backend::command_queue> &queue, const std::vector<size_t> &part ) { using namespace vex::detail; #if (VEXCL_CHECK_SIZES > 0) { get_expression_properties prop; extract_terminals()(boost::proto::as_child(expr), prop); precondition( prop.queue.empty() || prop.queue.size() == queue.size(), "Incompatible queue lists" ); precondition( prop.size == 0 || prop.size == part.back(), "Incompatible expression sizes" ); } #endif static kernel_cache cache; for(unsigned d = 0; d < queue.size(); d++) { auto kernel = cache.find(queue[d]); backend::select_context(queue[d]); if (kernel == cache.end()) { backend::source_generator source(queue[d]); output_terminal_preamble termpream(source, queue[d], "prm", empty_state()); boost::proto::eval(boost::proto::as_child(expr), termpream); source.begin_kernel("vexcl_eval_kernel"); source.begin_kernel_parameters(); source.parameter<size_t>("n"); declare_expression_parameter declare(source, queue[d], "prm", empty_state()); extract_terminals()(boost::proto::as_child(expr), declare); source.end_kernel_parameters(); source.grid_stride_loop().open("{"); output_local_preamble loc_init(source, queue[d], "prm", empty_state()); boost::proto::eval(boost::proto::as_child(expr), loc_init); source.new_line(); vector_expr_context expr_ctx(source, queue[d], "prm", empty_state()); boost::proto::eval(boost::proto::as_child(expr), expr_ctx); source << ";"; source.close("}"); source.end_kernel(); kernel = cache.insert(queue[d], backend::kernel( queue[d], source.str(), "vexcl_eval_kernel")); } if (size_t psize = part[d + 1] - part[d]) { auto &K = kernel->second; K.push_arg(psize); set_expression_argument setarg(K, d, part[d], empty_state()); extract_terminals()( boost::proto::as_child(expr), setarg); K(queue[d]); } } }