/// Emit the typedefs (real_t / real2_t) shared by all generated kernels.
///
/// When the component type T is cl_double, the standard kernel header for
/// the device is written first; the single-precision branch emits only the
/// typedefs. NOTE(review): presumably the header carries the fp64 pragma,
/// which float kernels do not need — confirm against standard_kernel_header.
inline void kernel_common(std::ostringstream &o, const cl::Device& device) {
    const bool uses_double = std::is_same<T, cl_double>::value;

    if (uses_double) {
        o << standard_kernel_header(device)
          << "typedef double real_t;\n"
          << "typedef double2 real2_t;\n";
    } else {
        o << "typedef float real_t;\n"
          << "typedef float2 real2_t;\n";
    }
}
// Assign a tuple of expressions to the multivector: a single fused kernel
// computes every component in one pass over the data.
//
// SFINAE guard: enabled only when ExprTuple does NOT match the full
// multivector grammar (those assignments are handled by the other
// operator= overload) and — on compilers with usable std::tuple_size
// (non-MSVC, or MSVC >= VS2012) — when the tuple arity equals the number
// of components N. The trailing #endif closes a conditional opened
// earlier in the file (outside this excerpt).
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<ExprTuple>::type,
        multivector_full_grammar
    >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
    && N == std::tuple_size<ExprTuple>::value
#endif
    , const multivector&
>::type
operator=(const ExprTuple &expr) {
#endif
    // One compiled kernel per OpenCL context, built lazily on first use
    // and reused for subsequent assignments.
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    // Each queue/device owns one partition of the data; build (if needed)
    // and launch the kernel once per device.
    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device device = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            // Not cached yet: generate OpenCL source for this tuple.
            std::ostringstream source;
            source << standard_kernel_header(device);

            // Let each tuple element contribute its preamble
            // (helper declarations) to the source.
            {
                get_header f(source);
                for_each<0>(expr, f);
            }

            source << "kernel void multi_expr_tuple(\n"
                "\t" << type_name<size_t>() << " n";

            // One output buffer per component: res_1 .. res_N.
            for(uint i = 1; i <= N; i++)
                source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

            // Each tuple element appends its own kernel parameters.
            {
                get_params f(source);
                for_each<0>(expr, f);
            }

            source << "\n)\n{\n";

            if ( is_cpu(device) ) {
                // CPU device: give each work-item a contiguous chunk of
                // indices instead of a strided loop.
                source <<
                    "\tsize_t chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                    "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                    "\tsize_t chunk_end = min(n, chunk_start + chunk_size);\n"
                    "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
            } else {
                // GPU-style device: loop strided by the global work size.
                source <<
                    "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
            }

            // Emit the per-component expression bodies; by convention the
            // generated code leaves results in buf_1 .. buf_N (written
            // back below) — see get_expressions.
            {
                get_expressions f(source);
                for_each<0>(expr, f);
            }

            source << "\n";

            // Store each component's result into its output buffer.
            for(uint i = 1; i <= N; i++)
                source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

            source << "\t}\n}\n";

            // Compile and cache the kernel together with its preferred
            // workgroup size for this device.
            auto program = build_sources(context, source.str());
            cl::Kernel krn(program, "multi_expr_tuple");
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        // Launch only when this device's partition is non-empty.
        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            // Output buffers for this device's partition of each component.
            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, (*vec[i])(d));

            // Remaining arguments come from the expressions themselves,
            // offset by the partition start.
            {
                set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                for_each<0>(expr, f);
            }

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
/// Construct a source generator bound to the given command queue.
///
/// Starts at zero indentation with no parameters emitted yet, and records
/// whether the queue's device is a CPU (used later to pick the generated
/// loop shape). The standard kernel header is written into the source
/// buffer immediately.
source_generator(const command_queue &queue)
    : indent(0),
      first_prm(true),
      cpu( is_cpu(queue) )
{
    src << standard_kernel_header(queue);
}
// Assign a single multivector expression to all N components with one
// fused kernel launch per device.
//
// SFINAE guard: enabled only when Expr matches the multivector expression
// grammar; other expression shapes are routed to different overloads.
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        multivector_expr_grammar
    >::value,
    const multivector&
>::type
operator=(const Expr& expr) {
    // One compiled kernel per OpenCL context, built lazily and reused.
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    // If any device in context is CPU, then do not fuse the kernel,
    // but assign components individually.
    if (std::any_of(queue.begin(), queue.end(),
                [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); }))
    {
        assign_subexpressions<0, N>(boost::proto::as_child(expr));
        return *this;
    }

    // Each queue/device owns one partition of the data.
    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device device = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            // Derive a kernel name from the expression structure so the
            // generated source is self-describing.
            std::ostringstream kernel_name;
            kernel_name << "multi_";
            vector_name_context name_ctx(kernel_name);
            boost::proto::eval(boost::proto::as_child(expr), name_ctx);

            std::ostringstream source;
            source << standard_kernel_header(device);

            // Declare any user-defined functions referenced by the
            // expression before the kernel body.
            extract_user_functions()(
                    boost::proto::as_child(expr),
                    declare_user_function(source)
                    );

            source << "kernel void " << kernel_name.str()
                << "(\n\t" << type_name<size_t>() << " n";

            // One output buffer per component: res_1 .. res_N.
            // (i is incremented inside the stream expression; it is the
            // only use of i there, so evaluation order is well-defined.)
            for(size_t i = 0; i < N; )
                source << ",\n\tglobal " << type_name<T>() << " *res_" << ++i;

            // Parameters contributed by the expression terminals.
            build_param_list<N>(boost::proto::as_child(expr), source);

            // GPU-style strided loop over the partition.
            source <<
                "\n)\n{\n"
                "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

            // Per-component assignment statements for the loop body.
            build_expr_list(boost::proto::as_child(expr), source);

            source << "\t}\n}\n";

            // Compile and cache the kernel with its workgroup size.
            auto program = build_sources(context, source.str());
            cl::Kernel krn(program, kernel_name.str().c_str());
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        // Launch only when this device's partition is non-empty.
        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            // Output buffers for this device's partition of each component.
            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

            // Remaining arguments come from the expression terminals,
            // offset by the partition start.
            set_kernel_args<N>(
                    boost::proto::as_child(expr),
                    kernel->second.kernel, d, pos, vec[0]->part_start(d)
                    );

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}