/// Select best launch configuration for the given shared memory requirements.
void config(const command_queue &q, std::function<size_t(size_t)> smem) {
    // Select workgroup size that would fit into the device.
    size_t ws = q.device().max_threads_per_block() / 2;

    size_t max_ws   = max_threads_per_block(q);
    size_t max_smem = max_shared_memory_per_block(q);

    // Reduce workgroup size until it satisfies resource requirements:
    while( (ws > max_ws) || (smem(ws) > max_smem) )
        ws /= 2;

    config(num_workgroups(q), ws);
}
/// Select best launch configuration for the given shared memory requirements.
void config(const cl::CommandQueue &queue, std::function<size_t(size_t)> smem) {
    cl::Device dev = queue.getInfo<CL_QUEUE_DEVICE>();

    if ( is_cpu(queue) ) {
        w_size = 1;
    } else {
        // Select workgroup size that would fit into the device.
        w_size = dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0] / 2;

        size_t max_ws   = max_threads_per_block(queue);
        size_t max_smem = max_shared_memory_per_block(queue);

        // Reduce workgroup size until it satisfies resource requirements:
        while( (w_size > max_ws) || (smem(w_size) > max_smem) )
            w_size /= 2;
    }

    g_size = w_size * num_workgroups(queue);
}
/// Select best launch configuration for the given shared memory requirements.
void config(const boost::compute::command_queue &queue, std::function<size_t(size_t)> smem) {
    boost::compute::device dev = queue.get_device();

    size_t ws;

    if ( is_cpu(queue) ) {
        ws = 1;
    } else {
        // Select workgroup size that would fit into the device.
        ws = dev.get_info<std::vector<size_t>>(CL_DEVICE_MAX_WORK_ITEM_SIZES)[0] / 2;

        size_t max_ws   = max_threads_per_block(queue);
        size_t max_smem = max_shared_memory_per_block(queue);

        // Reduce workgroup size until it satisfies resource requirements:
        while( (ws > max_ws) || (smem(ws) > max_smem) )
            ws /= 2;
    }

    config(num_workgroups(queue), ws);
}
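In each of the three variants above, the smem functor maps a candidate workgroup size to the number of bytes of shared (local) memory the kernel would need at that size. A minimal sketch of a call site, assuming a reduction-style kernel that keeps one double of local memory per work-item (the launch_config holder and the surrounding setup are hypothetical, not taken from the listings):

    launch_config cfg; // hypothetical object exposing the config() methods above

    cfg.config(queue, [](size_t ws) {
        // Bytes of local memory required at workgroup size ws:
        // one partial sum per work-item.
        return ws * sizeof(double);
    });

The loop halving the workgroup size then shrinks the launch until both the device's thread limit and this per-workgroup memory budget are satisfied.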
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<ExprTuple>::type,
        multivector_full_grammar
    >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
    && N == std::tuple_size<ExprTuple>::value
#endif
    ,
    const multivector&
>::type
operator=(const ExprTuple &expr) {
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device  device  = qdev(queue[d]);

        // Generate, compile, and cache the fused kernel on first use
        // with this context.
        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            std::ostringstream source;

            source << standard_kernel_header(device);

            {
                get_header f(source);
                for_each<0>(expr, f);
            }

            source << "kernel void multi_expr_tuple(\n"
                      "\t" << type_name<size_t>() << " n";

            for(uint i = 1; i <= N; i++)
                source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

            {
                get_params f(source);
                for_each<0>(expr, f);
            }

            source << "\n)\n{\n";

            // On CPU devices each work-item processes a contiguous chunk;
            // elsewhere a grid-stride loop is generated.
            if ( is_cpu(device) ) {
                source <<
                    "\tsize_t chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                    "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                    "\tsize_t chunk_end = min(n, chunk_start + chunk_size);\n"
                    "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
            } else {
                source <<
                    "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
            }

            {
                get_expressions f(source);
                for_each<0>(expr, f);
            }

            source << "\n";

            for(uint i = 1; i <= N; i++)
                source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, "multi_expr_tuple");
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        // Launch the kernel on each device that holds a non-empty partition.
        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, (*vec[i])(d));

            {
                set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                for_each<0>(expr, f);
            }

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
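The cache types are visible here only through their use. A minimal sketch consistent with that use (the exact layout is an assumption) pairs each compiled kernel with its preferred workgroup size and keys the map on the raw OpenCL context handle, so source generation and compilation happen at most once per context:

    #include <map>

    struct kernel_cache_entry {
        cl::Kernel kernel; // compiled kernel object
        size_t     wgsize; // size reported by kernel_workgroup_size()

        kernel_cache_entry(const cl::Kernel &k, size_t w)
            : kernel(k), wgsize(w) {}
    };

    // cl::Context::operator() yields the underlying cl_context handle,
    // which is what cache.find( context() ) above looks up.
    typedef std::map<cl_context, kernel_cache_entry> kernel_cache;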
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        multivector_expr_grammar
    >::value,
    const multivector&
>::type
operator=(const Expr& expr) {
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    // If any device in context is CPU, then do not fuse the kernel,
    // but assign components individually.
    if (std::any_of(queue.begin(), queue.end(),
                [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); }))
    {
        assign_subexpressions<0, N>(boost::proto::as_child(expr));
        return *this;
    }

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device  device  = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            // Derive a unique kernel name from the expression structure.
            std::ostringstream kernel_name;
            kernel_name << "multi_";
            vector_name_context name_ctx(kernel_name);
            boost::proto::eval(boost::proto::as_child(expr), name_ctx);

            std::ostringstream source;
            source << standard_kernel_header(device);

            extract_user_functions()(
                    boost::proto::as_child(expr),
                    declare_user_function(source)
                    );

            source << "kernel void " << kernel_name.str() << "(\n\t"
                   << type_name<size_t>() << " n";

            for(size_t i = 0; i < N; )
                source << ",\n\tglobal " << type_name<T>() << " *res_" << ++i;

            build_param_list<N>(boost::proto::as_child(expr), source);

            source << "\n)\n{\n"
                      "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

            build_expr_list(boost::proto::as_child(expr), source);

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, kernel_name.str().c_str());
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

            set_kernel_args<N>(
                    boost::proto::as_child(expr),
                    kernel->second.kernel, d, pos, vec[0]->part_start(d)
                    );

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
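From the user's side, the two assignment operators cover one fused expression applied to all components, and a tuple of N distinct expressions assigned component-wise. A hedged usage sketch (the multivector front end, ctx, n, and the component accessor x(0) are assumptions, not taken from the listings):

    multivector<double, 2> x(ctx, n), y(ctx, n);

    // One expression broadcast to every component: handled by the
    // multivector_expr_grammar overload, one fused kernel per device.
    y = 2 * x + 1;

    // A tuple of distinct per-component expressions: handled by the
    // ExprTuple overload, still fused into a single generated kernel.
    y = std::tie(x(0) + x(1), x(0) - x(1));

In both cases the generated kernel is compiled once per context and re-launched afterwards; only when a CPU device is present does the second overload fall back to component-wise assignment.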