/// Generates (and caches per device context) the "key_value_mapping" kernel.
///
/// The emitted kernel compacts a scanned keys/values sequence: for every
/// position where the running offset changes (and for the last element) it
/// copies the key components and the value into the output arrays at the
/// slot given by that offset.  K is the MPL sequence of key component types.
backend::kernel key_value_mapping(const backend::command_queue &queue) {
    static detail::kernel_cache compiled;

    auto key = backend::cache_key(queue);
    auto hit = compiled.find(key);

    if (hit == compiled.end()) {
        backend::source_generator gen(queue);

        gen.kernel("key_value_mapping")
            .open("(")
            .template parameter< size_t >("n");

        // One input/output pointer pair per key component.
        boost::mpl::for_each<K>(pointer_param<global_ptr, true>(gen, "ikeys"));
        boost::mpl::for_each<K>(pointer_param<global_ptr >(gen, "okeys"));

        gen.template parameter< global_ptr<V> >("ovals");
        gen.template parameter< global_ptr<int> >("offset");
        gen.template parameter< global_ptr<const V> >("ivals");

        gen.close(")").open("{");
        gen.new_line().grid_stride_loop().open("{");

        gen.new_line() << "int num_sections = offset[n - 1] + 1;";
        gen.new_line() << "int off = offset[idx];";

        // Interior boundary: offset changes at the next element.
        gen.new_line() << "if (idx < (n - 1) && off != offset[idx + 1])";
        gen.open("{");
        for (int c = 0; c < boost::mpl::size<K>::value; ++c)
            gen.new_line() << "okeys" << c << "[off] = ikeys" << c << "[idx];";
        gen.new_line() << "ovals[off] = ivals[idx];";
        gen.close("}");

        // Final element always closes the last section.
        gen.new_line() << "if (idx == (n - 1))";
        gen.open("{");
        for (int c = 0; c < boost::mpl::size<K>::value; ++c)
            gen.new_line() << "okeys" << c << "[num_sections - 1] = ikeys" << c << "[idx];";
        gen.new_line() << "ovals[num_sections - 1] = ivals[idx];";
        gen.close("}");

        gen.close("}");
        gen.close("}");

        backend::kernel built(queue, gen.str(), "key_value_mapping");
        hit = compiled.insert(std::make_pair(key, built)).first;
    }

    return hit->second;
}
/// Generates (and caches per device context) the "block_sum_by_key" kernel.
///
/// The emitted kernel folds the scanned per-block partial sums (post_sum)
/// into each work item's output when the element continues the key segment
/// started in the previous block.  Oper supplies the binary operation,
/// emitted under the name "oper".
backend::kernel block_sum_by_key(const backend::command_queue &queue) {
    static detail::kernel_cache compiled;

    auto key = backend::cache_key(queue);
    auto hit = compiled.find(key);

    if (hit == compiled.end()) {
        backend::source_generator gen(queue);

        Oper::define(gen, "oper");

        gen.kernel("block_sum_by_key")
            .open("(")
            .template parameter< size_t >("n")
            .template parameter< global_ptr<const int> >("key_sum")
            .template parameter< global_ptr<const T> >("post_sum")
            .template parameter< global_ptr<const int> >("keys")
            .template parameter< global_ptr<T> >("output")
            .close(")").open("{");

        gen.new_line() << "size_t g_id = " << gen.global_id(0) << ";";
        gen.new_line() << "size_t block = " << gen.group_id(0) << ";";
        gen.new_line() << "if (g_id >= n) return;";

        // accumulate prefix; key1/key3 default to key2 - 1 so the
        // comparisons below are false at the sequence boundaries.
        gen.new_line() << "int key2 = keys[ g_id ];";
        gen.new_line() << "int key1 = (block > 0 ) ? key_sum[ block - 1 ] : key2 - 1;";
        gen.new_line() << "int key3 = (g_id < n - 1) ? keys [ g_id + 1 ] : key2 - 1;";

        gen.new_line() << "if (block > 0 && key1 == key2 && key2 != key3)";
        gen.open("{");
        gen.new_line() << type_name<T>() << " scan_result = output [ g_id ];";
        gen.new_line() << type_name<T>() << " post_block_sum = post_sum[ block - 1 ];";
        gen.new_line() << "output[ g_id ] = oper( scan_result, post_block_sum );";
        gen.close("}");

        gen.close("}");

        backend::kernel built(queue, gen.str(), "block_sum_by_key");
        hit = compiled.insert(std::make_pair(key, built)).first;
    }

    return hit->second;
}
/// Generates (and caches per device context) the "offset_calculation" kernel.
///
/// The emitted kernel marks key-segment starts: offsets[idx] is 1 when the
/// (multi-component) key at idx differs from the key at idx - 1 under the
/// user comparator (emitted as "comp"), 0 otherwise.  Here T is the MPL
/// sequence of key component types.
backend::kernel offset_calculation(const backend::command_queue &queue) {
    static detail::kernel_cache compiled;

    auto key = backend::cache_key(queue);
    auto hit = compiled.find(key);

    if (hit == compiled.end()) {
        backend::source_generator gen(queue);

        Comp::define(gen, "comp");

        gen.kernel("offset_calculation")
            .open("(")
            .template parameter< size_t >("n");

        boost::mpl::for_each<T>(pointer_param<global_ptr, true>(gen, "keys"));

        gen.template parameter< global_ptr<int> >("offsets");
        gen.close(")").open("{");

        gen.new_line().grid_stride_loop().open("{");

        // comp(prev..., curr...) — pass every component of the previous
        // key, then every component of the current key.
        gen.new_line() << "if (idx > 0)" << " offsets[idx] = !comp(";
        for (int c = 0; c < boost::mpl::size<T>::value; ++c)
            gen << (c ? ", " : "") << "keys" << c << "[idx - 1]";
        for (int c = 0; c < boost::mpl::size<T>::value; ++c)
            gen << ", keys" << c << "[idx]";
        gen << ");";
        gen.new_line() << "else offsets[idx] = 0;";

        gen.close("}");
        gen.close("}");

        backend::kernel built(queue, gen.str(), "offset_calculation");
        hit = compiled.insert(std::make_pair(key, built)).first;
    }

    return hit->second;
}
/// Generates (and caches per device context) the "block_inclusive_scan_by_key"
/// kernel.
///
/// The emitted kernel performs an inclusive scan-by-key of pre_sum into
/// post_sum: each work item first serially scans its own chunk of
/// work_per_thread elements, then the per-item totals are scanned in local
/// memory with a doubling-offset loop, and finally each chunk is fixed up
/// with the preceding items' totals.  Oper supplies the binary operation,
/// emitted under the name "oper"; NT is the work-group size used to size
/// the local-memory arrays.
backend::kernel block_inclusive_scan_by_key(const backend::command_queue &queue) {
    // One compiled kernel instance per device context.
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_inclusive_scan_by_key")
            .open("(")
            .template parameter< size_t >("n")
            .template parameter< global_ptr<const int> >("key_sum")
            .template parameter< global_ptr<const T> >("pre_sum")
            .template parameter< global_ptr<T> >("post_sum")
            .template parameter< cl_uint >("work_per_thread")
            .close(")").open("{");

        src.new_line() << "size_t l_id = " << src.local_id(0) << ";";
        src.new_line() << "size_t g_id = " << src.global_id(0) << ";";
        src.new_line() << "size_t wgsz = " << src.local_size(0) << ";";

        // Each work item owns work_per_thread consecutive elements
        // starting at map_id.
        src.new_line() << "size_t map_id = g_id * work_per_thread;";

        // Local (shared) memory: one key/value slot per work item.
        src.new_line() << "struct Shared";
        src.open("{");
        src.new_line() << "int keys[" << NT << "];";
        src.new_line() << type_name<T>() << " vals[" << NT << "];";
        src.close("};");
        src.smem_static_var("struct Shared", "shared");

        src.new_line() << "uint offset;";
        src.new_line() << "int key;";
        src.new_line() << type_name<T>() << " work_sum;";

        src.new_line() << "if (map_id < n)";
        src.open("{");
        src.new_line() << "int prev_key;";

        // accumulate zeroth value manually
        src.new_line() << "offset = 0;";
        src.new_line() << "key = key_sum[map_id];";
        src.new_line() << "work_sum = pre_sum[map_id];";
        src.new_line() << "post_sum[map_id] = work_sum;";

        // Serial accumulation over the rest of this item's chunk; the
        // running sum restarts whenever the key changes.
        src.new_line() << "for( offset = offset + 1; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line() << "prev_key = key;";
        // NOTE(review): key_sum is read here before the bounds check on
        // the next line — presumably the buffers are padded up to a full
        // chunk; confirm against the caller.
        src.new_line() << "key = key_sum[ map_id + offset ];";

        src.new_line() << "if ( map_id + offset < n )";
        src.open("{");
        src.new_line() << type_name<T>() << " y = pre_sum[ map_id + offset ];";
        src.new_line() << "if ( key == prev_key ) work_sum = oper( work_sum, y );";
        src.new_line() << "else work_sum = y;";
        src.new_line() << "post_sum[ map_id + offset ] = work_sum;";
        src.close("}");
        src.close("}");
        src.close("}");

        src.new_line().barrier();

        // load LDS with register sums
        src.new_line() << "shared.vals[ l_id ] = work_sum;";
        src.new_line() << "shared.keys[ l_id ] = key;";

        // scan in lds: doubling-offset scan of per-item totals, segmented
        // by key equality.
        src.new_line() << type_name<T>() << " scan_sum = work_sum;";
        src.new_line() << "for( offset = 1; offset < wgsz; offset *= 2 )";
        src.open("{");
        src.new_line().barrier();
        src.new_line() << "if (map_id < n)";
        src.open("{");
        src.new_line() << "if (l_id >= offset)";
        src.open("{");
        src.new_line() << "int key1 = shared.keys[ l_id ];";
        src.new_line() << "int key2 = shared.keys[ l_id - offset ];";
        src.new_line() << "if ( key1 == key2 ) scan_sum = oper( scan_sum, shared.vals[ l_id - offset ] );";
        src.new_line() << "else scan_sum = shared.vals[ l_id ];";
        src.close("}");
        src.close("}");
        src.new_line().barrier();
        src.new_line() << "shared.vals[ l_id ] = scan_sum;";
        src.close("}");

        src.new_line().barrier();

        // write final scan from pre-scan and lds scan: fold the previous
        // item's scanned total into each element of this item's chunk when
        // the keys match.
        src.new_line() << "for( offset = 0; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line().barrier(true);
        src.new_line() << "if (map_id < n && l_id > 0)";
        src.open("{");
        src.new_line() << type_name<T>() << " y = post_sum[ map_id + offset ];";
        src.new_line() << "int key1 = key_sum [ map_id + offset ];";
        src.new_line() << "int key2 = shared.keys[ l_id - 1 ];";
        src.new_line() << "if ( key1 == key2 ) y = oper( y, shared.vals[l_id - 1] );";
        src.new_line() << "post_sum[ map_id + offset ] = y;";
        src.close("}");
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_inclusive_scan_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
/// Generates (and caches per device context) the "block_scan_by_key" kernel.
///
/// The emitted kernel computes a work-group-local inclusive scan-by-key of
/// vals, writes each segment's last partial result to output, and stores
/// the work group's last key/scanned value into key_buf/val_buf for the
/// subsequent cross-block pass.  Oper supplies the binary operation
/// (emitted as "oper"); NT sizes the local-memory arrays.
backend::kernel block_scan_by_key(const backend::command_queue &queue) {
    // One compiled kernel instance per device context.
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_scan_by_key")
            .open("(")
            .template parameter< size_t >("n")
            .template parameter< global_ptr<const int> >("keys")
            .template parameter< global_ptr<const T> >("vals")
            .template parameter< global_ptr<T> >("output")
            .template parameter< global_ptr<int> >("key_buf")
            .template parameter< global_ptr<T> >("val_buf")
            .close(")").open("{");

        src.new_line() << "size_t l_id = " << src.local_id(0) << ";";
        src.new_line() << "size_t g_id = " << src.global_id(0) << ";";
        src.new_line() << "size_t block = " << src.group_id(0) << ";";
        src.new_line() << "size_t wgsz = " << src.local_size(0) << ";";

        // Local (shared) memory: one key/value slot per work item.
        src.new_line() << "struct Shared";
        src.open("{");
        src.new_line() << "int keys[" << NT << "];";
        src.new_line() << type_name<T>() << " vals[" << NT << "];";
        src.close("};");
        src.smem_static_var("struct Shared", "shared");

        // NOTE(review): key/val (and the shared slots of out-of-range
        // lanes) stay uninitialized when g_id >= n; those lanes bail out
        // below, but in-range lanes may still read the stale shared slots
        // during the scan — presumably harmless by construction; confirm.
        src.new_line() << "int key;";
        src.new_line() << type_name<T>() << " val;";

        src.new_line() << "if (g_id < n)";
        src.open("{");
        src.new_line() << "key = keys[g_id];";
        src.new_line() << "val = vals[g_id];";
        src.new_line() << "shared.keys[l_id] = key;";
        src.new_line() << "shared.vals[l_id] = val;";
        src.close("}");

        // Computes a scan within a workgroup; updates vals in lds but not
        // keys (doubling-offset loop, segmented by key equality).
        src.new_line() << type_name<T>() << " sum = val;";
        src.new_line() << "for(size_t offset = 1; offset < wgsz; offset *= 2)";
        src.open("{");
        src.new_line().barrier();
        src.new_line() << "if (l_id >= offset && shared.keys[l_id - offset] == key)";
        src.open("{");
        src.new_line() << "sum = oper(sum, shared.vals[l_id - offset]);";
        src.close("}");
        src.new_line().barrier();
        src.new_line() << "shared.vals[l_id] = sum;";
        src.close("}");

        src.new_line().barrier();
        src.new_line() << "if (g_id >= n) return;";

        // Each work item writes out its calculated scan result, relative
        // to the beginning of each work group — only at the last element
        // of its key segment.
        src.new_line() << "int key2 = -1;";
        src.new_line() << "if (g_id < n - 1) key2 = keys[g_id + 1];";
        src.new_line() << "if (key != key2) output[g_id] = sum;";

        // First lane records the group's trailing key and scanned value
        // for the cross-block pass.
        src.new_line() << "if (l_id == 0)";
        src.open("{");
        src.new_line() << "key_buf[block] = shared.keys[wgsz - 1];";
        src.new_line() << "val_buf[block] = shared.vals[wgsz - 1];";
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_scan_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
/// Assigns a tuple of expressions to the multivector's components in a
/// single fused OpenCL kernel ("multi_expr_tuple"), one launch per device
/// partition.  Enabled only when ExprTuple is NOT a full multivector
/// expression (and, on conforming compilers, when the tuple size matches N).
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<ExprTuple>::type,
        multivector_full_grammar
    >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
    && N == std::tuple_size<ExprTuple>::value
#endif
    , const multivector&
>::type
operator=(const ExprTuple &expr) {
// NOTE(review): this #endif closes an #if that begins before this chunk
// (apparently an MSVC-compatibility guard around the declaration) —
// confirm against the full file.
#endif
    // One compiled kernel per OpenCL context.
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device device = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            std::ostringstream source;
            source << standard_kernel_header(device);

            // Per-component preamble (e.g. helper declarations).
            {
                get_header f(source);
                for_each<0>(expr, f);
            }

            // Kernel signature: size, one result pointer per component,
            // then the parameters gathered from each expression.
            source << "kernel void multi_expr_tuple(\n"
                "\t" << type_name<size_t>() << " n";

            for(uint i = 1; i <= N; i++)
                source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

            {
                get_params f(source);
                for_each<0>(expr, f);
            }

            source << "\n)\n{\n";

            // CPU devices get one contiguous chunk per work item; others
            // use a global-size-strided loop over idx.
            if ( is_cpu(device) ) {
                source <<
                    "\tsize_t chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                    "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                    "\tsize_t chunk_end = min(n, chunk_start + chunk_size);\n"
                    "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
            } else {
                source <<
                    "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
            }

            // Expression bodies compute buf_i; results are then stored.
            {
                get_expressions f(source);
                for_each<0>(expr, f);
            }

            source << "\n";

            for(uint i = 1; i <= N; i++)
                source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, "multi_expr_tuple");
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        // Launch only for non-empty partitions on this device.
        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            // Argument order must mirror the generated signature:
            // n, res_1..res_N, then the expression parameters.
            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, (*vec[i])(d));

            {
                set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                for_each<0>(expr, f);
            }

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
/// Assigns a single multivector expression to all N components with one
/// fused OpenCL kernel per device partition.  Enabled only when Expr
/// matches the multivector expression grammar.
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        multivector_expr_grammar
    >::value,
    const multivector&
>::type
operator=(const Expr& expr) {
    // One compiled kernel per OpenCL context.
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    // If any device in context is CPU, then do not fuse the kernel,
    // but assign components individually.
    if (std::any_of(queue.begin(), queue.end(),
                [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); }))
    {
        assign_subexpressions<0, N>(boost::proto::as_child(expr));
        return *this;
    }

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device device = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            // Kernel name is derived from the expression's structure.
            std::ostringstream kernel_name;
            kernel_name << "multi_";
            vector_name_context name_ctx(kernel_name);
            boost::proto::eval(boost::proto::as_child(expr), name_ctx);

            std::ostringstream source;
            source << standard_kernel_header(device);

            extract_user_functions()(
                    boost::proto::as_child(expr),
                    declare_user_function(source)
                    );

            // Signature: size, one result pointer per component, then the
            // parameters collected from the expression terminals.
            source << "kernel void " << kernel_name.str()
                << "(\n\t" << type_name<size_t>() << " n";

            for(size_t i = 0; i < N; )
                source << ",\n\tglobal " << type_name<T>() << " *res_" << ++i;

            build_param_list<N>(boost::proto::as_child(expr), source);

            source << "\n)\n{\n"
                "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

            build_expr_list(boost::proto::as_child(expr), source);

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, kernel_name.str().c_str());
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        // Launch only for non-empty partitions on this device.
        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            // Argument order mirrors the generated signature:
            // n, res_1..res_N, then expression arguments.
            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

            set_kernel_args<N>(
                    boost::proto::as_child(expr),
                    kernel->second.kernel, d, pos, vec[0]->part_start(d)
                    );

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
/// apply kernel to get similarity metric. double operator()(const FeatVec<K>& x) const { return krn(x, m, sigma); }