/// Returns the "key_value_mapping" kernel for the given queue, generating and
/// compiling it on the first request and serving it from a function-local
/// cache afterwards.
///
/// Generated kernel arguments: the element count n, one input pointer
/// (ikeys<p>) and one output pointer (okeys<p>) per component of K, the value
/// output ovals, the offset array and the value input ivals.
///
/// Generated kernel body, for each idx visited by the loop that
/// grid_stride_loop() emits (idx is presumably declared by that loop):
///   - when idx is not the last element and offset[idx] differs from
///     offset[idx + 1], the keys and the value at idx are copied to slot
///     offset[idx];
///   - when idx is the last element, they are copied to slot offset[n - 1].
backend::kernel key_value_mapping(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    const auto key = backend::cache_key(queue);
    auto entry = cache.find(key);

    // Cache hit: nothing to generate.
    if (entry != cache.end()) return entry->second;

    // Number of key components; one okeys/ikeys pair is emitted per component.
    const int key_count = boost::mpl::size<K>::value;

    backend::source_generator gen(queue);

    // Kernel signature.
    gen.kernel("key_value_mapping").open("(").template parameter< size_t >("n");
    boost::mpl::for_each<K>(pointer_param<global_ptr, true>(gen, "ikeys"));
    boost::mpl::for_each<K>(pointer_param<global_ptr >(gen, "okeys"));
    gen.template parameter< global_ptr<V> >("ovals");
    gen.template parameter< global_ptr<int> >("offset");
    gen.template parameter< global_ptr<const V> >("ivals");
    gen.close(")");
    gen.open("{");

    // Kernel body.
    gen.new_line().grid_stride_loop().open("{");

    gen.new_line() << "int num_sections = offset[n - 1] + 1;";
    gen.new_line() << "int off = offset[idx];";

    // Last element of a section that is not the last element overall.
    gen.new_line() << "if (idx < (n - 1) && off != offset[idx + 1])";
    gen.open("{");
    for (int k = 0; k < key_count; ++k) {
        gen.new_line() << "okeys" << k << "[off] = ikeys" << k << "[idx];";
    }
    gen.new_line() << "ovals[off] = ivals[idx];";
    gen.close("}");

    // The very last element always closes the final section.
    gen.new_line() << "if (idx == (n - 1))";
    gen.open("{");
    for (int k = 0; k < key_count; ++k) {
        gen.new_line() << "okeys" << k << "[num_sections - 1] = ikeys" << k << "[idx];";
    }
    gen.new_line() << "ovals[num_sections - 1] = ivals[idx];";
    gen.close("}");

    gen.close("}"); // loop
    gen.close("}"); // kernel

    backend::kernel compiled(queue, gen.str(), "key_value_mapping");
    entry = cache.insert(std::make_pair(key, compiled)).first;
    return entry->second;
}
/// Returns the "block_sum_by_key" kernel for the given queue, generating and
/// compiling it on the first request and serving it from a function-local
/// cache afterwards.
///
/// Generated kernel arguments: n, key_sum (const int*), post_sum (const T*),
/// keys (const int*) and output (T*).
///
/// Generated kernel body: a work item with g_id < n compares its own key
/// (keys[g_id]) with the key recorded for the previous work group
/// (key_sum[block - 1]) and with the key of the next element. When the work
/// item is not in the first group, its key equals the previous group's key
/// and the next element has a different key (or there is no next element),
/// output[g_id] is replaced by oper(output[g_id], post_sum[block - 1]).
backend::kernel block_sum_by_key(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_sum_by_key")
            .open("(")
            .template parameter< size_t >("n")
            .template parameter< global_ptr<const int> >("key_sum")
            .template parameter< global_ptr<const T> >("post_sum")
            .template parameter< global_ptr<const int> >("keys")
            .template parameter< global_ptr<T> >("output")
            .close(")").open("{");

        src.new_line() << "size_t g_id = " << src.global_id(0) << ";";
        src.new_line() << "size_t block = " << src.group_id(0) << ";";

        src.new_line() << "if (g_id >= n) return;";

        // accumulate prefix
        // key1: key of the previous group, key3: key of the next element.
        // Both fall back to key2 - 1, a value guaranteed to differ from key2.
        src.new_line() << "int key2 = keys[ g_id ];";
        src.new_line() << "int key1 = (block > 0 ) ? key_sum[ block - 1 ] : key2 - 1;";
        src.new_line() << "int key3 = (g_id < n - 1) ? keys [ g_id + 1 ] : key2 - 1;";

        src.new_line() << "if (block > 0 && key1 == key2 && key2 != key3)";
        src.open("{");
        src.new_line() << type_name<T>() << " scan_result = output [ g_id ];";
        src.new_line() << type_name<T>() << " post_block_sum = post_sum[ block - 1 ];";
        src.new_line() << "output[ g_id ] = oper( scan_result, post_block_sum );";
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_sum_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
/// Returns the "offset_calculation" kernel for the given queue, generating
/// and compiling it on the first request and serving it from a function-local
/// cache afterwards.
///
/// Generated kernel arguments: the element count n, one const input pointer
/// (keys<p>) per component of T, and the int output array offsets.
///
/// Generated kernel body, for each idx visited by the loop that
/// grid_stride_loop() emits (idx is presumably declared by that loop):
/// offsets[0] is set to 0, and for idx > 0 offsets[idx] is set to
/// !comp(keys at idx - 1..., keys at idx...), i.e. 1 exactly when comp rejects
/// the pair of neighbouring keys.
backend::kernel offset_calculation(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    const auto key = backend::cache_key(queue);
    auto entry = cache.find(key);

    // Cache hit: nothing to generate.
    if (entry != cache.end()) return entry->second;

    // Number of key components; comp receives 2 * key_count arguments.
    const int key_count = boost::mpl::size<T>::value;

    backend::source_generator gen(queue);

    Comp::define(gen, "comp");

    // Kernel signature.
    gen.kernel("offset_calculation").open("(").template parameter< size_t >("n");
    boost::mpl::for_each<T>(pointer_param<global_ptr, true>(gen, "keys"));
    gen.template parameter< global_ptr<int> >("offsets");
    gen.close(")");
    gen.open("{");

    // Kernel body.
    gen.new_line().grid_stride_loop().open("{");

    gen.new_line() << "if (idx > 0)" << " offsets[idx] = !comp(";

    // First the components of the previous key, comma separated ...
    for (int k = 0; k < key_count; ++k) {
        gen << (k ? ", " : "") << "keys" << k << "[idx - 1]";
    }
    // ... then the components of the current key.
    for (int k = 0; k < key_count; ++k) {
        gen << ", keys" << k << "[idx]";
    }
    gen << ");";

    gen.new_line() << "else offsets[idx] = 0;";

    gen.close("}"); // loop
    gen.close("}"); // kernel

    backend::kernel compiled(queue, gen.str(), "offset_calculation");
    entry = cache.insert(std::make_pair(key, compiled)).first;
    return entry->second;
}
/// Returns the "block_inclusive_scan_by_key" kernel for the given queue,
/// generating and compiling it on the first request and serving it from a
/// function-local cache afterwards.
///
/// Generated kernel arguments: n, key_sum (const int*), pre_sum (const T*),
/// post_sum (T*) and work_per_thread.
///
/// Generated kernel body, with map_id = g_id * work_per_thread:
///   1. Serial pass: each work item with map_id < n walks its
///      work_per_thread consecutive elements, restarting the running value
///      whenever the key changes, and stores the running value in post_sum.
///   2. The last running value and last key of every work item are stored in
///      the `shared` struct (arrays of NT elements, declared through
///      smem_static_var) and scanned with doubling offsets, combining only
///      entries whose keys are equal.
///   3. Final pass: every work item with l_id > 0 combines the scanned value
///      of the previous work item into those of its elements whose key equals
///      the previous work item's last key.
backend::kernel block_inclusive_scan_by_key(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_inclusive_scan_by_key")
            .open("(")
            .template parameter< size_t >("n")
            .template parameter< global_ptr<const int> >("key_sum")
            .template parameter< global_ptr<const T> >("pre_sum")
            .template parameter< global_ptr<T> >("post_sum")
            .template parameter< cl_uint >("work_per_thread")
            .close(")").open("{");

        src.new_line() << "size_t l_id = " << src.local_id(0) << ";";
        src.new_line() << "size_t g_id = " << src.global_id(0) << ";";
        src.new_line() << "size_t wgsz = " << src.local_size(0) << ";";
        src.new_line() << "size_t map_id = g_id * work_per_thread;";

        src.new_line() << "struct Shared";
        src.open("{");
        src.new_line() << "int keys[" << NT << "];";
        src.new_line() << type_name<T>() << " vals[" << NT << "];";
        src.close("};");

        src.smem_static_var("struct Shared", "shared");

        src.new_line() << "uint offset;";
        src.new_line() << "int key;";
        src.new_line() << type_name<T>() << " work_sum;";

        src.new_line() << "if (map_id < n)";
        src.open("{");
        src.new_line() << "int prev_key;";

        // accumulate zeroth value manually
        src.new_line() << "offset = 0;";
        src.new_line() << "key = key_sum[map_id];";
        src.new_line() << "work_sum = pre_sum[map_id];";
        src.new_line() << "post_sum[map_id] = work_sum;";

        // Serial accumulation
        src.new_line() << "for( offset = offset + 1; offset < work_per_thread; ++offset )";
        src.open("{");
        // key_sum is indexed only after the index has been checked against n,
        // so the serial pass never reads past the n valid keys. For a work
        // item whose range extends past n, `key` keeps the last valid key.
        src.new_line() << "if ( map_id + offset < n )";
        src.open("{");
        src.new_line() << "prev_key = key;";
        src.new_line() << "key = key_sum[ map_id + offset ];";
        src.new_line() << type_name<T>() << " y = pre_sum[ map_id + offset ];";
        src.new_line() << "if ( key == prev_key ) work_sum = oper( work_sum, y );";
        src.new_line() << "else work_sum = y;";
        src.new_line() << "post_sum[ map_id + offset ] = work_sum;";
        src.close("}");
        src.close("}");
        src.close("}");

        src.new_line().barrier();

        // load LDS with register sums
        src.new_line() << "shared.vals[ l_id ] = work_sum;";
        src.new_line() << "shared.keys[ l_id ] = key;";

        // scan in lds
        src.new_line() << type_name<T>() << " scan_sum = work_sum;";
        src.new_line() << "for( offset = 1; offset < wgsz; offset *= 2 )";
        src.open("{");
        src.new_line().barrier();
        src.new_line() << "if (map_id < n)";
        src.open("{");
        src.new_line() << "if (l_id >= offset)";
        src.open("{");
        src.new_line() << "int key1 = shared.keys[ l_id ];";
        src.new_line() << "int key2 = shared.keys[ l_id - offset ];";
        src.new_line() << "if ( key1 == key2 ) scan_sum = oper( scan_sum, shared.vals[ l_id - offset ] );";
        src.new_line() << "else scan_sum = shared.vals[ l_id ];";
        src.close("}");
        src.close("}");
        // Every work item reaches both barriers; only the update is guarded.
        src.new_line().barrier();
        src.new_line() << "shared.vals[ l_id ] = scan_sum;";
        src.close("}");

        src.new_line().barrier();

        // write final scan from pre-scan and lds scan
        // NOTE(review): this pass indexes key_sum/post_sum at map_id + offset
        // without comparing against n; presumably the caller sizes both
        // buffers to cover work_per_thread elements per work item - confirm.
        src.new_line() << "for( offset = 0; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line().barrier(true);
        src.new_line() << "if (map_id < n && l_id > 0)";
        src.open("{");
        src.new_line() << type_name<T>() << " y = post_sum[ map_id + offset ];";
        src.new_line() << "int key1 = key_sum [ map_id + offset ];";
        src.new_line() << "int key2 = shared.keys[ l_id - 1 ];";
        src.new_line() << "if ( key1 == key2 ) y = oper( y, shared.vals[l_id - 1] );";
        src.new_line() << "post_sum[ map_id + offset ] = y;";
        src.close("}");
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_inclusive_scan_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
/// Returns the "block_scan_by_key" kernel for the given queue, generating and
/// compiling it on the first request and serving it from a function-local
/// cache afterwards.
///
/// Generated kernel arguments: n, keys (const int*), vals (const T*),
/// output (T*), key_buf (int*) and val_buf (T*).
///
/// Generated kernel body:
///   1. Every work item with g_id < n copies its key and value into the
///      `shared` struct (arrays of NT elements, declared through
///      smem_static_var).
///   2. A scan with doubling offsets runs over the work group; a work item
///      only combines the entry at l_id - offset when that entry has the same
///      key. Only shared.vals is updated, shared.keys is left untouched.
///   3. A work item writes its result to output[g_id] only when the next
///      element has a different key (-1 stands in for "no next element").
///   4. Work item 0 of each group stores the last key/value slot of `shared`
///      (index wgsz - 1) into key_buf[block] / val_buf[block].
backend::kernel block_scan_by_key(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_scan_by_key")
            .open("(")
            .template parameter< size_t >("n")
            .template parameter< global_ptr<const int> >("keys")
            .template parameter< global_ptr<const T> >("vals")
            .template parameter< global_ptr<T> >("output")
            .template parameter< global_ptr<int> >("key_buf")
            .template parameter< global_ptr<T> >("val_buf")
            .close(")").open("{");

        src.new_line() << "size_t l_id = " << src.local_id(0) << ";";
        src.new_line() << "size_t g_id = " << src.global_id(0) << ";";
        src.new_line() << "size_t block = " << src.group_id(0) << ";";
        src.new_line() << "size_t wgsz = " << src.local_size(0) << ";";

        src.new_line() << "struct Shared";
        src.open("{");
        src.new_line() << "int keys[" << NT << "];";
        src.new_line() << type_name<T>() << " vals[" << NT << "];";
        src.close("};");

        src.smem_static_var("struct Shared", "shared");

        src.new_line() << "int key;";
        src.new_line() << type_name<T>() << " val;";

        src.new_line() << "if (g_id < n)";
        src.open("{");
        src.new_line() << "key = keys[g_id];";
        src.new_line() << "val = vals[g_id];";
        src.new_line() << "shared.keys[l_id] = key;";
        src.new_line() << "shared.vals[l_id] = val;";
        src.close("}");

        // Computes a scan within a workgroup updates vals in lds but not keys
        src.new_line() << type_name<T>() << " sum = val;";
        src.new_line() << "for(size_t offset = 1; offset < wgsz; offset *= 2)";
        src.open("{");
        src.new_line().barrier();
        src.new_line() << "if (l_id >= offset && shared.keys[l_id - offset] == key)";
        src.open("{");
        src.new_line() << "sum = oper(sum, shared.vals[l_id - offset]);";
        src.close("}");
        src.new_line().barrier();
        src.new_line() << "shared.vals[l_id] = sum;";
        src.close("}");

        src.new_line().barrier();

        // Out-of-range work items leave only after the last barrier, so every
        // work item of the group takes part in all barriers above.
        src.new_line() << "if (g_id >= n) return;";

        // Each work item writes out its calculated scan result, relative to the
        // beginning of each work group
        src.new_line() << "int key2 = -1;";
        src.new_line() << "if (g_id < n - 1) key2 = keys[g_id + 1];";
        src.new_line() << "if (key != key2) output[g_id] = sum;";

        src.new_line() << "if (l_id == 0)";
        src.open("{");
        src.new_line() << "key_buf[block] = shared.keys[wgsz - 1];";
        src.new_line() << "val_buf[block] = shared.vals[wgsz - 1];";
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_scan_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
/// Returns the "block_inclusive_scan" kernel for the given queue, generating
/// and compiling it on the first request and serving it from a function-local
/// cache afterwards.
///
/// Generated kernel arguments: n, input (const T*), identity (T),
/// scan_buf1 (T*), scan_buf2 (T*) and the int flag exclusive.
///
/// Generated kernel body:
///   1. Each work item loads up to two elements of its 2 * NT wide block
///      (at l_id and l_id + NT) into the `shared` array of 2 * NT elements
///      declared through smem_static_var; elements at or beyond n are not
///      loaded.
///   2. When exclusive is set, the work item with g_id == 0 replaces its
///      first slot with oper(identity, input[0]).
///   3. A reduction loop (start = NT, NT/2, ..., 1 with offset doubling each
///      step) combines pairs of slots in place.
///   4. Work item 0 stores shared[2 * NT - 1] into scan_buf1[block] and
///      shared[NT - 1] into scan_buf2[block].
backend::kernel block_inclusive_scan(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto kernel = cache.find(queue);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("block_inclusive_scan")
            .open("(")
            .template parameter< size_t >("n")
            .template parameter< global_ptr<const T> >("input")
            .template parameter< T >("identity")
            .template parameter< global_ptr<T> >("scan_buf1")
            .template parameter< global_ptr<T> >("scan_buf2")
            .template parameter< int >("exclusive")
            .close(")").open("{");

        src.new_line() << "size_t l_id = " << src.local_id(0) << ";";
        src.new_line() << "size_t g_id = " << src.global_id(0) << ";";
        src.new_line() << "size_t block = " << src.group_id(0) << ";";
        src.new_line() << "size_t offset = 1;";

        {
            // Declarator "shared[2 * NT]" for the static shared variable.
            std::ostringstream shared;
            shared << "shared[" << 2 * NT << "]";
            src.smem_static_var(type_name<T>(), shared.str());
        }

        // load input into shared memory
        src.new_line()
            << "if(block * " << 2 * NT << " + l_id < n)"
            << " shared[l_id] = input[block * " << 2 * NT << " + l_id];";

        src.new_line()
            << "if(block * " << 2 * NT << " + l_id + " << NT << " < n)"
            << " shared[l_id + " << NT << "] ="
            << " input[block * " << 2 * NT << " + l_id + " << NT << "];";

        // Exclusive case
        src.new_line()
            << "if(exclusive && g_id == 0)"
            << " shared[l_id] = oper(identity, input[0]);";

        src.new_line() << "for (size_t start = " << NT << "; start > 0; start >>= 1, offset *= 2)";
        src.open("{");
        src.new_line().barrier();
        src.new_line() << "if (l_id < start)";
        src.open("{");
        src.new_line() << "size_t temp1 = offset * (2 * l_id + 1) - 1;";
        src.new_line() << "size_t temp2 = offset * (2 * l_id + 2) - 1;";
        src.new_line() << type_name<T>() << " y2 = shared[temp2];";
        src.new_line() << type_name<T>() << " y1 = shared[temp1];";
        src.new_line() << "shared[temp2] = oper(y2, y1);";
        src.close("}");
        src.close("}");

        src.new_line().barrier();

        src.new_line() << "if (l_id == 0)";
        src.open("{");
        src.new_line() << "scan_buf1[ block ] = shared[" << NT * 2 - 1 << "];";
        src.new_line() << "scan_buf2[ block ] = shared[" << NT - 1 << "];";
        src.close("}");

        src.close("}");

        kernel = cache.insert(queue, backend::kernel(
                    queue, src.str(), "block_inclusive_scan"));
    }

    return kernel->second;
}
backend::kernel block_addition( const backend::command_queue &queue) { static detail::kernel_cache cache; auto kernel = cache.find(queue); if (kernel == cache.end()) { backend::source_generator src(queue); Oper::define(src, "oper"); src.kernel("block_addition") .open("(") .template parameter< size_t >("n") .template parameter< global_ptr<const T> >("input") .template parameter< global_ptr<T> >("output") .template parameter< global_ptr<T> >("post_sum") .template parameter< global_ptr<T> >("pre_sum") .template parameter< T >("identity") .template parameter< int >("exclusive") .close(")").open("{"); src.new_line() << "size_t l_id = " << src.local_id(0) << ";"; src.new_line() << "size_t g_id = " << src.global_id(0) << ";"; src.new_line() << "size_t block = " << src.group_id(0) << ";"; src.new_line() << type_name<T>() << " val;"; { std::ostringstream shared; shared << "shared[" << NT << "]"; src.smem_static_var(type_name<T>(), shared.str()); } src.new_line() << "if (g_id < n)"; src.open("{"); src.new_line() << "if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity;"; src.new_line() << "else val = input[g_id];"; src.close("}"); src.new_line() << "shared[l_id] = val;"; src.new_line() << type_name<T>() << " scan_result = val;"; src.new_line() << type_name<T>() << " post_block_sum, new_result;"; src.new_line() << type_name<T>() << " y1, y2, sum;"; src.new_line() << "if(l_id == 0 && g_id < n)"; src.open("{"); src.new_line() << "if(block > 0)"; src.open("{"); src.new_line() << "if(block % 2 == 0) post_block_sum = post_sum[ block/2 - 1 ];"; src.new_line() << "else if(block == 1) post_block_sum = pre_sum[0];"; src.new_line() << "else"; src.open("{"); src.new_line() << "y1 = post_sum[ block/2 - 1 ];"; src.new_line() << "y2 = pre_sum [ block/2];"; src.new_line() << "post_block_sum = oper(y1, y2);"; src.close("}"); src.new_line() << "new_result = exclusive ? 
post_block_sum : oper( scan_result, post_block_sum );"; src.close("}"); src.new_line() << "else new_result = scan_result;"; src.new_line() << "shared[ l_id ] = new_result;"; src.close("}"); // Computes a scan within a workgroup src.new_line() << "sum = shared[ l_id ];"; src.new_line() << "for( size_t offset = 1; offset < " << NT << "; offset *= 2 )"; src.open("{"); src.new_line().barrier(); src.new_line() << "if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] );"; src.new_line().barrier(); src.new_line() << "shared[ l_id ] = sum;"; src.close("}"); src.new_line().barrier(); src.new_line() << "if(g_id < n) output[ g_id ] = sum;"; src.close("}"); kernel = cache.insert(queue, backend::kernel( queue, src.str(), "block_addition")); } return kernel->second; }
/// Returns the "intra_block_inclusive_scan" kernel for the given queue,
/// generating and compiling it on the first request and serving it from a
/// function-local cache afterwards.
///
/// Generated kernel arguments: n, post_sum (T*), pre_sum (const T*),
/// identity (T) and work_per_thread. The generated body never reads
/// identity.
///
/// Generated kernel body, with map_id = g_id * work_per_thread:
///   1. Serial pass: each work item with map_id < n folds its
///      work_per_thread consecutive pre_sum elements (those below n) into
///      work_sum.
///   2. The per-work-item totals are stored in the `shared` array of NT
///      elements (declared through smem_static_var) and scanned with doubling
///      offsets.
///   3. Final pass: each work item re-reads pre_sum[map_id], combines it with
///      the scanned total of the previous work item (if any) and then writes
///      the running result for each of its elements to post_sum.
///
/// NOTE(review): the final pass reads pre_sum[map_id] and writes
/// post_sum[map_id + offset] for every work item, including those with
/// map_id >= n; presumably the caller sizes both buffers to cover
/// work_per_thread elements per work item - confirm.
backend::kernel intra_block_inclusive_scan(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto kernel = cache.find(queue);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        Oper::define(src, "oper");

        src.kernel("intra_block_inclusive_scan")
            .open("(")
            .template parameter< size_t >("n")
            .template parameter< global_ptr<T> >("post_sum")
            .template parameter< global_ptr<const T> >("pre_sum")
            .template parameter< T >("identity")
            .template parameter< uint >("work_per_thread")
            .close(")").open("{");

        src.new_line() << "size_t l_id = " << src.local_id(0) << ";";
        src.new_line() << "size_t g_id = " << src.global_id(0) << ";";
        src.new_line() << "size_t map_id = g_id * work_per_thread;";

        {
            // Declarator "shared[NT]" for the static shared variable.
            std::ostringstream shared;
            shared << "shared[" << NT << "]";
            src.smem_static_var(type_name<T>(), shared.str());
        }

        src.new_line() << "size_t offset;";
        src.new_line() << type_name<T>() << " work_sum;";

        src.new_line() << "if (map_id < n)";
        src.open("{");

        // accumulate zeroth value manually
        src.new_line() << "offset = 0;";
        src.new_line() << "work_sum = pre_sum[map_id];";

        // Serial accumulation
        src.new_line() << "for( offset = 1; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line()
            << "if (map_id + offset < n)"
            << " work_sum = oper( work_sum, pre_sum[map_id + offset] );";
        src.close("}");
        src.close("}");

        src.new_line().barrier();

        src.new_line() << type_name<T>() << " scan_sum = work_sum;";
        src.new_line() << "shared[ l_id ] = work_sum;";

        // scan in shared
        src.new_line() << "for( offset = 1; offset < " << NT << "; offset *= 2 )";
        src.open("{");
        src.new_line().barrier();
        src.new_line()
            << "if (map_id < n && l_id >= offset)"
            << " scan_sum = oper( scan_sum, shared[ l_id - offset ] );";
        src.new_line().barrier();
        src.new_line() << "shared[ l_id ] = scan_sum;";
        src.close("}");

        src.new_line().barrier();

        // write final scan from pre-scan and shared scan
        src.new_line() << "work_sum = pre_sum[map_id];";
        src.new_line() << "if (l_id > 0)";
        src.open("{");
        src.new_line() << " work_sum = oper(work_sum, shared[l_id - 1]);";
        src.new_line() << " post_sum[map_id] = work_sum;";
        src.close("}");
        src.new_line() << "else post_sum[map_id] = work_sum;";

        src.new_line() << "for( offset = 1; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line().barrier();
        src.new_line() << "if (map_id < n && l_id > 0)";
        src.open("{");
        src.new_line() << type_name<T>() << " y = oper(pre_sum[map_id + offset], work_sum);";
        src.new_line() << "post_sum[ map_id + offset ] = y;";
        src.new_line() << "work_sum = y;";
        src.close("}");
        src.new_line() << "else";
        src.open("{");
        src.new_line() << "post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum);";
        src.new_line() << "work_sum = post_sum[map_id + offset];";
        src.close("}");
        src.close("}");

        src.close("}");

        kernel = cache.insert(queue, backend::kernel(
                    queue, src.str(), "intra_block_inclusive_scan"));
    }

    return kernel->second;
}