int reduce_by_key_sink( IKTuple &&ikeys, vector<V> const &ivals, OKTuple &&okeys, vector<V> &ovals, Comp, Oper ) { namespace fusion = boost::fusion; typedef typename extract_value_types<IKTuple>::type K; static_assert( std::is_same<K, typename extract_value_types<OKTuple>::type>::value, "Incompatible input and output key types"); precondition( fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1, "reduce_by_key is only supported for single device contexts" ); precondition(fusion::at_c<0>(ikeys).size() == ivals.size(), "keys and values should have same size" ); const auto &queue = fusion::at_c<0>(ikeys).queue_list(); backend::select_context(queue[0]); const int NT_cpu = 1; const int NT_gpu = 256; const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu; size_t count = fusion::at_c<0>(ikeys).size(); size_t num_blocks = (count + NT - 1) / NT; size_t scan_buf_size = alignup(num_blocks, NT); backend::device_vector<int> key_sum (queue[0], scan_buf_size); backend::device_vector<V> pre_sum (queue[0], scan_buf_size); backend::device_vector<V> post_sum (queue[0], scan_buf_size); backend::device_vector<V> offset_val(queue[0], count); backend::device_vector<int> offset (queue[0], count); /***** Kernel 0 *****/ auto krn0 = offset_calculation<K, Comp>(queue[0]); krn0.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn0)); krn0.push_arg(offset); krn0(queue[0]); VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
int reduce_by_key_sink( IKTuple &&ikeys, vector<V> const &ivals, OKTuple &&okeys, vector<V> &ovals, Comp, Oper ) { namespace fusion = boost::fusion; typedef typename extract_value_types<IKTuple>::type K; static_assert( std::is_same<K, typename extract_value_types<OKTuple>::type>::value, "Incompatible input and output key types"); precondition( fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1, "Sorting is only supported for single device contexts" ); precondition(fusion::at_c<0>(ikeys).size() == ivals.size(), "keys and values should have same size" ); const auto &queue = fusion::at_c<0>(ikeys).queue_list(); backend::select_context(queue[0]); const int NT_cpu = 1; const int NT_gpu = 256; const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu; size_t count = fusion::at_c<0>(ikeys).size(); size_t num_blocks = (count + NT - 1) / NT; size_t scan_buf_size = alignup(num_blocks, NT); backend::device_vector<int> key_sum (queue[0], scan_buf_size); backend::device_vector<V> pre_sum (queue[0], scan_buf_size); backend::device_vector<V> post_sum (queue[0], scan_buf_size); backend::device_vector<V> offset_val(queue[0], count); backend::device_vector<int> offset (queue[0], count); /***** Kernel 0 *****/ auto krn0 = detail::offset_calculation<K, Comp>(queue[0]); krn0.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn0)); krn0.push_arg(offset); krn0(queue[0]); VEX_FUNCTION(plus, int(int, int), "return prm1 + prm2;"); detail::scan(queue[0], offset, offset, 0, false, plus); /***** Kernel 1 *****/ auto krn1 = is_cpu(queue[0]) ? detail::block_scan_by_key<NT_cpu, V, Oper>(queue[0]) : detail::block_scan_by_key<NT_gpu, V, Oper>(queue[0]); krn1.push_arg(count); krn1.push_arg(offset); krn1.push_arg(ivals(0)); krn1.push_arg(offset_val); krn1.push_arg(key_sum); krn1.push_arg(pre_sum); krn1.config(num_blocks, NT); krn1(queue[0]); /***** Kernel 2 *****/ uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT)); auto krn2 = is_cpu(queue[0]) ? detail::block_inclusive_scan_by_key<NT_cpu, V, Oper>(queue[0]) : detail::block_inclusive_scan_by_key<NT_gpu, V, Oper>(queue[0]); krn2.push_arg(num_blocks); krn2.push_arg(key_sum); krn2.push_arg(pre_sum); krn2.push_arg(post_sum); krn2.push_arg(work_per_thread); krn2.config(1, NT); krn2(queue[0]); /***** Kernel 3 *****/ auto krn3 = detail::block_sum_by_key<V, Oper>(queue[0]); krn3.push_arg(count); krn3.push_arg(key_sum); krn3.push_arg(post_sum); krn3.push_arg(offset); krn3.push_arg(offset_val); krn3.config(num_blocks, NT); krn3(queue[0]); /***** resize okeys and ovals *****/ int out_elements; offset.read(queue[0], count - 1, 1, &out_elements, true); ++out_elements; boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements)); ovals.resize(ivals.queue_list(), out_elements); /***** Kernel 4 *****/ auto krn4 = detail::key_value_mapping<K, V>(queue[0]); krn4.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn4)); boost::fusion::for_each(okeys, do_push_arg(krn4)); krn4.push_arg(ovals(0)); krn4.push_arg(offset); krn4.push_arg(offset_val); krn4(queue[0]); return out_elements; }
void scan( backend::command_queue const &queue, backend::device_vector<T> const &input, backend::device_vector<T> &output, T init, bool exclusive, Oper ) { precondition( input.size() == output.size(), "Wrong output size in inclusive_scan" ); backend::select_context(queue); const int NT_cpu = 1; const int NT_gpu = 256; const int NT = is_cpu(queue) ? NT_cpu : NT_gpu; const int NT2 = 2 * NT; int do_exclusive = exclusive ? 1 : 0; const size_t count = input.size(); const size_t num_blocks = (count + NT2 - 1) / NT2; const size_t scan_buf_size = alignup(num_blocks, NT2); backend::device_vector<T> pre_sum1(queue, scan_buf_size); backend::device_vector<T> pre_sum2(queue, scan_buf_size); backend::device_vector<T> post_sum(queue, scan_buf_size); // Kernel0 auto krn0 = is_cpu(queue) ? block_inclusive_scan<NT_cpu, T, Oper>(queue) : block_inclusive_scan<NT_gpu, T, Oper>(queue); krn0.push_arg(count); krn0.push_arg(input); krn0.push_arg(init); krn0.push_arg(pre_sum1); krn0.push_arg(pre_sum2); krn0.push_arg(do_exclusive); krn0.config(num_blocks, NT); krn0(queue); // Kernel1 auto krn1 = is_cpu(queue) ? intra_block_inclusive_scan<NT_cpu, T, Oper>(queue) : intra_block_inclusive_scan<NT_gpu, T, Oper>(queue); uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT)); krn1.push_arg(num_blocks); krn1.push_arg(post_sum); krn1.push_arg(pre_sum1); krn1.push_arg(init); krn1.push_arg(work_per_thread); krn1.config(1, NT); krn1(queue); // Kernel2 auto krn2 = is_cpu(queue) ? block_addition<NT_cpu, T, Oper>(queue) : block_addition<NT_gpu, T, Oper>(queue); krn2.push_arg(count); krn2.push_arg(input); krn2.push_arg(output); krn2.push_arg(post_sum); krn2.push_arg(pre_sum2); krn2.push_arg(init); krn2.push_arg(do_exclusive); krn2.config(num_blocks * 2, NT); krn2(queue); }