int reduce_by_key_sink( IKTuple &&ikeys, vector<V> const &ivals, OKTuple &&okeys, vector<V> &ovals, Comp, Oper ) { namespace fusion = boost::fusion; typedef typename extract_value_types<IKTuple>::type K; static_assert( std::is_same<K, typename extract_value_types<OKTuple>::type>::value, "Incompatible input and output key types"); precondition( fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1, "reduce_by_key is only supported for single device contexts" ); precondition(fusion::at_c<0>(ikeys).size() == ivals.size(), "keys and values should have same size" ); const auto &queue = fusion::at_c<0>(ikeys).queue_list(); backend::select_context(queue[0]); const int NT_cpu = 1; const int NT_gpu = 256; const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu; size_t count = fusion::at_c<0>(ikeys).size(); size_t num_blocks = (count + NT - 1) / NT; size_t scan_buf_size = alignup(num_blocks, NT); backend::device_vector<int> key_sum (queue[0], scan_buf_size); backend::device_vector<V> pre_sum (queue[0], scan_buf_size); backend::device_vector<V> post_sum (queue[0], scan_buf_size); backend::device_vector<V> offset_val(queue[0], count); backend::device_vector<int> offset (queue[0], count); /***** Kernel 0 *****/ auto krn0 = offset_calculation<K, Comp>(queue[0]); krn0.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn0)); krn0.push_arg(offset); krn0(queue[0]); VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
int reduce_by_key_sink( IKTuple &&ikeys, vector<V> const &ivals, OKTuple &&okeys, vector<V> &ovals, Comp, Oper ) { namespace fusion = boost::fusion; typedef typename extract_value_types<IKTuple>::type K; static_assert( std::is_same<K, typename extract_value_types<OKTuple>::type>::value, "Incompatible input and output key types"); precondition( fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1, "Sorting is only supported for single device contexts" ); precondition(fusion::at_c<0>(ikeys).size() == ivals.size(), "keys and values should have same size" ); const auto &queue = fusion::at_c<0>(ikeys).queue_list(); backend::select_context(queue[0]); const int NT_cpu = 1; const int NT_gpu = 256; const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu; size_t count = fusion::at_c<0>(ikeys).size(); size_t num_blocks = (count + NT - 1) / NT; size_t scan_buf_size = alignup(num_blocks, NT); backend::device_vector<int> key_sum (queue[0], scan_buf_size); backend::device_vector<V> pre_sum (queue[0], scan_buf_size); backend::device_vector<V> post_sum (queue[0], scan_buf_size); backend::device_vector<V> offset_val(queue[0], count); backend::device_vector<int> offset (queue[0], count); /***** Kernel 0 *****/ auto krn0 = detail::offset_calculation<K, Comp>(queue[0]); krn0.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn0)); krn0.push_arg(offset); krn0(queue[0]); VEX_FUNCTION(plus, int(int, int), "return prm1 + prm2;"); detail::scan(queue[0], offset, offset, 0, false, plus); /***** Kernel 1 *****/ auto krn1 = is_cpu(queue[0]) ? detail::block_scan_by_key<NT_cpu, V, Oper>(queue[0]) : detail::block_scan_by_key<NT_gpu, V, Oper>(queue[0]); krn1.push_arg(count); krn1.push_arg(offset); krn1.push_arg(ivals(0)); krn1.push_arg(offset_val); krn1.push_arg(key_sum); krn1.push_arg(pre_sum); krn1.config(num_blocks, NT); krn1(queue[0]); /***** Kernel 2 *****/ uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT)); auto krn2 = is_cpu(queue[0]) ? detail::block_inclusive_scan_by_key<NT_cpu, V, Oper>(queue[0]) : detail::block_inclusive_scan_by_key<NT_gpu, V, Oper>(queue[0]); krn2.push_arg(num_blocks); krn2.push_arg(key_sum); krn2.push_arg(pre_sum); krn2.push_arg(post_sum); krn2.push_arg(work_per_thread); krn2.config(1, NT); krn2(queue[0]); /***** Kernel 3 *****/ auto krn3 = detail::block_sum_by_key<V, Oper>(queue[0]); krn3.push_arg(count); krn3.push_arg(key_sum); krn3.push_arg(post_sum); krn3.push_arg(offset); krn3.push_arg(offset_val); krn3.config(num_blocks, NT); krn3(queue[0]); /***** resize okeys and ovals *****/ int out_elements; offset.read(queue[0], count - 1, 1, &out_elements, true); ++out_elements; boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements)); ovals.resize(ivals.queue_list(), out_elements); /***** Kernel 4 *****/ auto krn4 = detail::key_value_mapping<K, V>(queue[0]); krn4.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn4)); boost::fusion::for_each(okeys, do_push_arg(krn4)); krn4.push_arg(ovals(0)); krn4.push_arg(offset); krn4.push_arg(offset_val); krn4(queue[0]); return out_elements; }