thrust::pair<Iterator3,Iterator4> reduce_by_key(thrust::tbb::dispatchable<System> &system, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 keys_result, Iterator4 values_result, BinaryPredicate binary_pred, BinaryFunction binary_op) { typedef typename thrust::iterator_difference<Iterator1>::type difference_type; difference_type n = keys_last - keys_first; if(n == 0) return std::make_pair(keys_result, values_result); // XXX this value is a tuning opportunity const difference_type parallelism_threshold = 10000; if(n < parallelism_threshold) { // don't bother parallelizing for small n thrust::cpp::tag seq; return thrust::reduce_by_key(seq, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op); } // count the number of processors const unsigned int p = std::max(1u, ::tbb::tbb_thread::hardware_concurrency()); // generate O(P) intervals of sequential work // XXX oversubscribing is a tuning opportunity const unsigned int subscription_rate = 1; difference_type interval_size = thrust::min<difference_type>(parallelism_threshold, thrust::max<difference_type>(n, n / (subscription_rate * p))); difference_type num_intervals = divide_ri(n, interval_size); // decompose the input into intervals of size N / num_intervals // add one extra element to this vector to store the size of the entire result thrust::detail::temporary_array<difference_type, System> interval_output_offsets(0, system, num_intervals + 1); // first count the number of tail flags in each interval ::tail_flags<Iterator1,BinaryPredicate> tail_flags = make_tail_flags(keys_first, keys_last, binary_pred); reduce_intervals(system, tail_flags.begin(), tail_flags.end(), interval_size, interval_output_offsets.begin() + 1, thrust::plus<size_t>()); interval_output_offsets[0] = 0; // scan the counts to get each body's output offset thrust::cpp::tag seq; thrust::inclusive_scan(seq, interval_output_offsets.begin() + 1, interval_output_offsets.end(), interval_output_offsets.begin() + 1); // do a reduce_by_key serially in each thread // the final interval never has a carry by definition, so don't reserve space for it thrust::detail::temporary_array<typename partial_sum_type<Iterator2,BinaryFunction>::type, System> carries(0, system, num_intervals - 1); // force grainsize == 1 with simple_partioner() ::tbb::parallel_for(::tbb::blocked_range<difference_type>(0, num_intervals, 1), make_serial_reduce_by_key_body(keys_first, values_first, interval_output_offsets.begin(), keys_result, values_result, carries.begin(), n, interval_size, num_intervals, binary_pred, binary_op), ::tbb::simple_partitioner()); difference_type size_of_result = interval_output_offsets[num_intervals]; // sequentially accumulate the carries // note that the last interval does not have a carry // XXX find a way to express this loop via a sequential algorithm, perhaps reduce_by_key typedef typename std::vector<typename partial_sum_type<Iterator2,BinaryFunction>::type>::size_type size_type; for(size_type i = 0; i < carries.size(); ++i) { // if our interval has a carry, then we need to sum the carry to the next interval's output offset // if it does not have a carry, then we need to ignore carry_value[i] if(interval_has_carry(i, interval_size, num_intervals, tail_flags.begin())) { difference_type output_idx = interval_output_offsets[i+1]; values_result[output_idx] = binary_op(values_result[output_idx], carries[i]); } } return thrust::make_pair(keys_result + size_of_result, values_result + size_of_result); }
inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); }