Ejemplo n.º 1
0
  thrust::pair<Iterator3,Iterator4>
    reduce_by_key(thrust::tbb::dispatchable<System> &system,
                  Iterator1 keys_first, Iterator1 keys_last, 
                  Iterator2 values_first,
                  Iterator3 keys_result,
                  Iterator4 values_result,
                  BinaryPredicate binary_pred,
                  BinaryFunction binary_op)
{

  typedef typename thrust::iterator_difference<Iterator1>::type difference_type;
  difference_type n = keys_last - keys_first;
  if(n == 0) return std::make_pair(keys_result, values_result);

  // XXX this value is a tuning opportunity
  const difference_type parallelism_threshold = 10000;

  if(n < parallelism_threshold)
  {
    // don't bother parallelizing for small n
    thrust::cpp::tag seq;
    return thrust::reduce_by_key(seq, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op);
  }

  // count the number of processors
  const unsigned int p = std::max(1u, ::tbb::tbb_thread::hardware_concurrency());

  // generate O(P) intervals of sequential work
  // XXX oversubscribing is a tuning opportunity
  const unsigned int subscription_rate = 1;
  difference_type interval_size = thrust::min<difference_type>(parallelism_threshold, thrust::max<difference_type>(n, n / (subscription_rate * p)));
  difference_type num_intervals = divide_ri(n, interval_size);

  // decompose the input into intervals of size N / num_intervals
  // add one extra element to this vector to store the size of the entire result
  thrust::detail::temporary_array<difference_type, System> interval_output_offsets(0, system, num_intervals + 1);

  // first count the number of tail flags in each interval
  ::tail_flags<Iterator1,BinaryPredicate> tail_flags = make_tail_flags(keys_first, keys_last, binary_pred);
  reduce_intervals(system, tail_flags.begin(), tail_flags.end(), interval_size, interval_output_offsets.begin() + 1, thrust::plus<size_t>());
  interval_output_offsets[0] = 0;

  // scan the counts to get each body's output offset
  thrust::cpp::tag seq;
  thrust::inclusive_scan(seq,
                         interval_output_offsets.begin() + 1, interval_output_offsets.end(), 
                         interval_output_offsets.begin() + 1);

  // do a reduce_by_key serially in each thread
  // the final interval never has a carry by definition, so don't reserve space for it
  thrust::detail::temporary_array<typename partial_sum_type<Iterator2,BinaryFunction>::type, System> carries(0, system, num_intervals - 1);

  // force grainsize == 1 with simple_partioner()
  ::tbb::parallel_for(::tbb::blocked_range<difference_type>(0, num_intervals, 1),
    make_serial_reduce_by_key_body(keys_first, values_first, interval_output_offsets.begin(), keys_result, values_result, carries.begin(), n, interval_size, num_intervals, binary_pred, binary_op),
    ::tbb::simple_partitioner());

  difference_type size_of_result = interval_output_offsets[num_intervals];

  // sequentially accumulate the carries
  // note that the last interval does not have a carry
  // XXX find a way to express this loop via a sequential algorithm, perhaps reduce_by_key
  typedef typename std::vector<typename partial_sum_type<Iterator2,BinaryFunction>::type>::size_type size_type;
  for(size_type i = 0; i < carries.size(); ++i)
  {
    // if our interval has a carry, then we need to sum the carry to the next interval's output offset
    // if it does not have a carry, then we need to ignore carry_value[i]
    if(interval_has_carry(i, interval_size, num_intervals, tail_flags.begin()))
    {
      difference_type output_idx = interval_output_offsets[i+1];

      values_result[output_idx] = binary_op(values_result[output_idx], carries[i]);
    }
  }

  return thrust::make_pair(keys_result + size_of_result, values_result + size_of_result);
}
Ejemplo n.º 2
0
 inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); }