Example #1
__device__
RandomAccessIterator3 merge(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &exec,
                            RandomAccessIterator1 first1, RandomAccessIterator1 last1,
                            RandomAccessIterator2 first2, RandomAccessIterator2 last2,
                            RandomAccessIterator3 result,
                            Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  typedef typename thrust::iterator_value<RandomAccessIterator3>::type value_type;

  value_type *buffer = reinterpret_cast<value_type*>(bulk::malloc(exec, exec.size() * exec.grainsize() * sizeof(value_type)));

  size_type chunk_size = exec.size() * exec.this_exec.grainsize();

  size_type n1 = last1 - first1;
  size_type n2 = last2 - first2;

  // avoid the search & loop when possible
  if(n1 + n2 <= chunk_size)
  {
    result = detail::merge_detail::bounded_merge_with_buffer(exec, first1, last1, first2, last2, buffer, result, comp);
  } // end if
  else
  {
    while((first1 < last1) || (first2 < last2))
    {
      size_type n1 = last1 - first1;
      size_type n2 = last2 - first2;

      size_type diag = thrust::min<size_type>(chunk_size, n1 + n2);

      size_type mp = bulk::merge_path(first1, n1, first2, n2, diag, comp);

      result = detail::merge_detail::bounded_merge_with_buffer(exec,
                                                               first1, first1 + mp,
                                                               first2, first2 + diag - mp,
                                                               buffer,
                                                               result,
                                                               comp);

      first1 += mp;
      first2 += diag - mp;
    } // end while
  } // end else

  bulk::free(exec, buffer);

  return result;
} // end merge()
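The loop above carves off one chunk_size-wide diagonal per iteration: bulk::merge_path finds how many elements of the first range fall before that diagonal, and bounded_merge_with_buffer merges exactly that slice through the scratch buffer. As a point of reference, here is a minimal standalone sketch of the diagonal binary search such a merge-path routine is assumed to perform; merge_path_sketch is a hypothetical re-implementation for illustration, not Bulk's bulk::merge_path.

// Hypothetical standalone sketch of a merge-path search (assumption: the
// standard lower-bound diagonal search; this is not Bulk's implementation).
// Given cross-diagonal `diag` of the merge matrix, return how many elements
// of the first range precede that diagonal in the merged output.
template<typename Iterator1, typename Iterator2, typename Size, typename Compare>
__host__ __device__
Size merge_path_sketch(Iterator1 first1, Size n1,
                       Iterator2 first2, Size n2,
                       Size diag, Compare comp)
{
  Size begin = (diag > n2) ? (diag - n2) : Size(0);
  Size end   = (diag < n1) ? diag : n1;

  while(begin < end)
  {
    Size mid = (begin + end) / 2;

    // take from the first range on ties to keep the merge stable
    if(comp(first2[diag - 1 - mid], first1[mid]))
    {
      end = mid;
    }
    else
    {
      begin = mid + 1;
    }
  }

  return begin;
}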
Example #2
  __device__ void operator()(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &this_group,
                             RandomAccessIterator1 first,
                             Decomposition decomp,
                             RandomAccessIterator2 result,
                             BinaryFunction binary_op)
  {
    typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;

    typename Decomposition::range rng = decomp[this_group.index()];

    value_type init = first[rng.second-1];

    value_type sum = bulk::reduce(this_group, first + rng.first, first + rng.second - 1, init, binary_op);

    if(this_group.this_exec.index() == 0)
    {
      result[this_group.index()] = sum;
    } // end if
  } // end operator()
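This functor reduces one tile per group: decomp[i] is assumed to yield a half-open index range with first/second members, the tile's last element seeds init, and bulk::reduce folds the remaining elements before agent 0 writes the per-group sum. Purely to make that indexing concrete, here is a hypothetical decomposition type with the shape the functor expects; even_decomposition is an illustration, not Bulk's actual decomposition class.

// Hypothetical even-split decomposition (assumption: Decomposition exposes a
// nested `range` with `first`/`second` members and an operator[] over group indices).
struct even_decomposition
{
  struct range
  {
    long first;   // inclusive start of the tile
    long second;  // exclusive end of the tile
  };

  long n;          // total number of input elements
  long num_tiles;  // number of groups

  __host__ __device__
  range operator[](long i) const
  {
    long tile_size = (n + num_tiles - 1) / num_tiles;  // ceiling division

    range r;
    r.first  = i * tile_size;
    r.second = (r.first + tile_size < n) ? (r.first + tile_size) : n;
    return r;
  }
};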
Example #3
__forceinline__ __device__
typename thrust::detail::enable_if<
  (size * grainsize > 0),
  RandomAccessIterator2
>::type
simple_copy_n(bulk::concurrent_group<
              agent<grainsize>,
              size
              > &g,
              RandomAccessIterator1 first, Size n,
              RandomAccessIterator2 result)
{
    typedef bulk::concurrent_group<
      agent<grainsize>,
      size
    > group_type;

    RandomAccessIterator2 return_me = result + n;

    typedef typename group_type::size_type size_type;
    size_type chunk_size = size * grainsize;

    size_type tid = g.this_exec.index();

    // important special case which avoids the expensive for loop below
    if(chunk_size == n)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = size * i + tid;
            result[idx] = first[idx];
        } // end for
    } // end if
    else
    {
        // XXX i have a feeling the indexing could be rewritten to require less arithmetic
        for(RandomAccessIterator1 last = first + n;
                first < last;
                first += chunk_size, result += chunk_size)
        {
            // avoid conditional accesses when possible
            if((last - first) >= chunk_size)
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = size * i + tid;
                    result[idx] = first[idx];
                } // end for
            } // end if
            else
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = size * i + tid;
                    if(idx < (last - first))
                    {
                        result[idx] = first[idx];
                    } // end if
                } // end for
            } // end else
        } // end for
    } // end else

    g.wait();

    return return_me;
} // end simple_copy_n()
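The indexing size * i + tid used throughout gives coalesced loads and stores: in each inner iteration the group's agents touch a contiguous run of chunk_size elements. A stripped-down standalone CUDA kernel with the same access pattern, written without Bulk (block size and grain size are compile-time parameters of this sketch only):

// Standalone sketch of the chunked, coalesced copy pattern used by simple_copy_n:
// each of BLOCK_SIZE threads copies GRAINSIZE elements, striding by BLOCK_SIZE
// so that neighboring threads access neighboring addresses.
template<int BLOCK_SIZE, int GRAINSIZE, typename T>
__global__ void chunked_copy(const T *in, T *out, int n)
{
  const int chunk_size = BLOCK_SIZE * GRAINSIZE;
  int base = blockIdx.x * chunk_size;

  for(int i = 0; i < GRAINSIZE; ++i)
  {
    int idx = base + BLOCK_SIZE * i + threadIdx.x;

    if(idx < n)
    {
      out[idx] = in[idx];
    }
  }
}

// example launch: chunked_copy<256,4><<<(n + 1023) / 1024, 256>>>(in, out, n);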
Example #4
__device__
void scatter_if(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
                RandomAccessIterator1 first,
                RandomAccessIterator1 last,
                RandomAccessIterator2 map,
                RandomAccessIterator3 stencil,
                RandomAccessIterator4 result)
{
    typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

    size_type chunk_size = g.size() * grainsize;

    size_type n = last - first;

    size_type tid = g.this_exec.index();

    // important special case which avoids the expensive for loop below
    if(chunk_size == n)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = g.size() * i + tid;

            if(stencil[idx])
            {
                result[map[idx]] = first[idx];
            } // end if
        } // end for
    } // end if
    else if(n < chunk_size)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = g.size() * i + tid;

            if(idx < (last - first) && stencil[idx])
            {
                result[map[idx]] = first[idx];
            } // end if
        } // end for
    } // end if
    else
    {
        for(;
                first < last;
                first += chunk_size, map += chunk_size, stencil += chunk_size)
        {
            if((last - first) >= chunk_size)
            {
                // avoid conditional accesses when possible
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = g.size() * i + tid;

                    if(stencil[idx])
                    {
                        result[map[idx]] = first[idx];
                    } // end if
                } // end for
            } // end if
            else
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = g.size() * i + tid;

                    if(idx < (last - first) && stencil[idx])
                    {
                        result[map[idx]] = first[idx];
                    } // end if
                } // end for
            } // end else
        } // end for
    } // end else

    g.wait();
} // end scatter_if
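Semantically this is a group-wide counterpart of thrust::scatter_if: element i is written to result[map[i]] only where stencil[i] is nonzero. A small host-side illustration of that contract using Thrust's own scatter_if (this does not call the device routine above):

#include <thrust/device_vector.h>
#include <thrust/scatter.h>

int main()
{
  thrust::device_vector<int> values(4);
  values[0] = 10; values[1] = 20; values[2] = 30; values[3] = 40;

  thrust::device_vector<int> map(4);
  map[0] = 3; map[1] = 2; map[2] = 1; map[3] = 0;

  thrust::device_vector<int> stencil(4);
  stencil[0] = 1; stencil[1] = 0; stencil[2] = 1; stencil[3] = 0;

  thrust::device_vector<int> output(4, -1);

  // output[map[i]] = values[i] wherever stencil[i] is nonzero
  thrust::scatter_if(values.begin(), values.end(),
                     map.begin(), stencil.begin(),
                     output.begin());

  // output is now {-1, 30, -1, 10}
  return 0;
}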
Example #5
__device__
T accumulate(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
             RandomAccessIterator first,
             RandomAccessIterator last,
             T init,
             BinaryFunction binary_op)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  const size_type elements_per_group = groupsize * grainsize;

  size_type tid = g.this_exec.index();

  T sum = init;

  typename thrust::iterator_difference<RandomAccessIterator>::type n = last - first;

  typedef detail::accumulate_detail::buffer<
    groupsize,
    grainsize,
    RandomAccessIterator,
    T
  > buffer_type;

#if __CUDA_ARCH__ >= 200
  buffer_type *buffer = reinterpret_cast<buffer_type*>(bulk::malloc(g, sizeof(buffer_type)));
#else
  __shared__ uninitialized<buffer_type> buffer_impl;
  buffer_type *buffer = &buffer_impl.get();
#endif
  
  for(; first < last; first += elements_per_group)
  {
    // XXX each iteration is essentially a bounded accumulate
    
    size_type partition_size = thrust::min<size_type>(elements_per_group, last - first);
    
    // copy partition into smem
    bulk::copy_n(g, first, partition_size, buffer->inputs.data());
    
    T this_sum;
    size_type local_offset = grainsize * g.this_exec.index();

    size_type local_size = thrust::max<size_type>(0,thrust::min<size_type>(grainsize, partition_size - grainsize * tid));

    if(local_size)
    {
      this_sum = buffer->inputs[local_offset];
      this_sum = bulk::accumulate(bound<grainsize-1>(g.this_exec),
                                  buffer->inputs.data() + local_offset + 1,
                                  buffer->inputs.data() + local_offset + local_size,
                                  this_sum,
                                  binary_op);
    } // end if

    g.wait();

    if(local_size)
    {
      buffer->sums[tid] = this_sum;
    } // end if

    g.wait();
    
    // sum over the group
    sum = accumulate_detail::destructive_accumulate_n(g, buffer->sums.data(), thrust::min<size_type>(groupsize,n), sum, binary_op);
  } // end for

#if __CUDA_ARCH__ >= 200
  bulk::free(g, buffer);
#endif

  return sum;
} // end accumulate
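The structure above is the usual two-level accumulation: each agent serially folds its own grainsize-long run of the staged tile, the per-agent partials land in buffer->sums, and destructive_accumulate_n combines them group-wide before the next tile is staged. A stripped-down standalone CUDA sketch of that scheme, with the binary operation fixed to addition and a power-of-two block size assumed (an illustration of the pattern, not Bulk's code):

// Standalone sketch of the two-level accumulate:
// 1) stage a tile into shared memory with coalesced loads,
// 2) each thread serially sums its own contiguous GRAINSIZE-long run,
// 3) a shared-memory tree reduction combines the per-thread partials.
// Assumes BLOCK_SIZE is a power of two.
template<int BLOCK_SIZE, int GRAINSIZE>
__global__ void block_accumulate(const int *in, int n, int *block_sums)
{
  __shared__ int tile[BLOCK_SIZE * GRAINSIZE];
  __shared__ int partial[BLOCK_SIZE];

  int base = blockIdx.x * BLOCK_SIZE * GRAINSIZE;

  // stage the tile (zero-pad the tail so the sum is unaffected)
  for(int i = 0; i < GRAINSIZE; ++i)
  {
    int idx = BLOCK_SIZE * i + threadIdx.x;
    tile[idx] = (base + idx < n) ? in[base + idx] : 0;
  }
  __syncthreads();

  // serial fold over this thread's contiguous run
  int sum = 0;
  for(int i = 0; i < GRAINSIZE; ++i)
  {
    sum += tile[GRAINSIZE * threadIdx.x + i];
  }
  partial[threadIdx.x] = sum;
  __syncthreads();

  // tree reduction over the per-thread partials
  for(int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2)
  {
    if(threadIdx.x < offset)
    {
      partial[threadIdx.x] += partial[threadIdx.x + offset];
    }
    __syncthreads();
  }

  if(threadIdx.x == 0)
  {
    block_sums[blockIdx.x] = partial[0];
  }
}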