Example #1
template<std::size_t groupsize,
         std::size_t grainsize,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename RandomAccessIterator4>
__device__
void scatter_if(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
                RandomAccessIterator1 first,
                RandomAccessIterator1 last,
                RandomAccessIterator2 map,
                RandomAccessIterator3 stencil,
                RandomAccessIterator4 result)
{
    typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

    size_type chunk_size = g.size() * grainsize;

    size_type n = last - first;

    size_type tid = g.this_exec.index();

    // important special case which avoids the expensive for loop below
    if(chunk_size == n)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = g.size() * i + tid;

            if(stencil[idx])
            {
                result[map[idx]] = first[idx];
            } // end if
        } // end for
    } // end if
    else if(n < chunk_size)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = g.size() * i + tid;

            if(idx < (last - first) && stencil[idx])
            {
                result[map[idx]] = first[idx];
            } // end if
        } // end for
    } // end if
    else
    {
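        // general case: advance the group one full chunk at a time,
        // falling back to guarded accesses only for the final partial chunk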
        for(;
                first < last;
                first += chunk_size, map += chunk_size, stencil += chunk_size)
        {
            if((last - first) >= chunk_size)
            {
                // avoid conditional accesses when possible
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = g.size() * i + tid;

                    if(stencil[idx])
                    {
                        result[map[idx]] = first[idx];
                    } // end if
                } // end for
            } // end if
            else
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = g.size() * i + tid;

                    if(idx < (last - first) && stencil[idx])
                    {
                        result[map[idx]] = first[idx];
                    } // end if
                } // end for
            } // end else
        } // end for
    } // end else

    g.wait();
} // end scatter_if
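A minimal usage sketch (not part of the original example): one way to drive this group-cooperative scatter_if is through Bulk's async/grid launch idiom, here with a single concurrent group covering the whole range. The functor name and the group shape below are assumptions for illustration.

struct scatter_if_task
{
    template<std::size_t grainsize, std::size_t groupsize,
             typename It1, typename It2, typename It3, typename It4>
    __device__
    void operator()(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &this_group,
                    It1 first, It1 last, It2 map, It3 stencil, It4 result)
    {
        scatter_if(this_group, first, last, map, stencil, result);
    }
};

// launch one group of 128 agents, each with a grain of 3 elements (sizes assumed)
bulk::async(bulk::grid<128,3>(1), scatter_if_task(),
            bulk::root.this_exec, first, last, map, stencil, result);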
Example #2

template<std::size_t size,
         std::size_t grainsize,
         typename RandomAccessIterator1,
         typename Size,
         typename RandomAccessIterator2>
__forceinline__ __device__
typename thrust::detail::enable_if<
    (size * grainsize > 0),
    RandomAccessIterator2
>::type
simple_copy_n(bulk::concurrent_group<
              agent<grainsize>,
              size
              > &g,
              RandomAccessIterator1 first, Size n,
              RandomAccessIterator2 result)
{
    typedef bulk::concurrent_group<
        agent<grainsize>,
        size
    > group_type;

    RandomAccessIterator2 return_me = result + n;

    typedef typename group_type::size_type size_type;
    size_type chunk_size = size * grainsize;

    size_type tid = g.this_exec.index();

    // important special case which avoids the expensive for loop below
    if(chunk_size == n)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = size * i + tid;
            result[idx] = first[idx];
        } // end for
    } // end if
    else
    {
        // XXX i have a feeling the indexing could be rewritten to require less arithmetic
        for(RandomAccessIterator1 last = first + n;
                first < last;
                first += chunk_size, result += chunk_size)
        {
            // avoid conditional accesses when possible
            if((last - first) >= chunk_size)
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = size * i + tid;
                    result[idx] = first[idx];
                } // end for
            } // end if
            else
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = size * i + tid;
                    if(idx < (last - first))
                    {
                        result[idx] = first[idx];
                    } // end if
                } // end for
            } // end else
        } // end for
    } // end else

    g.wait();

    return return_me;
} // end simple_copy_n()
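A minimal sketch of the intended use (an assumption, not part of the original example): a concurrent group stages one tile of global memory into shared memory with simple_copy_n before operating on it cooperatively. The tile shape and element type are placeholders. Because simple_copy_n ends with g.wait(), the staged tile is visible to every agent of the group on return.

struct stage_and_process
{
    template<typename ConcurrentGroup, typename Iterator>
    __device__
    void operator()(ConcurrentGroup &g, Iterator input)
    {
        // a tile of groupsize * grainsize elements, here 128 * 3 (assumed sizes)
        __shared__ int tile[128 * 3];

        simple_copy_n(g, input, 128 * 3, tile);

        // ...the group may now operate on tile cooperatively...
    }
};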
Example #3
template<std::size_t groupsize,
         std::size_t grainsize,
         typename RandomAccessIterator,
         typename T,
         typename BinaryFunction>
__device__
T accumulate(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
             RandomAccessIterator first,
             RandomAccessIterator last,
             T init,
             BinaryFunction binary_op)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  const size_type elements_per_group = groupsize * grainsize;

  size_type tid = g.this_exec.index();

  T sum = init;

  typedef detail::accumulate_detail::buffer<
    groupsize,
    grainsize,
    RandomAccessIterator,
    T
  > buffer_type;
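
  // stage each partition and the per-agent partial sums in a buffer:
  // heap-allocated via bulk::malloc on sm_20 and newer, placed directly
  // in __shared__ memory otherwise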

#if __CUDA_ARCH__ >= 200
  buffer_type *buffer = reinterpret_cast<buffer_type*>(bulk::malloc(g, sizeof(buffer_type)));
#else
  __shared__ uninitialized<buffer_type> buffer_impl;
  buffer_type *buffer = &buffer_impl.get();
#endif
  
  for(; first < last; first += elements_per_group)
  {
    // XXX each iteration is essentially a bounded accumulate
    
    size_type partition_size = thrust::min<size_type>(elements_per_group, last - first);
    
    // copy partition into smem
    bulk::copy_n(g, first, partition_size, buffer->inputs.data());
    
    T this_sum;
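    // each agent owns a contiguous run of grainsize elements of the staged
    // partition; the trailing agents' runs are clamped to the partition boundary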
    size_type local_offset = grainsize * tid;

    size_type local_size = thrust::max<size_type>(0,thrust::min<size_type>(grainsize, partition_size - grainsize * tid));

    if(local_size)
    {
      this_sum = buffer->inputs[local_offset];
      this_sum = bulk::accumulate(bound<grainsize-1>(g.this_exec),
                                  buffer->inputs.data() + local_offset + 1,
                                  buffer->inputs.data() + local_offset + local_size,
                                  this_sum,
                                  binary_op);
    } // end if

    g.wait();

    if(local_size)
    {
      buffer->sums[tid] = this_sum;
    } // end if

    g.wait();
    
    // sum over the group; only the first ceil(partition_size / grainsize)
    // agents produced a partial sum this iteration, so accumulate just those
    size_type num_sums = thrust::min<size_type>(groupsize, (partition_size + grainsize - 1) / grainsize);
    sum = accumulate_detail::destructive_accumulate_n(g, buffer->sums.data(), num_sums, sum, binary_op);
  } // end for

#if __CUDA_ARCH__ >= 200
  bulk::free(g, buffer);
#endif

  return sum;
} // end accumulate
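A minimal end-to-end sketch (assumed names and launch shape, following Bulk's async/grid idiom): a single group reduces the whole range and one agent publishes the result. thrust::plus comes from <thrust/functional.h>.

struct accumulate_task
{
  template<typename ConcurrentGroup, typename Iterator, typename Pointer>
  __device__
  void operator()(ConcurrentGroup &g, Iterator first, Iterator last, Pointer result)
  {
    int sum = accumulate(g, first, last, 0, thrust::plus<int>());

    // a single agent publishes the group-wide sum
    if(g.this_exec.index() == 0)
    {
      *result = sum;
    }
  }
};

// one group of 128 agents with a grain of 7; a heap size is passed because
// accumulate calls bulk::malloc on sm_20 and newer (all sizes are assumptions)
bulk::async(bulk::grid<128,7>(1, heap_size), accumulate_task(),
            bulk::root.this_exec, first, last, result_ptr);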