예제 #1
0
__device__
typename thrust::detail::enable_if<
bound <= groupsize * grainsize
>::type
scatter_if(bulk::bounded<
           bound,
           bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
           > &g,
           RandomAccessIterator1 first,
           RandomAccessIterator1 last,
           RandomAccessIterator2 map,
           RandomAccessIterator3 stencil,
           RandomAccessIterator4 result)
{
    typedef typename bulk::bounded<
    bound,
    bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
    >::size_type size_type;

    typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::agent_type agent_type;

    size_type n = last - first;

    size_type tid = g.this_exec.index();

    // avoid branches when possible
    if(n == bound)
    {
        for(size_type i = 0; i < g.this_exec.grainsize(); ++i)
        {
            size_type idx = g.size() * i + tid;

            if(stencil[idx])
            {
                result[map[idx]] = first[idx];
            } // end if
        } // end for
    } // end if
    else if(n < bound)
    {
        for(size_type i = 0; i < g.this_exec.grainsize(); ++i)
        {
            size_type idx = g.size() * i + tid;

            if(idx < (last - first) && stencil[idx])
            {
                result[map[idx]] = first[idx];
            } // end if
        } // end for
    } // end if

    g.wait();
} // end scatter_if()
예제 #2
0
__device__
typename thrust::detail::enable_if<
  (bound <= groupsize * grainsize)
>::type
inplace_merge(bulk::bounded<
                bound,
                bulk::concurrent_group<
                  bulk::agent<grainsize>,
                  groupsize
                >
              > &g,
              RandomAccessIterator first, RandomAccessIterator middle, RandomAccessIterator last,
              Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  size_type n1 = middle - first;
  size_type n2 = last - middle;

  // find the start of each local merge
  size_type local_offset = grainsize * g.this_exec.index();

  size_type mp = bulk::merge_path(first, n1, middle, n2, local_offset, comp);
  
  // do a local sequential merge
  size_type local_offset1 = mp;
  size_type local_offset2 = n1 + local_offset - mp;

  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
  value_type local_result[grainsize];
  bulk::merge(bulk::bound<grainsize>(g.this_exec),
              first + local_offset1, middle,
              first + local_offset2, last,
              local_result,
              comp);

  g.wait();

  // copy local result back to source
  // this is faster than getting the size from merge's result
  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n1 + n2 - local_offset));
  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_result, local_size, first + local_offset); 

  g.wait();
} // end inplace_merge()
예제 #3
0
__device__
typename thrust::detail::enable_if<
  (bound <= groupsize * grainsize),
  RandomAccessIterator3
>::type
merge(bulk::bounded<
        bound,
        bulk::concurrent_group<
          bulk::agent<grainsize>,
          groupsize
        >
      > &g,
      RandomAccessIterator1 first1, RandomAccessIterator1 last1,
      RandomAccessIterator2 first2, RandomAccessIterator2 last2,
      RandomAccessIterator3 result,
      Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  size_type n1 = last1 - first1;
  size_type n2 = last2 - first2;

  // find the start of each local merge
  size_type local_offset = grainsize * g.this_exec.index();

  size_type mp = bulk::merge_path(first1, n1, first2, n2, local_offset, comp);
  
  // do a local sequential merge
  size_type local_offset1 = mp;
  size_type local_offset2 = local_offset - mp;
  
  typedef typename thrust::iterator_value<RandomAccessIterator3>::type value_type;
  value_type local_result[grainsize];
  bulk::merge(bulk::bound<grainsize>(g.this_exec),
              first1 + local_offset1, last1,
              first2 + local_offset2, last2,
              local_result,
              comp);

  // store local result
  // this is faster than getting the size from merge's result
  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n1 + n2 - local_offset));
  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_result, local_size, result + local_offset); 

  g.wait();

  return result + thrust::min<size_type>(groupsize * grainsize, n1 + n2);
} // end merge()
예제 #4
0
파일: reduce.hpp 프로젝트: sebas095/CUDA
__forceinline__ __device__
T reduce(const bulk::bounded<bound,bulk::agent<grainsize> > &exec,
         RandomAccessIterator first,
         RandomAccessIterator last,
         T init,
         BinaryFunction binary_op)
{
  typedef typename bulk::bounded<bound,bulk::agent<grainsize> >::size_type size_type;

  size_type n = last - first;

  for(size_type i = 0; i < exec.bound(); ++i)
  {
    if(i < n)
    {
      init = binary_op(init, first[i]);
    } // end if
  } // end for i

  return init;
} // end reduce()
__device__
typename thrust::detail::enable_if<
(bound <= groupsize * grainsize),
RandomAccessIterator2
>::type
copy_n(bulk::bounded<
       bound,
       concurrent_group<
       agent<grainsize>,
       groupsize
       >
       > &g,
       RandomAccessIterator1 first,
       Size n,
       RandomAccessIterator2 result)
{
    typedef bounded<
    bound,
    concurrent_group<
    agent<grainsize>,
    groupsize
    >
    > group_type;

    typedef typename group_type::size_type size_type;

    size_type tid = g.this_exec.index();

    typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;

    // XXX make this an uninitialized array
    value_type stage[grainsize];

    // avoid conditional accesses when possible
    if(groupsize * grainsize <= n)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type src_idx = g.size() * i + tid;
            stage[i] = first[src_idx];
        } // end for i

        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type dst_idx = g.size() * i + tid;
            result[dst_idx] = stage[i];
        } // end for i
    } // end if
    else
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type src_idx = g.size() * i + tid;
            if(src_idx < n)
            {
                stage[i] = first[src_idx];
            } // end if
        } // end for

        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type dst_idx = g.size() * i + tid;
            if(dst_idx < n)
            {
                result[dst_idx] = stage[i];
            } // end if
        } // end for
    } // end else

    g.wait();

    return result + thrust::min<Size>(g.size() * grainsize, n);
} // end copy_n()
예제 #6
0
__device__
thrust::pair<RandomAccessIterator5,RandomAccessIterator6>
merge_by_key(bulk::bounded<
               groupsize*grainsize,
               bulk::concurrent_group<bulk::agent<grainsize>, groupsize>
             > &g,
             RandomAccessIterator1 keys_first1, RandomAccessIterator1 keys_last1,
             RandomAccessIterator2 keys_first2, RandomAccessIterator2 keys_last2,
             RandomAccessIterator3 values_first1,
             RandomAccessIterator4 values_first2,
             RandomAccessIterator5 keys_result,
             RandomAccessIterator6 values_result,
             Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  typedef typename thrust::iterator_value<RandomAccessIterator5>::type key_type;

#if __CUDA_ARCH__ >= 200
  union
  {
    key_type  *keys;
    size_type *indices;
  } stage;

  stage.keys = static_cast<key_type*>(bulk::malloc(g, groupsize * grainsize * thrust::max(sizeof(key_type), sizeof(size_type))));
#else
  __shared__ union
  {
    key_type  keys[groupsize * grainsize];
    size_type indices[groupsize * grainsize];
  } stage;
#endif

  size_type n1 = keys_last1 - keys_first1;
  size_type n2 = keys_last2 - keys_first2;
  size_type  n = n1 + n2;
  
  // copy keys into stage
  bulk::copy_n(g,
               thrust::detail::make_join_iterator(keys_first1, n1, keys_first2),
               n,
               stage.keys);

  // find the start of each agent's sequential merge
  size_type diag = thrust::min<size_type>(n1 + n2, grainsize * g.this_exec.index());
  size_type mp = bulk::merge_path(stage.keys, n1, stage.keys + n1, n2, diag, comp);
  
  // compute the ranges of the sources in the stage.
  size_type start1 = mp;
  size_type start2 = n1 + diag - mp;

  size_type end1 = n1;
  size_type end2 = n1 + n2;
  
  // each agent merges sequentially
  key_type  results[grainsize];
  size_type indices[grainsize];
  bulk::merge_by_key(bulk::bound<grainsize>(g.this_exec),
                     stage.keys + start1, stage.keys + end1,
                     stage.keys + start2, stage.keys + end2,
                     thrust::make_counting_iterator<size_type>(start1),
                     thrust::make_counting_iterator<size_type>(start2),
                     results,
                     indices,
                     comp);
  g.wait();
  
  // each agent stores merged keys back to the stage
  size_type local_offset = grainsize * g.this_exec.index();
  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n - local_offset));
  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), results, local_size, stage.keys + local_offset);
  g.wait();
  
  // store merged keys to the result
  keys_result = bulk::copy_n(g, stage.keys, n, keys_result);
  
  // each agent copies the indices into the stage
  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), indices, local_size, stage.indices + local_offset);
  g.wait();
  
  // gather values into merged order
  values_result = bulk::gather(g,
                               stage.indices, stage.indices + n,
                               thrust::detail::make_join_iterator(values_first1, n1, values_first2),
                               values_result);

#if __CUDA_ARCH__ >= 200
  bulk::free(g, stage.keys);
#endif

  return thrust::make_pair(keys_result, values_result);
} // end merge_by_key()