C++ (Cpp) concurrent_group::wait Beispiele

Programmiersprache: C++ (Cpp)

Namespace / Paketname: bulk

Klasse / Typ: concurrent_group

Methode / Funktion: wait

Beispiele auf hotexamples.com: 3

C++ (Cpp) concurrent_group::wait - 3 Beispiele gefunden. Dies sind die am besten bewerteten C++ (Cpp) Beispiele für die bulk::concurrent_group::wait, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

wait(3)

size(2)

grainsize(1)

index(1)

Häufig verwendete Methoden

wait (3)

size (2)

grainsize (1)

index (1)

Beispiel #1

Datei anzeigen

Datei: scatter.hpp Projekt: johnforce/thrust

__device__
void scatter_if(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
                RandomAccessIterator1 first,
                RandomAccessIterator1 last,
                RandomAccessIterator2 map,
                RandomAccessIterator3 stencil,
                RandomAccessIterator4 result)
{
    typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

    size_type chunk_size = g.size() * grainsize;

    size_type n = last - first;

    size_type tid = g.this_exec.index();

    // important special case which avoids the expensive for loop below
    if(chunk_size == n)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = g.size() * i + tid;

            if(stencil[idx])
            {
                result[map[idx]] = first[idx];
            } // end if
        } // end for
    } // end if
    else if(n < chunk_size)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = g.size() * i + tid;

            if(idx < (last - first) && stencil[idx])
            {
                result[map[idx]] = first[idx];
            } // end if
        } // end for
    } // end if
    else
    {
        for(;
                first < last;
                first += chunk_size, map += chunk_size, stencil += chunk_size)
        {
            if((last - first) >= chunk_size)
            {
                // avoid conditional accesses when possible
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = g.size() * i + tid;

                    if(stencil[idx])
                    {
                        result[map[idx]] = first[idx];
                    } // end if
                } // end for
            } // end if
            else
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = g.size() * i + tid;

                    if(idx < (last - first) && stencil[idx])
                    {
                        result[map[idx]] = first[idx];
                    } // end if
                } // end for
            } // end else
        } // end for
    } // end else

    g.wait();
} // end scatter_if

Beispiel #2

Datei anzeigen

Datei: copy.hpp Projekt: nicholasferguson/thrust_using_tbb_instead_of_GPU

__forceinline__ __device__
typename thrust::detail::enable_if<
(size * grainsize > 0),
RandomAccessIterator2
>::type
simple_copy_n(bulk::concurrent_group<
              agent<grainsize>,
              size
              > &g,
              RandomAccessIterator1 first, Size n,
              RandomAccessIterator2 result)
{
    typedef bulk::concurrent_group<
    agent<grainsize>,
          size
          > group_type;

    RandomAccessIterator2 return_me = result + n;

    typedef typename group_type::size_type size_type;
    size_type chunk_size = size * grainsize;

    size_type tid = g.this_exec.index();

    // important special case which avoids the expensive for loop below
    if(chunk_size == n)
    {
        for(size_type i = 0; i < grainsize; ++i)
        {
            size_type idx = size * i + tid;
            result[idx] = first[idx];
        } // end for
    } // end if
    else
    {
        // XXX i have a feeling the indexing could be rewritten to require less arithmetic
        for(RandomAccessIterator1 last = first + n;
                first < last;
                first += chunk_size, result += chunk_size)
        {
            // avoid conditional accesses when possible
            if((last - first) >= chunk_size)
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = size * i + tid;
                    result[idx] = first[idx];
                } // end for
            } // end if
            else
            {
                for(size_type i = 0; i < grainsize; ++i)
                {
                    size_type idx = size * i + tid;
                    if(idx < (last - first))
                    {
                        result[idx] = first[idx];
                    } // end if
                } // end for
            } // end else
        } // end for
    } // end else

    g.wait();

    return return_me;
} // end simple_copy_n()

Beispiel #3

Datei anzeigen

Datei: accumulate.hpp Projekt: sebas095/CUDA

__device__
T accumulate(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
             RandomAccessIterator first,
             RandomAccessIterator last,
             T init,
             BinaryFunction binary_op)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  const size_type elements_per_group = groupsize * grainsize;

  size_type tid = g.this_exec.index();

  T sum = init;

  typename thrust::iterator_difference<RandomAccessIterator>::type n = last - first;

  typedef detail::accumulate_detail::buffer<
    groupsize,
    grainsize,
    RandomAccessIterator,
    T
  > buffer_type;

#if __CUDA_ARCH__ >= 200
  buffer_type *buffer = reinterpret_cast<buffer_type*>(bulk::malloc(g, sizeof(buffer_type)));
#else
  __shared__ uninitialized<buffer_type> buffer_impl;
  buffer_type *buffer = &buffer_impl.get();
#endif
  
  for(; first < last; first += elements_per_group)
  {
    // XXX each iteration is essentially a bounded accumulate
    
    size_type partition_size = thrust::min<size_type>(elements_per_group, last - first);
    
    // copy partition into smem
    bulk::copy_n(g, first, partition_size, buffer->inputs.data());
    
    T this_sum;
    size_type local_offset = grainsize * g.this_exec.index();

    size_type local_size = thrust::max<size_type>(0,thrust::min<size_type>(grainsize, partition_size - grainsize * tid));

    if(local_size)
    {
      this_sum = buffer->inputs[local_offset];
      this_sum = bulk::accumulate(bound<grainsize-1>(g.this_exec),
                                  buffer->inputs.data() + local_offset + 1,
                                  buffer->inputs.data() + local_offset + local_size,
                                  this_sum,
                                  binary_op);
    } // end if

    g.wait();

    if(local_size)
    {
      buffer->sums[tid] = this_sum;
    } // end if

    g.wait();
    
    // sum over the group
    sum = accumulate_detail::destructive_accumulate_n(g, buffer->sums.data(), thrust::min<size_type>(groupsize,n), sum, binary_op);
  } // end for

#if __CUDA_ARCH__ >= 200
  bulk::free(g, buffer);
#endif

  return sum;
} // end accumulate