template<std::size_t grainsize,
         std::size_t groupsize,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename RandomAccessIterator4>
__device__
void scatter_if(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
                RandomAccessIterator1 first,
                RandomAccessIterator1 last,
                RandomAccessIterator2 map,
                RandomAccessIterator3 stencil,
                RandomAccessIterator4 result)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  size_type chunk_size = g.size() * grainsize;

  size_type n = last - first;

  size_type tid = g.this_exec.index();

  // important special case which avoids the expensive for loop below
  if(chunk_size == n)
  {
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type idx = g.size() * i + tid;

      if(stencil[idx])
      {
        result[map[idx]] = first[idx];
      } // end if
    } // end for
  } // end if
  else if(n < chunk_size)
  {
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type idx = g.size() * i + tid;

      if(idx < (last - first) && stencil[idx])
      {
        result[map[idx]] = first[idx];
      } // end if
    } // end for
  } // end else if
  else
  {
    for(; first < last; first += chunk_size, map += chunk_size, stencil += chunk_size)
    {
      if((last - first) >= chunk_size)
      {
        // avoid conditional accesses when possible
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = g.size() * i + tid;

          if(stencil[idx])
          {
            result[map[idx]] = first[idx];
          } // end if
        } // end for
      } // end if
      else
      {
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = g.size() * i + tid;

          if(idx < (last - first) && stencil[idx])
          {
            result[map[idx]] = first[idx];
          } // end if
        } // end for
      } // end else
    } // end for
  } // end else

  g.wait();
} // end scatter_if
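// A minimal sequential reference for the scatter_if semantics above, assuming
// only standard iterator concepts (this helper is illustrative and not part of
// Bulk): element i of [first, last) is written to result[map[i]] exactly when
// stencil[i] is nonzero. The group-wide version above computes the same thing,
// but distributes the iterations across the agents of g.
template<typename InputIterator,
         typename MapIterator,
         typename StencilIterator,
         typename OutputIterator>
void scatter_if_reference(InputIterator first,
                          InputIterator last,
                          MapIterator map,
                          StencilIterator stencil,
                          OutputIterator result)
{
  for(; first != last; ++first, ++map, ++stencil)
  {
    if(*stencil)
    {
      result[*map] = *first;
    } // end if
  } // end for
} // end scatter_if_reference()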
template<std::size_t size,
         std::size_t grainsize,
         typename RandomAccessIterator1,
         typename Size,
         typename RandomAccessIterator2>
__forceinline__ __device__
typename thrust::detail::enable_if<
  (size * grainsize > 0),
  RandomAccessIterator2
>::type
simple_copy_n(bulk::concurrent_group<agent<grainsize>,size> &g,
              RandomAccessIterator1 first,
              Size n,
              RandomAccessIterator2 result)
{
  typedef bulk::concurrent_group<agent<grainsize>,size> group_type;

  RandomAccessIterator2 return_me = result + n;

  typedef typename group_type::size_type size_type;
  size_type chunk_size = size * grainsize;

  size_type tid = g.this_exec.index();

  // important special case which avoids the expensive for loop below
  if(chunk_size == n)
  {
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type idx = size * i + tid;
      result[idx] = first[idx];
    } // end for
  } // end if
  else
  {
    // XXX i have a feeling the indexing could be rewritten to require less arithmetic
    for(RandomAccessIterator1 last = first + n;
        first < last;
        first += chunk_size, result += chunk_size)
    {
      // avoid conditional accesses when possible
      if((last - first) >= chunk_size)
      {
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = size * i + tid;
          result[idx] = first[idx];
        } // end for
      } // end if
      else
      {
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = size * i + tid;

          if(idx < (last - first))
          {
            result[idx] = first[idx];
          } // end if
        } // end for
      } // end else
    } // end for
  } // end else

  g.wait();

  return return_me;
} // end simple_copy_n()
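// A minimal plain-CUDA sketch of the access pattern simple_copy_n relies on;
// the kernel below is hypothetical and does not use Bulk. Each thread copies
// grainsize elements strided by the block size, so on every iteration of the
// inner loop adjacent threads touch adjacent addresses and the loads and
// stores coalesce; the bounds check corresponds to the partial-chunk branch
// above.
template<int blocksize, int grainsize, typename T>
__global__ void strided_copy_kernel(const T *first, int n, T *result)
{
  int tid  = threadIdx.x;
  int base = blockIdx.x * blocksize * grainsize;

  for(int i = 0; i < grainsize; ++i)
  {
    int idx = base + blocksize * i + tid;

    if(idx < n)
    {
      result[idx] = first[idx];
    } // end if
  } // end for
} // end strided_copy_kernel()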
template<std::size_t groupsize,
         std::size_t grainsize,
         typename RandomAccessIterator,
         typename T,
         typename BinaryFunction>
__device__
T accumulate(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
             RandomAccessIterator first,
             RandomAccessIterator last,
             T init,
             BinaryFunction binary_op)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  const size_type elements_per_group = groupsize * grainsize;

  size_type tid = g.this_exec.index();

  T sum = init;

  typename thrust::iterator_difference<RandomAccessIterator>::type n = last - first;

  typedef detail::accumulate_detail::buffer<
    groupsize,
    grainsize,
    RandomAccessIterator,
    T
  > buffer_type;

#if __CUDA_ARCH__ >= 200
  buffer_type *buffer = reinterpret_cast<buffer_type*>(bulk::malloc(g, sizeof(buffer_type)));
#else
  __shared__ uninitialized<buffer_type> buffer_impl;
  buffer_type *buffer = &buffer_impl.get();
#endif

  for(; first < last; first += elements_per_group)
  {
    // XXX each iteration is essentially a bounded accumulate
    size_type partition_size = thrust::min<size_type>(elements_per_group, last - first);

    // copy partition into smem
    bulk::copy_n(g, first, partition_size, buffer->inputs.data());

    T this_sum;
    size_type local_offset = grainsize * g.this_exec.index();

    size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, partition_size - grainsize * tid));

    if(local_size)
    {
      this_sum = buffer->inputs[local_offset];
      this_sum = bulk::accumulate(bound<grainsize-1>(g.this_exec),
                                  buffer->inputs.data() + local_offset + 1,
                                  buffer->inputs.data() + local_offset + local_size,
                                  this_sum,
                                  binary_op);
    } // end if

    g.wait();

    if(local_size)
    {
      buffer->sums[tid] = this_sum;
    } // end if

    g.wait();

    // sum over the group
    sum = accumulate_detail::destructive_accumulate_n(g, buffer->sums.data(), thrust::min<size_type>(groupsize,n), sum, binary_op);
  } // end for

#if __CUDA_ARCH__ >= 200
  bulk::free(g, buffer);
#endif

  return sum;
} // end accumulate
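// A hypothetical plain-CUDA sketch (not Bulk code) of the two-phase scheme the
// accumulate above applies to each staged tile: every thread serially folds its
// grainsize-wide slice into a private partial sum, the partial sums are parked
// in shared memory, and those partials are then folded into the running total.
// Here a single thread performs the final fold and broadcasts it, whereas
// destructive_accumulate_n above cooperates across the whole group.
template<int blocksize, int grainsize, typename T, typename BinaryFunction>
__device__ T block_accumulate_sketch(const T *tile, int tile_size, T init, BinaryFunction binary_op)
{
  __shared__ T sums[blocksize];
  __shared__ T total;

  int tid          = threadIdx.x;
  int local_offset = grainsize * tid;
  int local_size   = max(0, min(grainsize, tile_size - local_offset));

  // phase 1: each thread folds its own slice of the tile
  if(local_size > 0)
  {
    T partial = tile[local_offset];
    for(int i = 1; i < local_size; ++i)
    {
      partial = binary_op(partial, tile[local_offset + i]);
    } // end for

    sums[tid] = partial;
  } // end if

  __syncthreads();

  // phase 2: one thread folds the partial sums into the running total
  if(tid == 0)
  {
    T result = init;
    int num_partials = min(blocksize, (tile_size + grainsize - 1) / grainsize);

    for(int i = 0; i < num_partials; ++i)
    {
      result = binary_op(result, sums[i]);
    } // end for

    total = result;
  } // end if

  __syncthreads();

  return total;
} // end block_accumulate_sketch()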