__device__
RandomAccessIterator3 merge(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &exec,
                            RandomAccessIterator1 first1, RandomAccessIterator1 last1,
                            RandomAccessIterator2 first2, RandomAccessIterator2 last2,
                            RandomAccessIterator3 result,
                            Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
  typedef typename thrust::iterator_value<RandomAccessIterator3>::type value_type;

  // stage each merged chunk through a dynamically-allocated buffer
  value_type *buffer = reinterpret_cast<value_type*>(bulk::malloc(exec, exec.size() * exec.grainsize() * sizeof(value_type)));

  size_type chunk_size = exec.size() * exec.this_exec.grainsize();

  size_type n1 = last1 - first1;
  size_type n2 = last2 - first2;

  // avoid the search & loop when possible
  if(n1 + n2 <= chunk_size)
  {
    result = detail::merge_detail::bounded_merge_with_buffer(exec, first1, last1, first2, last2, buffer, result, comp);
  } // end if
  else
  {
    while((first1 < last1) || (first2 < last2))
    {
      size_type n1 = last1 - first1;
      size_type n2 = last2 - first2;

      size_type diag = thrust::min<size_type>(chunk_size, n1 + n2);

      // find where to split the two inputs so this chunk merges exactly diag elements
      size_type mp = bulk::merge_path(first1, n1, first2, n2, diag, comp);

      result = detail::merge_detail::bounded_merge_with_buffer(exec, first1, first1 + mp, first2, first2 + diag - mp, buffer, result, comp);

      first1 += mp;
      first2 += diag - mp;
    } // end while
  } // end else

  bulk::free(exec, buffer);

  return result;
} // end merge()
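// The loop above repeatedly asks bulk::merge_path where to split the two inputs
// so that each chunk merges exactly `diag` elements. The following is a minimal
// host-side sketch of that diagonal binary search (lower-bound flavor); it is an
// illustration of the idea, not the library's implementation, and the name
// `merge_path_sketch` is ours.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Size, typename Compare>
Size merge_path_sketch(RandomAccessIterator1 a, Size n1,
                       RandomAccessIterator2 b, Size n2,
                       Size diag, Compare comp)
{
  // the number of elements taken from a must lie in [max(0, diag - n2), min(diag, n1)]
  Size begin = diag > n2 ? diag - n2 : Size(0);
  Size end   = diag < n1 ? diag : n1;

  while(begin < end)
  {
    Size mid = (begin + end) / 2;

    if(comp(b[diag - 1 - mid], a[mid]))
    {
      end = mid;        // taking a[mid] would merge it ahead of a smaller b element
    } // end if
    else
    {
      begin = mid + 1;  // safe to take at least mid + 1 elements from a
    } // end else
  } // end while

  // merge the first `begin` elements of a with the first `diag - begin` elements of b
  return begin;
} // end merge_path_sketch()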
__device__
void operator()(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &this_group,
                RandomAccessIterator1 first,
                Decomposition decomp,
                RandomAccessIterator2 result,
                BinaryFunction binary_op)
{
  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;

  // this group reduces the tile of input assigned to it by the decomposition
  typename Decomposition::range rng = decomp[this_group.index()];

  // seed the reduction with the tile's last element so the call below only
  // needs to cover [rng.first, rng.second - 1)
  value_type init = first[rng.second-1];

  value_type sum = bulk::reduce(this_group, first + rng.first, first + rng.second - 1, init, binary_op);

  // a single agent records this group's partial sum
  if(this_group.this_exec.index() == 0)
  {
    result[this_group.index()] = sum;
  } // end if
} // end operator()
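// For reference, a serial sketch of what each group computes above: the partial
// reduction of its (non-empty) tile [rng.first, rng.second), seeded with the
// tile's last element exactly as operator() does. Illustration only; the name
// `tile_reduce_sketch` is ours and not part of Bulk.
#include <iterator>

template<typename RandomAccessIterator, typename BinaryFunction>
typename std::iterator_traits<RandomAccessIterator>::value_type
tile_reduce_sketch(RandomAccessIterator first, RandomAccessIterator last, BinaryFunction binary_op)
{
  typedef typename std::iterator_traits<RandomAccessIterator>::value_type value_type;

  // seed with the last element, then fold in the rest
  value_type sum = *(last - 1);

  for(RandomAccessIterator i = first; i != last - 1; ++i)
  {
    sum = binary_op(sum, *i);
  } // end for

  return sum;
} // end tile_reduce_sketch()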
__forceinline__ __device__
typename thrust::detail::enable_if<
  (size * grainsize > 0),
  RandomAccessIterator2
>::type
simple_copy_n(bulk::concurrent_group<agent<grainsize>, size> &g,
              RandomAccessIterator1 first, Size n,
              RandomAccessIterator2 result)
{
  typedef bulk::concurrent_group<agent<grainsize>, size> group_type;

  RandomAccessIterator2 return_me = result + n;

  typedef typename group_type::size_type size_type;
  size_type chunk_size = size * grainsize;

  size_type tid = g.this_exec.index();

  // important special case which avoids the expensive for loop below
  if(chunk_size == n)
  {
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type idx = size * i + tid;
      result[idx] = first[idx];
    } // end for
  } // end if
  else
  {
    // XXX i have a feeling the indexing could be rewritten to require less arithmetic
    for(RandomAccessIterator1 last = first + n;
        first < last;
        first += chunk_size, result += chunk_size)
    {
      // avoid conditional accesses when possible
      if((last - first) >= chunk_size)
      {
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = size * i + tid;
          result[idx] = first[idx];
        } // end for
      } // end if
      else
      {
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = size * i + tid;
          if(idx < (last - first))
          {
            result[idx] = first[idx];
          } // end if
        } // end for
      } // end else
    } // end for
  } // end else

  g.wait();

  return return_me;
} // end simple_copy_n()
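// The copies above use the strided index idx = size * i + tid, so on each pass
// the `size` agents touch `size` consecutive elements (coalesced accesses on
// the GPU). Below is a host-side simulation of one group performing the copy,
// for illustration only; `simulate_group_copy` is not a Bulk function.
#include <cstddef>
#include <vector>

inline void simulate_group_copy(const std::vector<int> &in,
                                std::vector<int> &out,
                                std::size_t size,       // agents per group
                                std::size_t grainsize)  // elements per agent
{
  std::size_t n = in.size();

  for(std::size_t i = 0; i < grainsize; ++i)        // pass i of the copy
  {
    for(std::size_t tid = 0; tid < size; ++tid)     // agents run concurrently on the GPU
    {
      std::size_t idx = size * i + tid;
      if(idx < n)
      {
        out[idx] = in[idx];
      } // end if
    } // end for
  } // end for
} // end simulate_group_copy()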
__device__
void scatter_if(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
                RandomAccessIterator1 first, RandomAccessIterator1 last,
                RandomAccessIterator2 map,
                RandomAccessIterator3 stencil,
                RandomAccessIterator4 result)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  size_type chunk_size = g.size() * grainsize;

  size_type n = last - first;

  size_type tid = g.this_exec.index();

  // important special case which avoids the expensive for loop below
  if(chunk_size == n)
  {
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type idx = g.size() * i + tid;

      if(stencil[idx])
      {
        result[map[idx]] = first[idx];
      } // end if
    } // end for
  } // end if
  else if(n < chunk_size)
  {
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type idx = g.size() * i + tid;

      if(idx < (last - first) && stencil[idx])
      {
        result[map[idx]] = first[idx];
      } // end if
    } // end for
  } // end if
  else
  {
    for(; first < last; first += chunk_size, map += chunk_size, stencil += chunk_size)
    {
      if((last - first) >= chunk_size)
      {
        // avoid conditional accesses when possible
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = g.size() * i + tid;

          if(stencil[idx])
          {
            result[map[idx]] = first[idx];
          } // end if
        } // end for
      } // end if
      else
      {
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = g.size() * i + tid;

          if(idx < (last - first) && stencil[idx])
          {
            result[map[idx]] = first[idx];
          } // end if
        } // end for
      } // end else
    } // end for
  } // end else

  g.wait();
} // end scatter_if
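// Serial reference for the scatter_if semantics implemented cooperatively above:
// element i is written to result[map[i]] only where stencil[i] is true (the same
// contract as thrust::scatter_if). Illustration only; `scatter_if_serial_sketch`
// is not a Bulk function.
template<typename InputIterator1, typename InputIterator2, typename InputIterator3, typename RandomAccessIterator>
void scatter_if_serial_sketch(InputIterator1 first, InputIterator1 last,
                              InputIterator2 map,
                              InputIterator3 stencil,
                              RandomAccessIterator result)
{
  for(; first != last; ++first, ++map, ++stencil)
  {
    if(*stencil)
    {
      result[*map] = *first;
    } // end if
  } // end for
} // end scatter_if_serial_sketch()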
__device__
T accumulate(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
             RandomAccessIterator first,
             RandomAccessIterator last,
             T init,
             BinaryFunction binary_op)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  const size_type elements_per_group = groupsize * grainsize;

  size_type tid = g.this_exec.index();

  T sum = init;

  typename thrust::iterator_difference<RandomAccessIterator>::type n = last - first;

  typedef detail::accumulate_detail::buffer<
    groupsize,
    grainsize,
    RandomAccessIterator,
    T
  > buffer_type;

#if __CUDA_ARCH__ >= 200
  buffer_type *buffer = reinterpret_cast<buffer_type*>(bulk::malloc(g, sizeof(buffer_type)));
#else
  __shared__ uninitialized<buffer_type> buffer_impl;
  buffer_type *buffer = &buffer_impl.get();
#endif

  for(; first < last; first += elements_per_group)
  {
    // XXX each iteration is essentially a bounded accumulate
    size_type partition_size = thrust::min<size_type>(elements_per_group, last - first);

    // copy partition into smem
    bulk::copy_n(g, first, partition_size, buffer->inputs.data());

    T this_sum;
    size_type local_offset = grainsize * g.this_exec.index();

    size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, partition_size - grainsize * tid));

    if(local_size)
    {
      this_sum = buffer->inputs[local_offset];
      this_sum = bulk::accumulate(bound<grainsize-1>(g.this_exec),
                                  buffer->inputs.data() + local_offset + 1,
                                  buffer->inputs.data() + local_offset + local_size,
                                  this_sum,
                                  binary_op);
    } // end if

    g.wait();

    if(local_size)
    {
      buffer->sums[tid] = this_sum;
    } // end if

    g.wait();

    // sum over the group
    sum = accumulate_detail::destructive_accumulate_n(g, buffer->sums.data(), thrust::min<size_type>(groupsize,n), sum, binary_op);
  } // end for

#if __CUDA_ARCH__ >= 200
  bulk::free(g, buffer);
#endif

  return sum;
} // end accumulate
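// A serial sketch of one partition pass of the accumulate above: each of the
// `groupsize` agents folds its `grainsize`-wide slice of the staged partition,
// and the per-agent sums are then folded into the running total. Illustration
// only; `partition_accumulate_sketch` is not a Bulk function.
#include <algorithm>
#include <cstddef>
#include <vector>

template<typename T, typename BinaryFunction>
T partition_accumulate_sketch(const std::vector<T> &partition,  // the staged chunk (smem in the real code)
                              std::size_t groupsize,
                              std::size_t grainsize,
                              T sum,                             // running total carried across partitions
                              BinaryFunction binary_op)
{
  std::vector<T> sums;  // plays the role of buffer->sums

  // phase 1: each agent accumulates its local slice
  for(std::size_t tid = 0; tid < groupsize; ++tid)
  {
    std::size_t offset = grainsize * tid;
    if(offset >= partition.size()) break;

    std::size_t end = std::min(offset + grainsize, partition.size());

    T this_sum = partition[offset];
    for(std::size_t i = offset + 1; i < end; ++i)
    {
      this_sum = binary_op(this_sum, partition[i]);
    } // end for

    sums.push_back(this_sum);
  } // end for

  // phase 2: fold the per-agent sums into the running total
  for(std::size_t i = 0; i < sums.size(); ++i)
  {
    sum = binary_op(sum, sums[i]);
  } // end for

  return sum;
} // end partition_accumulate_sketch()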