// Merges the ranges [first1, last1) and [first2, last2) into result on behalf
// of the group exec, using comp for ordering.
//
// The actual merging is delegated to
// detail::merge_detail::bounded_merge_with_buffer, which is handed a scratch
// buffer obtained from bulk::malloc and is never asked to consume more than
// chunk_size = exec.size() * exec.this_exec.grainsize() input elements per
// call. Inputs larger than that are processed in successive slices whose
// split between the two ranges is chosen by bulk::merge_path.
//
// Returns the iterator produced by the last bounded_merge_with_buffer call
// (presumably the end of the merged output -- confirm against that helper).
//
// groupsize, grainsize, the iterator types and Compare are not declared in
// this block; they are expected to come from the function's template header.
__device__ RandomAccessIterator3 merge(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &exec,
                                       RandomAccessIterator1 first1, RandomAccessIterator1 last1,
                                       RandomAccessIterator2 first2, RandomAccessIterator2 last2,
                                       RandomAccessIterator3 result,
                                       Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  typedef typename thrust::iterator_value<RandomAccessIterator3>::type value_type;

  // NOTE(review): the buffer is sized with exec.grainsize() while chunk_size
  // below uses exec.this_exec.grainsize(); confirm the two agree, otherwise
  // the buffer and the per-call element bound describe different quantities.
  value_type *buffer = reinterpret_cast<value_type*>(bulk::malloc(exec, exec.size() * exec.grainsize() * sizeof(value_type)));

  // upper bound on the number of input elements consumed per bounded merge
  const size_type chunk_size = exec.size() * exec.this_exec.grainsize();

  size_type n1 = last1 - first1;
  size_type n2 = last2 - first2;

  // avoid the search & loop when possible
  if(n1 + n2 <= chunk_size)
  {
    result = detail::merge_detail::bounded_merge_with_buffer(exec, first1, last1, first2, last2, buffer, result, comp);
  } // end if
  else
  {
    while((first1 < last1) || (first2 < last2))
    {
      // refresh the remaining sizes (the outer variables are reused rather
      // than shadowed by loop-local copies)
      n1 = last1 - first1;
      n2 = last2 - first2;

      // number of input elements consumed by this slice
      const size_type diag = thrust::min<size_type>(chunk_size, n1 + n2);

      // mp elements of the slice come from the first range ...
      const size_type mp = bulk::merge_path(first1, n1, first2, n2, diag, comp);

      // ... and the rest from the second. Compute the count first: forming
      // (first2 + diag) - mp could step first2 beyond last2 whenever diag
      // exceeds n2, which is undefined for raw pointers.
      const size_type from_second = diag - mp;

      result = detail::merge_detail::bounded_merge_with_buffer(exec,
                                                               first1, first1 + mp,
                                                               first2, first2 + from_second,
                                                               buffer,
                                                               result,
                                                               comp);

      first1 += mp;
      first2 += from_second;
    } // end while
  } // end else

  bulk::free(exec, buffer);

  return result;
} // end merge()
// Conditional scatter performed by the group g: for every position idx in
// [0, last - first) for which stencil[idx] is true, assigns first[idx] to
// result[map[idx]].
//
// The input is processed in tiles of chunk_size = g.size() * grainsize
// elements. Within a tile, the calling agent (index tid) visits positions
// tid, tid + g.size(), tid + 2 * g.size(), ... -- grainsize positions in all.
// Every path ends in g.wait() (presumably a group-wide barrier -- confirm
// against concurrent_group).
//
// groupsize, grainsize and the iterator types are not declared in this block;
// they are expected to come from the function's template header.
__device__ void scatter_if(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
                           RandomAccessIterator1 first, RandomAccessIterator1 last,
                           RandomAccessIterator2 map,
                           RandomAccessIterator3 stencil,
                           RandomAccessIterator4 result)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  const size_type chunk_size = g.size() * grainsize;

  const size_type n = last - first;

  const size_type tid = g.this_exec.index();

  // important special case which avoids the expensive loop below
  if(chunk_size == n)
  {
    // exactly one full tile: no bounds checks needed
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type idx = g.size() * i + tid;

      if(stencil[idx])
      {
        result[map[idx]] = first[idx];
      } // end if
    } // end for
  } // end if
  else if(n < chunk_size)
  {
    // a single partial tile: guard every access against the element count
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type idx = g.size() * i + tid;

      if(idx < n && stencil[idx])
      {
        result[map[idx]] = first[idx];
      } // end if
    } // end for
  } // end else if
  else
  {
    // general case: a run of full tiles, optionally followed by one partial
    // tile. A reversed range (last before first) fails the loop test and does
    // nothing.
    while(first < last)
    {
      const size_type remaining = last - first;

      if(remaining >= chunk_size)
      {
        // avoid conditional accesses when possible
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = g.size() * i + tid;

          if(stencil[idx])
          {
            result[map[idx]] = first[idx];
          } // end if
        } // end for

        // a full tile was consumed, so advancing stays within [first, last]
        first   += chunk_size;
        map     += chunk_size;
        stencil += chunk_size;
      } // end if
      else
      {
        for(size_type i = 0; i < grainsize; ++i)
        {
          size_type idx = g.size() * i + tid;

          if(idx < remaining && stencil[idx])
          {
            result[map[idx]] = first[idx];
          } // end if
        } // end for

        // this was the final tile; stop here instead of advancing the
        // iterators by a whole chunk, which would move them beyond last
        break;
      } // end else
    } // end while
  } // end else

  g.wait();
} // end scatter_if