template<std::size_t bound, std::size_t groupsize, std::size_t grainsize,
         typename RandomAccessIterator,
         typename Compare>
__device__
typename thrust::detail::enable_if<
  (bound <= groupsize * grainsize)
>::type
inplace_merge(bulk::bounded<
                bound,
                bulk::concurrent_group<
                  bulk::agent<grainsize>,
                  groupsize
                >
              > &g,
              RandomAccessIterator first,
              RandomAccessIterator middle,
              RandomAccessIterator last,
              Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  size_type n1 = middle - first;
  size_type n2 = last - middle;

  // find the start of each local merge
  size_type local_offset = grainsize * g.this_exec.index();
  size_type mp = bulk::merge_path(first, n1, middle, n2, local_offset, comp);

  // do a local sequential merge
  size_type local_offset1 = mp;
  size_type local_offset2 = n1 + local_offset - mp;

  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
  value_type local_result[grainsize];
  bulk::merge(bulk::bound<grainsize>(g.this_exec),
              first + local_offset1, middle,
              first + local_offset2, last,
              local_result,
              comp);

  g.wait();

  // copy local result back to source
  // this is faster than getting the size from merge's result
  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n1 + n2 - local_offset));
  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_result, local_size, first + local_offset);

  g.wait();
} // end inplace_merge()
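// The partitioning above relies on bulk::merge_path. The sketch below is an
// illustrative reimplementation of the diagonal search a merge path performs
// (an assumption about its behavior, not Bulk's actual source): given sorted
// ranges of lengths n1 and n2 and an output rank `diag`, it returns how many
// elements of the first range precede output position `diag` in a stable merge.
template<typename RandomAccessIterator1, typename RandomAccessIterator2,
         typename Size, typename Compare>
__host__ __device__
Size merge_path_sketch(RandomAccessIterator1 first1, Size n1,
                       RandomAccessIterator2 first2, Size n2,
                       Size diag, Compare comp)
{
  Size begin = diag > n2 ? diag - n2 : Size(0);
  Size end   = diag < n1 ? diag : n1;

  // binary search along the cross-diagonal of the merge matrix
  while(begin < end)
  {
    Size mid = (begin + end) / 2;

    if(comp(first2[diag - 1 - mid], first1[mid]))
    {
      end = mid;
    } // end if
    else
    {
      begin = mid + 1;
    } // end else
  } // end while

  return begin;
} // end merge_path_sketch()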
template<std::size_t bound, std::size_t groupsize, std::size_t grainsize,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename RandomAccessIterator4>
__device__
typename thrust::detail::enable_if<
  (bound <= groupsize * grainsize)
>::type
scatter_if(bulk::bounded<
             bound,
             bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
           > &g,
           RandomAccessIterator1 first,
           RandomAccessIterator1 last,
           RandomAccessIterator2 map,
           RandomAccessIterator3 stencil,
           RandomAccessIterator4 result)
{
  typedef typename bulk::bounded<
    bound,
    bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
  >::size_type size_type;

  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::agent_type agent_type;

  size_type n = last - first;
  size_type tid = g.this_exec.index();

  // avoid branches when possible
  if(n == bound)
  {
    for(size_type i = 0; i < g.this_exec.grainsize(); ++i)
    {
      size_type idx = g.size() * i + tid;

      if(stencil[idx])
      {
        result[map[idx]] = first[idx];
      } // end if
    } // end for
  } // end if
  else if(n < bound)
  {
    for(size_type i = 0; i < g.this_exec.grainsize(); ++i)
    {
      size_type idx = g.size() * i + tid;

      if(idx < n && stencil[idx])
      {
        result[map[idx]] = first[idx];
      } // end if
    } // end for
  } // end if

  g.wait();
} // end scatter_if()
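// For reference, the group-wide scatter_if above computes the same result as
// this sequential loop (an illustrative sketch in the style of
// thrust::scatter_if, not part of Bulk); the group version simply strides the
// iteration space across agents:
template<typename InputIterator1, typename InputIterator2,
         typename InputIterator3, typename RandomAccessIterator>
__host__ __device__
void scatter_if_reference(InputIterator1 first, InputIterator1 last,
                          InputIterator2 map,
                          InputIterator3 stencil,
                          RandomAccessIterator result)
{
  for(; first != last; ++first, ++map, ++stencil)
  {
    if(*stencil)
    {
      result[*map] = *first;
    } // end if
  } // end for
} // end scatter_if_reference()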
template<std::size_t bound, std::size_t groupsize, std::size_t grainsize,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename Compare>
__device__
typename thrust::detail::enable_if<
  (bound <= groupsize * grainsize),
  RandomAccessIterator3
>::type
merge(bulk::bounded<
        bound,
        bulk::concurrent_group<
          bulk::agent<grainsize>,
          groupsize
        >
      > &g,
      RandomAccessIterator1 first1, RandomAccessIterator1 last1,
      RandomAccessIterator2 first2, RandomAccessIterator2 last2,
      RandomAccessIterator3 result,
      Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  size_type n1 = last1 - first1;
  size_type n2 = last2 - first2;

  // find the start of each local merge
  size_type local_offset = grainsize * g.this_exec.index();
  size_type mp = bulk::merge_path(first1, n1, first2, n2, local_offset, comp);

  // do a local sequential merge
  size_type local_offset1 = mp;
  size_type local_offset2 = local_offset - mp;

  typedef typename thrust::iterator_value<RandomAccessIterator3>::type value_type;
  value_type local_result[grainsize];
  bulk::merge(bulk::bound<grainsize>(g.this_exec),
              first1 + local_offset1, last1,
              first2 + local_offset2, last2,
              local_result,
              comp);

  // store local result
  // this is faster than getting the size from merge's result
  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n1 + n2 - local_offset));
  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_result, local_size, result + local_offset);

  g.wait();

  return result + thrust::min<size_type>(groupsize * grainsize, n1 + n2);
} // end merge()
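// Each agent's call to bulk::merge(bulk::bound<grainsize>(g.this_exec), ...)
// performs a bounded sequential merge. The sketch below shows the presumed
// per-agent step (an assumption for illustration, not Bulk's source): it
// emits at most `bound` outputs, which is why the callers above clamp
// local_size to grainsize instead of inspecting merge's return value.
template<int bound,
         typename RandomAccessIterator1, typename RandomAccessIterator2,
         typename RandomAccessIterator3, typename Compare>
__host__ __device__
RandomAccessIterator3 bounded_serial_merge(RandomAccessIterator1 first1, RandomAccessIterator1 last1,
                                           RandomAccessIterator2 first2, RandomAccessIterator2 last2,
                                           RandomAccessIterator3 result,
                                           Compare comp)
{
  for(int i = 0; i < bound; ++i, ++result)
  {
    if(first1 == last1)
    {
      if(first2 == last2) break;

      *result = *first2;
      ++first2;
    } // end if
    else if(first2 == last2 || !comp(*first2, *first1))
    {
      // take from the first range on ties to keep the merge stable
      *result = *first1;
      ++first1;
    } // end else if
    else
    {
      *result = *first2;
      ++first2;
    } // end else
  } // end for

  return result;
} // end bounded_serial_merge()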
template<std::size_t bound, std::size_t groupsize, std::size_t grainsize,
         typename RandomAccessIterator1,
         typename Size,
         typename RandomAccessIterator2>
__device__
typename thrust::detail::enable_if<
  (bound <= groupsize * grainsize),
  RandomAccessIterator2
>::type
copy_n(bulk::bounded<
         bound,
         concurrent_group<
           agent<grainsize>,
           groupsize
         >
       > &g,
       RandomAccessIterator1 first,
       Size n,
       RandomAccessIterator2 result)
{
  typedef bounded<
    bound,
    concurrent_group<
      agent<grainsize>,
      groupsize
    >
  > group_type;

  typedef typename group_type::size_type size_type;

  size_type tid = g.this_exec.index();

  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;

  // XXX make this an uninitialized array
  value_type stage[grainsize];

  // avoid conditional accesses when possible
  if(groupsize * grainsize <= n)
  {
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type src_idx = g.size() * i + tid;
      stage[i] = first[src_idx];
    } // end for i

    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type dst_idx = g.size() * i + tid;
      result[dst_idx] = stage[i];
    } // end for i
  } // end if
  else
  {
    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type src_idx = g.size() * i + tid;

      if(src_idx < n)
      {
        stage[i] = first[src_idx];
      } // end if
    } // end for

    for(size_type i = 0; i < grainsize; ++i)
    {
      size_type dst_idx = g.size() * i + tid;

      if(dst_idx < n)
      {
        result[dst_idx] = stage[i];
      } // end if
    } // end for
  } // end else

  g.wait();

  return result + thrust::min<Size>(g.size() * grainsize, n);
} // end copy_n()
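// The staging pattern above reads and writes in group-strided order so that
// consecutive agents touch consecutive addresses and accesses coalesce. The
// same idea in plain CUDA, for illustration only (assumes blockDim.x ==
// GROUPSIZE; not part of Bulk):
template<int GROUPSIZE, int GRAINSIZE, typename T>
__global__ void staged_copy_sketch(const T *first, int n, T *result)
{
  int tid = threadIdx.x;

  // per-thread register stage, like `stage` above
  T stage[GRAINSIZE];

  for(int i = 0; i < GRAINSIZE; ++i)
  {
    int src_idx = GROUPSIZE * i + tid;
    if(src_idx < n) stage[i] = first[src_idx];
  } // end for

  for(int i = 0; i < GRAINSIZE; ++i)
  {
    int dst_idx = GROUPSIZE * i + tid;
    if(dst_idx < n) result[dst_idx] = stage[i];
  } // end for
} // end staged_copy_sketch()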
template<std::size_t groupsize, std::size_t grainsize,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename RandomAccessIterator4,
         typename RandomAccessIterator5,
         typename RandomAccessIterator6,
         typename Compare>
__device__
thrust::pair<RandomAccessIterator5,RandomAccessIterator6>
merge_by_key(bulk::bounded<
               groupsize*grainsize,
               bulk::concurrent_group<bulk::agent<grainsize>, groupsize>
             > &g,
             RandomAccessIterator1 keys_first1, RandomAccessIterator1 keys_last1,
             RandomAccessIterator2 keys_first2, RandomAccessIterator2 keys_last2,
             RandomAccessIterator3 values_first1,
             RandomAccessIterator4 values_first2,
             RandomAccessIterator5 keys_result,
             RandomAccessIterator6 values_result,
             Compare comp)
{
  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;

  typedef typename thrust::iterator_value<RandomAccessIterator5>::type key_type;

#if __CUDA_ARCH__ >= 200
  union
  {
    key_type  *keys;
    size_type *indices;
  } stage;

  stage.keys = static_cast<key_type*>(bulk::malloc(g, groupsize * grainsize * thrust::max(sizeof(key_type), sizeof(size_type))));
#else
  __shared__ union
  {
    key_type  keys[groupsize * grainsize];
    size_type indices[groupsize * grainsize];
  } stage;
#endif

  size_type n1 = keys_last1 - keys_first1;
  size_type n2 = keys_last2 - keys_first2;
  size_type n = n1 + n2;

  // copy keys into the stage
  bulk::copy_n(g,
               thrust::detail::make_join_iterator(keys_first1, n1, keys_first2),
               n,
               stage.keys);

  // find the start of each agent's sequential merge
  size_type diag = thrust::min<size_type>(n1 + n2, grainsize * g.this_exec.index());
  size_type mp = bulk::merge_path(stage.keys, n1, stage.keys + n1, n2, diag, comp);

  // compute the ranges of the sources in the stage
  size_type start1 = mp;
  size_type start2 = n1 + diag - mp;

  size_type end1 = n1;
  size_type end2 = n1 + n2;

  // each agent merges sequentially
  key_type  results[grainsize];
  size_type indices[grainsize];
  bulk::merge_by_key(bulk::bound<grainsize>(g.this_exec),
                     stage.keys + start1, stage.keys + end1,
                     stage.keys + start2, stage.keys + end2,
                     thrust::make_counting_iterator<size_type>(start1),
                     thrust::make_counting_iterator<size_type>(start2),
                     results,
                     indices,
                     comp);
  g.wait();

  // each agent stores merged keys back to the stage
  size_type local_offset = grainsize * g.this_exec.index();
  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n - local_offset));
  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), results, local_size, stage.keys + local_offset);
  g.wait();

  // store merged keys to the result
  keys_result = bulk::copy_n(g, stage.keys, n, keys_result);

  // each agent copies the indices into the stage
  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), indices, local_size, stage.indices + local_offset);
  g.wait();

  // gather values into merged order
  values_result = bulk::gather(g,
                               stage.indices, stage.indices + n,
                               thrust::detail::make_join_iterator(values_first1, n1, values_first2),
                               values_result);

#if __CUDA_ARCH__ >= 200
  bulk::free(g, stage.keys);
#endif

  return thrust::make_pair(keys_result, values_result);
} // end merge_by_key()
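// merge_by_key above merges keys paired with counting-iterator indices and
// then gathers the values through those indices, rather than moving the
// values through the merge itself. A sequential sketch of that strategy
// (illustrative only, not Bulk's source):
template<typename Key, typename Value, typename Compare>
void merge_by_key_reference(const Key *keys1, int n1,
                            const Key *keys2, int n2,
                            const Value *values1, const Value *values2,
                            Key *keys_result, Value *values_result,
                            Compare comp)
{
  int i = 0;
  int j = 0;

  for(int k = 0; k < n1 + n2; ++k)
  {
    // stable merge: take from the first range on ties
    bool take_first = (j >= n2) || (i < n1 && !comp(keys2[j], keys1[i]));

    // the merged rank in the joined input plays the role of stage.indices
    if(take_first)
    {
      keys_result[k]   = keys1[i];
      values_result[k] = values1[i];  // gather through rank i
      ++i;
    } // end if
    else
    {
      keys_result[k]   = keys2[j];
      values_result[k] = values2[j];  // gather through rank n1 + j
      ++j;
    } // end else
  } // end for
} // end merge_by_key_reference()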