// Atomic fetch-and-add fallback for types whose size is neither 4 nor 8
// bytes (no hardware atomic exists): serializes access to *dest through a
// per-address device-side lock table. Returns the value of *dest observed
// immediately before the add.
// NOTE(review): uses the legacy mask-less __ballot, which assumes pre-Volta
// warp semantics — confirm this path is compiled only for targets where
// that intrinsic is available.
__inline__ __device__ T atomic_fetch_add( volatile T * const dest ,
  typename ::Kokkos::Impl::enable_if< ( sizeof(T) != 4 ) && ( sizeof(T) != 8 ) , const T >::type& val )
{
  T return_val;
  // This is a way to (hopefully) avoid dead lock in a warp: instead of
  // spinning on the lock (which would hang lanes that share a warp with
  // the lock holder), every lane retries once per loop iteration until
  // the ballot of "done" lanes matches the ballot of participating lanes.
  int done = 0;
  unsigned int active = __ballot(1);   // mask of lanes that entered this function
  unsigned int done_active = 0;        // mask of lanes that completed their update
  while (active!=done_active) {
    if(!done) {
      // Try to take the sparse lock guarding this address; may fail when
      // another thread holds it, in which case we retry next iteration.
      bool locked = Impl::lock_address_cuda_space( (void*) dest );
      if( locked ) {
        return_val = *dest;            // read old value while holding the lock
        *dest = return_val + val;      // write updated value
        Impl::unlock_address_cuda_space( (void*) dest );
        done = 1;
      }
    }
    done_active = __ballot(done);      // re-vote: which lanes have finished?
  }
  return return_val;
}
// Atomic compare-and-exchange fallback for types whose size is neither 4
// nor 8 bytes: serializes access to *dest through a per-address device-side
// lock table. Stores val only when *dest equals compare; always returns the
// value of *dest observed while the lock was held (CAS convention).
// NOTE(review): uses the legacy mask-less __ballot, which assumes pre-Volta
// warp semantics — confirm this path is compiled only for targets where
// that intrinsic is available.
__inline__ __device__ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
  typename Kokkos::Impl::enable_if< ( sizeof(T) != 4 ) && ( sizeof(T) != 8 ) , const T >::type& val )
{
  T return_val;
  // This is a way to (hopefully) avoid dead lock in a warp: lanes that fail
  // to acquire the lock retry on the next loop iteration rather than
  // blocking, until the ballot of finished lanes equals the ballot of lanes
  // that entered the loop.
  int done = 0;
  unsigned int active = __ballot(1);   // mask of lanes that entered this function
  unsigned int done_active = 0;        // mask of lanes that completed their update
  while (active!=done_active) {
    if(!done) {
      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
        return_val = *dest;            // snapshot old value under the lock
        if( return_val == compare )
          *dest = val;                 // exchange only on match
        Impl::unlock_address_cuda_space( (void*) dest );
        done = 1;
      }
    }
    done_active = __ballot(done);      // re-vote: which lanes have finished?
  }
  return return_val;
}
// Kernel: every thread votes on the predicate (tid - 245), i.e. true for
// all global thread ids except 245. The population count of the resulting
// ballot mask is accumulated atomically into the slot belonging to this
// thread's warp/wavefront within its block.
// pshift is the log2 of the warp width used to derive the warp index.
__global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot,
                           int Num_Warps_per_Block, int pshift)
{
  const int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  const unsigned int warp_num = hipThreadIdx_x >> pshift;
  // Flat output slot: one counter per warp per block.
  const unsigned int slot = warp_num + hipBlockIdx_x * Num_Warps_per_Block;
#ifdef __HIP_PLATFORM_HCC__
  // HCC path: __ballot produces a 64-bit mask, counted with __popcll.
  atomicAdd(&device_ballot[slot], __popcll(__ballot(tid - 245)));
#else
  // CUDA path: __ballot produces a 32-bit mask, counted with __popc.
  atomicAdd(&device_ballot[slot], __popc(__ballot(tid - 245)));
#endif
}
// Warp-wide ballot: returns a mask with bit (tid & 31) set for every lane
// whose predicate is non-zero.
// On sm_20+ this is the hardware __ballot instruction and cta_buffer is
// unused; on older devices each lane publishes its bit via the caller-
// provided shared scratch buffer (one int per thread, indexed by
// threadIdx.x) and the bits are combined by warp_reduce.
// NOTE(review): the legacy mask-less __ballot was removed for sm_70+;
// callers targeting Volta or newer need __ballot_sync instead.
static __forceinline__ __device__ int ballot(int predicate, volatile int* cta_buffer)
{
// Guard with defined() as well (matching the sibling Ballot() helper) so the
// check does not depend on an undefined __CUDA_ARCH__ silently evaluating
// to 0 during host compilation.
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    (void)cta_buffer;                  // hardware vote needs no scratch space
    return __ballot(predicate);
#else
    int tid = threadIdx.x;
    // Each lane contributes exactly one bit of the 32-bit warp mask.
    cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
    return warp_reduce(cta_buffer);
#endif
}
// Warp-wide ballot with internally managed scratch storage: returns a mask
// with bit (threadIdx.x & 31) set for every lane whose predicate is
// non-zero. On sm_20+ this compiles to the hardware __ballot instruction;
// on older devices each lane deposits its bit into a block-shared buffer
// of CTA_SIZE ints and the bits are combined by warp_reduce.
static __forceinline__ __device__ int Ballot(int predicate)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    return __ballot(predicate);
#else
    __shared__ volatile int cta_buffer[CTA_SIZE];
    const int slot = threadIdx.x;
    // Contribute a single bit of the 32-bit warp mask, or nothing.
    int lane_bit = 0;
    if (predicate)
        lane_bit = 1 << (slot & 31);
    cta_buffer[slot] = lane_bit;
    return warp_reduce(cta_buffer);
#endif
}