Ejemplo n.º 1
0
__inline__ __device__
T atomic_fetch_add( volatile T * const dest ,
    typename ::Kokkos::Impl::enable_if<
                  ( sizeof(T) != 4 )
               && ( sizeof(T) != 8 )
             , const T >::type& val )
{
  T return_val;
  // This is a way to (hopefully) avoid dead lock in a warp
  int done = 0;
  unsigned int active = __ballot(1);
  unsigned int done_active = 0;
  while (active!=done_active) {
    if(!done) {
      bool locked = Impl::lock_address_cuda_space( (void*) dest );
      if( locked ) {
        return_val = *dest;
        *dest = return_val + val;
        Impl::unlock_address_cuda_space( (void*) dest );
        done = 1;
      }
    }
    done_active = __ballot(done);
  }
  return return_val;
}
__inline__ __device__
T atomic_compare_exchange( volatile T * const dest , const T & compare ,
    typename Kokkos::Impl::enable_if<
                  ( sizeof(T) != 4 )
               && ( sizeof(T) != 8 )
             , const T >::type& val )
{
  T return_val;
  // This is a way to (hopefully) avoid dead lock in a warp
  int done = 0;
  unsigned int active = __ballot(1);
  unsigned int done_active = 0;
  while (active!=done_active) {
    if(!done) {
      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
        return_val = *dest;
        if( return_val == compare )
          *dest = val;
        Impl::unlock_address_cuda_space( (void*) dest );
        done = 1;
      }
    }
    done_active = __ballot(done);
  }
  return return_val;
}
__global__ void 
	gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
{

   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   const unsigned int warp_num = hipThreadIdx_x >> pshift;
#ifdef __HIP_PLATFORM_HCC__
   atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
#else
	atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
#endif
 
}
Ejemplo n.º 4
0
			static __forceinline__ __device__ int ballot(int predicate, volatile int* cta_buffer)
			{
#if __CUDA_ARCH__ >= 200
				(void)cta_buffer;
				return __ballot(predicate);
#else
				int tid = threadIdx.x;				
				cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
				return warp_reduce(cta_buffer);
#endif
			}          
Ejemplo n.º 5
0
        static __forceinline__ __device__ int Ballot(int predicate)
        {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            return __ballot(predicate);
#else
            __shared__ volatile int cta_buffer[CTA_SIZE];

            int tid = threadIdx.x;
            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
            return warp_reduce(cta_buffer);
#endif
        }