// Warp-vote kernel: every thread runs a ballot on the predicate
// (global thread id - 245) and atomically adds the number of set bits in the
// ballot result to the counter slot belonging to its warp.
//
//   lp                  - HIP launch parameter object; not used in the body.
//   device_ballot       - counter array, indexed by
//                         blockIdx * Num_Warps_per_Block + warp number in block.
//   Num_Warps_per_Block - number of counter slots reserved per block.
//   pshift              - right-shift applied to the thread index to obtain the
//                         warp number (presumably log2 of the warp size —
//                         confirm against the caller).
//
// Note that the atomic add is issued by every thread, not by one lane per
// warp, so each slot receives one contribution per participating thread.
__global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block, int pshift)
{
   const int global_tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
   const unsigned int warp_in_block = hipThreadIdx_x >> pshift;

   // Counter slot shared by all threads of this warp.
   unsigned int* const slot = &device_ballot[hipBlockIdx_x * Num_Warps_per_Block + warp_in_block];

   // Zero only for the thread whose global id is 245; non-zero for all others.
   const int predicate = global_tid - 245;

   // The HCC path counts bits with the 64-bit popcount, the other path with
   // the 32-bit one (presumably matching the width of the ballot mask on each
   // platform — confirm).
#ifdef __HIP_PLATFORM_HCC__
   const unsigned int votes = __popcll(__ballot(predicate));
#else
   const unsigned int votes = __popc(__ballot(predicate));
#endif

   atomicAdd(slot, votes);
}
Beispiel #2
0
 // One reduction step: adds to mySum (declared outside this block) the number
 // of bit positions in which val1 and val2 differ.
 __device__ __forceinline__ void reduceIter(int val1, int val2)
 {
     // XOR leaves a 1 exactly where the two operands disagree.
     const int differing_bits = val1 ^ val2;
     mySum += __popc(differing_bits);
 }
 // Returns, converted to To, the number of bit positions in which the two
 // uchar operands differ (population count of their XOR).
 __device__ To operator()(uchar v1, uchar v2)
 {
     const To bit_distance = __popc(v1 ^ v2);
     return bit_distance;
 }
 // Returns, converted to To, the number of bit positions in which the two
 // ushort operands differ (population count of their XOR).
 __device__ To operator()(ushort v1, ushort v2)
 {
     const To bit_distance = __popc(v1 ^ v2);
     return bit_distance;
 }
 // Returns, converted to To, the population count of the XOR of the two
 // uintl operands.
 // NOTE(review): uintl is not declared in the visible code. __popc counts the
 // bits of a 32-bit operand, so if uintl is wider than 32 bits the upper bits
 // of the XOR would not be counted — confirm the typedef.
 __device__ To operator()(uintl v1, uintl v2)
 {
     const To bit_distance = __popc(v1 ^ v2);
     return bit_distance;
 }