/*
 * Warp-vote test kernel.
 *
 * Every thread votes with the predicate (tid - 245), which is non-zero for
 * every global thread id except 245. Each thread then counts the set bits of
 * the resulting ballot mask and atomically adds that count to the slot that
 * belongs to its warp:
 *     device_ballot[warp_num + hipBlockIdx_x * Num_Warps_per_Block]
 *
 * lp                   HIP launch parameter object (not read in the body).
 * device_ballot        Per-warp accumulators; updated with atomicAdd.
 *                      Presumably zero-initialised by the caller - confirm.
 * Num_Warps_per_Block  Number of accumulator slots reserved per block.
 * pshift               Right shift applied to hipThreadIdx_x to obtain the
 *                      warp index inside the block.
 */
__global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block, int pshift) {
    const int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    const unsigned int warp_num = hipThreadIdx_x >> pshift;

    // Accumulator slot owned by this thread's warp.
    unsigned int* const slot = &device_ballot[warp_num + hipBlockIdx_x * Num_Warps_per_Block];

    // Only the popcount flavour differs between the two platforms.
    // NOTE(review): the HCC path uses the 64-bit popcount, presumably because
    // the ballot mask is 64 bits wide there - confirm against the HIP docs.
#ifdef __HIP_PLATFORM_HCC__
    const unsigned int votes = __popcll(__ballot(tid - 245));
#else
    const unsigned int votes = __popc(__ballot(tid - 245));
#endif

    atomicAdd(slot, votes);
}
// Adds to mySum the number of bit positions in which val1 and val2 differ.
__device__ __forceinline__ void reduceIter(int val1, int val2) {
    const int differing = val1 ^ val2;  // set bits mark the mismatching positions
    mySum += __popc(differing);
}
// Returns, converted to To, the count of bit positions where v1 and v2 differ.
__device__ To operator()(uchar v1, uchar v2) {
    const int differing = v1 ^ v2;  // operands are promoted to int before the XOR
    return __popc(differing);
}
// Returns, converted to To, the count of bit positions where v1 and v2 differ.
__device__ To operator()(ushort v1, ushort v2) {
    const int differing = v1 ^ v2;  // operands are promoted to int before the XOR
    return __popc(differing);
}
// Returns, converted to To, the count of bit positions where v1 and v2 differ.
// NOTE(review): `uintl` is declared outside this view. __popc counts bits of a
// 32-bit operand, so if uintl is wider than 32 bits the upper bits would be
// dropped and __popcll would be needed instead - confirm the typedef.
__device__ To operator()(uintl v1, uintl v2) {
    const uintl differing = v1 ^ v2;  // set bits mark the mismatching positions
    return __popc(differing);
}