KOKKOS_INLINE_FUNCTION double shfl_up(const double &val, const int& delta, const int& width ) { int lo = __double2loint(val); int hi = __double2hiint(val); lo = __shfl_up(lo,delta,width); hi = __shfl_up(hi,delta,width); return __hiloint2double(hi,lo); }
KOKKOS_INLINE_FUNCTION double shfl(const double &val, const int& srcLane, const int& width) { int lo = __double2loint(val); int hi = __double2hiint(val); lo = __shfl(lo,srcLane,width); hi = __shfl(hi,srcLane,width); return __hiloint2double(hi,lo); }
KOKKOS_INLINE_FUNCTION Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) { int lo = __double2loint(*reinterpret_cast<const double*>(&val)); int hi = __double2hiint(*reinterpret_cast<const double*>(&val)); lo = __shfl_up(lo,delta,width); hi = __shfl_up(hi,delta,width); const double tmp = __hiloint2double(hi,lo); return *(reinterpret_cast<const Scalar*>(&tmp)); }
__device__ double shfl_xor(double val, int laneMask, int width = warpSize) { int lo = __double2loint(val); int hi = __double2hiint(val); lo = __shfl_xor(lo, laneMask, width); hi = __shfl_xor(hi, laneMask, width); return __hiloint2double(hi, lo); }
__device__ double shfl_down(double val, uint delta, int width = warpSize) { int lo = __double2loint(val); int hi = __double2hiint(val); lo = __shfl_down(lo, delta, width); hi = __shfl_down(hi, delta, width); return __hiloint2double(hi, lo); }
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) { #if __CUDA_ARCH__ >= 300 int lo = __double2loint(val); int hi = __double2hiint(val); lo = __shfl_down(lo, delta, width); hi = __shfl_down(hi, delta, width); return __hiloint2double(hi, lo); #else return 0.0; #endif }
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) { #if __CUDA_ARCH__ >= 300 int lo = __double2loint(val); int hi = __double2hiint(val); lo = __shfl(lo, srcLane, width); hi = __shfl(hi, srcLane, width); return __hiloint2double(hi, lo); #else return 0.0; #endif }
KOKKOS_INLINE_FUNCTION double shfl_down<double>(const double &val, const int& delta, const int& width){ #ifdef __CUDA_ARCH__ #if (__CUDA_ARCH__ >= 300) int lo = __double2loint(val); int hi = __double2hiint(val); lo = __shfl_down(lo,delta,width); hi = __shfl_down(hi,delta,width); return __hiloint2double(hi,lo); #else return val; #endif #else return val; #endif }
KOKKOS_INLINE_FUNCTION unsigned long shfl_down<unsigned long>(const unsigned long &val, const int& delta, const int& width){ #ifdef __CUDA_ARCH__ #if (__CUDA_ARCH__ >= 300) int lo = __double2loint(*reinterpret_cast<const double*>(&val)); int hi = __double2hiint(*reinterpret_cast<const double*>(&val)); lo = __shfl_down(lo,delta,width); hi = __shfl_down(hi,delta,width); const double tmp = __hiloint2double(hi,lo); return *(reinterpret_cast<const unsigned long*>(&tmp)); #else return val; #endif #else return val; #endif }