// Warp shuffle of a 64-bit double: split it into two 32-bit halves, shuffle
// each half, then reassemble.
KOKKOS_INLINE_FUNCTION
double shfl_down(const double& val, const int& delta, const int& width) {
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down(lo, delta, width);
  hi = __shfl_down(hi, delta, width);
  return __hiloint2double(hi, lo);
}

// Generic 8-byte shuffle: reinterpret the value as a double and move it as
// two 32-bit halves.
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar& val, const int& delta,
                 const typename Impl::enable_if< (sizeof(Scalar) == 8), int >::type& width) {
  int lo = __double2loint(*reinterpret_cast<const double*>(&val));
  int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
  lo = __shfl_down(lo, delta, width);
  hi = __shfl_down(hi, delta, width);
  const double tmp = __hiloint2double(hi, lo);
  return *(reinterpret_cast<const Scalar*>(&tmp));
}

__device__ double shfl_down(double val, uint delta, int width = warpSize) {
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down(lo, delta, width);
  hi = __shfl_down(hi, delta, width);
  return __hiloint2double(hi, lo);
}

__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) {
#if __CUDA_ARCH__ >= 300
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down(lo, delta, width);
  hi = __shfl_down(hi, delta, width);
  return __hiloint2double(hi, lo);
#else
  return 0.0;
#endif
}

template<typename T>
__device__ T shfl_down(T a, int b, int c = WARP_SIZE) {
#if __CUDA_ARCH__ >= 300
  return __shfl_down(a, b, c);
#else
  return 0;
#endif
}

// Explicit specialization for double; falls back to returning the input value
// when warp shuffles are unavailable (host pass or pre-Kepler architectures).
template<>
KOKKOS_INLINE_FUNCTION
double shfl_down<double>(const double& val, const int& delta, const int& width) {
#ifdef __CUDA_ARCH__
  #if (__CUDA_ARCH__ >= 300)
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down(lo, delta, width);
  hi = __shfl_down(hi, delta, width);
  return __hiloint2double(hi, lo);
  #else
  return val;
  #endif
#else
  return val;
#endif
}

// Generic 4-byte shuffle: move the value through a float.
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar& val, const int& delta,
                 const typename Impl::enable_if< (sizeof(Scalar) == 4), int >::type& width) {
  Scalar tmp1 = val;
  float tmp = *reinterpret_cast<float*>(&tmp1);
  tmp = __shfl_down(tmp, delta, width);
  return *reinterpret_cast<Scalar*>(&tmp);
}

KOKKOS_INLINE_FUNCTION
unsigned int shfl_down(const unsigned int& val, const int& delta, const int& width) {
  unsigned int tmp1 = val;
  int tmp = *reinterpret_cast<int*>(&tmp1);
  tmp = __shfl_down(tmp, delta, width);
  return *reinterpret_cast<unsigned int*>(&tmp);
}

template<typename T>
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize) {
#if __CUDA_ARCH__ >= 300
  return __shfl_down(val, delta, width);
#else
  return T();
#endif
}

__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize) {
#if __CUDA_ARCH__ >= 300
  return (unsigned int) __shfl_down((int) val, delta, width);
#else
  return 0;
#endif
}

// Explicit specialization for unsigned long; assumes an LP64 target where
// sizeof(unsigned long) == 8, so the value can be moved as a double.
template<>
KOKKOS_INLINE_FUNCTION
unsigned long shfl_down<unsigned long>(const unsigned long& val, const int& delta, const int& width) {
#ifdef __CUDA_ARCH__
  #if (__CUDA_ARCH__ >= 300)
  int lo = __double2loint(*reinterpret_cast<const double*>(&val));
  int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
  lo = __shfl_down(lo, delta, width);
  hi = __shfl_down(hi, delta, width);
  const double tmp = __hiloint2double(hi, lo);
  return *(reinterpret_cast<const unsigned long*>(&tmp));
  #else
  return val;
  #endif
#else
  return val;
#endif
}

// Warp-wide sum reduction: each iteration folds in the value held `offset`
// lanes away, halving the stride until lane 0 holds the total.
__device__ __forceinline__ unsigned warpReduceSum(unsigned val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset);
  }
  return val;
}

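// --- Hedged usage sketch (not part of the snippets above) ---
// Illustrates how a warp-shuffle reduction such as warpReduceSum is typically
// consumed: every warp reduces its lanes' partial sums, then lane 0 of each
// warp publishes its result with an atomicAdd. The kernel name and launch
// configuration are illustrative assumptions; blockDim.x is assumed to be a
// multiple of warpSize so every warp is full.
__global__ void sumKernel(const unsigned int* in, unsigned int* out, int n) {
  unsigned int partial = 0;
  // Grid-stride loop: each thread accumulates a private partial sum.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    partial += in[i];
  }
  partial = warpReduceSum(partial);            // warp-wide sum via __shfl_down
  if ((threadIdx.x & (warpSize - 1)) == 0) {   // one atomic per warp
    atomicAdd(out, partial);
  }
}
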
// Generic shuffle for types larger than 8 bytes: view the value as an array of
// 32-bit floats (via Impl::shfl_union) and shuffle each piece separately.
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar& val, const int& delta,
                 const typename Impl::enable_if< (sizeof(Scalar) > 8), int >::type& width) {
  Impl::shfl_union<Scalar> s_val;
  Impl::shfl_union<Scalar> r_val;
  s_val = val;
  for (int i = 0; i < s_val.n; i++)
    r_val.fval[i] = __shfl_down(s_val.fval[i], delta, width);
  return r_val.value();
}

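// --- Hedged sketch (assumption; not necessarily the real Impl::shfl_union) ---
// The snippet above relies on Impl::shfl_union, which is not shown in this
// listing. Conceptually it exposes a Scalar as n = sizeof(Scalar)/4 32-bit
// floats so each piece can be shuffled on its own; a minimal version could
// look like this (assumes sizeof(Scalar) is a multiple of 4):
namespace Impl {
template<typename Scalar>
struct shfl_union {
  enum { n = sizeof(Scalar) / 4 };  // number of 32-bit pieces
  float fval[n];

  KOKKOS_INLINE_FUNCTION void operator=(const Scalar& v) {
    memcpy(fval, &v, sizeof(Scalar));
  }
  KOKKOS_INLINE_FUNCTION Scalar value() const {
    Scalar result;
    memcpy(&result, fval, sizeof(Scalar));
    return result;
  }
};
} // namespace Impl
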
template<>
KOKKOS_INLINE_FUNCTION
float shfl_down<float>(const float& val, const int& delta, const int& width) {
#ifdef __CUDA_ARCH__
  #if (__CUDA_ARCH__ >= 300)
  return __shfl_down(val, delta, width);
  #else
  return val;
  #endif
#else
  return val;
#endif
}

template<>
KOKKOS_INLINE_FUNCTION
unsigned int shfl_down<unsigned int>(const unsigned int& val, const int& delta, const int& width) {
#ifdef __CUDA_ARCH__
  #if (__CUDA_ARCH__ >= 300)
  unsigned int tmp1 = val;
  int tmp = *reinterpret_cast<int*>(&tmp1);
  tmp = __shfl_down(tmp, delta, width);
  return *reinterpret_cast<unsigned int*>(&tmp);
  #else
  return val;
  #endif
#else
  return val;
#endif
}

// Dynamic-programming cost update over a block of DP_BLOCK_SIZE candidates per
// lane (dp[], last_min, DP_BLOCK_SIZE and SUBGROUP_SIZE are members of the
// enclosing struct; p1/p2 are transition penalties). Values owned by the
// neighbouring lanes are fetched with warp shuffles.
__device__ void update(uint32_t* local_costs, uint32_t p1, uint32_t p2, uint32_t mask) {
  const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
  const auto dp0 = dp[0];
  uint32_t lazy_out = 0, local_min = 0;
  {
    const unsigned int k = 0;
#if CUDA_VERSION >= 9000
    const uint32_t prev = __shfl_up_sync(mask, dp[DP_BLOCK_SIZE - 1], 1);
#else
    const uint32_t prev = __shfl_up(dp[DP_BLOCK_SIZE - 1], 1);
#endif
    uint32_t out = min(dp[k] - last_min, p2);
    if (lane_id != 0) {
      out = min(out, prev - last_min + p1);
    }
    out = min(out, dp[k + 1] - last_min + p1);
    lazy_out = local_min = out + local_costs[k];
  }
  for (unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k) {
    uint32_t out = min(dp[k] - last_min, p2);
    out = min(out, dp[k - 1] - last_min + p1);
    out = min(out, dp[k + 1] - last_min + p1);
    dp[k - 1] = lazy_out;
    lazy_out = out + local_costs[k];
    local_min = min(local_min, lazy_out);
  }
  {
    const unsigned int k = DP_BLOCK_SIZE - 1;
#if CUDA_VERSION >= 9000
    const uint32_t next = __shfl_down_sync(mask, dp0, 1);
#else
    const uint32_t next = __shfl_down(dp0, 1);
#endif
    uint32_t out = min(dp[k] - last_min, p2);
    out = min(out, dp[k - 1] - last_min + p1);
    if (lane_id + 1 != SUBGROUP_SIZE) {
      out = min(out, next - last_min + p1);
    }
    dp[k - 1] = lazy_out;
    dp[k] = out + local_costs[k];
    local_min = min(local_min, dp[k]);
  }
  last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
}

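// --- Hedged sketch (assumption; not the library's actual helper) ---
// subgroup_min<SUBGROUP_SIZE>(local_min, mask) is called above but not shown
// in this listing. A minimal butterfly reduction over a power-of-two subgroup,
// mirroring the CUDA_VERSION split used in update(), could look like:
template<unsigned int GROUP_SIZE>
__device__ inline uint32_t subgroup_min(uint32_t val, uint32_t mask) {
  for (unsigned int offset = GROUP_SIZE / 2; offset > 0; offset /= 2) {
#if CUDA_VERSION >= 9000
    val = min(val, __shfl_xor_sync(mask, val, offset));
#else
    val = min(val, __shfl_xor(val, offset));
#endif
  }
  return val;
}
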
__device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize) {
  return __shfl_down(val, delta, width);
}

KOKKOS_INLINE_FUNCTION
float shfl_down(const float& val, const int& delta, const int& width) {
  return __shfl_down(val, delta, width);
}

__device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warpSize) {
  return (uint) __shfl_down((int) val, delta, width);
}

__device__ __forceinline__ schar shfl_down(schar val, uint delta, int width = warpSize) {
  return (schar) __shfl_down((int) val, delta, width);
}

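// --- Hedged note / sketch (not from the snippets above) ---
// The hi/lo splitting via __double2loint/__hiloint2double used in several of
// the snippets exists because the legacy __shfl_down intrinsic only handles
// 32-bit int/float operands. Since CUDA 9 the *_sync variants accept double
// (and other 64-bit types) directly, so an equivalent helper can be written as
// the following sketch (helper name is illustrative; full-warp mask assumed):
__device__ __forceinline__ double shfl_down_double(double val, unsigned int delta,
                                                    int width = warpSize) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(0xffffffffu, val, delta, width);
#else
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down(lo, delta, width);
  hi = __shfl_down(hi, delta, width);
  return __hiloint2double(hi, lo);
#endif
}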