KOKKOS_INLINE_FUNCTION
 double shfl_down(const double &val, const int& delta, const int& width) {
   int lo = __double2loint(val);
   int hi = __double2hiint(val);
   lo = __shfl_down(lo,delta,width);
   hi = __shfl_down(hi,delta,width);
   return __hiloint2double(hi,lo);
 }
 KOKKOS_INLINE_FUNCTION
 Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
   int lo = __double2loint(*reinterpret_cast<const double*>(&val));
   int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
   lo = __shfl_down(lo,delta,width);
   hi = __shfl_down(hi,delta,width);
   const double tmp = __hiloint2double(hi,lo);
   return *(reinterpret_cast<const Scalar*>(&tmp));
 }
示例#3
0
__device__ double shfl_down(double val, uint delta, int width = warpSize)
{
    int lo = __double2loint(val);
    int hi = __double2hiint(val);

    lo = __shfl_down(lo, delta, width);
    hi = __shfl_down(hi, delta, width);

    return __hiloint2double(hi, lo);
}
示例#4
0
    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        int lo = __double2loint(val);
        int hi = __double2hiint(val);

        lo = __shfl_down(lo, delta, width);
        hi = __shfl_down(hi, delta, width);

        return __hiloint2double(hi, lo);
    #else
        return 0.0;
    #endif
    }
__device__ T shfl_down(T a, int b, int c=WARP_SIZE) {
#if __CUDA_ARCH__ >= 300
    return __shfl_down(a, b, c);
#else
    return 0;
#endif
}
KOKKOS_INLINE_FUNCTION
double shfl_down<double>(const double &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
  #if (__CUDA_ARCH__ >= 300)
    int lo = __double2loint(val);
    int hi = __double2hiint(val);
    lo = __shfl_down(lo,delta,width);
    hi = __shfl_down(hi,delta,width);
    return __hiloint2double(hi,lo);
  #else
    return val;
  #endif
#else
  return val;
#endif
}
 KOKKOS_INLINE_FUNCTION
 Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
   Scalar tmp1 = val;
   float tmp = *reinterpret_cast<float*>(&tmp1);
   tmp = __shfl_down(tmp,delta,width);
   return *reinterpret_cast<Scalar*>(&tmp);
 }
KOKKOS_INLINE_FUNCTION
unsigned int shfl_down(
    const unsigned int &val, const int& delta, const int& width) {
    unsigned int tmp1 = val;
    int tmp = *reinterpret_cast<int*>(&tmp1);
    tmp = __shfl_down(tmp,delta,width);
    return *reinterpret_cast<unsigned int*>(&tmp);
}
示例#9
0
 __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
 {
 #if __CUDA_ARCH__ >= 300
     return __shfl_down(val, delta, width);
 #else
     return T();
 #endif
 }
示例#10
0
 __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
 {
 #if __CUDA_ARCH__ >= 300
     return (unsigned int) __shfl_down((int) val, delta, width);
 #else
     return 0;
 #endif
 }
KOKKOS_INLINE_FUNCTION
unsigned long shfl_down<unsigned long>(const unsigned long &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
  #if (__CUDA_ARCH__ >= 300)
    int lo = __double2loint(*reinterpret_cast<const double*>(&val));
    int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
    lo = __shfl_down(lo,delta,width);
    hi = __shfl_down(hi,delta,width);
    const double tmp = __hiloint2double(hi,lo);
    return *(reinterpret_cast<const unsigned long*>(&tmp));
  #else
    return val;
  #endif
#else
  return val;
#endif
}
示例#12
0
__device__ __forceinline__
unsigned warpReduceSum(unsigned val)
{
    for (int offset = warpSize/2; offset > 0; offset /= 2) {
        val += __shfl_down(val, offset);
     }

    return val;
}
    KOKKOS_INLINE_FUNCTION
    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
      Impl::shfl_union<Scalar> s_val;
      Impl::shfl_union<Scalar> r_val;
      s_val = val;

      for(int i = 0; i<s_val.n; i++)
        r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
      return r_val.value();
    }
KOKKOS_INLINE_FUNCTION
float shfl_down<float>(const float &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
  #if (__CUDA_ARCH__ >= 300)
    return __shfl_down(val,delta,width);
  #else
    return val;
  #endif
#else
  return val;
#endif
}
KOKKOS_INLINE_FUNCTION
unsigned int shfl_down<unsigned int>(const unsigned int &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
  #if (__CUDA_ARCH__ >= 300)
    unsigned int tmp1 = val;
    int tmp = *reinterpret_cast<int*>(&tmp1);
    tmp = __shfl_down(tmp,delta,width);
    return *reinterpret_cast<unsigned int*>(&tmp);
  #else
    return val;
  #endif
#else
  return val;
#endif
}
示例#16
0
	__device__ void update(
		uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask)
	{
		const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;

		const auto dp0 = dp[0];
		uint32_t lazy_out = 0, local_min = 0;
		{
			const unsigned int k = 0;
#if CUDA_VERSION >= 9000
			const uint32_t prev =
				__shfl_up_sync(mask, dp[DP_BLOCK_SIZE - 1], 1);
#else
			const uint32_t prev = __shfl_up(dp[DP_BLOCK_SIZE - 1], 1);
#endif
			uint32_t out = min(dp[k] - last_min, p2);
			if(lane_id != 0){ out = min(out, prev - last_min + p1); }
			out = min(out, dp[k + 1] - last_min + p1);
			lazy_out = local_min = out + local_costs[k];
		}
		for(unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k){
			uint32_t out = min(dp[k] - last_min, p2);
			out = min(out, dp[k - 1] - last_min + p1);
			out = min(out, dp[k + 1] - last_min + p1);
			dp[k - 1] = lazy_out;
			lazy_out = out + local_costs[k];
			local_min = min(local_min, lazy_out);
		}
		{
			const unsigned int k = DP_BLOCK_SIZE - 1;
#if CUDA_VERSION >= 9000
			const uint32_t next = __shfl_down_sync(mask, dp0, 1);
#else
			const uint32_t next = __shfl_down(dp0, 1);
#endif
			uint32_t out = min(dp[k] - last_min, p2);
			out = min(out, dp[k - 1] - last_min + p1);
			if(lane_id + 1 != SUBGROUP_SIZE){
				out = min(out, next - last_min + p1);
			}
			dp[k - 1] = lazy_out;
			dp[k] = out + local_costs[k];
			local_min = min(local_min, dp[k]);
		}
		last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
	}
示例#17
0
__device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize)
{
    return __shfl_down(val, delta, width);
}
 KOKKOS_INLINE_FUNCTION
 float shfl_down(const float &val, const int& delta, const int& width) {
   return __shfl_down(val,delta,width);
 }
示例#19
0
__device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warpSize)
{
    return (uint) __shfl_down((int) val, delta, width);
}
示例#20
0
__device__ __forceinline__ schar shfl_down(schar val, uint delta, int width = warpSize)
{
    return (schar) __shfl_down((int) val, delta, width);
}