template <bool align> void SquaredDifferenceSum(
            const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
            size_t width, size_t height, uint64_t * sum)
        {
            assert(width < 0x10000);
            if (align)
                assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));

            size_t alignedWidth = Simd::AlignLo(width, A);
            uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth);

            uint64x2_t _sum = K64_0000000000000000;
            for (size_t row = 0; row < height; ++row)
            {
                uint32x4_t rowSum = K32_00000000;
                for (size_t col = 0; col < alignedWidth; col += A)
                {
                    uint8x16_t _a = Load<align>(a + col);
                    uint8x16_t _b = Load<align>(b + col);
                    rowSum = vaddq_u32(rowSum, SquaredDifferenceSum(_a, _b));
                }
                if (width - alignedWidth)
                {
                    // The tail block starts at an unaligned offset, so always load it unaligned.
                    uint8x16_t _a = Load<false>(a + width - A);
                    uint8x16_t _b = Load<false>(b + width - A);
                    rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, tailMask));
                }
                _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum));
                a += aStride;
                b += bStride;
            }
            *sum = ExtractSum64u(_sum);
        }
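The per-row 32-bit sums are widened with vpaddlq_u32 and accumulated into _sum before ExtractSum64u collapses the two 64-bit lanes into the scalar result. A minimal sketch of what such a reduction helper could look like (an assumption for illustration, not the library's verified source):

inline uint64_t ExtractSum64u(const uint64x2_t value)
{
    // Sum the two 64-bit lanes into a single scalar result.
    return vgetq_lane_u64(value, 0) + vgetq_lane_u64(value, 1);
}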
Example No. 2
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  const uint32x4_t a = vpaddlq_u16(v_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
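A minimal, self-contained usage sketch of the helper above (assumed input values, and assuming the function and its INLINE macro are visible in the same translation unit): the eight 16-bit lanes 1..8 reduce to 36.

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  const uint16_t data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  const uint16x8_t v = vld1q_u16(data);
  printf("%u\n", horizontal_add_u16x8(v));  /* prints 36 */
  return 0;
}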
Example No. 3
inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
Example No. 4
inline int v_signmask(const v_uint16x8& a)
{
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
Example No. 5
inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
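The three v_signmask overloads above all use the same trick: shift each lane's sign bit into a distinct position, then let the vpaddlq_* chain pair-sum the lanes so two vgetq_lane_u64 reads yield the packed mask. A raw-NEON sketch of the 32-bit case without the OpenCV wrapper types (an illustration, not OpenCV code):

#include <arm_neon.h>
#include <stdint.h>

static inline int signmask_u32x4(uint32x4_t a)
{
    // Per-lane shift amounts {0, 1, 0, 1}: each pair of sign bits lands in bits 0..1 of its half.
    const int32_t shifts[4] = { 0, 1, 0, 1 };
    uint32x4_t bits = vshlq_u32(vshrq_n_u32(a, 31), vld1q_s32(shifts));
    uint64x2_t pairs = vpaddlq_u32(bits);          // pairwise add: lanes 0+1 and 2+3
    return (int)vgetq_lane_u64(pairs, 0) + ((int)vgetq_lane_u64(pairs, 1) << 2);
}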
Example No. 6
void test_vpaddlQu32 (void)
{
  uint64x2_t out_uint64x2_t;
  uint32x4_t arg0_uint32x4_t;

  out_uint64x2_t = vpaddlq_u32 (arg0_uint32x4_t);
}
Example No. 7
  inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
  {
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
      uint32x4_t bits = vmovq_n_u32(0);
      for (size_t i = 0; i < size; i += 16) {
        uint8x16_t A_vec = vld1q_u8 (a + i);
        uint8x16_t B_vec = vld1q_u8 (b + i);
        uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
        uint8x16_t bitsSet = vcntq_u8 (AxorB);
        uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
        uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
        bits = vaddq_u32(bits, bitSet4);
      }
      uint64x2_t bitSet2 = vpaddlq_u32 (bits);
      result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
      result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
#else
    {
      // For portability just use unsigned long long and __builtin_popcountll (see the GCC docs for __builtin_popcountll).
      typedef unsigned long long pop_t;
      const size_t modulo = size % sizeof(pop_t);
      const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
      const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
      const pop_t* a2_end = a2 + (size / sizeof(pop_t));

      for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));

      if (modulo) {
        // When size is not divisible by sizeof(pop_t), copy the trailing
        // bytes into zero-padded words so only valid bits are counted.
        pop_t a_final = 0, b_final = 0;
        memcpy(&a_final, a2, modulo);
        memcpy(&b_final, b2, modulo);
        result += __builtin_popcountll(a_final ^ b_final);
      }
    }
#endif //NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    if(size%64 == 0)
    {
      const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
      const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
      size /= (sizeof(uint64_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt64(*pa ^ *pb);
      }
    }
    else
    {
      const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
      const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
      size /= (sizeof(uint32_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt32(*pa ^ *pb);
      }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    size /= (sizeof(uint32_t)/sizeof(unsigned char));
    for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
      result += popcnt32(*pa ^ *pb);
    }
#endif
    return result;
  }
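The NEON branch above XORs 16-byte blocks, counts set bits per byte with vcntq_u8, and folds the counts with the vpaddlq_u8/u16/u32 chain. A standalone sketch of that core for buffers whose size is a multiple of 16 bytes (assumed name and signature, not the original functor):

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static unsigned hamming_neon(const uint8_t* a, const uint8_t* b, size_t size)
{
    uint32x4_t bits = vmovq_n_u32(0);
    for (size_t i = 0; i < size; i += 16) {
        uint8x16_t axorb = veorq_u8(vld1q_u8(a + i), vld1q_u8(b + i));
        // vcntq_u8 counts bits per byte; vpaddlq_u8/u16 widen and pair-sum up to 32-bit lanes.
        bits = vaddq_u32(bits, vpaddlq_u16(vpaddlq_u8(vcntq_u8(axorb))));
    }
    uint64x2_t pair = vpaddlq_u32(bits);            // widen to two 64-bit partial sums
    return (unsigned)(vgetq_lane_u64(pair, 0) + vgetq_lane_u64(pair, 1));
}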