SIMD_INLINE uint32x4_t SquaredDifferenceSumMasked(const uint8x16_t & a, const uint8x16_t & b, const uint8x16_t & mask)
{
    // Mask the absolute differences, square them (u8 -> u16 via Square),
    // then widen the squares pairwise into u32 partial sums.
    uint8x16_t ad = vandq_u8(vabdq_u8(a, b), mask);
    uint16x8_t lo = Square(vget_low_u8(ad));
    uint16x8_t hi = Square(vget_high_u8(ad));
    return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi));
}
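A minimal usage sketch, not part of the original source: it assumes Square is the widening u8 square vmull_u8(value, value), consistent with how the function above consumes its result, and it cross-checks the folded vector sum against a scalar loop.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static inline uint16x8_t Square(uint8x8_t value) { return vmull_u8(value, value); }

static uint32x4_t SquaredDifferenceSumMaskedSketch(uint8x16_t a, uint8x16_t b, uint8x16_t mask)
{
    uint8x16_t ad = vandq_u8(vabdq_u8(a, b), mask);
    uint16x8_t lo = Square(vget_low_u8(ad));
    uint16x8_t hi = Square(vget_high_u8(ad));
    return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi));
}

int main(void)
{
    uint8_t a[16], b[16], m[16];
    uint32_t expected = 0;
    for (int i = 0; i < 16; ++i) {
        a[i] = (uint8_t)(i * 7);
        b[i] = (uint8_t)(i * 3 + 1);
        m[i] = (i % 2) ? 0xFF : 0x00;  // keep odd lanes only
        int d = (a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]) & m[i];
        expected += (uint32_t)(d * d);
    }
    uint32x4_t s = SquaredDifferenceSumMaskedSketch(vld1q_u8(a), vld1q_u8(b), vld1q_u8(m));
    // Fold the four u32 partial sums to one scalar.
    uint32_t got = vgetq_lane_u32(s, 0) + vgetq_lane_u32(s, 1) +
                   vgetq_lane_u32(s, 2) + vgetq_lane_u32(s, 3);
    printf("simd=%u scalar=%u\n", got, expected);
    return got != expected;
}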
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  // Widen pairwise u16 -> u32 -> u64, then add the two u64 halves; the sum
  // of eight u16 lanes (at most 8 * 0xFFFF) always fits in 32 bits.
  const uint32x4_t a = vpaddlq_u16(v_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
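A hedged usage sketch; the reduction restates the body of horizontal_add_u16x8 above so the example compiles on its own.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint16_t buf[8] = { 1, 2, 3, 4, 60000, 5, 6, 7 };
  const uint16x8_t v = vld1q_u16(buf);
  // Same reduction as horizontal_add_u16x8 above.
  const uint32x4_t a = vpaddlq_u16(v);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  printf("sum=%u\n", vget_lane_u32(c, 0));  // expect 60028
  return 0;
}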
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);       // diff_y
    const int16x8_t F = vaddq_s16(C, D);       // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}
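A scalar reference sketch of what the kernel above computes per element, under the assumption that MAX_Y is the maximum representable luma value and clip_y clamps to [0, MAX_Y] (both defined elsewhere in the original source); useful when validating the NEON path.

#include <stdint.h>
#include <stdio.h>

static uint64_t SharpYUVUpdateY_C_sketch(const uint16_t* ref, const uint16_t* src,
                                         uint16_t* dst, int len, int max_y) {
  uint64_t diff = 0;
  int i;
  for (i = 0; i < len; ++i) {
    const int diff_y = ref[i] - src[i];   /* correction term */
    int new_y = (int)dst[i] + diff_y;     /* apply it to dst */
    if (new_y < 0) new_y = 0;             /* clip_y equivalent */
    if (new_y > max_y) new_y = max_y;
    dst[i] = (uint16_t)new_y;
    diff += (uint64_t)(diff_y < 0 ? -diff_y : diff_y);
  }
  return diff;                            /* sum of |diff_y| */
}

int main(void) {
  uint16_t ref[4] = { 100, 200, 300, 400 };
  uint16_t src[4] = { 90, 250, 300, 500 };
  uint16_t dst[4] = { 10, 20, 30, 40 };
  printf("diff=%llu\n",                   /* expect 10 + 50 + 0 + 100 = 160 */
         (unsigned long long)SharpYUVUpdateY_C_sketch(ref, src, dst, 4, 1023));
  return 0;
}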
inline int v_signmask(const v_uint16x8& a)
{
    // Per-lane shift amounts 0..3, repeated for both halves: after moving
    // each sign bit down to bit 0, vshlq places lane i's bit at bit (i % 4).
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    // Two widening pairwise adds leave a 4-bit mask in each u64 half.
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
inline int v_signmask(const v_uint8x16& a)
{
    // Per-lane shift amounts 0..7, repeated for both 8-byte halves.
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    // Three widening pairwise adds leave an 8-bit mask in each u64 half.
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
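Both v_signmask overloads use the same trick: move each lane's sign bit down to bit 0, shift it left to its lane index with a per-lane shift vector, then collapse with widening pairwise adds. A standalone sketch of the 8-bit variant on a raw uint8x16_t (signmask_u8x16 is a hypothetical name; the wrapper types and CV_BIG_UINT above belong to the host library):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static int signmask_u8x16(uint8x16_t a)
{
    const int8x8_t m0 = vcreate_s8(0x0706050403020100ULL);  // shifts 0..7
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}

int main(void)
{
    uint8_t buf[16] = { 0x80, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0x80 };
    // MSB set in bytes 0 and 15 -> expected mask 0x8001.
    printf("mask=0x%x\n", signmask_u8x16(vld1q_u8(buf)));
    return 0;
}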
uint32x2_t FORCE_INLINE popcnt_neon_qreg(const uint8x16_t reg) {
    const uint8x16_t pcnt = vcntq_u8(reg);   // per-byte popcount
    const uint16x8_t t0 = vpaddlq_u8(pcnt);  // widen pairwise u8 -> u16
    const uint32x4_t t1 = vpaddlq_u16(t0);   // u16 -> u32
    const uint32x2_t t2 = vadd_u32(vget_low_u32(t1), vget_high_u32(t1));
    return t2;                               // two u32 partial sums
}
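A hedged usage sketch (FORCE_INLINE is assumed to be a force-inline macro defined elsewhere, so the kernel body is restated locally to keep the sketch self-contained): count the set bits of a 16-byte buffer and fold the two u32 partial sums with vpadd_u32.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static uint32x2_t popcnt_neon_qreg_sketch(const uint8x16_t reg) {
    const uint8x16_t pcnt = vcntq_u8(reg);
    const uint16x8_t t0 = vpaddlq_u8(pcnt);
    const uint32x4_t t1 = vpaddlq_u16(t0);
    return vadd_u32(vget_low_u32(t1), vget_high_u32(t1));
}

int main(void) {
    uint8_t buf[16] = { 0xFF, 0x01, 0x03, 0x07, 0, 0, 0, 0,
                        0x80, 0xAA, 0, 0, 0, 0, 0, 0xF0 };
    uint32x2_t two = popcnt_neon_qreg_sketch(vld1q_u8(buf));
    uint32_t total = vget_lane_u32(vpadd_u32(two, two), 0);
    printf("popcount=%u\n", total);  /* expect 8+1+2+3+1+4+4 = 23 */
    return 0;
}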
inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
{
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
        uint32x4_t bits = vmovq_n_u32(0);
        for (size_t i = 0; i < size; i += 16) {
            uint8x16_t A_vec = vld1q_u8(a + i);
            uint8x16_t B_vec = vld1q_u8(b + i);
            uint8x16_t AxorB = veorq_u8(A_vec, B_vec);
            uint8x16_t bitsSet = vcntq_u8(AxorB);      // per-byte popcount of the XOR
            uint16x8_t bitSet8 = vpaddlq_u8(bitsSet);  // widen pairwise u8 -> u16
            uint32x4_t bitSet4 = vpaddlq_u16(bitSet8); // u16 -> u32
            bits = vaddq_u32(bits, bitSet4);
        }
        uint64x2_t bitSet2 = vpaddlq_u32(bits);
        // The totals fit in 32 bits, so read the low word of each u64 lane
        // (s32 lanes 0 and 2) and add them.
        result = vgetq_lane_s32(vreinterpretq_s32_u64(bitSet2), 0);
        result += vgetq_lane_s32(vreinterpretq_s32_u64(bitSet2), 2);
    }
#else
    {
        // For portability just use unsigned long long -- and use
        // __builtin_popcountll (see docs for __builtin_popcountll).
        typedef unsigned long long pop_t;
        const size_t modulo = size % sizeof(pop_t);
        const pop_t* a2 = reinterpret_cast<const pop_t*>(a);
        const pop_t* b2 = reinterpret_cast<const pop_t*>(b);
        const pop_t* a2_end = a2 + (size / sizeof(pop_t));

        for (; a2 != a2_end; ++a2, ++b2)
            result += __builtin_popcountll((*a2) ^ (*b2));

        if (modulo) {
            // In the case where size is not divisible by sizeof(pop_t),
            // mask off the bits at the end.
            pop_t a_final = 0, b_final = 0;
            memcpy(&a_final, a2, modulo);
            memcpy(&b_final, b2, modulo);
            result += __builtin_popcountll(a_final ^ b_final);
        }
    }
#endif // NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    if (size % 64 == 0) {
        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
        const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
        size /= (sizeof(uint64_t) / sizeof(unsigned char));
        for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
            result += popcnt64(*pa ^ *pb);
        }
    }
    else {
        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
        const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
        size /= (sizeof(uint32_t) / sizeof(unsigned char));
        for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
            result += popcnt32(*pa ^ *pb);
        }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    size /= (sizeof(uint32_t) / sizeof(unsigned char));
    for (size_t i = 0; i < size; ++i, ++pa, ++pb) {
        result += popcnt32(*pa ^ *pb);
    }
#endif
    return result;
}
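A hedged, standalone sketch of the portable __builtin_popcountll branch of the functor above, with the template machinery elided; hamming_sketch and the fixed 32-byte descriptor size are illustration only.

#include <cstring>
#include <cstdio>

static unsigned hamming_sketch(const unsigned char* a, const unsigned char* b, size_t size)
{
    typedef unsigned long long pop_t;
    unsigned result = 0;
    const size_t modulo = size % sizeof(pop_t);
    const pop_t* a2 = reinterpret_cast<const pop_t*>(a);
    const pop_t* b2 = reinterpret_cast<const pop_t*>(b);
    const pop_t* a2_end = a2 + (size / sizeof(pop_t));
    for (; a2 != a2_end; ++a2, ++b2)
        result += __builtin_popcountll(*a2 ^ *b2);
    if (modulo) {
        // Tail bytes: copy into zeroed words so stray bits don't count.
        pop_t a_final = 0, b_final = 0;
        std::memcpy(&a_final, a2, modulo);
        std::memcpy(&b_final, b2, modulo);
        result += __builtin_popcountll(a_final ^ b_final);
    }
    return result;
}

int main()
{
    unsigned char d0[32], d1[32];
    std::memset(d0, 0x00, sizeof(d0));
    std::memset(d1, 0x0F, sizeof(d1));  // 4 differing bits per byte
    std::printf("distance=%u\n", hamming_sketch(d0, d1, 32));  // expect 128
    return 0;
}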
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        uint64x2_t ws = vmovq_n_u64(0);

        while (i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t s1 = vmovq_n_u32(0);
            uint32x4_t s2 = vmovq_n_u32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                uint8x16_t vs1 = vld1q_u8(src0 + i);
                uint8x16_t vs2 = vld1q_u8(src1 + i);

                uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
                uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));

                s1 = vpadalq_u16(s1, vdot1);
                s2 = vpadalq_u16(s2, vdot2);
            }

            ws = vpadalq_u32(ws, s1);
            ws = vpadalq_u32(ws, s2);
        }

        if (i + 8 <= size.width)
        {
            uint8x8_t vs1 = vld1_u8(src0 + i);
            uint8x8_t vs2 = vld1_u8(src1 + i);

            ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
            i += 8;
        }

        result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);

        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
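A standalone, hedged sketch of the core accumulation pattern above (vmull_u8 products, widened into u32 partials via vpadalq_u16/vpaddlq_u16, then into a u64 accumulator via vpadalq_u32) on a flat buffer, without the blocking or prefetch logic; dot_u8_sketch is a hypothetical name and the carotene helpers are not needed here. Because each 16-element batch is flushed to the u64 accumulator immediately, this variant needs no overflow blocking.

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

static double dot_u8_sketch(const uint8_t* a, const uint8_t* b, size_t n)
{
    uint64x2_t ws = vmovq_n_u64(0);
    size_t i = 0;
    for (; i + 16 <= n; i += 16) {
        uint8x16_t va = vld1q_u8(a + i);
        uint8x16_t vb = vld1q_u8(b + i);
        uint16x8_t lo = vmull_u8(vget_low_u8(va), vget_low_u8(vb));
        uint16x8_t hi = vmull_u8(vget_high_u8(va), vget_high_u8(vb));
        uint32x4_t s = vpadalq_u16(vpaddlq_u16(lo), hi);  // 16 u16 products -> u32
        ws = vpadalq_u32(ws, s);                          // u32 partials -> u64
    }
    uint64_t acc = vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);
    double result = (double)acc;
    for (; i < n; ++i)                                    // scalar tail
        result += (int32_t)a[i] * (int32_t)b[i];
    return result;
}

int main()
{
    uint8_t a[40], b[40];
    double expected = 0.0;
    for (int i = 0; i < 40; ++i) {
        a[i] = (uint8_t)(i + 1);
        b[i] = (uint8_t)(255 - i);
        expected += a[i] * b[i];
    }
    std::printf("simd=%.0f scalar=%.0f\n", dot_u8_sketch(a, b, 40), expected);
    return 0;
}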