// Vectorized partial sum for 16-bit signed data: accumulates src0[] into
// dst[] (per-channel when cn is 2 or 4) and returns the number of pixels
// consumed. A non-null mask or an unsupported channel count disables the
// SIMD path entirely (returns 0 so the caller falls back to scalar code).
int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
{
    const bool simdApplicable = !mask && (cn == 1 || cn == 2 || cn == 4);
    if (!simdApplicable)
        return 0;

    len *= cn;  // work in scalar elements, not pixels
    v_int32 acc = vx_setzero_s32();
    int i = 0;

    // Main loop: widen each int16 vector into two int32 vectors, accumulate.
    for (; i <= len - v_int16::nlanes; i += v_int16::nlanes)
    {
        v_int32 lo, hi;
        v_expand(vx_load(src0 + i), lo, hi);
        acc += lo + hi;
    }
    // Half-width tail: one widening load if at least v_int32::nlanes remain.
    if (i <= len - v_int32::nlanes)
    {
        acc += vx_load_expand(src0 + i);
        i += v_int32::nlanes;
    }

    if (cn == 1)
    {
        *dst += v_reduce_sum(acc);
    }
    else
    {
        // Lane k of the accumulator belongs to channel k % cn.
        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) lanes[v_int32::nlanes];
        v_store_aligned(lanes, acc);
        for (int k = 0; k < v_int32::nlanes; ++k)
            dst[k % cn] += lanes[k];
    }
    v_cleanup();
    return i / cn;
}
// Checks v_reduce_min / v_reduce_max / v_reduce_sum on one vector.
// Data<R> evidently fills the lanes with the pattern 1..nlanes (the
// min/max/series-sum expectations below all assume exactly that).
TheTest & test_reduce()
{
    Data<R> dataA;
    R a = dataA;

    const LaneType expected_min = (LaneType)1;
    const LaneType expected_max = (LaneType)R::nlanes;
    // Arithmetic series 1 + 2 + ... + nlanes.
    const LaneType expected_sum = (LaneType)((1 + R::nlanes)*R::nlanes/2);

    EXPECT_EQ(expected_min, v_reduce_min(a));
    EXPECT_EQ(expected_max, v_reduce_max(a));
    EXPECT_EQ(expected_sum, v_reduce_sum(a));
    return *this;
}
// Verifies v_popcount + v_reduce_sum against a scalar reference.
// The input vector holds the lane pattern 1..nlanes (consistent with
// test_reduce's expectations on Data<R>), so the expected result is
// sum(popcount(i)) for i = 1..nlanes. The old fixed table only covered
// nlanes <= 16 and indexed out of bounds for wider types (e.g. 64-lane
// 8-bit vectors on 512-bit registers); computing the reference value
// directly works for any lane count.
TheTest & test_popcount()
{
    Data<R> dataA;
    R a = dataA;

    unsigned expected = 0;
    for (int i = 1; i <= (int)R::nlanes; ++i)
    {
        // Scalar popcount of the lane value i.
        unsigned v = (unsigned)i;
        for (; v != 0; v >>= 1)
            expected += v & 1;
    }

    unsigned resB = (unsigned)v_reduce_sum(v_popcount(a));
    EXPECT_EQ(expected, resB);
    return *this;
}
// L1 distance between two float arrays: sum of |a[j] - b[j]| over n elements.
// Uses the universal-intrinsics path for the bulk when CV_SIMD is enabled,
// then a scalar loop for the tail (or the whole range otherwise).
float normL1_(const float* a, const float* b, int n)
{
    float total = 0.f;
    int j = 0;
#if CV_SIMD
    // Vector body: accumulate lane-wise absolute differences, then reduce.
    v_float32 vtotal = vx_setzero_f32();
    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
        vtotal += v_absdiff(vx_load(a + j), vx_load(b + j));
    total = v_reduce_sum(vtotal);
#endif
    // Scalar tail.
    for (; j < n; j++)
        total += std::abs(a[j] - b[j]);
    return total;
}
// Vectorized partial sum for 8-bit signed data. Accumulates src0[] into
// dst[] (per-channel when cn is 2 or 4) and returns the number of pixels
// consumed; returns 0 (nothing done) when a mask is given or cn is
// unsupported, so the caller can fall back to the scalar path.
int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
{
    if (mask || (cn != 1 && cn != 2 && cn != 4))
        return 0;
    len *= cn;  // work in scalar elements, not pixels
    int x = 0;
    v_int32 v_sum = vx_setzero_s32();

    // Main body runs in outer chunks of at most 256*v_int16::nlanes elements;
    // within a chunk sums are kept in a 16-bit accumulator and only widened
    // to 32 bits once per chunk. The chunk bound is chosen so the int16
    // partial sums cannot overflow (int8 inputs, bounded iteration count).
    int len0 = len & -v_int8::nlanes;  // largest multiple of the int8 width
    while (x < len0)
    {
        const int len_tmp = min(x + 256*v_int16::nlanes, len0);
        v_int16 v_sum16 = vx_setzero_s16();
        // Expand each int8 vector into two int16 vectors and accumulate.
        for (; x < len_tmp; x += v_int8::nlanes)
        {
            v_int16 v_src0, v_src1;
            v_expand(vx_load(src0 + x), v_src0, v_src1);
            v_sum16 += v_src0 + v_src1;
        }
        // Widen the chunk's 16-bit partial sums into the 32-bit accumulator.
        v_int32 v_half0, v_half1;
        v_expand(v_sum16, v_half0, v_half1);
        v_sum += v_half0 + v_half1;
    }
    // Tail stage 1: one int16-wide widening load if enough elements remain...
    if (x <= len - v_int16::nlanes)
    {
        v_int32 v_half0, v_half1;
        v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
        v_sum += v_half0 + v_half1;
        x += v_int16::nlanes;
    }
    // ...tail stage 2: one int32-wide quad-expanding load if possible.
    if (x <= len - v_int32::nlanes)
    {
        v_sum += vx_load_expand_q(src0 + x);
        x += v_int32::nlanes;
    }

    if (cn == 1)
        *dst += v_reduce_sum(v_sum);
    else
    {
        // Lane i of v_sum belongs to channel i % cn.
        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
        v_store_aligned(ar, v_sum);
        for (int i = 0; i < v_int32::nlanes; ++i)
            dst[i % cn] += ar[i];
    }
    v_cleanup();
    return x / cn;
}
// Squared L2 distance between two float arrays: sum of (a[j] - b[j])^2
// over n elements. Vectorized bulk when CV_SIMD is enabled, scalar tail
// (or the whole range otherwise).
float normL2Sqr_(const float* a, const float* b, int n)
{
    float total = 0.f;
    int j = 0;
#if CV_SIMD
    // Vector body: multiply-add of the lane-wise differences.
    v_float32 vtotal = vx_setzero_f32();
    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
    {
        v_float32 diff = vx_load(a + j) - vx_load(b + j);
        vtotal = v_muladd(diff, diff, vtotal);
    }
    total = v_reduce_sum(vtotal);
#endif
    // Scalar tail.
    for (; j < n; j++)
    {
        float diff = a[j] - b[j];
        total += diff * diff;
    }
    return total;
}
// Hamming distance between byte arrays a and b: total number of differing
// bits over n bytes (popcount of a[i] ^ b[i], summed). Tries progressively
// narrower acceleration paths; each #if block resumes from the index i
// where the previous one stopped, and the final scalar loop handles any
// remaining tail bytes.
int normHamming(const uchar* a, const uchar* b, int n)
{
    CV_AVX_GUARD;
    int i = 0;
    int result = 0;
#if CV_AVX2
    {
        __m256i _r0 = _mm256_setzero_si256();
        __m256i _0 = _mm256_setzero_si256();
        // Per-nibble popcount lookup table, replicated in both 128-bit lanes
        // as required by _mm256_shuffle_epi8.
        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
        for(; i <= n - 32; i+= 32)
        {
            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
            __m256i _xor = _mm256_xor_si256(_a0, _b0);
            // Popcount of the low and high nibbles via table shuffle.
            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
                                                 _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
            // Horizontal byte sums via SAD against zero, accumulated per iteration.
            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
        }
        // Reduce the per-64-bit-lane partial sums down to a single scalar.
        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
    }
#endif // CV_AVX2
#if CV_POPCNT
    {
        // Hardware popcount on 8- then 4-byte words of the xor-ed inputs.
#  if defined CV_POPCNT_U64
        for(; i <= n - 8; i += 8)
        {
            result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
        }
#  endif
        for(; i <= n - 4; i += 4)
        {
            result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
        }
    }
#endif // CV_POPCNT
#if CV_SIMD128
    {
        // 128-bit universal-intrinsics path: vector popcount of xor, reduced.
        v_uint32x4 t = v_setzero_u32();
        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            t += v_popcount(v_load(a + i) ^ v_load(b + i));
        }
        result += v_reduce_sum(t);
    }
#endif // CV_SIMD128
#if CV_ENABLE_UNROLLED
    // Table-driven scalar path, unrolled by 4.
    for(; i <= n - 4; i += 4)
    {
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                  popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    }
#endif
    // Scalar tail for the remaining bytes.
    for(; i < n; i++)
    {
        result += popCountTable[a[i] ^ b[i]];
    }
    return result;
}