template <bool align> void SquaredDifferenceSum( const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) { assert(width < 0x10000); if (align) assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); size_t alignedWidth = Simd::AlignLo(width, A); uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); uint64x2_t _sum = K64_0000000000000000; for (size_t row = 0; row < height; ++row) { uint32x4_t rowSum = K32_00000000; for (size_t col = 0; col < alignedWidth; col += A) { uint8x16_t _a = Load<align>(a + col); uint8x16_t _b = Load<align>(b + col); rowSum = vaddq_u32(rowSum, SquaredDifferenceSum(_a, _b)); } if (width - alignedWidth) { uint8x16_t _a = Load<align>(a + width - A); uint8x16_t _b = Load<align>(b + width - A); rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, tailMask)); } _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); a += aStride; b += bStride; } *sum = ExtractSum64u(_sum); }
/*
 * u32x4 mm mul
 *
 * Computes C = A * B on unsigned 32-bit matrices. Products and sums wrap
 * modulo 2^32, as vmulq_u32 / vaddq_u32 do.
 *
 * Storage, as implied by the 4-row vector loads below, is column-major:
 *   A : Row x T    - column j starts at A + j * Row
 *   B : T   x Col  - column k starts at B + k * T
 *   C : Row x Col  - column k starts at C + k * Row
 *
 * Row and T are consumed four at a time, so both must be multiples of 4;
 * otherwise the 4-lane loads/stores run past the end of the matrices.
 * An empty inner dimension (T <= 0) yields an all-zero C.
 */
void mw_neon_mm_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, int Col, unsigned int * C)
{
	int i, k, j;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4)
	{
		for (k = 0; k < Col; k += 1)
		{
			/* Column k of B holds T entries; column k of C holds Row entries. */
			const unsigned int * b_col = B + k * T;
			unsigned int * c_out = C + k * Row + i;

			neon_c = vmovq_n_u32(0);

			for (j = 0; j < T; j += 4)
			{
				/* Column j of A holds Row entries, so its stride is Row (not T). */
				const unsigned int * a_col = A + j * Row + i;

				neon_a0 = vld1q_u32(a_col);
				neon_a1 = vld1q_u32(a_col + Row);
				neon_a2 = vld1q_u32(a_col + 2 * Row);
				neon_a3 = vld1q_u32(a_col + 3 * Row);

				/* B[j..j+3][k], each broadcast across a vector. */
				neon_b = vld1q_u32(b_col + j);
				neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
				neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
				neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
				neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

				neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);
			}

			/* Only the fully accumulated value matters: store it once. */
			vst1q_u32(c_out, neon_c);
		}
	}
}
// Squared-difference partial sums for 16 byte pairs, restricted by a byte mask.
// The absolute difference |a - b| is AND-ed with 'mask' before squaring, so bytes whose
// mask is zero contribute nothing. The squares of each 8-byte half are pairwise-added
// into 32-bit lanes, and the two halves are then summed lane by lane.
SIMD_INLINE uint32x4_t SquaredDifferenceSumMasked(const uint8x16_t & a, const uint8x16_t & b, const uint8x16_t & mask)
{
    const uint8x16_t maskedDiff = vandq_u8(vabdq_u8(a, b), mask);

    const uint16x8_t lowerSquares = Square(vget_low_u8(maskedDiff));
    const uint16x8_t upperSquares = Square(vget_high_u8(maskedDiff));

    const uint32x4_t lowerSums = vpaddlq_u16(lowerSquares);
    const uint32x4_t upperSums = vpaddlq_u16(upperSquares);
    return vaddq_u32(lowerSums, upperSums);
}
// L1 distance (sum of absolute differences) between the first n bytes of a and b.
//
// Three stages share the running index j:
//   1. an optional SIMD stage (SSE2 or NEON, chosen by CV_SSE / CV_NEON);
//   2. a scalar stage that consumes four bytes per step;
//   3. a byte-by-byte stage for whatever is left.
// Each stage picks up where the previous one stopped.
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i acc = _mm_setzero_si128();

    // 16 bytes per step: _mm_sad_epu8 yields one partial sum per 64-bit half.
    while( j <= n - 16 )
    {
        const __m128i va = _mm_loadu_si128((const __m128i*)(a + j));
        const __m128i vb = _mm_loadu_si128((const __m128i*)(b + j));
        acc = _mm_add_epi32(acc, _mm_sad_epu8(va, vb));
        j += 16;
    }

    // 4 bytes per step: the upper lanes are zero in both operands and add nothing.
    while( j <= n - 4 )
    {
        const __m128i va = _mm_cvtsi32_si128(*(const int*)(a + j));
        const __m128i vb = _mm_cvtsi32_si128(*(const int*)(b + j));
        acc = _mm_add_epi32(acc, _mm_sad_epu8(va, vb));
        j += 4;
    }

    // Fold the high 64-bit half onto the low one and take the low 32 bits.
    d = _mm_cvtsi128_si32(_mm_add_epi32(acc, _mm_unpackhi_epi64(acc, acc)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0u);

    for ( ; j <= n - 16; j += 16)
    {
        const uint8x16_t v_absdiff = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        const uint16x8_t v_lo16 = vmovl_u8(vget_low_u8(v_absdiff));
        const uint16x8_t v_hi16 = vmovl_u8(vget_high_u8(v_absdiff));

        // Widen to 32 bits while adding, so the accumulator lanes cannot saturate at 16 bits.
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_lo16), vget_low_u16(v_hi16)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_lo16), vget_high_u16(v_hi16)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    // Scalar stage, four bytes per step.
    for( ; j <= n - 4; j += 4 )
    {
        for( int k = 0; k < 4; k++ )
            d += std::abs(a[j + k] - b[j + k]);
    }

    // Remaining bytes.
    while( j < n )
    {
        d += std::abs(a[j] - b[j]);
        j++;
    }
    return d;
}
/* Issues a single vaddq_u32 call on two uint32x4_t operands.
   The operands are deliberately left uninitialised and the result is never read,
   so presumably only the fact that the call compiles matters here. */
void test_vaddQu32 (void)
{
  uint32x4_t lhs_uint32x4_t;
  uint32x4_t rhs_uint32x4_t;
  uint32x4_t sum_uint32x4_t = vaddq_u32 (lhs_uint32x4_t, rhs_uint32x4_t);
}
/*
 * u32x4 mv mul
 *
 * Computes C = A * B where A is a Row x T unsigned 32-bit matrix, B a vector of
 * T entries and C a vector of Row entries. Products and sums wrap modulo 2^32,
 * as vmulq_u32 / vaddq_u32 do.
 *
 * A is read column-major, as implied by the 4-row vector loads below: column k
 * starts at A + k * Row.
 *
 * Row and T are consumed four at a time, so both must be multiples of 4;
 * otherwise the 4-lane loads/stores run past the end of the buffers.
 */
void mw_neon_mv_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, unsigned int * C)
{
	int i = 0;
	int k = 0;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4)
	{
		neon_c = vmovq_n_u32(0);

		for (k = 0; k < T; k += 4)
		{
			/* Column k of A holds Row entries, so its stride is Row (not T). */
			const unsigned int * a_col = A + k * Row + i;

			neon_a0 = vld1q_u32(a_col);
			neon_a1 = vld1q_u32(a_col + Row);
			neon_a2 = vld1q_u32(a_col + 2 * Row);
			neon_a3 = vld1q_u32(a_col + 3 * Row);

			/* B[k..k+3], each broadcast across a vector. */
			neon_b = vld1q_u32(B + k);
			neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
			neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
			neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
			neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

			neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);
		}

		vst1q_u32(C + i, neon_c);
	}
}
/*
 * u32x4 add
 *
 * Element-wise C = A + B over Row * Col unsigned 32-bit values (sums wrap
 * modulo 2^32). Whole groups of four are added with NEON; the up-to-three
 * leftover elements are added one by one.
 */
void mw_neon_mm_add_u32x4(unsigned int * A, int Row, int Col, unsigned int * B, unsigned int * C)
{
	const int total = Row * Col;
	int pos = 0;

	/* Vector part: four elements per step while a full group remains. */
	for (; pos + 4 <= total; pos += 4)
	{
		const uint32x4_t lhs = vld1q_u32(A + pos);
		const uint32x4_t rhs = vld1q_u32(B + pos);
		vst1q_u32(C + pos, vaddq_u32(lhs, rhs));
	}

	/* Scalar part: the remaining total % 4 elements. */
	for (; pos < total; ++pos)
	{
		C[pos] = A[pos] + B[pos];
	}
}
// Computes the far-end energy and the echo energies for the stored and adapted
// channels, and writes the per-bin echo estimate into |echo_est|.
//
// Equivalent C code:
//   for (i = 0; i < PART_LEN1; i++) {
//     echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
//                                         far_spectrum[i]);
//     (*far_energy) += (uint32_t)(far_spectrum[i]);
//     *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
//     (*echo_energy_stored) += (uint32_t)echo_est[i];
//   }
//
// The first PART_LEN bins are handled eight at a time with NEON; the final bin
// (index PART_LEN) is handled in scalar code afterwards. The three energy
// outputs are accumulated into (via AddLanes and +=), not overwritten.
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
  const int16_t* stored_ch = aecm->channelStored;
  const int16_t* adapt_ch = aecm->channelAdapt16;

  uint32x4_t far_acc = vdupq_n_u32(0);
  uint32x4_t adapt_acc = vdupq_n_u32(0);
  uint32x4_t stored_acc = vdupq_n_u32(0);
  int i;

  for (i = 0; i < PART_LEN; i += 8) {
    const uint16x8_t spectrum = vld1q_u16(far_spectrum + i);
    const int16x8_t adapt = vld1q_s16(adapt_ch + i);
    const int16x8_t stored = vld1q_s16(stored_ch + i);

    const uint16x4_t spectrum_lo = vget_low_u16(spectrum);
    const uint16x4_t spectrum_hi = vget_high_u16(spectrum);

    // Far-end energy: widen the 16-bit magnitudes while accumulating.
    far_acc = vaddw_u16(far_acc, spectrum_lo);
    far_acc = vaddw_u16(far_acc, spectrum_hi);

    // Echo estimate from the stored channel; the channel values are
    // reinterpreted as unsigned for the widening multiply.
    const uint32x4_t est_lo =
        vmull_u16(vreinterpret_u16_s16(vget_low_s16(stored)), spectrum_lo);
    const uint32x4_t est_hi =
        vmull_u16(vreinterpret_u16_s16(vget_high_s16(stored)), spectrum_hi);
    vst1q_s32(echo_est + i, vreinterpretq_s32_u32(est_lo));
    vst1q_s32(echo_est + i + 4, vreinterpretq_s32_u32(est_hi));

    stored_acc = vaddq_u32(est_lo, stored_acc);
    stored_acc = vaddq_u32(est_hi, stored_acc);

    // Echo energy from the adapted channel: multiply-accumulate directly.
    adapt_acc = vmlal_u16(adapt_acc,
                          vreinterpret_u16_s16(vget_low_s16(adapt)),
                          spectrum_lo);
    adapt_acc = vmlal_u16(adapt_acc,
                          vreinterpret_u16_s16(vget_high_s16(adapt)),
                          spectrum_hi);
  }

  AddLanes(far_energy, far_acc);
  AddLanes(echo_energy_stored, stored_acc);
  AddLanes(echo_energy_adapt, adapt_acc);

  // Last bin, not covered by the vector loop.
  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
  *far_energy += (uint32_t)far_spectrum[PART_LEN];
  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}
void SkRGB16BlitterBlitV_neon(uint16_t* device, int height, size_t deviceRB, unsigned scale, uint32_t src32) { if (height >= 8) { uint16_t* dst = device; // prepare constants uint16x8_t vdev = vdupq_n_u16(0); uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE); uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE); uint32x4_t vsrc32 = vdupq_n_u32(src32); uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale); while (height >= 8){ LOAD_LANE_16(vdev, 0) LOAD_LANE_16(vdev, 1) LOAD_LANE_16(vdev, 2) LOAD_LANE_16(vdev, 3) LOAD_LANE_16(vdev, 4) LOAD_LANE_16(vdev, 5) LOAD_LANE_16(vdev, 6) LOAD_LANE_16(vdev, 7) // Expand_rgb_16 uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16)); uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5); uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5); // Compact_rgb_16 vdst32_lo = vaddq_u32(vdst32_lo, vsrc32); vdst32_hi = vaddq_u32(vdst32_hi, vsrc32); vdst32_lo = vshrq_n_u32(vdst32_lo, 5); vdst32_hi = vshrq_n_u32(vdst32_hi, 5); uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16); uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16); uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi); vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16); vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16); uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi); STORE_LANE_16(vdst16_lo, 0) STORE_LANE_16(vdst16_lo, 1) STORE_LANE_16(vdst16_lo, 2) STORE_LANE_16(vdst16_lo, 3) STORE_LANE_16(vdst16_hi, 0) STORE_LANE_16(vdst16_hi, 1) STORE_LANE_16(vdst16_hi, 2) STORE_LANE_16(vdst16_hi, 3) height -= 8; } } while (height != 0){ uint32_t dst32 = SkExpand_rgb_16(*device) * scale; *device = SkCompact_rgb_16((src32 + dst32) >> 5); device = (uint16_t*)((char*)device + deviceRB); height--; } }
void meanStdDev(const Size2D &size, const u16 * srcBase, ptrdiff_t srcStride, f32 * pMean, f32 * pStdDev) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3; f64 fsum = 0.0f, fsqsum = 0.0f; f32 arsum[8]; uint32x4_t v_zero = vdupq_n_u32(0u), v_sum; float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum; for (size_t i = 0; i < size.height; ++i) { const u16 * src = internal::getRowPtr(srcBase, srcStride, i); size_t j = 0u; while (j < roiw4) { size_t blockSize = std::min(roiw4 - j, blockSize0) + j; v_sum = v_zero; v_sqsum = v_zero_f; for ( ; j + 16 < blockSize ; j += 16) { internal::prefetch(src + j); uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); // 0 uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0)); uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0)); v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo); float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi); v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); // 1 v_srclo = vmovl_u16(vget_low_u16(v_src1)); v_srchi = vmovl_u16(vget_high_u16(v_src1)); v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); v_srclo_f = vcvtq_f32_u32(v_srclo); v_srchi_f = vcvtq_f32_u32(v_srchi); v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); } for ( ; j < blockSize; j += 4) { uint32x4_t v_src = vmovl_u16(vld1_u16(src + j)); float32x4_t v_src_f = vcvtq_f32_u32(v_src); v_sum = vaddq_u32(v_sum, v_src); v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f); } vst1q_f32(arsum, vcvtq_f32_u32(v_sum)); vst1q_f32(arsum + 4, v_sqsum); fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3]; fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7]; } // collect a few last elements in the current row for ( ; j < size.width; ++j) { f32 srcval = src[j]; fsum += srcval; fsqsum += srcval * srcval; } } // calc mean and 
stddev f64 itotal = 1.0 / size.total(); f64 mean = fsum * itotal; f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); if (pMean) *pMean = mean; if (pStdDev) *pStdDev = stddev; #else (void)size; (void)srcBase; (void)srcStride; (void)pMean; (void)pStdDev; #endif }
// Type-dispatched spelling of vaddq_u32: lane-wise sum of two uint32x4_t vectors.
inline uint32x4_t vaddq(const uint32x4_t & v0, const uint32x4_t & v1)
{
    const uint32x4_t laneSums = vaddq_u32(v0, v1);
    return laneSums;
}
/**
 * Hamming distance between two byte buffers: the number of bit positions in which
 * the first `size` bytes of `a` and `b` differ.
 *
 * Implementation is selected at compile time:
 *  - GCC/Clang with USE_SSE and __ARM_NEON__: NEON byte popcount, 16 bytes per step;
 *  - GCC/Clang with USE_SSE otherwise: __builtin_popcountll over 8-byte words;
 *  - anything else: popcnt64 / popcnt32 over 8- or 4-byte words.
 *
 * All paths compare exactly `size` bytes; an incomplete final word/vector is
 * zero-padded in a local buffer rather than read past the end of the input.
 *
 * @param a    start of the first buffer
 * @param b    start of the second buffer
 * @param size number of bytes to compare
 * @return number of differing bits
 */
inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
{
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
        uint32x4_t bits = vmovq_n_u32(0);
        for (size_t i = 0; i < size; i += 16) {
            uint8x16_t A_vec;
            uint8x16_t B_vec;
            if (size - i >= 16) {
                A_vec = vld1q_u8 (a + i);
                B_vec = vld1q_u8 (b + i);
            }
            else {
                // Fewer than 16 bytes remain: stage them in zero-padded buffers so that
                // nothing beyond `size` is read and the padding XORs to zero.
                unsigned char a_tail[16] = {0};
                unsigned char b_tail[16] = {0};
                memcpy(a_tail, a + i, size - i);
                memcpy(b_tail, b + i, size - i);
                A_vec = vld1q_u8 (a_tail);
                B_vec = vld1q_u8 (b_tail);
            }
            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
            uint8x16_t bitsSet = vcntq_u8 (AxorB);
            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
            bits = vaddq_u32(bits, bitSet4);
        }
        // Reduce the four 32-bit lane counts to two 64-bit sums and add their low words.
        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
#else
    {
        //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
        typedef unsigned long long pop_t;
        const size_t modulo = size % sizeof(pop_t);
        const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
        const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
        const pop_t* a2_end = a2 + (size / sizeof(pop_t));

        for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));

        if (modulo) {
            //in the case where size is not dividable by sizeof(pop_t)
            //need to mask off the bits at the end
            pop_t a_final = 0, b_final = 0;
            memcpy(&a_final, a2, modulo);
            memcpy(&b_final, b2, modulo);
            result += __builtin_popcountll(a_final ^ b_final);
        }
    }
#endif //NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    // 64-bit words are only used when size is a multiple of 64 bytes.
    if(size%64 == 0)
    {
        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
        const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
        const size_t words = size / (sizeof(uint64_t)/sizeof(unsigned char));
        for(size_t i = 0; i < words; ++i, ++pa, ++pb ) {
            result += popcnt64(*pa ^ *pb);
        }
        return result;
    }
#endif
    {
        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
        const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
        const size_t words = size / (sizeof(uint32_t)/sizeof(unsigned char));
        const size_t modulo = size % (sizeof(uint32_t)/sizeof(unsigned char));
        for(size_t i = 0; i < words; ++i, ++pa, ++pb ) {
            result += popcnt32(*pa ^ *pb);
        }
        if (modulo) {
            // Trailing bytes that do not fill a 32-bit word: zero-pad and count them too.
            uint32_t a_final = 0, b_final = 0;
            memcpy(&a_final, pa, modulo);
            memcpy(&b_final, pb, modulo);
            result += popcnt32(a_final ^ b_final);
        }
    }
    return result;
}