template <bool align> void SquaredDifferenceSum(
            const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
            size_t width, size_t height, uint64_t * sum)
        {
            assert(width < 0x10000);
            if (align)
                assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));

            size_t alignedWidth = Simd::AlignLo(width, A);
            uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth);

            uint64x2_t _sum = K64_0000000000000000;
            for (size_t row = 0; row < height; ++row)
            {
                uint32x4_t rowSum = K32_00000000;
                for (size_t col = 0; col < alignedWidth; col += A)
                {
                    uint8x16_t _a = Load<align>(a + col);
                    uint8x16_t _b = Load<align>(b + col);
                    rowSum = vaddq_u32(rowSum, SquaredDifferenceSum(_a, _b));
                }
                if (width - alignedWidth)
                {
                    uint8x16_t _a = Load<align>(a + width - A);
                    uint8x16_t _b = Load<align>(b + width - A);
                    rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, tailMask));
                }
                _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum));
                a += aStride;
                b += bStride;
            }
            *sum = ExtractSum64u(_sum);
        }
Exemplo n.º 2
0
/* u32x4 mm mul */
void mw_neon_mm_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, int Col, unsigned int * C)
{
	int i, k, j;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_u32(0);

			for (j = 0; j < T; j+=4)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				neon_a0 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_u32(A + j_T);

				neon_b = vld1q_u32(B + k_Row + j);
				neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
				neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
				neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
				neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

				neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);

				vst1q_lane_u32(C + k_Row + i, neon_c, 0);
				vst1q_lane_u32(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_u32(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_u32(C + k_Row + i + 3, neon_c, 3);

			}
		}
	}
}
 SIMD_INLINE uint32x4_t SquaredDifferenceSumMasked(const uint8x16_t & a, const uint8x16_t & b, const uint8x16_t & mask)
 {
     uint8x16_t ad = vandq_u8(vabdq_u8(a, b), mask);
     uint16x8_t lo = Square(vget_low_u8(ad));
     uint16x8_t hi = Square(vget_high_u8(ad));
     return vaddq_u32(vpaddlq_u16(lo), vpaddlq_u16(hi));
 }
Exemplo n.º 4
0
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0.0f);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
Exemplo n.º 5
0
void test_vaddQu32 (void)
{
  uint32x4_t out_uint32x4_t;
  uint32x4_t arg0_uint32x4_t;
  uint32x4_t arg1_uint32x4_t;

  out_uint32x4_t = vaddq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
}
Exemplo n.º 6
0
/* u32x4 mv mul */
void mw_neon_mv_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, unsigned int * C)
{
	int i = 0;
	int k = 0;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_u32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_u32(A + j);
			j+=Row;
			neon_a1 = vld1q_u32(A + j);
			j+=Row;
			neon_a2 = vld1q_u32(A + j);
			j+=Row;
			neon_a3 = vld1q_u32(A + j);

			neon_b = vld1q_u32(B + k);
			neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
			neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
			neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
			neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

			neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);

		}

		vst1q_u32(C + i, neon_c);
	}
}
Exemplo n.º 7
0
/* u32x4 add */
void mw_neon_mm_add_u32x4(unsigned int * A, int Row, int Col, unsigned int * B, unsigned int * C)
{
	uint32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 4; i <= size ; i+=4)
	{
		k = i - 4;
		neon_a = vld1q_u32(A + k);
		neon_b = vld1q_u32(B + k);
		neon_c = vaddq_u32(neon_a, neon_b);
		vst1q_u32(C + k, neon_c);
	}

	k = i - 4;
    for (i = 0; i < size % 4; i++)
	{
		C[k + i] = A[k + i] + B[k + i];
	}
}
Exemplo n.º 8
0
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
  int16_t* start_stored_p = aecm->channelStored;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int32_t* echo_est_p = echo_est;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  const uint16_t* far_spectrum_p = far_spectrum;
  int16x8_t store_v, adapt_v;
  uint16x8_t spectrum_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;
  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;

  far_energy_v = vdupq_n_u32(0);
  echo_adapt_v = vdupq_n_u32(0);
  echo_stored_v = vdupq_n_u32(0);

  // Get energy for the delayed far end signal and estimated
  // echo using both stored and adapted channels.
  // The C code:
  //  for (i = 0; i < PART_LEN1; i++) {
  //      echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                         far_spectrum[i]);
  //      (*far_energy) += (uint32_t)(far_spectrum[i]);
  //      *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
  //      (*echo_energy_stored) += (uint32_t)echo_est[i];
  //  }
  while (start_stored_p < end_stored_p) {
    spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);
    store_v = vld1q_s16(start_stored_p);

    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));

    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
                               vget_low_u16(spectrum_v));
    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
                                vget_high_u16(spectrum_v));
    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);

    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
                             vget_low_u16(spectrum_v));
    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
                             vget_high_u16(spectrum_v));

    start_stored_p += 8;
    start_adapt_p += 8;
    far_spectrum_p += 8;
    echo_est_p += 8;
  }

  AddLanes(far_energy, far_energy_v);
  AddLanes(echo_energy_stored, echo_stored_v);
  AddLanes(echo_energy_adapt, echo_adapt_v);

  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
  *far_energy += (uint32_t)far_spectrum[PART_LEN];
  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}
Exemplo n.º 9
0
void SkRGB16BlitterBlitV_neon(uint16_t* device,
                              int height,
                              size_t deviceRB,
                              unsigned scale,
                              uint32_t src32) {
    if (height >= 8)
    {
        uint16_t* dst = device;

        // prepare constants
        uint16x8_t vdev = vdupq_n_u16(0);
        uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
        uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
        uint32x4_t vsrc32 = vdupq_n_u32(src32);
        uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

        while (height >= 8){
            LOAD_LANE_16(vdev, 0)
            LOAD_LANE_16(vdev, 1)
            LOAD_LANE_16(vdev, 2)
            LOAD_LANE_16(vdev, 3)
            LOAD_LANE_16(vdev, 4)
            LOAD_LANE_16(vdev, 5)
            LOAD_LANE_16(vdev, 6)
            LOAD_LANE_16(vdev, 7)

            // Expand_rgb_16
            uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
            uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
            uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);

            // Compact_rgb_16
            vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
            vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
            vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
            vdst32_hi = vshrq_n_u32(vdst32_hi, 5);

            uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
            uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
            vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
            vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

            STORE_LANE_16(vdst16_lo, 0)
            STORE_LANE_16(vdst16_lo, 1)
            STORE_LANE_16(vdst16_lo, 2)
            STORE_LANE_16(vdst16_lo, 3)
            STORE_LANE_16(vdst16_hi, 0)
            STORE_LANE_16(vdst16_hi, 1)
            STORE_LANE_16(vdst16_hi, 2)
            STORE_LANE_16(vdst16_hi, 3)
            height -= 8;
        }
    }
    while (height != 0){
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
        *device = SkCompact_rgb_16((src32 + dst32) >> 5);
        device = (uint16_t*)((char*)device + deviceRB);
        height--;
    }
}
Exemplo n.º 10
0
void meanStdDev(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
    f64 fsum = 0.0f, fsqsum = 0.0f;

    f32 arsum[8];
    uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
    float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        while (j < roiw4)
        {
            size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
            v_sum = v_zero;
            v_sqsum = v_zero_f;

            for ( ; j + 16 < blockSize ; j += 16)
            {
                internal::prefetch(src + j);
                uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

                // 0
                uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
                uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
                float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);

                // 1
                v_srclo = vmovl_u16(vget_low_u16(v_src1));
                v_srchi = vmovl_u16(vget_high_u16(v_src1));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                v_srclo_f = vcvtq_f32_u32(v_srclo);
                v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
            }

            for ( ; j < blockSize; j += 4)
            {
                uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
                float32x4_t v_src_f = vcvtq_f32_u32(v_src);
                v_sum = vaddq_u32(v_sum, v_src);
                v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
            }

            vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
            vst1q_f32(arsum + 4, v_sqsum);

            fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
            fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
        }

        // collect a few last elements in the current row
        for ( ; j < size.width; ++j)
        {
            f32 srcval = src[j];
            fsum += srcval;
            fsqsum += srcval * srcval;
        }
    }

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
Exemplo n.º 11
0
inline  uint32x4_t vaddq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vaddq_u32(v0, v1); }
Exemplo n.º 12
0
  inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
  {
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
      uint32x4_t bits = vmovq_n_u32(0);
      for (size_t i = 0; i < size; i += 16) {
        uint8x16_t A_vec = vld1q_u8 (a + i);
        uint8x16_t B_vec = vld1q_u8 (b + i);
        uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
        uint8x16_t bitsSet = vcntq_u8 (AxorB);
        uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
        uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
        bits = vaddq_u32(bits, bitSet4);
      }
      uint64x2_t bitSet2 = vpaddlq_u32 (bits);
      result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
      result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
#else
    {
      //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
      typedef unsigned long long pop_t;
      const size_t modulo = size % sizeof(pop_t);
      const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
      const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
      const pop_t* a2_end = a2 + (size / sizeof(pop_t));

      for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));

      if (modulo) {
        //in the case where size is not dividable by sizeof(pop_t)
        //need to mask off the bits at the end
        pop_t a_final = 0, b_final = 0;
        memcpy(&a_final, a2, modulo);
        memcpy(&b_final, b2, modulo);
        result += __builtin_popcountll(a_final ^ b_final);
      }
    }
#endif //NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    if(size%64 == 0)
    {
      const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
      const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
      size /= (sizeof(uint64_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt64(*pa ^ *pb);
      }
    }
    else
    {
      const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
      const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
      size /= (sizeof(uint32_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt32(*pa ^ *pb);
      }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    size /= (sizeof(uint32_t)/sizeof(unsigned char));
    for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
      result += popcnt32(*pa ^ *pb);
    }
#endif
    return result;
  }