Exemplo n.º 1
0
/* u32x4 mm mul */
void mw_neon_mm_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, int Col, unsigned int * C)
{
	int i, k, j;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_u32(0);

			for (j = 0; j < T; j+=4)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				neon_a0 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_u32(A + j_T);

				neon_b = vld1q_u32(B + k_Row + j);
				neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
				neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
				neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
				neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

				neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);

				vst1q_lane_u32(C + k_Row + i, neon_c, 0);
				vst1q_lane_u32(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_u32(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_u32(C + k_Row + i + 3, neon_c, 3);

			}
		}
	}
}
void test_vdupQ_nu32 (void)
{
  uint32x4_t out_uint32x4_t;
  uint32_t arg0_uint32_t;

  out_uint32x4_t = vdupq_n_u32 (arg0_uint32_t);
}
Exemplo n.º 3
0
static float32x4_t vsqrtq_f32(float32x4_t s) {
  int i;
  float32x4_t x = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
  // check for divide by zero
  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
  // zero out the positive infinity results
  x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero),
                                      vreinterpretq_u32_f32(x)));
  // from arm documentation
  // The Newton-Raphson iteration:
  //     x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2)
  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
  }
  // sqrt(s) = s * 1/sqrt(s)
  return vmulq_f32(s, x);;
}
Exemplo n.º 4
0
/* u32x4 mv mul */
void mw_neon_mv_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, unsigned int * C)
{
	int i = 0;
	int k = 0;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_u32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_u32(A + j);
			j+=Row;
			neon_a1 = vld1q_u32(A + j);
			j+=Row;
			neon_a2 = vld1q_u32(A + j);
			j+=Row;
			neon_a3 = vld1q_u32(A + j);

			neon_b = vld1q_u32(B + k);
			neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
			neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
			neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
			neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

			neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);

		}

		vst1q_u32(C + i, neon_c);
	}
}
Exemplo n.º 5
0
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0.0f);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
Exemplo n.º 6
0
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
  int16_t* start_stored_p = aecm->channelStored;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int32_t* echo_est_p = echo_est;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  const uint16_t* far_spectrum_p = far_spectrum;
  int16x8_t store_v, adapt_v;
  uint16x8_t spectrum_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;
  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;

  far_energy_v = vdupq_n_u32(0);
  echo_adapt_v = vdupq_n_u32(0);
  echo_stored_v = vdupq_n_u32(0);

  // Get energy for the delayed far end signal and estimated
  // echo using both stored and adapted channels.
  // The C code:
  //  for (i = 0; i < PART_LEN1; i++) {
  //      echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                         far_spectrum[i]);
  //      (*far_energy) += (uint32_t)(far_spectrum[i]);
  //      *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
  //      (*echo_energy_stored) += (uint32_t)echo_est[i];
  //  }
  while (start_stored_p < end_stored_p) {
    spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);
    store_v = vld1q_s16(start_stored_p);

    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));

    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
                               vget_low_u16(spectrum_v));
    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
                                vget_high_u16(spectrum_v));
    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);

    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
                             vget_low_u16(spectrum_v));
    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
                             vget_high_u16(spectrum_v));

    start_stored_p += 8;
    start_adapt_p += 8;
    far_spectrum_p += 8;
    echo_est_p += 8;
  }

  AddLanes(far_energy, far_energy_v);
  AddLanes(echo_energy_stored, echo_stored_v);
  AddLanes(echo_energy_adapt, echo_adapt_v);

  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
  *far_energy += (uint32_t)far_spectrum[PART_LEN];
  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}
Exemplo n.º 7
0
void SkRGB16BlitterBlitV_neon(uint16_t* device,
                              int height,
                              size_t deviceRB,
                              unsigned scale,
                              uint32_t src32) {
    if (height >= 8)
    {
        uint16_t* dst = device;

        // prepare constants
        uint16x8_t vdev = vdupq_n_u16(0);
        uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
        uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
        uint32x4_t vsrc32 = vdupq_n_u32(src32);
        uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

        while (height >= 8){
            LOAD_LANE_16(vdev, 0)
            LOAD_LANE_16(vdev, 1)
            LOAD_LANE_16(vdev, 2)
            LOAD_LANE_16(vdev, 3)
            LOAD_LANE_16(vdev, 4)
            LOAD_LANE_16(vdev, 5)
            LOAD_LANE_16(vdev, 6)
            LOAD_LANE_16(vdev, 7)

            // Expand_rgb_16
            uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
            uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
            uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);

            // Compact_rgb_16
            vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
            vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
            vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
            vdst32_hi = vshrq_n_u32(vdst32_hi, 5);

            uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
            uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
            vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
            vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

            STORE_LANE_16(vdst16_lo, 0)
            STORE_LANE_16(vdst16_lo, 1)
            STORE_LANE_16(vdst16_lo, 2)
            STORE_LANE_16(vdst16_lo, 3)
            STORE_LANE_16(vdst16_hi, 0)
            STORE_LANE_16(vdst16_hi, 1)
            STORE_LANE_16(vdst16_hi, 2)
            STORE_LANE_16(vdst16_hi, 3)
            height -= 8;
        }
    }
    while (height != 0){
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
        *device = SkCompact_rgb_16((src32 + dst32) >> 5);
        device = (uint16_t*)((char*)device + deviceRB);
        height--;
    }
}
Exemplo n.º 8
0
uint32x4_t test_vdupq_n_u32(uint32_t v1) {
  // CHECK: test_vdupq_n_u32
  return vdupq_n_u32(v1);
  // CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
}
Exemplo n.º 9
0
void test_vdupq_nu32 (void)
{
  out_uint32x4_t = vdupq_n_u32 (~0x12000000);
}
Exemplo n.º 10
0
bool CPU_ProbeNEON()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
	return false;
#elif (CRYPTOPP_ARM_NEON_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
	volatile bool result = true;
	__try
	{
		uint32_t v1[4] = {1,1,1,1};
		uint32x4_t x1 = vld1q_u32(v1);
		uint64_t v2[2] = {1,1};
		uint64x2_t x2 = vld1q_u64(v2);

		uint32x4_t x3 = vdupq_n_u32(2);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
		uint64x2_t x4 = vdupq_n_u64(2);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);

		result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
	}
	__except (EXCEPTION_EXECUTE_HANDLER)
	{
		return false;
	}
	return result;
# else

	// longjmp and clobber warnings. Volatile is required.
	// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
	volatile bool result = true;

	volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
	if (oldHandler == SIG_ERR)
		return false;

	volatile sigset_t oldMask;
	if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
		return false;

	if (setjmp(s_jmpSIGILL))
		result = false;
	else
	{
		uint32_t v1[4] = {1,1,1,1};
		uint32x4_t x1 = vld1q_u32(v1);
		uint64_t v2[2] = {1,1};
		uint64x2_t x2 = vld1q_u64(v2);

		uint32x4_t x3 = {0,0,0,0};
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
		uint64x2_t x4 = {0,0};
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);

		// Hack... GCC optimizes away the code and returns true
		result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
	}

	sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
	signal(SIGILL, oldHandler);
	return result;
# endif
#else
	return false;
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE
}
Exemplo n.º 11
0
void meanStdDev(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
    f64 fsum = 0.0f, fsqsum = 0.0f;

    f32 arsum[8];
    uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
    float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        while (j < roiw4)
        {
            size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
            v_sum = v_zero;
            v_sqsum = v_zero_f;

            for ( ; j + 16 < blockSize ; j += 16)
            {
                internal::prefetch(src + j);
                uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

                // 0
                uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
                uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
                float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);

                // 1
                v_srclo = vmovl_u16(vget_low_u16(v_src1));
                v_srchi = vmovl_u16(vget_high_u16(v_src1));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                v_srclo_f = vcvtq_f32_u32(v_srclo);
                v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
            }

            for ( ; j < blockSize; j += 4)
            {
                uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
                float32x4_t v_src_f = vcvtq_f32_u32(v_src);
                v_sum = vaddq_u32(v_sum, v_src);
                v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
            }

            vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
            vst1q_f32(arsum + 4, v_sqsum);

            fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
            fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
        }

        // collect a few last elements in the current row
        for ( ; j < size.width; ++j)
        {
            f32 srcval = src[j];
            fsum += srcval;
            fsqsum += srcval * srcval;
        }
    }

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
Exemplo n.º 12
0
inline  uint32x4_t vdupq_n(const u32 & val) { return vdupq_n_u32(val); }
Exemplo n.º 13
0
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bit of
    //    the mantissa, putting eight into the biased exponent (to shift/
    //    compensate the fact that the exponent has been shifted in the top/
    //    fractional part and finally getting rid of the implicit leading one
    //    from the mantissa by substracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
                                       vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
                                          vec_mantissa_mask);
    const float32x4_t y =
        vreinterpretq_f32_u32(vorrq_u32(mantissa,
                                        vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.
    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }

  return a_exp_b;
}
Exemplo n.º 14
0
static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
                                              int16_t* inim,
                                              int32_t* outre,
                                              int32_t* outim) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);

    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);

    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);

    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));

    // Store the vectors.
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));

    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
Exemplo n.º 15
0
static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9,
                                               int16_t* inre2Q9,
                                               int32_t* outreQ16,
                                               int32_t* outimQ16) {
  int k;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921.
  // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code.
  int32_t fact  = 16921 << 5;
  int32x4_t factq = vdupq_n_s32(fact);
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    int16x8_t inre1 = vld1q_s16(inre1Q9);
    int16x8_t inre2 = vld1q_s16(inre2Q9);
    kCosTab += 8;
    kSinTab += 8;
    inre1Q9 += 8;
    inre2Q9 += 8;

    // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
    int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1));
    int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2));
    tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2));
    tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp2 = vmull_high_s16(tmpr, inre1);
    int32x4_t tmp3 = vmull_high_s16(tmpr, inre2);
    tmp2 = vmlal_high_s16(tmp2, tmpi, inre2);
    tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1);
#else
    int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1));
    int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2));
    tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2));
    tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1));
#endif

    int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq);
    int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq);
    int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq);
    int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq);
    vst1q_s32(outreQ16, outr_0);
    outreQ16 += 4;
    vst1q_s32(outreQ16, outr_1);
    outreQ16 += 4;
    vst1q_s32(outimQ16, outi_0);
    outimQ16 += 4;
    vst1q_s32(outimQ16, outi_1);
    outimQ16 += 4;

    // Find the absolute maximum in the vectors.
    tmp0 = vabsq_s32(outr_0);
    tmp1 = vabsq_s32(outr_1);
    tmp2 = vabsq_s32(outi_0);
    tmp3 = vabsq_s32(outi_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
Exemplo n.º 16
0
void test_vdupq_nu32 (void)
{
  out_uint32x4_t = vdupq_n_u32 (0x12ff);
}