static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
                                           const int16_t* vector1,
                                           const int16_t* vector2,
                                           size_t length,
                                           int scaling) {
  size_t i = 0;
  size_t len1 = length >> 3;
  size_t len2 = length & 7;
  int64x2_t sum0 = vdupq_n_s64(0);
  int64x2_t sum1 = vdupq_n_s64(0);

  for (i = len1; i > 0; i -= 1) {
    int16x8_t seq1_16x8 = vld1q_s16(vector1);
    int16x8_t seq2_16x8 = vld1q_s16(vector2);
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
#else
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
                               vget_high_s16(seq2_16x8));
#endif
    sum0 = vpadalq_s32(sum0, tmp0);
    sum1 = vpadalq_s32(sum1, tmp1);
    vector1 += 8;
    vector2 += 8;
  }

  // Calculate the rest of the samples.
  int64_t sum_res = 0;
  for (i = len2; i > 0; i -= 1) {
    sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
    vector1++;
    vector2++;
  }

  sum0 = vaddq_s64(sum0, sum1);
#if defined(WEBRTC_ARCH_ARM64)
  int64_t sum2 = vaddvq_s64(sum0);
  *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
#else
  int64x1_t shift = vdup_n_s64(-scaling);
  int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
  sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
  sum2 = vshl_s64(sum2, shift);
  vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
#endif
}
Пример #2
0
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre,
                                                       int16_t* inim,
                                                       int32_t* outre1,
                                                       int32_t* outre2,
                                                       int32_t sh) {
  int k;
  int16_t* p_inre = inre;
  int16_t* p_inim = inim;
  int32_t* p_outre1 = outre1;
  int32_t* p_outre2 = outre2;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  int32x4_t shift = vdupq_n_s32(-sh - 16);
  // Divide through by the normalizing constant:
  // scale all values with 1/240, i.e. with 273 in Q16.
  // 273/65536 ~= 0.0041656
  // 1/240 ~= 0.0041666
  int16x8_t scale = vdupq_n_s16(273);
  // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727.
  int factQ19 = 31727 << 16;
  int32x4_t fact = vdupq_n_s32(factQ19);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t inre16x8 = vld1q_s16(p_inre);
    int16x8_t inim16x8 = vld1q_s16(p_inim);
    p_inre += 8;
    p_inim += 8;
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    kCosTab += 8;
    kSinTab += 8;
    // By vshl and vmull, we effectively did "<< (-sh - 16)",
    // instead of "<< (-sh)" and ">> 16" as in the C code.
    int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale));
    int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale);
    int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale);
#else
    int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8),
                                   vget_high_s16(scale));
    int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8),
                                   vget_high_s16(scale));
#endif

    outre1_0 = vshlq_s32(outre1_0, shift);
    outre1_1 = vshlq_s32(outre1_1, shift);
    outre2_0 = vshlq_s32(outre2_0, shift);
    outre2_1 = vshlq_s32(outre2_1, shift);

    // Demodulate and separate.
    int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr));
    int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmpr_1 = vmovl_high_s16(tmpr);
    int32x4_t tmpi_1 = vmovl_high_s16(tmpi);
#else
    int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr));
    int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi));
#endif

    int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0));
    int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0));
    int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1));
    int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1));
    xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0));
    xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0));
    xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1));
    xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1));

#if defined(WEBRTC_ARCH_ARM64)
    int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0);
    int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0);
    int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1);
    int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1);
    xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0);
    xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0);
    xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1);
    xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1);
#else
    int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0));
    int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0));
    int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1));
    int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1));
    xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0));
    xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0));
    xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1));
    xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1));
#endif

    outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10));
    outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10));
    outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10));
    outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10));
    outre1_0 = vqdmulhq_s32(outre1_0, fact);
    outre2_0 = vqdmulhq_s32(outre2_0, fact);
    outre1_1 = vqdmulhq_s32(outre1_1, fact);
    outre2_1 = vqdmulhq_s32(outre2_1, fact);

    vst1q_s32(p_outre1, outre1_0);
    p_outre1 += 4;
    vst1q_s32(p_outre1, outre1_1);
    p_outre1 += 4;
    vst1q_s32(p_outre2, outre2_0);
    p_outre2 += 4;
    vst1q_s32(p_outre2, outre2_1);
    p_outre2 += 4;
  }
}
Пример #3
0
static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9,
                                               int16_t* inre2Q9,
                                               int32_t* outreQ16,
                                               int32_t* outimQ16) {
  int k;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921.
  // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code.
  int32_t fact  = 16921 << 5;
  int32x4_t factq = vdupq_n_s32(fact);
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    int16x8_t inre1 = vld1q_s16(inre1Q9);
    int16x8_t inre2 = vld1q_s16(inre2Q9);
    kCosTab += 8;
    kSinTab += 8;
    inre1Q9 += 8;
    inre2Q9 += 8;

    // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
    int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1));
    int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2));
    tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2));
    tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp2 = vmull_high_s16(tmpr, inre1);
    int32x4_t tmp3 = vmull_high_s16(tmpr, inre2);
    tmp2 = vmlal_high_s16(tmp2, tmpi, inre2);
    tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1);
#else
    int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1));
    int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2));
    tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2));
    tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1));
#endif

    int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq);
    int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq);
    int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq);
    int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq);
    vst1q_s32(outreQ16, outr_0);
    outreQ16 += 4;
    vst1q_s32(outreQ16, outr_1);
    outreQ16 += 4;
    vst1q_s32(outimQ16, outi_0);
    outimQ16 += 4;
    vst1q_s32(outimQ16, outi_1);
    outimQ16 += 4;

    // Find the absolute maximum in the vectors.
    tmp0 = vabsq_s32(outr_0);
    tmp1 = vabsq_s32(outr_1);
    tmp2 = vabsq_s32(outi_0);
    tmp3 = vabsq_s32(outi_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}