void test_vrev64s16 (void)
{
  int16x4_t out_int16x4_t;
  int16x4_t arg0_int16x4_t;

  out_int16x4_t = vrev64_s16 (arg0_int16x4_t);
}
Esempio n. 2
0
static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
                                              int16_t* inim,
                                              int32_t* outre,
                                              int32_t* outim) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);

    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);

    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);

    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));

    // Store the vectors.
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));

    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
Esempio n. 3
0
inline int16x4_t   vrev64(const int16x4_t   & v) { return vrev64_s16(v); }
Esempio n. 4
0
static inline void PostShiftAndSeparateNeon(int16_t* inre,
                                            int16_t* inim,
                                            int16_t* outre,
                                            int16_t* outim,
                                            int32_t sh) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int16_t* outre1 = outre;
  int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int16_t* outim1 = outim;
  int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 -4];
  // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
  // ">> 14" and then ">> 9" as in the C code.
  int32x4_t shift = vdupq_n_s32(-sh - 23);

  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int16x4_t xr = vqadd_s16(inre_0, inre_1);
    int16x4_t xi = vqsub_s16(inim_0, inim_1);
    int16x4_t yr = vqadd_s16(inim_0, inim_1);
    int16x4_t yi = vqsub_s16(inre_1, inre_0);

    int32x4_t outr0 = vmull_s16(tmpr, xr);
    int32x4_t outi0 = vmull_s16(tmpi, xr);
    int32x4_t outr1 = vmull_s16(tmpi, yr);
    int32x4_t outi1 = vmull_s16(tmpi, yi);
    outr0 = vmlsl_s16(outr0, tmpi, xi);
    outi0 = vmlal_s16(outi0, tmpr, xi);
    outr1 = vmlal_s16(outr1, tmpr, yi);
    outi1 = vmlsl_s16(outi1, tmpr, yr);

    outr0 = vshlq_s32(outr0, shift);
    outi0 = vshlq_s32(outi0, shift);
    outr1 = vshlq_s32(outr1, shift);
    outi1 = vshlq_s32(outi1, shift);
    outr1 = vnegq_s32(outr1);

    int16x4_t outre_0  = vmovn_s32(outr0);
    int16x4_t outim_0  = vmovn_s32(outi0);
    int16x4_t outre_1  = vmovn_s32(outr1);
    int16x4_t outim_1  = vmovn_s32(outi1);
    outre_1 = vrev64_s16(outre_1);
    outim_1 = vrev64_s16(outim_1);

    vst1_s16(outre1, outre_0);
    outre1 += 4;
    vst1_s16(outim1, outim_0);
    outim1 += 4;
    vst1_s16(outre2, outre_1);
    outre2 -= 4;
    vst1_s16(outim2, outim_1);
    outim2 -= 4;
  }
}