Example #1
0
// One 4x4 inverse-DCT pass for 10-bit content: transposes the four input
// vectors in place, then applies the 4-point IDCT butterflies. At this bit
// depth 32-bit products have enough headroom, so plain vmulq/vmla/vmls are
// used before the DCT_CONST_BITS rounding shift. cospis packs the four
// cosine constants (lane layout assumed to match the bd12 kernel below --
// TODO confirm against the loader).
static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
                                          int32x4_t *const a0,
                                          int32x4_t *const a1,
                                          int32x4_t *const a2,
                                          int32x4_t *const a3) {
  int32x4_t even_sum, even_diff, odd0, odd1;

  transpose_s32_4x4(a0, a1, a2, a3);

  // Even part: (a0 +/- a2) scaled by cospis lane 2.
  even_sum = vmulq_lane_s32(vaddq_s32(*a0, *a2), vget_high_s32(cospis), 0);
  even_diff = vmulq_lane_s32(vsubq_s32(*a0, *a2), vget_high_s32(cospis), 0);

  // Odd part: cross terms of a1/a3 with cospis lanes 3 and 1.
  odd0 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1);
  odd1 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
  odd0 = vmlsq_lane_s32(odd0, *a3, vget_low_s32(cospis), 1);
  odd1 = vmlaq_lane_s32(odd1, *a3, vget_high_s32(cospis), 1);

  // Rounding shift down to coefficient precision.
  even_sum = vrshrq_n_s32(even_sum, DCT_CONST_BITS);
  even_diff = vrshrq_n_s32(even_diff, DCT_CONST_BITS);
  odd0 = vrshrq_n_s32(odd0, DCT_CONST_BITS);
  odd1 = vrshrq_n_s32(odd1, DCT_CONST_BITS);

  // Output butterflies.
  *a0 = vaddq_s32(even_sum, odd1);
  *a1 = vaddq_s32(even_diff, odd0);
  *a2 = vsubq_s32(even_diff, odd0);
  *a3 = vsubq_s32(even_sum, odd1);
}
Example #2
0
/* Compile-only smoke test for the vsubq_s32 intrinsic (GCC NEON intrinsic
   testsuite style): the arguments are deliberately left uninitialized and
   the result is unused.  Only code generation is being exercised -- the
   function is never meant to be executed, so the apparent
   uninitialized-read is intentional and must not be "fixed". */
void test_vsubQs32 (void)
{
  int32x4_t out_int32x4_t;
  int32x4_t arg0_int32x4_t;
  int32x4_t arg1_int32x4_t;

  out_int32x4_t = vsubq_s32 (arg0_int32x4_t, arg1_int32x4_t);
}
Example #3
0
// One 1-D pass of the 4x4 inverse ADST on 16-bit coefficients.
// *d3s16, *d4s16, *d5s16 and the low half of *q3s16 carry pre-loaded sinpi
// constants (presumably sinpi_1_9..sinpi_4_9 -- TODO confirm in the caller).
// *q8s16/*q9s16 hold the four input rows packed two per register and are
// overwritten with the transformed rows. Locals are named after the ARM
// d/q registers of the assembly this was ported from.
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d6s16 = vget_low_s16(*q3s16);

  // Unpack the four input rows x0..x3.
  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  // Widening products of the rows with the sinpi constants.
  q10s32 = vmull_s16(*d3s16, d16s16);
  q11s32 = vmull_s16(*d4s16, d16s16);
  q12s32 = vmull_s16(d6s16, d17s16);
  q13s32 = vmull_s16(*d5s16, d18s16);
  q14s32 = vmull_s16(*d3s16, d18s16);
  // q15s32 accumulates (x0 + x3 - x2), widened to 32 bits, for the
  // sinpi_3_9 path below.
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);

  // Combine partial products into the four output terms.
  q10s32 = vaddq_s32(q10s32, q13s32);
  q10s32 = vaddq_s32(q10s32, q8s32);
  q11s32 = vsubq_s32(q11s32, q14s32);
  q8s32 = vdupq_n_s32(sinpi_3_9);
  q11s32 = vsubq_s32(q11s32, q9s32);
  q15s32 = vmulq_s32(q15s32, q8s32);

  q13s32 = vaddq_s32(q10s32, q12s32);
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);
  q10s32 = vsubq_s32(q10s32, q12s32);

  // Saturating rounding narrow back to 16 bits (14 == DCT_CONST_BITS).
  d16s16 = vqrshrn_n_s32(q13s32, 14);
  d17s16 = vqrshrn_n_s32(q14s32, 14);
  d18s16 = vqrshrn_n_s32(q15s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);

  // Repack the transformed rows into the output registers.
  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  return;
}
// Half butterfly for the 10-bit IADST: replaces (x[0], x[1]) with
// round((x[0] + x[1]) * c[0]) and round((x[0] - x[1]) * c[0]), where the
// rounding right shift is DCT_CONST_BITS. 32-bit products are sufficient
// at this bit depth (contrast with the bd12 variant, which widens to 64).
static INLINE void iadst_half_butterfly_bd10_neon(int32x4_t *const x,
                                                  const int32x2_t c) {
  int32x4_t s = vaddq_s32(x[0], x[1]);
  int32x4_t d = vsubq_s32(x[0], x[1]);

  s = vmulq_lane_s32(s, c, 0);
  d = vmulq_lane_s32(d, c, 0);
  x[0] = vrshrq_n_s32(s, DCT_CONST_BITS);
  x[1] = vrshrq_n_s32(d, DCT_CONST_BITS);
}
Example #5
0
// One 4x4 inverse-DCT pass for 12-bit content. Same structure as the bd10
// kernel, but every product is widened to 64 bits (vmull) before the
// rounding narrow, because 32-bit accumulators could overflow at this
// bit depth. cospis packs the cosine constants; lanes are referenced the
// same way as in the bd10 kernel.
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
                                          int32x4_t *const a0,
                                          int32x4_t *const a1,
                                          int32x4_t *const a2,
                                          int32x4_t *const a3) {
  int32x4_t b0, b1, b2, b3;
  int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;

  transpose_s32_4x4(a0, a1, a2, a3);
  // Even part: (a0 +/- a2), each half widened and scaled by cospis lane 2.
  b0 = vaddq_s32(*a0, *a2);
  b1 = vsubq_s32(*a0, *a2);
  c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
  c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
  c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
  c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
  // Odd part: a1/a3 cross terms with cospis lanes 3 and 1 (low/high halves
  // kept separate because the products are 64-bit wide).
  c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);
  c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);
  c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);
  c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);
  c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);
  c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);
  c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);
  c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);
  c4 = vsubq_s64(c4, c8);
  c5 = vsubq_s64(c5, c9);
  c6 = vaddq_s64(c6, c10);
  c7 = vaddq_s64(c7, c11);
  // Rounding narrow the 64-bit terms back to 32 bits.
  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
                    vrshrn_n_s64(c1, DCT_CONST_BITS));
  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
                    vrshrn_n_s64(c3, DCT_CONST_BITS));
  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
                    vrshrn_n_s64(c5, DCT_CONST_BITS));
  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
                    vrshrn_n_s64(c7, DCT_CONST_BITS));
  // Output butterflies.
  *a0 = vaddq_s32(b0, b3);
  *a1 = vaddq_s32(b1, b2);
  *a2 = vsubq_s32(b1, b2);
  *a3 = vsubq_s32(b0, b3);
}
// Half butterfly for the 12-bit IADST: replaces (x[0], x[1]) with
// round((x[0] + x[1]) * c[0]) and round((x[0] - x[1]) * c[0]).
// Unlike the bd10 variant, each product is widened to 64 bits before the
// DCT_CONST_BITS rounding narrow to avoid overflow at this bit depth.
static INLINE void iadst_half_butterfly_bd12_neon(int32x4_t *const x,
                                                  const int32x2_t c) {
  const int32x4_t sum = vaddq_s32(x[0], x[1]);
  const int32x4_t sub = vsubq_s32(x[0], x[1]);
  int64x2_t lo, hi;

  // Widen, multiply by c[0], then round-narrow each half back to 32 bits.
  lo = vmull_lane_s32(vget_low_s32(sum), c, 0);
  hi = vmull_lane_s32(vget_high_s32(sum), c, 0);
  x[0] = vcombine_s32(vrshrn_n_s64(lo, DCT_CONST_BITS),
                      vrshrn_n_s64(hi, DCT_CONST_BITS));

  lo = vmull_lane_s32(vget_low_s32(sub), c, 0);
  hi = vmull_lane_s32(vget_high_s32(sub), c, 0);
  x[1] = vcombine_s32(vrshrn_n_s64(lo, DCT_CONST_BITS),
                      vrshrn_n_s64(hi, DCT_CONST_BITS));
}
Example #7
0
File: mw_neon.c — Project: Daniel-Hwang/eeg
/* s32x4 sub */
void mw_neon_mm_sub_s32x4(int * A, int Row, int Col, int * B, int * C)
{
	int32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 4; i <= size ; i+=4)
	{
		k = i - 4;
		neon_a = vld1q_s32(A + k);
		neon_b = vld1q_s32(B + k);
		neon_c = vsubq_s32(neon_a, neon_b);
		vst1q_s32(C + k, neon_c);
	}

	k = i - 4;
    for (i = 0; i < size % 4; i++)
	{
		C[k + i] = A[k + i] - B[k + i];
	}
}
Example #8
0
File: armdct.c — Project: AlecGamble/daala
/*Inverse 4-point DCT kernel: consumes the four column vectors, transposes
   them, and undoes the rotations applied by od_fdct4_kernel -- the same
   18293/21407/23013 fixed-point factors appear there in reverse order.
   OD_DCT_MUL is a rounded fixed-point multiply and OD_UNBIASED_RSHIFT
   halves a value with unbiased rounding (both defined elsewhere in this
   file -- NOTE(review): semantics assumed from usage, confirm at the
   macro definitions).*/
OD_SIMD_INLINE void od_idct4_kernel(int32x4_t *y0, int32x4_t *y1,
 int32x4_t *y2, int32x4_t *y3) {
  int32x4_t t0 = *y0;
  int32x4_t t1 = *y1;
  int32x4_t t2 = *y2;
  int32x4_t t3 = *y3;
  int32x4_t t2h;
  od_transpose4(&t0, &t1, &t2, &t3);
  /*Undo the embedded type-IV DST rotations (reverse order of the fdct).*/
  t3 = vaddq_s32(t3, OD_DCT_MUL(t1, 18293, 8192, 14));
  t1 = vsubq_s32(t1, OD_DCT_MUL(t3, 21407, 16384, 15));
  t3 = vaddq_s32(t3, OD_DCT_MUL(t1, 23013, 16384, 15));
  /*Undo the +1/-1 butterflies.*/
  t2 = vsubq_s32(t0, t2);
  t2h = OD_UNBIASED_RSHIFT(t2, 1);
  t0 = vsubq_s32(t0, vsubq_s32(t2h, OD_UNBIASED_RSHIFT(t3, 1)));
  t1 = vsubq_s32(t2h, t1);
  *y0 = t0;
  *y1 = vsubq_s32(t2, t1);
  *y2 = t1;
  *y3 = vsubq_s32(t0, t3);
}
Example #9
0
File: armdct.c — Project: AlecGamble/daala
/*Forward 4-point type-II DCT kernel: reads the four row vectors, applies
   the transform, transposes, and writes the results back in place.  Note
   the deliberate input shuffle: *x1 feeds t2 and *x2 feeds t1.
   OD_DCT_MUL, OD_UNBIASED_RSHIFT and od_overflow_check_epi32 are defined
   elsewhere in this file.*/
OD_SIMD_INLINE void od_fdct4_kernel(int32x4_t *x0, int32x4_t *x1,
 int32x4_t *x2, int32x4_t *x3) {
  /*9 adds, 2 shifts, 3 "muls".*/
  int32x4_t t0 = *x0;
  int32x4_t t2 = *x1;
  int32x4_t t1 = *x2;
  int32x4_t t3 = *x3;
  int32x4_t t2h;
  /*+1/-1 butterflies:*/
  t3 = vsubq_s32(t0, t3);
  t2 = vaddq_s32(t2, t1);
  t2h = OD_UNBIASED_RSHIFT(t2, 1);
  t1 = vsubq_s32(t2h, t1);
  t0 = vsubq_s32(t0, OD_UNBIASED_RSHIFT(t3, 1));
  /*+ Embedded 2-point type-II DCT.*/
  t0 = vaddq_s32(t0, t2h);
  t2 = vsubq_s32(t0, t2);
  /*+ Embedded 2-point type-IV DST.*/
  /*23013/32768 ~= 4*sin(\frac{\pi}{8}) - 2*tan(\frac{\pi}{8}) ~=
     0.70230660471416898931046248770220*/
  od_overflow_check_epi32(t1, 23013, 16384, 0);
  t3 = vsubq_s32(t3, OD_DCT_MUL(t1, 23013, 16384, 15));
  /*21407/32768~=\sqrt{1/2}*cos(\frac{\pi}{8}))
     ~=0.65328148243818826392832158671359*/
  od_overflow_check_epi32(t3, 21407, 16384, 1);
  t1 = vaddq_s32(t1, OD_DCT_MUL(t3, 21407, 16384, 15));
  /*18293/16384 ~= 4*sin(\frac{\pi}{8}) - tan(\frac{\pi}{8}) ~=
     1.1165201670872640381121512119119*/
  od_overflow_check_epi32(t3, 18293, 8192, 2);
  t3 = vsubq_s32(t3, OD_DCT_MUL(t1, 18293, 8192, 14));
  od_transpose4(&t0, &t1, &t2, &t3);
  *x0 = t0;
  *x1 = t1;
  *x2 = t2;
  *x3 = t3;
}
Example #10
0
// Rotates the complex input (inre/inim) into 32-bit output spectra
// (outre/outim) and returns the maximum absolute value over all outputs.
// Processes four samples from the front and four mirrored samples from
// the back of each FRAMESAMPLES/2-length array per iteration, using the
// cos/sin factors in WebRtcIsacfix_kSinTab2 (both defined elsewhere).
// NOTE(review): the exact rotation mirrors the scalar C reference this
// was ported from (see the ">> 5" comment below) -- confirm against that
// reference before changing any of the multiply/accumulate ordering.
static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
                                              int16_t* inim,
                                              int32_t* outre,
                                              int32_t* outim) {
  int k;
  // Forward pointers walk from the start, "2" pointers walk backwards
  // from the last 4-sample group of each half-length array.
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  // Running maxima kept as u32 so the value 0x80000000 survives (see below).
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    // The backward-loaded vectors arrive in descending sample order;
    // reverse them (and the negated factor) into ascending order.
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    // Complex multiply-accumulate of each block with its factors.
    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);

    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);

    // Combine the front and mirrored blocks into the four outputs.
    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);

    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));

    // Store the vectors.
    // Reverse the mirrored outputs so they land in ascending memory order
    // at the descending store pointers.
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));

    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }

  // Horizontal reduction of the per-lane maxima to a single scalar.
  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
// 8-point inverse ADST for 12-bit content, transforming eight 4-lane
// column vectors in place.  The iadst_butterfly_*_bd12_neon helpers
// (defined elsewhere) fill s[i] with 64-bit partial products (low/high
// halves), which the add/sub_dct_const_round_shift_low_8_bd12 helpers
// combine, round by DCT_CONST_BITS, and narrow back to 32 bits.
static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1,
                               int32x4_t *const io2, int32x4_t *const io3,
                               int32x4_t *const io4, int32x4_t *const io5,
                               int32x4_t *const io6, int32x4_t *const io7) {
  const int32x4_t c0 =
      create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
  const int32x4_t c1 =
      create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
  const int32x4_t c2 =
      create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
  int32x4_t x[8], t[4];
  int64x2_t s[8][2];

  // ADST input permutation.
  x[0] = *io7;
  x[1] = *io0;
  x[2] = *io5;
  x[3] = *io2;
  x[4] = *io3;
  x[5] = *io4;
  x[6] = *io1;
  x[7] = *io6;

  // stage 1
  iadst_butterfly_lane_0_1_bd12_neon(x[0], x[1], vget_low_s32(c0), s[0], s[1]);
  iadst_butterfly_lane_0_1_bd12_neon(x[2], x[3], vget_high_s32(c0), s[2], s[3]);
  iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_low_s32(c1), s[4], s[5]);
  iadst_butterfly_lane_0_1_bd12_neon(x[6], x[7], vget_high_s32(c1), s[6], s[7]);

  x[0] = add_dct_const_round_shift_low_8_bd12(s[0], s[4]);
  x[1] = add_dct_const_round_shift_low_8_bd12(s[1], s[5]);
  x[2] = add_dct_const_round_shift_low_8_bd12(s[2], s[6]);
  x[3] = add_dct_const_round_shift_low_8_bd12(s[3], s[7]);
  x[4] = sub_dct_const_round_shift_low_8_bd12(s[0], s[4]);
  x[5] = sub_dct_const_round_shift_low_8_bd12(s[1], s[5]);
  x[6] = sub_dct_const_round_shift_low_8_bd12(s[2], s[6]);
  x[7] = sub_dct_const_round_shift_low_8_bd12(s[3], s[7]);

  // stage 2
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_high_s32(c2), s[4], s[5]);
  iadst_butterfly_lane_1_0_bd12_neon(x[7], x[6], vget_high_s32(c2), s[7], s[6]);

  x[0] = vaddq_s32(t[0], t[2]);
  x[1] = vaddq_s32(t[1], t[3]);
  x[2] = vsubq_s32(t[0], t[2]);
  x[3] = vsubq_s32(t[1], t[3]);
  x[4] = add_dct_const_round_shift_low_8_bd12(s[4], s[6]);
  x[5] = add_dct_const_round_shift_low_8_bd12(s[5], s[7]);
  x[6] = sub_dct_const_round_shift_low_8_bd12(s[4], s[6]);
  x[7] = sub_dct_const_round_shift_low_8_bd12(s[5], s[7]);

  // stage 3
  iadst_half_butterfly_bd12_neon(x + 2, vget_low_s32(c2));
  iadst_half_butterfly_bd12_neon(x + 6, vget_low_s32(c2));

  // Output permutation; odd-index outputs are negated per the IADST
  // sign convention.
  *io0 = x[0];
  *io1 = vnegq_s32(x[4]);
  *io2 = x[6];
  *io3 = vnegq_s32(x[2]);
  *io4 = x[3];
  *io5 = vnegq_s32(x[7]);
  *io6 = x[5];
  *io7 = vnegq_s32(x[1]);
}
/* One warped-lattice state update: returns
   state_QS1_1 + vqdmulhq(state_QS0 - state_QS0_1, warping_Q16),
   i.e. the difference is scaled by the warping factor via a saturating
   doubling high-half multiply before being accumulated. */
static OPUS_INLINE int32x4_t calc_state( const int32x4_t state_QS0_s32x4, const int32x4_t state_QS0_1_s32x4, const int32x4_t state_QS1_1_s32x4, const int32x4_t warping_Q16_s32x4 )
{
    const int32x4_t diff_s32x4 = vsubq_s32( state_QS0_s32x4, state_QS0_1_s32x4 );
    const int32x4_t scaled_s32x4 = vqdmulhq_s32( diff_s32x4, warping_Q16_s32x4 );
    return vaddq_s32( state_QS1_1_s32x4, scaled_s32x4 );
}
// Per-lane (in0 - in1) followed by the DCT rounding right shift
// (vrshrq: round half away from zero by DCT_CONST_BITS).
static INLINE int32x4_t
sub_dct_const_round_shift_low_8_bd10(const int32x4_t in0, const int32x4_t in1) {
  return vrshrq_n_s32(vsubq_s32(in0, in1), DCT_CONST_BITS);
}
Example #14
0
// Update the noise estimation information.
static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) {
  const int16_t kExp2Const = 11819; // Q13
  int16_t* ptr_noiseEstLogQuantile = NULL;
  int16_t* ptr_noiseEstQuantile = NULL;
  int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
  int32x4_t twentyOne32x4 = vdupq_n_s32(21);
  int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
  int32x4_t constB32x4 = vdupq_n_s32(0x200000);

  int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
                                        inst->magnLen);

  // Guarantee a Q-domain as high as possible and still fit in int16
  inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
                                                                 tmp16,
                                                                 21);

  int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);

  for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
       ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
       ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
       ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {

    // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i];
    int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
    int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);

    // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
    v32x4A = vorrq_s32(v32x4A, constB32x4);

    // tmp16 = (int16_t)(tmp32no2 >> 21);
    v32x4B = vshrq_n_s32(v32x4B, 21);

    // tmp16 -= 21;// shift 21 to get result in Q0
    v32x4B = vsubq_s32(v32x4B, twentyOne32x4);

    // tmp16 += (int16_t) inst->qNoise;
    // shift to get result in Q(qNoise)
    v32x4B = vaddq_s32(v32x4B, qNoise32x4);

    // if (tmp16 < 0) {
    //   tmp32no1 >>= -tmp16;
    // } else {
    //   tmp32no1 <<= tmp16;
    // }
    v32x4B = vshlq_s32(v32x4A, v32x4B);

    // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
    v16x4 = vqmovn_s32(v32x4B);

    //inst->noiseEstQuantile[i] = tmp16;
    vst1_s16(ptr_noiseEstQuantile, v16x4);
  }

  // Last iteration:

  // inst->quantile[i]=exp(inst->lquantile[offset+i]);
  // in Q21
  int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile;
  int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac

  tmp16 = (int16_t)(tmp32no2 >> 21);
  tmp16 -= 21;// shift 21 to get result in Q0
  tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
  if (tmp16 < 0) {
    tmp32no1 >>= -tmp16;
  } else {
Example #15
0
// One 1-D pass of the 8x8 inverse ADST on 16-bit coefficients held in
// *q8s16..*q15s16; all eight rows are transformed in place.  Locals are
// named after the ARM d/q registers of the assembly this was ported from,
// which is why the dataflow is hand-scheduled -- preserve statement order
// when editing.  All rounding narrows shift by 14 (== DCT_CONST_BITS).
static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
                               int16x8_t *q10s16, int16x8_t *q11s16,
                               int16x8_t *q12s16, int16x8_t *q13s16,
                               int16x8_t *q14s16, int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q2s16, q4s16, q5s16, q6s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
  int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  // Split each input row into 64-bit halves.
  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  // Stage 1, first pair of butterflies: rows 7/0 with cospi_2/cospi_30
  // and rows 3/4 with cospi_18/cospi_14.
  d14s16 = vdup_n_s16((int16_t)cospi_2_64);
  d15s16 = vdup_n_s16((int16_t)cospi_30_64);

  q1s32 = vmull_s16(d30s16, d14s16);
  q2s32 = vmull_s16(d31s16, d14s16);
  q3s32 = vmull_s16(d30s16, d15s16);
  q4s32 = vmull_s16(d31s16, d15s16);

  d30s16 = vdup_n_s16((int16_t)cospi_18_64);
  d31s16 = vdup_n_s16((int16_t)cospi_14_64);

  q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
  q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
  q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
  q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);

  q5s32 = vmull_s16(d22s16, d30s16);
  q6s32 = vmull_s16(d23s16, d30s16);
  q7s32 = vmull_s16(d22s16, d31s16);
  q8s32 = vmull_s16(d23s16, d31s16);

  q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
  q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
  q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
  q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);

  q11s32 = vaddq_s32(q1s32, q5s32);
  q12s32 = vaddq_s32(q2s32, q6s32);
  q1s32 = vsubq_s32(q1s32, q5s32);
  q2s32 = vsubq_s32(q2s32, q6s32);

  // Round/narrow back to 16 bits (14 == DCT_CONST_BITS).
  d22s16 = vqrshrn_n_s32(q11s32, 14);
  d23s16 = vqrshrn_n_s32(q12s32, 14);
  *q11s16 = vcombine_s16(d22s16, d23s16);

  q12s32 = vaddq_s32(q3s32, q7s32);
  q15s32 = vaddq_s32(q4s32, q8s32);
  q3s32 = vsubq_s32(q3s32, q7s32);
  q4s32 = vsubq_s32(q4s32, q8s32);

  d2s16 = vqrshrn_n_s32(q1s32, 14);
  d3s16 = vqrshrn_n_s32(q2s32, 14);
  d24s16 = vqrshrn_n_s32(q12s32, 14);
  d25s16 = vqrshrn_n_s32(q15s32, 14);
  d6s16 = vqrshrn_n_s32(q3s32, 14);
  d7s16 = vqrshrn_n_s32(q4s32, 14);
  *q12s16 = vcombine_s16(d24s16, d25s16);

  // Stage 1, second pair: rows 5/2 with cospi_10/cospi_22 and rows 1/6
  // with cospi_26/cospi_6.
  d0s16 = vdup_n_s16((int16_t)cospi_10_64);
  d1s16 = vdup_n_s16((int16_t)cospi_22_64);
  q4s32 = vmull_s16(d26s16, d0s16);
  q5s32 = vmull_s16(d27s16, d0s16);
  q2s32 = vmull_s16(d26s16, d1s16);
  q6s32 = vmull_s16(d27s16, d1s16);

  d30s16 = vdup_n_s16((int16_t)cospi_26_64);
  d31s16 = vdup_n_s16((int16_t)cospi_6_64);

  q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
  q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
  q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
  q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);

  q0s32 = vmull_s16(d18s16, d30s16);
  q13s32 = vmull_s16(d19s16, d30s16);

  q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
  q13s32 = vmlal_s16(q13s32, d29s16, d31s16);

  q10s32 = vmull_s16(d18s16, d31s16);
  q9s32 = vmull_s16(d19s16, d31s16);

  q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
  q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);

  q14s32 = vaddq_s32(q2s32, q10s32);
  q15s32 = vaddq_s32(q6s32, q9s32);
  q2s32 = vsubq_s32(q2s32, q10s32);
  q6s32 = vsubq_s32(q6s32, q9s32);

  d28s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d4s16 = vqrshrn_n_s32(q2s32, 14);
  d5s16 = vqrshrn_n_s32(q6s32, 14);
  *q14s16 = vcombine_s16(d28s16, d29s16);

  q9s32 = vaddq_s32(q4s32, q0s32);
  q10s32 = vaddq_s32(q5s32, q13s32);
  q4s32 = vsubq_s32(q4s32, q0s32);
  q5s32 = vsubq_s32(q5s32, q13s32);

  // Stage 2 constants.
  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
  d31s16 = vdup_n_s16((int16_t)cospi_24_64);

  d18s16 = vqrshrn_n_s32(q9s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);
  d8s16 = vqrshrn_n_s32(q4s32, 14);
  d9s16 = vqrshrn_n_s32(q5s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);

  // Stage 2 butterflies with cospi_8/cospi_24.
  q5s32 = vmull_s16(d2s16, d30s16);
  q6s32 = vmull_s16(d3s16, d30s16);
  q7s32 = vmull_s16(d2s16, d31s16);
  q0s32 = vmull_s16(d3s16, d31s16);

  q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
  q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
  q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
  q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);

  q1s32 = vmull_s16(d4s16, d30s16);
  q3s32 = vmull_s16(d5s16, d30s16);
  q10s32 = vmull_s16(d4s16, d31s16);
  q2s32 = vmull_s16(d5s16, d31s16);

  q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
  q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
  q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
  q2s32 = vmlal_s16(q2s32, d9s16, d30s16);

  // 16-bit add/sub butterflies on the already-narrowed rows.
  *q8s16 = vaddq_s16(*q11s16, *q9s16);
  *q11s16 = vsubq_s16(*q11s16, *q9s16);
  q4s16 = vaddq_s16(*q12s16, *q14s16);
  *q12s16 = vsubq_s16(*q12s16, *q14s16);

  q14s32 = vaddq_s32(q5s32, q1s32);
  q15s32 = vaddq_s32(q6s32, q3s32);
  q5s32 = vsubq_s32(q5s32, q1s32);
  q6s32 = vsubq_s32(q6s32, q3s32);

  d18s16 = vqrshrn_n_s32(q14s32, 14);
  d19s16 = vqrshrn_n_s32(q15s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);

  q1s32 = vaddq_s32(q7s32, q10s32);
  q3s32 = vaddq_s32(q0s32, q2s32);
  q7s32 = vsubq_s32(q7s32, q10s32);
  q0s32 = vsubq_s32(q0s32, q2s32);

  d28s16 = vqrshrn_n_s32(q1s32, 14);
  d29s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q7s32, 14);
  d15s16 = vqrshrn_n_s32(q0s32, 14);
  *q14s16 = vcombine_s16(d28s16, d29s16);

  // Stage 3: half butterflies scaled by cospi_16.
  d30s16 = vdup_n_s16((int16_t)cospi_16_64);

  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  q2s32 = vmull_s16(d22s16, d30s16);
  q3s32 = vmull_s16(d23s16, d30s16);
  q13s32 = vmull_s16(d22s16, d30s16);
  q1s32 = vmull_s16(d23s16, d30s16);

  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
  q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);

  d4s16 = vqrshrn_n_s32(q2s32, 14);
  d5s16 = vqrshrn_n_s32(q3s32, 14);
  d24s16 = vqrshrn_n_s32(q13s32, 14);
  d25s16 = vqrshrn_n_s32(q1s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  *q12s16 = vcombine_s16(d24s16, d25s16);

  q13s32 = vmull_s16(d10s16, d30s16);
  q1s32 = vmull_s16(d11s16, d30s16);
  q11s32 = vmull_s16(d10s16, d30s16);
  q0s32 = vmull_s16(d11s16, d30s16);

  q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
  q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
  q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
  q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);

  d20s16 = vqrshrn_n_s32(q13s32, 14);
  d21s16 = vqrshrn_n_s32(q1s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q0s32, 14);
  *q10s16 = vcombine_s16(d20s16, d21s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // Negate the odd-index outputs per the IADST sign convention.
  q5s16 = vdupq_n_s16(0);

  *q9s16 = vsubq_s16(q5s16, *q9s16);
  *q11s16 = vsubq_s16(q5s16, q2s16);
  *q13s16 = vsubq_s16(q5s16, q6s16);
  *q15s16 = vsubq_s16(q5s16, q4s16);
  return;
}
Example #16
0
// Overload dispatch: maps the type-generic vsubq() name onto the
// signed 32-bit NEON subtraction intrinsic.
inline int32x4_t vsubq(const int32x4_t &v0, const int32x4_t &v1)
{
    return vsubq_s32(v0, v1);
}
Example #17
0
// Rounds each float lane of 'a' up to the nearest integer.
// vcvtq_s32_f32 truncates toward zero; wherever the truncated value fell
// below the input, the comparison mask is all-ones (-1 as signed), so
// subtracting the mask adds the missing 1.
inline v_int32x4 v_ceil(const v_float32x4& a)
{
    const int32x4_t truncated = vcvtq_s32_f32(a.val);
    const uint32x4_t need_inc = vcgtq_f32(a.val, vcvtq_f32_s32(truncated));
    return v_int32x4(vsubq_s32(truncated, vreinterpretq_s32_u32(need_inc)));
}