// One pass of the 4x4 inverse DCT for 10-bit content, operating on four
// rows of 32-bit coefficients.  The rows are transposed first, so calling
// this kernel twice performs the full 2-D transform.  Products fit in 32
// bits at this bit depth, so plain vmulq/vmla/vmls lanes are used (contrast
// with the _bd12 variant, which widens to 64 bits).
// NOTE(review): the cospi constant layout in `cospis` is inferred from the
// lane indices used below — confirm against the caller that builds it.
static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, int32x4_t *const a0, int32x4_t *const a1, int32x4_t *const a2, int32x4_t *const a3) {
  int32x4_t b0, b1, b2, b3;
  transpose_s32_4x4(a0, a1, a2, a3);
  // Even half: butterfly of rows 0/2, both scaled by cospis high lane 0.
  b0 = vaddq_s32(*a0, *a2);
  b1 = vsubq_s32(*a0, *a2);
  b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
  b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
  // Odd half: rows 1/3 rotated by the (high lane 1, low lane 1) cospi pair.
  b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1);
  b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
  b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
  b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
  // Round the fixed-point products back down (dct_const_round_shift).
  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
  // Final output butterflies.
  *a0 = vaddq_s32(b0, b3);
  *a1 = vaddq_s32(b1, b2);
  *a2 = vsubq_s32(b1, b2);
  *a3 = vsubq_s32(b0, b3);
}
// Compile-only intrinsic smoke test (GCC-testsuite style) for vsubq_s32.
// The arguments are deliberately left uninitialized and the result unused:
// the test only checks that the intrinsic compiles and allocates registers,
// not any runtime value.
void test_vsubQs32 (void) {
  int32x4_t out_int32x4_t;
  int32x4_t arg0_int32x4_t;
  int32x4_t arg1_int32x4_t;
  out_int32x4_t = vsubq_s32 (arg0_int32x4_t, arg1_int32x4_t);
}
// One-dimensional 4-point inverse ADST over two rows of coefficients packed
// in *q8s16/*q9s16; results are written back to the same registers.
// Register names (dN/qN) mirror the original ARM assembly this was ported
// from.  *d3s16, *d4s16, *d5s16 hold preloaded sinpi_*_9 constants and
// *q3s16 carries another constant in its low half — presumably set up by
// the caller; confirm the exact sinpi assignment there.
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, int16x4_t *d5s16, int16x8_t *q3s16, int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  // Split the two input rows into 64-bit halves.
  d6s16 = vget_low_s16(*q3s16);
  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  // Widening products of inputs with the sinpi constants.
  q10s32 = vmull_s16(*d3s16, d16s16);
  q11s32 = vmull_s16(*d4s16, d16s16);
  q12s32 = vmull_s16(d6s16, d17s16);
  q13s32 = vmull_s16(*d5s16, d18s16);
  q14s32 = vmull_s16(*d3s16, d18s16);
  // q15 accumulates (d16 + d19 - d18), later scaled by sinpi_3_9.
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);

  // Combine the partial products into the four ADST outputs.
  q10s32 = vaddq_s32(q10s32, q13s32);
  q10s32 = vaddq_s32(q10s32, q8s32);
  q11s32 = vsubq_s32(q11s32, q14s32);
  q8s32 = vdupq_n_s32(sinpi_3_9);
  q11s32 = vsubq_s32(q11s32, q9s32);
  q15s32 = vmulq_s32(q15s32, q8s32);
  q13s32 = vaddq_s32(q10s32, q12s32);
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);
  q10s32 = vsubq_s32(q10s32, q12s32);

  // Saturating round-and-narrow by 14 fractional bits back to int16.
  d16s16 = vqrshrn_n_s32(q13s32, 14);
  d17s16 = vqrshrn_n_s32(q14s32, 14);
  d18s16 = vqrshrn_n_s32(q15s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);
  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  return;
}
// Shared-coefficient ADST butterfly for 10-bit content: replaces the pair
// {x[0], x[1]} with rounded (x[0]+x[1])*c[0] and (x[0]-x[1])*c[0], each
// shifted down by DCT_CONST_BITS.  Products fit in 32 bits at this depth.
static INLINE void iadst_half_butterfly_bd10_neon(int32x4_t *const x, const int32x2_t c) {
  const int32x4_t a = x[0];
  const int32x4_t b = x[1];
  x[0] = vrshrq_n_s32(vmulq_lane_s32(vaddq_s32(a, b), c, 0), DCT_CONST_BITS);
  x[1] = vrshrq_n_s32(vmulq_lane_s32(vsubq_s32(a, b), c, 0), DCT_CONST_BITS);
}
// One pass of the 4x4 inverse DCT for 12-bit content.  Same structure as
// idct4x4_16_kernel_bd10, but 12-bit coefficients can overflow a 32-bit
// product, so every multiply widens to int64 (vmull) and narrows back with
// a rounding shift (vrshrn_n_s64).
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, int32x4_t *const a0, int32x4_t *const a1, int32x4_t *const a2, int32x4_t *const a3) {
  int32x4_t b0, b1, b2, b3;
  int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;
  transpose_s32_4x4(a0, a1, a2, a3);
  // Even half: butterfly of rows 0/2 scaled by cospis high lane 0,
  // computed as low/high 64-bit product pairs.
  b0 = vaddq_s32(*a0, *a2);
  b1 = vsubq_s32(*a0, *a2);
  c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
  c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
  c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
  c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
  // Odd half: rows 1/3 rotated by the (high lane 1, low lane 1) cospi pair.
  c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);
  c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);
  c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);
  c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);
  c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);
  c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);
  c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);
  c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);
  c4 = vsubq_s64(c4, c8);
  c5 = vsubq_s64(c5, c9);
  c6 = vaddq_s64(c6, c10);
  c7 = vaddq_s64(c7, c11);
  // Round-and-narrow the 64-bit intermediates back to 32 bits.
  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), vrshrn_n_s64(c1, DCT_CONST_BITS));
  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), vrshrn_n_s64(c3, DCT_CONST_BITS));
  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), vrshrn_n_s64(c5, DCT_CONST_BITS));
  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), vrshrn_n_s64(c7, DCT_CONST_BITS));
  // Final output butterflies.
  *a0 = vaddq_s32(b0, b3);
  *a1 = vaddq_s32(b1, b2);
  *a2 = vsubq_s32(b1, b2);
  *a3 = vsubq_s32(b0, b3);
}
// Shared-coefficient ADST butterfly for 12-bit content: replaces the pair
// {x[0], x[1]} with rounded (x[0]+x[1])*c[0] and (x[0]-x[1])*c[0].  The
// multiplies widen to 64 bits (vmull) because 12-bit coefficients can
// overflow a 32-bit product, then narrow back with a rounding shift.
static INLINE void iadst_half_butterfly_bd12_neon(int32x4_t *const x, const int32x2_t c) {
  const int32x4_t sum = vaddq_s32(x[0], x[1]);
  const int32x4_t dif = vsubq_s32(x[0], x[1]);
  const int64x2_t sum_lo = vmull_lane_s32(vget_low_s32(sum), c, 0);
  const int64x2_t sum_hi = vmull_lane_s32(vget_high_s32(sum), c, 0);
  const int64x2_t dif_lo = vmull_lane_s32(vget_low_s32(dif), c, 0);
  const int64x2_t dif_hi = vmull_lane_s32(vget_high_s32(dif), c, 0);
  x[0] = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
  x[1] = vcombine_s32(vrshrn_n_s64(dif_lo, DCT_CONST_BITS),
                      vrshrn_n_s64(dif_hi, DCT_CONST_BITS));
}
/* s32x4 sub: element-wise C = A - B over a Row*Col matrix of 32-bit ints.
 * The bulk of the data is processed four lanes at a time with NEON; the
 * remaining 0-3 tail elements are finished with scalar arithmetic. */
void mw_neon_mm_sub_s32x4(int * A, int Row, int Col, int * B, int * C)
{
	const int size = Row * Col;
	const int vec_end = size - (size % 4); /* first index NOT covered by full vectors */
	int idx;

	for (idx = 0; idx < vec_end; idx += 4)
	{
		const int32x4_t va = vld1q_s32(A + idx);
		const int32x4_t vb = vld1q_s32(B + idx);
		vst1q_s32(C + idx, vsubq_s32(va, vb));
	}

	/* Scalar tail for the last size % 4 elements. */
	for (; idx < size; idx++)
	{
		C[idx] = A[idx] - B[idx];
	}
}
// 4-point inverse DCT kernel (vectorized over four columns at once).
// This undoes od_fdct4_kernel by applying its lifting steps in reverse
// order with inverted signs; the OD_DCT_MUL constants (18293/16384,
// 21407/32768, 23013/32768) match the forward kernel's rotation constants
// documented there.
OD_SIMD_INLINE void od_idct4_kernel(int32x4_t *y0, int32x4_t *y1, int32x4_t *y2, int32x4_t *y3) {
  int32x4_t t0 = *y0;
  int32x4_t t1 = *y1;
  int32x4_t t2 = *y2;
  int32x4_t t3 = *y3;
  int32x4_t t2h;
  od_transpose4(&t0, &t1, &t2, &t3);
  // Inverse of the embedded 2-point type-IV DST (three lifting steps,
  // reversed relative to the forward transform).
  t3 = vaddq_s32(t3, OD_DCT_MUL(t1, 18293, 8192, 14));
  t1 = vsubq_s32(t1, OD_DCT_MUL(t3, 21407, 16384, 15));
  t3 = vaddq_s32(t3, OD_DCT_MUL(t1, 23013, 16384, 15));
  // Inverse of the embedded 2-point type-II DCT and the +1/-1 butterflies.
  t2 = vsubq_s32(t0, t2);
  t2h = OD_UNBIASED_RSHIFT(t2, 1); // unbiased halving, pairs with the forward kernel
  t0 = vsubq_s32(t0, vsubq_s32(t2h, OD_UNBIASED_RSHIFT(t3, 1)));
  t1 = vsubq_s32(t2h, t1);
  *y0 = t0;
  *y1 = vsubq_s32(t2, t1);
  *y2 = t1;
  *y3 = vsubq_s32(t0, t3);
}
// 4-point forward DCT kernel (vectorized over four columns at once),
// implemented as a lifting scheme so it is exactly invertible by
// od_idct4_kernel.  The od_overflow_check_epi32 calls are debug-time
// guards on the fixed-point multiplies.
OD_SIMD_INLINE void od_fdct4_kernel(int32x4_t *x0, int32x4_t *x1, int32x4_t *x2, int32x4_t *x3) {
  /*9 adds, 2 shifts, 3 "muls".*/
  // Note the input permutation: x1 maps to t2 and x2 to t1.
  int32x4_t t0 = *x0;
  int32x4_t t2 = *x1;
  int32x4_t t1 = *x2;
  int32x4_t t3 = *x3;
  int32x4_t t2h;
  /*+1/-1 butterflies:*/
  t3 = vsubq_s32(t0, t3);
  t2 = vaddq_s32(t2, t1);
  t2h = OD_UNBIASED_RSHIFT(t2, 1); // unbiased halving keeps the scheme invertible
  t1 = vsubq_s32(t2h, t1);
  t0 = vsubq_s32(t0, OD_UNBIASED_RSHIFT(t3, 1));
  /*+ Embedded 2-point type-II DCT.*/
  t0 = vaddq_s32(t0, t2h);
  t2 = vsubq_s32(t0, t2);
  /*+ Embedded 2-point type-IV DST.*/
  /*23013/32768 ~= 4*sin(\frac{\pi}{8}) - 2*tan(\frac{\pi}{8}) ~= 0.70230660471416898931046248770220*/
  od_overflow_check_epi32(t1, 23013, 16384, 0);
  t3 = vsubq_s32(t3, OD_DCT_MUL(t1, 23013, 16384, 15));
  /*21407/32768~=\sqrt{1/2}*cos(\frac{\pi}{8})) ~=0.65328148243818826392832158671359*/
  od_overflow_check_epi32(t3, 21407, 16384, 1);
  t1 = vaddq_s32(t1, OD_DCT_MUL(t3, 21407, 16384, 15));
  /*18293/16384 ~= 4*sin(\frac{\pi}{8}) - tan(\frac{\pi}{8}) ~= 1.1165201670872640381121512119119*/
  od_overflow_check_epi32(t3, 18293, 8192, 2);
  t3 = vsubq_s32(t3, OD_DCT_MUL(t1, 18293, 8192, 14));
  od_transpose4(&t0, &t1, &t2, &t3);
  *x0 = t0;
  *x1 = t1;
  *x2 = t2;
  *x3 = t3;
}
// Applies a complex pre-transform to the inre/inim buffers using the
// WebRtcIsacfix_kSinTab2 table, writing 32-bit results to outre/outim
// (first half front-to-back via outre1/outim1, second half back-to-front
// via outre2/outim2), and returns the maximum absolute value over all
// outputs.  The maximum is tracked in unsigned registers so the value
// 0x80000000 (whose absolute value does not fit in int32) is not lost.
static inline int32_t TransformAndFindMaxNeon(int16_t* inre, int16_t* inim, int32_t* outre, int32_t* outim) {
  int k;
  // Paired forward/backward cursors over each half of the buffers.
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);
  // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    // Load 4 table entries and 4 samples from each end of both buffers.
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    // The backward-read vectors must be negated/reversed to line up with
    // the forward-read data.
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);
    // Complex multiply-accumulate into 32-bit products.
    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);
    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);
    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);
    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
    // Store the vectors.  The second-half outputs are written in reversed
    // lane order (vrev64q + half swap = full 4-lane reversal).
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));
    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }
  // Horizontal reduction of the running maxima to a single scalar.
  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif
  return (int32_t)maximum;
}
// 8-point inverse ADST for 12-bit content.  io0..io7 each hold four 32-bit
// lanes (one transform per lane).  Inputs are permuted into x[], run
// through three butterfly stages using 64-bit intermediates (s[i][2] holds
// the low/high int64x2 halves of one 4-lane product), and written back
// with the alternating sign flips required by the ADST output ordering.
static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) {
  const int32x4_t c0 = create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
  const int32x4_t c1 = create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
  const int32x4_t c2 = create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
  int32x4_t x[8], t[4];
  int64x2_t s[8][2];

  // ADST input permutation.
  x[0] = *io7;
  x[1] = *io0;
  x[2] = *io5;
  x[3] = *io2;
  x[4] = *io3;
  x[5] = *io4;
  x[6] = *io1;
  x[7] = *io6;

  // stage 1
  iadst_butterfly_lane_0_1_bd12_neon(x[0], x[1], vget_low_s32(c0), s[0], s[1]);
  iadst_butterfly_lane_0_1_bd12_neon(x[2], x[3], vget_high_s32(c0), s[2], s[3]);
  iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_low_s32(c1), s[4], s[5]);
  iadst_butterfly_lane_0_1_bd12_neon(x[6], x[7], vget_high_s32(c1), s[6], s[7]);
  x[0] = add_dct_const_round_shift_low_8_bd12(s[0], s[4]);
  x[1] = add_dct_const_round_shift_low_8_bd12(s[1], s[5]);
  x[2] = add_dct_const_round_shift_low_8_bd12(s[2], s[6]);
  x[3] = add_dct_const_round_shift_low_8_bd12(s[3], s[7]);
  x[4] = sub_dct_const_round_shift_low_8_bd12(s[0], s[4]);
  x[5] = sub_dct_const_round_shift_low_8_bd12(s[1], s[5]);
  x[6] = sub_dct_const_round_shift_low_8_bd12(s[2], s[6]);
  x[7] = sub_dct_const_round_shift_low_8_bd12(s[3], s[7]);

  // stage 2
  // t[] snapshots x[0..3] so the butterflies below read pre-update values.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_high_s32(c2), s[4], s[5]);
  iadst_butterfly_lane_1_0_bd12_neon(x[7], x[6], vget_high_s32(c2), s[7], s[6]);
  x[0] = vaddq_s32(t[0], t[2]);
  x[1] = vaddq_s32(t[1], t[3]);
  x[2] = vsubq_s32(t[0], t[2]);
  x[3] = vsubq_s32(t[1], t[3]);
  x[4] = add_dct_const_round_shift_low_8_bd12(s[4], s[6]);
  x[5] = add_dct_const_round_shift_low_8_bd12(s[5], s[7]);
  x[6] = sub_dct_const_round_shift_low_8_bd12(s[4], s[6]);
  x[7] = sub_dct_const_round_shift_low_8_bd12(s[5], s[7]);

  // stage 3
  iadst_half_butterfly_bd12_neon(x + 2, vget_low_s32(c2));
  iadst_half_butterfly_bd12_neon(x + 6, vget_low_s32(c2));

  // Output with ADST sign pattern (odd outputs negated).
  *io0 = x[0];
  *io1 = vnegq_s32(x[4]);
  *io2 = x[6];
  *io3 = vnegq_s32(x[2]);
  *io4 = x[3];
  *io5 = vnegq_s32(x[7]);
  *io6 = x[5];
  *io7 = vnegq_s32(x[1]);
}
/* One warped-lattice state update for four lanes:
 *   state_QS1_1 + Q16-warped (state_QS0 - state_QS0_1),
 * where vqdmulhq_s32 is a saturating doubling multiply returning the high
 * 32 bits, i.e. (a * b * 2) >> 32 per lane. */
static OPUS_INLINE int32x4_t calc_state(
    const int32x4_t state_QS0_s32x4,
    const int32x4_t state_QS0_1_s32x4,
    const int32x4_t state_QS1_1_s32x4,
    const int32x4_t warping_Q16_s32x4
)
{
    const int32x4_t diff_s32x4   = vsubq_s32( state_QS0_s32x4, state_QS0_1_s32x4 );
    const int32x4_t warped_s32x4 = vqdmulhq_s32( diff_s32x4, warping_Q16_s32x4 );
    return vaddq_s32( state_QS1_1_s32x4, warped_s32x4 );
}
// Lane-wise (in0 - in1) followed by a rounding right shift of
// DCT_CONST_BITS: the 10-bit variant of dct_const_round_shift on the
// difference of two partial products.
static INLINE int32x4_t sub_dct_const_round_shift_low_8_bd10(
    const int32x4_t in0, const int32x4_t in1) {
  const int32x4_t diff = vsubq_s32(in0, in1);
  return vrshrq_n_s32(diff, DCT_CONST_BITS);
}
// Update the noise estimation information. static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) { const int16_t kExp2Const = 11819; // Q13 int16_t* ptr_noiseEstLogQuantile = NULL; int16_t* ptr_noiseEstQuantile = NULL; int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const); int32x4_t twentyOne32x4 = vdupq_n_s32(21); int32x4_t constA32x4 = vdupq_n_s32(0x1fffff); int32x4_t constB32x4 = vdupq_n_s32(0x200000); int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, inst->magnLen); // Guarantee a Q-domain as high as possible and still fit in int16 inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const, tmp16, 21); int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise); for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset], ptr_noiseEstQuantile = &inst->noiseEstQuantile[0]; ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3]; ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) { // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i]; int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile); int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4); // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4); v32x4A = vorrq_s32(v32x4A, constB32x4); // tmp16 = (int16_t)(tmp32no2 >> 21); v32x4B = vshrq_n_s32(v32x4B, 21); // tmp16 -= 21;// shift 21 to get result in Q0 v32x4B = vsubq_s32(v32x4B, twentyOne32x4); // tmp16 += (int16_t) inst->qNoise; // shift to get result in Q(qNoise) v32x4B = vaddq_s32(v32x4B, qNoise32x4); // if (tmp16 < 0) { // tmp32no1 >>= -tmp16; // } else { // tmp32no1 <<= tmp16; // } v32x4B = vshlq_s32(v32x4A, v32x4B); // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1); v16x4 = vqmovn_s32(v32x4B); //inst->noiseEstQuantile[i] = tmp16; vst1_s16(ptr_noiseEstQuantile, v16x4); } // Last iteration: // inst->quantile[i]=exp(inst->lquantile[offset+i]); // in Q21 int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile; int32_t tmp32no1 
= (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac tmp16 = (int16_t)(tmp32no2 >> 21); tmp16 -= 21;// shift 21 to get result in Q0 tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise) if (tmp16 < 0) { tmp32no1 >>= -tmp16; } else {
// One-dimensional 8-point inverse ADST over eight rows of 16-bit
// coefficients held in *q8s16..*q15s16; results are written back to the
// same registers.  Register names (dNsM = 64-bit half, qNsM = 128-bit
// vector) mirror the ARM assembly this was ported from.  All fixed-point
// products are reduced with vqrshrn_n_s32(x, 14) — a saturating rounding
// narrow by 14 fractional bits.
static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, int16x8_t *q11s16, int16x8_t *q12s16, int16x8_t *q13s16, int16x8_t *q14s16, int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q2s16, q4s16, q5s16, q6s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
  int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  // Split each input row into low/high 64-bit halves.
  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  // Stage 1, first butterfly pair: (cospi_2_64, cospi_30_64) rotation.
  d14s16 = vdup_n_s16((int16_t)cospi_2_64);
  d15s16 = vdup_n_s16((int16_t)cospi_30_64);
  q1s32 = vmull_s16(d30s16, d14s16);
  q2s32 = vmull_s16(d31s16, d14s16);
  q3s32 = vmull_s16(d30s16, d15s16);
  q4s32 = vmull_s16(d31s16, d15s16);
  // (cospi_18_64, cospi_14_64) rotation.
  d30s16 = vdup_n_s16((int16_t)cospi_18_64);
  d31s16 = vdup_n_s16((int16_t)cospi_14_64);
  q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
  q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
  q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
  q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
  q5s32 = vmull_s16(d22s16, d30s16);
  q6s32 = vmull_s16(d23s16, d30s16);
  q7s32 = vmull_s16(d22s16, d31s16);
  q8s32 = vmull_s16(d23s16, d31s16);
  q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
  q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
  q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
  q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
  // Cross butterflies, then round/narrow back to int16.
  q11s32 = vaddq_s32(q1s32, q5s32);
  q12s32 = vaddq_s32(q2s32, q6s32);
  q1s32 = vsubq_s32(q1s32, q5s32);
  q2s32 = vsubq_s32(q2s32, q6s32);
  d22s16 = vqrshrn_n_s32(q11s32, 14);
  d23s16 = vqrshrn_n_s32(q12s32, 14);
  *q11s16 = vcombine_s16(d22s16, d23s16);
  q12s32 = vaddq_s32(q3s32, q7s32);
  q15s32 = vaddq_s32(q4s32, q8s32);
  q3s32 = vsubq_s32(q3s32, q7s32);
  q4s32 = vsubq_s32(q4s32, q8s32);
  d2s16 = vqrshrn_n_s32(q1s32, 14);
  d3s16 = vqrshrn_n_s32(q2s32, 14);
  d24s16 = vqrshrn_n_s32(q12s32, 14);
  d25s16 = vqrshrn_n_s32(q15s32, 14);
  d6s16 = vqrshrn_n_s32(q3s32, 14);
  d7s16 = vqrshrn_n_s32(q4s32, 14);
  *q12s16 = vcombine_s16(d24s16, d25s16);

  // Stage 1, second butterfly pair: (cospi_10_64, cospi_22_64) and
  // (cospi_26_64, cospi_6_64) rotations.
  d0s16 = vdup_n_s16((int16_t)cospi_10_64);
  d1s16 = vdup_n_s16((int16_t)cospi_22_64);
  q4s32 = vmull_s16(d26s16, d0s16);
  q5s32 = vmull_s16(d27s16, d0s16);
  q2s32 = vmull_s16(d26s16, d1s16);
  q6s32 = vmull_s16(d27s16, d1s16);
  d30s16 = vdup_n_s16((int16_t)cospi_26_64);
  d31s16 = vdup_n_s16((int16_t)cospi_6_64);
  q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
  q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
  q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
  q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
  q0s32 = vmull_s16(d18s16, d30s16);
  q13s32 = vmull_s16(d19s16, d30s16);
  q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
  q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
  q10s32 = vmull_s16(d18s16, d31s16);
  q9s32 = vmull_s16(d19s16, d31s16);
  q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
  q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
  q14s32 = vaddq_s32(q2s32, q10s32);
  q15s32 = vaddq_s32(q6s32, q9s32);
  q2s32 = vsubq_s32(q2s32, q10s32);
  q6s32 = vsubq_s32(q6s32, q9s32);
  d28s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d4s16 = vqrshrn_n_s32(q2s32, 14);
  d5s16 = vqrshrn_n_s32(q6s32, 14);
  *q14s16 = vcombine_s16(d28s16, d29s16);
  q9s32 = vaddq_s32(q4s32, q0s32);
  q10s32 = vaddq_s32(q5s32, q13s32);
  q4s32 = vsubq_s32(q4s32, q0s32);
  q5s32 = vsubq_s32(q5s32, q13s32);

  // Stage 2: (cospi_8_64, cospi_24_64) rotations on the residual terms.
  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
  d18s16 = vqrshrn_n_s32(q9s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);
  d8s16 = vqrshrn_n_s32(q4s32, 14);
  d9s16 = vqrshrn_n_s32(q5s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  q5s32 = vmull_s16(d2s16, d30s16);
  q6s32 = vmull_s16(d3s16, d30s16);
  q7s32 = vmull_s16(d2s16, d31s16);
  q0s32 = vmull_s16(d3s16, d31s16);
  q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
  q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
  q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
  q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
  q1s32 = vmull_s16(d4s16, d30s16);
  q3s32 = vmull_s16(d5s16, d30s16);
  q10s32 = vmull_s16(d4s16, d31s16);
  q2s32 = vmull_s16(d5s16, d31s16);
  q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
  q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
  q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
  q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
  // 16-bit add/sub butterflies on the already-narrowed rows.
  *q8s16 = vaddq_s16(*q11s16, *q9s16);
  *q11s16 = vsubq_s16(*q11s16, *q9s16);
  q4s16 = vaddq_s16(*q12s16, *q14s16);
  *q12s16 = vsubq_s16(*q12s16, *q14s16);
  q14s32 = vaddq_s32(q5s32, q1s32);
  q15s32 = vaddq_s32(q6s32, q3s32);
  q5s32 = vsubq_s32(q5s32, q1s32);
  q6s32 = vsubq_s32(q6s32, q3s32);
  d18s16 = vqrshrn_n_s32(q14s32, 14);
  d19s16 = vqrshrn_n_s32(q15s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  q1s32 = vaddq_s32(q7s32, q10s32);
  q3s32 = vaddq_s32(q0s32, q2s32);
  q7s32 = vsubq_s32(q7s32, q10s32);
  q0s32 = vsubq_s32(q0s32, q2s32);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  d29s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q7s32, 14);
  d15s16 = vqrshrn_n_s32(q0s32, 14);
  *q14s16 = vcombine_s16(d28s16, d29s16);

  // Stage 3: half butterflies scaled by cospi_16_64.
  d30s16 = vdup_n_s16((int16_t)cospi_16_64);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  q2s32 = vmull_s16(d22s16, d30s16);
  q3s32 = vmull_s16(d23s16, d30s16);
  q13s32 = vmull_s16(d22s16, d30s16);
  q1s32 = vmull_s16(d23s16, d30s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
  q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
  d4s16 = vqrshrn_n_s32(q2s32, 14);
  d5s16 = vqrshrn_n_s32(q3s32, 14);
  d24s16 = vqrshrn_n_s32(q13s32, 14);
  d25s16 = vqrshrn_n_s32(q1s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  *q12s16 = vcombine_s16(d24s16, d25s16);
  q13s32 = vmull_s16(d10s16, d30s16);
  q1s32 = vmull_s16(d11s16, d30s16);
  q11s32 = vmull_s16(d10s16, d30s16);
  q0s32 = vmull_s16(d11s16, d30s16);
  q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
  q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
  q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
  q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
  d20s16 = vqrshrn_n_s32(q13s32, 14);
  d21s16 = vqrshrn_n_s32(q1s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q0s32, 14);
  *q10s16 = vcombine_s16(d20s16, d21s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // ADST output sign pattern: negate the odd output rows.
  q5s16 = vdupq_n_s16(0);
  *q9s16 = vsubq_s16(q5s16, *q9s16);
  *q11s16 = vsubq_s16(q5s16, q2s16);
  *q13s16 = vsubq_s16(q5s16, q6s16);
  *q15s16 = vsubq_s16(q5s16, q4s16);
  return;
}
// Generic-name overload for type-dispatched SIMD code: forwards 32-bit
// signed lane-wise subtraction to the underlying NEON intrinsic.
inline int32x4_t vsubq(const int32x4_t & v0, const int32x4_t & v1)
{
    const int32x4_t difference = vsubq_s32(v0, v1);
    return difference;
}
// Round each float lane up to the nearest integer.  vcvtq_s32_f32
// truncates toward zero, so lanes where the original value is strictly
// greater than its truncation need +1; the comparison mask is all-ones
// (reinterpreted as -1) exactly on those lanes, and subtracting it adds 1.
inline v_int32x4 v_ceil(const v_float32x4& a)
{
    const int32x4_t truncated = vcvtq_s32_f32(a.val);
    const uint32x4_t needs_inc = vcgtq_f32(a.val, vcvtq_f32_s32(truncated));
    return v_int32x4(vsubq_s32(truncated, vreinterpretq_s32_u32(needs_inc)));
}