static inline int32_t TransformAndFindMaxNeon(int16_t* inre, int16_t* inim, int32_t* outre, int32_t* outim) { int k; int16_t* inre1 = inre; int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; int16_t* inim1 = inim; int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; int32_t* outre1 = outre; int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; int32_t* outim1 = outim; int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4]; uint32x4_t max_r = vdupq_n_u32(0); uint32x4_t max_i = vdupq_n_u32(0); // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code. for (k = 0; k < FRAMESAMPLES/4; k += 4) { int16x4_t tmpi = vld1_s16(kSinTab1); kSinTab1 += 4; int16x4_t tmpr = vld1_s16(kSinTab2); kSinTab2 -= 4; int16x4_t inre_0 = vld1_s16(inre1); inre1 += 4; int16x4_t inre_1 = vld1_s16(inre2); inre2 -= 4; int16x4_t inim_0 = vld1_s16(inim1); inim1 += 4; int16x4_t inim_1 = vld1_s16(inim2); inim2 -= 4; tmpr = vneg_s16(tmpr); inre_1 = vrev64_s16(inre_1); inim_1 = vrev64_s16(inim_1); tmpr = vrev64_s16(tmpr); int32x4_t xr = vmull_s16(tmpr, inre_0); int32x4_t xi = vmull_s16(tmpr, inim_0); int32x4_t yr = vmull_s16(tmpr, inim_1); int32x4_t yi = vmull_s16(tmpi, inim_1); xr = vmlal_s16(xr, tmpi, inim_0); xi = vmlsl_s16(xi, tmpi, inre_0); yr = vmlal_s16(yr, tmpi, inre_1); yi = vmlsl_s16(yi, tmpr, inre_1); yr = vnegq_s32(yr); xr = vshrq_n_s32(xr, 5); xi = vshrq_n_s32(xi, 5); yr = vshrq_n_s32(yr, 5); yi = vshrq_n_s32(yi, 5); int32x4_t outr0 = vsubq_s32(xr, yi); int32x4_t outr1 = vaddq_s32(xr, yi); int32x4_t outi0 = vaddq_s32(xi, yr); int32x4_t outi1 = vsubq_s32(yr, xi); // Find the absolute maximum in the vectors. int32x4_t tmp0 = vabsq_s32(outr0); int32x4_t tmp1 = vabsq_s32(outr1); int32x4_t tmp2 = vabsq_s32(outi0); int32x4_t tmp3 = vabsq_s32(outi1); // vabs doesn't change the value of 0x80000000. // Use u32 so we don't lose the value 0x80000000. max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2)); max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3)); // Store the vectors. outr1 = vrev64q_s32(outr1); outi1 = vrev64q_s32(outi1); int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1)); int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1)); vst1q_s32(outre1, outr0); outre1 += 4; vst1q_s32(outim1, outi0); outim1 += 4; vst1q_s32(outre2, outr_1); outre2 -= 4; vst1q_s32(outim2, outi_1); outim2 -= 4; } max_r = vmaxq_u32(max_r, max_i); #if defined(WEBRTC_ARCH_ARM64) uint32_t maximum = vmaxvq_u32(max_r); #else uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r)); max32x2_r = vpmax_u32(max32x2_r, max32x2_r); uint32_t maximum = vget_lane_u32(max32x2_r, 0); #endif return (int32_t)maximum; }
static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { const int32x4_t c0 = create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); const int32x4_t c1 = create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); const int32x4_t c2 = create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); int32x4_t x[8], t[4]; int64x2_t s[8][2]; x[0] = *io7; x[1] = *io0; x[2] = *io5; x[3] = *io2; x[4] = *io3; x[5] = *io4; x[6] = *io1; x[7] = *io6; // stage 1 iadst_butterfly_lane_0_1_bd12_neon(x[0], x[1], vget_low_s32(c0), s[0], s[1]); iadst_butterfly_lane_0_1_bd12_neon(x[2], x[3], vget_high_s32(c0), s[2], s[3]); iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_low_s32(c1), s[4], s[5]); iadst_butterfly_lane_0_1_bd12_neon(x[6], x[7], vget_high_s32(c1), s[6], s[7]); x[0] = add_dct_const_round_shift_low_8_bd12(s[0], s[4]); x[1] = add_dct_const_round_shift_low_8_bd12(s[1], s[5]); x[2] = add_dct_const_round_shift_low_8_bd12(s[2], s[6]); x[3] = add_dct_const_round_shift_low_8_bd12(s[3], s[7]); x[4] = sub_dct_const_round_shift_low_8_bd12(s[0], s[4]); x[5] = sub_dct_const_round_shift_low_8_bd12(s[1], s[5]); x[6] = sub_dct_const_round_shift_low_8_bd12(s[2], s[6]); x[7] = sub_dct_const_round_shift_low_8_bd12(s[3], s[7]); // stage 2 t[0] = x[0]; t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_high_s32(c2), s[4], s[5]); iadst_butterfly_lane_1_0_bd12_neon(x[7], x[6], vget_high_s32(c2), s[7], s[6]); x[0] = vaddq_s32(t[0], t[2]); x[1] = vaddq_s32(t[1], t[3]); x[2] = vsubq_s32(t[0], t[2]); x[3] = vsubq_s32(t[1], t[3]); x[4] = add_dct_const_round_shift_low_8_bd12(s[4], s[6]); x[5] = add_dct_const_round_shift_low_8_bd12(s[5], s[7]); x[6] = sub_dct_const_round_shift_low_8_bd12(s[4], s[6]); x[7] = sub_dct_const_round_shift_low_8_bd12(s[5], s[7]); // stage 3 iadst_half_butterfly_bd12_neon(x + 2, vget_low_s32(c2)); iadst_half_butterfly_bd12_neon(x + 6, vget_low_s32(c2)); *io0 = x[0]; *io1 = vnegq_s32(x[4]); *io2 = x[6]; *io3 = vnegq_s32(x[2]); *io4 = x[3]; *io5 = vnegq_s32(x[7]); *io6 = x[5]; *io7 = vnegq_s32(x[1]); }
static inline void PostShiftAndSeparateNeon(int16_t* inre, int16_t* inim, int16_t* outre, int16_t* outim, int32_t sh) { int k; int16_t* inre1 = inre; int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; int16_t* inim1 = inim; int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; int16_t* outre1 = outre; int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; int16_t* outim1 = outim; int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 -4]; // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)", // ">> 14" and then ">> 9" as in the C code. int32x4_t shift = vdupq_n_s32(-sh - 23); for (k = 0; k < FRAMESAMPLES/4; k += 4) { int16x4_t tmpi = vld1_s16(kSinTab1); kSinTab1 += 4; int16x4_t tmpr = vld1_s16(kSinTab2); kSinTab2 -= 4; int16x4_t inre_0 = vld1_s16(inre1); inre1 += 4; int16x4_t inre_1 = vld1_s16(inre2); inre2 -= 4; int16x4_t inim_0 = vld1_s16(inim1); inim1 += 4; int16x4_t inim_1 = vld1_s16(inim2); inim2 -= 4; tmpr = vneg_s16(tmpr); inre_1 = vrev64_s16(inre_1); inim_1 = vrev64_s16(inim_1); tmpr = vrev64_s16(tmpr); int16x4_t xr = vqadd_s16(inre_0, inre_1); int16x4_t xi = vqsub_s16(inim_0, inim_1); int16x4_t yr = vqadd_s16(inim_0, inim_1); int16x4_t yi = vqsub_s16(inre_1, inre_0); int32x4_t outr0 = vmull_s16(tmpr, xr); int32x4_t outi0 = vmull_s16(tmpi, xr); int32x4_t outr1 = vmull_s16(tmpi, yr); int32x4_t outi1 = vmull_s16(tmpi, yi); outr0 = vmlsl_s16(outr0, tmpi, xi); outi0 = vmlal_s16(outi0, tmpr, xi); outr1 = vmlal_s16(outr1, tmpr, yi); outi1 = vmlsl_s16(outi1, tmpr, yr); outr0 = vshlq_s32(outr0, shift); outi0 = vshlq_s32(outi0, shift); outr1 = vshlq_s32(outr1, shift); outi1 = vshlq_s32(outi1, shift); outr1 = vnegq_s32(outr1); int16x4_t outre_0 = vmovn_s32(outr0); int16x4_t outim_0 = vmovn_s32(outi0); int16x4_t outre_1 = vmovn_s32(outr1); int16x4_t outim_1 = vmovn_s32(outi1); outre_1 = vrev64_s16(outre_1); outim_1 = vrev64_s16(outim_1); vst1_s16(outre1, outre_0); outre1 += 4; vst1_s16(outim1, outim_0); outim1 += 4; vst1_s16(outre2, outre_1); outre2 -= 4; vst1_s16(outim2, outim_1); outim2 -= 4; } }