static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs, int16_t *block) { int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16; int16x8_t q14s16, q15s16, qzs16; int16x4_t d0s16, d2s16, d3s16, dzs16; uint16x8_t q1u16, q9u16; uint16x4_t d1u16; dzs16 = vdup_n_s16(0); qzs16 = vdupq_n_s16(0); q15s16 = vdupq_n_s16(qscale << 1); q14s16 = vdupq_n_s16(qadd); q13s16 = vnegq_s16(q14s16); if (nCoeffs > 4) { for (; nCoeffs > 8; nCoeffs -= 16, block += 16) { q0s16 = vld1q_s16(block); q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16)); q8s16 = vld1q_s16(block + 8); q1u16 = vceqq_s16(q0s16, qzs16); q2s16 = vmulq_s16(q0s16, q15s16); q11s16 = vreinterpretq_s16_u16(vcltq_s16(q8s16, qzs16)); q10s16 = vmulq_s16(q8s16, q15s16); q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16); q11s16 = vbslq_s16(vreinterpretq_u16_s16(q11s16), q13s16, q14s16); q2s16 = vaddq_s16(q2s16, q3s16); q9u16 = vceqq_s16(q8s16, qzs16); q10s16 = vaddq_s16(q10s16, q11s16); q0s16 = vbslq_s16(q1u16, q0s16, q2s16); q8s16 = vbslq_s16(q9u16, q8s16, q10s16); vst1q_s16(block, q0s16); vst1q_s16(block + 8, q8s16); } } if (nCoeffs <= 0) return; d0s16 = vld1_s16(block); d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16)); d1u16 = vceq_s16(d0s16, dzs16); d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16)); d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16), vget_high_s16(q13s16), vget_high_s16(q14s16)); d2s16 = vadd_s16(d2s16, d3s16); d0s16 = vbsl_s16(d1u16, d0s16, d2s16); vst1_s16(block, d0s16); }
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm, const uint16_t* far_spectrum, int32_t* echo_est, uint32_t* far_energy, uint32_t* echo_energy_adapt, uint32_t* echo_energy_stored) { int16_t* start_stored_p = aecm->channelStored; int16_t* start_adapt_p = aecm->channelAdapt16; int32_t* echo_est_p = echo_est; const int16_t* end_stored_p = aecm->channelStored + PART_LEN; const uint16_t* far_spectrum_p = far_spectrum; int16x8_t store_v, adapt_v; uint16x8_t spectrum_v; uint32x4_t echo_est_v_low, echo_est_v_high; uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v; far_energy_v = vdupq_n_u32(0); echo_adapt_v = vdupq_n_u32(0); echo_stored_v = vdupq_n_u32(0); // Get energy for the delayed far end signal and estimated // echo using both stored and adapted channels. // The C code: // for (i = 0; i < PART_LEN1; i++) { // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); // (*far_energy) += (uint32_t)(far_spectrum[i]); // *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i]; // (*echo_energy_stored) += (uint32_t)echo_est[i]; // } while (start_stored_p < end_stored_p) { spectrum_v = vld1q_u16(far_spectrum_p); adapt_v = vld1q_s16(start_adapt_p); store_v = vld1q_s16(start_stored_p); far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v)); far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v)); echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)), vget_low_u16(spectrum_v)); echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)), vget_high_u16(spectrum_v)); vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low)); vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high)); echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v); echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v); echo_adapt_v = vmlal_u16(echo_adapt_v, vreinterpret_u16_s16(vget_low_s16(adapt_v)), vget_low_u16(spectrum_v)); echo_adapt_v = vmlal_u16(echo_adapt_v, vreinterpret_u16_s16(vget_high_s16(adapt_v)), vget_high_u16(spectrum_v)); start_stored_p += 8; start_adapt_p += 8; far_spectrum_p += 8; echo_est_p += 8; } AddLanes(far_energy, far_energy_v); AddLanes(echo_energy_stored, echo_stored_v); AddLanes(echo_energy_adapt, echo_adapt_v); echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN], far_spectrum[PART_LEN]); *echo_energy_stored += (uint32_t)echo_est[PART_LEN]; *far_energy += (uint32_t)far_spectrum[PART_LEN]; *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN]; }