/* Intrinsic signature check: verifies that vmlal_s32 accepts an int64x2_t
 * accumulator plus two int32x2_t multiplicands and yields int64x2_t.
 * The operands are deliberately left uninitialized -- only the types and
 * the call itself matter for this compile-time test. */
void test_vmlals32 (void)
{
  int64x2_t result;
  int64x2_t accumulator;
  int32x2_t multiplicand;
  int32x2_t multiplier;

  result = vmlal_s32 (accumulator, multiplicand, multiplier);
}
// Post-shifts, normalizes (divide by 240) and demodulates FRAMESAMPLES/2
// complex fixed-point samples, writing the separated real/imaginary results
// to outre1/outre2 as Q-domain int32.  NEON implementation; processes 8
// samples per iteration.
// NOTE(review): assumes FRAMESAMPLES is a multiple of 16 (so FRAMESAMPLES/2
// is a multiple of 8) and that WebRtcIsacfix_kCosTab1/kSinTab1 each hold at
// least FRAMESAMPLES/2 entries -- confirm against the project headers.
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre,
                                                       int16_t* inim,
                                                       int32_t* outre1,
                                                       int32_t* outre2,
                                                       int32_t sh) {
  int k;
  int16_t* p_inre = inre;
  int16_t* p_inim = inim;
  int32_t* p_outre1 = outre1;
  int32_t* p_outre2 = outre2;
  // Demodulation tables (defined elsewhere in the project).
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  // Negative shift amount: vshlq_s32 with a negative count shifts right.
  int32x4_t shift = vdupq_n_s32(-sh - 16);
  // Divide through by the normalizing constant:
  // scale all values with 1/240, i.e. with 273 in Q16.
  // 273/65536 ~= 0.0041656
  // 1/240 ~= 0.0041666
  int16x8_t scale = vdupq_n_s16(273);
  // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727.
  int factQ19 = 31727 << 16;
  int32x4_t fact = vdupq_n_s32(factQ19);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    // Load 8 real and 8 imaginary input samples.
    int16x8_t inre16x8 = vld1q_s16(p_inre);
    int16x8_t inim16x8 = vld1q_s16(p_inim);
    p_inre += 8;
    p_inim += 8;
    // Load the matching 8 cosine and 8 sine table entries.
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    kCosTab += 8;
    kSinTab += 8;
    // By vshl and vmull, we effectively did "<< (-sh - 16)",
    // instead of "<< (-sh)" and ">> 16" as in the C code.
    int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale));
    int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale));
#if defined(WEBRTC_ARCH_ARM64)
    // AArch64 has widening-multiply-high intrinsics; use them directly.
    int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale);
    int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale);
#else
    int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8), vget_high_s16(scale));
    int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8), vget_high_s16(scale));
#endif
    // Apply the combined post-shift (see comment above).
    outre1_0 = vshlq_s32(outre1_0, shift);
    outre1_1 = vshlq_s32(outre1_1, shift);
    outre2_0 = vshlq_s32(outre2_0, shift);
    outre2_1 = vshlq_s32(outre2_1, shift);

    // Demodulate and separate.
    // Widen the 16-bit cos/sin values to 32 bits for the 64-bit products.
    int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr));
    int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmpr_1 = vmovl_high_s16(tmpr);
    int32x4_t tmpi_1 = vmovl_high_s16(tmpi);
#else
    int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr));
    int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi));
#endif

    // Complex multiply (low halves):
    //   xr = cos * re - sin * im
    //   xi = cos * im + sin * re
    int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0));
    int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0));
    int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1));
    int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1));
    xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0));
    xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0));
    xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1));
    xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1));

    // Same complex multiply for the high halves.
#if defined(WEBRTC_ARCH_ARM64)
    int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0);
    int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0);
    int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1);
    int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1);
    xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0);
    xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0);
    xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1);
    xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1);
#else
    int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0));
    int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0));
    int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1));
    int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1));
    xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0));
    xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0));
    xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1));
    xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1));
#endif

    // Narrow the 64-bit products back to 32 bits with a >> 10.
    outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10));
    outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10));
    outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10));
    outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10));
    // Saturating doubling multiply-high by sqrt(240)-derived factor.
    outre1_0 = vqdmulhq_s32(outre1_0, fact);
    outre2_0 = vqdmulhq_s32(outre2_0, fact);
    outre1_1 = vqdmulhq_s32(outre1_1, fact);
    outre2_1 = vqdmulhq_s32(outre2_1, fact);

    // Store the 8 results per output stream and advance.
    vst1q_s32(p_outre1, outre1_0);
    p_outre1 += 4;
    vst1q_s32(p_outre1, outre1_1);
    p_outre1 += 4;
    vst1q_s32(p_outre2, outre2_0);
    p_outre2 += 4;
    vst1q_s32(p_outre2, outre2_1);
    p_outre2 += 4;
  }
}
/**
 * Stereo (2-channel interleaved) biquad filter, 32-bit data, 32-bit
 * coefficients in Q30.
 *
 * Coefficient layout in pBiquadState->coefs: A2, A1, A0, -B2, -B1 (the B
 * terms are stored negated, so they are accumulated with adds -- see the
 * "(-B2)" / "(-B1)" comments in the scalar path).
 * Delay layout in pBiquadState->pDelays:
 *   [0]=x(n-1)L [1]=x(n-1)R [2]=x(n-2)L [3]=x(n-2)R
 *   [4]=y(n-1)L [5]=y(n-1)R [6]=y(n-2)L [7]=y(n-2)R
 *
 * @param pInstance  filter instance (cast to PFilter_State internally)
 * @param pDataIn    interleaved L/R input,  NrSamples frames
 * @param pDataOut   interleaved L/R output, NrSamples frames
 * @param NrSamples  number of stereo frames to process
 *
 * BUG FIX (NEON path): vset_lane_s32() is a pure intrinsic that RETURNS the
 * updated vector; the original code called it as a bare statement and
 * discarded the result, so the X_1/X_2/Y_1/Y_2 delay registers never
 * advanced inside the loop and the saved state was stale.  The results are
 * now assigned (and the x/y history shifts use plain vector assignment).
 */
void BQ_2I_D32F32C30_TRC_WRA_01 ( Biquad_Instance_t       *pInstance,
                                  LVM_INT32               *pDataIn,
                                  LVM_INT32               *pDataOut,
                                  LVM_INT16               NrSamples)
{
#if !(defined __ARM_HAVE_NEON)
    LVM_INT32 ynL,ynR,templ,tempd;
    LVM_INT16 ii;
    PFilter_State pBiquadState = (PFilter_State) pInstance;

    for (ii = NrSamples; ii != 0; ii--)
    {
        /**************************************************************************
                        PROCESSING OF THE LEFT CHANNEL
        ***************************************************************************/
        /* ynL= ( A2 (Q30) * x(n-2)L (Q0) ) >>30 in Q0*/
        MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[2],ynL,30)

        /* ynL+= ( A1 (Q30) * x(n-1)L (Q0) ) >> 30 in Q0*/
        MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[0],templ,30)
        ynL+=templ;

        /* ynL+= ( A0 (Q30) * x(n)L (Q0) ) >> 30 in Q0*/
        MUL32x32INTO32(pBiquadState->coefs[2],*pDataIn,templ,30)
        ynL+=templ;

        /* ynL+= (-B2 (Q30) * y(n-2)L (Q0) ) >> 30 in Q0*/
        MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[6],templ,30)
        ynL+=templ;

        /* ynL+= (-B1 (Q30) * y(n-1)L (Q0) ) >> 30 in Q0 */
        MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[4],templ,30)
        ynL+=templ;

        /**************************************************************************
                        PROCESSING OF THE RIGHT CHANNEL
        ***************************************************************************/
        /* ynR= ( A2 (Q30) * x(n-2)R (Q0) ) >> 30 in Q0*/
        MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[3],ynR,30)

        /* ynR+= ( A1 (Q30) * x(n-1)R (Q0) ) >> 30 in Q0*/
        MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[1],templ,30)
        ynR+=templ;

        /* ynR+= ( A0 (Q30) * x(n)R (Q0) ) >> 30 in Q0*/
        tempd=*(pDataIn+1);
        MUL32x32INTO32(pBiquadState->coefs[2],tempd,templ,30)
        ynR+=templ;

        /* ynR+= (-B2 (Q30) * y(n-2)R (Q0) ) >> 30 in Q0*/
        MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[7],templ,30)
        ynR+=templ;

        /* ynR+= (-B1 (Q30) * y(n-1)R (Q0) ) >> 30 in Q0 */
        MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[5],templ,30)
        ynR+=templ;

        /**************************************************************************
                        UPDATING THE DELAYS
        ***************************************************************************/
        pBiquadState->pDelays[7]=pBiquadState->pDelays[5]; /* y(n-2)R=y(n-1)R*/
        pBiquadState->pDelays[6]=pBiquadState->pDelays[4]; /* y(n-2)L=y(n-1)L*/
        pBiquadState->pDelays[3]=pBiquadState->pDelays[1]; /* x(n-2)R=x(n-1)R*/
        pBiquadState->pDelays[2]=pBiquadState->pDelays[0]; /* x(n-2)L=x(n-1)L*/
        pBiquadState->pDelays[5]=(LVM_INT32)ynR;           /* Update y(n-1)R in Q0*/
        pBiquadState->pDelays[4]=(LVM_INT32)ynL;           /* Update y(n-1)L in Q0*/
        pBiquadState->pDelays[0]=(*pDataIn);               /* Update x(n-1)L in Q0*/
        pDataIn++;
        pBiquadState->pDelays[1]=(*pDataIn);               /* Update x(n-1)R in Q0*/
        pDataIn++;

        /**************************************************************************
                        WRITING THE OUTPUT
        ***************************************************************************/
        *pDataOut=(LVM_INT32)ynL; /* Write Left output in Q0*/
        pDataOut++;
        *pDataOut=(LVM_INT32)ynR; /* Write Right output in Q0*/
        pDataOut++;
    }
#else
    LVM_INT16 ii = 0;
    PFilter_State pBiquadState = (PFilter_State) pInstance;

    /* Splat each Q30 coefficient across both lanes so one vmull/vmlal
       processes L (lane 0) and R (lane 1) together. */
    int32x2_t A2 = vdup_n_s32(pBiquadState->coefs[0]);
    int32x2_t A1 = vdup_n_s32(pBiquadState->coefs[1]);
    int32x2_t A0 = vdup_n_s32(pBiquadState->coefs[2]);
    int32x2_t B2 = vdup_n_s32(pBiquadState->coefs[3]); /* stored as -B2 */
    int32x2_t B1 = vdup_n_s32(pBiquadState->coefs[4]); /* stored as -B1 */

    /* Load the delay line: each vector holds {L, R}. */
    int32x2_t X_2 = vld1_s32(&pBiquadState->pDelays[2]); /* x(n-2) */
    int32x2_t X_1 = vld1_s32(&pBiquadState->pDelays[0]); /* x(n-1) */
    int32x2_t Y_2 = vld1_s32(&pBiquadState->pDelays[6]); /* y(n-2) */
    int32x2_t Y_1 = vld1_s32(&pBiquadState->pDelays[4]); /* y(n-1) */

    for (ii = 0; ii < NrSamples; ii++)
    {
        int32x2_t s = vld1_s32(pDataIn); /* x(n) = {L, R} */

        /* 64-bit accumulation of the five Q30 products, then >> 30 to Q0. */
        int64x2_t r = vmull_s32(A2, X_2);
        r = vmlal_s32(r, A1, X_1);
        r = vmlal_s32(r, A0, s);
        r = vmlal_s32(r, B2, Y_2); /* adds because coef is -B2 */
        r = vmlal_s32(r, B1, Y_1); /* adds because coef is -B1 */

        int32_t ll = (int32_t)(vgetq_lane_s64(r, 0) >> 30);
        int32_t rr = (int32_t)(vgetq_lane_s64(r, 1) >> 30);

        pDataIn += 2;
        *pDataOut++ = ll;
        *pDataOut++ = rr;

        /* Shift the delay line.  vset_lane_s32 RETURNS the updated vector;
           its result must be assigned (the original code discarded it). */
        X_2 = X_1;                        /* x(n-2) = x(n-1) */
        Y_2 = Y_1;                        /* y(n-2) = y(n-1) */
        Y_1 = vset_lane_s32(ll, Y_1, 0);  /* y(n-1)L = ynL   */
        Y_1 = vset_lane_s32(rr, Y_1, 1);  /* y(n-1)R = ynR   */
        X_1 = s;                          /* x(n-1) = x(n)   */
    }

    /* Write the delay line back to the instance. */
    vst1_s32(&pBiquadState->pDelays[2], X_2);
    vst1_s32(&pBiquadState->pDelays[0], X_1);
    vst1_s32(&pBiquadState->pDelays[6], Y_2);
    vst1_s32(&pBiquadState->pDelays[4], Y_1);
#endif
}