/* Inner loop used for function WebRtcIsacfix_NormLatticeFilterMa(). It does: for 0 <= n < HALF_SUBFRAMELEN - 1: *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); Note, function WebRtcIsacfix_FilterMaLoopNeon and WebRtcIsacfix_FilterMaLoopC are not bit-exact. The accuracy by the ARM Neon function is same or better. */ void WebRtcIsacfix_FilterMaLoopC(int16_t input0, // Filter coefficient int16_t input1, // Filter coefficient int32_t input2, // Inverse coeff. (1/input1) int32_t* ptr0, // Sample buffer int32_t* ptr1, // Sample buffer int32_t* ptr2) { // Sample buffer int n = 0; // Separate the 32-bit variable input2 into two 16-bit integers (high 16 and // low 16 bits), for using LATTICE_MUL_32_32_RSFT16 in the loop. int16_t t16a = (int16_t)(input2 >> 16); int16_t t16b = (int16_t)input2; if (t16b < 0) t16a++; // The loop filtering the samples *ptr0, *ptr1, *ptr2 with filter coefficients // input0, input1, and input2. for(n = 0; n < HALF_SUBFRAMELEN - 1; n++, ptr0++, ptr1++, ptr2++) { int32_t tmp32a = 0; int32_t tmp32b = 0; // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)); tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); // Q15 * Q15 >> 15 = Q15 tmp32b = *ptr2 + tmp32a; // Q15 + Q15 = Q15 *ptr2 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b); // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); // Q15*Q15>>15 = Q15 tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); // Q15*Q15>>15 = Q15 *ptr1 = tmp32a + tmp32b; // Q15 + Q15 = Q15 } }
/* 1D parabolic interpolation . All input and output values are in Q8 */ static __inline void Intrp1DQ8(int32_t *x, int32_t *fx, int32_t *y, int32_t *fy) { int16_t sign1=1, sign2=1; int32_t r32, q32, t32, nom32, den32; int16_t t16, tmp16, tmp16_1; if ((fx[0]>0) && (fx[2]>0)) { r32=fx[1]-fx[2]; q32=fx[0]-fx[1]; nom32=q32+r32; den32=WEBRTC_SPL_MUL_32_16((q32-r32), 2); if (nom32<0) sign1=-1; if (den32<0) sign2=-1; /* t = (q32+r32)/(2*(q32-r32)) = (fx[0]-fx[1] + fx[1]-fx[2])/(2 * fx[0]-fx[1] - (fx[1]-fx[2]))*/ /* (Signs are removed because WebRtcSpl_DivResultInQ31 can't handle negative numbers) */ t32=WebRtcSpl_DivResultInQ31(WEBRTC_SPL_MUL_32_16(nom32, sign1),WEBRTC_SPL_MUL_32_16(den32, sign2)); /* t in Q31, without signs */ t16=(int16_t)WEBRTC_SPL_RSHIFT_W32(t32, 23); /* Q8 */ t16=t16*sign1*sign2; /* t in Q8 with signs */ *y = x[0]+t16; /* Q8 */ // *y = x[1]+t16; /* Q8 */ /* The following code calculates fy in three steps */ /* fy = 0.5 * t * (t-1) * fx[0] + (1-t*t) * fx[1] + 0.5 * t * (t+1) * fx[2]; */ /* Part I: 0.5 * t * (t-1) * fx[0] */ tmp16_1=(int16_t)WEBRTC_SPL_MUL_16_16(t16,t16); /* Q8*Q8=Q16 */ tmp16_1 = WEBRTC_SPL_RSHIFT_W16(tmp16_1,2); /* Q16>>2 = Q14 */ t16 = (int16_t)WEBRTC_SPL_MUL_16_16(t16, 64); /* Q8<<6 = Q14 */ tmp16 = tmp16_1-t16; *fy = WEBRTC_SPL_MUL_16_32_RSFT15(tmp16, fx[0]); /* (Q14 * Q8 >>15)/2 = Q8 */ /* Part II: (1-t*t) * fx[1] */ tmp16 = 16384-tmp16_1; /* 1 in Q14 - Q14 */ *fy += WEBRTC_SPL_MUL_16_32_RSFT14(tmp16, fx[1]);/* Q14 * Q8 >> 14 = Q8 */ /* Part III: 0.5 * t * (t+1) * fx[2] */ tmp16 = tmp16_1+t16; *fy += WEBRTC_SPL_MUL_16_32_RSFT15(tmp16, fx[2]);/* (Q14 * Q8 >>15)/2 = Q8 */ } else { *y = x[0]; *fy= fx[1]; } }
/* MA filter */ void WebRtcIsacfix_NormLatticeFilterMa(int16_t orderCoef, int32_t *stateGQ15, int16_t *lat_inQ0, int16_t *filt_coefQ15, int32_t *gain_lo_hiQ17, int16_t lo_hi, int16_t *lat_outQ9) { int16_t sthQ15[MAX_AR_MODEL_ORDER]; int16_t cthQ15[MAX_AR_MODEL_ORDER]; int u, i, k, n; int16_t temp2,temp3; int16_t ord_1 = orderCoef+1; int32_t inv_cthQ16[MAX_AR_MODEL_ORDER]; int32_t gain32, fQtmp; int16_t gain16; int16_t gain_sh; int32_t tmp32, tmp32b; int32_t fQ15vec[HALF_SUBFRAMELEN]; int32_t gQ15[MAX_AR_MODEL_ORDER+1][HALF_SUBFRAMELEN]; int16_t sh; int16_t t16a; int16_t t16b; for (u=0;u<SUBFRAMES;u++) { int32_t temp1 = WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN); /* set the Direct Form coefficients */ temp2 = (int16_t)WEBRTC_SPL_MUL_16_16(u, orderCoef); temp3 = (int16_t)WEBRTC_SPL_MUL_16_16(2, u)+lo_hi; /* compute lattice filter coefficients */ memcpy(sthQ15, &filt_coefQ15[temp2], orderCoef * sizeof(int16_t)); WebRtcSpl_SqrtOfOneMinusXSquared(sthQ15, orderCoef, cthQ15); /* compute the gain */ gain32 = gain_lo_hiQ17[temp3]; gain_sh = WebRtcSpl_NormW32(gain32); gain32 = WEBRTC_SPL_LSHIFT_W32(gain32, gain_sh); //Q(17+gain_sh) for (k=0;k<orderCoef;k++) { gain32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], gain32); //Q15*Q(17+gain_sh)>>15 = Q(17+gain_sh) inv_cthQ16[k] = WebRtcSpl_DivW32W16((int32_t)2147483647, cthQ15[k]); // 1/cth[k] in Q31/Q15 = Q16 } gain16 = (int16_t)(gain32 >> 16); // Q(1+gain_sh). /* normalized lattice filter */ /*****************************/ /* initial conditions */ for (i=0;i<HALF_SUBFRAMELEN;i++) { fQ15vec[i] = WEBRTC_SPL_LSHIFT_W32((int32_t)lat_inQ0[i + temp1], 15); //Q15 gQ15[0][i] = WEBRTC_SPL_LSHIFT_W32((int32_t)lat_inQ0[i + temp1], 15); //Q15 } fQtmp = fQ15vec[0]; /* get the state of f&g for the first input, for all orders */ for (i=1;i<ord_1;i++) { // Calculate f[i][0] = inv_cth[i-1]*(f[i-1][0] + sth[i-1]*stateG[i-1]); tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[i-1], stateGQ15[i-1]);//Q15*Q15>>15 = Q15 tmp32b= fQtmp + tmp32; //Q15+Q15=Q15 tmp32 = inv_cthQ16[i-1]; //Q16 t16a = (int16_t)(tmp32 >> 16); t16b = (int16_t) (tmp32-WEBRTC_SPL_LSHIFT_W32(((int32_t)t16a), 16)); if (t16b<0) t16a++; tmp32 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b); fQtmp = tmp32; // Q15 // Calculate g[i][0] = cth[i-1]*stateG[i-1] + sth[i-1]* f[i][0]; tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[i-1], stateGQ15[i-1]); //Q15*Q15>>15 = Q15 tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[i-1], fQtmp); //Q15*Q15>>15 = Q15 tmp32 = tmp32 + tmp32b;//Q15+Q15 = Q15 gQ15[i][0] = tmp32; // Q15 } /* filtering */ /* save the states */ for(k=0;k<orderCoef;k++) { // for 0 <= n < HALF_SUBFRAMELEN - 1: // f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]); // g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1]; WebRtcIsacfix_FilterMaLoopFix(sthQ15[k], cthQ15[k], inv_cthQ16[k], &gQ15[k][0], &gQ15[k+1][1], &fQ15vec[1]); } fQ15vec[0] = fQtmp; for(n=0;n<HALF_SUBFRAMELEN;n++) { //gain32 >>= gain_sh; // Q(17+gain_sh) -> Q17 tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(gain16, fQ15vec[n]); //Q(1+gain_sh)*Q15>>16 = Q(gain_sh) sh = 9-gain_sh; //number of needed shifts to reach Q9 t16a = (int16_t) WEBRTC_SPL_SHIFT_W32(tmp32, sh); lat_outQ9[n + temp1] = t16a; } /* save the states */ for (i=0;i<ord_1;i++) { stateGQ15[i] = gQ15[i][HALF_SUBFRAMELEN-1]; } //process next frame } return; }
/* filter the signal using normalized lattice filter */ void WebRtcIsacfix_NormLatticeFilterAr(int16_t orderCoef, int16_t *stateGQ0, int32_t *lat_inQ25, int16_t *filt_coefQ15, int32_t *gain_lo_hiQ17, int16_t lo_hi, int16_t *lat_outQ0) { int ii,n,k,i,u; int16_t sthQ15[MAX_AR_MODEL_ORDER]; int16_t cthQ15[MAX_AR_MODEL_ORDER]; int32_t tmp32; int16_t tmpAR; int16_t ARfQ0vec[HALF_SUBFRAMELEN]; int16_t ARgQ0vec[MAX_AR_MODEL_ORDER+1]; int32_t inv_gain32; int16_t inv_gain16; int16_t den16; int16_t sh; int16_t temp2,temp3; int16_t ord_1 = orderCoef+1; for (u=0;u<SUBFRAMES;u++) { int32_t temp1 = WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN); //set the denominator and numerator of the Direct Form temp2 = (int16_t)WEBRTC_SPL_MUL_16_16(u, orderCoef); temp3 = (int16_t)WEBRTC_SPL_MUL_16_16(2, u) + lo_hi; for (ii=0; ii<orderCoef; ii++) { sthQ15[ii] = filt_coefQ15[temp2+ii]; } WebRtcSpl_SqrtOfOneMinusXSquared(sthQ15, orderCoef, cthQ15); /* Simulation of the 25 files shows that maximum value in the vector gain_lo_hiQ17[] is 441344, which means that it is log2((2^31)/441344) = 12.2 shifting bits from saturation. Therefore, it should be safe to use Q27 instead of Q17. */ tmp32 = WEBRTC_SPL_LSHIFT_W32(gain_lo_hiQ17[temp3], 10); // Q27 for (k=0;k<orderCoef;k++) { tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], tmp32); // Q15*Q27>>15 = Q27 } sh = WebRtcSpl_NormW32(tmp32); // tmp32 is the gain den16 = (int16_t) WEBRTC_SPL_SHIFT_W32(tmp32, sh-16); //Q(27+sh-16) = Q(sh+11) (all 16 bits are value bits) inv_gain32 = WebRtcSpl_DivW32W16((int32_t)2147483647, den16); // 1/gain in Q31/Q(sh+11) = Q(20-sh) //initial conditions inv_gain16 = (int16_t)(inv_gain32 >> 2); // 1/gain in Q(20-sh-2) = Q(18-sh) for (i=0;i<HALF_SUBFRAMELEN;i++) { tmp32 = WEBRTC_SPL_LSHIFT_W32(lat_inQ25[i + temp1], 1); //Q25->Q26 tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(inv_gain16, tmp32); //lat_in[]*inv_gain in (Q(18-sh)*Q26)>>16 = Q(28-sh) tmp32 = WEBRTC_SPL_SHIFT_W32(tmp32, -(28-sh)); // lat_in[]*inv_gain in Q0 ARfQ0vec[i] = (int16_t)WebRtcSpl_SatW32ToW16(tmp32); // Q0 } for (i=orderCoef-1;i>=0;i--) //get the state of f&g for the first input, for all orders { tmp32 = (cthQ15[i] * ARfQ0vec[0] - sthQ15[i] * stateGQ0[i] + 16384) >> 15; tmpAR = (int16_t)WebRtcSpl_SatW32ToW16(tmp32); // Q0 tmp32 = (sthQ15[i] * ARfQ0vec[0] + cthQ15[i] * stateGQ0[i] + 16384) >> 15; ARgQ0vec[i+1] = (int16_t)WebRtcSpl_SatW32ToW16(tmp32); // Q0 ARfQ0vec[0] = tmpAR; } ARgQ0vec[0] = ARfQ0vec[0]; // Filter ARgQ0vec[] and ARfQ0vec[] through coefficients cthQ15[] and sthQ15[]. WebRtcIsacfix_FilterArLoop(ARgQ0vec, ARfQ0vec, cthQ15, sthQ15, orderCoef); for(n=0;n<HALF_SUBFRAMELEN;n++) { lat_outQ0[n + temp1] = ARfQ0vec[n]; } /* cannot use memcpy in the following */ for (i=0;i<ord_1;i++) { stateGQ0[i] = ARgQ0vec[i]; } } return; }
/* filter the signal using normalized lattice filter */ void WebRtcIsacfix_NormLatticeFilterAr(size_t orderCoef, int16_t *stateGQ0, int32_t *lat_inQ25, int16_t *filt_coefQ15, int32_t *gain_lo_hiQ17, int16_t lo_hi, int16_t *lat_outQ0) { size_t ii, k, i; int n, u; int16_t sthQ15[MAX_AR_MODEL_ORDER]; int16_t cthQ15[MAX_AR_MODEL_ORDER]; int32_t tmp32; int16_t tmpAR; int16_t ARfQ0vec[HALF_SUBFRAMELEN]; int16_t ARgQ0vec[MAX_AR_MODEL_ORDER+1]; int32_t inv_gain32; int16_t inv_gain16; int16_t den16; int16_t sh; int16_t temp2,temp3; size_t ord_1 = orderCoef+1; for (u=0;u<SUBFRAMES;u++) { int32_t temp1 = u * HALF_SUBFRAMELEN; //set the denominator and numerator of the Direct Form temp2 = (int16_t)(u * orderCoef); temp3 = (int16_t)(2 * u + lo_hi); for (ii=0; ii<orderCoef; ii++) { sthQ15[ii] = filt_coefQ15[temp2+ii]; } WebRtcSpl_SqrtOfOneMinusXSquared(sthQ15, orderCoef, cthQ15); // Originally, this line was assumed to never overflow, since "[s]imulation // of the 25 files shows that maximum value in the vector gain_lo_hiQ17[] // is 441344, which means that it is log2((2^31)/441344) = 12.2 shifting // bits from saturation. Therefore, it should be safe to use Q27 instead of // Q17." However, a fuzzer test succeeded in provoking an overflow here, // which we ignore on the theory that only "abnormal" inputs cause // overflow. tmp32 = OverflowingLShiftS32(gain_lo_hiQ17[temp3], 10); // Q27 for (k=0;k<orderCoef;k++) { tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], tmp32); // Q15*Q27>>15 = Q27 } sh = WebRtcSpl_NormW32(tmp32); // tmp32 is the gain den16 = (int16_t) WEBRTC_SPL_SHIFT_W32(tmp32, sh-16); //Q(27+sh-16) = Q(sh+11) (all 16 bits are value bits) inv_gain32 = WebRtcSpl_DivW32W16((int32_t)2147483647, den16); // 1/gain in Q31/Q(sh+11) = Q(20-sh) //initial conditions inv_gain16 = (int16_t)(inv_gain32 >> 2); // 1/gain in Q(20-sh-2) = Q(18-sh) for (i=0;i<HALF_SUBFRAMELEN;i++) { tmp32 = OverflowingLShiftS32(lat_inQ25[i + temp1], 1); // Q25->Q26 tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(inv_gain16, tmp32); //lat_in[]*inv_gain in (Q(18-sh)*Q26)>>16 = Q(28-sh) tmp32 = WEBRTC_SPL_SHIFT_W32(tmp32, -(28-sh)); // lat_in[]*inv_gain in Q0 ARfQ0vec[i] = (int16_t)WebRtcSpl_SatW32ToW16(tmp32); // Q0 } // Get the state of f & g for the first input, for all orders. for (i = orderCoef; i > 0; i--) { tmp32 = (cthQ15[i - 1] * ARfQ0vec[0] - sthQ15[i - 1] * stateGQ0[i - 1] + 16384) >> 15; tmpAR = (int16_t)WebRtcSpl_SatW32ToW16(tmp32); // Q0 tmp32 = (sthQ15[i - 1] * ARfQ0vec[0] + cthQ15[i - 1] * stateGQ0[i - 1] + 16384) >> 15; ARgQ0vec[i] = (int16_t)WebRtcSpl_SatW32ToW16(tmp32); // Q0 ARfQ0vec[0] = tmpAR; } ARgQ0vec[0] = ARfQ0vec[0]; // Filter ARgQ0vec[] and ARfQ0vec[] through coefficients cthQ15[] and sthQ15[]. WebRtcIsacfix_FilterArLoop(ARgQ0vec, ARfQ0vec, cthQ15, sthQ15, orderCoef); for(n=0;n<HALF_SUBFRAMELEN;n++) { lat_outQ0[n + temp1] = ARfQ0vec[n]; } /* cannot use memcpy in the following */ for (i=0;i<ord_1;i++) { stateGQ0[i] = ARgQ0vec[i]; } } return; }
// Contains a function for the core loop in the normalized lattice MA // filter routine for iSAC codec, optimized for ARM Neon platform. // It does: // for 0 <= n < HALF_SUBFRAMELEN - 1: // *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); // Output is not bit-exact with the reference C code, due to the replacement // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon // instructions. The difference should not be bigger than 1. void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient int16_t input1, // Filter coefficient int32_t input2, // Inverse coefficient int32_t* ptr0, // Sample buffer int32_t* ptr1, // Sample buffer int32_t* ptr2) // Sample buffer { int n = 0; int loop = (HALF_SUBFRAMELEN - 1) >> 3; int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7; int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16); int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16); int32x4_t input2_v = vdupq_n_s32(input2); int32x4_t tmp0a, tmp1a, tmp2a, tmp3a; int32x4_t tmp0b, tmp1b, tmp2b, tmp3b; int32x4_t ptr0va, ptr1va, ptr2va; int32x4_t ptr0vb, ptr1vb, ptr2vb; // Unroll to process 8 samples at once. for (n = 0; n < loop; n++) { ptr0va = vld1q_s32(ptr0); ptr0vb = vld1q_s32(ptr0 + 4); ptr0 += 8; ptr2va = vld1q_s32(ptr2); ptr2vb = vld1q_s32(ptr2 + 4); // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); tmp0b = vqrdmulhq_s32(ptr0vb, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); tmp1b = vqrdmulhq_s32(ptr0vb, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2b = vaddq_s32(tmp0b, ptr2vb); tmp2a = vshlq_n_s32(tmp2a, 15); tmp2b = vshlq_n_s32(tmp2b, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); ptr2vb = vqrdmulhq_s32(tmp2b, input2_v); vst1q_s32(ptr2, ptr2va); vst1q_s32(ptr2 + 4, ptr2vb); ptr2 += 8; // Calculate tmp3 = ptr2v * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); tmp3b = vqrdmulhq_s32(ptr2vb, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); ptr1vb = vaddq_s32(tmp1b, tmp3b); vst1q_s32(ptr1, ptr1va); vst1q_s32(ptr1 + 4, ptr1vb); ptr1 += 8; } // Process four more samples. if (loop_tail & 0x4) { ptr0va = vld1q_s32(ptr0); ptr2va = vld1q_s32(ptr2); ptr0 += 4; // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2a = vshlq_n_s32(tmp2a, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); vst1q_s32(ptr2, ptr2va); ptr2 += 4; // Calculate tmp3 = *(ptr2) * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); vst1q_s32(ptr1, ptr1va); ptr1 += 4; } // Process two more samples. if (loop_tail & 0x2) { int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail; int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail; ptr0v_tail = vld1_s32(ptr0); ptr2v_tail = vld1_s32(ptr2); ptr0 += 2; // Calculate tmp0 = (*ptr0) * input0. tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v)); // Calculate tmp1 = (*ptr0) * input1. tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v)); // Calculate tmp2 = tmp0 + *(ptr2). tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail); tmp2_tail = vshl_n_s32(tmp2_tail, 15); // Calculate *ptr2 = input2 * tmp2. ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v)); vst1_s32(ptr2, ptr2v_tail); ptr2 += 2; // Calculate tmp3 = *(ptr2) * input0. tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v)); // Calculate *ptr1 = tmp1 + tmp3. ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail); vst1_s32(ptr1, ptr1v_tail); ptr1 += 2; } // Process one more sample. if (loop_tail & 0x1) { int16_t t16a = (int16_t)(input2 >> 16); int16_t t16b = (int16_t)input2; if (t16b < 0) t16a++; int32_t tmp32a; int32_t tmp32b; // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); tmp32b = *ptr2 + tmp32a; *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) + (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b))); // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); *ptr1 = tmp32a + tmp32b; }