void test_vst1s32 (void) { int32_t *arg0_int32_t; int32x2_t arg1_int32x2_t; vst1_s32 (arg0_int32_t, arg1_int32x2_t); }
test_vdup_lane_s32 () { int32x2_t a; int32x2_t b; int i; int32_t c[2] = { 0, 1 }; int32_t d[2]; a = vld1_s32 (c); b = wrap_vdup_lane_s32_0 (a); vst1_s32 (d, b); for (i = 0; i < 2; i++) if (c[0] != d[i]) return 1; b = wrap_vdup_lane_s32_1 (a); vst1_s32 (d, b); for (i = 0; i < 2; i++) if (c[1] != d[i]) return 1; return 0; }
test_vreinterpret_s32_f64 () { float64x1_t a; int32x2_t b; float64_t c[1] = { PI_F64 }; int32_t d[2] = { 0x54442D18, 0x400921FB }; int32_t e[2]; int i; a = vld1_f64 (c); b = wrap_vreinterpret_s32_f64 (a); vst1_s32 (e, b); for (i = 0; i < 2; i++) if (d[i] != e[i]) return 1; return 0; };
void test6(float *p, int32x2_t v) { return vst1_s32(p, v); // expected-warning {{incompatible pointer types}} }
inline void vst1(s32 * ptr, const int32x2_t & v) { return vst1_s32(ptr, v); }
// Contains a function for the core loop in the normalized lattice MA // filter routine for iSAC codec, optimized for ARM Neon platform. // It does: // for 0 <= n < HALF_SUBFRAMELEN - 1: // *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); // Output is not bit-exact with the reference C code, due to the replacement // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon // instructions. The difference should not be bigger than 1. void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient int16_t input1, // Filter coefficient int32_t input2, // Inverse coefficient int32_t* ptr0, // Sample buffer int32_t* ptr1, // Sample buffer int32_t* ptr2) // Sample buffer { int n = 0; int loop = (HALF_SUBFRAMELEN - 1) >> 3; int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7; int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16); int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16); int32x4_t input2_v = vdupq_n_s32(input2); int32x4_t tmp0a, tmp1a, tmp2a, tmp3a; int32x4_t tmp0b, tmp1b, tmp2b, tmp3b; int32x4_t ptr0va, ptr1va, ptr2va; int32x4_t ptr0vb, ptr1vb, ptr2vb; // Unroll to process 8 samples at once. for (n = 0; n < loop; n++) { ptr0va = vld1q_s32(ptr0); ptr0vb = vld1q_s32(ptr0 + 4); ptr0 += 8; ptr2va = vld1q_s32(ptr2); ptr2vb = vld1q_s32(ptr2 + 4); // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); tmp0b = vqrdmulhq_s32(ptr0vb, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); tmp1b = vqrdmulhq_s32(ptr0vb, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2b = vaddq_s32(tmp0b, ptr2vb); tmp2a = vshlq_n_s32(tmp2a, 15); tmp2b = vshlq_n_s32(tmp2b, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); ptr2vb = vqrdmulhq_s32(tmp2b, input2_v); vst1q_s32(ptr2, ptr2va); vst1q_s32(ptr2 + 4, ptr2vb); ptr2 += 8; // Calculate tmp3 = ptr2v * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); tmp3b = vqrdmulhq_s32(ptr2vb, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); ptr1vb = vaddq_s32(tmp1b, tmp3b); vst1q_s32(ptr1, ptr1va); vst1q_s32(ptr1 + 4, ptr1vb); ptr1 += 8; } // Process four more samples. if (loop_tail & 0x4) { ptr0va = vld1q_s32(ptr0); ptr2va = vld1q_s32(ptr2); ptr0 += 4; // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2a = vshlq_n_s32(tmp2a, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); vst1q_s32(ptr2, ptr2va); ptr2 += 4; // Calculate tmp3 = *(ptr2) * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); vst1q_s32(ptr1, ptr1va); ptr1 += 4; } // Process two more samples. if (loop_tail & 0x2) { int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail; int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail; ptr0v_tail = vld1_s32(ptr0); ptr2v_tail = vld1_s32(ptr2); ptr0 += 2; // Calculate tmp0 = (*ptr0) * input0. tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v)); // Calculate tmp1 = (*ptr0) * input1. tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v)); // Calculate tmp2 = tmp0 + *(ptr2). tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail); tmp2_tail = vshl_n_s32(tmp2_tail, 15); // Calculate *ptr2 = input2 * tmp2. ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v)); vst1_s32(ptr2, ptr2v_tail); ptr2 += 2; // Calculate tmp3 = *(ptr2) * input0. tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v)); // Calculate *ptr1 = tmp1 + tmp3. ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail); vst1_s32(ptr1, ptr1v_tail); ptr1 += 2; } // Process one more sample. if (loop_tail & 0x1) { int16_t t16a = (int16_t)(input2 >> 16); int16_t t16b = (int16_t)input2; if (t16b < 0) t16a++; int32_t tmp32a; int32_t tmp32b; // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); tmp32b = *ptr2 + tmp32a; *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) + (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b))); // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); *ptr1 = tmp32a + tmp32b; }
void BQ_2I_D32F32C30_TRC_WRA_01 ( Biquad_Instance_t *pInstance, LVM_INT32 *pDataIn, LVM_INT32 *pDataOut, LVM_INT16 NrSamples) { #if !(defined __ARM_HAVE_NEON) LVM_INT32 ynL,ynR,templ,tempd; LVM_INT16 ii; PFilter_State pBiquadState = (PFilter_State) pInstance; for (ii = NrSamples; ii != 0; ii--) { /************************************************************************** PROCESSING OF THE LEFT CHANNEL ***************************************************************************/ /* ynL= ( A2 (Q30) * x(n-2)L (Q0) ) >>30 in Q0*/ MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[2],ynL,30) /* ynL+= ( A1 (Q30) * x(n-1)L (Q0) ) >> 30 in Q0*/ MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[0],templ,30) ynL+=templ; /* ynL+= ( A0 (Q30) * x(n)L (Q0) ) >> 30 in Q0*/ MUL32x32INTO32(pBiquadState->coefs[2],*pDataIn,templ,30) ynL+=templ; /* ynL+= (-B2 (Q30) * y(n-2)L (Q0) ) >> 30 in Q0*/ MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[6],templ,30) ynL+=templ; /* ynL+= (-B1 (Q30) * y(n-1)L (Q0) ) >> 30 in Q0 */ MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[4],templ,30) ynL+=templ; /************************************************************************** PROCESSING OF THE RIGHT CHANNEL ***************************************************************************/ /* ynR= ( A2 (Q30) * x(n-2)R (Q0) ) >> 30 in Q0*/ MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[3],ynR,30) /* ynR+= ( A1 (Q30) * x(n-1)R (Q0) ) >> 30 in Q0*/ MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[1],templ,30) ynR+=templ; /* ynR+= ( A0 (Q30) * x(n)R (Q0) ) >> 30 in Q0*/ tempd=*(pDataIn+1); MUL32x32INTO32(pBiquadState->coefs[2],tempd,templ,30) ynR+=templ; /* ynR+= (-B2 (Q30) * y(n-2)R (Q0) ) >> 30 in Q0*/ MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[7],templ,30) ynR+=templ; /* ynR+= (-B1 (Q30) * y(n-1)R (Q0) ) >> 30 in Q0 */ MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[5],templ,30) ynR+=templ; /************************************************************************** UPDATING THE DELAYS ***************************************************************************/ pBiquadState->pDelays[7]=pBiquadState->pDelays[5]; /* y(n-2)R=y(n-1)R*/ pBiquadState->pDelays[6]=pBiquadState->pDelays[4]; /* y(n-2)L=y(n-1)L*/ pBiquadState->pDelays[3]=pBiquadState->pDelays[1]; /* x(n-2)R=x(n-1)R*/ pBiquadState->pDelays[2]=pBiquadState->pDelays[0]; /* x(n-2)L=x(n-1)L*/ pBiquadState->pDelays[5]=(LVM_INT32)ynR; /* Update y(n-1)R in Q0*/ pBiquadState->pDelays[4]=(LVM_INT32)ynL; /* Update y(n-1)L in Q0*/ pBiquadState->pDelays[0]=(*pDataIn); /* Update x(n-1)L in Q0*/ pDataIn++; pBiquadState->pDelays[1]=(*pDataIn); /* Update x(n-1)R in Q0*/ pDataIn++; /************************************************************************** WRITING THE OUTPUT ***************************************************************************/ *pDataOut=(LVM_INT32)ynL; /* Write Left output in Q0*/ pDataOut++; *pDataOut=(LVM_INT32)ynR; /* Write Right ouput in Q0*/ pDataOut++; } #else LVM_INT16 ii=0; PFilter_State pBiquadState = (PFilter_State) pInstance; int32x2_t A2 = vdup_n_s32(pBiquadState->coefs[0]); int32x2_t A1 = vdup_n_s32(pBiquadState->coefs[1]); int32x2_t A0 = vdup_n_s32(pBiquadState->coefs[2]); int32x2_t B2 = vdup_n_s32(pBiquadState->coefs[3]); int32x2_t B1 = vdup_n_s32(pBiquadState->coefs[4]); int32x2_t X_2 = vld1_s32(&pBiquadState->pDelays[2]); int32x2_t X_1 = vld1_s32(&pBiquadState->pDelays[0]); int32x2_t Y_2 = vld1_s32(&pBiquadState->pDelays[6]); int32x2_t Y_1 = vld1_s32(&pBiquadState->pDelays[4]); for(ii=0; ii<NrSamples; ii++){ int32x2_t s = vld1_s32(pDataIn); int64x2_t r = vmull_s32(A2, X_2); r = vmlal_s32(r, A1, X_1); r = vmlal_s32(r, A0, s); r = vmlal_s32(r, B2, Y_2); r = vmlal_s32(r, B1, Y_1); int32_t ll =(int32_t)( vgetq_lane_s64(r, 0) >> 30); int32_t rr =(int32_t)( vgetq_lane_s64(r, 1) >> 30); pDataIn += 2; *pDataOut ++ = ll; *pDataOut ++ = rr; int32_t tmp1, tmp2; tmp1 = vget_lane_s32(X_1, 0); tmp2 = vget_lane_s32(X_1, 1); vset_lane_s32(tmp1, X_2, 0); vset_lane_s32(tmp2, X_2, 1); tmp1 = vget_lane_s32(Y_1, 0); tmp2 = vget_lane_s32(Y_1, 1); vset_lane_s32(tmp1, Y_2, 0); vset_lane_s32(tmp2, Y_2, 1); vset_lane_s32(ll, Y_1, 0); vset_lane_s32(rr, Y_1, 1); tmp1 = vget_lane_s32(s, 0); tmp2 = vget_lane_s32(s, 1); vset_lane_s32(tmp1, X_1, 0); vset_lane_s32(tmp2, X_1, 1); } vst1_s32(&pBiquadState->pDelays[2], X_2); vst1_s32(&pBiquadState->pDelays[0], X_1); vst1_s32(&pBiquadState->pDelays[6], Y_2); vst1_s32(&pBiquadState->pDelays[4], Y_1); #endif }