Example No. 1
int
test_vdups_lane_s32 ()
{
  int32x2_t a;
  int32_t b;
  int32_t c[2] = { 0, 1 };

  a = vld1_s32 (c);
  b = wrap_vdups_lane_s32_0 (vcreate_s32 (0), a);
  if (c[0] != b)
    return 1;
  b = wrap_vdups_lane_s32_1 (a);
  if (c[1] != b)
    return 1;
  return 0;
}
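The wrap_* helpers are not shown in this listing. In the GCC AArch64 testsuite they are thin noinline wrappers around the intrinsic, so the lane index stays a compile-time constant inside each wrapper; a minimal sketch, assuming that convention:

#include <arm_neon.h>

/* Hypothetical reconstruction of the missing helpers; the dummy
   argument only perturbs register allocation in the real test. */
int32_t __attribute__ ((noinline))
wrap_vdups_lane_s32_0 (int32x2_t dummy, int32x2_t a)
{
  return vdups_lane_s32 (a, 0);   /* extract lane 0 as a scalar */
}

int32_t __attribute__ ((noinline))
wrap_vdups_lane_s32_1 (int32x2_t a)
{
  return vdups_lane_s32 (a, 1);   /* extract lane 1 as a scalar */
}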
Example No. 2
int
test_vreinterpret_f64_s32 ()
{
  int32x2_t a;
  float64x1_t b;
  int32_t c[2] = { 0x54442D18, 0x400921FB };
  float64_t d[1] = { PI_F64 };
  float64_t e[1];
  int i;

  a = vld1_s32 (c);
  b = wrap_vreinterpret_f64_s32 (a);
  vst1_f64 (e, b);
  if (!DOUBLE_EQUALS (d[0], e[0], __DBL_EPSILON__))
    return 1;
  return 0;
}
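The helpers here are also not shown. The two words { 0x54442D18, 0x400921FB } are the little-endian halves of 0x400921FB54442D18, the IEEE-754 encoding of pi, so the reinterpreted double must compare equal to PI_F64. A plausible sketch of the missing definitions:

#include <math.h>

/* Assumed helper definitions, matching the bit pattern used above. */
#define PI_F64 3.141592653589793
#define DOUBLE_EQUALS(a, b, eps) (fabs ((a) - (b)) <= (eps))

float64x1_t __attribute__ ((noinline))
wrap_vreinterpret_f64_s32 (int32x2_t a)
{
  return vreinterpret_f64_s32 (a);  /* pure bit reinterpretation, no conversion */
}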
Example No. 3
int
test_vdupq_lane_s32 ()
{
  int32x2_t a;
  int32x4_t b;
  int i;
  int32_t c[2] = { 0, 1 };
  int32_t d[4];

  a = vld1_s32 (c);
  b = wrap_vdupq_lane_s32_0 (a);
  vst1q_s32 (d, b);
  for (i = 0; i < 4; i++)
    if (c[0] != d[i])
      return 1;

  b = wrap_vdupq_lane_s32_1 (a);
  vst1q_s32 (d, b);
  for (i = 0; i < 4; i++)
    if (c[1] != d[i])
      return 1;
  return 0;
}
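As in Example No. 1, the q-form wrappers are presumably noinline shims that pin the lane index; a sketch of one of them:

int32x4_t __attribute__ ((noinline))
wrap_vdupq_lane_s32_0 (int32x2_t a)
{
  return vdupq_lane_s32 (a, 0);   /* broadcast lane 0 across all four lanes */
}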
Example No. 4
void silk_biquad_alt_stride2_neon(
    const opus_int16            *in,                /* I     input signal                                               */
    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
    opus_int16                  *out,               /* O     output signal                                              */
    const opus_int32            len                 /* I     signal length (must be even)                               */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int        k            = 0;
    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
    int16x4_t       in_s16x4  = vdup_n_s16( 0 );
    int16x4_t       out_s16x4;
    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;

#ifdef OPUS_CHECK_ASM
    opus_int32 S_c[ 4 ];
    VARDECL( opus_int16, out_c );
    SAVE_STACK;
    ALLOC( out_c, 2 * len, opus_int16 );

    silk_memcpy( &S_c, S, sizeof( S_c ) );
    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
#endif

    /* Negate A_Q28 values and split in two parts */
    A_Q28_s32x2 = vld1_s32( A_Q28 );
    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
    A_L_s32x2   = vshl_n_s32( A_Q28_s32x2, 18 );                                                        /* ( -A_Q28[] & 0x00003FFF ) << 18                                                     */
    A_L_s32x2   = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) );           /* ( -A_Q28[] & 0x00003FFF ) << 15                                                     */
    A_U_s32x2   = vshr_n_s32( A_Q28_s32x2, 14 );                                                        /* silk_RSHIFT( -A_Q28[], 14 )                                                         */
    A_U_s32x2   = vshl_n_s32( A_U_s32x2, 16 );                                                          /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */
    A_U_s32x2   = vshr_n_s32( A_U_s32x2, 1 );                                                           /* silk_RSHIFT( -A_Q28[], 14 ) << 15                                                   */

    B_Q28_s32x2  = vld1_s32( B_Q28 );
    t_s32x2      = vld1_s32( B_Q28 + 1 );
    t0_s32x2x2   = vzip_s32( A_L_s32x2, A_L_s32x2 );
    t1_s32x2x2   = vzip_s32( A_U_s32x2, A_U_s32x2 );
    t2_s32x2x2   = vzip_s32( t_s32x2, t_s32x2 );
    A_L_s32x4    = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_L_Q28          */
    A_U_s32x4    = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_U_Q28          */
    B_Q28_s32x4  = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] );                            /* B_Q28[ {1,1,2,2} ]        */
    S_s32x4      = vld1q_s32( S );                                                                      /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
    S_s32x2x2    = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) );                       /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
    S_s32x4      = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );

    for( ; k < len - 1; k += 2 ) {
        int32x4_t in_s32x4[ 2 ], t_s32x4;
        int32x2_t out32_Q14_s32x2[ 2 ];

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
        in_s16x4      = vld1_s16( &in[ 2 * k ] );                                                       /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */
        in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 );                                                    /* in{0,1,2,3} << 15                      */
        t_s32x4       = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 );                             /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */
        in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15                      */
        in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15                      */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] );                   /* out32_Q14_{0,1,2,3}                                                                                        */
        out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 );                                   /* out32_Q14_{0,1,2,3} + (1<<14) - 1                                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) )                             */
        vst1_s16( &out[ 2 * k ], out_s16x4 );                                                           /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
    }

    /* Process leftover. */
    if( k < len ) {
        int32x4_t in_s32x4;
        int32x2_t out32_Q14_s32x2;

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s32x4     = vshll_n_s16( in_s16x4, 15 );                                                     /* in{0,1} << 15                      */
        t_s32x2      = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 );                    /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */
        in_s32x4     = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) );              /* in{0,1,0,1} << 15                  */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 );                                    /* out32_Q14_{0,1} + (1<<14) - 1                                                              */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 );                             /* out32_Q14_{0,1,0,1} + (1<<14) - 1                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) )             */
        vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 );                                               /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
        vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 );                                               /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
    }

    vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 );                                                              /* S[ 0 ] = S0; */
    vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 );                                                              /* S[ 1 ] = S2; */
    vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 );                                                              /* S[ 2 ] = S1; */
    vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 );                                                              /* S[ 3 ] = S3; */

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
    silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
    RESTORE_STACK;
#endif
}
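Throughout this function, silk_SMULWB is emulated by pre-shifting the 16-bit input left by 15 and using vqdmulh, which computes a saturating (2 * a * b) >> 32. A scalar sketch of the identity, with SMULWB semantics assumed from the SILK sources:

#include <stdint.h>

/* silk_SMULWB(b, in): b times the bottom 16 bits of in, top 32 bits kept. */
static inline int32_t smulwb_ref(int32_t b, int16_t in) {
  return (int32_t)(((int64_t)b * in) >> 16);
}

/* Per-lane vqdmulh, ignoring saturation: (2 * a * b) >> 32. */
static inline int32_t vqdmulh_ref(int32_t a, int32_t b) {
  return (int32_t)((2 * (int64_t)a * b) >> 32);
}

/* vqdmulh_ref(in << 15, b) == ((int64_t)in * b) >> 16 == smulwb_ref(b, in) */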
Example No. 5
inline int32x2_t vld1(const s32 * ptr) { return vld1_s32(ptr); }
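This reads like one member of an overload family (OpenCV's carotene library uses this pattern) so that templated kernels can load any element type through a single name. Sketches of hypothetical siblings, assuming the library's s16/u32 typedefs:

inline uint32x2_t vld1(const u32 * ptr) { return vld1_u32(ptr); }
inline int16x4_t  vld1(const s16 * ptr) { return vld1_s16(ptr); }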
Example No. 6
// Contains a function for the core loop in the normalized lattice MA
// filter routine for iSAC codec, optimized for ARM Neon platform.
// It does:
//  for 0 <= n < HALF_SUBFRAMELEN - 1:
//    *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
//    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
// Output is not bit-exact with the reference C code, due to the replacement
// of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
// instructions. The difference should not be bigger than 1.
void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
                                    int16_t input1,  // Filter coefficient
                                    int32_t input2,  // Inverse coefficient
                                    int32_t* ptr0,   // Sample buffer
                                    int32_t* ptr1,   // Sample buffer
                                    int32_t* ptr2)   // Sample buffer
{
  int n = 0;
  int loop = (HALF_SUBFRAMELEN - 1) >> 3;
  int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;

  int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
  int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
  int32x4_t input2_v = vdupq_n_s32(input2);
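  // input0/input1 are pre-shifted left by 16 so that vqrdmulhq_s32, a rounded
  // saturating (2 * a * b) >> 32, yields (sample * coeff) >> 15 -- the scaling
  // of WEBRTC_SPL_MUL_16_32_RSFT15, up to the rounding bit.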
  int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
  int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
  int32x4_t ptr0va, ptr1va, ptr2va;
  int32x4_t ptr0vb, ptr1vb, ptr2vb;

  // Unroll to process 8 samples at once.
  for (n = 0; n < loop; n++) {
    ptr0va = vld1q_s32(ptr0);
    ptr0vb = vld1q_s32(ptr0 + 4);
    ptr0 += 8;

    ptr2va = vld1q_s32(ptr2);
    ptr2vb = vld1q_s32(ptr2 + 4);

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
    tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
    tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2b = vaddq_s32(tmp0b, ptr2vb);
    tmp2a = vshlq_n_s32(tmp2a, 15);
    tmp2b = vshlq_n_s32(tmp2b, 15);
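    // The left shift by 15 makes the rounded doubling multiply below compute
    // (tmp2 * input2) >> 16, i.e. LATTICE_MUL_32_32_RSFT16 with rounding.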

    // Calculate *ptr2 = input2 * tmp2.
    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
    ptr2vb = vqrdmulhq_s32(tmp2b, input2_v);

    vst1q_s32(ptr2, ptr2va);
    vst1q_s32(ptr2 + 4, ptr2vb);
    ptr2 += 8;

    // Calculate tmp3 = ptr2v * input0.
    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
    tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1va = vaddq_s32(tmp1a, tmp3a);
    ptr1vb = vaddq_s32(tmp1b, tmp3b);

    vst1q_s32(ptr1, ptr1va);
    vst1q_s32(ptr1 + 4, ptr1vb);
    ptr1 += 8;
  }

  // Process four more samples.
  if (loop_tail & 0x4) {
    ptr0va = vld1q_s32(ptr0);
    ptr2va = vld1q_s32(ptr2);
    ptr0 += 4;

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2a = vshlq_n_s32(tmp2a, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);

    vst1q_s32(ptr2, ptr2va);
    ptr2 += 4;

    // Calculate tmp3 = *(ptr2) * input0.
    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1va = vaddq_s32(tmp1a, tmp3a);

    vst1q_s32(ptr1, ptr1va);
    ptr1 += 4;
  }

  // Process two more samples.
  if (loop_tail & 0x2) {
    int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
    int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
    ptr0v_tail = vld1_s32(ptr0);
    ptr2v_tail = vld1_s32(ptr2);
    ptr0 += 2;

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
    tmp2_tail = vshl_n_s32(tmp2_tail, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v));

    vst1_s32(ptr2, ptr2v_tail);
    ptr2 += 2;

    // Calculate tmp3 = *(ptr2) * input0.
    tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);

    vst1_s32(ptr1, ptr1v_tail);
    ptr1 += 2;
  }

  // Process one more sample.
  if (loop_tail & 0x1) {
    int16_t t16a = (int16_t)(input2 >> 16);
    int16_t t16b = (int16_t)input2;
    if (t16b < 0) t16a++;
    int32_t tmp32a;
    int32_t tmp32b;

    // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
    tmp32b = *ptr2 + tmp32a;
    *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
                       (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));

    // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
    tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
    *ptr1 = tmp32a + tmp32b;
  }
}
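The off-by-one note in the header comment comes from rounding: vqrdmulhq_s32 rounds where the reference macros truncate. A scalar sketch of both semantics, assuming the usual WebRTC fixed-point definitions (not copied from the headers):

#include <stdint.h>

/* WEBRTC_SPL_MUL_16_32_RSFT15(a, b): (a * b) >> 15 via a 64-bit product. */
static inline int32_t mul_16_32_rsft15(int16_t a, int32_t b) {
  return (int32_t)(((int64_t)a * b) >> 15);
}

/* Per-lane behavior of vqrdmulhq_s32, ignoring the single saturating case
   a == b == INT32_MIN: rounded (2 * a * b) >> 32. */
static inline int32_t vqrdmulh_ref(int32_t a, int32_t b) {
  int64_t p = (int64_t)a * b;
  return (int32_t)((p + (1LL << 30)) >> 31);
}

/* vqrdmulh_ref(x, (int32_t)coeff << 16) == mul_16_32_rsft15(coeff, x)
   up to the rounding bit added by vqrdmulh. */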
Example No. 7
void BQ_2I_D32F32C30_TRC_WRA_01 (           Biquad_Instance_t        *pInstance,
                                            LVM_INT32                *pDataIn,
                                            LVM_INT32                *pDataOut,
                                            LVM_INT16                 NrSamples)
    {
#if !(defined  __ARM_HAVE_NEON)
        LVM_INT32 ynL,ynR,templ,tempd;
        LVM_INT16 ii;
        PFilter_State pBiquadState = (PFilter_State) pInstance;

         for (ii = NrSamples; ii != 0; ii--)
         {


            /**************************************************************************
                            PROCESSING OF THE LEFT CHANNEL
            ***************************************************************************/
            /* ynL= ( A2 (Q30) * x(n-2)L (Q0) ) >>30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[2],ynL,30)

            /* ynL+= ( A1 (Q30) * x(n-1)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[0],templ,30)
            ynL+=templ;

            /* ynL+= ( A0 (Q30) * x(n)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[2],*pDataIn,templ,30)
            ynL+=templ;

             /* ynL+= (-B2 (Q30) * y(n-2)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[6],templ,30)
            ynL+=templ;

            /* ynL+= (-B1 (Q30) * y(n-1)L (Q0) ) >> 30 in Q0 */
            MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[4],templ,30)
            ynL+=templ;

            /**************************************************************************
                            PROCESSING OF THE RIGHT CHANNEL
            ***************************************************************************/
            /* ynR= ( A2 (Q30) * x(n-2)R (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[3],ynR,30)

            /* ynR+= ( A1 (Q30) * x(n-1)R (Q0) ) >> 30  in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[1],templ,30)
            ynR+=templ;

            /* ynR+= ( A0 (Q30) * x(n)R (Q0) ) >> 30 in Q0*/
            tempd=*(pDataIn+1);
            MUL32x32INTO32(pBiquadState->coefs[2],tempd,templ,30)
            ynR+=templ;

            /* ynR+= (-B2 (Q30) * y(n-2)R (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[7],templ,30)
            ynR+=templ;

            /* ynR+= (-B1 (Q30) * y(n-1)R (Q0) ) >> 30 in Q0 */
            MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[5],templ,30)
            ynR+=templ;

            /**************************************************************************
                            UPDATING THE DELAYS
            ***************************************************************************/
            pBiquadState->pDelays[7]=pBiquadState->pDelays[5]; /* y(n-2)R=y(n-1)R*/
            pBiquadState->pDelays[6]=pBiquadState->pDelays[4]; /* y(n-2)L=y(n-1)L*/
            pBiquadState->pDelays[3]=pBiquadState->pDelays[1]; /* x(n-2)R=x(n-1)R*/
            pBiquadState->pDelays[2]=pBiquadState->pDelays[0]; /* x(n-2)L=x(n-1)L*/
            pBiquadState->pDelays[5]=(LVM_INT32)ynR; /* Update y(n-1)R in Q0*/
            pBiquadState->pDelays[4]=(LVM_INT32)ynL; /* Update y(n-1)L in Q0*/
            pBiquadState->pDelays[0]=(*pDataIn); /* Update x(n-1)L in Q0*/
            pDataIn++;
            pBiquadState->pDelays[1]=(*pDataIn); /* Update x(n-1)R in Q0*/
            pDataIn++;

            /**************************************************************************
                            WRITING THE OUTPUT
            ***************************************************************************/
            *pDataOut=(LVM_INT32)ynL; /* Write Left output in Q0*/
            pDataOut++;
            *pDataOut=(LVM_INT32)ynR; /* Write Right output in Q0*/
            pDataOut++;


        }
#else
        LVM_INT16 ii = 0;

		PFilter_State pBiquadState = (PFilter_State) pInstance;

		int32x2_t A2 = vdup_n_s32(pBiquadState->coefs[0]);
		int32x2_t A1 = vdup_n_s32(pBiquadState->coefs[1]);
		int32x2_t A0 = vdup_n_s32(pBiquadState->coefs[2]);
		int32x2_t B2 = vdup_n_s32(pBiquadState->coefs[3]);
		int32x2_t B1 = vdup_n_s32(pBiquadState->coefs[4]);
		
		int32x2_t X_2 = vld1_s32(&pBiquadState->pDelays[2]);
		int32x2_t X_1 = vld1_s32(&pBiquadState->pDelays[0]);
		int32x2_t Y_2 = vld1_s32(&pBiquadState->pDelays[6]);
		int32x2_t Y_1 = vld1_s32(&pBiquadState->pDelays[4]);
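		/* Lane 0 carries the left channel, lane 1 the right channel, so one
		   vmull/vmlal chain below filters both channels at once. */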

		for(ii=0; ii<NrSamples; ii++){
		  int32x2_t s = vld1_s32(pDataIn);
		  int64x2_t r = vmull_s32(A2, X_2);
		  r = vmlal_s32(r, A1, X_1);
		  r = vmlal_s32(r, A0, s);
		  r = vmlal_s32(r, B2, Y_2);
		  r = vmlal_s32(r, B1, Y_1);
		  int32_t ll =(int32_t)( vgetq_lane_s64(r, 0) >> 30);
		  int32_t rr =(int32_t)( vgetq_lane_s64(r, 1) >> 30);
		  pDataIn += 2;
		  *pDataOut ++ = ll;
		  *pDataOut ++ = rr;
		  /* Shift the delay line: x(n-2) = x(n-1), y(n-2) = y(n-1).
		     Note: vset_lane_s32 returns the updated vector, so its result
		     must be assigned; whole-vector moves also avoid the per-lane
		     copies (whose results were discarded in the original). */
		  X_2 = X_1;
		  Y_2 = Y_1;
		  Y_1 = vset_lane_s32(ll, Y_1, 0);   /* y(n-1)L = ynL */
		  Y_1 = vset_lane_s32(rr, Y_1, 1);   /* y(n-1)R = ynR */
		  X_1 = s;                           /* x(n-1)  = x(n) */
		}
        vst1_s32(&pBiquadState->pDelays[2], X_2);
        vst1_s32(&pBiquadState->pDelays[0], X_1);
        vst1_s32(&pBiquadState->pDelays[6], Y_2);
        vst1_s32(&pBiquadState->pDelays[4], Y_1);
#endif

    }
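For reference, MUL32x32INTO32 in the scalar path performs a 32x32-bit multiply into 64 bits followed by an arithmetic right shift; a semantically equivalent sketch (the real LVM header builds it from 16-bit partial products instead of an LVM_INT64 product):

#define MUL32x32INTO32(A, B, RESULT, SHIFT) \
    { (RESULT) = (LVM_INT32)(((LVM_INT64)(A) * (B)) >> (SHIFT)); }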