/**
* @brief  Core Real IFFT process
* @param[in]   *pSrc 				points to the input buffer (interleaved real/imag; indices 0 .. 2*fftLen+1 are read).
* @param[in]   fftLen  			    length of FFT (number of complex outputs produced).
* @param[in]   *pATable 			points to the twiddle Coef A buffer (interleaved pairs).
* @param[in]   *pBTable 			points to the twiddle Coef B buffer (only the first word of each pair is read).
* @param[out]  *pDst 				points to the output buffer (2*fftLen words are written).
* @param[in]   modifier 	        twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
* @return none.
*
* For every output bin i (n = fftLen) the loop evaluates
*
*   outR =  pSrc[2i]       * pATable[2i]   + pSrc[2i+1]     * pATable[2i+1]
*         + pSrc[2n-2i+1]  * pATable[2i+1] + pSrc[2n-2i]    * pBTable[2i]
*
*   outI = -pSrc[2i]       * pATable[2i+1] + pSrc[2i+1]     * pATable[2i]
*         - pSrc[2n-2i+1]  * pBTable[2i]   + pSrc[2n-2i]    * pATable[2i+1]
*
* NOTE(review): pATable[2i+1] is used where the textbook split formula has
* -pBTable[2i+1]; this presumes the two tables satisfy
* pBTable[2i+1] == -pATable[2i+1] -- confirm against the table generator.
* The mult/multAcc/multSub macros are defined outside this block.
*/
void arm_split_rifft_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    q31_t * pATable,
    q31_t * pBTable,
    q31_t * pDst,
    uint32_t modifier)
{
    const uint32_t twiddleStep = modifier * 2u;    /* Table stride between consecutive bins */
    q31_t *pFwd = &pSrc[0];                        /* Bin i, walks towards higher indices */
    q31_t *pRev = &pSrc[2u * fftLen];              /* Bin n - i, walks towards lower indices */
    q31_t *pTwA = &pATable[0];                     /* Current A coefficient pair */
    q31_t *pTwB = &pBTable[0];                     /* Current B coefficient pair */
    uint32_t remaining;                            /* Bins still to be produced */

    for(remaining = fftLen; remaining > 0u; remaining--)
    {
        /* Fetch everything this bin needs before anything is stored */
        const q31_t aRe = pTwA[0];
        const q31_t aIm = pTwA[1];
        const q31_t fwdRe = pFwd[0];
        const q31_t fwdIm = pFwd[1];
        const q31_t revIm = pRev[1];
        const q31_t bRe = pTwB[0];
        const q31_t revRe = pRev[0];
        q31_t realAcc, imagAcc;

        /* Real part: four products accumulated in the same order as the formula above */
        mult_32x32_keep32_R(realAcc, fwdRe, aRe);
        multAcc_32x32_keep32_R(realAcc, fwdIm, aIm);
        multAcc_32x32_keep32_R(realAcc, revIm, aIm);
        multAcc_32x32_keep32_R(realAcc, revRe, bRe);

        /* Imaginary part */
        mult_32x32_keep32_R(imagAcc, fwdRe, -aIm);
        multAcc_32x32_keep32_R(imagAcc, fwdIm, aRe);
        multSub_32x32_keep32_R(imagAcc, revIm, bRe);
        multAcc_32x32_keep32_R(imagAcc, revRe, aIm);

        /* Emit the complex result */
        pDst[0] = realAcc;
        pDst[1] = imagAcc;
        pDst += 2;

        /* Step to the next bin: inputs move one complex sample, tables move by the modifier stride */
        pFwd += 2;
        pRev -= 2;
        pTwB += twiddleStep;
        pTwA += twiddleStep;
    }
}
/**
 * @brief  Fast Q31 biquad cascade: runs S->numStages second-order sections over a block of samples.
 * @param[in]   *S          points to the filter instance; numStages, postShift, pCoeffs and pState are read from it.
 * @param[in]   *pSrc       points to the block of input data (read by the first stage only).
 * @param[out]  *pDst       points to the block of output data; stages after the first run in place on it.
 * @param[in]   blockSize   number of samples to process.
 * @return none.
 *
 * Each stage consumes five coefficients {b0, b1, b2, a1, a2} from S->pCoeffs and
 * four state words {x[n-1], x[n-2], y[n-1], y[n-2]} from S->pState, evaluates
 *
 *     acc  = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
 *     y[n] = acc << (postShift + 1)
 *
 * and writes the four updated state words back. The products are accumulated
 * with the mult_32x32_keep32_R / multAcc_32x32_keep32_R macros, which are
 * defined outside this block. No saturation is applied anywhere in this routine.
 *
 * A filter with numStages == 0 leaves pDst and the state untouched.
 */
void arm_biquad_cascade_df1_fast_q31(
  const arm_biquad_casd_df1_inst_q31 * S,
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q31_t acc = 0;                                 /*  accumulator                   */
  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
  q31_t *pOut = pDst;                            /*  output pointer initialization */
  q31_t *pState = S->pState;                     /*  pState pointer initialization */
  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
  q31_t Xn;                                      /*  temporary input               */
  int32_t shift = (int32_t) S->postShift + 1;    /*  Shift to be applied to the output */
  uint32_t sample, stage = S->numStages;         /*  loop counters                     */

  /* Pre-tested loop: a post-tested do/while would execute one bogus stage for
   * numStages == 0 and then wrap the counter around to 0xFFFFFFFF. */
  while(stage > 0u)
  {
    /* Reading the coefficients */
    b0 = *pCoeffs++;
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;

    /* Reading the state values */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];

    /* First part of the processing with loop unrolling: 4 outputs per pass.
     * The roles of Xn/Xn1/Xn2 and Yn1/Yn2 rotate from one output to the next so
     * that no register-to-register state copies are needed inside the pass.
     * A second loop below computes the remaining 0 to 3 samples. */
    sample = blockSize >> 2u;

    while(sample > 0u)
    {
      /* ---- output 0: x[n] = Xn, x[n-1] = Xn1, x[n-2] = Xn2, y[n-1] = Yn1, y[n-2] = Yn2 ---- */
      Xn = *pIn;

      /* The b1 * x[n-1] term is issued first; it does not depend on the load above */
      mult_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b0 * x[n] */
      multAcc_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* Scale the result; Yn2 is reused to hold the newest output.
       * The shift is done on the unsigned representation because left-shifting
       * a negative signed value is undefined behaviour in C. */
      Yn2 = (q31_t) ((uint32_t) acc << shift);

      /* Read the second input */
      Xn2 = *(pIn + 1u);

      /* Store the output in the destination buffer. */
      *pOut = Yn2;

      /* ---- output 1: x[n] = Xn2, x[n-1] = Xn, x[n-2] = Xn1, y[n-1] = Yn2, y[n-2] = Yn1 ---- */
      /* acc  =  b0 * x[n] */
      mult_32x32_keep32_R(acc, b0, Xn2);
      /* acc +=  b1 * x[n-1] */
      multAcc_32x32_keep32_R(acc, b1, Xn);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn1);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn2);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn1);

      /* Scale the result; Yn1 is reused to hold the newest output */
      Yn1 = (q31_t) ((uint32_t) acc << shift);

      /* Read the third input */
      Xn1 = *(pIn + 2u);

      /* Store the output in the destination buffer. */
      *(pOut + 1u) = Yn1;

      /* ---- output 2: x[n] = Xn1, x[n-1] = Xn2, x[n-2] = Xn, y[n-1] = Yn1, y[n-2] = Yn2 ---- */
      /* acc  =  b0 * x[n] */
      mult_32x32_keep32_R(acc, b0, Xn1);
      /* acc +=  b1 * x[n-1] */
      multAcc_32x32_keep32_R(acc, b1, Xn2);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* Scale the result; Yn2 is reused to hold the newest output */
      Yn2 = (q31_t) ((uint32_t) acc << shift);

      /* Read the fourth input */
      Xn = *(pIn + 3u);

      /* Store the output in the destination buffer. */
      *(pOut + 2u) = Yn2;
      pIn += 4u;

      /* ---- output 3: x[n] = Xn, x[n-1] = Xn1, x[n-2] = Xn2, y[n-1] = Yn2, y[n-2] = Yn1 ---- */
      /* acc  =  b0 * x[n] */
      mult_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b1 * x[n-1] */
      multAcc_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn2);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn1);

      /* Restore the canonical roles for the next pass:
       * Xn1 = newest input, Xn2 = the one before it,
       * Yn1 = newest output, Yn2 = the one before it (already in place). */
      Xn2 = Xn1;

      /* Scale the result; Yn1 is reused to hold the newest output */
      Yn1 = (q31_t) ((uint32_t) acc << shift);

      Xn1 = Xn;

      /* Store the output in the destination buffer. */
      *(pOut + 3u) = Yn1;
      pOut += 4u;

      /* decrement the loop counter */
      sample--;
    }

    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     * No loop unrolling is used. */
    sample = (blockSize & 0x3u);

    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn++;

      /* acc  =  b0 * x[n] */
      mult_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b1 * x[n-1] */
      multAcc_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* Scale the result (unsigned shift, see the note in the unrolled loop) */
      acc = (q31_t) ((uint32_t) acc << shift);

      /* Shift the delay line by one sample */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = acc;

      /* Store the output in the destination buffer. */
      *pOut++ = acc;

      /* decrement the loop counter */
      sample--;
    }

    /*  The first stage goes from the input buffer to the output buffer. */
    /*  Subsequent stages occur in-place in the output buffer */
    pIn = pDst;

    /* Reset to destination pointer */
    pOut = pDst;

    /*  Store the updated state variables back into the pState array */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;

    /* Next stage */
    stage--;
  }
}
/**
* @brief  Core Real FFT process
* @param[in]   *pSrc 				points to the input buffer (interleaved real/imag; indices 0 .. 2*fftLen-1 are read).
* @param[in]   fftLen  			    length of FFT.
* @param[in]   *pATable 			points to the twiddle Coef A buffer (interleaved pairs).
* @param[in]   *pBTable 			points to the twiddle Coef B buffer (only the first word of each pair is read).
* @param[out]  *pDst 				points to the output buffer (indices 0 .. 4*fftLen-1 are written).
* @param[in]   modifier 	        twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
* @return none.
*
* For bins i = 1 .. fftLen-1 (n = fftLen) the loop evaluates
*
*   outR =  pSrc[2i]       * pATable[2i]   - pSrc[2i+1]     * pATable[2i+1]
*         - pSrc[2n-2i+1]  * pATable[2i+1] + pSrc[2n-2i]    * pBTable[2i]
*
*   outI =  pSrc[2i]       * pATable[2i+1] + pSrc[2i+1]     * pATable[2i]
*         - pSrc[2n-2i+1]  * pBTable[2i]   - pSrc[2n-2i]    * pATable[2i+1]
*
* stores (outR, outI) at bin i and the conjugate (outR, -outI) at bin 2n - i.
* Bins 0 and n are filled in after the loop from pSrc[0] and pSrc[1].
*
* NOTE(review): pATable[2i+1] is used where the textbook split formula has
* -pBTable[2i+1]; this presumes the two tables satisfy
* pBTable[2i+1] == -pATable[2i+1] -- confirm against the table generator.
* The mult/multAcc/multSub macros are defined outside this block.
*/
void arm_split_rfft_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    q31_t * pATable,
    q31_t * pBTable,
    q31_t * pDst,
    uint32_t modifier)
{
    const uint32_t twiddleStep = modifier * 2u;    /* Table stride between consecutive bins */
    q31_t *pFwd = &pSrc[2];                        /* Bin i, starts at bin 1 and walks up */
    q31_t *pRev = &pSrc[(2u * fftLen) - 2u];       /* Bin n - i, starts at bin n - 1 and walks down */
    q31_t *pLow = &pDst[2];                        /* Output bin i */
    q31_t *pHigh = &pDst[(4u * fftLen) - 2u];      /* Mirrored output bin 2n - i */
    q31_t *pTwA = &pATable[twiddleStep];           /* Coefficient pair for bin 1 */
    q31_t *pTwB = &pBTable[twiddleStep];
    uint32_t remaining;                            /* Bins still to be produced by the loop */

    for(remaining = fftLen - 1u; remaining > 0u; remaining--)
    {
        /* Fetch everything this bin needs before anything is stored */
        const q31_t aRe = pTwA[0];
        const q31_t aIm = pTwA[1];
        const q31_t fwdRe = pFwd[0];
        const q31_t fwdIm = pFwd[1];
        const q31_t revIm = pRev[1];
        const q31_t bRe = pTwB[0];
        const q31_t revRe = pRev[0];
        q31_t realAcc, imagAcc;

        /* Real part: four products accumulated in the same order as the formula above */
        mult_32x32_keep32_R(realAcc, fwdRe, aRe);
        multSub_32x32_keep32_R(realAcc, fwdIm, aIm);
        multSub_32x32_keep32_R(realAcc, revIm, aIm);
        multAcc_32x32_keep32_R(realAcc, revRe, bRe);

        /* Imaginary part */
        mult_32x32_keep32_R(imagAcc, fwdRe, aIm);
        multAcc_32x32_keep32_R(imagAcc, fwdIm, aRe);
        multSub_32x32_keep32_R(imagAcc, revIm, bRe);
        multSub_32x32_keep32_R(imagAcc, revRe, aIm);

        /* Lower-half bin */
        pLow[0] = realAcc;
        pLow[1] = imagAcc;
        pLow += 2;

        /* Upper-half bin is the complex conjugate (imaginary word stored first, as before) */
        pHigh[1] = -imagAcc;
        pHigh[0] = realAcc;
        pHigh -= 2;

        /* Step to the next bin: inputs move one complex sample, tables move by the modifier stride */
        pFwd += 2;
        pRev -= 2;
        pTwB += twiddleStep;
        pTwA += twiddleStep;
    }

    /* Bin n: purely real, half the difference of the first two input words */
    pDst[2u * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
    pDst[(2u * fftLen) + 1u] = 0;

    /* Bin 0: purely real, half the sum of the first two input words */
    pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
    pDst[1] = 0;
}