void arm_biquad_cascade_df1_fast_q31(
  const arm_biquad_casd_df1_inst_q31 * S,
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q31_t acc = 0;                                 /*  accumulator                   */
  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
  q31_t *pOut = pDst;                            /*  output pointer initialization */
  q31_t *pState = S->pState;                     /*  pState pointer initialization */
  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
  q31_t Xn;                                      /*  temporary input               */
  int32_t shift = (int32_t) S->postShift + 1;    /*  Shift to be applied to the output */
  uint32_t sample, stage = S->numStages;         /*  loop counters                     */


  do
  {
    /* Reading the coefficients */
    b0 = *pCoeffs++;
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;

    /* Reading the state values */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];

    /* Apply loop unrolling and compute 4 output values simultaneously. */
    /*      The variables acc ... acc3 hold output values that are being computed:       
     *       
     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]       
     */

    sample = blockSize >> 2u;

    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.       
     ** a second loop below computes the remaining 1 to 3 samples. */
    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      /*acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);*/
      mult_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b1 * x[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b[2] * x[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* The result is converted to 1.31 , Yn2 variable is reused */
      Yn2 = acc << shift;

      /* Read the second input */
      Xn2 = *(pIn + 1u);

      /* Store the output in the destination buffer. */
      *pOut = Yn2;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      /*acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);*/
      mult_32x32_keep32_R(acc, b0, Xn2);
      /* acc +=  b1 * x[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b1, Xn);
      /* acc +=  b[2] * x[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn1);
      /* acc +=  a1 * y[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn2);
      /* acc +=  a2 * y[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn1);

      /* The result is converted to 1.31, Yn1 variable is reused  */
      Yn1 = acc << shift;

      /* Read the third input  */
      Xn1 = *(pIn + 2u);

      /* Store the output in the destination buffer. */
      *(pOut + 1u) = Yn1;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      /*acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);*/
      mult_32x32_keep32_R(acc, b0, Xn1);
      /* acc +=  b1 * x[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b1, Xn2);
      /* acc +=  b[2] * x[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn);
      /* acc +=  a1 * y[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* The result is converted to 1.31, Yn2 variable is reused  */
      Yn2 = acc << shift;

      /* Read the forth input */
      Xn = *(pIn + 3u);

      /* Store the output in the destination buffer. */
      *(pOut + 2u) = Yn2;
      pIn += 4u;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      /*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
      mult_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b1 * x[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b[2] * x[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn2);
      /* acc +=  a2 * y[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn1);

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      Xn2 = Xn1;

      /* The result is converted to 1.31, Yn1 variable is reused  */
      Yn1 = acc << shift;

      /* Xn1 = Xn     */
      Xn1 = Xn;

      /* Store the output in the destination buffer. */
      *(pOut + 3u) = Yn1;
      pOut += 4u;

      /* decrement the loop counter */
      sample--;
    }

    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.       
     ** No loop unrolling is used. */
    sample = (blockSize & 0x3u);

   while(sample > 0u)
   {
      /* Read the input */
      Xn = *pIn++;

      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      /*acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);*/
      mult_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b1 * x[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b[2] * x[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      /*acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);*/
      multAcc_32x32_keep32_R(acc, a2, Yn2);

      /* The result is converted to 1.31  */
      acc = acc << shift;

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = acc    */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = acc;

      /* Store the output in the destination buffer. */
      *pOut++ = acc;

      /* decrement the loop counter */
      sample--;
   }

    /*  The first stage goes from the input buffer to the output buffer. */
    /*  Subsequent stages occur in-place in the output buffer */
    pIn = pDst;

    /* Reset to destination pointer */
    pOut = pDst;

    /*  Store the updated state variables back into the pState array */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;

  } while(--stage);
}
/**    
* @brief  Core Real IFFT process    
* @param[in]   *pSrc 				points to the input buffer.   
* @param[in]   fftLen  			    length of FFT.    
* @param[in]   *pATable 			points to the twiddle Coef A buffer.   
* @param[in]   *pBTable 			points to the twiddle Coef B buffer.    
* @param[out]  *pDst 				points to the output buffer.   
* @param[in]   modifier 	        twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.   
* @return none.    
*/
void arm_split_rifft_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    q31_t * pATable,
    q31_t * pBTable,
    q31_t * pDst,
    uint32_t modifier)
{
    q31_t outR, outI;                              /* Temporary variables for output */
    q31_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
    q31_t CoefA1, CoefA2, CoefB1;                  /* Temporary variables for twiddle coefficients */
    q31_t *pIn1 = &pSrc[0], *pIn2 = &pSrc[(2u * fftLen) + 1u];

    pCoefA = &pATable[0];
    pCoefB = &pBTable[0];

    while(fftLen > 0u)
    {
        /*    
        outR = (pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] +    
        pIn[2 * n - 2 * i] * pBTable[2 * i] -    
        pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);    

        outI = (pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] -    
        pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -    
        pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);   
        */
        CoefA1 = *pCoefA++;
        CoefA2 = *pCoefA;

        /* outR = (pIn[2 * i] * pATable[2 * i] */
        mult_32x32_keep32_R(outR, *pIn1, CoefA1);

        /* - pIn[2 * i] * pATable[2 * i + 1] */
        mult_32x32_keep32_R(outI, *pIn1++, -CoefA2);
        
        /* pIn[2 * i + 1] * pATable[2 * i + 1] */
        multAcc_32x32_keep32_R(outR, *pIn1, CoefA2);

        /* pIn[2 * i + 1] * pATable[2 * i] */
        multAcc_32x32_keep32_R(outI, *pIn1++, CoefA1);

        /* pIn[2 * n - 2 * i] * pBTable[2 * i] */
        multAcc_32x32_keep32_R(outR, *pIn2, CoefA2);
        CoefB1 = *pCoefB;

        /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
        multSub_32x32_keep32_R(outI, *pIn2--, CoefB1);

        /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
        multAcc_32x32_keep32_R(outR, *pIn2, CoefB1);

        /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
        multAcc_32x32_keep32_R(outI, *pIn2--, CoefA2);

        /* write output */
        *pDst++ = outR;
        *pDst++ = outI;

        /* update coefficient pointer */
        pCoefB = pCoefB + (modifier * 2u);
        pCoefA = pCoefA + ((modifier * 2u) - 1u);

        /* Decrement loop count */
        fftLen--;
    }
}
IAR_ONLY_LOW_OPTIMIZATION_ENTER
void arm_fir_fast_q31(
  const arm_fir_instance_q31 * S,
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q31_t *pState = S->pState;                     /* State pointer */
  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  q31_t *pStateCurnt;                            /* Points to the current sample of the state */
  q31_t x0, x1, x2, x3;                          /* Temporary variables to hold state */
  q31_t c0;                                      /* Temporary variable to hold coefficient value */
  q31_t *px;                                     /* Temporary pointer for state */
  q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
  q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  uint32_t i, tapCnt, blkCnt;                    /* Loop counters */

  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
  /* pStateCurnt points to the location where the new input data should be written */
  pStateCurnt = &(S->pState[(numTaps - 1u)]);

  /* Apply loop unrolling and compute 4 output values simultaneously.
   * The variables acc0 ... acc3 hold output values that are being computed:
   *
   *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
   *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
   *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
   *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
   */
  blkCnt = blockSize >> 2;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* Copy four new input samples into the state buffer */
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;

    /* Set all accumulators to zero */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Initialize state pointer */
    px = pState;

    /* Initialize coefficient pointer */
    pb = pCoeffs;

    /* Read the first three samples from the state buffer:
     *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
    x0 = *(px++);
    x1 = *(px++);
    x2 = *(px++);

    /* Loop unrolling.  Process 4 taps at a time. */
    tapCnt = numTaps >> 2;
    i = tapCnt;

    while(i > 0u)
    {
      /* Read the b[numTaps] coefficient */
      c0 = *pb;

      /* Read x[n-numTaps-3] sample */
      x3 = *px;

      /* acc0 +=  b[numTaps] * x[n-numTaps] */
      multAcc_32x32_keep32_R(acc0, x0, c0);

      /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
      multAcc_32x32_keep32_R(acc1, x1, c0);

      /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
      multAcc_32x32_keep32_R(acc2, x2, c0);

      /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
      multAcc_32x32_keep32_R(acc3, x3, c0);

      /* Read the b[numTaps-1] coefficient */
      c0 = *(pb + 1u);

      /* Read x[n-numTaps-4] sample */
      x0 = *(px + 1u);

      /* Perform the multiply-accumulates */
      multAcc_32x32_keep32_R(acc0, x1, c0);
      multAcc_32x32_keep32_R(acc1, x2, c0);
      multAcc_32x32_keep32_R(acc2, x3, c0);
      multAcc_32x32_keep32_R(acc3, x0, c0);

      /* Read the b[numTaps-2] coefficient */
      c0 = *(pb + 2u);

      /* Read x[n-numTaps-5] sample */
      x1 = *(px + 2u);

      /* Perform the multiply-accumulates */
      multAcc_32x32_keep32_R(acc0, x2, c0);
      multAcc_32x32_keep32_R(acc1, x3, c0);
      multAcc_32x32_keep32_R(acc2, x0, c0);
      multAcc_32x32_keep32_R(acc3, x1, c0);

      /* Read the b[numTaps-3] coefficients */
      c0 = *(pb + 3u);

      /* Read x[n-numTaps-6] sample */
      x2 = *(px + 3u);

      /* Perform the multiply-accumulates */
      multAcc_32x32_keep32_R(acc0, x3, c0);
      multAcc_32x32_keep32_R(acc1, x0, c0);
      multAcc_32x32_keep32_R(acc2, x1, c0);
      multAcc_32x32_keep32_R(acc3, x2, c0);

      /* update coefficient pointer */
      pb += 4u;
      px += 4u;

      /* Decrement the loop counter */
      i--;
    }

    /* If the filter length is not a multiple of 4, compute the remaining filter taps */

    i = numTaps - (tapCnt * 4u);
    while(i > 0u)
    {
      /* Read coefficients */
      c0 = *(pb++);

      /* Fetch 1 state variable */
      x3 = *(px++);

      /* Perform the multiply-accumulates */
      multAcc_32x32_keep32_R(acc0, x0, c0);
      multAcc_32x32_keep32_R(acc1, x1, c0);
      multAcc_32x32_keep32_R(acc2, x2, c0);
      multAcc_32x32_keep32_R(acc3, x3, c0);

      /* Reuse the present sample states for next sample */
      x0 = x1;
      x1 = x2;
      x2 = x3;

      /* Decrement the loop counter */
      i--;
    }

    /* Advance the state pointer by 4 to process the next group of 4 samples */
    pState = pState + 4;

    /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.31
     ** Then store the 4 outputs in the destination buffer. */
    *pDst++ = (q31_t) (acc0 << 1);
    *pDst++ = (q31_t) (acc1 << 1);
    *pDst++ = (q31_t) (acc2 << 1);
    *pDst++ = (q31_t) (acc3 << 1);

    /* Decrement the samples loop counter */
    blkCnt--;
  }


  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 4u;

  while(blkCnt > 0u)
  {
    /* Copy one sample at a time into state buffer */
    *pStateCurnt++ = *pSrc++;

    /* Set the accumulator to zero */
    acc0 = 0;

    /* Initialize state pointer */
    px = pState;

    /* Initialize Coefficient pointer */
    pb = (pCoeffs);

    i = numTaps;

    /* Perform the multiply-accumulates */
    do
    {
      multAcc_32x32_keep32_R(acc0, (*px++), (*(pb++)));
      i--;
    } while(i > 0u);

    /* The result is in 2.30 format.  Convert to 1.31
     ** Then store the output in the destination buffer. */
    *pDst++ = (q31_t) (acc0 << 1);

    /* Advance state pointer by 1 for the next sample */
    pState = pState + 1;

    /* Decrement the samples loop counter */
    blkCnt--;
  }

  /* Processing is complete.
   ** Now copy the last numTaps - 1 samples to the start of the state buffer.
   ** This prepares the state buffer for the next function call. */

  /* Points to the start of the state buffer */
  pStateCurnt = S->pState;

  /* Calculate remaining number of copies */
  tapCnt = (numTaps - 1u);

  /* Copy the remaining q31_t data */
  while(tapCnt > 0u)
  {
    *pStateCurnt++ = *pState++;

    /* Decrement the loop counter */
    tapCnt--;
  }


}
/**    
* @brief  Core Real FFT process    
* @param[in]   *pSrc 				points to the input buffer.    
* @param[in]   fftLen  			    length of FFT.   
* @param[in]   *pATable 			points to the twiddle Coef A buffer.    
* @param[in]   *pBTable 			points to the twiddle Coef B buffer.    
* @param[out]  *pDst 				points to the output buffer.    
* @param[in]   modifier 	        twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.   
* @return none.    
*/
void arm_split_rfft_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    q31_t * pATable,
    q31_t * pBTable,
    q31_t * pDst,
    uint32_t modifier)
{
    uint32_t i;                                    /* Loop Counter */
    q31_t outR, outI;                              /* Temporary variables for output */
    q31_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
    q31_t CoefA1, CoefA2, CoefB1;                  /* Temporary variables for twiddle coefficients */
    q31_t *pOut1 = &pDst[2], *pOut2 = &pDst[(4u * fftLen) - 1u];
    q31_t *pIn1 = &pSrc[2], *pIn2 = &pSrc[(2u * fftLen) - 1u];

    /* Init coefficient pointers */
    pCoefA = &pATable[modifier * 2u];
    pCoefB = &pBTable[modifier * 2u];

    i = fftLen - 1u;

    while(i > 0u)
    {
        /*    
        outR = (pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1]    
        + pSrc[2 * n - 2 * i] * pBTable[2 * i] +    
        pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);    
        */

        /* outI = (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] +    
        pIn[2 * n - 2 * i] * pBTable[2 * i + 1] -    
        pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); */

        CoefA1 = *pCoefA++;
        CoefA2 = *pCoefA;

        /* outR = (pSrc[2 * i] * pATable[2 * i] */    
        mult_32x32_keep32_R(outR, *pIn1, CoefA1);

        /* outI = pIn[2 * i] * pATable[2 * i + 1] */
        mult_32x32_keep32_R(outI, *pIn1++, CoefA2);

        /* - pSrc[2 * i + 1] * pATable[2 * i + 1] */
        multSub_32x32_keep32_R(outR, *pIn1, CoefA2);

        /* (pIn[2 * i + 1] * pATable[2 * i] */
        multAcc_32x32_keep32_R(outI, *pIn1++, CoefA1);

        /* pSrc[2 * n - 2 * i] * pBTable[2 * i]  */
        multSub_32x32_keep32_R(outR, *pIn2, CoefA2);
        CoefB1 = *pCoefB;

        /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
        multSub_32x32_keep32_R(outI, *pIn2--, CoefB1);

        /* pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
        multAcc_32x32_keep32_R(outR, *pIn2, CoefB1);

        /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
        multSub_32x32_keep32_R(outI, *pIn2--, CoefA2);

        /* write output */
        *pOut1++ = outR;
        *pOut1++ = outI;

        /* write complex conjugate output */
        *pOut2-- = -outI;
        *pOut2-- = outR;

        /* update coefficient pointer */
        pCoefB = pCoefB + (modifier * 2u);
        pCoefA = pCoefA + ((modifier * 2u) - 1u);

        i--;
    }
    pDst[2u * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
    pDst[(2u * fftLen) + 1u] = 0;

    pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
    pDst[1] = 0;
}