Ejemplo n.º 1
0
/**
 * Sum of squares of the elements of a q7 vector.
 *
 * Computes pSrc[0]^2 + pSrc[1]^2 + ... + pSrc[blockSize-1]^2 in a 32-bit
 * accumulator and writes the total through pResult (18.14 format).
 *
 * @param pSrc       input vector; blockSize elements are read
 * @param blockSize  number of elements to process; 0 yields a result of 0
 * @param pResult    receives the accumulated sum
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t sum = 0;                                 /* Running sum of squares */
  q31_t input1;                                  /* Two samples packed as 16-bit halves */
  q15_t in1, in2;                                /* Samples widened to 16 bits */
  q7_t in;                                       /* Single sample for the tail loop */
  uint32_t blkCnt;                               /* Loop counter */

  /* Main loop: four samples per iteration, handled as two packed pairs. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* Widen two samples and pack them into one word (first sample in the
     * low half).  The packing goes through unsigned types because
     * left-shifting a negative signed value is undefined behaviour in C;
     * the resulting bit pattern is the same as the signed formulation. */
    in1 = (q15_t) * pSrc++;
    in2 = (q15_t) * pSrc++;
    input1 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

    /* sum += lo * lo + hi * hi */
    sum = __SMLAD(input1, input1, sum);

    /* Second pair of this iteration, handled identically. */
    in1 = (q15_t) * pSrc++;
    in2 = (q15_t) * pSrc++;
    input1 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

    sum = __SMLAD(input1, input1, sum);

    blkCnt--;
  }

  /* Tail: the 0 to 3 samples left over when blockSize is not a multiple of 4. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    in = *pSrc++;
    sum += ((q15_t) in * in);

    blkCnt--;
  }

  /* Store the result in 18.14 format */
  *pResult = sum;
}
/**
 * Fast q15 cascade of direct-form-I biquad stages.
 *
 * Runs blockSize samples through S->numStages biquad sections.  The first
 * stage reads from pSrc and writes to pDst; every later stage filters pDst
 * in place.  Per stage, six q15 coefficients are read from S->pCoeffs as
 * three packed 32-bit words ({b0, 0}, {b1, b2}, {a1, a2}) and four q15 state
 * values are read from, and written back to, S->pState as two packed words
 * ({x[n-1], x[n-2]} and {y[n-1], y[n-2]}).
 *
 * Each output is the 32-bit accumulator shifted right by
 * (15 - S->postShift) and saturated to 16 bits.
 *
 * The stage loop is a do/while, so its body runs before the stage count is
 * tested: S->numStages must be at least 1.
 *
 * @param S          filter instance (stage count, coefficients, state, postShift)
 * @param pSrc       input samples, blockSize elements
 * @param pDst       output samples, blockSize elements
 * @param blockSize  number of samples to process
 */
void arm_biquad_cascade_df1_fast_q15(
    const arm_biquad_casd_df1_inst_q15 * S,
    q15_t * pSrc,
    q15_t * pDst,
    uint32_t blockSize)
{
    q15_t *pIn = pSrc;                             /*  Source pointer                               */
    q15_t *pOut = pDst;                            /*  Destination pointer                          */
    q31_t in;                                      /*  Temporary variable to hold input value       */
    q31_t out;                                     /*  Temporary variable to hold output value      */
    q31_t b0;                                      /*  Packed {b0, 0} coefficient word              */
    q31_t b1, a1;                                  /*  Packed {b1, b2} and {a1, a2} coefficients    */
    q31_t state_in, state_out;                     /*  Packed input / output history                */
    q31_t acc0;                                    /*  Accumulator                                  */
    int32_t shift = (int32_t) (15 - S->postShift); /*  Right shift applied to the accumulator       */
    q15_t *pState = S->pState;                     /*  State pointer                                */
    q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
    q31_t *pState_q31;                             /*  32-bit view of the state buffer              */
    uint32_t sample, stage = S->numStages;         /*  Sample and stage loop counters               */



    do {
        /* View the state of this stage as two 32-bit words */
        pState_q31 = (q31_t *) (pState);

        /* Read the packed {b0, 0} coefficient word */
        b0 = *__SIMD32(pCoeffs)++;

        /* Read the packed {b1, b2} coefficient word */
        b1 = *__SIMD32(pCoeffs)++;

        /* Read the packed {a1, a2} coefficient word */
        a1 = *__SIMD32(pCoeffs)++;

        /* Read the input history from the state buffer:  x[n-1], x[n-2] */
        state_in = (q31_t) (*pState_q31++);

        /* Read the output history from the state buffer:  y[n-1], y[n-2] */
        state_out = (q31_t) (*pState_q31);

        /* Two outputs are produced per iteration of the unrolled loop, each as
         *
         *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
         */
        sample = blockSize >> 1u;

        /* Main loop: two samples per iteration.  An odd trailing sample is
         * handled after the loop. */
        while(sample > 0u) {

            /* Read two input samples as one packed word */
            in = *__SIMD32(pIn)++;

            /* out =  b0 * x[n] + 0 * 0 */
            out = __SMUAD(b0, in);
            /* acc0 =  b1 * x[n-1] + b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* Scale the accumulator by the post shift and saturate to 16 bits */
            out = __SSAT((acc0 >> shift), 16);

            /* Shift the history after each output:
             *   Xn2 = Xn1, Xn1 = Xn, Yn2 = Yn1, Yn1 = out
             * The two newest inputs are kept packed in state_in and the two
             * newest outputs are kept packed in state_out. */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
            state_out = __PKHBT(state_out >> 16, (out), 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* Second sample of the pair: the exchanged multiply pairs b0 with
             * the other half of the packed input word.
             * out =  b0 * x[n] + 0 * 0 */
            out = __SMUADX(b0, in);
            /* acc0 =  b1 * x[n-1] + b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* Scale the accumulator by the post shift and saturate to 16 bits */
            out = __SSAT((acc0 >> shift), 16);


            /* Store both outputs of this iteration as one packed word; the
             * first output is still held in state_out. */

#ifndef  ARM_MATH_BIG_ENDIAN

            *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);

#else

            *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* Shift the history again for the second sample:
             *   Xn2 = Xn1, Xn1 = Xn, Yn2 = Yn1, Yn1 = out */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in >> 16, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, in, 16);
            state_out = __PKHBT(state_out >> 16, out, 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */


            /* Decrement the loop counter */
            sample--;

        }

        /* Odd blockSize: process the single remaining sample without
         * unrolling. */

        if((blockSize & 0x1u) != 0u) {
            /* Read one input sample */
            in = *pIn++;

            /* out =  b0 * x[n] + 0 * 0 */

#ifndef  ARM_MATH_BIG_ENDIAN

            out = __SMUAD(b0, in);

#else

            out = __SMUADX(b0, in);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* acc0 =  b1 * x[n-1] + b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* Scale the accumulator by the post shift and saturate to 16 bits */
            out = __SSAT((acc0 >> shift), 16);

            /* Store the output in the destination buffer. */
            *pOut++ = (q15_t) out;

            /* Shift the history:
             *   Xn2 = Xn1, Xn1 = Xn, Yn2 = Yn1, Yn1 = out */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, in, 16);
            state_out = __PKHBT(state_out >> 16, out, 16);

#endif /*   #ifndef  ARM_MATH_BIG_ENDIAN    */

        }

        /*  The first stage goes from the input buffer to the output buffer.  */
        /*  The remaining (numStages - 1) stages run in place in the output buffer.  */
        pIn = pDst;

        /* Reset the output pointer */
        pOut = pDst;

        /*  Store the updated state variables back into the state array */
        *__SIMD32(pState)++ = state_in;
        *__SIMD32(pState)++ = state_out;


        /* Decrement the loop counter */
        stage--;

    } while(stage > 0u);
}
/**
 * Dot product of two q7 vectors.
 *
 * Computes A[0]*B[0] + A[1]*B[1] + ... + A[blockSize-1]*B[blockSize-1] in a
 * 32-bit accumulator and writes the total through result (18.14 format).
 *
 * @param pSrcA      first input vector, blockSize elements
 * @param pSrcB      second input vector, blockSize elements
 * @param blockSize  number of elements in each vector; 0 yields a result of 0
 * @param result     receives the accumulated dot product
 */
void arm_dot_prod_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  uint32_t blockSize,
  q31_t * result)
{
  q31_t input1, input2;                          /* Packed sample pairs from A and B */
  q15_t in1, in2;                                /* Samples widened to 16 bits */
  q31_t sum = 0;                                 /* Running dot product */
  uint32_t blkCnt;                               /* Loop counter */

  /* Main loop: four element pairs per iteration, as two packed SMLAD steps. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* Widen two samples of A and pack them (first sample in the low half).
     * The packing goes through unsigned types because left-shifting a
     * negative signed value is undefined behaviour in C; the resulting bit
     * pattern is the same as the signed formulation. */
    in1 = (q15_t) * pSrcA++;
    in2 = (q15_t) * pSrcA++;
    input1 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

    /* Same for two samples of B */
    in1 = (q15_t) * pSrcB++;
    in2 = (q15_t) * pSrcB++;
    input2 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

    /* sum += A.lo * B.lo + A.hi * B.hi */
    sum = __SMLAD(input1, input2, sum);

    /* Second packed pair of this iteration */
    in1 = (q15_t) * pSrcA++;
    in2 = (q15_t) * pSrcA++;
    input1 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

    in1 = (q15_t) * pSrcB++;
    in2 = (q15_t) * pSrcB++;
    input2 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

    sum = __SMLAD(input1, input2, sum);

    blkCnt--;
  }

  /* Tail: the 0 to 3 element pairs left over when blockSize is not a
   * multiple of 4. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* Plain multiply-accumulate.  __SMLAD must not be used on unpacked
     * samples: a negative q7 value sign-extends to a word whose upper
     * halfword is 0xFFFF (-1), so two negative operands would add a
     * spurious (-1) * (-1) = +1 to the sum. */
    sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);

    blkCnt--;
  }

  /* Store the result in the destination buffer in 18.14 format */
  *result = sum;
}
Ejemplo n.º 4
0
/**
 * Dot product of two q7 vectors.
 *
 * Computes A[0]*B[0] + A[1]*B[1] + ... + A[blockSize-1]*B[blockSize-1] in a
 * 32-bit accumulator and writes the total through result (18.14 format).
 *
 * Unless ARM_MATH_CM0_FAMILY is defined, four element pairs per iteration
 * are processed with packed 32-bit loads and SMLAD; the leftover pairs (or
 * all pairs on the Cortex-M0 family) go through a scalar loop.
 *
 * @param pSrcA      first input vector, blockSize elements
 * @param pSrcB      second input vector, blockSize elements
 * @param blockSize  number of elements in each vector; 0 yields a result of 0
 * @param result     receives the accumulated dot product
 */
void arm_dot_prod_q7(
    q7_t * pSrcA,
    q7_t * pSrcB,
    uint32_t blockSize,
    q31_t * result)
{
    uint32_t blkCnt;                               /* Loop counter */

    q31_t sum = 0;                                 /* Running dot product */

#ifndef ARM_MATH_CM0_FAMILY

    /* Run the below code for Cortex-M4 and Cortex-M3 */

    q31_t input1, input2;                          /* Four packed q7 samples from A and B */
    q31_t inA1, inA2, inB1, inB2;                  /* Sample pairs widened to 16-bit halves */

    /* Main loop: four element pairs per iteration. */
    blkCnt = blockSize >> 2u;

    while(blkCnt > 0u)
    {
        /* Read four samples at a time from each source */
        input1 = *__SIMD32(pSrcA)++;
        input2 = *__SIMD32(pSrcB)++;

        /* Widen the samples to two packed pairs per source.  A and B are
         * split the same way, so matching elements stay aligned. */
        inA1 = __SXTB16(__ROR(input1, 8));
        inA2 = __SXTB16(input1);
        inB1 = __SXTB16(__ROR(input2, 8));
        inB2 = __SXTB16(input2);

        /* Multiply and accumulate two element pairs at a time */
        sum = __SMLAD(inA1, inB1, sum);
        sum = __SMLAD(inA2, inB2, sum);

        blkCnt--;
    }

    /* The 0 to 3 leftover element pairs are handled by the scalar loop below. */
    blkCnt = blockSize % 0x4u;

#else

    /* Run the below code for Cortex-M0 */

    /* Every element pair goes through the scalar loop. */
    blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

    while(blkCnt > 0u)
    {
        /* Plain multiply-accumulate.  __SMLAD must not be used on unpacked
         * samples: a negative q7 value sign-extends to a word whose upper
         * halfword is 0xFFFF (-1), so two negative operands would add a
         * spurious (-1) * (-1) = +1 to the sum. */
        sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);

        blkCnt--;
    }

    /* Store the result in the destination buffer in 18.14 format */
    *result = sum;
}
Ejemplo n.º 5
0
/**
 * Sum of squares of the elements of a q7 vector.
 *
 * Accumulates pSrc[i] * pSrc[i] for i in [0, blockSize) into a 32-bit total
 * and writes it through pResult (18.14 format).  Unless ARM_MATH_CM0 is
 * defined, groups of four samples are squared with packed SMLAD operations
 * and only the leftover samples use the scalar loop.
 *
 * @param pSrc       input vector, blockSize elements
 * @param blockSize  number of elements to process
 * @param pResult    receives the accumulated sum
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t total = 0;                               /* Running sum of squares */
  uint32_t remaining;                            /* Samples (or groups) still to process */

#ifndef ARM_MATH_CM0

  /* Cortex-M3 / Cortex-M4 path */

  q31_t packed;                                  /* Four q7 samples in one word */
  q31_t rotatedPair, directPair;                 /* Two widened sample pairs */

  /* One iteration per group of four samples */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    packed = *__SIMD32(pSrc)++;

    /* Widen the four samples into two packed 16-bit pairs */
    rotatedPair = __SXTB16(__ROR(packed, 8));
    directPair = __SXTB16(packed);

    /* Each SMLAD adds the squares of both halves to the total */
    total = __SMLAD(rotatedPair, rotatedPair, total);
    total = __SMLAD(directPair, directPair, total);
  }

  /* Leftover samples when blockSize is not a multiple of 4 */
  remaining = blockSize & 0x3u;

#else

  /* Cortex-M0 path: every sample goes through the scalar loop */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for (; remaining > 0u; remaining--)
  {
    q7_t sample = *pSrc++;

    total += ((q15_t) sample * sample);
  }

  /* Result is in 18.14 format */
  *pResult = total;
}
Ejemplo n.º 6
0
/**
\brief Test case: TC_CoreSimd_ParMul16
\details
- Check Parallel 16-bit multiplication:
  __SMLAD
  __SMLADX
  __SMLALD
  __SMLALDX
  __SMLSD
  __SMLSDX
  __SMLSLD
  __SMLSLDX
  __SMUAD
  __SMUADX
  __SMUSD
  __SMUSDX

Each intrinsic is called with small fixed operands and its result is compared
against a hand-computed constant.  In the comments below, "lo" and "hi" are
the low and high 16-bit halves of a 32-bit operand.  The whole body is
compiled only when the target reports the DSP extension.
*/
void TC_CoreSimd_ParMul16 (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  /* Operands and results are volatile, so each access is a real load/store */
  volatile int32_t op1_s32, op2_s32, op3_s32;
  volatile int32_t res_s32;

  volatile int64_t op1_s64;
  volatile int64_t res_s64;

  /* --- __SMLAD Test ---------------------------------------------- */
  /* Expected: acc + lo*lo + hi*hi = 0x20000000 + 2*4 + 3*5 */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x20000000;
  res_s32 = __SMLAD(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x20000017);

  /* --- __SMLADX Test ---------------------------------------------- */
  /* Expected: acc + lo1*hi2 + hi1*lo2 = 0x800 + 2*5 + 3*4 */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLADX(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x00000816);

  /* --- __SMLALD Test ---------------------------------------------- */
  /* Same products as __SMLAD, added to a 64-bit accumulator of 0x200000000 */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLALD(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000017LL);

  /* --- __SMLALDX Test ---------------------------------------------- */
  /* Same products as __SMLADX, added to a 64-bit accumulator of 0x200000000 */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLALDX(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000016LL);

  /* --- __SMLSD Test ---------------------------------------------- */
  /* Expected: acc + lo*lo - hi*hi = 0x800 + 6*4 - 3*5 */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLSD(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x00000809);

  /* --- __SMLSDX Test ---------------------------------------------- */
  /* Expected: acc + lo1*hi2 - hi1*lo2 = 0x800 + 2*5 - 3*4 */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLSDX(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x000007FE);

  /* --- __SMLSLD Test ---------------------------------------------- */
  /* Same difference as __SMLSD, added to a 64-bit accumulator of 0x200000000 */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLSLD(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000009LL);

  /* --- __SMLSLDX Test ---------------------------------------------- */
  /* Expected: acc + lo1*hi2 - hi1*lo2 = 0x200000000 + 6*5 - 3*4 */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLSLDX(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000012LL);

  /* --- __SMUAD Test ---------------------------------------------- */
  /* Expected: lo*lo + hi*hi = 1*2 + 3*4 */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUAD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000E);

  /* Negative halves (-1 and -3): (-1)*2 + (-3)*4 = -14 */
  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUAD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF2);

  /* --- __SMUADX Test ---------------------------------------------- */
  /* Expected: lo1*hi2 + hi1*lo2 = 1*4 + 3*2 */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUADX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000A);

  /* Negative halves: (-1)*4 + (-3)*2 = -10 */
  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUADX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF6);

  /* --- __SMUSD Test ---------------------------------------------- */
  /* Expected: lo*lo - hi*hi = 1*2 - 3*4 = -10 */
  op1_s32 = (int32_t)0x00030001;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF6);

  /* Negative halves: (-1)*2 - (-3)*4 = 10 */
  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000A);

  /* --- __SMUSDX Test ---------------------------------------------- */
  /* Expected: lo1*hi2 - hi1*lo2 = 1*4 - 3*2 = -2 */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUSDX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFFE);

  /* Negative halves: (-1)*4 - (-3)*2 = 2 */
  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSDX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0x00000002);
#endif
}
/**
 * q7 FIR filter.
 *
 * Appends blockSize new input samples to the state buffer after the
 * (numTaps - 1) samples kept from the previous call, then produces one
 * output per input.  Each output is the sum over numTaps taps of
 * state[k] * coeff[k], shifted right by 7 and saturated to 8 bits.
 * Finally the newest (numTaps - 1) samples are moved to the start of the
 * state buffer for the next call.
 *
 * The state buffer S->pState must hold at least numTaps - 1 + blockSize
 * samples, because the new samples are written starting at
 * pState + numTaps - 1.
 *
 * @param S          filter instance (numTaps, pState, pCoeffs)
 * @param pSrc       input samples, blockSize elements
 * @param pDst       output samples, blockSize elements
 * @param blockSize  number of samples to process
 */
void arm_fir_q7(
  const arm_fir_instance_q7 * S,
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
  uint32_t i, blkCnt;                            /* Loop counters */
  q7_t *pState = S->pState;                      /* State pointer */
  q7_t *pCoeffs = S->pCoeffs;                    /* Coefficient pointer */
  q7_t *px, *pb;                                 /* Temporary pointers to state and coeff */
  q31_t acc = 0;                                 /* Accumulator */
  q31_t input1, input2;                          /* Packed sample / coefficient pairs */
  q15_t in1, in2;                                /* Values widened to 16 bits */
  q7_t *pStateCurnt;                             /* Points to the current sample of the state */


  /* S->pState holds the previous (numTaps - 1) samples; new input is
   * written directly after them. */
  pStateCurnt = S->pState + (numTaps - 1u);

  i = blockSize >> 2u;

  /* Copy the new input into the state buffer, four q7 samples per 32-bit
   * move. */
  while(i > 0u)
  {
    *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
    i--;
  }

  i = blockSize % 0x4u;

  /* Copy the remaining 0 to 3 samples one at a time. */
  while(i > 0u)
  {
    *pStateCurnt++ = *pSrc++;
    i--;
  }

  blkCnt = blockSize;

  /* Produce one output per input sample. */
  while(blkCnt > 0u)
  {
    /* Set accumulator to zero */
    acc = 0;

    /* Start at the oldest state sample needed for this output */
    px = pState;

    /* Start at the first coefficient */
    pb = pCoeffs;

    i = numTaps >> 2u;

    /* Tap loop unrolled by four: two packed SMLAD steps per iteration. */
    while(i > 0u)
    {
      /* Widen two state samples and pack them (first sample in the low
       * half).  The packing goes through unsigned types because
       * left-shifting a negative signed value is undefined behaviour in C;
       * the resulting bit pattern is the same as the signed formulation. */
      in1 = (q15_t) * px++;
      in2 = (q15_t) * px++;
      input1 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

      /* Same for two coefficients */
      in1 = (q15_t) * pb++;
      in2 = (q15_t) * pb++;
      input2 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

      /* acc += x.lo * b.lo + x.hi * b.hi */
      acc = __SMLAD(input1, input2, acc);

      /* Second packed pair of this iteration */
      in1 = (q15_t) * px++;
      in2 = (q15_t) * px++;
      input1 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

      in1 = (q15_t) * pb++;
      in2 = (q15_t) * pb++;
      input2 = (q31_t) (((uint32_t) (uint16_t) in1) | ((uint32_t) (uint16_t) in2 << 16));

      acc = __SMLAD(input1, input2, acc);

      /* Decrement the tap loop counter */
      i--;
    }

    i = numTaps % 0x4u;

    /* Remaining 0 to 3 taps when numTaps is not a multiple of 4. */
    while(i > 0u)
    {
      /* Plain multiply-accumulate.  __SMLAD must not be used on unpacked
       * values: a negative q7 value sign-extends to a word whose upper
       * halfword is 0xFFFF (-1), so two negative operands would add a
       * spurious (-1) * (-1) = +1 to the accumulator. */
      acc += (q31_t) ((q15_t) * px++ * *pb++);
      i--;

    }

    /* Scale back to q7 and saturate to 8 bits */
    acc = __SSAT((acc >> 7), 8);

    /* Store filter output */
    *pDst++ = (q7_t) (acc);

    /* Advance the state pointer by 1 to process the next sample */
    pState = pState + 1;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Processing is complete.
   * pState now points at the newest (numTaps - 1) samples; move them to the
   * start of the state buffer for the next call. */

  /* Points to the start of the state buffer */
  pStateCurnt = S->pState;

  /* Number of whole 32-bit moves */
  i = (numTaps - 1u) >> 2u;

  /* Copy four samples per move */
  while(i > 0u)
  {
    *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;

    i--;

  }

  /* Number of samples left after the 32-bit moves */
  i = (numTaps - 1u) % 0x4u;

  /* Copy the remaining samples one at a time */
  while(i > 0u)
  {
    *pStateCurnt++ = *pState++;
    i--;
  }

}
Ejemplo n.º 8
0
/**
 * Sum of squares of the elements of a q7 vector.
 *
 * Accumulates pSrc[i] * pSrc[i] for i in [0, blockSize) and writes the total
 * through pResult (18.14 format).  Eight samples are processed per iteration
 * of the main loop using two independent accumulators that are added
 * together afterwards; leftover samples use a scalar loop.
 *
 * @param pSrc       input vector, blockSize elements
 * @param blockSize  number of elements to process
 * @param pResult    receives the accumulated sum
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t accA = 0;                                /* First partial sum, also the final total */
  q31_t accB = 0;                                /* Second partial sum */
  q31_t word0, word1;                            /* Two packed words of four q7 samples each */
  q31_t rot0, dir0, rot1, dir1;                  /* Widened sample pairs from word0 / word1 */
  uint32_t count;                                /* Loop counter */

  /* One iteration per group of eight samples */
  for (count = blockSize >> 3u; count > 0u; count--)
  {
    /* Read eight samples as two 32-bit words */
    word0 = _SIMD32_OFFSET(pSrc);
    word1 = _SIMD32_OFFSET(pSrc + 4);

    /* Widen each word into two packed pairs of 16-bit values */
#ifdef CCS

    rot0 = __SXTB16(word0, 8);
    dir0 = __SXTB16(word0, 0);
    rot1 = __SXTB16(word1, 8);
    dir1 = __SXTB16(word1, 0);

#else

    rot0 = __SXTB16(__ROR(word0, 8));
    dir0 = __SXTB16(word0);
    rot1 = __SXTB16(__ROR(word1, 8));
    dir1 = __SXTB16(word1);

#endif /* #ifdef CCS */

    /* Each SMLAD adds the squares of both halves of a pair; the rotated
     * pairs feed accA and the direct pairs feed accB. */
    accA = __SMLAD(rot0, rot0, accA);
    accB = __SMLAD(dir0, dir0, accB);
    accA = __SMLAD(rot1, rot1, accA);
    accB = __SMLAD(dir1, dir1, accB);

    /* Step past the eight samples just consumed */
    pSrc += 8u;
  }

  /* Combine the two partial sums */
  accA += accB;

  /* Leftover samples when blockSize is not a multiple of 8 */
  for (count = blockSize & 0x7u; count > 0u; count--)
  {
    q7_t sample = *pSrc++;

    accA += ((q15_t) sample * sample);
  }

  /* Result is in 18.14 format */
  *pResult = accA;
}