buffer_c16_t Complex8DecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Decimates by two using a non-recursive third-order CIC filter.
	 *
	 * Reads packed 8-bit I/Q samples through src.p, writes packed 16-bit I/Q
	 * results through dst.p, and returns a buffer descriptor built from dst.p
	 * with half the input count and half the input sampling rate.
	 *
	 * The two most recent I and Q input samples are kept in _i1_i0 / _q1_q0
	 * between calls so the filter history carries over from one buffer to
	 * the next.
	 *
	 * NOTE(review): each loop iteration consumes two 32-bit input words and
	 * there is no tail handling, so this presumably requires src.count to
	 * cover a whole number of iterations -- confirm against callers.
	 */

	/* CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 */

	/* Work on local copies of the filter history; written back after the loop. */
	uint32_t i1_i0 = _i1_i0;
	uint32_t q1_q0 = _q1_q0;

	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish.
	 * The taps (1, 3, 3, 1) sum to 8, so the overall gain is 8 * 32 = 256.
	 */
	constexpr uint32_t scale_factor = 32;
	/* Packed tap pair: 3 * scale in the upper halfword, 1 * scale in the lower. */
	constexpr uint32_t k_3_1 = 0x00030001 * scale_factor;
	/* Input and output are accessed one 32-bit word at a time. */
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	/* Leading numbers in the trailing comments are the author's per-line cycle estimates. */
	while(src_p < src_end) {
		const uint32_t q3_i3_q2_i2 = *(src_p++);						// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		/* First output sample: taps applied to samples 0..3. */
		const uint32_t d_i0_partial = __SMUAD(k_3_1, i1_i0);			// 1: = 3 * i1 + 1 * i0
		const uint32_t i3_i2 = __SXTB16(q3_i3_q2_i2,  0);				// 1: (q3_i3_q2_i2 ror  0)[23:16]:(q3_i3_q2_i2 ror  0)[7:0]
		const uint32_t d_i0 = __SMLADX(k_3_1, i3_i2, d_i0_partial);		// 1: + 3 * i2 + 1 * i3

		const uint32_t d_q0_partial = __SMUAD(k_3_1, q1_q0);			// 1: = 3 * q1 + 1 * q0
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);				// 1: (q3_i3_q2_i2 ror  8)[23:16]:(q3_i3_q2_i2 ror  8)[7:0]
		const uint32_t d_q0 = __SMLADX(k_3_1, q3_q2, d_q0_partial);		// 1: + 3 * q2 + 1 * q3

		/* Pack the low halfwords of the I and Q sums into one output word (Q high, I low). */
		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		/* Second output sample: taps applied to samples 2..5. */
		const uint32_t d_i1_partial = __SMUAD(k_3_1, i3_i2);			// 1: = 3 * i3 + 1 * i2
		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);				// 1: (q5_i5_q4_i4 ror  0)[23:16]:(q5_i5_q4_i4 ror  0)[7:0]
		const uint32_t d_i1 = __SMLADX(k_3_1, i5_i4, d_i1_partial);		// 1: + 1 * i5 + 3 * i4

		const uint32_t d_q1_partial = __SMUAD(k_3_1, q3_q2);			// 1: = 3 * q3 + 1 * q2
		const uint32_t q5_q4 = __SXTB16(q5_i5_q4_i4,  8);				// 1: (q5_i5_q4_i4 ror  8)[23:16]:(q5_i5_q4_i4 ror  8)[7:0]
		const uint32_t d_q1 = __SMLADX(k_3_1, q5_q4, d_q1_partial);		// 1: + 1 * q5 + 3 * q4

		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		*(dst_p++) = d_q0_i0;											// 3
		*(dst_p++) = d_q1_i1;

		/* Samples 4 and 5 become samples 0 and 1 of the next iteration. */
		i1_i0 = i5_i4;
		q1_q0 = q5_q4;
	}
	/* Persist the filter history for the next call. */
	_i1_i0 = i1_i0;
	_q1_q0 = q1_q0;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
Exemplo n.º 2
0
/**
\brief Test case: TC_CoreSimd_PackUnpack
\details
- Verifies the byte-to-halfword extend and extend-and-add SIMD intrinsics,
  both signed and unsigned:
  __SXTB16
  __SXTAB16
  __UXTB16
  __UXTAB16
*/
void TC_CoreSimd_PackUnpack (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  /* Operands and result are volatile, as in the original test. */
  volatile int32_t first_s32;
  volatile int32_t second_s32;
  volatile int32_t actual_s32;

  /* __SXTB16: bytes 0 and 2 (0x68, 0x83) are sign-extended to halfwords. */
  first_s32 = (int32_t)0x80830168;
  actual_s32 = __SXTB16(first_s32);
  ASSERT_TRUE(actual_s32 == (int32_t)0xFF830068);

  /* __SXTAB16: same extraction from the second operand, added per halfword to the first. */
  first_s32 = (int32_t)0x000D0008;
  second_s32 = (int32_t)0x80830168;
  actual_s32 = __SXTAB16(first_s32, second_s32);
  ASSERT_TRUE(actual_s32 == (int32_t)0xFF900070);

  /* __UXTB16: bytes 0 and 2 are zero-extended to halfwords. */
  first_s32 = (int32_t)0x80830168;
  actual_s32 = __UXTB16(first_s32);
  ASSERT_TRUE(actual_s32 == (int32_t)0x00830068);

  /* __UXTAB16: zero-extended bytes of the second operand, added per halfword to the first. */
  first_s32 = (int32_t)0x000D0008;
  second_s32 = (int32_t)0x80830168;
  actual_s32 = __UXTAB16(first_s32, second_s32);
  ASSERT_TRUE(actual_s32 == (int32_t)0x00900070);
#endif
}
/**
 * @brief Dot product of two Q7 vectors.
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[in]  blockSize  number of samples in each vector
 * @param[out] result     the accumulated dot product is written here
 *
 * Every q7 x q7 product is added to a 32-bit accumulator; the stored result
 * is in 18.14 format.
 */
void arm_dot_prod_q7(
    q7_t * pSrcA,
    q7_t * pSrcB,
    uint32_t blockSize,
    q31_t * result)
{
    uint32_t blkCnt;                               /* loop counter */

    q31_t sum = 0;                                 /* Temporary variables to store output */

#ifndef ARM_MATH_CM0_FAMILY

    /* Run the below code for Cortex-M4 and Cortex-M3 */

    q31_t input1, input2;                          /* Packed words holding four q7 samples each */
    q31_t inA1, inA2, inB1, inB2;                  /* Sign-extended halfword pairs */

    /* Loop unrolling: four samples per iteration */
    blkCnt = blockSize >> 2u;

    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** The scalar loop below computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
        /* read 4 samples at a time from sourceA */
        input1 = *__SIMD32(pSrcA)++;
        /* read 4 samples at a time from sourceB */
        input2 = *__SIMD32(pSrcB)++;

        /* bytes 1 and 3 of each word, sign-extended to q15 */
        inA1 = __SXTB16(__ROR(input1, 8));
        /* bytes 0 and 2 of each word, sign-extended to q15 */
        inA2 = __SXTB16(input1);
        inB1 = __SXTB16(__ROR(input2, 8));
        inB2 = __SXTB16(input2);

        /* multiply and accumulate two samples at a time */
        sum = __SMLAD(inA1, inB1, sum);
        sum = __SMLAD(inA2, inB2, sum);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* If the blockSize is not a multiple of 4, the remaining samples are
     ** handled one at a time below. */
    blkCnt = blockSize % 0x4u;

#else

    /* Run the below code for Cortex-M0 */

    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

    while(blkCnt > 0u)
    {
        /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
        /* A plain multiply-accumulate is required here.  __SMLAD must not be
         * used on single sign-extended samples: for two negative inputs both
         * upper halfwords are 0xFFFF (-1), so the dual multiply would add a
         * spurious (-1) * (-1) = +1 to the sum. */
        sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Store the result in the destination buffer in 18.14 format */
    *result = sum;
}
Exemplo n.º 4
0
/**
 * @brief Converts a Q7 vector to a Q15 vector: pDst[n] = (q15_t) pSrc[n] << 8.
 * @param[in]  pSrc       points to the Q7 input vector
 * @param[out] pDst       points to the Q15 output vector
 * @param[in]  blockSize  number of samples to convert
 */
void arm_q7_to_q15(
    q7_t * pSrc,
    q15_t * pDst,
    uint32_t blockSize)
{
    q7_t *pRead = pSrc;                            /* Read cursor over the input */
    uint32_t remaining;                            /* Iterations left in the current loop */

#ifndef ARM_MATH_CM0_FAMILY
    q31_t packed;                                  /* Four q7 samples in one word */
    q31_t pairHi, pairLo;                          /* Bytes 1/3 and bytes 0/2 as halfword pairs */
    q31_t firstWord, secondWord;                   /* Output words in store order */

    /* Cortex-M4 / Cortex-M3 path: four samples per iteration. */
    for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
    {
        packed = *__SIMD32(pRead)++;

        /* Sign-extend bytes 1 and 3, then bytes 0 and 2, into halfword pairs. */
        pairHi = __SXTB16(__ROR(packed, 8));
        pairLo = __SXTB16(packed);

        /* Shift each sample into the top byte of its halfword; the mask
         * removes the bits that crossed from the lower halfword into the upper. */
        pairHi = (pairHi << 8u) & 0xFF00FF00;
        pairLo = (pairLo << 8u) & 0xFF00FF00;

        /* Re-interleave the two pairs back into memory order. */
#ifndef ARM_MATH_BIG_ENDIAN

        secondWord = __PKHTB(pairHi, pairLo, 16);
        firstWord = __PKHBT(pairLo, pairHi, 16);

#else

        firstWord = __PKHTB(pairHi, pairLo, 16);
        secondWord = __PKHBT(pairLo, pairHi, 16);

#endif

        *__SIMD32(pDst)++ = firstWord;
        *__SIMD32(pDst)++ = secondWord;
    }

    /* 0 to 3 samples are left for the scalar loop. */
    remaining = blockSize % 0x4u;

#else

    /* Cortex-M0 path: every sample goes through the scalar loop. */
    remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

    for(; remaining > 0u; remaining--)
    {
        /* C = (q15_t) A << 8 */
        *pDst++ = (q15_t) * pRead++ << 8;
    }

}
Exemplo n.º 5
0
/**
 * @brief Sum of the squares of the elements of a Q7 vector.
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    the accumulated sum of squares is written here
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t energy = 0;                              /* Running sum of squares */
  q7_t sample;                                   /* One input sample (scalar loop) */
  uint32_t remaining;                            /* Iterations left in the current loop */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3 path: four samples per iteration. */

  q31_t packed;                                  /* Four q7 samples in one word */
  q31_t pairHi, pairLo;                          /* Bytes 1/3 and bytes 0/2 as halfword pairs */

  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    packed = *__SIMD32(pSrc)++;

    /* Sign-extend the four bytes into two halfword pairs. */
    pairHi = __SXTB16(__ROR(packed, 8));
    pairLo = __SXTB16(packed);

    /* Each dual multiply-accumulate squares and adds two samples. */
    energy = __SMLAD(pairHi, pairHi, energy);
    energy = __SMLAD(pairLo, pairLo, energy);
  }

  /* 0 to 3 samples are left for the scalar loop. */
  remaining = blockSize % 0x4u;

#else

  /* Cortex-M0 path: every sample goes through the scalar loop. */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for(; remaining > 0u; remaining--)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
    sample = *pSrc++;
    energy += ((q15_t) sample * sample);
  }

  /* Store the result in 18.14 format  */
  *pResult = energy;
}
buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Translates incoming complex<int8_t> samples by -fs/4,
	 * decimates by two using a non-recursive third-order CIC filter.
	 *
	 * Reads packed 8-bit I/Q samples through src.p, writes packed 16-bit I/Q
	 * results through dst.p, and returns a buffer descriptor built from dst.p
	 * with half the input count and half the input sampling rate.
	 *
	 * The two most recent input samples are kept (in rearranged form) in
	 * _q1_i0 / _q0_i1 between calls so the filter history carries over from
	 * one buffer to the next.
	 *
	 * NOTE(review): each loop iteration consumes two 32-bit input words and
	 * there is no tail handling, so this presumably requires src.count to
	 * cover a whole number of iterations -- confirm against callers.
	 */

	/* Derivation of algorithm:
	 * Original CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 *
	 * Translate -fs/4, phased 180 degrees, accomplished by complex multiplication
	 * of complex length-4 sequence:
	 *
	 * Substitute:
	 *	i0 = -i0, q0 = -q0
	 *	i1 = -q1, q1 =  i1
	 *	i2 =  i2, q2 =  q2
	 *	i3 =  q3, q3 = -i3
	 *	i4 = -i4, q4 = -q4
	 *	i5 = -q5, q5 =  i5
	 *
	 * Resulting taps (with decimation by 2, four samples in, two samples out):
	 *	D_I0 =  q3 * 1 +  i2 * 3 + -q1 * 3 + -i0 * 1
	 *	D_Q0 = -i3 * 1 +  q2 * 3 +  i1 * 3 + -q0 * 1
	 *
	 *	D_I1 = -q5 * 1 + -i4 * 3 +  q3 * 3 +  i2 * 1
	 *	D_Q1 =  i5 * 1 + -q4 * 3 + -i3 * 3 +  q2 * 1
	 */

	// 6 cycles per complex input sample, not including loop overhead.
	/* Work on local copies of the filter history; written back after the loop. */
	uint32_t q1_i0 = _q1_i0;
	uint32_t q0_i1 = _q0_i1;
	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	/* Packed tap pair: 3 * scale in the upper halfword, 1 * scale in the lower. */
	const uint32_t k_3_1 = 0x00030001 * scale_factor;
	/* Input and output are accessed one 32-bit word at a time. */
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	/* Leading numbers in the trailing comments are the author's per-line cycle estimates. */
	while(src_p < src_end) {
		const uint32_t q3_i3_q2_i2 = *(src_p++);			// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		/* Unpack samples 2 and 3 and regroup them as (i2, q3) and (i3, q2),
		 * the pairings needed by the translated tap equations. */
		const uint32_t i2_i3 = __SXTB16(q3_i3_q2_i2, 16);			// 1: (q3_i3_q2_i2 ror 16)[23:16]:(q3_i3_q2_i2 ror 16)[7:0]
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);			// 1: (q3_i3_q2_i2 ror  8)[23:16]:(q3_i3_q2_i2 ror  8)[7:0]
		const uint32_t i2_q3 = __PKHTB(i2_i3, q3_q2, 16);			// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t i3_q2 = __PKHBT(q3_q2, i2_i3, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		// D_I0 = 3 * (i2 - q1) + (q3 - i0)
		const uint32_t i2_m_q1_q3_m_i0 = __QSUB16(i2_q3, q1_i0);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i0 = __SMUAD(k_3_1, i2_m_q1_q3_m_i0);		// 1: Rm[15:0]*Rs[15:0]+Rm[31:16]*Rs[31:16]

		// D_Q0 = 3 * (q2 + i1) - (i3 + q0)
		const uint32_t i3_p_q0_q2_p_i1 = __QADD16(i3_q2, q0_i1);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q0 = __SMUSDX(i3_p_q0_q2_p_i1, k_3_1);		// 1: Rm[15:0]*Rs[31:16]-Rm[31:16]*Rs[15:0]
		/* Pack the low halfwords of the I and Q sums into one output word (Q high, I low). */
		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		/* Unpack samples 4 and 5 and regroup them as (q4, i5) and (q5, i4). */
		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);			// 1: (q5_i5_q4_i4 ror  0)[23:16]:(q5_i5_q4_i4 ror  0)[7:0]
		const uint32_t q4_q5 = __SXTB16(q5_i5_q4_i4, 24);			// 1: (q5_i5_q4_i4 ror 24)[23:16]:(q5_i5_q4_i4 ror 24)[7:0]
		const uint32_t q4_i5 = __PKHTB(q4_q5, i5_i4, 16);			// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t q5_i4 = __PKHBT(i5_i4, q4_q5, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		// D_I1 = (i2 - q5) + 3 * (q3 - i4)
		const uint32_t i2_m_q5_q3_m_i4 = __QSUB16(i2_q3, q5_i4);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i1 = __SMUADX(i2_m_q5_q3_m_i4, k_3_1);		// 1: Rm[15:0]*Rs[31:16]+Rm[31:16]*Rs[15:0]

		// D_Q1 = (i5 + q2) - 3 * (q4 + i3)
		const uint32_t q4_p_i3_i5_p_q2 = __QADD16(q4_i5, i3_q2);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q1 = __SMUSD(k_3_1, q4_p_i3_i5_p_q2);		// 1: Rm[15:0]*Rs[15:0]-Rm[31:16]*Rs[31:16]
		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]
		*(dst_p++) = d_q0_i0;							// 3
		*(dst_p++) = d_q1_i1;

		/* Samples 4 and 5 become samples 0 and 1 of the next iteration. */
		q1_i0 = q5_i4;
		q0_i1 = q4_i5;
	}
	/* Persist the filter history for the next call. */
	_q1_i0 = q1_i0;
	_q0_i1 = q0_i1;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
Exemplo n.º 7
0
/**
 * @brief Converts a Q7 vector to a Q15 vector: pDst[n] = (q15_t) pSrc[n] << 8.
 * @param[in]  pSrc       points to the Q7 input vector
 * @param[out] pDst       points to the Q15 output vector
 * @param[in]  blockSize  number of samples to convert
 *
 * Eight samples are converted per iteration of the unrolled loop; the
 * remaining 0 to 7 samples are converted one at a time.
 */
void arm_q7_to_q15(
  q7_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q7_t *pIn = pSrc;                              /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */
  q31_t in;                                      /* Four packed q7 samples */
  q31_t in1, in2;                                /* Bytes 1/3 and bytes 0/2 as halfword pairs */
  q31_t out1, out2;                              /* Packed q15 output words */
  /* Keeps the upper byte of each halfword.  Deliberately not named `and`:
   * that is an alternative token in C++ and a macro in C's <iso646.h>. */
  q31_t mask = 0xFF00FF00;

  /* Loop unrolling: eight samples per iteration */
  blkCnt = blockSize >> 3u;

  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
   ** a second loop below computes the remaining 1 to 7 samples. */
  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */

    /* ---- first group of four samples ---- */
    in = *__SIMD32(pIn)++;

#ifdef CCS
    /* rotate by 8 and extend two q7_t values to q15_t values */
    in1 = __SXTB16(in, 8);
    /* extend the remaining two q7_t values to q15_t values */
    in2 = __SXTB16(in, 0);
#else
    /* rotate by 8 and extend two q7_t values to q15_t values */
    in1 = __SXTB16(__ROR(in, 8));
    /* extend the remaining two q7_t values to q15_t values */
    in2 = __SXTB16(in);
#endif

    /* shift by 8 to convert q7_t values to q15_t values (ex: 0x00ff00ff ==> 0xff00ff00) */
    in1 = in1 << 8u;
    in2 = in2 << 8u;

    /* read the next 4 samples (kept ahead of the stores, as in the original) */
    in = *__SIMD32(pIn)++;

    /* The 32-bit shift moves sign bits of the lower halfword into the low
     * byte of the upper halfword; the mask clears them. */
    in1 = in1 & mask;
    in2 = in2 & mask;

    /* pack two 16 bit values */
    out1 = __PKHTB(in1, in2, 16);
    out2 = __PKHBT(in2, in1, 16);

    /* store two q15_t samples at a time to destination */
#ifndef ARM_MATH_BIG_ENDIAN
    _SIMD32_OFFSET(pDst + 2) = out1;
    _SIMD32_OFFSET(pDst) = out2;
#else
    _SIMD32_OFFSET(pDst) = out1;
    _SIMD32_OFFSET(pDst + 2) = out2;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* ---- second group of four samples ---- */
#ifdef CCS
    in1 = __SXTB16(in, 8);
    in2 = __SXTB16(in, 0);
#else
    in1 = __SXTB16(__ROR(in, 8));
    in2 = __SXTB16(in);
#endif

    in1 = in1 << 8u;
    in2 = in2 << 8u;

    /* Both pairs must be masked BEFORE packing.  Packing an unmasked in1
     * would put 0xFF in the low byte of the last sample of the group
     * whenever the sample two positions earlier is negative. */
    in1 = in1 & mask;
    in2 = in2 & mask;

    /* pack two 16 bit values */
    out1 = __PKHTB(in1, in2, 16);
    out2 = __PKHBT(in2, in1, 16);

    /* store two q15_t samples at a time to destination */
#ifndef ARM_MATH_BIG_ENDIAN
    _SIMD32_OFFSET(pDst + 6) = out1;
    _SIMD32_OFFSET(pDst + 4) = out2;
#else
    _SIMD32_OFFSET(pDst + 4) = out1;
    _SIMD32_OFFSET(pDst + 6) = out2;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* increment destination pointer */
    pDst += 8u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */
    /* convert from q7 to q15 and then store the results in the destination buffer */
    *pDst++ = (q15_t) * pIn++ << 8;

    /* Decrement the loop counter */
    blkCnt--;
  }

}
Exemplo n.º 8
0
/**
 * @brief Sum of the squares of the elements of a Q7 vector.
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    the accumulated sum of squares is written here
 *
 * The unrolled loop handles eight samples per iteration using two
 * accumulators that are added together once the loop finishes.
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t power = 0;                               /* Accumulator for bytes 1/3 of each word; final total */
  q31_t powerAux = 0;                            /* Accumulator for bytes 0/2 of each word */
  q31_t word;                                    /* Four packed q7 samples */
  q31_t pairHi, pairLo;                          /* Bytes 1/3 and bytes 0/2 as halfword pairs */
  q7_t sample;                                   /* One input sample (scalar loop) */
  uint32_t remaining;                            /* Iterations left in the current loop */

  /* Unrolled part: eight samples (two words) per iteration. */
  for(remaining = blockSize >> 3u; remaining > 0u; remaining--)
  {
    /* ---- first word ---- */
    word = _SIMD32_OFFSET(pSrc);

#ifdef CCS
    pairHi = __SXTB16(word, 8);
    pairLo = __SXTB16(word, 0);
#else
    pairHi = __SXTB16(__ROR(word, 8));
    pairLo = __SXTB16(word);
#endif /* #ifdef CCS */

    /* Each dual multiply-accumulate squares and adds two samples. */
    power = __SMLAD(pairHi, pairHi, power);
    powerAux = __SMLAD(pairLo, pairLo, powerAux);

    /* ---- second word ---- */
    word = _SIMD32_OFFSET(pSrc + 4);

#ifdef CCS
    pairHi = __SXTB16(word, 8);
    pairLo = __SXTB16(word, 0);
#else
    pairHi = __SXTB16(__ROR(word, 8));
    pairLo = __SXTB16(word);
#endif /* #ifdef CCS */

    power = __SMLAD(pairHi, pairHi, power);
    powerAux = __SMLAD(pairLo, pairLo, powerAux);

    /* advance past the eight samples just processed */
    pSrc += 8u;
  }

  /* Merge the two partial sums. */
  power = power + powerAux;

  /* Scalar part: the 0 to 7 samples the unrolled loop did not cover. */
  for(remaining = blockSize % 0x8u; remaining > 0u; remaining--)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
    sample = *pSrc++;
    power += ((q15_t) sample * sample);
  }

  /* Store the result in 18.14 format  */
  *pResult = power;
}