/**
 * @brief Sum of squares of the elements of a Q7 vector.
 *
 * Computes pSrc[0]^2 + pSrc[1]^2 + ... + pSrc[blockSize-1]^2 and writes the
 * 32-bit sum to *pResult (the result is in 18.14 format).
 * When blockSize is 0 the stored result is 0.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    receives the sum of squares
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t sum = 0;                                 /* Running sum of squares */
  q31_t input1;                                  /* Two widened samples packed into one word */
  q15_t in1, in2;                                /* Samples widened from q7 to q15 */
  q7_t in;                                       /* Single sample used by the tail loop */
  uint32_t blkCnt;                               /* Loop counter */

  /* Main loop: four samples per iteration, handled as two packed pairs. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* Widen two samples and pack them as {low = in1, high = in2}.
     * The packing is done on uint32_t: left-shifting a negative signed
     * value (any negative sample in in2) is undefined behaviour in C. */
    in1 = (q15_t) *pSrc++;
    in2 = (q15_t) *pSrc++;
    input1 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

    /* sum += in1 * in1 + in2 * in2 */
    sum = __SMLAD(input1, input1, sum);

    /* Next pair of samples, same treatment. */
    in1 = (q15_t) *pSrc++;
    in2 = (q15_t) *pSrc++;
    input1 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

    sum = __SMLAD(input1, input1, sum);

    blkCnt--;
  }

  /* Tail loop: the remaining 0 to 3 samples, one at a time. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    in = *pSrc++;
    sum += ((q15_t) in * in);

    blkCnt--;
  }

  /* Store the result in 18.14 format */
  *pResult = sum;
}
/**
 * @brief Fast Q15 biquad cascade filter, direct form I.
 *
 * Runs blockSize samples through S->numStages second-order stages.
 * The first stage reads from pSrc and writes to pDst; every later stage
 * reads and writes pDst in place.  Per stage, three 32-bit words of
 * coefficients are consumed from S->pCoeffs and two 32-bit words of state
 * are read from and written back to S->pState.
 *
 * The stage loop is a do/while, so at least one stage is always executed.
 *
 * NOTE(review): the closing brace of this function is not present in this
 * excerpt of the file; the code below is reproduced unchanged.
 *
 * @param[in]  S          points to the filter instance (numStages, pState,
 *                        pCoeffs and postShift are read)
 * @param[in]  pSrc       points to the block of input samples
 * @param[out] pDst       points to the block of output samples
 * @param[in]  blockSize  number of samples to process
 */
void arm_biquad_cascade_df1_fast_q15( const arm_biquad_casd_df1_inst_q15 * S, q15_t * pSrc, q15_t * pDst, uint32_t blockSize) {
  q15_t *pIn = pSrc; /* Source pointer */
  q15_t *pOut = pDst; /* Destination pointer */
  q31_t in; /* Temporary variable to hold input value(s) */
  q31_t out; /* Temporary variable to hold output value */
  q31_t b0; /* Packed word holding b0 and the following coefficient slot */
  q31_t b1, a1; /* Packed coefficient words: (b1, b2) and (a1, a2) */
  q31_t state_in, state_out; /* Packed filter state: x[n-1], x[n-2] and y[n-1], y[n-2] */
  q31_t acc0; /* Accumulator */
  int32_t shift = (int32_t) (15 - S->postShift); /* Right shift applied to the accumulator before saturation */
  q15_t *pState = S->pState; /* State pointer */
  q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  q31_t *pState_q31; /* 32-bit state pointer for SIMD implementation */
  uint32_t sample, stage = S->numStages; /* Sample and stage loop counters */

  do {
    /* Initialize state pointer of type q31 */
    pState_q31 = (q31_t *) (pState);

    /* Read the b0 and 0 coefficients using SIMD */
    b0 = *__SIMD32(pCoeffs)++;

    /* Read the b1 and b2 coefficients using SIMD */
    b1 = *__SIMD32(pCoeffs)++;

    /* Read the a1 and a2 coefficients using SIMD */
    a1 = *__SIMD32(pCoeffs)++;

    /* Read the input state values from the state buffer: x[n-1], x[n-2] */
    state_in = (q31_t) (*pState_q31++);

    /* Read the output state values from the state buffer: y[n-1], y[n-2] */
    state_out = (q31_t) (*pState_q31);

    /* Apply loop unrolling and compute 2 output values per iteration.
     * Each output is accumulated in acc0 as:
     *
     *   acc0 = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */
    sample = blockSize >> 1u;

    /* First part of the processing with loop unrolling. Compute 2 outputs at a time.
     ** The `if` below computes the remaining sample when blockSize is odd.
     */
    while(sample > 0u) {
      /* Read two input samples as one 32-bit word */
      in = *__SIMD32(pIn)++;

      /* ---- first sample of the pair ---- */

      /* out = b0 * x[n] + 0 * 0 */
      out = __SMUAD(b0, in);

      /* acc0 = b1 * x[n-1] + b2 * x[n-2] + out */
      acc0 = __SMLAD(b1, state_in, out);

      /* acc0 += a1 * y[n-1] + a2 * y[n-2] */
      acc0 = __SMLAD(a1, state_out, acc0);

      /* Shift the accumulator right by `shift` and saturate to 16 bits */
      out = __SSAT((acc0 >> shift), 16);

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as: */
      /* Xn2 = Xn1 */
      /* Xn1 = Xn */
      /* Yn2 = Yn1 */
      /* Yn1 = acc0 */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
#else
      state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
      state_out = __PKHBT(state_out >> 16, (out), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* ---- second sample of the pair ---- */

      /* out = b0 * x[n] + 0 * 0; the exchanged multiply pairs b0 with the
       * other halfword of `in` than the first sample used */
      out = __SMUADX(b0, in);

      /* acc0 = b1 * x[n-1] + b2 * x[n-2] + out */
      acc0 = __SMLAD(b1, state_in, out);

      /* acc0 += a1 * y[n-1] + a2 * y[n-2] */
      acc0 = __SMLAD(a1, state_out, acc0);

      /* Shift the accumulator right by `shift` and saturate to 16 bits */
      out = __SSAT((acc0 >> shift), 16);

      /* Store both outputs of this iteration in the destination buffer:
       * the first one is taken from state_out, the second one is `out`. */
#ifndef ARM_MATH_BIG_ENDIAN
      *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
#else
      *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as: */
      /* Xn2 = Xn1 */
      /* Xn1 = Xn */
      /* Yn2 = Yn1 */
      /* Yn1 = acc0 */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in >> 16, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
#else
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* Decrement the loop counter */
      sample--;
    }

    /* If the blockSize is not a multiple of 2, compute the one remaining output sample here.
     ** No loop unrolling is used. */
    if((blockSize & 0x1u) != 0u) {
      /* Read a single q15 input sample (widened to 32 bits by the assignment) */
      in = *pIn++;

      /* out = b0 * x[n] + 0 * 0 */
#ifndef ARM_MATH_BIG_ENDIAN
      out = __SMUAD(b0, in);
#else
      out = __SMUADX(b0, in);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* acc0 = b1 * x[n-1] + b2 * x[n-2] + out */
      acc0 = __SMLAD(b1, state_in, out);

      /* acc0 += a1 * y[n-1] + a2 * y[n-2] */
      acc0 = __SMLAD(a1, state_out, acc0);

      /* Shift the accumulator right by `shift` and saturate to 16 bits */
      out = __SSAT((acc0 >> shift), 16);

      /* Store the output in the destination buffer. */
      *pOut++ = (q15_t) out;

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as: */
      /* Xn2 = Xn1 */
      /* Xn1 = Xn */
      /* Yn2 = Yn1 */
      /* Yn1 = acc0 */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
#else
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
    }

    /* The first stage goes from the input buffer to the output buffer. */
    /* Subsequent (numStages - 1) occur in-place in the output buffer */
    pIn = pDst;

    /* Reset the output pointer */
    pOut = pDst;

    /* Store the updated state variables back into the state array;
     * this also advances pState to the next stage's state words */
    *__SIMD32(pState)++ = state_in;
    *__SIMD32(pState)++ = state_out;

    /* Decrement the loop counter */
    stage--;
  } while(stage > 0u);
/**
 * @brief Dot product of two Q7 vectors.
 *
 * Computes A[0]*B[0] + A[1]*B[1] + ... + A[blockSize-1]*B[blockSize-1]
 * and writes the 32-bit sum to *result (the result is in 18.14 format).
 * When blockSize is 0 the stored result is 0.
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[in]  blockSize  number of samples in each vector
 * @param[out] result     receives the dot product
 */
void arm_dot_prod_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  uint32_t blockSize,
  q31_t * result)
{
  q31_t input1, input2;                          /* Packed pairs of widened samples */
  q15_t in1, in2;                                /* Samples widened from q7 to q15 */
  q31_t sum = 0;                                 /* Running dot product */
  uint32_t blkCnt;                               /* Loop counter */

  /* Main loop: four sample pairs per iteration, two at a time through SMLAD. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* Widen two samples of A and pack them as {low = in1, high = in2}.
     * The packing is done on uint32_t: left-shifting a negative signed
     * value is undefined behaviour in C. */
    in1 = (q15_t) *pSrcA++;
    in2 = (q15_t) *pSrcA++;
    input1 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

    /* Same for two samples of B. */
    in1 = (q15_t) *pSrcB++;
    in2 = (q15_t) *pSrcB++;
    input2 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

    /* sum += A[i] * B[i] + A[i+1] * B[i+1] (dual 16-bit MAC, SMLAD) */
    sum = __SMLAD(input1, input2, sum);

    /* Second pair of this iteration. */
    in1 = (q15_t) *pSrcA++;
    in2 = (q15_t) *pSrcA++;
    input1 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

    in1 = (q15_t) *pSrcB++;
    in2 = (q15_t) *pSrcB++;
    input2 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

    sum = __SMLAD(input1, input2, sum);

    blkCnt--;
  }

  /* Tail loop: the remaining 0 to 3 sample pairs, one at a time. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* A plain multiply-accumulate is required here.  Feeding the
     * sign-extended q7 values straight into SMLAD would also multiply the
     * upper halfwords, which are both 0xFFFF (-1) when both samples are
     * negative, adding a spurious +1 to the sum for every such pair. */
    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);

    blkCnt--;
  }

  /* Store the result in the destination buffer in 18.14 format */
  *result = sum;
}
/**
 * @brief Dot product of two Q7 vectors.
 *
 * Computes A[0]*B[0] + A[1]*B[1] + ... + A[blockSize-1]*B[blockSize-1]
 * and writes the 32-bit sum to *result (the result is in 18.14 format).
 * When blockSize is 0 the stored result is 0.
 *
 * Unless ARM_MATH_CM0_FAMILY is defined, samples are first processed four
 * at a time using 32-bit loads and dual 16-bit multiply-accumulates; the
 * remaining samples (or all of them on Cortex-M0) go through a scalar loop.
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[in]  blockSize  number of samples in each vector
 * @param[out] result     receives the dot product
 */
void arm_dot_prod_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  uint32_t blockSize,
  q31_t * result)
{
  uint32_t blkCnt;                               /* Loop counter */
  q31_t sum = 0;                                 /* Running dot product */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t input1, input2;                          /* Four packed q7 samples of A / B */
  q31_t inA1, inA2, inB1, inB2;                  /* Pairs of samples widened to q15 */

  /* Main loop: four sample pairs per iteration. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* Read 4 samples at a time from each source */
    input1 = *__SIMD32(pSrcA)++;
    input2 = *__SIMD32(pSrcB)++;

    /* Sign-extend bytes 1 and 3 of each word into two q15 lanes ... */
    inA1 = __SXTB16(__ROR(input1, 8));
    inB1 = __SXTB16(__ROR(input2, 8));

    /* ... and bytes 0 and 2 into two more q15 lanes */
    inA2 = __SXTB16(input1);
    inB2 = __SXTB16(input2);

    /* Multiply and accumulate two samples at a time */
    sum = __SMLAD(inA1, inB1, sum);
    sum = __SMLAD(inA2, inB2, sum);

    blkCnt--;
  }

  /* The scalar loop below handles the remaining 0 to 3 sample pairs. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* The scalar loop below handles every sample pair. */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* A plain multiply-accumulate is required here.  Feeding the
     * sign-extended q7 values straight into SMLAD would also multiply the
     * upper halfwords, which are both 0xFFFF (-1) when both samples are
     * negative, adding a spurious +1 to the sum for every such pair. */
    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);

    blkCnt--;
  }

  /* Store the result in the destination buffer in 18.14 format */
  *result = sum;
}
/**
 * @brief Sum of squares of the elements of a Q7 vector.
 *
 * Accumulates pSrc[i] * pSrc[i] over all blockSize samples and stores the
 * 32-bit total in *pResult (the result is in 18.14 format).
 *
 * Unless ARM_MATH_CM0 is defined, groups of four samples are consumed with
 * one 32-bit load and two dual multiply-accumulates; leftover samples (or
 * all samples on Cortex-M0) are squared one at a time.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    receives the sum of squares
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t energy = 0;                              /* Running sum of squares */
  q7_t sample;                                   /* One sample for the scalar loop */
  uint32_t remaining;                            /* Iterations still to run */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3 path */

  q31_t word;                                    /* Four q7 samples in one word */
  q31_t oddPair, evenPair;                       /* Bytes 1,3 and bytes 0,2 widened to q15 */

  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch four samples at once and split them into two q15 pairs */
    word = *__SIMD32(pSrc)++;
    oddPair = __SXTB16(__ROR(word, 8));
    evenPair = __SXTB16(word);

    /* Each SMLAD adds the squares of both halfwords to the total */
    energy = __SMLAD(oddPair, oddPair, energy);
    energy = __SMLAD(evenPair, evenPair, energy);
  }

  /* 0 to 3 samples are left for the scalar loop */
  remaining = blockSize & 0x3u;

#else

  /* Cortex-M0 path: every sample goes through the scalar loop */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for (; remaining > 0u; remaining--)
  {
    sample = *pSrc++;
    energy += ((q15_t) sample * sample);
  }

  /* Result is in 18.14 format */
  *pResult = energy;
}
/**
\brief Test case: TC_CoreSimd_ParMul16
\details
- Check Parallel 16-bit multiplication:
  __SMLAD
  __SMLADX
  __SMLALD
  __SMLALDX
  __SMLSD
  __SMLSDX
  __SMLSLD
  __SMLSLDX
  __SMUAD
  __SMUADX
  __SMUSD
  __SMUSDX
- Each intrinsic is called with fixed operands and the result is compared
  with a precomputed constant.
- The body is compiled only when __ARM_ARCH_7EM__ or __ARM_FEATURE_DSP is
  defined as 1; otherwise the test is empty.
*/
void TC_CoreSimd_ParMul16 (void)
{
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  volatile int32_t lhs, rhs, addend32;   /* 32-bit operands */
  volatile int32_t got32;                /* 32-bit result   */
  volatile int64_t addend64;             /* 64-bit accumulator operand */
  volatile int64_t got64;                /* 64-bit result   */

  /* --- __SMLAD: 3*5 + 2*4 + acc ----------------------------------- */
  lhs      = 0x00030002;
  rhs      = 0x00050004;
  addend32 = 0x20000000;
  got32    = __SMLAD(lhs, rhs, addend32);
  ASSERT_TRUE(got32 == 0x20000017);

  /* --- __SMLADX: 3*4 + 2*5 + acc ---------------------------------- */
  lhs      = 0x00030002;
  rhs      = 0x00050004;
  addend32 = 0x00000800;
  got32    = __SMLADX(lhs, rhs, addend32);
  ASSERT_TRUE(got32 == 0x00000816);

  /* --- __SMLALD: 3*5 + 2*4 + 64-bit acc --------------------------- */
  lhs      = 0x00030002;
  rhs      = 0x00050004;
  addend64 = 0x0000000200000000LL;
  got64    = __SMLALD(lhs, rhs, addend64);
  ASSERT_TRUE(got64 == 0x0000000200000017LL);

  /* --- __SMLALDX: 3*4 + 2*5 + 64-bit acc -------------------------- */
  lhs      = 0x00030002;
  rhs      = 0x00050004;
  addend64 = 0x0000000200000000LL;
  got64    = __SMLALDX(lhs, rhs, addend64);
  ASSERT_TRUE(got64 == 0x0000000200000016LL);

  /* --- __SMLSD: 6*4 - 3*5 + acc ----------------------------------- */
  lhs      = 0x00030006;
  rhs      = 0x00050004;
  addend32 = 0x00000800;
  got32    = __SMLSD(lhs, rhs, addend32);
  ASSERT_TRUE(got32 == 0x00000809);

  /* --- __SMLSDX: 2*5 - 3*4 + acc ---------------------------------- */
  lhs      = 0x00030002;
  rhs      = 0x00050004;
  addend32 = 0x00000800;
  got32    = __SMLSDX(lhs, rhs, addend32);
  ASSERT_TRUE(got32 == 0x000007FE);

  /* --- __SMLSLD: 6*4 - 3*5 + 64-bit acc --------------------------- */
  lhs      = 0x00030006;
  rhs      = 0x00050004;
  addend64 = 0x0000000200000000LL;
  got64    = __SMLSLD(lhs, rhs, addend64);
  ASSERT_TRUE(got64 == 0x0000000200000009LL);

  /* --- __SMLSLDX: 6*5 - 3*4 + 64-bit acc -------------------------- */
  lhs      = 0x00030006;
  rhs      = 0x00050004;
  addend64 = 0x0000000200000000LL;
  got64    = __SMLSLDX(lhs, rhs, addend64);
  ASSERT_TRUE(got64 == 0x0000000200000012LL);

  /* --- __SMUAD: hi*hi + lo*lo ------------------------------------- */
  /* 3*4 + 1*2 = 14 */
  lhs   = 0x00030001;
  rhs   = 0x00040002;
  got32 = __SMUAD(lhs,rhs);
  ASSERT_TRUE(got32 == 0x0000000E);

  /* (-3)*4 + (-1)*2 = -14 */
  lhs   = (int32_t)0xFFFDFFFF;
  rhs   = (int32_t)0x00040002;
  got32 = __SMUAD(lhs,rhs);
  ASSERT_TRUE(got32 == (int32_t)0xFFFFFFF2);

  /* --- __SMUADX: hi*lo + lo*hi ------------------------------------ */
  /* 3*2 + 1*4 = 10 */
  lhs   = 0x00030001;
  rhs   = 0x00040002;
  got32 = __SMUADX(lhs,rhs);
  ASSERT_TRUE(got32 == 0x0000000A);

  /* (-3)*2 + (-1)*4 = -10 */
  lhs   = (int32_t)0xFFFDFFFF;
  rhs   = (int32_t)0x00040002;
  got32 = __SMUADX(lhs,rhs);
  ASSERT_TRUE(got32 == (int32_t)0xFFFFFFF6);

  /* --- __SMUSD: lo*lo - hi*hi ------------------------------------- */
  /* 1*2 - 3*4 = -10 */
  lhs   = (int32_t)0x00030001;
  rhs   = (int32_t)0x00040002;
  got32 = __SMUSD(lhs,rhs);
  ASSERT_TRUE(got32 == (int32_t)0xFFFFFFF6);

  /* (-1)*2 - (-3)*4 = 10 */
  lhs   = (int32_t)0xFFFDFFFF;
  rhs   = (int32_t)0x00040002;
  got32 = __SMUSD(lhs,rhs);
  ASSERT_TRUE(got32 == 0x0000000A);

  /* --- __SMUSDX: lo*hi - hi*lo ------------------------------------ */
  /* 1*4 - 3*2 = -2 */
  lhs   = 0x00030001;
  rhs   = 0x00040002;
  got32 = __SMUSDX(lhs,rhs);
  ASSERT_TRUE(got32 == (int32_t)0xFFFFFFFE);

  /* (-1)*4 - (-3)*2 = 2 */
  lhs   = (int32_t)0xFFFDFFFF;
  rhs   = (int32_t)0x00040002;
  got32 = __SMUSDX(lhs,rhs);
  ASSERT_TRUE(got32 == (int32_t)0x00000002);
#endif
}
/**
 * @brief Q7 FIR filter.
 *
 * Appends the blockSize new input samples after the (numTaps - 1) history
 * samples already held in S->pState, then produces one output per input
 * sample by multiplying numTaps consecutive state samples with the
 * coefficient array in S->pCoeffs.  Each accumulator is shifted right by
 * 7 bits and saturated to 8 bits before being stored.  Finally the last
 * (numTaps - 1) samples are moved to the start of the state buffer for the
 * next call.
 *
 * The code writes blockSize samples starting at pState + (numTaps - 1), so
 * the state buffer must hold at least numTaps + blockSize - 1 samples.
 *
 * @param[in]  S          points to the FIR instance (numTaps, pState and
 *                        pCoeffs are read; the state buffer is updated)
 * @param[in]  pSrc       points to the block of input samples
 * @param[out] pDst       points to the block of output samples
 * @param[in]  blockSize  number of samples to process
 */
void arm_fir_q7(
  const arm_fir_instance_q7 * S,
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
  uint32_t i, blkCnt;                            /* Loop counters */
  q7_t *pState = S->pState;                      /* State pointer */
  q7_t *pCoeffs = S->pCoeffs;                    /* Coefficient pointer */
  q7_t *px, *pb;                                 /* Temporary pointers to state and coeff */
  q31_t acc = 0;                                 /* Accumulator */
  q31_t input1, input2;                          /* Packed pairs of widened samples */
  q15_t in1, in2;                                /* Samples widened from q7 to q15 */
  q7_t *pStateCurnt;                             /* Points to the current sample of the state */

  /* S->pState holds the previous (numTaps - 1) samples; new input is
   * written right after them. */
  pStateCurnt = S->pState + (numTaps - 1u);

  /* Copy the new input into the state buffer, four q7 samples per 32-bit move. */
  i = blockSize >> 2u;

  while(i > 0u)
  {
    *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
    i--;
  }

  /* Copy the remaining 0 to 3 samples one at a time. */
  i = blockSize % 0x4u;

  while(i > 0u)
  {
    *pStateCurnt++ = *pSrc++;
    i--;
  }

  /* Produce one output sample per iteration. */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* Set accumulator to zero */
    acc = 0;

    /* Start of the state window and of the coefficients for this output */
    px = pState;
    pb = pCoeffs;

    /* Tap loop unrolled by 4: two dual multiply-accumulates per iteration. */
    i = numTaps >> 2u;

    while(i > 0u)
    {
      /* Widen two state samples and pack them as {low = in1, high = in2}.
       * The packing is done on uint32_t: left-shifting a negative signed
       * value is undefined behaviour in C. */
      in1 = (q15_t) *px++;
      in2 = (q15_t) *px++;
      input1 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

      /* Same for two coefficients. */
      in1 = (q15_t) *pb++;
      in2 = (q15_t) *pb++;
      input2 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

      /* acc += x0 * c0 + x1 * c1 (dual 16-bit MAC, SMLAD) */
      acc = __SMLAD(input1, input2, acc);

      /* Second pair of this iteration. */
      in1 = (q15_t) *px++;
      in2 = (q15_t) *px++;
      input1 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

      in1 = (q15_t) *pb++;
      in2 = (q15_t) *pb++;
      input2 = (q31_t) (((uint32_t) in1 & 0x0000FFFFu) | ((uint32_t) in2 << 16));

      acc = __SMLAD(input1, input2, acc);

      /* Decrement the tap loop counter */
      i--;
    }

    /* If the filter length is not a multiple of 4, compute the remaining taps. */
    i = numTaps % 0x4u;

    while(i > 0u)
    {
      /* A plain multiply-accumulate is required here.  Feeding the
       * sign-extended q7 values straight into SMLAD would also multiply the
       * upper halfwords, which are both 0xFFFF (-1) when the sample and the
       * coefficient are both negative, adding a spurious +1 per such tap. */
      acc += (q31_t) ((q15_t) *px++ * *pb++);
      i--;
    }

    /* Scale down by 7 bits and saturate to 8 bits */
    acc = __SSAT((acc >> 7), 8);

    /* Store filter output */
    *pDst++ = (q7_t) (acc);

    /* Advance the state window by one sample for the next output */
    pState = pState + 1;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Processing is complete.
   ** Now copy the last numTaps - 1 samples to the start of the state buffer.
   ** This prepares the state buffer for the next function call. */

  /* Points to the start of the state buffer */
  pStateCurnt = S->pState;

  /* Copy four samples at a time using 32-bit moves */
  i = (numTaps - 1u) >> 2u;

  while(i > 0u)
  {
    *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
    i--;
  }

  /* Copy the remaining 0 to 3 samples one at a time */
  i = (numTaps - 1u) % 0x4u;

  while(i > 0u)
  {
    *pStateCurnt++ = *pState++;
    i--;
  }
}
/**
 * @brief Sum of squares of the elements of a Q7 vector.
 *
 * Accumulates pSrc[i] * pSrc[i] over all blockSize samples and stores the
 * 32-bit total in *pResult (the result is in 18.14 format).
 *
 * Eight samples are consumed per main-loop iteration using two 32-bit
 * loads; two independent accumulators are kept and summed afterwards.
 * The remaining 0 to 7 samples are squared one at a time.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    receives the sum of squares
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t accOdd = 0;                              /* Squares of bytes 1,3 of each word; final total */
  q31_t accEven = 0;                             /* Squares of bytes 0,2 of each word */
  q31_t word;                                    /* Four q7 samples in one word */
  q31_t oddPair, evenPair;                       /* Bytes 1,3 and bytes 0,2 widened to q15 */
  q7_t sample;                                   /* One sample for the scalar loop */
  uint32_t remaining;                            /* Iterations still to run */

  for (remaining = blockSize >> 3u; remaining > 0u; remaining--)
  {
    /* ---- samples 0..3 of this group ---- */
    word = _SIMD32_OFFSET(pSrc);

#ifdef CCS
    oddPair = __SXTB16(word, 8);
    evenPair = __SXTB16(word, 0);
#else
    oddPair = __SXTB16(__ROR(word, 8));
    evenPair = __SXTB16(word);
#endif /* #ifdef CCS */

    /* Each SMLAD adds the squares of both halfwords to its accumulator */
    accOdd = __SMLAD(oddPair, oddPair, accOdd);
    accEven = __SMLAD(evenPair, evenPair, accEven);

    /* ---- samples 4..7 of this group ---- */
    word = _SIMD32_OFFSET(pSrc + 4);

#ifdef CCS
    oddPair = __SXTB16(word, 8);
    evenPair = __SXTB16(word, 0);
#else
    oddPair = __SXTB16(__ROR(word, 8));
    evenPair = __SXTB16(word);
#endif /* #ifdef CCS */

    accOdd = __SMLAD(oddPair, oddPair, accOdd);
    accEven = __SMLAD(evenPair, evenPair, accEven);

    /* Step past the eight samples just consumed */
    pSrc += 8u;
  }

  /* Merge the two partial sums */
  accOdd = accOdd + accEven;

  /* Scalar loop for the 0 to 7 samples the unrolled loop did not cover */
  for (remaining = blockSize & 0x7u; remaining > 0u; remaining--)
  {
    sample = *pSrc++;
    accOdd += ((q15_t) sample * sample);
  }

  /* Result is in 18.14 format */
  *pResult = accOdd;
}