/** \brief Test case: TC_CoreSimd_Part9
    \details
    - Check Packing Halfword: __PKHBT __PKHTB
*/
void TC_CoreSimd_Pack16 (void)
{
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) )
  volatile uint32_t opA;                 /* supplies the bottom halfword  */
  volatile uint32_t opB;                 /* supplies the top halfword     */
  volatile uint32_t packed;              /* packed result under test      */

  /* --- __PKHBT: result = (opB << sh)[31:16] : opA[15:0] ---------- */
  opA = 0x00000111;
  opB = 0x22200000;
  packed = __PKHBT(opA, opB, 0);
  ASSERT_TRUE(packed == 0x22200111);

  opA = 0x00000111;
  opB = 0x22200000;
  packed = __PKHBT(opA, opB, 4);         /* 0x22200000 << 4 = 0x22000000 */
  ASSERT_TRUE(packed == 0x22000111);

  /* --- __PKHTB: result = opA[31:16] : (opB >> sh)[15:0] ---------- */
  opA = 0x11100000;
  opB = 0x00000222;
  packed = __PKHTB(opA, opB, 0);
  ASSERT_TRUE(packed == 0x11100222);

  opA = 0x11100000;
  opB = 0x00000222;
  packed = __PKHTB(opA, opB, 4);         /* 0x00000222 >> 4 = 0x00000022 */
  ASSERT_TRUE(packed == 0x11100022);
#endif
}
/**
 * @brief  Initialization function for the Q15 PID controller.
 * @param[in,out] S               points to an instance of the Q15 PID structure
 * @param[in]     resetStateFlag  non-zero: clear the 3-sample state buffer
 *
 * Derives the working coefficients from the user-supplied gains:
 *   A0 = Kp + Ki + Kd,  A1 = -(Kp + 2*Kd),  A2 = Kd
 * On DSP-capable cores A1 and A2 are packed into one 32-bit word so the
 * filter kernel can use SIMD multiply-accumulate.
 */
void arm_pid_init_q15( arm_pid_instance_q15 * S, int32_t resetStateFlag)
{

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /* Derived coefficient A0 = Kp + Ki + Kd (saturating halfword adds) */
  S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);

  /* Derived coefficients A1 = -(2*Kd + Kp) and A2 = Kd packed into one
     32-bit word; halfword order depends on target endianness. */
#ifndef ARM_MATH_BIG_ENDIAN
  S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);
#else
  S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

  /* Check whether state needs reset or not */
  if(resetStateFlag)
  {
    /* Clear the state buffer.  The size will be always 3 samples */
    memset(S->state, 0, 3u * sizeof(q15_t));
  }

#else

  /* Run the below code for Cortex-M0 */

  q31_t temp;                            /* to store the sum */

  /* Derived coefficient A0 (computed in 32 bits, then saturated to Q15) */
  temp = S->Kp + S->Ki + S->Kd;
  S->A0 = (q15_t) __SSAT(temp, 16);

  /* Derived coefficients; A1/A2 kept as separate scalars on M0 */
  temp = -(S->Kd + S->Kd + S->Kp);
  S->A1 = (q15_t) __SSAT(temp, 16);
  S->A2 = S->Kd;

  /* Check whether state needs reset or not */
  if(resetStateFlag)
  {
    /* Clear the state buffer.  The size will be always 3 samples */
    memset(S->state, 0, 3u * sizeof(q15_t));
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}
/* Decimate-by-2 third-order CIC filter over complex<int8_t> input, producing
 * complex<int16_t> output.  State (_i1_i0/_q1_q0) carries the last input pair
 * across calls.  Cycle counts per line are noted in the trailing comments. */
buffer_c16_t Complex8DecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Decimates by two using a non-recursive third-order CIC filter. */

	/* CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 */
	uint32_t i1_i0 = _i1_i0;
	uint32_t q1_q0 = _q1_q0;

	/* 3:1 coefficient pair. Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	constexpr uint32_t k_3_1 = 0x00030001 * scale_factor;
	/* Process four int8 values (two complex samples) per 32-bit word. */
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	while(src_p < src_end) {
		const uint32_t q3_i3_q2_i2 = *(src_p++);						// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		const uint32_t d_i0_partial = __SMUAD(k_3_1, i1_i0);			// 1: = 3 * i1 + 1 * i0
		const uint32_t i3_i2 = __SXTB16(q3_i3_q2_i2, 0);				// 1: (q3_i3_q2_i2 ror 0)[23:16]:(q3_i3_q2_i2 ror 0)[7:0]
		const uint32_t d_i0 = __SMLADX(k_3_1, i3_i2, d_i0_partial);		// 1: + 3 * i2 + 1 * i3

		const uint32_t d_q0_partial = __SMUAD(k_3_1, q1_q0);			// 1: = 3 * q1 * 1 * q0
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2, 8);				// 1: (q3_i3_q2_i2 ror 8)[23:16]:(q3_i3_q2_i2 ror 8)[7:0]
		const uint32_t d_q0 = __SMLADX(k_3_1, q3_q2, d_q0_partial);		// 1: + 3 * q2 + 1 * q3

		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		const uint32_t d_i1_partial = __SMUAD(k_3_1, i3_i2);			// 1: = 3 * i3 + 1 * i2
		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4, 0);				// 1: (q5_i5_q4_i4 ror 0)[23:16]:(q5_i5_q4_i4 ror 0)[7:0]
		const uint32_t d_i1 = __SMLADX(k_3_1, i5_i4, d_i1_partial);		// 1: + 1 * i5 + 3 * i4

		const uint32_t d_q1_partial = __SMUAD(k_3_1, q3_q2);			// 1: = 3 * q3 * 1 * q2
		const uint32_t q5_q4 = __SXTB16(q5_i5_q4_i4, 8);				// 1: (q5_i5_q4_i4 ror 8)[23:16]:(q5_i5_q4_i4 ror 8)[7:0]
		const uint32_t d_q1 = __SMLADX(k_3_1, q5_q4, d_q1_partial);		// 1: + 1 * q5 + 3 * q4

		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		*(dst_p++) = d_q0_i0;											// 3
		*(dst_p++) = d_q1_i1;

		/* Carry last input pair into the next iteration's history taps. */
		i1_i0 = i5_i4;
		q1_q0 = q5_q4;
	}
	_i1_i0 = i1_i0;
	_q1_q0 = q1_q0;

	/* Output: half the sample count, half the sampling rate. */
	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
/* FM discriminator: for each complex input sample, multiplies by the
 * conjugate of the previous sample and converts the resulting phase delta
 * to a saturated 16-bit value.  Output has the same sample count/rate as
 * the input.  `z_` holds the last sample across calls. */
buffer_s16_t FM::execute(
	const buffer_c16_t& src,
	const buffer_s16_t& dst
) {
	auto z = z_;

	/* NOTE(review): src_p is declared const but is advanced through the
	 * __SIMD32 macro, which presumably takes the address of its argument
	 * and reinterprets it — confirm against the macro definition. */
	const auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;

	while(src_p < src_end) {
		/* Read two packed complex samples per iteration. */
		const auto s0 = *__SIMD32(src_p)++;
		const auto s1 = *__SIMD32(src_p)++;
		/* s * conj(previous) — angle of the product is the phase step. */
		const auto t0 = multiply_conjugate_s16_s32(s0, z);
		const auto t1 = multiply_conjugate_s16_s32(s1, s0);
		z = s1;
		/* Scale approximated angle by ks16 and saturate to int16 range. */
		const int32_t theta0_int = angle_approx_0deg27(t0) * ks16;
		const int32_t theta0_sat = __SSAT(theta0_int, 16);
		const int32_t theta1_int = angle_approx_0deg27(t1) * ks16;
		const int32_t theta1_sat = __SSAT(theta1_int, 16);
		/* Pack both demodulated values into one 32-bit store. */
		*__SIMD32(dst_p)++ = __PKHBT(
			theta0_sat,
			theta1_sat,
			16
		);
	}
	z_ = z;

	return { dst.p, src.count, src.sampling_rate };
}
/**
 * @brief  Negates each element of a Q15 vector, saturating via __SSAT
 *         (so -(-32768) becomes +32767 rather than wrapping).
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 *
 * NOTE(review): unlike the endian-guarded build of this function elsewhere
 * in the library, this variant always packs the first sample into the low
 * halfword — assumes a little-endian target; confirm.
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  q15_t in1, in2;                                /* Temporary variables */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Read two inputs */
    in1 = *pSrc++;
    in2 = *pSrc++;

    /* Negate and then store the results in the destination buffer by packing
       two saturated halfwords into one 32-bit write. */
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16);

    in1 = *pSrc++;
    in2 = *pSrc++;

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Negate and then store the result in the destination buffer. */
    *pDst++ = __SSAT(-*pSrc++, 16);

    /* Decrement the loop counter */
    blkCnt--;
  }
}
/**
 * @brief  Fills a constant value into every element of a Q15 vector.
 * @param[in]  value      value to write
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples to fill
 */
void arm_fill_q15(
  q15_t value,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M3/M4 path: duplicate the halfword into one 32-bit word so a
     single store writes two samples; four samples per loop iteration. */
  q31_t pattern;

  pattern = __PKHBT(value, value, 16u);

  count = blockSize >> 2u;

  while(count > 0u)
  {
    /* C = value */
    *__SIMD32(pDst)++ = pattern;
    *__SIMD32(pDst)++ = pattern;

    count--;
  }

  /* Tail: 0..3 samples left over by the unrolled loop. */
  count = blockSize % 0x4u;

#else

  /* Cortex-M0 path: no SIMD, write every sample individually. */
  count = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(count > 0u)
  {
    /* C = value */
    *pDst++ = value;

    count--;
  }
}
/**
 * @brief  Fills a constant value into a Q15 vector.
 * @param[in]  value      value to fill
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 */
void arm_fill_q15(
  q15_t value,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* Loop counter */

#if defined (ARM_MATH_LOOPUNROLL)
  /* Duplicate the halfword so one 32-bit store covers two samples. */
  q31_t pattern;

  pattern = __PKHBT(value, value, 16U);

  /* Unrolled loop: four outputs per iteration. */
  remaining = blockSize >> 2U;

  while (remaining > 0U)
  {
    /* C = value */
    /* fill 2 times 2 samples at a time */
    write_q15x2_ia (&pDst, pattern);
    write_q15x2_ia (&pDst, pattern);

    remaining--;
  }

  /* Tail: 0..3 remaining outputs. */
  remaining = blockSize % 0x4U;

#else

  /* No unrolling: process every sample in the scalar loop below. */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (remaining > 0U)
  {
    /* C = value */
    *pDst++ = value;

    remaining--;
  }
}
/**
 * @brief  Fills a constant value into a Q15 vector, eight samples per
 *         loop iteration using word-wide stores.
 * @param[in]  value      value to fill
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 */
void arm_fill_q15(
  q15_t value,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* loop counter */
  q31_t pattern;                                 /* value duplicated into both halfwords */

  /* Unrolled loop: eight outputs per iteration. */
  count = blockSize >> 3u;

  /* Duplicate the Q15 value so each 32-bit store writes two samples. */
  pattern = __PKHBT(value, value, 16u);

  while(count > 0u)
  {
    /* C = value */
    _SIMD32_OFFSET(pDst)     = pattern;
    _SIMD32_OFFSET(pDst + 2) = pattern;
    _SIMD32_OFFSET(pDst + 4) = pattern;
    _SIMD32_OFFSET(pDst + 6) = pattern;

    pDst += 8u;

    count--;
  }

  /* Tail: 0..7 remaining samples, written individually. */
  count = blockSize % 0x8u;

  while(count > 0u)
  {
    /* C = value */
    *pDst++ = value;

    count--;
  }
}
/**
 * @brief  Fast Q15 biquad cascade (direct form I) filter kernel.
 * @param[in]  S          points to an instance of the Q15 biquad structure
 *                        (numStages, packed pCoeffs, pState, postShift)
 * @param[in]  pSrc       points to the block of input data
 * @param[out] pDst       points to the block of output data
 * @param[in]  blockSize  number of samples to process per call
 *
 * Each stage keeps x[n-1],x[n-2] packed in state_in and y[n-1],y[n-2]
 * packed in state_out so SMLAD can do both MACs at once.  Halfword order
 * inside those words (and thus the __PKHBT update pattern) depends on
 * ARM_MATH_BIG_ENDIAN.
 */
void arm_biquad_cascade_df1_fast_q15(
  const arm_biquad_casd_df1_inst_q15 * S,
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q15_t *pIn = pSrc;                             /* Source pointer */
  q15_t *pOut = pDst;                            /* Destination pointer */
  q31_t in;                                      /* Temporary variable to hold input value */
  q31_t out;                                     /* Temporary variable to hold output value */
  q31_t b0;                                      /* Temporary variable to hold bo value */
  q31_t b1, a1;                                  /* Filter coefficients */
  q31_t state_in, state_out;                     /* Filter state variables */
  q31_t acc0;                                    /* Accumulator */
  int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
  q15_t *pState = S->pState;                     /* State pointer */
  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  q31_t *pState_q31;                             /* 32-bit state pointer for SIMD implementation */
  uint32_t sample, stage = S->numStages;         /* Stage loop counter */

  do
  {
    /* Initialize state pointer of type q31 */
    pState_q31 = (q31_t *) (pState);

    /* Read the b0 and 0 coefficients using SIMD */
    b0 = *__SIMD32(pCoeffs)++;

    /* Read the b1 and b2 coefficients using SIMD */
    b1 = *__SIMD32(pCoeffs)++;

    /* Read the a1 and a2 coefficients using SIMD */
    a1 = *__SIMD32(pCoeffs)++;

    /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
    state_in = (q31_t) (*pState_q31++);

    /* Read the output state values from the state buffer:  y[n-1], y[n-2] */
    state_out = (q31_t) (*pState_q31);

    /* Apply loop unrolling and compute 2 output values simultaneously. */
    /* The variables acc0 ... acc3 hold output values that are being computed:
     *
     *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */
    sample = blockSize >> 1u;

    /* First part of the processing with loop unrolling.  Compute 2 outputs at a time.
     ** a second loop below computes the remaining 1 sample. */
    while(sample > 0u)
    {
      /* Read the input (two packed samples) */
      in = *__SIMD32(pIn)++;

      /* out =  b0 * x[n] + 0 * 0 */
      out = __SMUAD(b0, in);

      /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
      acc0 = __SMLAD(b1, state_in, out);

      /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
      acc0 = __SMLAD(a1, state_out, acc0);

      /* The result is converted from 3.29 to 1.31 and then saturation is applied */
      out = __SSAT((acc0 >> shift), 16);

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1 */
      /* Xn1 = Xn  */
      /* Yn2 = Yn1 */
      /* Yn1 = acc0 */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
#else
      state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
      state_out = __PKHBT(state_out >> 16, (out), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* out =  b0 * x[n] + 0 * 0  (second sample: cross-multiply picks
         the other halfword of the packed input) */
      out = __SMUADX(b0, in);

      /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
      acc0 = __SMLAD(b1, state_in, out);

      /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
      acc0 = __SMLAD(a1, state_out, acc0);

      /* The result is converted from 3.29 to 1.31 and then saturation is applied */
      out = __SSAT((acc0 >> shift), 16);

      /* Store both outputs in the destination buffer with one write. */
#ifndef ARM_MATH_BIG_ENDIAN
      *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
#else
      *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1 */
      /* Xn1 = Xn  */
      /* Yn2 = Yn1 */
      /* Yn1 = acc0 */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in >> 16, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
#else
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* Decrement the loop counter */
      sample--;
    }

    /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
     ** No loop unrolling is used. */
    if((blockSize & 0x1u) != 0u)
    {
      /* Read the input */
      in = *pIn++;

      /* out =  b0 * x[n] + 0 * 0 */
#ifndef ARM_MATH_BIG_ENDIAN
      out = __SMUAD(b0, in);
#else
      out = __SMUADX(b0, in);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
      acc0 = __SMLAD(b1, state_in, out);

      /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
      acc0 = __SMLAD(a1, state_out, acc0);

      /* The result is converted from 3.29 to 1.31 and then saturation is applied */
      out = __SSAT((acc0 >> shift), 16);

      /* Store the output in the destination buffer. */
      *pOut++ = (q15_t) out;

      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1 */
      /* Xn1 = Xn  */
      /* Yn2 = Yn1 */
      /* Yn1 = acc0 */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
#ifndef ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
#else
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
    }

    /* The first stage goes from the input buffer to the output buffer. */
    /* Subsequent (numStages - 1) occur in-place in the output buffer */
    pIn = pDst;

    /* Reset the output pointer */
    pOut = pDst;

    /* Store the updated state variables back into the state array */
    *__SIMD32(pState)++ = state_in;
    *__SIMD32(pState)++ = state_out;

    /* Decrement the loop counter */
    stage--;

  } while(stage > 0u);
/**
 * @brief  Computes the absolute value of each element of a Q15 vector.
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 *
 * |-32768| saturates to +32767 (via __QSUB16 on M3/M4, explicit check on M0).
 */
void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q15_t in1;                                     /* Input value1 */
  q15_t in2;                                     /* Input value2 */

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read two inputs */
    in1 = *pSrc++;
    in2 = *pSrc++;

    /* Store the Absolute result in the destination buffer by packing the two
       values in a single cycle; __QSUB16(0, x) is a saturating negate.
       Halfword order in the packed store depends on endianness. */
#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ =
      __PKHBT(((in1 > 0) ? in1 : __QSUB16(0, in1)),
              ((in2 > 0) ? in2 : __QSUB16(0, in2)), 16);

#else


    *__SIMD32(pDst)++ =
      __PKHBT(((in2 > 0) ? in2 : __QSUB16(0, in2)),
              ((in1 > 0) ? in1 : __QSUB16(0, in1)), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    in1 = *pSrc++;
    in2 = *pSrc++;


#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ =
      __PKHBT(((in1 > 0) ? in1 : __QSUB16(0, in1)),
              ((in2 > 0) ? in2 : __QSUB16(0, in2)), 16);

#else


    *__SIMD32(pDst)++ =
      __PKHBT(((in2 > 0) ? in2 : __QSUB16(0, in2)),
              ((in1 > 0) ? in1 : __QSUB16(0, in1)), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read the input */
    in1 = *pSrc++;

    /* Calculate absolute value of input and then store the result in the destination buffer. */
    *pDst++ = (in1 > 0) ? in1 : __QSUB16(0, in1);

    /* Decrement the loop counter */
    blkCnt--;
  }

#else

  /* Run the below code for Cortex-M0 */

  q15_t in;                                      /* Temporary input variable */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read the input */
    in = *pSrc++;

    /* Calculate absolute value of input and then store the result in the destination buffer.
       0x8000 has no positive Q15 counterpart, so it is clamped to 0x7fff. */
    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);

    /* Decrement the loop counter */
    blkCnt--;
  }

#endif /* #ifndef ARM_MATH_CM0 */

}
/**
 * @brief  Converts a Q7 vector to Q15 (each output = input << 8).
 * @param[in]  pSrc       points to the Q7 input vector
 * @param[out] pDst       points to the Q15 output vector
 * @param[in]  blockSize  number of samples in the vector
 */
void arm_q7_to_q15(
  q7_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q7_t *pIn = pSrc;                              /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  q31_t in;
  q31_t in1, in2;
  q31_t out1, out2;

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */
    /* convert from q7 to q15 and then store the results in the destination buffer */
    in = *__SIMD32(pIn)++;

    /* rotatate in by 8 and extend two q7_t values to q15_t values
       (picks up bytes 1 and 3 of the word) */
    in1 = __SXTB16(__ROR(in, 8));

    /* extend remainig two q7_t values to q15_t values (bytes 0 and 2) */
    in2 = __SXTB16(in);

    /* Shift each halfword left by 8 (Q7 -> Q15), then mask off the
       sign-extension residue that the shift drags into the low bytes. */
    in1 = in1 << 8u;
    in2 = in2 << 8u;

    in1 = in1 & 0xFF00FF00;
    in2 = in2 & 0xFF00FF00;

    /* Re-interleave the converted halfwords into original sample order;
       the pack order flips with endianness. */
#ifndef ARM_MATH_BIG_ENDIAN

    out2 = __PKHTB(in1, in2, 16);
    out1 = __PKHBT(in2, in1, 16);

#else

    out1 = __PKHTB(in1, in2, 16);
    out2 = __PKHBT(in2, in1, 16);

#endif

    *__SIMD32(pDst)++ = out1;
    *__SIMD32(pDst)++ = out2;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */
    /* convert from q7 to q15 and then store the results in the destination buffer */
    *pDst++ = (q15_t) * pIn++ << 8;

    /* Decrement the loop counter */
    blkCnt--;
  }

}
/* Complex FIR filter with integrated decimation.  Keeps a delay line of
 * taps_count_ samples; each outer iteration shifts in decimation_factor_
 * new input samples, runs the full tap dot-product (8 taps per inner
 * iteration, taps stored reversed), and emits one output sample. */
buffer_c16_t FIRAndDecimateComplex::execute(
	const buffer_c16_t& src,
	const buffer_c16_t& dst
) {
	/* int16_t input (sample count "n" must be multiple of decimation_factor)
	 * -> int16_t output, decimated by decimation_factor.
	 * taps are normalized to 1 << 16 == 1.0.
	 */
	const auto output_sampling_rate = src.sampling_rate / decimation_factor_;
	const size_t output_samples = src.count / decimation_factor_;

	sample_t* dst_p = dst.p;
	const buffer_c16_t result { dst.p, output_samples, output_sampling_rate };

	const sample_t* src_p = src.p;

	size_t outer_count = output_samples;
	while(outer_count > 0) {
		/* Put new samples into delay buffer (at the tail end) */
		auto z_new_p = &samples_[taps_count_ - decimation_factor_];
		for(size_t i=0; i<decimation_factor_; i++) {
			*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
		}

		/* Inner loop processes 8 complex taps per pass.
		 * SMLSLD/SMLALDX accumulate the real/imaginary parts of the
		 * complex multiply-accumulate into 64-bit accumulators. */
		size_t loop_count = taps_count_ / 8;
		auto t_p = &taps_reversed_[0];
		auto z_p = &samples_[0];

		int64_t t_real = 0;
		int64_t t_imag = 0;

		while(loop_count > 0) {
			const auto tap0 = *__SIMD32(t_p)++;
			const auto sample0 = *__SIMD32(z_p)++;
			const auto tap1 = *__SIMD32(t_p)++;
			const auto sample1 = *__SIMD32(z_p)++;
			t_real = __SMLSLD(sample0, tap0, t_real);
			t_imag = __SMLALDX(sample0, tap0, t_imag);
			t_real = __SMLSLD(sample1, tap1, t_real);
			t_imag = __SMLALDX(sample1, tap1, t_imag);

			const auto tap2 = *__SIMD32(t_p)++;
			const auto sample2 = *__SIMD32(z_p)++;
			const auto tap3 = *__SIMD32(t_p)++;
			const auto sample3 = *__SIMD32(z_p)++;
			t_real = __SMLSLD(sample2, tap2, t_real);
			t_imag = __SMLALDX(sample2, tap2, t_imag);
			t_real = __SMLSLD(sample3, tap3, t_real);
			t_imag = __SMLALDX(sample3, tap3, t_imag);

			const auto tap4 = *__SIMD32(t_p)++;
			const auto sample4 = *__SIMD32(z_p)++;
			const auto tap5 = *__SIMD32(t_p)++;
			const auto sample5 = *__SIMD32(z_p)++;
			t_real = __SMLSLD(sample4, tap4, t_real);
			t_imag = __SMLALDX(sample4, tap4, t_imag);
			t_real = __SMLSLD(sample5, tap5, t_real);
			t_imag = __SMLALDX(sample5, tap5, t_imag);

			const auto tap6 = *__SIMD32(t_p)++;
			const auto sample6 = *__SIMD32(z_p)++;
			const auto tap7 = *__SIMD32(t_p)++;
			const auto sample7 = *__SIMD32(z_p)++;
			t_real = __SMLSLD(sample6, tap6, t_real);
			t_imag = __SMLALDX(sample6, tap6, t_imag);
			t_real = __SMLSLD(sample7, tap7, t_real);
			t_imag = __SMLALDX(sample7, tap7, t_imag);

			loop_count--;
		}

		/* TODO: Re-evaluate whether saturation is performed, normalization,
		 * all that jazz.
		 */
		/* Scale back from the 1<<16 tap normalization, saturate, and pack
		 * real/imag into one complex int16 output sample. */
		const int32_t r = t_real >> 16;
		const int32_t i = t_imag >> 16;
		const int32_t r_sat = __SSAT(r, 16);
		const int32_t i_sat = __SSAT(i, 16);
		*__SIMD32(dst_p)++ = __PKHBT(
			r_sat,
			i_sat,
			16
		);

		/* Shift sample buffer left/down by decimation factor. */
		const size_t unroll_factor = 4;
		size_t shift_count = (taps_count_ - decimation_factor_) / unroll_factor;

		sample_t* t = &samples_[0];
		const sample_t* s = &samples_[decimation_factor_];

		while(shift_count > 0) {
			*__SIMD32(t)++ = *__SIMD32(s)++;
			*__SIMD32(t)++ = *__SIMD32(s)++;
			*__SIMD32(t)++ = *__SIMD32(s)++;
			*__SIMD32(t)++ = *__SIMD32(s)++;
			shift_count--;
		}

		shift_count = (taps_count_ - decimation_factor_) % unroll_factor;
		while(shift_count > 0) {
			*(t++) = *(s++);
			shift_count--;
		}

		outer_count--;
	}

	return result;
}
/**
 * @brief  Element-by-element multiplication of two Q15 vectors.
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 *
 * Each product is computed in 32 bits, shifted down by 15 (Q15 * Q15 ->
 * Q15) and saturated to 16 bits.
 */
void arm_mult_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counters */
  q31_t inA1, inA2, inB1, inB2;                  /* temporary input variables */
  q15_t out1, out2, out3, out4;                  /* temporary output variables */
  q31_t mul1, mul2, mul3, mul4;                  /* temporary variables */

  /* loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* read two samples at a time from each source */
    inA1 = *__SIMD32(pSrcA)++;
    inB1 = *__SIMD32(pSrcB)++;
    inA2 = *__SIMD32(pSrcA)++;
    inB2 = *__SIMD32(pSrcB)++;

    /* multiply mul = sourceA * sourceB (per packed halfword) */
    mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
    mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
    mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
    mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);

    /* shift result by 15 to get 16 bit result */
    mul1 = mul1 >> 15;
    mul2 = mul2 >> 15;
    mul3 = mul3 >> 15;
    mul4 = mul4 >> 15;

    /* saturate result to 16 bit */
#ifdef CCS
    out1 = (q15_t) __SSATA(mul1, 0, 16);
    out2 = (q15_t) __SSATA(mul2, 0, 16);
    out3 = (q15_t) __SSATA(mul3, 0, 16);
    out4 = (q15_t) __SSATA(mul4, 0, 16);
#else
    out1 = (q15_t) __SSAT(mul1, 16);
    out2 = (q15_t) __SSAT(mul2, 16);
    out3 = (q15_t) __SSAT(mul3, 16);
    out4 = (q15_t) __SSAT(mul4, 16);
#endif /* #ifdef CCS */

    /* store the result; pack operand order depends on endianness.
       FIX: the big-endian branch previously duplicated the little-endian
       pack order (__PKHBT(out2, out1, ...)), which would swap sample
       order on big-endian targets — operands are now reversed there,
       matching the other endian-guarded functions in this library. */
#ifndef ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
    *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);

#else

    *__SIMD32(pDst)++ = __PKHBT(out1, out2, 16);
    *__SIMD32(pDst)++ = __PKHBT(out3, out4, 16);

#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* Decrement the blockSize loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = A * B */
    /* Multiply the inputs and store the result in the destination buffer */
#ifdef CCS
    *pDst++ = (q15_t) __SSATA(((q31_t) ((*pSrcA++) * (*pSrcB++)) >> 15), 0, 16);
#else
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
#endif /* #ifdef CCS */

    /* Decrement the blockSize loop counter */
    blkCnt--;
  }
}
/**
 * @brief  Converts a Q7 vector to Q15 (each output = input << 8),
 *         eight samples per loop iteration.
 * @param[in]  pSrc       points to the Q7 input vector
 * @param[out] pDst       points to the Q15 output vector
 * @param[in]  blockSize  number of samples in the vector
 */
void arm_q7_to_q15(
  q7_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q7_t *pIn = pSrc;                              /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */
  q31_t in;
  q31_t in1, in2;
  q31_t out1, out2;
  /* Renamed from `and`: that identifier is a C++ alternative token and
     would not compile if this file is ever built as C++. */
  q31_t mask = 0xFF00FF00;

  /*loop Unrolling */
  blkCnt = blockSize >> 3u;

  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
   ** a second loop below computes the remaining 1 to 7 samples. */
  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */
    /* convert from q7 to q15 and then store the results in the destination buffer */
    /* read 4 samples at a time */
    in = *__SIMD32(pIn)++;

#ifdef CCS
    /* rotatate in by 8 and extend two q7_t values to q15_t values */
    in1 = __SXTB16(in, 8);
    /* extend remainig two q7_t values to q15_t values */
    in2 = __SXTB16(in, 0);
#else
    /* rotatate in by 8 and extend two q7_t values to q15_t values */
    in1 = __SXTB16(__ROR(in, 8));
    /* extend remainig two q7_t values to q15_t values */
    in2 = __SXTB16(in);
#endif

    /* shift by 8 to convert each q7_t value to q15_t (0x00ff00ff ==> 0xff00ff00) */
    in1 = in1 << 8u;
    in2 = in2 << 8u;

    /* read next 4 sampels */
    in = *__SIMD32(pIn)++;

    /* mask off the sign-extension residue the shift dragged into the
       low byte of each halfword */
    in1 = in1 & mask;
    out2 = in2 & mask;

    /* pack two 16 bit values */
    out1 = __PKHTB(in1, out2, 16);
    out2 = __PKHBT(out2, in1, 16);

#ifndef ARM_MATH_BIG_ENDIAN

    /* store two q15_t samples at a time to destination */
    _SIMD32_OFFSET(pDst + 2) = out1;

#ifdef CCS
    in1 = __SXTB16(in, 8);
#else
    in1 = __SXTB16(__ROR(in, 8));
#endif

    /* store two q15_t samples at a time to destination */
    _SIMD32_OFFSET(pDst) = out2;

#else

    /* store two q15_t samples at a time to destination */
    _SIMD32_OFFSET(pDst) = out1;

#ifdef CCS
    in1 = __SXTB16(in, 8);
#else
    in1 = __SXTB16(__ROR(in, 8));
#endif

    /* store two q15_t samples at a time to destination */
    _SIMD32_OFFSET(pDst + 2) = out2;

#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

#ifdef CCS
    in2 = __SXTB16(in, 0);
#else
    in2 = __SXTB16(in);
#endif

    /* shift by 8 to convert each q7_t value to q15_t */
    in1 = in1 << 8u;
    in2 = in2 << 8u;

    /* FIX: mask in1 in place (the first half of the unrolled loop does
       the same).  Previously the masked value was written to out1 and
       then discarded, so the packs below used the UNMASKED in1 and the
       sign-extension residue leaked into the low byte of one output. */
    in1 = in1 & mask;
    out2 = in2 & mask;

    /* pack two 16 bit values */
    out1 = __PKHTB(in1, out2, 16);
    out2 = __PKHBT(out2, in1, 16);

    /* store two q15_t samples at a time to destination */
#ifndef ARM_MATH_BIG_ENDIAN
    _SIMD32_OFFSET(pDst + 6) = out1;
    _SIMD32_OFFSET(pDst + 4) = out2;
#else
    _SIMD32_OFFSET(pDst + 4) = out1;
    _SIMD32_OFFSET(pDst + 6) = out2;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* incremnet destination pointer */
    pDst += 8u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */
    /* convert from q7 to q15 and then store the results in the destination buffer */
    *pDst++ = (q15_t) * pIn++ << 8;

    /* Decrement the loop counter */
    blkCnt--;
  }
}
/**
 * @brief  Adds a constant offset to each element of a Q15 vector,
 *         with saturation.
 * @param[in]  pSrc       points to the input vector
 * @param[in]  offset     offset to add
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 */
void arm_offset_q15(
  q15_t * pSrc,
  q15_t offset,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M3/M4: duplicate the offset into both halfwords so one
     saturating SIMD add (__QADD16) processes two samples at once. */
  q31_t dupOffset;

  dupOffset = __PKHBT(offset, offset, 16);

  /* Unrolled loop: four outputs per iteration. */
  count = blockSize >> 2u;

  while(count > 0u)
  {
    /* C = A + offset */
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, dupOffset);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, dupOffset);

    count--;
  }

  /* Tail: 0..3 remaining samples. */
  count = blockSize % 0x4u;

  while(count > 0u)
  {
    /* C = A + offset */
    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);

    count--;
  }

#else

  /* Cortex-M0: scalar add with explicit saturation. */
  count = blockSize;

  while(count > 0u)
  {
    /* C = A + offset */
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);

    count--;
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */
}
/**
 * @brief  Negates each element of a Q15 vector, saturating so that
 *         -(-32768) yields +32767.
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* loop counter */

#ifndef ARM_MATH_CM0

  /* Cortex-M3/M4: negate two samples, saturate each, and pack both into
     a single 32-bit store; four samples per loop iteration.  The pack
     operand order flips with target endianness. */
  q15_t a, b;

  count = blockSize >> 2u;

  while(count > 0u)
  {
    /* C = -A */
    a = *pSrc++;
    b = *pSrc++;

#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-a, 16), __SSAT(-b, 16), 16);
#else
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-b, 16), __SSAT(-a, 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    a = *pSrc++;
    b = *pSrc++;

#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-a, 16), __SSAT(-b, 16), 16);
#else
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-b, 16), __SSAT(-a, 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    count--;
  }

  /* Tail: 0..3 remaining samples. */
  count = blockSize % 0x4u;

#else

  /* Cortex-M0: process every sample in the scalar loop below. */
  count = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  while(count > 0u)
  {
    /* C = -A */
    *pDst++ = __SSAT(-*pSrc++, 16);

    count--;
  }
}
/* Frequency-translate by -fs/4 and decimate by 2 (third-order CIC) in one
 * fused pass over complex<int8_t> input.  State (_q1_i0/_q0_i1) carries the
 * translated history taps across calls. */
buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Translates incoming complex<int8_t> samples by -fs/4,
	 * decimates by two using a non-recursive third-order CIC filter.
	 */

	/* Derivation of algorithm:
	 * Original CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 *
	 * Translate -fs/4, phased 180 degrees, accomplished by complex multiplication
	 * of complex length-4 sequence:
	 *
	 * Substitute:
	 *	i0 = -i0, q0 = -q0
	 *	i1 = -q1, q1 =  i1
	 *	i2 =  i2, q2 =  q2
	 *	i3 =  q3, q3 = -i3
	 *	i4 = -i4, q4 = -q4
	 *	i5 = -q5, q5 =  i5
	 *
	 * Resulting taps (with decimation by 2, four samples in, two samples out):
	 * 	D_I0 =  q3 * 1 +  i2 * 3 + -q1 * 3 + -i0 * 1
	 * 	D_Q0 = -i3 * 1 +  q2 * 3 +  i1 * 3 + -q0 * 1
	 *
	 * 	D_I1 = -q5 * 1 + -i4 * 3 +  q3 * 3 +  i2 * 1
	 * 	D_Q1 =  i5 * 1 + -q4 * 3 + -i3 * 3 +  q2 * 1
	 */

	// 6 cycles per complex input sample, not including loop overhead.

	uint32_t q1_i0 = _q1_i0;
	uint32_t q0_i1 = _q0_i1;

	/* 3:1 coefficient pair. Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	const uint32_t k_3_1 = 0x00030001 * scale_factor;
	/* Four int8 values (two complex samples) per 32-bit read. */
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	while(src_p < src_end) {
		const uint32_t q3_i3_q2_i2 = *(src_p++);						// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		/* Byte-extract and reorder so the sign flips of the -fs/4 rotation
		 * fall out of the QSUB16/QADD16 combinations below. */
		const uint32_t i2_i3 = __SXTB16(q3_i3_q2_i2, 16);				// 1: (q3_i3_q2_i2 ror 16)[23:16]:(q3_i3_q2_i2 ror 16)[7:0]
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);				// 1: (q3_i3_q2_i2 ror 8)[23:16]:(q3_i3_q2_i2 ror 8)[7:0]
		const uint32_t i2_q3 = __PKHTB(i2_i3, q3_q2, 16);				// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t i3_q2 = __PKHBT(q3_q2, i2_i3, 16);				// 1:(Rm<<16)[31:16]:Rn[15:0]

		// D_I0 = 3 * (i2 - q1) + (q3 - i0)
		const uint32_t i2_m_q1_q3_m_i0 = __QSUB16(i2_q3, q1_i0);		// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i0 = __SMUAD(k_3_1, i2_m_q1_q3_m_i0);			// 1: Rm[15:0]*Rs[15:0]+Rm[31:16]*Rs[31:16]

		// D_Q0 = 3 * (q2 + i1) - (i3 + q0)
		const uint32_t i3_p_q0_q2_p_i1 = __QADD16(i3_q2, q0_i1);		// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q0 = __SMUSDX(i3_p_q0_q2_p_i1, k_3_1);			// 1: Rm[15:0]*Rs[31:16]–Rm[31:16]*RsX[15:0]
		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);				// 1: (q5_i5_q4_i4 ror 0)[23:16]:(q5_i5_q4_i4 ror 0)[7:0]
		const uint32_t q4_q5 = __SXTB16(q5_i5_q4_i4, 24);				// 1: (q5_i5_q4_i4 ror 24)[23:16]:(q5_i5_q4_i4 ror 24)[7:0]
		const uint32_t q4_i5 = __PKHTB(q4_q5, i5_i4, 16);				// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t q5_i4 = __PKHBT(i5_i4, q4_q5, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		// D_I1 = (i2 - q5) + 3 * (q3 - i4)
		const uint32_t i2_m_q5_q3_m_i4 = __QSUB16(i2_q3, q5_i4);		// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i1 = __SMUADX(i2_m_q5_q3_m_i4, k_3_1);			// 1: Rm[15:0]*Rs[31:16]+Rm[31:16]*Rs[15:0]

		// D_Q1 = (i5 + q2) - 3 * (q4 + i3)
		const uint32_t q4_p_i3_i5_p_q2 = __QADD16(q4_i5, i3_q2);		// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q1 = __SMUSD(k_3_1, q4_p_i3_i5_p_q2);			// 1: Rm[15:0]*Rs[15:0]–Rm[31:16]*Rs[31:16]
		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		*(dst_p++) = d_q0_i0;											// 3
		*(dst_p++) = d_q1_i1;

		/* Carry the (already translated/reordered) last pair into the
		 * next iteration's history taps. */
		q1_i0 = q5_i4;
		q0_i1 = q4_i5;
	}
	_q1_i0 = q1_i0;
	_q0_i1 = q0_i1;

	/* Output: half the sample count, half the sampling rate. */
	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}