buffer_s16_t FM::execute( const buffer_c16_t& src, const buffer_s16_t& dst ) { auto z = z_; const auto src_p = src.p; const auto src_end = &src.p[src.count]; auto dst_p = dst.p; while(src_p < src_end) { const auto s0 = *__SIMD32(src_p)++; const auto s1 = *__SIMD32(src_p)++; const auto t0 = multiply_conjugate_s16_s32(s0, z); const auto t1 = multiply_conjugate_s16_s32(s1, s0); z = s1; const int32_t theta0_int = angle_approx_0deg27(t0) * ks16; const int32_t theta0_sat = __SSAT(theta0_int, 16); const int32_t theta1_int = angle_approx_0deg27(t1) * ks16; const int32_t theta1_sat = __SSAT(theta1_int, 16); *__SIMD32(dst_p)++ = __PKHBT( theta0_sat, theta1_sat, 16 ); } z_ = z; return { dst.p, src.count, src.sampling_rate }; }
void arm_mean_q15( q15_t * pSrc, uint32_t blockSize, q15_t * pResult) { q31_t sum = 0; /* Temporary result storage */ uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ q31_t in; /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ in = *__SIMD32(pSrc)++; sum += ((in << 16) >> 16); sum += (in >> 16); in = *__SIMD32(pSrc)++; sum += ((in << 16) >> 16); sum += (in >> 16); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Loop over blockSize number of values */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0_FAMILY */ while(blkCnt > 0u) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ sum += *pSrc++; /* Decrement the loop counter */ blkCnt--; } /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ /* Store the result to the destination */ *pResult = (q15_t) (sum / (q31_t)blockSize); }
void arm_negate_q7( q7_t * pSrc, q7_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ q7_t in; #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ q31_t input; /* Input values1-4 */ q31_t zero = 0x00000000; /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = -A */ /* Read four inputs */ input = *__SIMD32(pSrc)++; /* Store the Negated results in the destination buffer in a single cycle by packing the results */ *__SIMD32(pDst)++ = __QSUB8(zero, input); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Initialize blkCnt with number of samples */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0_FAMILY */ while(blkCnt > 0u) { /* C = -A */ /* Negate and then store the results in the destination buffer. */ \ in = *pSrc++; *pDst++ = (in == (q7_t) 0x80) ? 0x7f : -in; /* Decrement the loop counter */ blkCnt--; } }
void arm_fill_q15( q15_t value, q15_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ q31_t packedValue; /* value packed to 32 bits */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* Packing two 16 bit values to 32 bit value in order to use SIMD */ packedValue = __PKHBT(value, value, 16u); /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = value */ /* Fill the value in the destination buffer */ *__SIMD32(pDst)++ = packedValue; *__SIMD32(pDst)++ = packedValue; /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Loop over blockSize number of values */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0_FAMILY */ while(blkCnt > 0u) { /* C = value */ /* Fill the value in the destination buffer */ *pDst++ = value; /* Decrement the loop counter */ blkCnt--; } }
void arm_copy_q7( q7_t * pSrc, q7_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = A */ /* Copy and then store the results in the destination buffer */ /* 4 samples are copied and stored at a time using SIMD */ *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Loop over blockSize number of values */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0_FAMILY */ while(blkCnt > 0u) { /* C = A */ /* Copy and then store the results in the destination buffer */ *pDst++ = *pSrc++; /* Decrement the loop counter */ blkCnt--; } }
void arm_negate_q7( q7_t * pSrc, q7_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0 /* Run the below code for Cortex-M4 and Cortex-M3 */ q7_t in1; /* Input value1 */ q7_t in2; /* Input value2 */ q7_t in3; /* Input value3 */ q7_t in4; /* Input value4 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = -A */ /* Read four inputs */ in1 = *pSrc++; in2 = *pSrc++; in3 = *pSrc++; in4 = *pSrc++; /* Store the Negated results in the destination buffer in a single cycle by packing the results */ *__SIMD32(pDst)++ = __PACKq7(__SSAT(-in1, 8), __SSAT(-in2, 8), __SSAT(-in3, 8), __SSAT(-in4, 8)); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Initialize blkCnt with number of samples */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0 */ while(blkCnt > 0u) { /* C = -A */ /* Negate and then store the results in the destination buffer. */ *pDst++ = __SSAT(-*pSrc++, 8); /* Decrement the loop counter */ blkCnt--; } }
void feed(buffer_c16_t src, Callback callback) { auto src_p = src.p; while(src_p < &src.p[src.count]) { const uint32_t sample = *__SIMD32(src_p)++; const uint32_t mag_sq = __SMUAD(sample, sample); if( mag_sq > max_squared ) { max_squared = mag_sq; } } count += src.count; const size_t samples_per_update = src.sampling_rate * update_interval; if( count >= samples_per_update ) { const float max_squared_f = max_squared; const float max_db_f = complex16_mag_squared_to_dbv_norm(max_squared_f); const int32_t max_db = max_db_f; const ChannelStatistics statistics { .max_db = max_db, .count = count, }; callback(statistics); max_squared = 0; count = 0; } }
void arm_power_q15( q15_t * pSrc, uint32_t blockSize, q63_t * pResult) { q63_t sum = 0; /* Temporary result storage */ q31_t in32; /* Temporary variable to store input value */ q15_t in16; /* Temporary variable to store input value */ uint32_t blkCnt; /* loop counter */ /* loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ /* Compute Power and then store the result in a temporary variable, sum. */ in32 = *__SIMD32(pSrc)++; sum = __SMLALD(in32, in32, sum); in32 = *__SIMD32(pSrc)++; sum = __SMLALD(in32, in32, sum); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ /* Compute Power and then store the result in a temporary variable, sum. */ in16 = *pSrc++; sum = __SMLALD(in16, in16, sum); /* Decrement the loop counter */ blkCnt--; } /* Store the results in 34.30 format */ *pResult = sum; }
void arm_negate_q15( q15_t * pSrc, q15_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ q15_t in1, in2; /* Temporary variables */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = ~A */ /* Read two inputs */ in1 = *pSrc++; in2 = *pSrc++; /* Negate and then store the results in the destination buffer by packing. */ *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16); in1 = *pSrc++; in2 = *pSrc++; *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = ~A */ /* Negate and then store the result in the destination buffer. */ *pDst++ = __SSAT(-*pSrc++, 16); /* Decrement the loop counter */ blkCnt--; } }
buffer_f32_t AM::execute( const buffer_c16_t& src, const buffer_f32_t& dst ) { const auto src_p = src.p; const auto src_end = &src.p[src.count]; auto dst_p = dst.p; while(src_p < src_end) { const uint32_t sample0 = *__SIMD32(src_p)++; const uint32_t sample1 = *__SIMD32(src_p)++; const uint32_t mag_sq0 = __SMUAD(sample0, sample0); const uint32_t mag_sq1 = __SMUAD(sample1, sample1); *(dst_p++) = __builtin_sqrtf(mag_sq0) * k; *(dst_p++) = __builtin_sqrtf(mag_sq1) * k; } return { dst.p, src.count, src.sampling_rate }; }
void arm_abs_q7( q7_t * pSrc, q7_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ q7_t in1; /* Input value1 */ q7_t in2; /* Input value2 */ q7_t in3; /* Input value3 */ q7_t in4; /* Input value4 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = |A| */ /* Read 4 inputs */ in1 = *pSrc++; in2 = *pSrc++; in3 = *pSrc++; in4 = *pSrc++; /* Store the Absolute result in the destination buffer by packing the 4 values in single cycle */ *__SIMD32(pDst)++ = __PACKq7(((in1 > 0) ? in1 : __SSAT(-in1, 8)), ((in2 > 0) ? in2 : __SSAT(-in2, 8)), ((in3 > 0) ? in3 : __SSAT(-in3, 8)), ((in4 > 0) ? in4 : __SSAT(-in4, 8))); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = |A| */ /* Read the input */ in1 = *pSrc++; /* Store the Absolute result in the destination buffer */ *pDst++ = (in1 > 0) ? in1 : __SSAT(-in1, 8); /* Decrement the loop counter */ blkCnt--; } }
void arm_copy_q7( q7_t * pSrc, q7_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ q31_t in1, in2; /*loop Unrolling */ blkCnt = blockSize >> 3u; /* First part of the processing with loop unrolling. Compute 8 outputs at a time. ** a second loop below computes the remaining 1 to 7 samples. */ while(blkCnt > 0u) { /* C = A */ /* Copy and then store the results in the destination buffer */ /* 4 samples are copied and stored at a time using SIMD */ in1 = *__SIMD32(pSrc)++; *__SIMD32(pDst)++ = in1; in2 = *__SIMD32(pSrc)++; *__SIMD32(pDst)++ = in2; /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 8, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x8u; while(blkCnt > 0u) { /* C = A */ /* Copy and then store the results in the destination buffer */ *pDst++ = *pSrc++; /* Decrement the loop counter */ blkCnt--; } }
void arm_sub_q15( q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0 /* Run the below code for Cortex-M4 and Cortex-M3 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = A - B */ /* Subtract and then store the results in the destination buffer two samples at a time. */ *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++); *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = A - B */ /* Subtract and then store the result in the destination buffer. */ *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++); /* Decrement the loop counter */ blkCnt--; } #else /* Run the below code for Cortex-M0 */ /* Initialize blkCnt with number of samples */ blkCnt = blockSize; while(blkCnt > 0u) { /* C = A - B */ /* Subtract and then store the result in the destination buffer. */ *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16); /* Decrement the loop counter */ blkCnt--; } #endif /* #ifndef ARM_MATH_CM0 */ }
buffer_f32_t FM::execute( const buffer_c16_t& src, const buffer_f32_t& dst ) { auto z = z_; const auto src_p = src.p; const auto src_end = &src.p[src.count]; auto dst_p = dst.p; while(src_p < src_end) { const auto s0 = *__SIMD32(src_p)++; const auto s1 = *__SIMD32(src_p)++; const auto t0 = multiply_conjugate_s16_s32(s0, z); const auto t1 = multiply_conjugate_s16_s32(s1, s0); z = s1; *(dst_p++) = angle_precise(t0) * kf; *(dst_p++) = angle_precise(t1) * kf; } z_ = z; return { dst.p, src.count, src.sampling_rate }; }
buffer_c16_t DecimateBy2CIC3::execute( const buffer_c16_t& src, const buffer_c16_t& dst ) { /* Complex non-recursive 3rd-order CIC filter (taps 1,3,3,1). * Gain of 8. * Consumes 16 bytes (4 s16:s16 samples) per loop iteration, * Produces 8 bytes (2 s16:s16 samples) per loop iteration. */ uint32_t t1 = _iq0; uint32_t t2 = _iq1; const uint32_t taps = 0x00000003; auto s = src.p; auto d = dst.p; const auto d_end = &dst.p[src.count / 2]; while(d < d_end) { uint32_t i = __SXTH(t1, 0); /* 1: I0 */ uint32_t q = __SXTH(t1, 16); /* 1: Q0 */ i = __SMLABB(t2, taps, i); /* 1: I1*3 + I0 */ q = __SMLATB(t2, taps, q); /* 1: Q1*3 + Q0 */ const uint32_t t3 = *__SIMD32(s)++; /* 3: Q2:I2 */ const uint32_t t4 = *__SIMD32(s)++; /* Q3:I3 */ i = __SMLABB(t3, taps, i); /* 1: I2*3 + I1*3 + I0 */ q = __SMLATB(t3, taps, q); /* 1: Q2*3 + Q1*3 + Q0 */ int32_t si0 = __SXTAH(i, t4, 0); /* 1: I3 + Q2*3 + Q1*3 + Q0 */ int32_t sq0 = __SXTAH(q, t4, 16); /* 1: Q3 + Q2*3 + Q1*3 + Q0 */ i = __BFI(si0 / 8, sq0 / 8, 16, 16); /* 1: D2_Q0:D2_I0 */ *__SIMD32(d)++ = i; /* D2_Q0:D2_I0 */ i = __SXTH(t3, 0); /* 1: I2 */ q = __SXTH(t3, 16); /* 1: Q2 */ i = __SMLABB(t4, taps, i); /* 1: I3*3 + I2 */ q = __SMLATB(t4, taps, q); /* 1: Q3*3 + Q2 */ t1 = *__SIMD32(s)++; /* 3: Q4:I4 */ t2 = *__SIMD32(s)++; /* Q5:I5 */ i = __SMLABB(t1, taps, i); /* 1: I4*3 + I3*3 + I2 */ q = __SMLATB(t1, taps, q); /* 1: Q4*3 + Q3*3 + Q2 */ int32_t si1 = __SXTAH(i, t2, 0) ; /* 1: I5 + Q4*3 + Q3*3 + Q2 */ int32_t sq1 = __SXTAH(q, t2, 16); /* 1: Q5 + Q4*3 + Q3*3 + Q2 */ i = __BFI(si1 / 8, sq1 / 8, 16, 16); /* 1: D2_Q1:D2_I1 */ *__SIMD32(d)++ = i; /* D2_Q1:D2_I1 */ } _iq0 = t1; _iq1 = t2; return { dst.p, src.count / 2, src.sampling_rate / 2 }; }
void arm_copy_q15( q15_t * pSrc, q15_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ /*loop Unrolling */ blkCnt = blockSize >> 3u; /* First part of the processing with loop unrolling. Compute 8 outputs at a time. ** a second loop below computes the remaining 1 to 7 samples. */ while(blkCnt > 0u) { /* C = A */ /* Read two inputs */ *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; *__SIMD32(pDst)++ = *__SIMD32(pSrc)++; /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 8, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x8u; while(blkCnt > 0u) { /* C = A */ /* Copy and then store the value in the destination buffer */ *pDst++ = *pSrc++; /* Decrement the loop counter */ blkCnt--; } }
void arm_fill_q7( q7_t value, q7_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ q31_t packedValue; /* value packed to 32 bits */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* Packing four 8 bit values to 32 bit value in order to use SIMD */ packedValue = __PACKq7(value, value, value, value); /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = value */ /* Fill the value in the destination buffer */ *__SIMD32(pDst)++ = packedValue; /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = value */ /* Fill the value in the destination buffer */ *pDst++ = value; /* Decrement the loop counter */ blkCnt--; } }
void feed(const buffer_c16_t& src, Callback callback) { auto src_p = src.p; while(src_p < &src.p[src.count]) { const uint32_t sample = *__SIMD32(src_p)++; const uint32_t mag_sq = __SMUAD(sample, sample); if( mag_sq > max_squared ) { max_squared = mag_sq; } } count += src.count; const size_t samples_per_update = src.sampling_rate * update_interval; if( count >= samples_per_update ) { const float max_squared_f = max_squared; const int32_t max_db = mag2_to_dbv_norm(max_squared_f * (1.0f / (32768.0f * 32768.0f))); callback({ max_db, count }); max_squared = 0; count = 0; } }
void arm_sub_q15( q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = A - B */ /* Subtract and then store the results in the destination buffer two samples at a time. */ *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++); *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = A - B */ /* Subtract and then store the result in the destination buffer. */ *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++); /* Decrement the loop counter */ blkCnt--; } }
void arm_q15_to_q31( q15_t * pSrc, q31_t * pDst, uint32_t blockSize) { q15_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ q31_t in1, in2; q31_t out1, out2, out3, out4; /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = (q31_t)A << 16 */ /* convert from q15 to q31 and then store the results in the destination buffer */ in1 = *__SIMD32(pIn)++; in2 = *__SIMD32(pIn)++; #ifndef ARM_MATH_BIG_ENDIAN /* extract lower 16 bits to 32 bit result */ out1 = in1 << 16u; /* extract upper 16 bits to 32 bit result */ out2 = in1 & 0xFFFF0000; /* extract lower 16 bits to 32 bit result */ out3 = in2 << 16u; /* extract upper 16 bits to 32 bit result */ out4 = in2 & 0xFFFF0000; #else /* extract upper 16 bits to 32 bit result */ out1 = in1 & 0xFFFF0000; /* extract lower 16 bits to 32 bit result */ out2 = in1 << 16u; /* extract upper 16 bits to 32 bit result */ out3 = in2 & 0xFFFF0000; /* extract lower 16 bits to 32 bit result */ out4 = in2 << 16u; #endif // #ifndef ARM_MATH_BIG_ENDIAN *pDst++ = out1; *pDst++ = out2; *pDst++ = out3; *pDst++ = out4; /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Loop over blockSize number of values */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0_FAMILY */ while(blkCnt > 0u) { /* C = (q31_t)A << 16 */ /* convert from q15 to q31 and then store the results in the destination buffer */ *pDst++ = (q31_t) * pIn++ << 16; /* Decrement the loop counter */ blkCnt--; } }
void arm_biquad_cascade_df1_fast_q15( const arm_biquad_casd_df1_inst_q15 * S, q15_t * pSrc, q15_t * pDst, uint32_t blockSize) { q15_t *pIn = pSrc; /* Source pointer */ q15_t *pOut = pDst; /* Destination pointer */ q31_t in; /* Temporary variable to hold input value */ q31_t out; /* Temporary variable to hold output value */ q31_t b0; /* Temporary variable to hold bo value */ q31_t b1, a1; /* Filter coefficients */ q31_t state_in, state_out; /* Filter state variables */ q31_t acc0; /* Accumulator */ int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */ q15_t *pState = S->pState; /* State pointer */ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ q31_t *pState_q31; /* 32-bit state pointer for SIMD implementation */ uint32_t sample, stage = S->numStages; /* Stage loop counter */ do { /* Initialize state pointer of type q31 */ pState_q31 = (q31_t *) (pState); /* Read the b0 and 0 coefficients using SIMD */ b0 = *__SIMD32(pCoeffs)++; /* Read the b1 and b2 coefficients using SIMD */ b1 = *__SIMD32(pCoeffs)++; /* Read the a1 and a2 coefficients using SIMD */ a1 = *__SIMD32(pCoeffs)++; /* Read the input state values from the state buffer: x[n-1], x[n-2] */ state_in = (q31_t) (*pState_q31++); /* Read the output state values from the state buffer: y[n-1], y[n-2] */ state_out = (q31_t) (*pState_q31); /* Apply loop unrolling and compute 2 output values simultaneously. */ /* The variables acc0 ... acc3 hold output values that are being computed: * * acc0 = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] * acc0 = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ sample = blockSize >> 1u; /* First part of the processing with loop unrolling. Compute 2 outputs at a time. ** a second loop below computes the remaining 1 sample. */ while(sample > 0u) { /* Read the input */ in = *__SIMD32(pIn)++; /* out = b0 * x[n] + 0 * 0 */ out = __SMUAD(b0, in); /* acc0 = b1 * x[n-1] + acc0 += b2 * x[n-2] + out */ acc0 = __SMLAD(b1, state_in, out); /* acc0 += a1 * y[n-1] + acc0 += a2 * y[n-2] */ acc0 = __SMLAD(a1, state_out, acc0); /* The result is converted from 3.29 to 1.31 and then saturation is applied */ out = __SSAT((acc0 >> shift), 16); /* Every time after the output is computed state should be updated. */ /* The states should be updated as: */ /* Xn2 = Xn1 */ /* Xn1 = Xn */ /* Yn2 = Yn1 */ /* Yn1 = acc0 */ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ #ifndef ARM_MATH_BIG_ENDIAN state_in = __PKHBT(in, state_in, 16); state_out = __PKHBT(out, state_out, 16); #else state_in = __PKHBT(state_in >> 16, (in >> 16), 16); state_out = __PKHBT(state_out >> 16, (out), 16); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* out = b0 * x[n] + 0 * 0 */ out = __SMUADX(b0, in); /* acc0 = b1 * x[n-1] + acc0 += b2 * x[n-2] + out */ acc0 = __SMLAD(b1, state_in, out); /* acc0 += a1 * y[n-1] + acc0 += a2 * y[n-2] */ acc0 = __SMLAD(a1, state_out, acc0); /* The result is converted from 3.29 to 1.31 and then saturation is applied */ out = __SSAT((acc0 >> shift), 16); /* Store the output in the destination buffer. */ #ifndef ARM_MATH_BIG_ENDIAN *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16); #else *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* Every time after the output is computed state should be updated. */ /* The states should be updated as: */ /* Xn2 = Xn1 */ /* Xn1 = Xn */ /* Yn2 = Yn1 */ /* Yn1 = acc0 */ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ #ifndef ARM_MATH_BIG_ENDIAN state_in = __PKHBT(in >> 16, state_in, 16); state_out = __PKHBT(out, state_out, 16); #else state_in = __PKHBT(state_in >> 16, in, 16); state_out = __PKHBT(state_out >> 16, out, 16); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* Decrement the loop counter */ sample--; } /* If the blockSize is not a multiple of 2, compute any remaining output samples here. ** No loop unrolling is used. */ if((blockSize & 0x1u) != 0u) { /* Read the input */ in = *pIn++; /* out = b0 * x[n] + 0 * 0 */ #ifndef ARM_MATH_BIG_ENDIAN out = __SMUAD(b0, in); #else out = __SMUADX(b0, in); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* acc0 = b1 * x[n-1] + acc0 += b2 * x[n-2] + out */ acc0 = __SMLAD(b1, state_in, out); /* acc0 += a1 * y[n-1] + acc0 += a2 * y[n-2] */ acc0 = __SMLAD(a1, state_out, acc0); /* The result is converted from 3.29 to 1.31 and then saturation is applied */ out = __SSAT((acc0 >> shift), 16); /* Store the output in the destination buffer. */ *pOut++ = (q15_t) out; /* Every time after the output is computed state should be updated. */ /* The states should be updated as: */ /* Xn2 = Xn1 */ /* Xn1 = Xn */ /* Yn2 = Yn1 */ /* Yn1 = acc0 */ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */ #ifndef ARM_MATH_BIG_ENDIAN state_in = __PKHBT(in, state_in, 16); state_out = __PKHBT(out, state_out, 16); #else state_in = __PKHBT(state_in >> 16, in, 16); state_out = __PKHBT(state_out >> 16, out, 16); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ } /* The first stage goes from the input buffer to the output buffer. */ /* Subsequent (numStages - 1) occur in-place in the output buffer */ pIn = pDst; /* Reset the output pointer */ pOut = pDst; /* Store the updated state variables back into the state array */ *__SIMD32(pState)++ = state_in; *__SIMD32(pState)++ = state_out; /* Decrement the loop counter */ stage--; } while(stage > 0u);
void arm_dot_prod_q15( q15_t * pSrcA, q15_t * pSrcB, uint32_t blockSize, q63_t * result) { q63_t sum = 0; /* Temporary result storage */ uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ /* Calculate dot product and then store the result in a temporary buffer. */ sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum); sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ /* Calculate dot product and then store the results in a temporary buffer. */ sum = __SMLALD(*pSrcA++, *pSrcB++, sum); /* Decrement the loop counter */ blkCnt--; } #else /* Run the below code for Cortex-M0 */ /* Initialize blkCnt with number of samples */ blkCnt = blockSize; while(blkCnt > 0u) { /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ /* Calculate dot product and then store the results in a temporary buffer. */ sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++); /* Decrement the loop counter */ blkCnt--; } #endif /* #ifndef ARM_MATH_CM0_FAMILY */ /* Store the result in the destination buffer in 34.30 format */ *result = sum; }
void arm_power_q7( q7_t * pSrc, uint32_t blockSize, q31_t * pResult) { q31_t sum = 0; /* Temporary result storage */ q7_t in; /* Temporary variable to store input */ uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0 /* Run the below code for Cortex-M4 and Cortex-M3 */ q31_t input1; /* Temporary variable to store packed input */ q31_t in1, in2; /* Temporary variables to store input */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* Reading two inputs of pSrc vector and packing */ input1 = *__SIMD32(pSrc)++; in1 = __SXTB16(__ROR(input1, 8)); in2 = __SXTB16(input1); /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ /* calculate power and accumulate to accumulator */ sum = __SMLAD(in1, in1, sum); sum = __SMLAD(in2, in2, sum); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Loop over blockSize number of values */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0 */ while(blkCnt > 0u) { /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ /* Compute Power and then store the result in a temporary variable, sum. */ in = *pSrc++; sum += ((q15_t) in * in); /* Decrement the loop counter */ blkCnt--; } /* Store the result in 18.14 format */ *pResult = sum; }
void arm_dot_prod_q7( q7_t * pSrcA, q7_t * pSrcB, uint32_t blockSize, q31_t * result) { uint32_t blkCnt; /* loop counter */ q31_t sum = 0; /* Temporary variables to store output */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ q31_t input1, input2; /* Temporary variables to store input */ q31_t inA1, inA2, inB1, inB2; /* Temporary variables to store input */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* read 4 samples at a time from sourceA */ input1 = *__SIMD32(pSrcA)++; /* read 4 samples at a time from sourceB */ input2 = *__SIMD32(pSrcB)++; /* extract two q7_t samples to q15_t samples */ inA1 = __SXTB16(__ROR(input1, 8)); /* extract reminaing two samples */ inA2 = __SXTB16(input1); /* extract two q7_t samples to q15_t samples */ inB1 = __SXTB16(__ROR(input2, 8)); /* extract reminaing two samples */ inB2 = __SXTB16(input2); /* multiply and accumulate two samples at a time */ sum = __SMLAD(inA1, inB1, sum); sum = __SMLAD(inA2, inB2, sum); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ /* Dot product and then store the results in a temporary buffer. */ sum = __SMLAD(*pSrcA++, *pSrcB++, sum); /* Decrement the loop counter */ blkCnt--; } #else /* Run the below code for Cortex-M0 */ /* Initialize blkCnt with number of samples */ blkCnt = blockSize; while(blkCnt > 0u) { /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ /* Dot product and then store the results in a temporary buffer. */ sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++); /* Decrement the loop counter */ blkCnt--; } #endif /* #ifndef ARM_MATH_CM0_FAMILY */ /* Store the result in the destination buffer in 18.14 format */ *result = sum; }
void arm_q7_to_q15( q7_t * pSrc, q15_t * pDst, uint32_t blockSize) { q7_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0_FAMILY q31_t in; q31_t in1, in2; q31_t out1, out2; /* Run the below code for Cortex-M4 and Cortex-M3 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = (q15_t) A << 8 */ /* convert from q7 to q15 and then store the results in the destination buffer */ in = *__SIMD32(pIn)++; /* rotatate in by 8 and extend two q7_t values to q15_t values */ in1 = __SXTB16(__ROR(in, 8)); /* extend remainig two q7_t values to q15_t values */ in2 = __SXTB16(in); in1 = in1 << 8u; in2 = in2 << 8u; in1 = in1 & 0xFF00FF00; in2 = in2 & 0xFF00FF00; #ifndef ARM_MATH_BIG_ENDIAN out2 = __PKHTB(in1, in2, 16); out1 = __PKHBT(in2, in1, 16); #else out1 = __PKHTB(in1, in2, 16); out2 = __PKHBT(in2, in1, 16); #endif *__SIMD32(pDst)++ = out1; *__SIMD32(pDst)++ = out2; /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Loop over blockSize number of values */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0_FAMILY */ while(blkCnt > 0u) { /* C = (q15_t) A << 8 */ /* convert from q7 to q15 and then store the results in the destination buffer */ *pDst++ = (q15_t) * pIn++ << 8; /* Decrement the loop counter */ blkCnt--; } }
void arm_offset_q15( q15_t * pSrc, q15_t offset, q15_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ q31_t offset_packed; /* Offset packed to 32 bit */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* Offset is packed to 32 bit in order to use SIMD32 for addition */ offset_packed = __PKHBT(offset, offset, 16); /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = A + offset */ /* Add offset and then store the results in the destination buffer, 2 samples at a time. */ *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed); *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = A + offset */ /* Add offset and then store the results in the destination buffer. */ *pDst++ = (q15_t) __QADD16(*pSrc++, offset); /* Decrement the loop counter */ blkCnt--; } #else /* Run the below code for Cortex-M0 */ /* Initialize blkCnt with number of samples */ blkCnt = blockSize; while(blkCnt > 0u) { /* C = A + offset */ /* Add offset and then store the results in the destination buffer. */ *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16); /* Decrement the loop counter */ blkCnt--; } #endif /* #ifndef ARM_MATH_CM0_FAMILY */ }
void arm_negate_q15( q15_t * pSrc, q15_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0 /* Run the below code for Cortex-M4 and Cortex-M3 */ q15_t in1, in2; /* Temporary variables */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = -A */ /* Read two inputs */ in1 = *pSrc++; in2 = *pSrc++; /* Negate and then store the results in the destination buffer by packing. */ #ifndef ARM_MATH_BIG_ENDIAN *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16); #else *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in2, 16), __SSAT(-in1, 16), 16); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ in1 = *pSrc++; in2 = *pSrc++; #ifndef ARM_MATH_BIG_ENDIAN *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16); #else *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in2, 16), __SSAT(-in1, 16), 16); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Initialize blkCnt with number of samples */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0 */ while(blkCnt > 0u) { /* C = -A */ /* Negate and then store the result in the destination buffer. */ *pDst++ = __SSAT(-*pSrc++, 16); /* Decrement the loop counter */ blkCnt--; } }
void arm_q7_to_q31( q7_t * pSrc, q31_t * pDst, uint32_t blockSize) { q7_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0_FAMILY q31_t in; /* Run the below code for Cortex-M4 and Cortex-M3 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = (q31_t) A << 24 */ /* convert from q7 to q31 and then store the results in the destination buffer */ in = *__SIMD32(pIn)++; #ifndef ARM_MATH_BIG_ENDIAN *pDst++ = (__ROR(in, 8)) & 0xFF000000; *pDst++ = (__ROR(in, 16)) & 0xFF000000; *pDst++ = (__ROR(in, 24)) & 0xFF000000; *pDst++ = (in & 0xFF000000); #else *pDst++ = (in & 0xFF000000); *pDst++ = (__ROR(in, 24)) & 0xFF000000; *pDst++ = (__ROR(in, 16)) & 0xFF000000; *pDst++ = (__ROR(in, 8)) & 0xFF000000; #endif // #ifndef ARM_MATH_BIG_ENDIAN /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; #else /* Run the below code for Cortex-M0 */ /* Loop over blockSize number of values */ blkCnt = blockSize; #endif /* #ifndef ARM_MATH_CM0_FAMILY */ while(blkCnt > 0u) { /* C = (q31_t) A << 24 */ /* convert from q7 to q31 and then store the results in the destination buffer */ *pDst++ = (q31_t) * pIn++ << 24; /* Decrement the loop counter */ blkCnt--; } }
void arm_abs_q15( q15_t * pSrc, q15_t * pDst, uint32_t blockSize) { uint32_t blkCnt; /* loop counter */ #ifndef ARM_MATH_CM0 /* Run the below code for Cortex-M4 and Cortex-M3 */ q15_t in1; /* Input value1 */ q15_t in2; /* Input value2 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = |A| */ /* Read two inputs */ in1 = *pSrc++; in2 = *pSrc++; /* Store the Absolute result in the destination buffer by packing the two values, in a single cycle */ #ifndef ARM_MATH_BIG_ENDIAN *__SIMD32(pDst)++ = __PKHBT(((in1 > 0) ? in1 : __QSUB16(0, in1)), ((in2 > 0) ? in2 : __QSUB16(0, in2)), 16); #else *__SIMD32(pDst)++ = __PKHBT(((in2 > 0) ? in2 : __QSUB16(0, in2)), ((in1 > 0) ? in1 : __QSUB16(0, in1)), 16); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ in1 = *pSrc++; in2 = *pSrc++; #ifndef ARM_MATH_BIG_ENDIAN *__SIMD32(pDst)++ = __PKHBT(((in1 > 0) ? in1 : __QSUB16(0, in1)), ((in2 > 0) ? in2 : __QSUB16(0, in2)), 16); #else *__SIMD32(pDst)++ = __PKHBT(((in2 > 0) ? in2 : __QSUB16(0, in2)), ((in1 > 0) ? in1 : __QSUB16(0, in1)), 16); #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = |A| */ /* Read the input */ in1 = *pSrc++; /* Calculate absolute value of input and then store the result in the destination buffer. */ *pDst++ = (in1 > 0) ? in1 : __QSUB16(0, in1); /* Decrement the loop counter */ blkCnt--; } #else /* Run the below code for Cortex-M0 */ q15_t in; /* Temporary input variable */ /* Initialize blkCnt with number of samples */ blkCnt = blockSize; while(blkCnt > 0u) { /* C = |A| */ /* Read the input */ in = *pSrc++; /* Calculate absolute value of input and then store the result in the destination buffer. */ *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in); /* Decrement the loop counter */ blkCnt--; } #endif /* #ifndef ARM_MATH_CM0 */ }
void arm_var_q15( q15_t * pSrc, uint32_t blockSize, q15_t * pResult) { q31_t sum = 0; /* Accumulator */ q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */ uint32_t blkCnt; /* loop counter */ q63_t sumOfSquares = 0; /* Accumulator */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ q31_t in; /* input value */ q15_t in1; /* input value */ if(blockSize == 1) { *pResult = 0; return; } /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ /* Compute Sum of squares of the input samples * and then store the result in a temporary variable, sum. */ in = *__SIMD32(pSrc)++; sum += ((in << 16) >> 16); sum += (in >> 16); sumOfSquares = __SMLALD(in, in, sumOfSquares); in = *__SIMD32(pSrc)++; sum += ((in << 16) >> 16); sum += (in >> 16); sumOfSquares = __SMLALD(in, in, sumOfSquares); /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ /* Compute Sum of squares of the input samples * and then store the result in a temporary variable, sum. */ in1 = *pSrc++; sumOfSquares = __SMLALD(in1, in1, sumOfSquares); sum += in1; /* Decrement the loop counter */ blkCnt--; } /* Compute Mean of squares of the input samples * and then store the result in a temporary variable, meanOfSquares. */ meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1)); /* Compute square of mean */ squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1))); /* mean of the squares minus the square of the mean. */ *pResult = (meanOfSquares - squareOfMean) >> 15; #else /* Run the below code for Cortex-M0 */ q15_t in; /* input value */ if(blockSize == 1) { *pResult = 0; return; } /* Loop over blockSize number of values */ blkCnt = blockSize; while(blkCnt > 0u) { /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */ /* Compute Sum of squares of the input samples * and then store the result in a temporary variable, sumOfSquares. */ in = *pSrc++; sumOfSquares += (in * in); /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ /* Compute sum of all input values and then store the result in a temporary variable, sum. */ sum += in; /* Decrement the loop counter */ blkCnt--; } /* Compute Mean of squares of the input samples * and then store the result in a temporary variable, meanOfSquares. */ meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1)); /* Compute square of mean */ squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1))); /* mean of the squares minus the square of the mean. */ *pResult = (meanOfSquares - squareOfMean) >> 15; #endif /* #ifndef ARM_MATH_CM0_FAMILY */ }