/**
 * @brief  Negate each element of a Q7 vector: pDst[n] = -pSrc[n].
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples to process
 *
 * Every negated value is passed through __SSAT(..., 8) before being stored.
 */
void arm_negate_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                  /* samples left for the scalar loop */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3: four samples per iteration, one packed store. */
  uint32_t quads;                      /* number of 4-sample groups */
  q7_t s0, s1, s2, s3;                 /* the four inputs of one group */

  for (quads = blockSize >> 2u; quads > 0u; quads--)
  {
    s0 = *pSrc++;
    s1 = *pSrc++;
    s2 = *pSrc++;
    s3 = *pSrc++;

    /* Pack the four saturated negations into a single 32-bit write. */
    *__SIMD32(pDst)++ =
      __PACKq7(__SSAT(-s0, 8), __SSAT(-s1, 8), __SSAT(-s2, 8), __SSAT(-s3, 8));
  }

  /* 0 to 3 samples are left over for the scalar loop below. */
  remaining = blockSize % 0x4u;

#else

  /* Cortex-M0: the scalar loop handles the whole block. */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for (; remaining > 0u; remaining--)
  {
    *pDst++ = __SSAT(-*pSrc++, 8);
  }
}
/**
 * Demodulate a block of packed complex 16-bit samples into 16-bit output.
 *
 * Each loop iteration reads two 32-bit packed input samples, combines each
 * with its predecessor via multiply_conjugate_s16_s32(), scales the result of
 * angle_approx_0deg27() by ks16, saturates to 16 bits and stores both results
 * with a single 32-bit write. The last input sample is kept in z_ so the next
 * call continues from it.
 *
 * NOTE(review): two input samples are consumed per iteration, so an odd
 * src.count would read one sample past src_end -- presumably callers always
 * pass an even count; confirm.
 *
 * @param src  input buffer (src.p, src.count, src.sampling_rate are read)
 * @param dst  output buffer; written through dst.p
 * @return     buffer descriptor { dst.p, src.count, src.sampling_rate }
 */
buffer_s16_t FM::execute(
	const buffer_c16_t& src,
	const buffer_s16_t& dst
) {
	auto z = z_;

	// src_p must NOT be const: __SIMD32(src_p)++ advances the pointer in
	// place through a cast, and modifying a const object is undefined
	// behaviour (the loop condition could legally be treated as invariant).
	auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;

	while(src_p < src_end) {
		// Two consecutive packed samples.
		const auto s0 = *__SIMD32(src_p)++;
		const auto s1 = *__SIMD32(src_p)++;

		// Pair each sample with the one before it.
		const auto t0 = multiply_conjugate_s16_s32(s0, z);
		const auto t1 = multiply_conjugate_s16_s32(s1, s0);
		z = s1;

		const int32_t theta0_int = angle_approx_0deg27(t0) * ks16;
		const int32_t theta0_sat = __SSAT(theta0_int, 16);
		const int32_t theta1_int = angle_approx_0deg27(t1) * ks16;
		const int32_t theta1_sat = __SSAT(theta1_int, 16);

		// Low halfword = first result, high halfword = second result.
		*__SIMD32(dst_p)++ = __PKHBT(
			theta0_sat,
			theta1_sat,
			16
		);
	}
	z_ = z;

	return { dst.p, src.count, src.sampling_rate };
}
/**
 * @brief  Derive the Q15 PID coefficients from Kp/Ki/Kd and optionally clear the state.
 * @param[in,out] S               points to the Q15 PID instance
 * @param[in]     resetStateFlag  non-zero: zero the 3-sample state buffer;
 *                                zero: leave the state untouched
 */
void arm_pid_init_q15(
  arm_pid_instance_q15 * S,
  int32_t resetStateFlag)
{
#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M4 / Cortex-M3 */

  /* A0 = Kp + Ki + Kd */
  S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);

  /* A1 packs -(Kd + Kd + Kp) together with Kd; halfword order follows endianness. */
#ifndef ARM_MATH_BIG_ENDIAN
  S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);
#else
  S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

#else

  /* Cortex-M0 */
  q31_t acc;                           /* wide accumulator for the sums */

  /* A0 = Kp + Ki + Kd, saturated to 16 bits */
  acc = S->Kp + S->Ki + S->Kd;
  S->A0 = (q15_t) __SSAT(acc, 16);

  /* A1 = -(Kd + Kd + Kp), saturated to 16 bits; A2 = Kd */
  acc = -(S->Kd + S->Kd + S->Kp);
  S->A1 = (q15_t) __SSAT(acc, 16);
  S->A2 = S->Kd;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  /* The state buffer always holds 3 samples; clear it on request. */
  if (resetStateFlag)
  {
    memset(S->state, 0, 3u * sizeof(q15_t));
  }
}
/**
 * @brief  Element-wise absolute value of a Q7 vector: pDst[n] = |pSrc[n]|.
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples to process
 *
 * Positive inputs are copied; all others are negated and passed through
 * __SSAT(..., 8).
 */
void arm_abs_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                      /* loop counter */
  q7_t v0, v1, v2, v3;                 /* working samples */

  /* Unrolled part: four samples per iteration, stored as one packed word. */
  for (count = blockSize >> 2u; count > 0u; count--)
  {
    v0 = *pSrc++;
    v1 = *pSrc++;
    v2 = *pSrc++;
    v3 = *pSrc++;

    v0 = (v0 > 0) ? v0 : __SSAT(-v0, 8);
    v1 = (v1 > 0) ? v1 : __SSAT(-v1, 8);
    v2 = (v2 > 0) ? v2 : __SSAT(-v2, 8);
    v3 = (v3 > 0) ? v3 : __SSAT(-v3, 8);

    *__SIMD32(pDst)++ = __PACKq7(v0, v1, v2, v3);
  }

  /* Remaining 0 to 3 samples, one at a time. */
  for (count = blockSize % 0x4u; count > 0u; count--)
  {
    v0 = *pSrc++;
    *pDst++ = (v0 > 0) ? v0 : __SSAT(-v0, 8);
  }
}
/**
 * @brief  Element-wise Q15 subtraction: pDst[n] = pSrcA[n] - pSrcB[n].
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_sub_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                      /* loop counter */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3: four samples per iteration as two packed __QSUB16 operations. */
  for (count = blockSize >> 2u; count > 0u; count--)
  {
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
  }

  /* Remaining 0 to 3 samples, one at a time. */
  for (count = blockSize % 0x4u; count > 0u; count--)
  {
    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
  }

#else

  /* Cortex-M0: widen to 32 bits, subtract, saturate back to 16 bits. */
  for (count = blockSize; count > 0u; count--)
  {
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);
  }

#endif /* #ifndef ARM_MATH_CM0 */
}
/**
 * @brief  Negate each element of a Q15 vector: pDst[n] = -pSrc[n].
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples to process
 *
 * Every negated value is passed through __SSAT(..., 16) before being stored.
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                     /* loop counter */
  q15_t in1, in2;                      /* two consecutive input samples */

  /* Unrolled part: four samples per iteration, stored as two packed words. */
  blkCnt = blockSize >> 2u;

  while (blkCnt > 0u)
  {
    /* C = -A */
    in1 = *pSrc++;
    in2 = *pSrc++;

    /* The earlier sample must land at the lower address. On little-endian
     * targets that is the low halfword of the packed word; on big-endian
     * targets it is the high halfword, so the operands are swapped. */
#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16);
#else
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in2, 16), __SSAT(-in1, 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    in1 = *pSrc++;
    in2 = *pSrc++;

#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16);
#else
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in2, 16), __SSAT(-in1, 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    blkCnt--;
  }

  /* Remaining 0 to 3 samples, one at a time. */
  blkCnt = blockSize % 0x4u;

  while (blkCnt > 0u)
  {
    /* C = -A */
    *pDst++ = __SSAT(-*pSrc++, 16);

    blkCnt--;
  }
}
/**
 * @brief  Complex conjugate of an interleaved (real, imag) Q15 vector.
 * @param[in]  pSrc        points to the input vector
 * @param[out] pDst        points to the output vector
 * @param[in]  numSamples  number of complex samples in each vector
 *
 * The real part is copied; the imaginary part is negated and passed through
 * __SSAT(..., 16).
 */
void arm_cmplx_conj_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t numSamples)
{
  uint32_t count;                      /* loop counter */

  /* Unrolled part: four complex samples per iteration. */
  for (count = numSamples >> 2u; count > 0u; count--)
  {
    /* out = re + j * (-im) */
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);

    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);

    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);

    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);
  }

  /* Remaining 0 to 3 complex samples, one at a time. */
  for (count = numSamples % 0x4u; count > 0u; count--)
  {
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);
  }
}
/**
 * Convert a block of float audio to 16-bit samples and queue it for output.
 *
 * Each input sample is scaled by k, saturated to 16 bits and written to both
 * channels of the empty DMA transmit buffer. When a stream is attached and
 * send_to_fifo is set, the same 16-bit samples are also written to the stream.
 * The float block is finally handed to feed_audio_stats().
 *
 * @param audio         float samples; audio.p is read for every DMA slot
 * @param send_to_fifo  also write the converted samples to `stream` if present
 */
void AudioOutput::fill_audio_buffer(const buffer_f32_t& audio, const bool send_to_fifo) {
	std::array<int16_t, 32> fifo_samples;

	auto tx_buffer = audio::dma::tx_empty_buffer();

	// NOTE(review): the loop is bounded by tx_buffer.count only -- this
	// assumes the DMA buffer holds at most 32 samples (fifo_samples' size)
	// and that `audio` supplies at least that many; confirm against callers.
	for(size_t n = 0; n < tx_buffer.count; n++) {
		const int32_t scaled = audio.p[n] * k;
		const int32_t clipped = __SSAT(scaled, 16);

		// Same mono sample on both channels.
		tx_buffer.p[n].right = clipped;
		tx_buffer.p[n].left = tx_buffer.p[n].right;

		fifo_samples[n] = clipped;
	}

	if( stream && send_to_fifo ) {
		stream->write(fifo_samples.data(), tx_buffer.count * sizeof(fifo_samples[0]));
	}

	feed_audio_stats(audio);
}
/**
 * @brief  Base-2 softmax approximation for Q15 input.
 * @param[in]  vec_in   points to the input vector
 * @param[in]  dim_vec  number of elements in the input vector
 * @param[out] p_out    points to the output vector
 *
 * With d[i] = vec_in[i] - min, where min is the smallest input but no more
 * than 16 below the largest, the output is
 *
 *     p_out[i] = 2^(d[i] + 14) / sum_k 2^d[k]
 *
 * saturated to 16 bits. A vector with a single element therefore yields
 * 1 << 14. Entries below the clamped minimum (d[i] < 0) add nothing to the
 * sum and produce 0; shifting by a negative count, as would otherwise happen
 * for them, is undefined behaviour.
 */
void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out)
{
    uint32_t  sum;        /* at most 65535 * 2^16, which fits in 32 unsigned bits */
    uint32_t  i;          /* wider than dim_vec so the index can never wrap */
    q31_t     diff;       /* vec_in[i] - min; within [0, 16] whenever it is used as a shift */
    q31_t     min, max;

    /* Sentinels outside the q15 range so the first element always replaces them. */
    max = -1 * 0x100000;
    min = 0x100000;
    for (i = 0; i < dim_vec; i++)
    {
        if (vec_in[i] > max)
        {
            max = vec_in[i];
        }
        if (vec_in[i] < min)
        {
            min = vec_in[i];
        }
    }

    /* Limit the dynamic range to 16 bits below the maximum; anything smaller
     * is negligible in the result and is treated as zero. */
    if (max - min > 16)
    {
        min = max - 16;
    }

    sum = 0;
    for (i = 0; i < dim_vec; i++)
    {
        diff = vec_in[i] - min;
        if (diff >= 0)
        {
            sum += (uint32_t) 0x1 << diff;
        }
    }

    for (i = 0; i < dim_vec; i++)
    {
        diff = vec_in[i] - min;
        if (diff >= 0)
        {
            /* diff + 14 <= 30, so the shift stays inside 32 bits; sum >= 1 here
             * because this very element contributed to it. */
            p_out[i] = (q15_t) __SSAT((q31_t) (((uint32_t) 0x1 << (diff + 14)) / sum), 16);
        }
        else
        {
            p_out[i] = 0;
        }
    }
}
/**
 * @brief  Element-wise Q31 multiplication: pDst[n] = pSrcA[n] * pSrcB[n].
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_mult_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                      /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M4 / Cortex-M3 */
  q31_t a1, a2, a3, a4;                /* operands from pSrcA */
  q31_t b1, b2, b3, b4;                /* operands from pSrcB */
  q31_t p1, p2, p3, p4;                /* products */

  /* Unrolled part: four samples per iteration. */
  for (count = blockSize >> 2u; count > 0u; count--)
  {
    a1 = *pSrcA++;
    a2 = *pSrcA++;
    a3 = *pSrcA++;
    a4 = *pSrcA++;

    b1 = *pSrcB++;
    b2 = *pSrcB++;
    b3 = *pSrcB++;
    b4 = *pSrcB++;

    /* Keep the upper 32 bits of each 64-bit product ... */
    p1 = ((q63_t) a1 * b1) >> 32;
    p2 = ((q63_t) a2 * b2) >> 32;
    p3 = ((q63_t) a3 * b3) >> 32;
    p4 = ((q63_t) a4 * b4) >> 32;

    /* ... saturate to 31 bits so the final left shift cannot overflow ... */
    p1 = __SSAT(p1, 31);
    p2 = __SSAT(p2, 31);
    p3 = __SSAT(p3, 31);
    p4 = __SSAT(p4, 31);

    /* ... and shift left by one to restore the Q31 scaling. */
    *pDst++ = p1 << 1u;
    *pDst++ = p2 << 1u;
    *pDst++ = p3 << 1u;
    *pDst++ = p4 << 1u;
  }

  /* Remaining 0 to 3 samples, one at a time. */
  for (count = blockSize % 0x4u; count > 0u; count--)
  {
    a1 = *pSrcA++;
    b1 = *pSrcB++;

    p1 = ((q63_t) a1 * b1) >> 32;
    p1 = __SSAT(p1, 31);

    *pDst++ = p1 << 1u;
  }

#else

  /* Cortex-M0: full 64-bit product, shifted by 31 and clipped to Q31. */
  for (count = blockSize; count > 0u; count--)
  {
    *pDst++ = (q31_t) clip_q63_to_q31(((q63_t) (*pSrcA++) * (*pSrcB++)) >> 31);
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */
}
/**
 * @brief  Element-wise Q7 subtraction: pDst[n] = pSrcA[n] - pSrcB[n].
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 *
 * The bulk of the block is processed 16 samples per iteration using four
 * packed __QSUB8 operations; leftovers are handled one sample at a time.
 */
void arm_sub_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                      /* loop counter */
  q31_t wordA0, wordB0, wordA1, wordB1;        /* packed groups of 4 samples */
  q7_t a, b;                           /* single samples for the tail loop */
  q31_t diff0, diff1, diff2, diff3;    /* packed results */

  for (count = blockSize >> 4u; count > 0u; count--)
  {
    /* Samples 0..7: load both words from each source, then store the first result. */
    wordA0 = _SIMD32_OFFSET(pSrcA);
    wordB0 = _SIMD32_OFFSET(pSrcB);
    wordA1 = _SIMD32_OFFSET(pSrcA + 4);
    wordB1 = _SIMD32_OFFSET(pSrcB + 4);

    diff0 = __QSUB8(wordA0, wordB0);
    diff1 = __QSUB8(wordA1, wordB1);

    _SIMD32_OFFSET(pDst) = diff0;

    /* Samples 8..15: loads are issued before the remaining stores. */
    wordA0 = _SIMD32_OFFSET(pSrcA + 8);
    wordB0 = _SIMD32_OFFSET(pSrcB + 8);
    wordA1 = _SIMD32_OFFSET(pSrcA + 12);
    wordB1 = _SIMD32_OFFSET(pSrcB + 12);

    diff2 = __QSUB8(wordA0, wordB0);
    diff3 = __QSUB8(wordA1, wordB1);

    _SIMD32_OFFSET(pDst + 4) = diff1;
    _SIMD32_OFFSET(pDst + 8) = diff2;
    _SIMD32_OFFSET(pDst + 12) = diff3;

    /* Advance all three pointers past the 16 samples just processed. */
    pSrcA += 16u;
    pSrcB += 16u;
    pDst += 16u;
  }

  /* Remaining 0 to 15 samples, one at a time. */
  for (count = blockSize % 0x10u; count > 0u; count--)
  {
    a = *pSrcA++;
    b = *pSrcB++;

#ifdef CCS
    *pDst++ = __SSATA(a - b, 0, 8);
#else
    *pDst++ = __SSAT(a - b, 8);
#endif /* #ifdef CCS */
  }
}
/**
 * @brief  Element-wise Q15 subtraction: pDst[n] = pSrcA[n] - pSrcB[n].
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_sub_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t blockSize)
{
  uint32_t count;                      /* loop counter */

#if defined (ARM_MATH_LOOPUNROLL)

#if defined (ARM_MATH_DSP)
  q31_t a01, a23;                      /* packed sample pairs from pSrcA */
  q31_t b01, b23;                      /* packed sample pairs from pSrcB */
#endif

  /* Unrolled part: four samples per iteration. */
  for (count = blockSize >> 2U; count > 0U; count--)
  {
#if defined (ARM_MATH_DSP)
    /* Two packed pairs from each source ... */
    a01 = read_q15x2_ia ((q15_t **) &pSrcA);
    a23 = read_q15x2_ia ((q15_t **) &pSrcA);

    b01 = read_q15x2_ia ((q15_t **) &pSrcB);
    b23 = read_q15x2_ia ((q15_t **) &pSrcB);

    /* ... subtracted and stored two samples at a time. */
    write_q15x2_ia (&pDst, __QSUB16(a01, b01));
    write_q15x2_ia (&pDst, __QSUB16(a23, b23));
#else
    /* Widen, subtract, saturate back to 16 bits. */
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
  }

  /* 0 to 3 samples are left for the scalar loop. */
  count = blockSize % 0x4U;

#else

  /* No unrolling: the scalar loop handles the whole block. */
  count = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; count > 0U; count--)
  {
#if defined (ARM_MATH_DSP)
    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
#else
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
  }
}
/**
 * @brief  Complex conjugate of an interleaved (real, imag) Q15 vector.
 * @param[in]  pSrc        points to the input vector
 * @param[out] pDst        points to the output vector
 * @param[in]  numSamples  number of complex samples in each vector
 *
 * The real part is copied; the imaginary part is negated and passed through
 * __SSAT(..., 16). The same saturating negation is used on every core, so the
 * Cortex-M0 build produces the same output as the Cortex-M3/M4 build instead
 * of letting the negation of the most negative value wrap.
 */
void arm_cmplx_conj_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t numSamples)
{
  uint32_t blkCnt;                     /* loop counter */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3: four complex samples per iteration. */
  blkCnt = numSamples >> 2u;

  while (blkCnt > 0u)
  {
    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);

    blkCnt--;
  }

  /* 0 to 3 complex samples are left for the scalar loop. */
  blkCnt = numSamples % 0x4u;

#else

  /* Cortex-M0: the scalar loop handles every sample. */
  blkCnt = numSamples;

#endif /* #ifndef ARM_MATH_CM0 */

  while (blkCnt > 0u)
  {
    /* realOut + j (imagOut) = realIn + j (-1) imagIn, with saturation */
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);

    blkCnt--;
  }
}
/**
 * @brief  Negate each element of a Q15 vector: pDst[n] = -pSrc[n].
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples to process
 *
 * Every negated value is passed through __SSAT(..., 16) before being stored.
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                  /* samples left for the scalar loop */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3: four samples per iteration as two packed stores. */
  uint32_t quads;                      /* number of 4-sample groups */
  q15_t first, second;                 /* two consecutive input samples */

  for (quads = blockSize >> 2u; quads > 0u; quads--)
  {
    first = *pSrc++;
    second = *pSrc++;

    /* Operand order of the pack depends on the target's endianness. */
#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-first, 16), __SSAT(-second, 16), 16);
#else
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-second, 16), __SSAT(-first, 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    first = *pSrc++;
    second = *pSrc++;

#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-first, 16), __SSAT(-second, 16), 16);
#else
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-second, 16), __SSAT(-first, 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  }

  /* 0 to 3 samples are left for the scalar loop. */
  remaining = blockSize % 0x4u;

#else

  /* Cortex-M0: the scalar loop handles the whole block. */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for (; remaining > 0u; remaining--)
  {
    *pDst++ = __SSAT(-*pSrc++, 16);
  }
}
extern "C" void stm32_adc12(void) { PERF_COUNT_START static float last_v_ab[2]; static float prev_v_ab[2]; static uint8_t last_pwm_sector = 1u; static uint8_t prev_pwm_sector = 1u; int16_t phase_current_lsb[3]; uint16_t phase_oc[3]; float out_v_ab[2], i_ab[2]; float temp; hal_read_phase_shunts_(phase_current_lsb, prev_pwm_sector); /* Clarke transformation for balanced systems i_alpha = i_a, i_beta = (2 * i_b + i_a) / sqrt(3) Multiply by 8 because the phase current readings are right-aligned. */ i_ab[0] = float(phase_current_lsb[0]) * float(hal_full_scale_current_a * 8.0 / 32768.0); temp = float(phase_current_lsb[1]) * float(hal_full_scale_current_a * 8.0 / 32768.0); i_ab[1] = (0.57735026919f * i_ab[0] + 1.15470053838f * temp); out_v_ab[0] = out_v_ab[1] = 0.0f; if (high_frequency_task_) { high_frequency_task_(out_v_ab, prev_v_ab, i_ab, vbus_v_); } prev_v_ab[0] = last_v_ab[0]; prev_v_ab[1] = last_v_ab[1]; last_v_ab[0] = out_v_ab[0]; last_v_ab[1] = out_v_ab[1]; prev_pwm_sector = last_pwm_sector; /* Convert alpha-beta frame voltage fractions to SVM output compare values for each phase. */ temp = vbus_inv_; last_pwm_sector = svm_duty_cycle_from_v_alpha_beta( phase_oc, int16_t(__SSAT(int32_t(temp * out_v_ab[0]), 16u)), int16_t(__SSAT(int32_t(temp * out_v_ab[1]), 16u)), hal_pwm_period_ticks); /* Update the timer */ hal_update_timer_(last_pwm_sector, phase_oc); /* Clear the JEOS event, and prepare for the next hardware trigger ADC_ClearFlag(ADC1, ADC_FLAG_JEOS); */ putreg32(ADC_INT_JEOS, STM32_ADC1_ISR); /* Allow the next ADC conversion to happen based on the TIM1 CC4 event ADC_StartInjectedConversion(ADC1); */ putreg32(getreg32(STM32_ADC1_CR) | ADC_CR_JADSTART, STM32_ADC1_CR); PERF_COUNT_END }
/**
 * @brief  Q15 matrix subtraction: pDst = pSrcA - pSrcB, element by element.
 * @param[in]  pSrcA  points to the first input matrix structure
 * @param[in]  pSrcB  points to the second input matrix structure
 * @param[out] pDst   points to the output matrix structure
 * @return     ARM_MATH_SUCCESS, or ARM_MATH_SIZE_MISMATCH when
 *             ARM_MATH_MATRIX_CHECK is defined and the dimensions differ
 */
arm_status arm_mat_sub_q15(
  const arm_matrix_instance_q15 * pSrcA,
  const arm_matrix_instance_q15 * pSrcB,
  arm_matrix_instance_q15 * pDst)
{
  q15_t *pA = pSrcA->pData;            /* walks matrix A */
  q15_t *pB = pSrcB->pData;            /* walks matrix B */
  q15_t *pC = pDst->pData;             /* walks the result */
  uint32_t total;                      /* number of elements per matrix */
  uint32_t count;                      /* loop counter */
  arm_status status;                   /* result reported to the caller */

#ifdef ARM_MATH_MATRIX_CHECK

  /* All three matrices must share the same row and column counts. */
  if (!((pSrcA->numRows == pSrcB->numRows) &&
        (pSrcA->numCols == pSrcB->numCols) &&
        (pSrcA->numRows == pDst->numRows) &&
        (pSrcA->numCols == pDst->numCols)))
  {
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#ifndef ARM_MATH_CM0

    /* Cortex-M4 / Cortex-M3: four elements per iteration as two packed __QSUB16 operations. */
    for (count = total >> 2u; count > 0u; count--)
    {
      /* C(m,n) = A(m,n) - B(m,n) */
      *__SIMD32(pC)++ = __QSUB16(*__SIMD32(pA)++, *__SIMD32(pB)++);
      *__SIMD32(pC)++ = __QSUB16(*__SIMD32(pA)++, *__SIMD32(pB)++);
    }

    /* Remaining 0 to 3 elements, one at a time. */
    for (count = total % 0x4u; count > 0u; count--)
    {
      *pC++ = (q15_t) __QSUB16(*pA++, *pB++);
    }

#else

    /* Cortex-M0: widen, subtract, saturate back to 16 bits. */
    for (count = total; count > 0u; count--)
    {
      *pC++ = (q15_t) __SSAT(((q31_t) * pA++ - *pB++), 16);
    }

#endif /* #ifndef ARM_MATH_CM0 */

    status = ARM_MATH_SUCCESS;
  }

  return (status);
}
/**
 * @brief  Element-wise Q15 multiplication: pDst[n] = pSrcA[n] * pSrcB[n].
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 *
 * Each 32-bit product is shifted right by 15 and saturated to 16 bits.
 */
void arm_mult_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                  /* loop counter */
  q31_t wordA0, wordA1, wordB0, wordB1;            /* packed sample pairs */
  q15_t resHi0, resLo0, resHi1, resLo1;            /* saturated results */
  q31_t prodHi0, prodLo0, prodHi1, prodLo1;        /* scaled products */

  /* Unrolled part: four samples per iteration. */
  for (count = blockSize >> 2u; count > 0u; count--)
  {
    /* Two samples per 32-bit read, alternating between the sources. */
    wordA0 = *__SIMD32(pSrcA)++;
    wordB0 = *__SIMD32(pSrcB)++;
    wordA1 = *__SIMD32(pSrcA)++;
    wordB1 = *__SIMD32(pSrcB)++;

    /* Multiply matching halfwords and scale the products down by 15 bits. */
    prodHi0 = (q31_t) ((q15_t) (wordA0 >> 16) * (q15_t) (wordB0 >> 16)) >> 15;
    prodLo0 = (q31_t) ((q15_t) wordA0 * (q15_t) wordB0) >> 15;
    prodHi1 = (q31_t) ((q15_t) (wordA1 >> 16) * (q15_t) (wordB1 >> 16)) >> 15;
    prodLo1 = (q31_t) ((q15_t) wordA1 * (q15_t) wordB1) >> 15;

    /* Saturate to 16 bits. */
#ifdef CCS
    resHi0 = (q15_t) __SSATA(prodHi0, 0, 16);
    resLo0 = (q15_t) __SSATA(prodLo0, 0, 16);
    resHi1 = (q15_t) __SSATA(prodHi1, 0, 16);
    resLo1 = (q15_t) __SSATA(prodLo1, 0, 16);
#else
    resHi0 = (q15_t) __SSAT(prodHi0, 16);
    resLo0 = (q15_t) __SSAT(prodLo0, 16);
    resHi1 = (q15_t) __SSAT(prodHi1, 16);
    resLo1 = (q15_t) __SSAT(prodLo1, 16);
#endif /* #ifdef CCS */

    /* Each result returns to the halfword its operands came from, so the
     * same packing serves both byte orders. */
    *__SIMD32(pDst)++ = __PKHBT(resLo0, resHi0, 16);
    *__SIMD32(pDst)++ = __PKHBT(resLo1, resHi1, 16);
  }

  /* Remaining 0 to 3 samples, one at a time. */
  for (count = blockSize % 0x4u; count > 0u; count--)
  {
#ifdef CCS
    *pDst++ = (q15_t) __SSATA(((q31_t) ((*pSrcA++) * (*pSrcB++)) >> 15), 0, 16);
#else
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
#endif /* #ifdef CCS */
  }
}
/* Convert one float sample to Q7: scale by 128 and saturate to 8 bits.
 * With ARM_MATH_ROUNDING defined, 0.5 is added (or subtracted for
 * non-positive values) before the conversion to integer. */
static q7_t arm_float_to_q7_sample(float32_t in)
{
#ifdef ARM_MATH_ROUNDING

  in = (in * 128);
  in += in > 0 ? 0.5 : -0.5;

#ifdef CCS
  return (q7_t) (__SSATA((q15_t) (in), 0, 8));
#else
  return (q7_t) (__SSAT((q15_t) (in), 8));
#endif /* #ifdef CCS */

#else

#ifdef CCS
  return (q7_t) __SSATA((q31_t) (in * 128.0f), 0, 8);
#else
  return (q7_t) __SSAT((q31_t) (in * 128.0f), 8);
#endif /* #ifdef CCS */

#endif /* #ifdef ARM_MATH_ROUNDING */
}

/**
 * @brief  Convert a floating-point vector to Q7: pDst[n] = sat8(pSrc[n] * 128).
 * @param[in]  pSrc       points to the floating-point input vector
 * @param[out] pDst       points to the Q7 output vector
 * @param[in]  blockSize  number of samples to process
 */
void arm_float_to_q7(
  float32_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  float32_t *pIn = pSrc;               /* read cursor */
  uint32_t count;                      /* loop counter */

  /* Unrolled part: eight samples per iteration. */
  for (count = blockSize >> 3u; count > 0u; count--)
  {
    *pDst++ = arm_float_to_q7_sample(*pIn++);
    *pDst++ = arm_float_to_q7_sample(*pIn++);
    *pDst++ = arm_float_to_q7_sample(*pIn++);
    *pDst++ = arm_float_to_q7_sample(*pIn++);
    *pDst++ = arm_float_to_q7_sample(*pIn++);
    *pDst++ = arm_float_to_q7_sample(*pIn++);
    *pDst++ = arm_float_to_q7_sample(*pIn++);
    *pDst++ = arm_float_to_q7_sample(*pIn++);
  }

  /* Remaining 0 to 7 samples (blockSize not a multiple of 8), one at a time. */
  for (count = blockSize % 0x8u; count > 0u; count--)
  {
    *pDst++ = arm_float_to_q7_sample(*pIn++);
  }
}
/**
 * @brief  Add a constant offset to each element of a Q15 vector: pDst[n] = pSrc[n] + offset.
 * @param[in]  pSrc       points to the input vector
 * @param[in]  offset     value added to every element
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples to process
 */
void arm_offset_q15(
  q15_t * pSrc,
  q15_t offset,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                      /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M4 / Cortex-M3 */

  /* The offset is duplicated into both halfwords so one __QADD16 adds it to
   * two samples at once. */
  q31_t offsetPair = __PKHBT(offset, offset, 16);

  /* Unrolled part: four samples per iteration as two packed additions. */
  for (count = blockSize >> 2u; count > 0u; count--)
  {
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offsetPair);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offsetPair);
  }

  /* Remaining 0 to 3 samples, one at a time. */
  for (count = blockSize % 0x4u; count > 0u; count--)
  {
    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
  }

#else

  /* Cortex-M0: widen, add, saturate back to 16 bits. */
  for (count = blockSize; count > 0u; count--)
  {
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */
}
void arm_cmplx_mult_real_q31( q31_t * pSrcCmplx, q31_t * pSrcReal, q31_t * pCmplxDst, uint32_t numSamples) { q31_t inA1; /* Temporary variable to store input value */ #ifndef ARM_MATH_CM0_FAMILY /* Run the below code for Cortex-M4 and Cortex-M3 */ uint32_t blkCnt; /* loop counters */ q31_t inA2, inA3, inA4; /* Temporary variables to hold input data */ q31_t inB1, inB2; /* Temporary variabels to hold input data */ q31_t out1, out2, out3, out4; /* Temporary variables to hold output data */ /* loop Unrolling */ blkCnt = numSamples >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { /* C[2 * i] = A[2 * i] * B[i]. */ /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */ /* read real input from complex input buffer */ inA1 = *pSrcCmplx++; inA2 = *pSrcCmplx++; /* read input from real input bufer */ inB1 = *pSrcReal++; inB2 = *pSrcReal++; /* read imaginary input from complex input buffer */ inA3 = *pSrcCmplx++; inA4 = *pSrcCmplx++; /* multiply complex input with real input */ out1 = ((q63_t) inA1 * inB1) >> 32; out2 = ((q63_t) inA2 * inB1) >> 32; out3 = ((q63_t) inA3 * inB2) >> 32; out4 = ((q63_t) inA4 * inB2) >> 32; /* sature the result */ out1 = __SSAT(out1, 31); out2 = __SSAT(out2, 31); out3 = __SSAT(out3, 31); out4 = __SSAT(out4, 31); /* get result in 1.31 format */ out1 = out1 << 1; out2 = out2 << 1; out3 = out3 << 1; out4 = out4 << 1; /* store the result to destination buffer */ *pCmplxDst++ = out1; *pCmplxDst++ = out2; *pCmplxDst++ = out3; *pCmplxDst++ = out4; /* read real input from complex input buffer */ inA1 = *pSrcCmplx++; inA2 = *pSrcCmplx++; /* read input from real input bufer */ inB1 = *pSrcReal++; inB2 = *pSrcReal++; /* read imaginary input from complex input buffer */ inA3 = *pSrcCmplx++; inA4 = *pSrcCmplx++; /* multiply complex input with real input */ out1 = ((q63_t) inA1 * inB1) >> 32; out2 = ((q63_t) inA2 * inB1) >> 32; out3 = ((q63_t) inA3 * 
inB2) >> 32; out4 = ((q63_t) inA4 * inB2) >> 32; /* sature the result */ out1 = __SSAT(out1, 31); out2 = __SSAT(out2, 31); out3 = __SSAT(out3, 31); out4 = __SSAT(out4, 31); /* get result in 1.31 format */ out1 = out1 << 1; out2 = out2 << 1; out3 = out3 << 1; out4 = out4 << 1; /* store the result to destination buffer */ *pCmplxDst++ = out1; *pCmplxDst++ = out2; *pCmplxDst++ = out3; *pCmplxDst++ = out4; /* Decrement the numSamples loop counter */ blkCnt--; } /* If the numSamples is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = numSamples % 0x4u; while(blkCnt > 0u) { /* C[2 * i] = A[2 * i] * B[i]. */ /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */ /* read real input from complex input buffer */ inA1 = *pSrcCmplx++; inA2 = *pSrcCmplx++; /* read input from real input bufer */ inB1 = *pSrcReal++; /* multiply complex input with real input */ out1 = ((q63_t) inA1 * inB1) >> 32; out2 = ((q63_t) inA2 * inB1) >> 32; /* sature the result */ out1 = __SSAT(out1, 31); out2 = __SSAT(out2, 31); /* get result in 1.31 format */ out1 = out1 << 1; out2 = out2 << 1; /* store the result to destination buffer */ *pCmplxDst++ = out1; *pCmplxDst++ = out2; /* Decrement the numSamples loop counter */ blkCnt--; } #else /* Run the below code for Cortex-M0 */ while(numSamples > 0u) { /* realOut = realA * realB. */ /* imagReal = imagA * realB. */ inA1 = *pSrcReal++; /* store the result in the destination buffer. */ *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31); *pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31); /* Decrement the numSamples loop counter */ numSamples--; } #endif /* #ifndef ARM_MATH_CM0_FAMILY */ }
void arm_float_to_q15( float32_t * pSrc, q15_t * pDst, uint32_t blockSize) { float32_t *pIn = pSrc; /* Src pointer */ uint32_t blkCnt; /* loop counter */ #ifdef ARM_MATH_ROUNDING float32_t in; #endif /* #ifdef ARM_MATH_ROUNDING */ #ifndef ARM_MATH_CM0 /* Run the below code for Cortex-M4 and Cortex-M3 */ /*loop Unrolling */ blkCnt = blockSize >> 2u; /* First part of the processing with loop unrolling. Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while(blkCnt > 0u) { #ifdef ARM_MATH_ROUNDING /* C = A * 32768 */ /* convert from float to q15 and then store the results in the destination buffer */ in = *pIn++; in = (in * 32768.0f); in += in > 0 ? 0.5 : -0.5; *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); in = *pIn++; in = (in * 32768.0f); in += in > 0 ? 0.5 : -0.5; *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); in = *pIn++; in = (in * 32768.0f); in += in > 0 ? 0.5 : -0.5; *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); in = *pIn++; in = (in * 32768.0f); in += in > 0 ? 0.5 : -0.5; *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); #else /* C = A * 32768 */ /* convert from float to q15 and then store the results in the destination buffer */ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); #endif /* #ifdef ARM_MATH_ROUNDING */ /* Decrement the loop counter */ blkCnt--; } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ blkCnt = blockSize % 0x4u; while(blkCnt > 0u) { #ifdef ARM_MATH_ROUNDING /* C = A * 32768 */ /* convert from float to q15 and then store the results in the destination buffer */ in = *pIn++; in = (in * 32768.0f); in += in > 0 ? 
0.5 : -0.5; *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); #else /* C = A * 32768 */ /* convert from float to q15 and then store the results in the destination buffer */ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); #endif /* #ifdef ARM_MATH_ROUNDING */ /* Decrement the loop counter */ blkCnt--; } #else /* Run the below code for Cortex-M0 */ /* Loop over blockSize number of values */ blkCnt = blockSize; while(blkCnt > 0u) { #ifdef ARM_MATH_ROUNDING /* C = A * 32768 */ /* convert from float to q15 and then store the results in the destination buffer */ in = *pIn++; in = (in * 32768.0f); in += in > 0 ? 0.5f : -0.5f; *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); #else /* C = A * 32768 */ /* convert from float to q15 and then store the results in the destination buffer */ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); #endif /* #ifdef ARM_MATH_ROUNDING */ /* Decrement the loop counter */ blkCnt--; } #endif /* #ifndef ARM_MATH_CM0 */ }
buffer_c16_t FIRAndDecimateComplex::execute(
	const buffer_c16_t& src,
	const buffer_c16_t& dst
) {
	/* Complex FIR filter combined with decimation.
	 *
	 * int16_t complex input -> int16_t complex output, one output sample per
	 * decimation_factor_ input samples. The input sample count must be a
	 * multiple of decimation_factor_. Taps are normalized so that
	 * 1 << 16 == 1.0, which is why the accumulators are shifted right by 16
	 * before being saturated.
	 *
	 * The multiply-accumulate loop walks the taps in groups of eight, so any
	 * taps past the last full group are not visited.
	 */
	const auto output_rate = src.sampling_rate / decimation_factor_;
	const size_t output_count = src.count / decimation_factor_;

	const sample_t* in_p = src.p;
	sample_t* out_p = dst.p;

	for(size_t remaining = output_count; remaining > 0; remaining--) {
		/* Load the next decimation_factor_ input samples into the tail of
		 * the delay line, one 32-bit word per sample.
		 */
		auto fresh_p = &samples_[taps_count_ - decimation_factor_];
		size_t loaded = 0;
		while(loaded < decimation_factor_) {
			*__SIMD32(fresh_p)++ = *__SIMD32(in_p)++;
			loaded++;
		}

		/* Multiply-accumulate over the delay line, eight taps per pass.
		 * __SMLSLD feeds the real accumulator, __SMLALDX the imaginary one.
		 */
		auto tap_p = &taps_reversed_[0];
		auto delay_p = &samples_[0];

		int64_t acc_real = 0;
		int64_t acc_imag = 0;

		for(size_t group = taps_count_ / 8; group > 0; group--) {
			const auto c0 = *__SIMD32(tap_p)++;
			const auto x0 = *__SIMD32(delay_p)++;
			const auto c1 = *__SIMD32(tap_p)++;
			const auto x1 = *__SIMD32(delay_p)++;
			acc_real = __SMLSLD(x0, c0, acc_real);
			acc_imag = __SMLALDX(x0, c0, acc_imag);
			acc_real = __SMLSLD(x1, c1, acc_real);
			acc_imag = __SMLALDX(x1, c1, acc_imag);

			const auto c2 = *__SIMD32(tap_p)++;
			const auto x2 = *__SIMD32(delay_p)++;
			const auto c3 = *__SIMD32(tap_p)++;
			const auto x3 = *__SIMD32(delay_p)++;
			acc_real = __SMLSLD(x2, c2, acc_real);
			acc_imag = __SMLALDX(x2, c2, acc_imag);
			acc_real = __SMLSLD(x3, c3, acc_real);
			acc_imag = __SMLALDX(x3, c3, acc_imag);

			const auto c4 = *__SIMD32(tap_p)++;
			const auto x4 = *__SIMD32(delay_p)++;
			const auto c5 = *__SIMD32(tap_p)++;
			const auto x5 = *__SIMD32(delay_p)++;
			acc_real = __SMLSLD(x4, c4, acc_real);
			acc_imag = __SMLALDX(x4, c4, acc_imag);
			acc_real = __SMLSLD(x5, c5, acc_real);
			acc_imag = __SMLALDX(x5, c5, acc_imag);

			const auto c6 = *__SIMD32(tap_p)++;
			const auto x6 = *__SIMD32(delay_p)++;
			const auto c7 = *__SIMD32(tap_p)++;
			const auto x7 = *__SIMD32(delay_p)++;
			acc_real = __SMLSLD(x6, c6, acc_real);
			acc_imag = __SMLALDX(x6, c6, acc_imag);
			acc_real = __SMLSLD(x7, c7, acc_real);
			acc_imag = __SMLALDX(x7, c7, acc_imag);
		}

		/* TODO: Re-evaluate whether saturation is performed, normalization,
		 * all that jazz.
		 */
		const int32_t real_scaled = acc_real >> 16;
		const int32_t imag_scaled = acc_imag >> 16;
		const int32_t real_sat = __SSAT(real_scaled, 16);
		const int32_t imag_sat = __SSAT(imag_scaled, 16);

		/* Pack both 16-bit halves into one word and emit the output sample. */
		*__SIMD32(out_p)++ = __PKHBT(
			real_sat,
			imag_sat,
			16
		);

		/* Slide the delay line down by decimation_factor_ samples so the
		 * tail is free for the next batch of input: four words per pass
		 * first, then the leftovers one sample at a time.
		 */
		const size_t words_per_pass = 4;
		const auto keep_count = taps_count_ - decimation_factor_;

		sample_t* to_p = &samples_[0];
		const sample_t* from_p = &samples_[decimation_factor_];

		for(size_t pass = keep_count / words_per_pass; pass > 0; pass--) {
			*__SIMD32(to_p)++ = *__SIMD32(from_p)++;
			*__SIMD32(to_p)++ = *__SIMD32(from_p)++;
			*__SIMD32(to_p)++ = *__SIMD32(from_p)++;
			*__SIMD32(to_p)++ = *__SIMD32(from_p)++;
		}

		for(size_t tail = keep_count % words_per_pass; tail > 0; tail--) {
			*(to_p++) = *(from_p++);
		}
	}

	/* Describe the region of dst that now holds the decimated output. */
	const buffer_c16_t result { dst.p, output_count, output_rate };
	return result;
}