/**
 * @brief Negates the elements of a Q7 vector.
 *
 * Every output sample is the negated input sample, saturated to 8 bits with
 * __SSAT, so an input of -128 does not wrap back to -128.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples to process
 */
void arm_negate_q7(
    q7_t * pSrc,
    q7_t * pDst,
    uint32_t blockSize)
{
    uint32_t remaining;                            /* samples handled by the scalar loop */

#ifndef ARM_MATH_CM0

    /* Cortex-M3 / Cortex-M4: handle four samples per iteration first. */
    uint32_t quads;                                /* number of complete groups of four */
    q7_t a, b, c, d;                               /* the four samples of one group */

    for(quads = blockSize >> 2u; quads > 0u; quads--) {
        /* Fetch one group of four inputs. */
        a = *pSrc++;
        b = *pSrc++;
        c = *pSrc++;
        d = *pSrc++;

        /* Pack the four saturated negations into one word and store it at once. */
        *__SIMD32(pDst)++ =
            __PACKq7(__SSAT(-a, 8), __SSAT(-b, 8), __SSAT(-c, 8),
                     __SSAT(-d, 8));
    }

    /* Zero to three samples are left over for the scalar loop. */
    remaining = blockSize & 0x3u;

#else

    /* Cortex-M0: everything goes through the scalar loop. */
    remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

    for(; remaining > 0u; remaining--) {
        /* C = -A, one sample at a time */
        *pDst++ = __SSAT(-*pSrc++, 8);
    }
}
/*
 * Processes a block of complex 16-bit samples and writes one 16-bit result
 * per input sample into dst.
 *
 * Each input sample is combined with the sample that preceded it through
 * multiply_conjugate_s16_s32(); the result is passed to
 * angle_approx_0deg27(), multiplied by ks16 and saturated to 16 bits.
 * The last input sample of the block is kept in z_ so that the next call
 * continues from it.
 *
 * Two input samples are consumed per loop iteration.
 * NOTE(review): this presumably relies on src.count being even; an odd count
 * would read and write one sample past the end -- confirm against callers.
 *
 * Returns a buffer descriptor that refers to dst.p and carries src.count and
 * src.sampling_rate.
 */
buffer_s16_t FM::execute(
	const buffer_c16_t& src,
	const buffer_s16_t& dst
) {
	auto z = z_;

	// The read cursor is advanced through the __SIMD32() cast below, so it
	// must be a mutable object. Declaring it const (as before) makes every
	// increment undefined behaviour and allows the compiler to assume that
	// the loop condition never changes.
	auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;
	while(src_p < src_end) {
		// Fetch two consecutive complex samples as packed 32-bit words.
		const auto s0 = *__SIMD32(src_p)++;
		const auto s1 = *__SIMD32(src_p)++;
		// Pair each sample with its predecessor.
		const auto t0 = multiply_conjugate_s16_s32(s0, z);
		const auto t1 = multiply_conjugate_s16_s32(s1, s0);
		z = s1;
		const int32_t theta0_int = angle_approx_0deg27(t0) * ks16;
		const int32_t theta0_sat = __SSAT(theta0_int, 16);
		const int32_t theta1_int = angle_approx_0deg27(t1) * ks16;
		const int32_t theta1_sat = __SSAT(theta1_int, 16);
		// Store both results with a single 32-bit write:
		// theta0 in the low half-word, theta1 in the high half-word.
		*__SIMD32(dst_p)++ = __PKHBT(
			theta0_sat,
			theta1_sat,
			16
		);
	}
	z_ = z;

	return { dst.p, src.count, src.sampling_rate };
}
/**
 * @brief Initialization function for the Q15 PID controller instance.
 *
 * Derives the internal coefficients from the gains Kp, Ki and Kd stored in
 * the instance and optionally clears the three-sample state buffer.
 *
 * @param[in,out] S               points to the PID instance structure
 * @param[in]     resetStateFlag  non-zero clears the state buffer,
 *                                zero leaves it unchanged
 */
void arm_pid_init_q15(
  arm_pid_instance_q15 * S,
  int32_t resetStateFlag)
{

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M4 / Cortex-M3: saturating half-word additions. */

  /* A0 = Kp + Ki + Kd */
  S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);

  /* A1 packs -(2 * Kd + Kp) together with Kd into one 32-bit word;
   * which half-word holds which value depends on the byte order. */
#ifndef  ARM_MATH_BIG_ENDIAN

  S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);

#else

  S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

#else

  /* Cortex-M0: plain arithmetic followed by saturation to 16 bits. */

  q31_t acc;                                     /* wide intermediate sum */

  /* A0 = Kp + Ki + Kd */
  acc = S->Kp + S->Ki + S->Kd;
  S->A0 = (q15_t) __SSAT(acc, 16);

  /* A1 = -(2 * Kd + Kp), A2 = Kd */
  acc = -(S->Kd + S->Kd + S->Kp);
  S->A1 = (q15_t) __SSAT(acc, 16);
  S->A2 = S->Kd;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  /* The state reset is identical for both code paths. */
  if(resetStateFlag != 0)
  {
    /* The state buffer always holds three samples. */
    memset(S->state, 0, 3u * sizeof(q15_t));
  }

}
示例#4
0
/**
 * @brief Q7 vector absolute value.
 *
 * Positive samples are copied unchanged; all other samples are negated and
 * saturated to 8 bits with __SSAT.
 *
 * @param[in]  pSrc       points to the input buffer
 * @param[out] pDst       points to the output buffer
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_abs_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* iterations left in the current loop */
  q7_t a, b, c, d;                               /* samples of the current group */

  /* Main loop: four samples per iteration, stored with one 32-bit write. */
  for(count = blockSize >> 2u; count > 0u; count--)
  {
    a = *pSrc++;
    b = *pSrc++;
    c = *pSrc++;
    d = *pSrc++;

    /* C = |A| for each of the four samples, packed into a single word */
    *__SIMD32(pDst)++ =
      __PACKq7(((a <= 0) ? __SSAT(-a, 8) : a),
               ((b <= 0) ? __SSAT(-b, 8) : b),
               ((c <= 0) ? __SSAT(-c, 8) : c),
               ((d <= 0) ? __SSAT(-d, 8) : d));
  }

  /* Tail loop: the zero to three samples that do not fill a group. */
  for(count = blockSize & 0x3u; count > 0u; count--)
  {
    a = *pSrc++;

    /* C = |A| */
    *pDst++ = (a <= 0) ? __SSAT(-a, 8) : a;
  }
}
/**
 * @brief Q15 vector subtraction: pDst[n] = pSrcA[n] - pSrcB[n], saturated.
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_sub_q15(
    q15_t * pSrcA,
    q15_t * pSrcB,
    q15_t * pDst,
    uint32_t blockSize)
{
    uint32_t count;                                /* iterations left in the current loop */


#ifndef ARM_MATH_CM0

    /* Cortex-M4 / Cortex-M3: four samples per iteration, two per SIMD operation. */
    for(count = blockSize >> 2u; count > 0u; count--) {
        /* C = A - B on two pairs of half-words */
        *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
        *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
    }

    /* Remaining zero to three samples, one at a time. */
    for(count = blockSize & 0x3u; count > 0u; count--) {
        /* C = A - B */
        *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
    }

#else

    /* Cortex-M0: scalar subtraction in 32 bits, saturated back to 16 bits. */
    for(count = blockSize; count > 0u; count--) {
        /* C = A - B */
        *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);
    }

#endif /* #ifndef ARM_MATH_CM0 */


}
/**
 * @brief Negates the elements of a Q15 vector.
 *
 * Every output sample is the arithmetic negation of the input sample,
 * saturated to 16 bits with __SSAT.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* iterations left in the current loop */
  q15_t first, second;                           /* the two samples of one output word */

  /* Main loop: four samples per iteration, written as two packed words. */
  for(count = blockSize >> 2u; count > 0u; count--)
  {
    /* C = -A for samples 0 and 1 */
    first = *pSrc++;
    second = *pSrc++;
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-first, 16), __SSAT(-second, 16), 16);

    /* C = -A for samples 2 and 3 */
    first = *pSrc++;
    second = *pSrc++;
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-first, 16), __SSAT(-second, 16), 16);
  }

  /* Tail loop: the zero to three samples that do not fill a group. */
  for(count = blockSize & 0x3u; count > 0u; count--)
  {
    /* C = -A */
    *pDst++ = __SSAT(-*pSrc++, 16);
  }
}
/**
 * @brief Q15 complex conjugate.
 *
 * The data is interleaved as (real, imag, real, imag, ...). The real part is
 * copied and the imaginary part is negated with saturation to 16 bits.
 *
 * @param[in]  pSrc        points to the input vector
 * @param[out] pDst        points to the output vector
 * @param[in]  numSamples  number of complex samples in each vector
 */
void arm_cmplx_conj_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t numSamples)
{
  uint32_t count;                                /* iterations left in the current loop */

  /* Main loop: four complex samples (eight values) per iteration. */
  for(count = numSamples >> 2u; count > 0u; count--)
  {
    /* C[0] + jC[1] = A[0] + j(-1)A[1], repeated for four samples */
    pDst[0] = pSrc[0];
    pDst[1] = __SSAT(-pSrc[1], 16);
    pDst[2] = pSrc[2];
    pDst[3] = __SSAT(-pSrc[3], 16);
    pDst[4] = pSrc[4];
    pDst[5] = __SSAT(-pSrc[5], 16);
    pDst[6] = pSrc[6];
    pDst[7] = __SSAT(-pSrc[7], 16);

    pSrc += 8;
    pDst += 8;
  }

  /* Tail loop: the zero to three complex samples that are left. */
  for(count = numSamples & 0x3u; count > 0u; count--)
  {
    /* C[0] + jC[1] = A[0] + j(-1)A[1] */
    pDst[0] = pSrc[0];
    pDst[1] = __SSAT(-pSrc[1], 16);

    pSrc += 2;
    pDst += 2;
  }
}
/*
 * Converts a block of float samples to saturated 16-bit integers and copies
 * them into the next empty audio DMA transmit buffer (same value on the left
 * and right channel). When a stream is attached and send_to_fifo is set, the
 * converted samples are also written to that stream. Finally the original
 * float block is handed to feed_audio_stats().
 *
 * NOTE(review): the loop runs over the DMA buffer's sample count and indexes
 * both `audio` and the 32-entry staging array with it, so this presumably
 * relies on the DMA buffer holding at most 32 samples and on `audio` being
 * at least that long -- confirm.
 */
void AudioOutput::fill_audio_buffer(const buffer_f32_t& audio, const bool send_to_fifo) {
	std::array<int16_t, 32> staged;

	auto tx = audio::dma::tx_empty_buffer();
	for(size_t n = 0; n < tx.count; n++) {
		// Scale by k, then clamp to the signed 16-bit range.
		const int32_t scaled = audio.p[n] * k;
		const int32_t clamped = __SSAT(scaled, 16);
		tx.p[n].left = tx.p[n].right = clamped;
		staged[n] = clamped;
	}

	if( stream && send_to_fifo ) {
		const auto byte_count = tx.count * sizeof(staged[0]);
		stream->write(staged.data(), byte_count);
	}

	feed_audio_stats(audio);
}
/**
 * @brief Q15 softmax function using base-2 exponentials.
 *
 * Each input is turned into the weight 2^(vec_in[i] - min), where min is the
 * smallest input, raised to (max - 16) when the inputs span more than 16.
 * The output is that weight divided by the sum of all weights, scaled so
 * that a weight equal to the whole sum yields 1 << 14.
 *
 * Inputs that lie below the raised minimum get a weight of zero and an
 * output of zero. Their true share would be less than 2^-16 of the largest
 * weight, and shifting by the resulting negative amount is undefined
 * behaviour.
 *
 * @param[in]  vec_in   pointer to the input vector
 * @param[in]  dim_vec  number of elements in the input vector
 * @param[out] p_out    pointer to the output vector
 */
void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out)
{
    /* Unsigned accumulator: up to 65535 weights of at most 2^16 each still fit. */
    uint32_t  sum;
    /* Same type as dim_vec, so the index cannot overflow for long vectors. */
    uint16_t  i;
    q31_t     min, max;
    q31_t     shift;

    /* Sentinels outside of the q15 value range. */
    max = -1 * 0x100000;
    min = 0x100000;
    for (i = 0; i < dim_vec; i++)
    {
        if (vec_in[i] > max)
        {
            max = vec_in[i];
        }
        if (vec_in[i] < min)
        {
            min = vec_in[i];
        }
    }

    /* Limit the dynamic range to 16 bits below the maximum; anything
     * smaller rounds to zero in the output anyway.
     */
    if (max - min > 16)
    {
        min = max - 16;
    }

    sum = 0;

    for (i = 0; i < dim_vec; i++)
    {
        shift = vec_in[i] - min;
        /* Skip values below the clamped minimum instead of shifting by a
         * negative amount. */
        if (shift >= 0)
        {
            sum += (uint32_t) 0x1 << shift;
        }
    }

    for (i = 0; i < dim_vec; i++)
    {
        shift = vec_in[i] - min;
        if (shift >= 0)
        {
            /* shift is at most 16, so the numerator is at most 1 << 30.
             * sum is never zero here: the maximum element always has a
             * non-negative shift and has contributed to it. */
            p_out[i] = (q15_t) __SSAT((q31_t) (((uint32_t) 0x1 << (shift + 14)) / sum), 16);
        }
        else
        {
            p_out[i] = 0;
        }
    }

}
示例#10
0
/**
 * @brief Q31 vector multiplication: pDst[n] = pSrcA[n] * pSrcB[n].
 *
 * On Cortex-M4 / Cortex-M3 the upper 32 bits of the 64-bit product are
 * saturated to 31 bits and then shifted left by one. On Cortex-M0 the
 * product is shifted right by 31 and clipped with clip_q63_to_q31().
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_mult_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* iterations left in the current loop */

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M4 / Cortex-M3 */
  q31_t a0, a1, a2, a3;                          /* operands taken from pSrcA */
  q31_t b0, b1, b2, b3;                          /* operands taken from pSrcB */
  q31_t p0, p1, p2, p3;                          /* products */

  /* Main loop: four samples per iteration. */
  for(count = blockSize >> 2u; count > 0u; count--)
  {
    /* Load all eight operands before any result is stored. */
    a0 = pSrcA[0];
    a1 = pSrcA[1];
    a2 = pSrcA[2];
    a3 = pSrcA[3];
    pSrcA += 4;
    b0 = pSrcB[0];
    b1 = pSrcB[1];
    b2 = pSrcB[2];
    b3 = pSrcB[3];
    pSrcB += 4;

    /* Keep the upper word of each 64-bit product. */
    p0 = ((q63_t) a0 * b0) >> 32;
    p1 = ((q63_t) a1 * b1) >> 32;
    p2 = ((q63_t) a2 * b2) >> 32;
    p3 = ((q63_t) a3 * b3) >> 32;

    /* Saturate to 31 bits so that the final left shift cannot overflow. */
    p0 = __SSAT(p0, 31);
    p1 = __SSAT(p1, 31);
    p2 = __SSAT(p2, 31);
    p3 = __SSAT(p3, 31);

    pDst[0] = p0 << 1u;
    pDst[1] = p1 << 1u;
    pDst[2] = p2 << 1u;
    pDst[3] = p3 << 1u;
    pDst += 4;
  }

  /* Tail loop: the zero to three samples that are left. */
  for(count = blockSize & 0x3u; count > 0u; count--)
  {
    /* C = A * B */
    a0 = *pSrcA++;
    b0 = *pSrcB++;
    p0 = ((q63_t) a0 * b0) >> 32;
    p0 = __SSAT(p0, 31);
    *pDst++ = p0 << 1u;
  }

#else

  /* Cortex-M0: one sample per iteration. */
  for(count = blockSize; count > 0u; count--)
  {
    /* C = A * B */
    *pDst++ =
      (q31_t) clip_q63_to_q31(((q63_t) (*pSrcA++) * (*pSrcB++)) >> 31);
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */
}
示例#11
0
文件: arm_sub_q7.c 项目: JGSuw/DIP
/**
 * @brief Q7 vector subtraction: pDst[n] = pSrcA[n] - pSrcB[n], saturated.
 *
 * The main loop handles sixteen samples per iteration as four 32-bit words,
 * using __QSUB8 on four samples at a time. Loads, subtractions and stores
 * are deliberately interleaved; their relative order is kept as is.
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_sub_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* iterations left in the current loop */
  q31_t wordA0, wordB0, wordA1, wordB1;          /* packed groups of four input samples */
  q7_t sampleA, sampleB;                         /* single samples for the tail loop */
  q31_t diff0, diff1, diff2, diff3;              /* packed groups of four results */


  /* Main loop: sixteen samples per iteration. */
  for(count = blockSize >> 4u; count > 0u; count--)
  {
    /* Samples 0..3 of both inputs, samples 4..7 of A */
    wordA0 = _SIMD32_OFFSET(pSrcA);
    wordB0 = _SIMD32_OFFSET(pSrcB);
    wordA1 = _SIMD32_OFFSET(pSrcA + 4);

    /* Saturating A - B on samples 0..3 */
    diff0 = __QSUB8(wordA0, wordB0);

    /* Samples 4..7 of B */
    wordB1 = _SIMD32_OFFSET(pSrcB + 4);

    /* Results 0..3 */
    _SIMD32_OFFSET(pDst) = diff0;

    /* Saturating A - B on samples 4..7 */
    diff1 = __QSUB8(wordA1, wordB1);

    /* Samples 8..11 of both inputs, samples 12..15 of A */
    wordA0 = _SIMD32_OFFSET(pSrcA + 8);
    wordB0 = _SIMD32_OFFSET(pSrcB + 8);
    wordA1 = _SIMD32_OFFSET(pSrcA + 12);

    /* Saturating A - B on samples 8..11 */
    diff2 = __QSUB8(wordA0, wordB0);

    /* Samples 12..15 of B */
    wordB1 = _SIMD32_OFFSET(pSrcB + 12);

    /* All of A has been read for this iteration. */
    pSrcA += 16u;

    /* Results 4..7 */
    _SIMD32_OFFSET(pDst + 4) = diff1;

    /* Saturating A - B on samples 12..15 */
    diff3 = __QSUB8(wordA1, wordB1);

    /* Results 8..11 */
    _SIMD32_OFFSET(pDst + 8) = diff2;

    /* All of B has been read for this iteration. */
    pSrcB += 16u;

    /* Results 12..15 */
    _SIMD32_OFFSET(pDst + 12) = diff3;

    /* All sixteen results have been written. */
    pDst += 16u;
  }

  /* Tail loop: the zero to fifteen samples that do not fill a group. */
  for(count = blockSize & 0xFu; count > 0u; count--)
  {
    /* C = A - B, saturated to 8 bits */
    sampleA = *pSrcA++;
    sampleB = *pSrcB++;
#ifdef CCS
    *pDst++ = __SSATA(sampleA - sampleB, 0, 8);
#else
    *pDst++ = __SSAT(sampleA - sampleB, 8);
#endif /* #ifdef CCS */
  }

}
示例#12
0
/**
 * @brief Q15 vector subtraction: pDst[n] = pSrcA[n] - pSrcB[n], saturated.
 *
 * With ARM_MATH_LOOPUNROLL four samples are handled per iteration; with
 * ARM_MATH_DSP the work is done on packed pairs through __QSUB16.
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_sub_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t blockSize)
{
        uint32_t count;                                /* iterations left in the current loop */

#if defined (ARM_MATH_LOOPUNROLL)

#if defined (ARM_MATH_DSP)
  q31_t pairA0, pairA1;                          /* packed sample pairs from pSrcA */
  q31_t pairB0, pairB1;                          /* packed sample pairs from pSrcB */
#endif

  /* Unrolled loop: four results per iteration. */
  for (count = blockSize >> 2U; count > 0U; count--)
  {
    /* C = A - B */

#if defined (ARM_MATH_DSP)
    /* Two packed pairs from each input */
    pairA0 = read_q15x2_ia ((q15_t **) &pSrcA);
    pairA1 = read_q15x2_ia ((q15_t **) &pSrcA);
    pairB0 = read_q15x2_ia ((q15_t **) &pSrcB);
    pairB1 = read_q15x2_ia ((q15_t **) &pSrcB);

    /* Saturating subtraction on both halves, stored pair by pair */
    write_q15x2_ia (&pDst, __QSUB16(pairA0, pairB0));
    write_q15x2_ia (&pDst, __QSUB16(pairA1, pairB1));
#else
    /* Widen to 32 bits, subtract, saturate back to 16 bits. */
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
  }

  /* Zero to three samples are left for the loop below. */
  count = blockSize & 0x3U;

#else

  /* No unrolling: the loop below handles every sample. */
  count = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; count > 0U; count--)
  {
    /* C = A - B */
#if defined (ARM_MATH_DSP)
    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
#else
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
  }

}
/**
 * @brief Q15 complex conjugate.
 *
 * The data is interleaved as (real, imag, real, imag, ...). The real part is
 * copied and the imaginary part is negated with saturation to 16 bits, so an
 * imaginary part of -32768 becomes 32767 on every target. The Cortex-M0
 * path previously negated without saturation, which left -32768 unchanged.
 *
 * @param[in]  pSrc        points to the input vector
 * @param[out] pDst        points to the output vector
 * @param[in]  numSamples  number of complex samples in each vector
 */
void arm_cmplx_conj_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t numSamples)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /* Loop unrolling: four complex samples (eight values) per iteration.
   ** The loop after the #endif computes the remaining 0 to 3 samples. */
  blkCnt = numSamples >> 2u;

  while(blkCnt > 0u)
  {
    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
    /* Calculate Complex Conjugate and then store the results in the destination buffer. */
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If numSamples is not a multiple of 4, the remaining samples are
   ** computed below without loop unrolling. */
  blkCnt = numSamples % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Every sample goes through the scalar loop below. */
  blkCnt = numSamples;

#endif /* #ifndef ARM_MATH_CM0 */

  while(blkCnt > 0u)
  {
    /* realOut + j (imagOut) = realIn + j (-1) imagIn */
    /* The saturating negation is shared by all targets so that they agree
     ** on the result for an imaginary part of -32768. */
    *pDst++ = *pSrc++;
    *pDst++ = __SSAT(-*pSrc++, 16);

    /* Decrement the loop counter */
    blkCnt--;
  }

}
/**
 * @brief Negates the elements of a Q15 vector.
 *
 * Every output sample is the negated input sample, saturated to 16 bits
 * with __SSAT.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples handled by the scalar loop */


#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3: four samples per iteration, stored as two words. */

  uint32_t quads;                                /* number of complete groups of four */
  q15_t first, second;                           /* the two samples of one output word */

  for(quads = blockSize >> 2u; quads > 0u; quads--)
  {
    /* C = -A for samples 0 and 1; the half-word order follows the byte order. */
    first = *pSrc++;
    second = *pSrc++;

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-first, 16), __SSAT(-second, 16), 16);

#else

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-second, 16), __SSAT(-first, 16), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    /* C = -A for samples 2 and 3 */
    first = *pSrc++;
    second = *pSrc++;

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-first, 16), __SSAT(-second, 16), 16);

#else

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-second, 16), __SSAT(-first, 16), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
  }

  /* Zero to three samples are left for the scalar loop. */
  remaining = blockSize & 0x3u;

#else

  /* Cortex-M0: everything goes through the scalar loop. */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for(; remaining > 0u; remaining--)
  {
    /* C = -A */
    *pDst++ = __SSAT(-*pSrc++, 16);
  }

}
示例#15
0
extern "C" void stm32_adc12(void) {
PERF_COUNT_START
    static float last_v_ab[2];
    static float prev_v_ab[2];
    static uint8_t last_pwm_sector = 1u;
    static uint8_t prev_pwm_sector = 1u;

    int16_t phase_current_lsb[3];
    uint16_t phase_oc[3];
    float out_v_ab[2], i_ab[2];
    float temp;

    hal_read_phase_shunts_(phase_current_lsb, prev_pwm_sector);

    /*
    Clarke transformation for balanced systems

    i_alpha = i_a,
    i_beta = (2 * i_b + i_a) / sqrt(3)

    Multiply by 8 because the phase current readings are right-aligned.
    */
    i_ab[0] = float(phase_current_lsb[0]) *
              float(hal_full_scale_current_a * 8.0 / 32768.0);
    temp = float(phase_current_lsb[1]) *
           float(hal_full_scale_current_a * 8.0 / 32768.0);
    i_ab[1] = (0.57735026919f * i_ab[0] + 1.15470053838f * temp);

    out_v_ab[0] = out_v_ab[1] = 0.0f;

    if (high_frequency_task_) {
        high_frequency_task_(out_v_ab, prev_v_ab, i_ab, vbus_v_);
    }

    prev_v_ab[0] = last_v_ab[0];
    prev_v_ab[1] = last_v_ab[1];

    last_v_ab[0] = out_v_ab[0];
    last_v_ab[1] = out_v_ab[1];

    prev_pwm_sector = last_pwm_sector;

    /*
    Convert alpha-beta frame voltage fractions to SVM output compare values
    for each phase.
    */
    temp = vbus_inv_;
    last_pwm_sector = svm_duty_cycle_from_v_alpha_beta(
        phase_oc,
        int16_t(__SSAT(int32_t(temp * out_v_ab[0]), 16u)),
        int16_t(__SSAT(int32_t(temp * out_v_ab[1]), 16u)),
        hal_pwm_period_ticks);

    /* Update the timer */
    hal_update_timer_(last_pwm_sector, phase_oc);

    /*
    Clear the JEOS event, and prepare for the next hardware trigger
    ADC_ClearFlag(ADC1, ADC_FLAG_JEOS);
    */
    putreg32(ADC_INT_JEOS, STM32_ADC1_ISR);
    /*
    Allow the next ADC conversion to happen based on the TIM1 CC4 event
    ADC_StartInjectedConversion(ADC1);
    */
    putreg32(getreg32(STM32_ADC1_CR) | ADC_CR_JADSTART, STM32_ADC1_CR);

PERF_COUNT_END
}
/**
 * @brief Q15 matrix subtraction: C(m,n) = A(m,n) - B(m,n), saturated.
 *
 * @param[in]  pSrcA  points to the first input matrix structure
 * @param[in]  pSrcB  points to the second input matrix structure
 * @param[out] pDst   points to the output matrix structure
 * @return     ARM_MATH_SUCCESS, or ARM_MATH_SIZE_MISMATCH when
 *             ARM_MATH_MATRIX_CHECK is defined and the dimensions of the
 *             three matrices do not agree
 */
arm_status arm_mat_sub_q15(
    const arm_matrix_instance_q15 * pSrcA,
    const arm_matrix_instance_q15 * pSrcB,
    arm_matrix_instance_q15 * pDst)
{
    q15_t *a = pSrcA->pData;                       /* read cursor into matrix A */
    q15_t *b = pSrcB->pData;                       /* read cursor into matrix B */
    q15_t *out = pDst->pData;                      /* write cursor into the result */
    uint32_t total;                                /* number of elements per matrix */
    uint32_t count;                                /* iterations left in the current loop */
    arm_status result;                             /* value handed back to the caller */


#ifdef ARM_MATH_MATRIX_CHECK


    /* All three matrices must have the same number of rows and columns. */
    if((pSrcA->numRows != pSrcB->numRows) ||
       (pSrcA->numCols != pSrcB->numCols) ||
       (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols)) {
        result = ARM_MATH_SIZE_MISMATCH;
    } else
#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */

    {
        /* The matrices are processed as flat arrays of rows * columns elements. */
        total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#ifndef ARM_MATH_CM0

        /* Cortex-M4 / Cortex-M3: four elements per iteration, two per SIMD operation. */
        for(count = total >> 2u; count > 0u; count--) {
            *__SIMD32(out)++ = __QSUB16(*__SIMD32(a)++, *__SIMD32(b)++);
            *__SIMD32(out)++ = __QSUB16(*__SIMD32(a)++, *__SIMD32(b)++);
        }

        /* Remaining zero to three elements, one at a time. */
        for(count = total & 0x3u; count > 0u; count--) {
            *out++ = (q15_t) __QSUB16(*a++, *b++);
        }

#else

        /* Cortex-M0: subtract in 32 bits and saturate back to 16 bits. */
        for(count = total; count > 0u; count--) {
            *out++ = (q15_t) __SSAT(((q31_t) * a++ - *b++), 16);
        }

#endif /* #ifndef ARM_MATH_CM0 */

        result = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return (result);
}
示例#17
0
文件: arm_mult_q15.c 项目: JGSuw/DIP
/**
 * @brief Q15 vector multiplication: pDst[n] = (pSrcA[n] * pSrcB[n]) >> 15,
 *        saturated to 16 bits.
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 */
void arm_mult_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* iterations left in the current loop */
  q31_t pairA0, pairA1, pairB0, pairB1;          /* packed pairs of input samples */
  q15_t res1, res2, res3, res4;                  /* saturated results */
  q31_t prod1, prod2, prod3, prod4;              /* products before saturation */

  /* Main loop: four samples per iteration, read as two words per input. */
  for(count = blockSize >> 2u; count > 0u; count--)
  {
    pairA0 = *__SIMD32(pSrcA)++;
    pairB0 = *__SIMD32(pSrcB)++;
    pairA1 = *__SIMD32(pSrcA)++;
    pairB1 = *__SIMD32(pSrcB)++;

    /* Multiply upper half-word by upper half-word and lower by lower,
     * then drop 15 fractional bits. */
    prod1 = (q31_t)((q15_t)(pairA0 >> 16) * (q15_t)(pairB0 >> 16)) >> 15;
    prod2 = (q31_t)((q15_t)pairA0 * (q15_t)pairB0) >> 15;
    prod3 = (q31_t)((q15_t)(pairA1 >> 16) * (q15_t)(pairB1 >> 16)) >> 15;
    prod4 = (q31_t)((q15_t)pairA1 * (q15_t)pairB1) >> 15;

    /* Saturate each product to 16 bits. */
#ifdef CCS

    res1 = (q15_t) __SSATA(prod1, 0, 16);
    res2 = (q15_t) __SSATA(prod2, 0, 16);
    res3 = (q15_t) __SSATA(prod3, 0, 16);
    res4 = (q15_t) __SSATA(prod4, 0, 16);

#else

    res1 = (q15_t) __SSAT(prod1, 16);
    res2 = (q15_t) __SSAT(prod2, 16);
    res3 = (q15_t) __SSAT(prod3, 16);
    res4 = (q15_t) __SSAT(prod4, 16);

#endif /* #ifdef CCS */

    /* Each result goes back into the half-word position its operands were
     * read from, so the packing is the same whether or not
     * ARM_MATH_BIG_ENDIAN is defined. */
    *__SIMD32(pDst)++ = __PKHBT(res2, res1, 16);
    *__SIMD32(pDst)++ = __PKHBT(res4, res3, 16);
  }

  /* Tail loop: the zero to three samples that are left. */
  for(count = blockSize & 0x3u; count > 0u; count--)
  {
    /* C = A * B */
#ifdef CCS
    *pDst++ = (q15_t) __SSATA(((q31_t) ((*pSrcA++) * (*pSrcB++)) >> 15), 0, 16);
#else
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
#endif /* #ifdef CCS */
  }
}
示例#18
0
void arm_float_to_q7(     
  float32_t * pSrc,     
  q7_t * pDst,     
  uint32_t blockSize)     
{     
  float32_t *pIn = pSrc;                         /* Src pointer */     
  uint32_t blkCnt;                               /* loop counter */     
     
#ifdef ARM_MATH_ROUNDING     
     
  float32_t in;     
     
#endif     
     
  /*loop Unrolling */     
  blkCnt = blockSize >> 3u;     
     
  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
   ** a second loop below computes the remaining 1 to 4 samples. */     
  while(blkCnt > 0u)     
  {     
#ifdef ARM_MATH_ROUNDING     
    /* C = A * 128 */     
    /* convert from float to q7 and then store the results in the destination buffer */     
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;     
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
     
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;     
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
     
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;     
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
     
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;     
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
  
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;     
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
     
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;     
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
     
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;     
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
     
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;     
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
     
#else     
     
    /* C = A * 128 */     
    /* convert from float to q7 and then store the results in the destination buffer */   
#ifdef CCS   
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
#else     
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);     
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);     
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);     
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);   
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);     
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);     
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);     
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);   
#endif  //	#ifdef CCS   
     
#endif  //	#ifdef ARM_MATH_ROUNDING     
     
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
     
  /* If the blockSize is not a multiple of 48 compute any remaining output samples here.      
   ** No loop unrolling is used. */     
  blkCnt = blockSize % 0x8u;     
     
  while(blkCnt > 0u)     
  {     
#ifdef ARM_MATH_ROUNDING     
    /* C = A * 128 */     
    /* convert from float to q7 and then store the results in the destination buffer */     
    in = *pIn++;     
    in = (in * 128);     
    in += in > 0 ? 0.5 : -0.5;   
#ifdef CCS   
    *pDst++ = (q7_t) (__SSATA((q15_t) (in), 0, 8));     
#else             
    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));   
#endif  //	#ifdef CCS   
     
#else     
     
    /* C = A * 128 */     
    /* convert from float to q7 and then store the results in the destination buffer */     
#ifdef CCS   
    *pDst++ = __SSATA((q31_t) (*pIn++ * 128.0f), 0, 8);     
#else   
    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);     
#endif  //	#ifdef CCS   
   
#endif  //	#ifdef ARM_MATH_ROUNDING     
   
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
}     
/* Example #19 */
/* 0 */
/*
 * Adds a constant Q15 offset to every element of a Q15 vector.
 *
 *   pSrc       input vector
 *   offset     value added to each element
 *   pDst       output vector
 *   blockSize  number of samples to process
 *
 * C[n] = A[n] + offset.  The default build adds with the __QADD16 intrinsic;
 * the ARM_MATH_CM0_FAMILY build widens the sum to q31_t and saturates it to
 * 16 bits with __SSAT.
 *
 * NOTE(review): the unrolled loop reads and writes the buffers as 32-bit
 * words through __SIMD32 - presumably this relies on the target tolerating
 * unaligned word accesses; confirm against callers.
 */
void arm_offset_q15(
  q15_t * pSrc,
  q15_t offset,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples (or groups) still to process */

#ifndef ARM_MATH_CM0_FAMILY

  /* Same offset replicated in both halfwords so one __QADD16 covers two samples. */
  q31_t offsetPair = __PKHBT(offset, offset, 16);

  /* Main loop: four outputs per pass, produced as two packed pairs. */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offsetPair);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offsetPair);
  }

  /* Tail: the 0 to 3 samples left over when blockSize is not a multiple of 4.
   * Only the low halfword of the __QADD16 result is kept by the cast. */
  for (remaining = blockSize & 0x3u; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
  }

#else

  /* Cortex-M0 family: one sample at a time, widened to q31_t then saturated to 16 bits. */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}
/*
 * Multiplies a complex Q31 vector by a real Q31 vector, sample by sample.
 *
 *   pSrcCmplx   complex input; two q31_t values are consumed per sample
 *   pSrcReal    real input; one q31_t value is consumed per sample
 *   pCmplxDst   complex output; two q31_t values are produced per sample
 *   numSamples  number of complex samples to process
 *
 * For every sample i:
 *   C[2 * i]     = A[2 * i]     * B[i]
 *   C[2 * i + 1] = A[2 * i + 1] * B[i]
 *
 * The default build keeps the upper 32 bits of each 64-bit product,
 * saturates that to 31 bits and shifts left by one to return to 1.31
 * format, so the least significant bit of every output is zero.  The
 * ARM_MATH_CM0_FAMILY build shifts the 64-bit product right by 31 and
 * clips it with clip_q63_to_q31.
 */
void arm_cmplx_mult_real_q31(
  q31_t * pSrcCmplx,
  q31_t * pSrcReal,
  q31_t * pCmplxDst,
  uint32_t numSamples)
{
#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M4 / Cortex-M3 path */
  uint32_t blkCnt;                               /* loop counter */
  q31_t cplxA0, cplxB0;                          /* both parts of the first complex sample of a pair */
  q31_t cplxA1, cplxB1;                          /* both parts of the second complex sample of a pair */
  q31_t real0, real1;                            /* real factors for the two samples of a pair */
  q31_t out1, out2, out3, out4;                  /* results for one pair of samples */

  /* Main loop: four complex samples per pass, handled as two pairs. */
  for (blkCnt = numSamples >> 2u; blkCnt > 0u; blkCnt--)
  {
    /* ---- first pair: load every input before storing anything ---- */
    real0 = *pSrcReal++;
    real1 = *pSrcReal++;
    cplxA0 = *pSrcCmplx++;
    cplxB0 = *pSrcCmplx++;
    cplxA1 = *pSrcCmplx++;
    cplxB1 = *pSrcCmplx++;

    /* upper word of the 64-bit product, saturated to 31 bits, back to 1.31 */
    out1 = (q31_t) (((q63_t) cplxA0 * real0) >> 32);
    out1 = (__SSAT(out1, 31)) << 1;
    out2 = (q31_t) (((q63_t) cplxB0 * real0) >> 32);
    out2 = (__SSAT(out2, 31)) << 1;
    out3 = (q31_t) (((q63_t) cplxA1 * real1) >> 32);
    out3 = (__SSAT(out3, 31)) << 1;
    out4 = (q31_t) (((q63_t) cplxB1 * real1) >> 32);
    out4 = (__SSAT(out4, 31)) << 1;

    *pCmplxDst++ = out1;
    *pCmplxDst++ = out2;
    *pCmplxDst++ = out3;
    *pCmplxDst++ = out4;

    /* ---- second pair ---- */
    real0 = *pSrcReal++;
    real1 = *pSrcReal++;
    cplxA0 = *pSrcCmplx++;
    cplxB0 = *pSrcCmplx++;
    cplxA1 = *pSrcCmplx++;
    cplxB1 = *pSrcCmplx++;

    out1 = (q31_t) (((q63_t) cplxA0 * real0) >> 32);
    out1 = (__SSAT(out1, 31)) << 1;
    out2 = (q31_t) (((q63_t) cplxB0 * real0) >> 32);
    out2 = (__SSAT(out2, 31)) << 1;
    out3 = (q31_t) (((q63_t) cplxA1 * real1) >> 32);
    out3 = (__SSAT(out3, 31)) << 1;
    out4 = (q31_t) (((q63_t) cplxB1 * real1) >> 32);
    out4 = (__SSAT(out4, 31)) << 1;

    *pCmplxDst++ = out1;
    *pCmplxDst++ = out2;
    *pCmplxDst++ = out3;
    *pCmplxDst++ = out4;
  }

  /* Tail: the 0 to 3 complex samples left when numSamples is not a multiple of 4. */
  for (blkCnt = numSamples & 0x3u; blkCnt > 0u; blkCnt--)
  {
    cplxA0 = *pSrcCmplx++;
    cplxB0 = *pSrcCmplx++;
    real0 = *pSrcReal++;

    out1 = (q31_t) (((q63_t) cplxA0 * real0) >> 32);
    out1 = (__SSAT(out1, 31)) << 1;
    out2 = (q31_t) (((q63_t) cplxB0 * real0) >> 32);
    out2 = (__SSAT(out2, 31)) << 1;

    *pCmplxDst++ = out1;
    *pCmplxDst++ = out2;
  }

#else

  /* Cortex-M0 path: one complex sample per pass. */
  q31_t realIn;                                  /* real factor for the current sample */

  for (; numSamples > 0u; numSamples--)
  {
    realIn = *pSrcReal++;

    /* both values of the complex sample are scaled by the same real factor */
    *pCmplxDst++ =
      (q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * realIn) >> 31);
    *pCmplxDst++ =
      (q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * realIn) >> 31);
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}
/*
 * Converts a float32_t vector to Q15.
 *
 *   pSrc       floating-point input vector
 *   pDst       Q15 output vector
 *   blockSize  number of samples to convert
 *
 * Each sample is multiplied by 32768, converted to q31_t and saturated to
 * 16 bits with __SSAT.  When ARM_MATH_ROUNDING is defined, 0.5 is first
 * added to positive values and subtracted from all others, so the
 * conversion rounds instead of truncating.
 *
 * The rounding constants are single-precision literals in every build
 * variant, which keeps the arithmetic in float32_t (no promotion to
 * double) and makes the unrolled path agree with the Cortex-M0 path.
 *
 * NOTE(review): the float -> q31_t conversion happens before __SSAT, so
 * scaled values outside the q31_t range depend on how the target performs
 * that conversion - confirm before reusing this on a non-ARM host.
 */
void arm_float_to_q15(
  float32_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  float32_t *pIn = pSrc;                         /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */

#ifdef ARM_MATH_ROUNDING

  float32_t in;                                  /* scaled sample being rounded */

#endif /*      #ifdef ARM_MATH_ROUNDING        */

#ifndef ARM_MATH_CM0

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /* Loop unrolling: compute 4 outputs per pass.
   ** The shared loop below handles the remaining 0 to 3 samples. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {

#ifdef ARM_MATH_ROUNDING
    /* C = A * 32768, rounded */
    in = *pIn++;
    in = (in * 32768.0f);
    in += in > 0.0f ? 0.5f : -0.5f;
    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));

    in = *pIn++;
    in = (in * 32768.0f);
    in += in > 0.0f ? 0.5f : -0.5f;
    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));

    in = *pIn++;
    in = (in * 32768.0f);
    in += in > 0.0f ? 0.5f : -0.5f;
    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));

    in = *pIn++;
    in = (in * 32768.0f);
    in += in > 0.0f ? 0.5f : -0.5f;
    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));

#else

    /* C = A * 32768, truncated */
    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);

#endif /*      #ifdef ARM_MATH_ROUNDING        */

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, the leftover samples are
   ** converted one at a time by the shared loop below. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* No unrolling: every sample goes through the shared loop below. */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  /* Shared scalar loop: the tail of the unrolled path, or the whole block on Cortex-M0. */
  while(blkCnt > 0u)
  {

#ifdef ARM_MATH_ROUNDING
    /* C = A * 32768, rounded */
    in = *pIn++;
    in = (in * 32768.0f);
    in += in > 0.0f ? 0.5f : -0.5f;
    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));

#else

    /* C = A * 32768, truncated */
    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);

#endif /*      #ifdef ARM_MATH_ROUNDING        */

    /* Decrement the loop counter */
    blkCnt--;
  }

}
buffer_c16_t FIRAndDecimateComplex::execute(
	const buffer_c16_t& src,
	const buffer_c16_t& dst
) {
	/* Complex FIR filter with decimation.
	 *
	 * Input:  complex int16_t samples in src; the sample count must be a
	 *         multiple of decimation_factor_.
	 * Output: complex int16_t samples written to dst.p, one for every
	 *         decimation_factor_ inputs. The returned buffer describes
	 *         them (same pointer, reduced count and sampling rate).
	 * Taps are normalized so that 1 << 16 == 1.0, hence the >> 16 applied
	 * to the accumulators before saturation.
	 *
	 * Every 32-bit word moved through __SIMD32 is one complex sample or
	 * tap; the output is packed with the real part in the low halfword.
	 */
	const size_t output_samples = src.count / decimation_factor_;
	const auto output_sampling_rate = src.sampling_rate / decimation_factor_;

	sample_t* out_p = dst.p;
	const buffer_c16_t result { dst.p, output_samples, output_sampling_rate };

	const sample_t* in_p = src.p;

	for(size_t remaining = output_samples; remaining > 0; remaining--) {
		/* Append the next decimation_factor_ inputs to the tail of the delay line. */
		auto tail_p = &samples_[taps_count_ - decimation_factor_];
		for(size_t n = 0; n < decimation_factor_; n++) {
			*__SIMD32(tail_p)++ = *__SIMD32(in_p)++;
		}

		auto tap_p = &taps_reversed_[0];
		auto delay_p = &samples_[0];

		int64_t acc_real = 0;
		int64_t acc_imag = 0;

		/* Complex multiply-accumulate, eight taps per pass.
		 * Only taps_count_ / 8 full groups are processed.
		 * __SMLSLD feeds the real accumulator, __SMLALDX the imaginary one.
		 */
		for(size_t groups = taps_count_ / 8; groups > 0; groups--) {
			const auto tap_a = *__SIMD32(tap_p)++;
			const auto smp_a = *__SIMD32(delay_p)++;
			const auto tap_b = *__SIMD32(tap_p)++;
			const auto smp_b = *__SIMD32(delay_p)++;
			acc_real = __SMLSLD(smp_a, tap_a, acc_real);
			acc_imag = __SMLALDX(smp_a, tap_a, acc_imag);
			acc_real = __SMLSLD(smp_b, tap_b, acc_real);
			acc_imag = __SMLALDX(smp_b, tap_b, acc_imag);

			const auto tap_c = *__SIMD32(tap_p)++;
			const auto smp_c = *__SIMD32(delay_p)++;
			const auto tap_d = *__SIMD32(tap_p)++;
			const auto smp_d = *__SIMD32(delay_p)++;
			acc_real = __SMLSLD(smp_c, tap_c, acc_real);
			acc_imag = __SMLALDX(smp_c, tap_c, acc_imag);
			acc_real = __SMLSLD(smp_d, tap_d, acc_real);
			acc_imag = __SMLALDX(smp_d, tap_d, acc_imag);

			const auto tap_e = *__SIMD32(tap_p)++;
			const auto smp_e = *__SIMD32(delay_p)++;
			const auto tap_f = *__SIMD32(tap_p)++;
			const auto smp_f = *__SIMD32(delay_p)++;
			acc_real = __SMLSLD(smp_e, tap_e, acc_real);
			acc_imag = __SMLALDX(smp_e, tap_e, acc_imag);
			acc_real = __SMLSLD(smp_f, tap_f, acc_real);
			acc_imag = __SMLALDX(smp_f, tap_f, acc_imag);

			const auto tap_g = *__SIMD32(tap_p)++;
			const auto smp_g = *__SIMD32(delay_p)++;
			const auto tap_h = *__SIMD32(tap_p)++;
			const auto smp_h = *__SIMD32(delay_p)++;
			acc_real = __SMLSLD(smp_g, tap_g, acc_real);
			acc_imag = __SMLALDX(smp_g, tap_g, acc_imag);
			acc_real = __SMLSLD(smp_h, tap_h, acc_real);
			acc_imag = __SMLALDX(smp_h, tap_h, acc_imag);
		}

		/* TODO: Re-evaluate whether saturation is performed, normalization,
		 * all that jazz.
		 */
		/* Undo the tap scaling, saturate each part to 16 bits, then pack
		 * real (low halfword) and imaginary (high halfword) into one word.
		 */
		const int32_t real_scaled = acc_real >> 16;
		const int32_t imag_scaled = acc_imag >> 16;
		const int32_t real_sat = __SSAT(real_scaled, 16);
		const int32_t imag_sat = __SSAT(imag_scaled, 16);
		*__SIMD32(out_p)++ = __PKHBT(
			real_sat,
			imag_sat,
			16
		);

		/* Slide the delay line down by decimation_factor_ samples so the
		 * next pass can append fresh input at the tail.
		 */
		const size_t unroll_factor = 4;
		const size_t keep_count = taps_count_ - decimation_factor_;

		sample_t* to_p = &samples_[0];
		const sample_t* from_p = &samples_[decimation_factor_];

		/* Bulk of the move, four samples per pass. */
		for(size_t blocks = keep_count / unroll_factor; blocks > 0; blocks--) {
			*__SIMD32(to_p)++ = *__SIMD32(from_p)++;
			*__SIMD32(to_p)++ = *__SIMD32(from_p)++;
			*__SIMD32(to_p)++ = *__SIMD32(from_p)++;
			*__SIMD32(to_p)++ = *__SIMD32(from_p)++;
		}

		/* Remaining 0 to 3 samples, one at a time. */
		for(size_t leftover = keep_count % unroll_factor; leftover > 0; leftover--) {
			*(to_p++) = *(from_p++);
		}
	}

	return result;
}