C++ (Cpp) __PKHBT примеры использования

Пример #1

0

Показать файл

Файл: CV_CoreSimd.c Проект: ARM-software/CMSIS_5

/**
\brief Test case: TC_CoreSimd_Part9
\details
- Check Packing Halfword:
  __PKHBT
  __PKHTB
*/
void TC_CoreSimd_Pack16 (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  volatile uint32_t op1_u32, op2_u32;
  volatile uint32_t res_u32;

  /* --- __PKHBT Test ---------------------------------------------- */
  op1_u32 = 0x00000111;
  op2_u32 = 0x22200000;
  res_u32 = __PKHBT(op1_u32, op2_u32, 0);
  ASSERT_TRUE(res_u32 == 0x22200111);

  op1_u32 = 0x00000111;
  op2_u32 = 0x22200000;
  res_u32 = __PKHBT(op1_u32, op2_u32, 4);
  ASSERT_TRUE(res_u32 == 0x22000111);

  /* --- __PKHTB Test ---------------------------------------------- */
  op1_u32 = 0x11100000;
  op2_u32 = 0x00000222;
  res_u32 = __PKHTB(op1_u32, op2_u32, 0);
  ASSERT_TRUE(res_u32 == 0x11100222);

  op1_u32 = 0x11100000;
  op2_u32 = 0x00000222;
  res_u32 = __PKHTB(op1_u32, op2_u32, 4);
  ASSERT_TRUE(res_u32 == 0x11100022);
#endif
}

Пример #2

0

Показать файл

Файл: arm_pid_init_q15.c Проект: EduardoG26/nucleo-f4x1-motion-player-project

void arm_pid_init_q15(
  arm_pid_instance_q15 * S,
  int32_t resetStateFlag)
{

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /* Derived coefficient A0 */
  S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);

  /* Derived coefficients and pack into A1 */

#ifndef  ARM_MATH_BIG_ENDIAN

  S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);

#else

  S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

  /* Check whether state needs reset or not */
  if(resetStateFlag)
  {
    /* Clear the state buffer.  The size will be always 3 samples */
    memset(S->state, 0, 3u * sizeof(q15_t));
  }

#else

  /* Run the below code for Cortex-M0 */

  q31_t temp;                                    /*to store the sum */

  /* Derived coefficient A0 */
  temp = S->Kp + S->Ki + S->Kd;
  S->A0 = (q15_t) __SSAT(temp, 16);

  /* Derived coefficients and pack into A1 */
  temp = -(S->Kd + S->Kd + S->Kp);
  S->A1 = (q15_t) __SSAT(temp, 16);
  S->A2 = S->Kd;



  /* Check whether state needs reset or not */
  if(resetStateFlag)
  {
    /* Clear the state buffer.  The size will be always 3 samples */
    memset(S->state, 0, 3u * sizeof(q15_t));
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}

Пример #3

0

Показать файл

Файл: dsp_decimate.cpp Проект: CCrashBandicot/portapack-hackrf

buffer_c16_t Complex8DecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Decimates by two using a non-recursive third-order CIC filter.
	 */

	/* CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 */

	uint32_t i1_i0 = _i1_i0;
	uint32_t q1_q0 = _q1_q0;

	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	constexpr uint32_t k_3_1 = 0x00030001 * scale_factor;
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	while(src_p < src_end) {
		const uint32_t q3_i3_q2_i2 = *(src_p++);						// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		const uint32_t d_i0_partial = __SMUAD(k_3_1, i1_i0);			// 1: = 3 * i1 + 1 * i0
		const uint32_t i3_i2 = __SXTB16(q3_i3_q2_i2,  0);				// 1: (q3_i3_q2_i2 ror  0)[23:16]:(q3_i3_q2_i2 ror  0)[7:0]
		const uint32_t d_i0 = __SMLADX(k_3_1, i3_i2, d_i0_partial);		// 1: + 3 * i2 + 1 * i3

		const uint32_t d_q0_partial = __SMUAD(k_3_1, q1_q0);			// 1: = 3 * q1 * 1 * q0
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);				// 1: (q3_i3_q2_i2 ror  8)[23:16]:(q3_i3_q2_i2 ror  8)[7:0]
		const uint32_t d_q0 = __SMLADX(k_3_1, q3_q2, d_q0_partial);		// 1: + 3 * q2 + 1 * q3 

		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		const uint32_t d_i1_partial = __SMUAD(k_3_1, i3_i2);			// 1: = 3 * i3 + 1 * i2
		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);				// 1: (q5_i5_q4_i4 ror  0)[23:16]:(q5_i5_q4_i4 ror  0)[7:0]
		const uint32_t d_i1 = __SMLADX(k_3_1, i5_i4, d_i1_partial);		// 1: + 1 * i5 + 3 * i4

		const uint32_t d_q1_partial = __SMUAD(k_3_1, q3_q2);			// 1: = 3 * q3 * 1 * q2
		const uint32_t q5_q4 = __SXTB16(q5_i5_q4_i4,  8);				// 1: (q5_i5_q4_i4 ror  8)[23:16]:(q5_i5_q4_i4 ror  8)[7:0]
		const uint32_t d_q1 = __SMLADX(k_3_1, q5_q4, d_q1_partial);		// 1: + 1 * q5 + 3 * q4 

		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);				// 1: (Rm<<16)[31:16]:Rn[15:0]

		*(dst_p++) = d_q0_i0;											// 3
		*(dst_p++) = d_q1_i1;

		i1_i0 = i5_i4;
		q1_q0 = q5_q4;
	}
	_i1_i0 = i1_i0;
	_q1_q0 = q1_q0;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}

Пример #4

0

Показать файл

Файл: dsp_demodulate.cpp Проект: CCrashBandicot/portapack-hackrf

buffer_s16_t FM::execute(
	const buffer_c16_t& src,
	const buffer_s16_t& dst
) {
	auto z = z_;

	const auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;
	while(src_p < src_end) {
		const auto s0 = *__SIMD32(src_p)++;
		const auto s1 = *__SIMD32(src_p)++;
		const auto t0 = multiply_conjugate_s16_s32(s0, z);
		const auto t1 = multiply_conjugate_s16_s32(s1, s0);
		z = s1;
		const int32_t theta0_int = angle_approx_0deg27(t0) * ks16;
		const int32_t theta0_sat = __SSAT(theta0_int, 16);
		const int32_t theta1_int = angle_approx_0deg27(t1) * ks16;
		const int32_t theta1_sat = __SSAT(theta1_int, 16);
		*__SIMD32(dst_p)++ = __PKHBT(
			theta0_sat,
			theta1_sat,
			16
		);
	}
	z_ = z;

	return { dst.p, src.count, src.sampling_rate };
}

Пример #5

0

Показать файл

Файл: arm_negate_q15.c Проект: DesignByArie/lpc17xx.cmsis.driver.library

void arm_negate_q15( 
  q15_t * pSrc, 
  q15_t * pDst, 
  uint32_t blockSize) 
{ 
  uint32_t blkCnt;                               /* loop counter */ 
  q15_t in1, in2;                                /* Temporary variables */ 
 
 
  /*loop Unrolling */ 
  blkCnt = blockSize >> 2u; 
 
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
   ** a second loop below computes the remaining 1 to 3 samples. */ 
  while(blkCnt > 0u) 
  { 
    /* C = ~A */ 
    /* Read two inputs */ 
    in1 = *pSrc++; 
    in2 = *pSrc++; 
    /* Negate and then store the results in the destination buffer by packing. */ 
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16); 
 
    in1 = *pSrc++; 
    in2 = *pSrc++; 
 
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
   ** No loop unrolling is used. */ 
  blkCnt = blockSize % 0x4u; 
 
  while(blkCnt > 0u) 
  { 
    /* C = ~A */ 
    /* Negate and then store the result in the destination buffer. */ 
    *pDst++ = __SSAT(-*pSrc++, 16); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
}

Пример #6

0

Показать файл

Файл: arm_fill_q15.c Проект: ChingHengWang/stm32_ws

void arm_fill_q15(
  q15_t value,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t packedValue;                             /* value packed to 32 bits */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* Packing two 16 bit values to 32 bit value in order to use SIMD */
  packedValue = __PKHBT(value, value, 16u);

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = value */
    /* Fill the value in the destination buffer */
    *__SIMD32(pDst)++ = packedValue;
    *__SIMD32(pDst)++ = packedValue;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* C = value */
    /* Fill the value in the destination buffer */
    *pDst++ = value;

    /* Decrement the loop counter */
    blkCnt--;
  }
}

Пример #7

0

Показать файл

Файл: arm_fill_q15.c Проект: ARM-software/CMSIS_5

void arm_fill_q15(
  q15_t value,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* Loop counter */

#if defined (ARM_MATH_LOOPUNROLL)
  q31_t packedValue;                             /* value packed to 32 bits */

  /* Packing two 16 bit values to 32 bit value in order to use SIMD */
  packedValue = __PKHBT(value, value, 16U);

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* C = value */

    /* fill 2 times 2 samples at a time */
    write_q15x2_ia (&pDst, packedValue);
    write_q15x2_ia (&pDst, packedValue);

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = value */

    /* Fill value in destination buffer */
    *pDst++ = value;

    /* Decrement loop counter */
    blkCnt--;
  }
}

Пример #8

0

Показать файл

Файл: arm_fill_q15.c Проект: JGSuw/DIP

void arm_fill_q15(     
  q15_t value,     
  q15_t * pDst,     
  uint32_t blockSize)     
{     
  uint32_t blkCnt;                               /* loop counter */     
  q31_t packedValue;                             /* value packed to 32 bits */     
  
  /*loop Unrolling */     
  blkCnt = blockSize >> 3u;     
     
  /* Packing two 16 bit values to 32 bit value in order to use SIMD */     
  packedValue = __PKHBT(value, value, 16u);     
     
  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
   ** a second loop below computes the remaining 1 to 7 samples. */     
  while(blkCnt > 0u)     
  {     
    /* C = value */     
    /* Fill the value in the destination buffer */     
  
	_SIMD32_OFFSET(pDst) = packedValue;  
	_SIMD32_OFFSET(pDst + 2) = packedValue;  
	_SIMD32_OFFSET(pDst + 4) = packedValue;  
	_SIMD32_OFFSET(pDst + 6) = packedValue;  
	pDst += 8u;  
  
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
     
  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.  
   ** No loop unrolling is used. */     
  blkCnt = blockSize % 0x8u;     
     
  while(blkCnt > 0u)     
  {     
    /* C = value */     
    /* Fill the value in the destination buffer */     
    *pDst++ = value;     
     
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
}

Пример #9

0

Показать файл

Файл: arm_biquad_cascade_df1_fast_q15.c Проект: imhoffj/RaceCapture-Pro_firmware

void arm_biquad_cascade_df1_fast_q15(
    const arm_biquad_casd_df1_inst_q15 * S,
    q15_t * pSrc,
    q15_t * pDst,
    uint32_t blockSize)
{
    q15_t *pIn = pSrc;                             /*  Source pointer                               */
    q15_t *pOut = pDst;                            /*  Destination pointer                          */
    q31_t in;                                      /*  Temporary variable to hold input value       */
    q31_t out;                                     /*  Temporary variable to hold output value      */
    q31_t b0;                                      /*  Temporary variable to hold bo value          */
    q31_t b1, a1;                                  /*  Filter coefficients                          */
    q31_t state_in, state_out;                     /*  Filter state variables                       */
    q31_t acc0;                                    /*  Accumulator                                  */
    int32_t shift = (int32_t) (15 - S->postShift); /*  Post shift                                   */
    q15_t *pState = S->pState;                     /*  State pointer                                */
    q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
    q31_t *pState_q31;                             /*  32-bit state pointer for SIMD implementation */
    uint32_t sample, stage = S->numStages;         /*  Stage loop counter                           */



    do {
        /* Initialize state pointer of type q31 */
        pState_q31 = (q31_t *) (pState);

        /* Read the b0 and 0 coefficients using SIMD  */
        b0 = *__SIMD32(pCoeffs)++;

        /* Read the b1 and b2 coefficients using SIMD */
        b1 = *__SIMD32(pCoeffs)++;

        /* Read the a1 and a2 coefficients using SIMD */
        a1 = *__SIMD32(pCoeffs)++;

        /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
        state_in = (q31_t) (*pState_q31++);

        /* Read the output state values from the state buffer:  y[n-1], y[n-2] */
        state_out = (q31_t) (*pState_q31);

        /* Apply loop unrolling and compute 2 output values simultaneously. */
        /*      The variables acc0 ... acc3 hold output values that are being computed:
         *
         *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
         *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
         */
        sample = blockSize >> 1u;

        /* First part of the processing with loop unrolling.  Compute 2 outputs at a time.
         ** a second loop below computes the remaining 1 sample. */
        while(sample > 0u) {

            /* Read the input */
            in = *__SIMD32(pIn)++;

            /* out =  b0 * x[n] + 0 * 0 */
            out = __SMUAD(b0, in);
            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
            state_out = __PKHBT(state_out >> 16, (out), 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* out =  b0 * x[n] + 0 * 0 */
            out = __SMUADX(b0, in);
            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);


            /* Store the output in the destination buffer. */

#ifndef  ARM_MATH_BIG_ENDIAN

            *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);

#else

            *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in >> 16, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, in, 16);
            state_out = __PKHBT(state_out >> 16, out, 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */


            /* Decrement the loop counter */
            sample--;

        }

        /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
         ** No loop unrolling is used. */

        if((blockSize & 0x1u) != 0u) {
            /* Read the input */
            in = *pIn++;

            /* out =  b0 * x[n] + 0 * 0 */

#ifndef  ARM_MATH_BIG_ENDIAN

            out = __SMUAD(b0, in);

#else

            out = __SMUADX(b0, in);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);

            /* Store the output in the destination buffer. */
            *pOut++ = (q15_t) out;

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, in, 16);
            state_out = __PKHBT(state_out >> 16, out, 16);

#endif /*   #ifndef  ARM_MATH_BIG_ENDIAN    */

        }

        /*  The first stage goes from the input buffer to the output buffer.  */
        /*  Subsequent (numStages - 1) occur in-place in the output buffer  */
        pIn = pDst;

        /* Reset the output pointer */
        pOut = pDst;

        /*  Store the updated state variables back into the state array */
        *__SIMD32(pState)++ = state_in;
        *__SIMD32(pState)++ = state_out;


        /* Decrement the loop counter */
        stage--;

    } while(stage > 0u);

Пример #10

0

Показать файл

Файл: arm_abs_q15.c Проект: 1heinz/TauLabs

void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0

/* Run the below code for Cortex-M4 and Cortex-M3 */

  q15_t in1;                                     /* Input value1 */
  q15_t in2;                                     /* Input value2 */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read two inputs */
    in1 = *pSrc++;
    in2 = *pSrc++;


    /* Store the Absolute result in the destination buffer by packing the two values, in a single cycle */

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ =
      __PKHBT(((in1 > 0) ? in1 : __QSUB16(0, in1)),
              ((in2 > 0) ? in2 : __QSUB16(0, in2)), 16);

#else


    *__SIMD32(pDst)++ =
      __PKHBT(((in2 > 0) ? in2 : __QSUB16(0, in2)),
              ((in1 > 0) ? in1 : __QSUB16(0, in1)), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    in1 = *pSrc++;
    in2 = *pSrc++;


#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ =
      __PKHBT(((in1 > 0) ? in1 : __QSUB16(0, in1)),
              ((in2 > 0) ? in2 : __QSUB16(0, in2)), 16);

#else


    *__SIMD32(pDst)++ =
      __PKHBT(((in2 > 0) ? in2 : __QSUB16(0, in2)),
              ((in1 > 0) ? in1 : __QSUB16(0, in1)), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read the input */
    in1 = *pSrc++;

    /* Calculate absolute value of input and then store the result in the destination buffer. */
    *pDst++ = (in1 > 0) ? in1 : __QSUB16(0, in1);

    /* Decrement the loop counter */
    blkCnt--;
  }

#else

  /* Run the below code for Cortex-M0 */

  q15_t in;                                      /* Temporary input variable */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read the input */
    in = *pSrc++;

    /* Calculate absolute value of input and then store the result in the destination buffer. */
    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);

    /* Decrement the loop counter */
    blkCnt--;
  }

#endif /* #ifndef ARM_MATH_CM0 */

}

Пример #11

0

Показать файл

Файл: arm_q7_to_q15.c Проект: CrazianT3k/ahabadge

void arm_q7_to_q15(
    q7_t * pSrc,
    q15_t * pDst,
    uint32_t blockSize)
{
    q7_t *pIn = pSrc;                              /* Src pointer */
    uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY
    q31_t in;
    q31_t in1, in2;
    q31_t out1, out2;

    /* Run the below code for Cortex-M4 and Cortex-M3 */

    /*loop Unrolling */
    blkCnt = blockSize >> 2u;

    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
        /* C = (q15_t) A << 8 */
        /* convert from q7 to q15 and then store the results in the destination buffer */
        in = *__SIMD32(pIn)++;

        /* rotatate in by 8 and extend two q7_t values to q15_t values */
        in1 = __SXTB16(__ROR(in, 8));

        /* extend remainig two q7_t values to q15_t values */
        in2 = __SXTB16(in);

        in1 = in1 << 8u;
        in2 = in2 << 8u;

        in1 = in1 & 0xFF00FF00;
        in2 = in2 & 0xFF00FF00;

#ifndef ARM_MATH_BIG_ENDIAN

        out2 = __PKHTB(in1, in2, 16);
        out1 = __PKHBT(in2, in1, 16);

#else

        out1 = __PKHTB(in1, in2, 16);
        out2 = __PKHBT(in2, in1, 16);

#endif

        *__SIMD32(pDst)++ = out1;
        *__SIMD32(pDst)++ = out2;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = blockSize % 0x4u;

#else

    /* Run the below code for Cortex-M0 */

    /* Loop over blockSize number of values */
    blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

    while(blkCnt > 0u)
    {
        /* C = (q15_t) A << 8 */
        /* convert from q7 to q15 and then store the results in the destination buffer */
        *pDst++ = (q15_t) * pIn++ << 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

}

Пример #12

0

Показать файл

Файл: dsp_decimate.cpp Проект: CCrashBandicot/portapack-hackrf

buffer_c16_t FIRAndDecimateComplex::execute(
	const buffer_c16_t& src,
	const buffer_c16_t& dst
) {
	/* int16_t input (sample count "n" must be multiple of decimation_factor)
	 * -> int16_t output, decimated by decimation_factor.
	 * taps are normalized to 1 << 16 == 1.0.
	 */
	const auto output_sampling_rate = src.sampling_rate / decimation_factor_;
	const size_t output_samples = src.count / decimation_factor_;
	
	sample_t* dst_p = dst.p;
	const buffer_c16_t result { dst.p, output_samples, output_sampling_rate };

	const sample_t* src_p = src.p;
	size_t outer_count = output_samples;
	while(outer_count > 0) {
		/* Put new samples into delay buffer */
		auto z_new_p = &samples_[taps_count_ - decimation_factor_];
		for(size_t i=0; i<decimation_factor_; i++) {
			*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
		}

		size_t loop_count = taps_count_ / 8;
		auto t_p = &taps_reversed_[0];
		auto z_p = &samples_[0];

		int64_t t_real = 0;
		int64_t t_imag = 0;

		while(loop_count > 0) {
			const auto tap0 = *__SIMD32(t_p)++;
			const auto sample0 = *__SIMD32(z_p)++;
			const auto tap1 = *__SIMD32(t_p)++;
			const auto sample1 = *__SIMD32(z_p)++;
			t_real = __SMLSLD(sample0, tap0, t_real);
			t_imag = __SMLALDX(sample0, tap0, t_imag);
			t_real = __SMLSLD(sample1, tap1, t_real);
			t_imag = __SMLALDX(sample1, tap1, t_imag);

			const auto tap2 = *__SIMD32(t_p)++;
			const auto sample2 = *__SIMD32(z_p)++;
			const auto tap3 = *__SIMD32(t_p)++;
			const auto sample3 = *__SIMD32(z_p)++;
			t_real = __SMLSLD(sample2, tap2, t_real);
			t_imag = __SMLALDX(sample2, tap2, t_imag);
			t_real = __SMLSLD(sample3, tap3, t_real);
			t_imag = __SMLALDX(sample3, tap3, t_imag);

			const auto tap4 = *__SIMD32(t_p)++;
			const auto sample4 = *__SIMD32(z_p)++;
			const auto tap5 = *__SIMD32(t_p)++;
			const auto sample5 = *__SIMD32(z_p)++;
			t_real = __SMLSLD(sample4, tap4, t_real);
			t_imag = __SMLALDX(sample4, tap4, t_imag);
			t_real = __SMLSLD(sample5, tap5, t_real);
			t_imag = __SMLALDX(sample5, tap5, t_imag);

			const auto tap6 = *__SIMD32(t_p)++;
			const auto sample6 = *__SIMD32(z_p)++;
			const auto tap7 = *__SIMD32(t_p)++;
			const auto sample7 = *__SIMD32(z_p)++;
			t_real = __SMLSLD(sample6, tap6, t_real);
			t_imag = __SMLALDX(sample6, tap6, t_imag);
			t_real = __SMLSLD(sample7, tap7, t_real);
			t_imag = __SMLALDX(sample7, tap7, t_imag);

			loop_count--;
		}

		/* TODO: Re-evaluate whether saturation is performed, normalization,
		 * all that jazz.
		 */
		const int32_t r = t_real >> 16;
		const int32_t i = t_imag >> 16;
		const int32_t r_sat = __SSAT(r, 16);
		const int32_t i_sat = __SSAT(i, 16);
		*__SIMD32(dst_p)++ = __PKHBT(
			r_sat,
			i_sat,
			16
		);

		/* Shift sample buffer left/down by decimation factor. */
		const size_t unroll_factor = 4;
		size_t shift_count = (taps_count_ - decimation_factor_) / unroll_factor;

		sample_t* t = &samples_[0];
		const sample_t* s = &samples_[decimation_factor_];
		
		while(shift_count > 0) {
			*__SIMD32(t)++ = *__SIMD32(s)++;
			*__SIMD32(t)++ = *__SIMD32(s)++;
			*__SIMD32(t)++ = *__SIMD32(s)++;
			*__SIMD32(t)++ = *__SIMD32(s)++;
			shift_count--;
		}

		shift_count = (taps_count_ - decimation_factor_) % unroll_factor;
		while(shift_count > 0) {
			*(t++) = *(s++);
			shift_count--;
		}

		outer_count--;
	}

	return result;
}

Пример #13

0

Показать файл

Файл: arm_mult_q15.c Проект: JGSuw/DIP

void arm_mult_q15(     
  q15_t * pSrcA,     
  q15_t * pSrcB,     
  q15_t * pDst,     
  uint32_t blockSize)     
{     
  uint32_t blkCnt;                               /* loop counters */     
  q31_t inA1, inA2, inB1, inB2;					 /* temporary input variables */  
  q15_t out1, out2, out3, out4;					 /* temporary output variables */  
  q31_t mul1, mul2, mul3, mul4;					 /* temporary variables */  
     
  /* loop Unrolling */     
  blkCnt = blockSize >> 2u;     
     
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
   ** a second loop below computes the remaining 1 to 3 samples. */     
  while(blkCnt > 0u)     
  {  
    /* read two samples at a time from sourceA */  
	inA1 = *__SIMD32(pSrcA)++;  
	/* read two samples at a time from sourceB */  
	inB1 = *__SIMD32(pSrcB)++;  
    /* read two samples at a time from sourceA */  
    inA2 = *__SIMD32(pSrcA)++;  
	/* read two samples at a time from sourceB */  
	inB2 = *__SIMD32(pSrcB)++;  
  
	/* multiply mul = sourceA * sourceB */  
	mul1 = (q31_t)((q15_t)(inA1 >> 16)* (q15_t)(inB1>> 16));  
	mul2 = (q31_t)((q15_t)inA1 * (q15_t)inB1);  
	mul3 = (q31_t)((q15_t)(inA2 >> 16)* (q15_t)(inB2>> 16));  
	mul4 = (q31_t)((q15_t)inA2 * (q15_t)inB2);  
  
	/* shift result by 15 to get 16 bit result */  
	mul1 = mul1 >> 15;  
	mul2 = mul2 >> 15;  
	mul3 = mul3 >> 15;  
	mul4 = mul4 >> 15;  
  
	/* saturate result to 16 bit */  
#ifdef CCS  
  
	out1 = (q15_t) __SSATA(mul1, 0, 16);  
	out2 = (q15_t) __SSATA(mul2, 0, 16);  
	out3 = (q15_t) __SSATA(mul3, 0, 16);  
	out4 = (q15_t) __SSATA(mul4, 0, 16);  
  
#else  
  
	out1 = (q15_t) __SSAT(mul1, 16);  
	out2 = (q15_t) __SSAT(mul2, 16);  
	out3 = (q15_t) __SSAT(mul3, 16);  
	out4 = (q15_t) __SSAT(mul4, 16);  
  
#endif 	//	#ifdef CCS  
  
	/* store the result */  
#ifndef ARM_MATH_BIG_ENDIAN  
  
 	*__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);  
 	*__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);  
  
#else  
  
 	*__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);  
 	*__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);  
  
#endif		//	#ifndef ARM_MATH_BIG_ENDIAN  
  
    /* Decrement the blockSize loop counter */     
    blkCnt--;     
  }     
     
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.      
   ** No loop unrolling is used. */     
  blkCnt = blockSize % 0x4u;     
     
  while(blkCnt > 0u)     
  {     
    /* C = A * B */     
    /* Multiply the inputs and store the result in the destination buffer */     
#ifdef CCS	   
    *pDst++ = (q15_t) __SSATA(((q31_t) ((*pSrcA++) * (*pSrcB++)) >> 15), 0, 16);
#else   
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);     
#endif	//	#ifdef CCS     
    /* Decrement the blockSize loop counter */     
    blkCnt--;     
  }     
}

Пример #14

0

Показать файл

Файл: arm_q7_to_q15.c Проект: JGSuw/DIP

void arm_q7_to_q15(     
  q7_t * pSrc,     
  q15_t * pDst,     
  uint32_t blockSize)     
{     
  q7_t *pIn = pSrc;                              /* Src pointer */     
  uint32_t blkCnt;                               /* loop counter */     
  q31_t in;  
  q31_t in1, in2;  
  q31_t out1, out2;  
  q31_t and = 0xFF00FF00;  
  
     
  /*loop Unrolling */     
  blkCnt = blockSize >> 3u;     
     
  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
   ** a second loop below computes the remaining 1 to 7 samples. */     
  while(blkCnt > 0u)     
  {     
    /* C = (q15_t) A << 8 */     
    /* convert from q7 to q15 and then store the results in the destination buffer */  
	/* read 4 samples at a time */     
	in = *__SIMD32(pIn)++;  
  
#ifdef CCS  
  
	/* rotatate in by 8 and extend two q7_t values to q15_t values */  
	in1 = __SXTB16(in, 8);  
  
	/* extend remainig two q7_t values to q15_t values */  
	in2 = __SXTB16(in, 0);  
  
#else  
  
	/* rotatate in by 8 and extend two q7_t values to q15_t values */  
	in1 = __SXTB16(__ROR(in, 8));  
  
	/* extend remainig two q7_t values to q15_t values */  
	in2 = __SXTB16(in);  
  
  
#endif	/* shift in1 by 8 to convert q7_t value to q15_t value (ex: 0x00ff00ff ==> 0xff00ff00*/  
	in1 = in1 << 8u;  
	in2 = in2 << 8u;  
  
	/* read next 4 sampels */  
	in = *__SIMD32(pIn)++;  
  
	/* anding with 0xff00ff00 */  
	in1 =  in1 & and;  
	out2 = in2 & and;  
  
	/* pack two 16 bit values */  
	out1 = __PKHTB(in1, out2, 16);  
	out2 = __PKHBT(out2, in1, 16);  
  
#ifndef ARM_MATH_BIG_ENDIAN	  
	  
	/* store two q15_t samples at a time to destination */  
	_SIMD32_OFFSET(pDst + 2) = out1;  
  
#ifdef CCS  
  
	/* rotatate in by 8 and extend two q7_t values to q15_t values */  
	in1 = __SXTB16(in, 8);  
  
#else  
  
	/* rotatate in by 8 and extend two q7_t values to q15_t values */  
	in1 = __SXTB16(__ROR(in, 8));  
  
#endif  
  
	/* store two q15_t samples at a time to destination */  
	_SIMD32_OFFSET(pDst) = out2;  
  
#else  
  
	/* store two q15_t samples at a time to destination */  
	_SIMD32_OFFSET(pDst) = out1;  
  
#ifdef CCS  
  
	/* rotatate in by 8 and extend two q7_t values to q15_t values */  
	in1 = __SXTB16(in, 8);  
  
#else  
  
	/* rotatate in by 8 and extend two q7_t values to q15_t values */  
	in1 = __SXTB16(__ROR(in, 8));  
  
#endif  
  
	/* store two q15_t samples at a time to destination */  
	_SIMD32_OFFSET(pDst + 2) = out2;  
  
#endif	 	//	#ifndef ARM_MATH_BIG_ENDIAN  
  
#ifdef CCS  
  
	/* rotatate in by 8 and extend two q7_t values to q15_t values */  
	in2 = __SXTB16(in, 0);  
  
#else  
  
	/* rotatate in by 8 and extend two q7_t values to q15_t values */  
	in2 = __SXTB16(in);  
  
#endif  
  
	/* shift in1 by 8 to convert q7_t value to q15_t value (ex: 0x00ff00ff ==> 0xff00ff00*/  
	in1 = in1 << 8u;  
	in2 = in2 << 8u;  
  
	/* anding with 0xff00ff00 */  
	out1 = in1 & and;  
	out2 = in2 & and;  
  
	/* pack two 16 bit values */  
	out1 = __PKHTB(in1, out2, 16);  
	out2 = __PKHBT(out2, in1, 16);  
  
	/* store two q15_t samples at a time to destination */  
#ifndef ARM_MATH_BIG_ENDIAN  
  
	_SIMD32_OFFSET(pDst + 6) = out1;  
	_SIMD32_OFFSET(pDst + 4) = out2;  
  
#else  
  
	_SIMD32_OFFSET(pDst + 4) = out1;  
	_SIMD32_OFFSET(pDst + 6) = out2;  
  
#endif	 	//	#ifndef ARM_MATH_BIG_ENDIAN  
  
	/* incremnet destination pointer */  
	pDst += 8u;  
  
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
     
  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.      
   ** No loop unrolling is used. */     
  blkCnt = blockSize % 0x8u;     
     
  while(blkCnt > 0u)     
  {     
    /* C = (q15_t) A << 8 */     
    /* convert from q7 to q15 and then store the results in the destination buffer */     
    *pDst++ = (q15_t) * pIn++ << 8;     
     
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
     
}

Пример #15

0

Показать файл

Файл: arm_offset_q15.c Проект: OpenNuvoton/Mini51BSP

void arm_offset_q15(
  q15_t * pSrc,
  q15_t offset,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

/* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t offset_packed;                           /* Offset packed to 32 bit */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* Offset is packed to 32 bit in order to use SIMD32 for addition */
  offset_packed = __PKHBT(offset, offset, 16);

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = A + offset */
    /* Add offset and then store the results in the destination buffer, 2 samples at a time. */
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = A + offset */
    /* Add offset and then store the results in the destination buffer. */
    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);

    /* Decrement the loop counter */
    blkCnt--;
  }

#else

  /* Run the below code for Cortex-M0 */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = A + offset */
    /* Add offset and then store the results in the destination buffer. */
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);

    /* Decrement the loop counter */
    blkCnt--;
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}

Пример #16

0

Показать файл

Файл: arm_negate_q15.c Проект: ThucVD2704/femto-usb-blink-example

void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */


#ifndef ARM_MATH_CM0

/* Run the below code for Cortex-M4 and Cortex-M3 */

  q15_t in1, in2;                                /* Temporary variables */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.   
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Read two inputs */
    in1 = *pSrc++;
    in2 = *pSrc++;
    /* Negate and then store the results in the destination buffer by packing. */

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16);

#else

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in2, 16), __SSAT(-in1, 16), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    in1 = *pSrc++;
    in2 = *pSrc++;

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16);

#else


    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in2, 16), __SSAT(-in1, 16), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.   
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Negate and then store the result in the destination buffer. */
    *pDst++ = __SSAT(-*pSrc++, 16);

    /* Decrement the loop counter */
    blkCnt--;
  }

}

Пример #17

0

Показать файл

Файл: dsp_decimate.cpp Проект: CCrashBandicot/portapack-hackrf

buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Translates incoming complex<int8_t> samples by -fs/4,
	 * decimates by two using a non-recursive third-order CIC filter.
	 */

	/* Derivation of algorithm:
	 * Original CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 *
	 * Translate -fs/4, phased 180 degrees, accomplished by complex multiplication
	 * of complex length-4 sequence:
	 *
	 * Substitute:
	 *	i0 = -i0, q0 = -q0
	 *	i1 = -q1, q1 =  i1
	 *	i2 =  i2, q2 =  q2
	 *	i3 =  q3, q3 = -i3
	 *	i4 = -i4, q4 = -q4
	 *	i5 = -q5, q5 =  i5
	 *
	 * Resulting taps (with decimation by 2, four samples in, two samples out):
	 *	D_I0 =  q3 * 1 +  i2 * 3 + -q1 * 3 + -i0 * 1
	 *	D_Q0 = -i3 * 1 +  q2 * 3 +  i1 * 3 + -q0 * 1
 	 *
	 *	D_I1 = -q5 * 1 + -i4 * 3 +  q3 * 3 +  i2 * 1
	 *	D_Q1 =  i5 * 1 + -q4 * 3 + -i3 * 3 +  q2 * 1
	 */

	// 6 cycles per complex input sample, not including loop overhead.
	uint32_t q1_i0 = _q1_i0;
	uint32_t q0_i1 = _q0_i1;
	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	const uint32_t k_3_1 = 0x00030001 * scale_factor;
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	while(src_p < src_end) {
		const uint32_t q3_i3_q2_i2 = *(src_p++);			// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		const uint32_t i2_i3 = __SXTB16(q3_i3_q2_i2, 16);			// 1: (q3_i3_q2_i2 ror 16)[23:16]:(q3_i3_q2_i2 ror 16)[7:0]
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);			// 1: (q3_i3_q2_i2 ror  8)[23:16]:(q3_i3_q2_i2 ror  8)[7:0]
		const uint32_t i2_q3 = __PKHTB(i2_i3, q3_q2, 16);			// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t i3_q2 = __PKHBT(q3_q2, i2_i3, 16);			// 1:(Rm<<16)[31:16]:Rn[15:0]

		// D_I0 = 3 * (i2 - q1) + (q3 - i0)
		const uint32_t i2_m_q1_q3_m_i0 = __QSUB16(i2_q3, q1_i0);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i0 = __SMUAD(k_3_1, i2_m_q1_q3_m_i0);		// 1: Rm[15:0]*Rs[15:0]+Rm[31:16]*Rs[31:16]

		// D_Q0 = 3 * (q2 + i1) - (i3 + q0)
		const uint32_t i3_p_q0_q2_p_i1 = __QADD16(i3_q2, q0_i1);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q0 = __SMUSDX(i3_p_q0_q2_p_i1, k_3_1);		// 1: Rm[15:0]*Rs[31:16]–Rm[31:16]*RsX[15:0]
		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);			// 1: (q5_i5_q4_i4 ror  0)[23:16]:(q5_i5_q4_i4 ror  0)[7:0]
		const uint32_t q4_q5 = __SXTB16(q5_i5_q4_i4, 24);			// 1: (q5_i5_q4_i4 ror 24)[23:16]:(q5_i5_q4_i4 ror 24)[7:0]
		const uint32_t q4_i5 = __PKHTB(q4_q5, i5_i4, 16);			// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t q5_i4 = __PKHBT(i5_i4, q4_q5, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		// D_I1 = (i2 - q5) + 3 * (q3 - i4)
		const uint32_t i2_m_q5_q3_m_i4 = __QSUB16(i2_q3, q5_i4);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i1 = __SMUADX(i2_m_q5_q3_m_i4, k_3_1);		// 1: Rm[15:0]*Rs[31:16]+Rm[31:16]*Rs[15:0]

		// D_Q1 = (i5 + q2) - 3 * (q4 + i3)
		const uint32_t q4_p_i3_i5_p_q2 = __QADD16(q4_i5, i3_q2);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q1 = __SMUSD(k_3_1, q4_p_i3_i5_p_q2);		// 1: Rm[15:0]*Rs[15:0]–Rm[31:16]*Rs[31:16]
		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]
		*(dst_p++) = d_q0_i0;							// 3
		*(dst_p++) = d_q1_i1;

		q1_i0 = q5_i4;
		q0_i1 = q4_i5;
	}
	_q1_i0 = q1_i0;
	_q0_i1 = q0_i1;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}