C++ (Cpp) __SIMD32 예제들

예제 #1

0

파일 보기

파일: dsp_demodulate.cpp 프로젝트: CCrashBandicot/portapack-hackrf

buffer_s16_t FM::execute(
	const buffer_c16_t& src,
	const buffer_s16_t& dst
) {
	auto z = z_;

	const auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;
	while(src_p < src_end) {
		const auto s0 = *__SIMD32(src_p)++;
		const auto s1 = *__SIMD32(src_p)++;
		const auto t0 = multiply_conjugate_s16_s32(s0, z);
		const auto t1 = multiply_conjugate_s16_s32(s1, s0);
		z = s1;
		const int32_t theta0_int = angle_approx_0deg27(t0) * ks16;
		const int32_t theta0_sat = __SSAT(theta0_int, 16);
		const int32_t theta1_int = angle_approx_0deg27(t1) * ks16;
		const int32_t theta1_sat = __SSAT(theta1_int, 16);
		*__SIMD32(dst_p)++ = __PKHBT(
			theta0_sat,
			theta1_sat,
			16
		);
	}
	z_ = z;

	return { dst.p, src.count, src.sampling_rate };
}

예제 #2

0

파일 보기

파일: arm_mean_q15.c 프로젝트: 4ilo/Wireless-Jukebox

void arm_mean_q15(
  q15_t * pSrc,
  uint32_t blockSize,
  q15_t * pResult)
{
  q31_t sum = 0;                                 /* Temporary result storage */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t in;

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
    in = *__SIMD32(pSrc)++;
    sum += ((in << 16) >> 16);
    sum += (in >> 16);
    in = *__SIMD32(pSrc)++;
    sum += ((in << 16) >> 16);
    sum += (in >> 16);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
    sum += *pSrc++;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
  /* Store the result to the destination */
  *pResult = (q15_t) (sum / (q31_t)blockSize);
}

예제 #3

0

파일 보기

파일: arm_negate_q7.c 프로젝트: 0x1abin/mbed

void arm_negate_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  q7_t in;

#ifndef ARM_MATH_CM0_FAMILY

/* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t input;                                   /* Input values1-4 */
  q31_t zero = 0x00000000;


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Read four inputs */
    input = *__SIMD32(pSrc)++;

    /* Store the Negated results in the destination buffer in a single cycle by packing the results */
    *__SIMD32(pDst)++ = __QSUB8(zero, input);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Negate and then store the results in the destination buffer. */ \
      in = *pSrc++;
    *pDst++ = (in == (q7_t) 0x80) ? 0x7f : -in;

    /* Decrement the loop counter */
    blkCnt--;
  }
}

예제 #4

0

파일 보기

파일: arm_fill_q15.c 프로젝트: ChingHengWang/stm32_ws

void arm_fill_q15(
  q15_t value,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t packedValue;                             /* value packed to 32 bits */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* Packing two 16 bit values to 32 bit value in order to use SIMD */
  packedValue = __PKHBT(value, value, 16u);

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = value */
    /* Fill the value in the destination buffer */
    *__SIMD32(pDst)++ = packedValue;
    *__SIMD32(pDst)++ = packedValue;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* C = value */
    /* Fill the value in the destination buffer */
    *pDst++ = value;

    /* Decrement the loop counter */
    blkCnt--;
  }
}

예제 #5

0

파일 보기

파일: arm_copy_q7.c 프로젝트: 451506709/automated_machine

void arm_copy_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = A */
    /* Copy and then store the results in the destination buffer */
    /* 4 samples are copied and stored at a time using SIMD */
    *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */


  while(blkCnt > 0u)
  {
    /* C = A */
    /* Copy and then store the results in the destination buffer */
    *pDst++ = *pSrc++;

    /* Decrement the loop counter */
    blkCnt--;
  }
}

예제 #6

0

파일 보기

파일: arm_negate_q7.c 프로젝트: imhoffj/RaceCapture-Pro_firmware

void arm_negate_q7(
    q7_t * pSrc,
    q7_t * pDst,
    uint32_t blockSize)
{
    uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0

    /* Run the below code for Cortex-M4 and Cortex-M3 */
    q7_t in1;                                      /* Input value1 */
    q7_t in2;                                      /* Input value2 */
    q7_t in3;                                      /* Input value3 */
    q7_t in4;                                      /* Input value4 */


    /*loop Unrolling */
    blkCnt = blockSize >> 2u;

    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u) {
        /* C = -A */
        /* Read four inputs */
        in1 = *pSrc++;
        in2 = *pSrc++;
        in3 = *pSrc++;
        in4 = *pSrc++;

        /* Store the Negated results in the destination buffer in a single cycle by packing the results */
        *__SIMD32(pDst)++ =
            __PACKq7(__SSAT(-in1, 8), __SSAT(-in2, 8), __SSAT(-in3, 8),
                     __SSAT(-in4, 8));

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = blockSize % 0x4u;

#else

    /* Run the below code for Cortex-M0 */

    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

    while(blkCnt > 0u) {
        /* C = -A */
        /* Negate and then store the results in the destination buffer. */
        *pDst++ = __SSAT(-*pSrc++, 8);

        /* Decrement the loop counter */
        blkCnt--;
    }
}

예제 #7

0

파일 보기

파일: channel_stats_collector.hpp 프로젝트: fengmushu/portapack-hackrf

	void feed(buffer_c16_t src, Callback callback) {
		auto src_p = src.p;
		while(src_p < &src.p[src.count]) {
			const uint32_t sample = *__SIMD32(src_p)++;
			const uint32_t mag_sq = __SMUAD(sample, sample);
			if( mag_sq > max_squared ) {
				max_squared = mag_sq;
			}
		}
		count += src.count;

		const size_t samples_per_update = src.sampling_rate * update_interval;

		if( count >= samples_per_update ) {
			const float max_squared_f = max_squared;
			const float max_db_f = complex16_mag_squared_to_dbv_norm(max_squared_f);
			const int32_t max_db = max_db_f;
			const ChannelStatistics statistics {
				.max_db = max_db,
				.count = count,
			};
			callback(statistics);

			max_squared = 0;
			count = 0;
		}
	}

예제 #8

0

파일 보기

파일: arm_power_q15.c 프로젝트: Frehner1/CMSIS_LPC17xx

void arm_power_q15( 
  q15_t * pSrc, 
  uint32_t blockSize, 
  q63_t * pResult) 
{ 
  q63_t sum = 0;                                 /* Temporary result storage */ 
  q31_t in32;                                    /* Temporary variable to store input value */ 
  q15_t in16;                                    /* Temporary variable to store input value */ 
  uint32_t blkCnt;                               /* loop counter */ 
 
 
  /* loop Unrolling */ 
  blkCnt = blockSize >> 2u; 
 
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
   ** a second loop below computes the remaining 1 to 3 samples. */ 
  while(blkCnt > 0u) 
  { 
    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ 
    /* Compute Power and then store the result in a temporary variable, sum. */ 
    in32 = *__SIMD32(pSrc)++; 
    sum = __SMLALD(in32, in32, sum); 
    in32 = *__SIMD32(pSrc)++; 
    sum = __SMLALD(in32, in32, sum); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
   ** No loop unrolling is used. */ 
  blkCnt = blockSize % 0x4u; 
 
  while(blkCnt > 0u) 
  { 
    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ 
    /* Compute Power and then store the result in a temporary variable, sum. */ 
    in16 = *pSrc++; 
    sum = __SMLALD(in16, in16, sum); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* Store the results in 34.30 format  */ 
  *pResult = sum; 
}

예제 #9

0

파일 보기

파일: arm_negate_q15.c 프로젝트: DesignByArie/lpc17xx.cmsis.driver.library

void arm_negate_q15( 
  q15_t * pSrc, 
  q15_t * pDst, 
  uint32_t blockSize) 
{ 
  uint32_t blkCnt;                               /* loop counter */ 
  q15_t in1, in2;                                /* Temporary variables */ 
 
 
  /*loop Unrolling */ 
  blkCnt = blockSize >> 2u; 
 
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
   ** a second loop below computes the remaining 1 to 3 samples. */ 
  while(blkCnt > 0u) 
  { 
    /* C = ~A */ 
    /* Read two inputs */ 
    in1 = *pSrc++; 
    in2 = *pSrc++; 
    /* Negate and then store the results in the destination buffer by packing. */ 
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16); 
 
    in1 = *pSrc++; 
    in2 = *pSrc++; 
 
    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
   ** No loop unrolling is used. */ 
  blkCnt = blockSize % 0x4u; 
 
  while(blkCnt > 0u) 
  { 
    /* C = ~A */ 
    /* Negate and then store the result in the destination buffer. */ 
    *pDst++ = __SSAT(-*pSrc++, 16); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
}

예제 #10

0

파일 보기

파일: dsp_demodulate.cpp 프로젝트: CCrashBandicot/portapack-hackrf

buffer_f32_t AM::execute(
	const buffer_c16_t& src,
	const buffer_f32_t& dst
) {
	const auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;
	while(src_p < src_end) {
		const uint32_t sample0 = *__SIMD32(src_p)++;
		const uint32_t sample1 = *__SIMD32(src_p)++;
		const uint32_t mag_sq0 = __SMUAD(sample0, sample0);
		const uint32_t mag_sq1 = __SMUAD(sample1, sample1);
		*(dst_p++) = __builtin_sqrtf(mag_sq0) * k;
		*(dst_p++) = __builtin_sqrtf(mag_sq1) * k;
	}

	return { dst.p, src.count, src.sampling_rate };
}

예제 #11

0

파일 보기

파일: arm_abs_q7.c 프로젝트: Frehner1/CMSIS_LPC17xx

void arm_abs_q7( 
  q7_t * pSrc, 
  q7_t * pDst, 
  uint32_t blockSize) 
{ 
  uint32_t blkCnt;                               /* loop counter */ 
  q7_t in1;                                      /* Input value1 */ 
  q7_t in2;                                      /* Input value2 */ 
  q7_t in3;                                      /* Input value3 */ 
  q7_t in4;                                      /* Input value4 */ 
 
 
  /*loop Unrolling */ 
  blkCnt = blockSize >> 2u; 
 
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
   ** a second loop below computes the remaining 1 to 3 samples. */ 
  while(blkCnt > 0u) 
  { 
    /* C = |A| */ 
    /* Read 4 inputs */ 
    in1 = *pSrc++; 
    in2 = *pSrc++; 
    in3 = *pSrc++; 
    in4 = *pSrc++; 
 
    /* Store the Absolute result in the destination buffer by packing the 4 values in single cycle */ 
    *__SIMD32(pDst)++ = 
      __PACKq7(((in1 > 0) ? in1 : __SSAT(-in1, 8)), 
               ((in2 > 0) ? in2 : __SSAT(-in2, 8)), 
               ((in3 > 0) ? in3 : __SSAT(-in3, 8)), 
               ((in4 > 0) ? in4 : __SSAT(-in4, 8))); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
   ** No loop unrolling is used. */ 
  blkCnt = blockSize % 0x4u; 
 
  while(blkCnt > 0u) 
  { 
    /* C = |A| */ 
    /* Read the input */ 
    in1 = *pSrc++; 
 
    /* Store the Absolute result in the destination buffer */ 
    *pDst++ = (in1 > 0) ? in1 : __SSAT(-in1, 8); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
}

예제 #12

0

파일 보기

파일: arm_copy_q7.c 프로젝트: JGSuw/DIP

void arm_copy_q7(     
  q7_t * pSrc,     
  q7_t * pDst,     
  uint32_t blockSize)     
{     
  uint32_t blkCnt;                               /* loop counter */     
  q31_t in1, in2;  
     
  /*loop Unrolling */     
  blkCnt = blockSize >> 3u;     
     
  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
   ** a second loop below computes the remaining 1 to 7 samples. */     
  while(blkCnt > 0u)     
  {     
    /* C = A */     
    /* Copy and then store the results in the destination buffer */     
    /* 4 samples are copied and stored at a time using SIMD */     
	in1 = *__SIMD32(pSrc)++;  
	*__SIMD32(pDst)++ = in1;  
	in2 = *__SIMD32(pSrc)++;  
	*__SIMD32(pDst)++ = in2;  
  
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
     
  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.      
   ** No loop unrolling is used. */     
  blkCnt = blockSize % 0x8u;     
     
  while(blkCnt > 0u)     
  {     
    /* C = A */     
    /* Copy and then store the results in the destination buffer */     
    *pDst++ = *pSrc++;     
     
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
}

예제 #13

0

파일 보기

파일: arm_sub_q15.c 프로젝트: imhoffj/RaceCapture-Pro_firmware

void arm_sub_q15(
    q15_t * pSrcA,
    q15_t * pSrcB,
    q15_t * pDst,
    uint32_t blockSize)
{
    uint32_t blkCnt;                               /* loop counter */


#ifndef ARM_MATH_CM0

    /* Run the below code for Cortex-M4 and Cortex-M3 */
    /*loop Unrolling */
    blkCnt = blockSize >> 2u;

    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u) {
        /* C = A - B */
        /* Subtract and then store the results in the destination buffer two samples at a time. */
        *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
        *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = blockSize % 0x4u;

    while(blkCnt > 0u) {
        /* C = A - B */
        /* Subtract and then store the result in the destination buffer. */
        *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }

#else

    /* Run the below code for Cortex-M0 */

    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;

    while(blkCnt > 0u) {
        /* C = A - B */
        /* Subtract and then store the result in the destination buffer. */
        *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);

        /* Decrement the loop counter */
        blkCnt--;
    }

#endif /* #ifndef ARM_MATH_CM0 */


}

예제 #14

0

파일 보기

파일: dsp_demodulate.cpp 프로젝트: CCrashBandicot/portapack-hackrf

buffer_f32_t FM::execute(
	const buffer_c16_t& src,
	const buffer_f32_t& dst
) {
	auto z = z_;

	const auto src_p = src.p;
	const auto src_end = &src.p[src.count];
	auto dst_p = dst.p;
	while(src_p < src_end) {
		const auto s0 = *__SIMD32(src_p)++;
		const auto s1 = *__SIMD32(src_p)++;
		const auto t0 = multiply_conjugate_s16_s32(s0, z);
		const auto t1 = multiply_conjugate_s16_s32(s1, s0);
		z = s1;
		*(dst_p++) = angle_precise(t0) * kf;
		*(dst_p++) = angle_precise(t1) * kf;
	}
	z_ = z;

	return { dst.p, src.count, src.sampling_rate };
}

예제 #15

0

파일 보기

파일: dsp_decimate.cpp 프로젝트: CCrashBandicot/portapack-hackrf

buffer_c16_t DecimateBy2CIC3::execute(
	const buffer_c16_t& src,
	const buffer_c16_t& dst
) {
	/* Complex non-recursive 3rd-order CIC filter (taps 1,3,3,1).
	 * Gain of 8.
	 * Consumes 16 bytes (4 s16:s16 samples) per loop iteration,
	 * Produces  8 bytes (2 s16:s16 samples) per loop iteration.
	 */
	uint32_t t1 = _iq0;
	uint32_t t2 = _iq1;
	const uint32_t taps = 0x00000003;
	auto s = src.p;
	auto d = dst.p;
	const auto d_end = &dst.p[src.count / 2];
	while(d < d_end) {
		uint32_t i = __SXTH(t1, 0);			/* 1: I0 */
		uint32_t q = __SXTH(t1, 16);			/* 1: Q0 */
		i = __SMLABB(t2, taps, i);	/* 1: I1*3 + I0 */
		q = __SMLATB(t2, taps, q);	/* 1: Q1*3 + Q0 */

		const uint32_t t3 = *__SIMD32(s)++;		/* 3: Q2:I2 */
		const uint32_t t4 = *__SIMD32(s)++;		/*    Q3:I3 */

		i = __SMLABB(t3, taps, i);	/* 1: I2*3 + I1*3 + I0 */
		q = __SMLATB(t3, taps, q);	/* 1: Q2*3 + Q1*3 + Q0 */
		int32_t si0 = __SXTAH(i, t4,  0);		/* 1: I3 + Q2*3 + Q1*3 + Q0 */
		int32_t sq0 = __SXTAH(q, t4, 16);		/* 1: Q3 + Q2*3 + Q1*3 + Q0 */
		i = __BFI(si0 / 8, sq0 / 8, 16, 16);	/* 1: D2_Q0:D2_I0 */
		*__SIMD32(d)++ = i;			/* D2_Q0:D2_I0 */

		i = __SXTH(t3, 0);			/* 1: I2 */
		q = __SXTH(t3, 16);			/* 1: Q2 */
		i = __SMLABB(t4, taps, i);	/* 1: I3*3 + I2 */
		q = __SMLATB(t4, taps, q);	/* 1: Q3*3 + Q2 */

		t1 = *__SIMD32(s)++;		/* 3: Q4:I4 */
		t2 = *__SIMD32(s)++;		/*    Q5:I5 */

		i = __SMLABB(t1, taps, i);	/* 1: I4*3 + I3*3 + I2 */
		q = __SMLATB(t1, taps, q);	/* 1: Q4*3 + Q3*3 + Q2 */
		int32_t si1 = __SXTAH(i, t2, 0) ;		/* 1: I5 + Q4*3 + Q3*3 + Q2 */
		int32_t sq1 = __SXTAH(q, t2, 16);		/* 1: Q5 + Q4*3 + Q3*3 + Q2 */
		i = __BFI(si1 / 8, sq1 / 8, 16, 16);	/* 1: D2_Q1:D2_I1 */
		*__SIMD32(d)++ = i;			/* D2_Q1:D2_I1 */
	}
	_iq0 = t1;
	_iq1 = t2;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}

예제 #16

0

파일 보기

파일: arm_copy_q15.c 프로젝트: JGSuw/DIP

void arm_copy_q15(     
  q15_t * pSrc,     
  q15_t * pDst,     
  uint32_t blockSize)     
{     
  uint32_t blkCnt;                               /* loop counter */     
     
     
  /*loop Unrolling */     
  blkCnt = blockSize >> 3u;     
     
  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
   ** a second loop below computes the remaining 1 to 7 samples. */     
  while(blkCnt > 0u)     
  {     
    /* C = A */     
    /* Read two inputs */   
    *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;     
    *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;     
    *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;     
    *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;     
  
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
     
  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.      
   ** No loop unrolling is used. */     
  blkCnt = blockSize % 0x8u;     
     
  while(blkCnt > 0u)     
  {     
    /* C = A */     
    /* Copy and then store the value in the destination buffer */     
    *pDst++ = *pSrc++;     
     
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
}

예제 #17

0

파일 보기

파일: arm_fill_q7.c 프로젝트: DesignByArie/lpc17xx.cmsis.driver.library

void arm_fill_q7( 
  q7_t value, 
  q7_t * pDst, 
  uint32_t blockSize) 
{ 
  uint32_t blkCnt;                               /* loop counter */ 
  q31_t packedValue;                             /* value packed to 32 bits */ 
 
  /*loop Unrolling */ 
  blkCnt = blockSize >> 2u; 
 
  /* Packing four 8 bit values to 32 bit value in order to use SIMD */ 
  packedValue = __PACKq7(value, value, value, value); 
 
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
   ** a second loop below computes the remaining 1 to 3 samples. */ 
  while(blkCnt > 0u) 
  { 
    /* C = value */ 
    /* Fill the value in the destination buffer */ 
    *__SIMD32(pDst)++ = packedValue; 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
   ** No loop unrolling is used. */ 
  blkCnt = blockSize % 0x4u; 
 
  while(blkCnt > 0u) 
  { 
    /* C = value */ 
    /* Fill the value in the destination buffer */ 
    *pDst++ = value; 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
}

예제 #18

0

파일 보기

파일: channel_stats_collector.hpp 프로젝트: CCrashBandicot/portapack-hackrf

	void feed(const buffer_c16_t& src, Callback callback) {
		auto src_p = src.p;
		while(src_p < &src.p[src.count]) {
			const uint32_t sample = *__SIMD32(src_p)++;
			const uint32_t mag_sq = __SMUAD(sample, sample);
			if( mag_sq > max_squared ) {
				max_squared = mag_sq;
			}
		}
		count += src.count;

		const size_t samples_per_update = src.sampling_rate * update_interval;

		if( count >= samples_per_update ) {
			const float max_squared_f = max_squared;
			const int32_t max_db = mag2_to_dbv_norm(max_squared_f * (1.0f / (32768.0f * 32768.0f)));
			callback({ max_db, count });

			max_squared = 0;
			count = 0;
		}
	}

예제 #19

0

파일 보기

파일: arm_sub_q15.c 프로젝트: Frehner1/CMSIS_LPC17xx

void arm_sub_q15( 
  q15_t * pSrcA, 
  q15_t * pSrcB, 
  q15_t * pDst, 
  uint32_t blockSize) 
{ 
  uint32_t blkCnt;                               /* loop counter */ 
 
 
  /*loop Unrolling */ 
  blkCnt = blockSize >> 2u; 
 
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
   ** a second loop below computes the remaining 1 to 3 samples. */ 
  while(blkCnt > 0u) 
  { 
    /* C = A - B */ 
    /* Subtract and then store the results in the destination buffer two samples at a time. */ 
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++); 
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
   ** No loop unrolling is used. */ 
  blkCnt = blockSize % 0x4u; 
 
  while(blkCnt > 0u) 
  { 
    /* C = A - B */ 
    /* Subtract and then store the result in the destination buffer. */ 
    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
}

예제 #20

0

파일 보기

파일: arm_q15_to_q31.c 프로젝트: 12019/USB_Photoframe

void arm_q15_to_q31(
  q15_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q15_t *pIn = pSrc;                             /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t in1, in2;
  q31_t out1, out2, out3, out4;

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = (q31_t)A << 16 */
    /* convert from q15 to q31 and then store the results in the destination buffer */
    in1 = *__SIMD32(pIn)++;
    in2 = *__SIMD32(pIn)++;

#ifndef ARM_MATH_BIG_ENDIAN

    /* extract lower 16 bits to 32 bit result */
    out1 = in1 << 16u;
    /* extract upper 16 bits to 32 bit result */
    out2 = in1 & 0xFFFF0000;
    /* extract lower 16 bits to 32 bit result */
    out3 = in2 << 16u;
    /* extract upper 16 bits to 32 bit result */
    out4 = in2 & 0xFFFF0000;

#else

    /* extract upper 16 bits to 32 bit result */
    out1 = in1 & 0xFFFF0000;
    /* extract lower 16 bits to 32 bit result */
    out2 = in1 << 16u;
    /* extract upper 16 bits to 32 bit result */
    out3 = in2 & 0xFFFF0000;
    /* extract lower 16 bits to 32 bit result */
    out4 = in2 << 16u;

#endif //      #ifndef ARM_MATH_BIG_ENDIAN

    *pDst++ = out1;
    *pDst++ = out2;
    *pDst++ = out3;
    *pDst++ = out4;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* C = (q31_t)A << 16 */
    /* convert from q15 to q31 and then store the results in the destination buffer */
    *pDst++ = (q31_t) * pIn++ << 16;

    /* Decrement the loop counter */
    blkCnt--;
  }

}

예제 #21

0

파일 보기

파일: arm_biquad_cascade_df1_fast_q15.c 프로젝트: imhoffj/RaceCapture-Pro_firmware

void arm_biquad_cascade_df1_fast_q15(
    const arm_biquad_casd_df1_inst_q15 * S,
    q15_t * pSrc,
    q15_t * pDst,
    uint32_t blockSize)
{
    q15_t *pIn = pSrc;                             /*  Source pointer                               */
    q15_t *pOut = pDst;                            /*  Destination pointer                          */
    q31_t in;                                      /*  Temporary variable to hold input value       */
    q31_t out;                                     /*  Temporary variable to hold output value      */
    q31_t b0;                                      /*  Temporary variable to hold bo value          */
    q31_t b1, a1;                                  /*  Filter coefficients                          */
    q31_t state_in, state_out;                     /*  Filter state variables                       */
    q31_t acc0;                                    /*  Accumulator                                  */
    int32_t shift = (int32_t) (15 - S->postShift); /*  Post shift                                   */
    q15_t *pState = S->pState;                     /*  State pointer                                */
    q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
    q31_t *pState_q31;                             /*  32-bit state pointer for SIMD implementation */
    uint32_t sample, stage = S->numStages;         /*  Stage loop counter                           */



    do {
        /* Initialize state pointer of type q31 */
        pState_q31 = (q31_t *) (pState);

        /* Read the b0 and 0 coefficients using SIMD  */
        b0 = *__SIMD32(pCoeffs)++;

        /* Read the b1 and b2 coefficients using SIMD */
        b1 = *__SIMD32(pCoeffs)++;

        /* Read the a1 and a2 coefficients using SIMD */
        a1 = *__SIMD32(pCoeffs)++;

        /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
        state_in = (q31_t) (*pState_q31++);

        /* Read the output state values from the state buffer:  y[n-1], y[n-2] */
        state_out = (q31_t) (*pState_q31);

        /* Apply loop unrolling and compute 2 output values simultaneously. */
        /*      The variables acc0 ... acc3 hold output values that are being computed:
         *
         *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
         *    acc0 =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
         */
        sample = blockSize >> 1u;

        /* First part of the processing with loop unrolling.  Compute 2 outputs at a time.
         ** a second loop below computes the remaining 1 sample. */
        while(sample > 0u) {

            /* Read the input */
            in = *__SIMD32(pIn)++;

            /* out =  b0 * x[n] + 0 * 0 */
            out = __SMUAD(b0, in);
            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
            state_out = __PKHBT(state_out >> 16, (out), 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* out =  b0 * x[n] + 0 * 0 */
            out = __SMUADX(b0, in);
            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);


            /* Store the output in the destination buffer. */

#ifndef  ARM_MATH_BIG_ENDIAN

            *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);

#else

            *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in >> 16, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, in, 16);
            state_out = __PKHBT(state_out >> 16, out, 16);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */


            /* Decrement the loop counter */
            sample--;

        }

        /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
         ** No loop unrolling is used. */

        if((blockSize & 0x1u) != 0u) {
            /* Read the input */
            in = *pIn++;

            /* out =  b0 * x[n] + 0 * 0 */

#ifndef  ARM_MATH_BIG_ENDIAN

            out = __SMUAD(b0, in);

#else

            out = __SMUADX(b0, in);

#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */

            /* acc0 =  b1 * x[n-1] + acc0 +=  b2 * x[n-2] + out */
            acc0 = __SMLAD(b1, state_in, out);
            /* acc0 +=  a1 * y[n-1] + acc0 +=  a2 * y[n-2] */
            acc0 = __SMLAD(a1, state_out, acc0);

            /* The result is converted from 3.29 to 1.31 and then saturation is applied */
            out = __SSAT((acc0 >> shift), 16);

            /* Store the output in the destination buffer. */
            *pOut++ = (q15_t) out;

            /* Every time after the output is computed state should be updated. */
            /* The states should be updated as:  */
            /* Xn2 = Xn1    */
            /* Xn1 = Xn     */
            /* Yn2 = Yn1    */
            /* Yn1 = acc0   */
            /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
            /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */

#ifndef  ARM_MATH_BIG_ENDIAN

            state_in = __PKHBT(in, state_in, 16);
            state_out = __PKHBT(out, state_out, 16);

#else

            state_in = __PKHBT(state_in >> 16, in, 16);
            state_out = __PKHBT(state_out >> 16, out, 16);

#endif /*   #ifndef  ARM_MATH_BIG_ENDIAN    */

        }

        /*  The first stage goes from the input buffer to the output buffer.  */
        /*  Subsequent (numStages - 1) occur in-place in the output buffer  */
        pIn = pDst;

        /* Reset the output pointer */
        pOut = pDst;

        /*  Store the updated state variables back into the state array */
        *__SIMD32(pState)++ = state_in;
        *__SIMD32(pState)++ = state_out;


        /* Decrement the loop counter */
        stage--;

    } while(stage > 0u);

예제 #22

0

파일 보기

파일: arm_dot_prod_q15.c 프로젝트: szymon2103/Stm32

void arm_dot_prod_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  uint32_t blockSize,
  q63_t * result)
{
  q63_t sum = 0;                                 /* Temporary result storage */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

/* Run the below code for Cortex-M4 and Cortex-M3 */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* Calculate dot product and then store the result in a temporary buffer. */
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* Calculate dot product and then store the results in a temporary buffer. */
    sum = __SMLALD(*pSrcA++, *pSrcB++, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }


#else

  /* Run the below code for Cortex-M0 */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* Calculate dot product and then store the results in a temporary buffer. */
    sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);

    /* Decrement the loop counter */
    blkCnt--;
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  /* Store the result in the destination buffer in 34.30 format */
  *result = sum;

}

예제 #23

0

파일 보기

파일: arm_power_q7.c 프로젝트: 153rd/STM32F4-workarea

void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t sum = 0;                                 /* Temporary result storage */
  q7_t in;                                       /* Temporary variable to store input */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t input1;                                  /* Temporary variable to store packed input */
  q31_t in1, in2;                                /* Temporary variables to store input */

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* Reading two inputs of pSrc vector and packing */
    input1 = *__SIMD32(pSrc)++;

    in1 = __SXTB16(__ROR(input1, 8));
    in2 = __SXTB16(input1);

    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
    /* calculate power and accumulate to accumulator */
    sum = __SMLAD(in1, in1, sum);
    sum = __SMLAD(in2, in2, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  while(blkCnt > 0u)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
    /* Compute Power and then store the result in a temporary variable, sum. */
    in = *pSrc++;
    sum += ((q15_t) in * in);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Store the result in 18.14 format  */
  *pResult = sum;
}

예제 #24

0

파일 보기

파일: arm_dot_prod_q7.c 프로젝트: 201409366/LPC1549_LPCXpresso

void arm_dot_prod_q7(
    q7_t * pSrcA,
    q7_t * pSrcB,
    uint32_t blockSize,
    q31_t * result)
{
    uint32_t blkCnt;                               /* loop counter */

    q31_t sum = 0;                                 /* Temporary variables to store output */

#ifndef ARM_MATH_CM0_FAMILY

    /* Run the below code for Cortex-M4 and Cortex-M3 */

    q31_t input1, input2;                          /* Temporary variables to store input */
    q31_t inA1, inA2, inB1, inB2;                  /* Temporary variables to store input */



    /*loop Unrolling */
    blkCnt = blockSize >> 2u;

    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
        /* read 4 samples at a time from sourceA */
        input1 = *__SIMD32(pSrcA)++;
        /* read 4 samples at a time from sourceB */
        input2 = *__SIMD32(pSrcB)++;

        /* extract two q7_t samples to q15_t samples */
        inA1 = __SXTB16(__ROR(input1, 8));
        /* extract reminaing two samples */
        inA2 = __SXTB16(input1);
        /* extract two q7_t samples to q15_t samples */
        inB1 = __SXTB16(__ROR(input2, 8));
        /* extract reminaing two samples */
        inB2 = __SXTB16(input2);

        /* multiply and accumulate two samples at a time */
        sum = __SMLAD(inA1, inB1, sum);
        sum = __SMLAD(inA2, inB2, sum);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = blockSize % 0x4u;

    while(blkCnt > 0u)
    {
        /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
        /* Dot product and then store the results in a temporary buffer. */
        sum = __SMLAD(*pSrcA++, *pSrcB++, sum);

        /* Decrement the loop counter */
        blkCnt--;
    }

#else

    /* Run the below code for Cortex-M0 */



    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;

    while(blkCnt > 0u)
    {
        /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
        /* Dot product and then store the results in a temporary buffer. */
        sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */


    /* Store the result in the destination buffer in 18.14 format */
    *result = sum;
}

예제 #25

0

파일 보기

파일: arm_q7_to_q15.c 프로젝트: CrazianT3k/ahabadge

void arm_q7_to_q15(
    q7_t * pSrc,
    q15_t * pDst,
    uint32_t blockSize)
{
    q7_t *pIn = pSrc;                              /* Src pointer */
    uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY
    q31_t in;
    q31_t in1, in2;
    q31_t out1, out2;

    /* Run the below code for Cortex-M4 and Cortex-M3 */

    /*loop Unrolling */
    blkCnt = blockSize >> 2u;

    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
        /* C = (q15_t) A << 8 */
        /* convert from q7 to q15 and then store the results in the destination buffer */
        in = *__SIMD32(pIn)++;

        /* rotatate in by 8 and extend two q7_t values to q15_t values */
        in1 = __SXTB16(__ROR(in, 8));

        /* extend remainig two q7_t values to q15_t values */
        in2 = __SXTB16(in);

        in1 = in1 << 8u;
        in2 = in2 << 8u;

        in1 = in1 & 0xFF00FF00;
        in2 = in2 & 0xFF00FF00;

#ifndef ARM_MATH_BIG_ENDIAN

        out2 = __PKHTB(in1, in2, 16);
        out1 = __PKHBT(in2, in1, 16);

#else

        out1 = __PKHTB(in1, in2, 16);
        out2 = __PKHBT(in2, in1, 16);

#endif

        *__SIMD32(pDst)++ = out1;
        *__SIMD32(pDst)++ = out2;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = blockSize % 0x4u;

#else

    /* Run the below code for Cortex-M0 */

    /* Loop over blockSize number of values */
    blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

    while(blkCnt > 0u)
    {
        /* C = (q15_t) A << 8 */
        /* convert from q7 to q15 and then store the results in the destination buffer */
        *pDst++ = (q15_t) * pIn++ << 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

}

예제 #26

0

파일 보기

파일: arm_offset_q15.c 프로젝트: OpenNuvoton/Mini51BSP

void arm_offset_q15(
  q15_t * pSrc,
  q15_t offset,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

/* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t offset_packed;                           /* Offset packed to 32 bit */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* Offset is packed to 32 bit in order to use SIMD32 for addition */
  offset_packed = __PKHBT(offset, offset, 16);

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = A + offset */
    /* Add offset and then store the results in the destination buffer, 2 samples at a time. */
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = A + offset */
    /* Add offset and then store the results in the destination buffer. */
    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);

    /* Decrement the loop counter */
    blkCnt--;
  }

#else

  /* Run the below code for Cortex-M0 */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = A + offset */
    /* Add offset and then store the results in the destination buffer. */
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);

    /* Decrement the loop counter */
    blkCnt--;
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}

예제 #27

0

파일 보기

파일: arm_negate_q15.c 프로젝트: ThucVD2704/femto-usb-blink-example

void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */


#ifndef ARM_MATH_CM0

/* Run the below code for Cortex-M4 and Cortex-M3 */

  q15_t in1, in2;                                /* Temporary variables */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.   
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Read two inputs */
    in1 = *pSrc++;
    in2 = *pSrc++;
    /* Negate and then store the results in the destination buffer by packing. */

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16);

#else

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in2, 16), __SSAT(-in1, 16), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    in1 = *pSrc++;
    in2 = *pSrc++;

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in1, 16), __SSAT(-in2, 16), 16);

#else


    *__SIMD32(pDst)++ = __PKHBT(__SSAT(-in2, 16), __SSAT(-in1, 16), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.   
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Negate and then store the result in the destination buffer. */
    *pDst++ = __SSAT(-*pSrc++, 16);

    /* Decrement the loop counter */
    blkCnt--;
  }

}

예제 #28

0

파일 보기

파일: arm_q7_to_q31.c 프로젝트: 201409366/LPC1549_LPCXpresso

void arm_q7_to_q31(
  q7_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q7_t *pIn = pSrc;                              /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  q31_t in;

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = (q31_t) A << 24 */
    /* convert from q7 to q31 and then store the results in the destination buffer */
    in = *__SIMD32(pIn)++;

#ifndef ARM_MATH_BIG_ENDIAN

    *pDst++ = (__ROR(in, 8)) & 0xFF000000;
    *pDst++ = (__ROR(in, 16)) & 0xFF000000;
    *pDst++ = (__ROR(in, 24)) & 0xFF000000;
    *pDst++ = (in & 0xFF000000);

#else

    *pDst++ = (in & 0xFF000000);
    *pDst++ = (__ROR(in, 24)) & 0xFF000000;
    *pDst++ = (__ROR(in, 16)) & 0xFF000000;
    *pDst++ = (__ROR(in, 8)) & 0xFF000000;

#endif //              #ifndef ARM_MATH_BIG_ENDIAN

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* C = (q31_t) A << 24 */
    /* convert from q7 to q31 and then store the results in the destination buffer */
    *pDst++ = (q31_t) * pIn++ << 24;

    /* Decrement the loop counter */
    blkCnt--;
  }

}

예제 #29

0

파일 보기

파일: arm_abs_q15.c 프로젝트: 1heinz/TauLabs

void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0

/* Run the below code for Cortex-M4 and Cortex-M3 */

  q15_t in1;                                     /* Input value1 */
  q15_t in2;                                     /* Input value2 */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read two inputs */
    in1 = *pSrc++;
    in2 = *pSrc++;


    /* Store the Absolute result in the destination buffer by packing the two values, in a single cycle */

#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ =
      __PKHBT(((in1 > 0) ? in1 : __QSUB16(0, in1)),
              ((in2 > 0) ? in2 : __QSUB16(0, in2)), 16);

#else


    *__SIMD32(pDst)++ =
      __PKHBT(((in2 > 0) ? in2 : __QSUB16(0, in2)),
              ((in1 > 0) ? in1 : __QSUB16(0, in1)), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    in1 = *pSrc++;
    in2 = *pSrc++;


#ifndef  ARM_MATH_BIG_ENDIAN

    *__SIMD32(pDst)++ =
      __PKHBT(((in1 > 0) ? in1 : __QSUB16(0, in1)),
              ((in2 > 0) ? in2 : __QSUB16(0, in2)), 16);

#else


    *__SIMD32(pDst)++ =
      __PKHBT(((in2 > 0) ? in2 : __QSUB16(0, in2)),
              ((in1 > 0) ? in1 : __QSUB16(0, in1)), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read the input */
    in1 = *pSrc++;

    /* Calculate absolute value of input and then store the result in the destination buffer. */
    *pDst++ = (in1 > 0) ? in1 : __QSUB16(0, in1);

    /* Decrement the loop counter */
    blkCnt--;
  }

#else

  /* Run the below code for Cortex-M0 */

  q15_t in;                                      /* Temporary input variable */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read the input */
    in = *pSrc++;

    /* Calculate absolute value of input and then store the result in the destination buffer. */
    *pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);

    /* Decrement the loop counter */
    blkCnt--;
  }

#endif /* #ifndef ARM_MATH_CM0 */

}

예제 #30

0

파일 보기

파일: arm_var_q15.c 프로젝트: AndrewCapon/axoloti

void arm_var_q15(
  q15_t * pSrc,
  uint32_t blockSize,
  q15_t * pResult)
{

  q31_t sum = 0;                                 /* Accumulator */
  q31_t meanOfSquares, squareOfMean;             /* square of mean and mean of square */
  uint32_t blkCnt;                               /* loop counter */
  q63_t sumOfSquares = 0;                        /* Accumulator */
   
#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t in;                                      /* input value */
  q15_t in1;                                     /* input value */

	if(blockSize == 1)
	{
		*pResult = 0;
		return;
	}

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1])  */
    /* Compute Sum of squares of the input samples    
     * and then store the result in a temporary variable, sum. */
    in = *__SIMD32(pSrc)++;
    sum += ((in << 16) >> 16);
    sum += (in >> 16);
    sumOfSquares = __SMLALD(in, in, sumOfSquares);
    in = *__SIMD32(pSrc)++;
    sum += ((in << 16) >> 16);
    sum += (in >> 16);
    sumOfSquares = __SMLALD(in, in, sumOfSquares);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
    /* Compute Sum of squares of the input samples    
     * and then store the result in a temporary variable, sum. */
    in1 = *pSrc++;
    sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
    sum += in1;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Compute Mean of squares of the input samples    
   * and then store the result in a temporary variable, meanOfSquares. */
  meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1));

  /* Compute square of mean */
  squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1)));

  /* mean of the squares minus the square of the mean. */
  *pResult = (meanOfSquares - squareOfMean) >> 15;

#else

  /* Run the below code for Cortex-M0 */
  q15_t in;                                      /* input value */

	if(blockSize == 1)
	{
		*pResult = 0;
		return;
	}

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
    /* Compute Sum of squares of the input samples     
     * and then store the result in a temporary variable, sumOfSquares. */
    in = *pSrc++;
    sumOfSquares += (in * in);

    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
    /* Compute sum of all input values and then store the result in a temporary variable, sum. */
    sum += in;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Compute Mean of squares of the input samples     
   * and then store the result in a temporary variable, meanOfSquares. */
  meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1));

  /* Compute square of mean */
  squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1)));

  /* mean of the squares minus the square of the mean. */
  *pResult = (meanOfSquares - squareOfMean) >> 15;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}