/**
 * @brief Q15 vector subtraction: pDst[n] = pSrcA[n] - pSrcB[n].
 *
 * @param[in]  pSrcA      points to the first input vector (minuend)
 * @param[in]  pSrcB      points to the second input vector (subtrahend)
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 *
 * Unless ARM_MATH_CM0 is defined, four samples are handled per iteration as
 * two 32-bit accesses fed through __QSUB16, and a scalar loop then handles
 * the remaining blockSize % 4 samples.  With ARM_MATH_CM0 defined, every
 * sample goes through a scalar subtract wrapped in __SSAT(..., 16).
 */
void arm_sub_q15(
    q15_t * pSrcA,
    q15_t * pSrcB,
    q15_t * pDst,
    uint32_t blockSize)
{
#ifndef ARM_MATH_CM0

    /* Cortex-M4 / Cortex-M3 path */
    uint32_t quads = blockSize >> 2u;              /* groups of four samples */
    uint32_t leftover = blockSize % 0x4u;          /* samples after the last full group */

    /* Each pass consumes four samples as two packed 32-bit words. */
    for (; quads > 0u; quads--) {
        *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
        *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
    }

    /* One sample per pass for whatever did not fit a group of four. */
    for (; leftover > 0u; leftover--) {
        *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
    }

#else

    /* Cortex-M0 path: plain scalar loop over every sample */
    uint32_t remaining;

    for (remaining = blockSize; remaining > 0u; remaining--) {
        /* Widen to q31_t, subtract, then clamp back to 16 bits. */
        *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);
    }

#endif /* #ifndef ARM_MATH_CM0 */
}
Beispiel #2
0
/**
 * @brief Q15 vector subtraction: pDst[n] = pSrcA[n] - pSrcB[n].
 *
 * @param[in]  pSrcA      points to the first input vector (minuend)
 * @param[in]  pSrcB      points to the second input vector (subtrahend)
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 *
 * Four samples are handled per iteration as two 32-bit accesses fed through
 * __QSUB16; a scalar loop then handles the remaining blockSize % 4 samples.
 */
void arm_sub_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t groups = blockSize >> 2u;             /* number of four-sample groups */
  uint32_t extras = blockSize % 0x4u;            /* samples left after the groups */
  q15_t a, b;                                    /* scalar operands for the tail */

  /* Unrolled section: two packed subtractions per group of four samples. */
  for (; groups > 0u; groups--)
  {
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
  }

  /* Tail section: one sample at a time. */
  for (; extras > 0u; extras--)
  {
    a = *pSrcA++;
    b = *pSrcB++;
    *pDst++ = (q15_t) __QSUB16(a, b);
  }
}
/**
 * @brief Q15 vector negation: pDst[n] = -pSrc[n].
 *
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 *
 * Unless ARM_MATH_CM0 is defined, groups of four samples are negated as two
 * 32-bit words via __QSUB16(0, x).  The shared scalar loop handles the
 * remaining samples (all of them on Cortex-M0) and maps 0x8000 to 0x7fff.
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples still to process */
  q15_t sample;                                  /* scalar input for the tail loop */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3 path */
  q31_t wordLo, wordHi;                          /* two packed sample pairs */

  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Load both packed pairs before anything is written back. */
    wordLo = _SIMD32_OFFSET(pSrc);
    wordHi = _SIMD32_OFFSET(pSrc + 2);

    /* 0 - x on each 16-bit half */
    wordLo = __QSUB16(0, wordLo);
    wordHi = __QSUB16(0, wordHi);

    /* Store the two result words */
    _SIMD32_OFFSET(pDst) = wordLo;
    _SIMD32_OFFSET(pDst + 2) = wordHi;

    /* Step over the four samples just handled */
    pSrc += 4u;
    pDst += 4u;
  }

  /* Only the samples that did not fit a group of four remain. */
  remaining = blockSize % 0x4u;

#else

  /* Cortex-M0 path: everything goes through the scalar loop */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for (; remaining > 0u; remaining--)
  {
    sample = *pSrc++;

    /* The most negative value has no positive counterpart; pin it to 0x7fff. */
    *pDst++ = (sample == (q15_t) 0x8000) ? 0x7fff : -sample;
  }
}
Beispiel #4
0
/**
 * @brief Q15 vector subtraction: pDst[n] = pSrcA[n] - pSrcB[n].
 *
 * @param[in]  pSrcA      points to the first input vector (minuend)
 * @param[in]  pSrcB      points to the second input vector (subtrahend)
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 *
 * With ARM_MATH_LOOPUNROLL, four samples are produced per iteration before a
 * scalar loop handles the remaining blockSize % 4.  With ARM_MATH_DSP the
 * arithmetic goes through __QSUB16; otherwise through a widened subtract
 * wrapped in __SSAT(..., 16).
 */
void arm_sub_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t blockSize)
{
  uint32_t remaining;                            /* samples (or groups) still to process */

#if defined (ARM_MATH_LOOPUNROLL)

#if defined (ARM_MATH_DSP)
  q31_t aFirst, aSecond;                         /* packed pairs read from pSrcA */
  q31_t bFirst, bSecond;                         /* packed pairs read from pSrcB */
#endif

  /* Unrolled section: four outputs per pass */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
#if defined (ARM_MATH_DSP)
    /* Fetch two packed pairs from each source, advancing the pointers */
    aFirst  = read_q15x2_ia ((q15_t **) &pSrcA);
    aSecond = read_q15x2_ia ((q15_t **) &pSrcA);
    bFirst  = read_q15x2_ia ((q15_t **) &pSrcB);
    bSecond = read_q15x2_ia ((q15_t **) &pSrcB);

    /* Subtract pairwise and write both packed results */
    write_q15x2_ia (&pDst, __QSUB16(aFirst, bFirst));
    write_q15x2_ia (&pDst, __QSUB16(aSecond, bSecond));
#else
    /* Four scalar subtractions, each clamped to 16 bits */
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
  }

  /* What is left after the groups of four */
  remaining = blockSize % 0x4U;

#else

  /* No unrolling: the scalar loop covers the whole vector */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; remaining > 0U; remaining--)
  {
    /* One output sample per pass */
#if defined (ARM_MATH_DSP)
    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
#else
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
  }

}
Beispiel #5
0
/**
 * @brief Q15 vector absolute value: pDst[n] = |pSrc[n]|.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 *
 * Unless ARM_MATH_CM0 is defined, four samples are handled per outer
 * iteration: two passes that each read two samples, take their absolute
 * values (negating via __QSUB16(0, x)) and store them packed in one 32-bit
 * word, with the half order chosen by ARM_MATH_BIG_ENDIAN.  A scalar loop
 * handles the remaining blockSize % 4 samples.  With ARM_MATH_CM0 defined, a
 * scalar loop handles every sample and maps 0x8000 to 0x7fff explicitly.
 */
void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* outer loop counter */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3 path */
  q15_t first;                                   /* first sample of a pair */
  q15_t second;                                  /* second sample of a pair */
  uint32_t pass;                                 /* two pair-passes per group of four */

  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    for (pass = 2u; pass > 0u; pass--)
    {
      /* Read one pair of inputs */
      first = *pSrc++;
      second = *pSrc++;

      /* Pack both absolute values into a single 32-bit store */
#ifndef  ARM_MATH_BIG_ENDIAN

      *__SIMD32(pDst)++ =
        __PKHBT(((first > 0) ? first : __QSUB16(0, first)),
                ((second > 0) ? second : __QSUB16(0, second)), 16);

#else

      *__SIMD32(pDst)++ =
        __PKHBT(((second > 0) ? second : __QSUB16(0, second)),
                ((first > 0) ? first : __QSUB16(0, first)), 16);

#endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
    }
  }

  /* Samples that did not fit a group of four, one per pass */
  for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    first = *pSrc++;

    if (first > 0)
    {
      *pDst++ = first;
    }
    else
    {
      *pDst++ = __QSUB16(0, first);
    }
  }

#else

  /* Cortex-M0 path */
  q15_t sample;                                  /* current input */

  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    sample = *pSrc++;

    if (sample > 0)
    {
      *pDst++ = sample;
    }
    else if (sample == (q15_t) 0x8000)
    {
      /* The most negative value cannot be negated; pin it to 0x7fff. */
      *pDst++ = 0x7fff;
    }
    else
    {
      *pDst++ = -sample;
    }
  }

#endif /* #ifndef ARM_MATH_CM0 */

}
/* Processes one buffer of packed 8-bit complex samples into packed 16-bit
 * complex samples: frequency translation by -fs/4 combined with a
 * decimate-by-two third-order CIC filter (see derivation below).
 *
 * src: input buffer; src.p is read as 32-bit words up to &src.p[src.count].
 * dst: output buffer; dst.p is written as 32-bit words.
 *
 * Returns a buffer descriptor built from dst.p, src.count / 2 and
 * src.sampling_rate / 2.
 *
 * The members _q1_i0 and _q0_i1 are loaded at entry and written back at exit,
 * so the filter history carries over from one call to the next.
 *
 * NOTE(review): each loop iteration consumes two 32-bit input words, so this
 * presumably expects the input length to cover a whole number of such pairs -
 * confirm against callers.
 */
buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Translates incoming complex<int8_t> samples by -fs/4,
	 * decimates by two using a non-recursive third-order CIC filter.
	 */

	/* Derivation of algorithm:
	 * Original CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 *
	 * Translate -fs/4, phased 180 degrees, accomplished by complex multiplication
	 * of complex length-4 sequence:
	 *
	 * Substitute:
	 *	i0 = -i0, q0 = -q0
	 *	i1 = -q1, q1 =  i1
	 *	i2 =  i2, q2 =  q2
	 *	i3 =  q3, q3 = -i3
	 *	i4 = -i4, q4 = -q4
	 *	i5 = -q5, q5 =  i5
	 *
	 * Resulting taps (with decimation by 2, four samples in, two samples out):
	 *	D_I0 =  q3 * 1 +  i2 * 3 + -q1 * 3 + -i0 * 1
	 *	D_Q0 = -i3 * 1 +  q2 * 3 +  i1 * 3 + -q0 * 1
	 *
	 *	D_I1 = -q5 * 1 + -i4 * 3 +  q3 * 3 +  i2 * 1
	 *	D_Q1 =  i5 * 1 + -q4 * 3 + -i3 * 3 +  q2 * 1
	 */

	// 6 cycles per complex input sample, not including loop overhead.
	// Working copies of the history words carried over from the previous call.
	uint32_t q1_i0 = _q1_i0;
	uint32_t q0_i1 = _q0_i1;
	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	// Packed tap weights: 3 * scale_factor in the upper halfword, 1 * scale_factor in the lower.
	const uint32_t k_3_1 = 0x00030001 * scale_factor;
	// Input and output are walked one 32-bit word at a time.
	uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
	while(src_p < src_end) {
		// Two input words (samples 2..3 and 4..5 of the derivation) per iteration.
		const uint32_t q3_i3_q2_i2 = *(src_p++);			// 3
		const uint32_t q5_i5_q4_i4 = *(src_p++);

		// Widen the 8-bit components of samples 2 and 3 to 16 bits and regroup them into the pairings used below.
		const uint32_t i2_i3 = __SXTB16(q3_i3_q2_i2, 16);			// 1: (q3_i3_q2_i2 ror 16)[23:16]:(q3_i3_q2_i2 ror 16)[7:0]
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);			// 1: (q3_i3_q2_i2 ror  8)[23:16]:(q3_i3_q2_i2 ror  8)[7:0]
		const uint32_t i2_q3 = __PKHTB(i2_i3, q3_q2, 16);			// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t i3_q2 = __PKHBT(q3_q2, i2_i3, 16);			// 1:(Rm<<16)[31:16]:Rn[15:0]

		// D_I0 = 3 * (i2 - q1) + (q3 - i0)
		const uint32_t i2_m_q1_q3_m_i0 = __QSUB16(i2_q3, q1_i0);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i0 = __SMUAD(k_3_1, i2_m_q1_q3_m_i0);		// 1: Rm[15:0]*Rs[15:0]+Rm[31:16]*Rs[31:16]

		// D_Q0 = 3 * (q2 + i1) - (i3 + q0)
		const uint32_t i3_p_q0_q2_p_i1 = __QADD16(i3_q2, q0_i1);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q0 = __SMUSDX(i3_p_q0_q2_p_i1, k_3_1);		// 1: Rm[15:0]*Rs[31:16]-Rm[31:16]*RsX[15:0]
		// First output sample: Q in the upper halfword, I in the lower.
		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		// Same widening and regrouping for samples 4 and 5.
		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);			// 1: (q5_i5_q4_i4 ror  0)[23:16]:(q5_i5_q4_i4 ror  0)[7:0]
		const uint32_t q4_q5 = __SXTB16(q5_i5_q4_i4, 24);			// 1: (q5_i5_q4_i4 ror 24)[23:16]:(q5_i5_q4_i4 ror 24)[7:0]
		const uint32_t q4_i5 = __PKHTB(q4_q5, i5_i4, 16);			// 1: Rn[31:16]:(Rm>>16)[15:0]
		const uint32_t q5_i4 = __PKHBT(i5_i4, q4_q5, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]

		// D_I1 = (i2 - q5) + 3 * (q3 - i4)
		const uint32_t i2_m_q5_q3_m_i4 = __QSUB16(i2_q3, q5_i4);	// 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
		const uint32_t d_i1 = __SMUADX(i2_m_q5_q3_m_i4, k_3_1);		// 1: Rm[15:0]*Rs[31:16]+Rm[31:16]*Rs[15:0]

		// D_Q1 = (i5 + q2) - 3 * (q4 + i3)
		const uint32_t q4_p_i3_i5_p_q2 = __QADD16(q4_i5, i3_q2);	// 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
		const uint32_t d_q1 = __SMUSD(k_3_1, q4_p_i3_i5_p_q2);		// 1: Rm[15:0]*Rs[15:0]-Rm[31:16]*Rs[31:16]
		// Second output sample, packed the same way as the first.
		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);			// 1: (Rm<<16)[31:16]:Rn[15:0]
		*(dst_p++) = d_q0_i0;							// 3
		*(dst_p++) = d_q1_i1;

		// The regrouped samples 4 and 5 become the history for the next iteration.
		q1_i0 = q5_i4;
		q0_i1 = q4_i5;
	}
	// Persist the history for the next call.
	_q1_i0 = q1_i0;
	_q0_i1 = q0_i1;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
/**
 * @brief Q15 matrix subtraction: C(m,n) = A(m,n) - B(m,n).
 *
 * @param[in]  pSrcA  points to the first input matrix structure
 * @param[in]  pSrcB  points to the second input matrix structure
 * @param[out] pDst   points to the output matrix structure
 * @return ARM_MATH_SUCCESS, or ARM_MATH_SIZE_MISMATCH when
 *         ARM_MATH_MATRIX_CHECK is defined and the dimensions of the three
 *         matrices differ.
 *
 * The numRows * numCols elements of A are processed as one flat vector.
 * Unless ARM_MATH_CM0 is defined, four elements are handled per iteration
 * through __QSUB16, followed by a scalar tail; with ARM_MATH_CM0 every
 * element goes through a widened subtract wrapped in __SSAT(..., 16).
 */
arm_status arm_mat_sub_q15(
    const arm_matrix_instance_q15 * pSrcA,
    const arm_matrix_instance_q15 * pSrcB,
    arm_matrix_instance_q15 * pDst)
{
    q15_t *pA = pSrcA->pData;                      /* walks matrix A */
    q15_t *pB = pSrcB->pData;                      /* walks matrix B */
    q15_t *pC = pDst->pData;                       /* walks the result matrix */
    uint32_t total;                                /* element count of the matrix */
    uint32_t remaining;                            /* loop counter */

#ifdef ARM_MATH_MATRIX_CHECK

    /* Reject operands whose dimensions do not all agree. */
    if(!((pSrcA->numRows == pSrcB->numRows) &&
         (pSrcA->numCols == pSrcB->numCols) &&
         (pSrcA->numRows == pDst->numRows) && (pSrcA->numCols == pDst->numCols))) {
        return (ARM_MATH_SIZE_MISMATCH);
    }

#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */

    /* The matrix is treated as one contiguous run of elements. */
    total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#ifndef ARM_MATH_CM0

    /* Cortex-M4 / Cortex-M3 path: four elements per pass */
    for (remaining = total >> 2u; remaining > 0u; remaining--) {
        *__SIMD32(pC)++ = __QSUB16(*__SIMD32(pA)++, *__SIMD32(pB)++);
        *__SIMD32(pC)++ = __QSUB16(*__SIMD32(pA)++, *__SIMD32(pB)++);
    }

    /* Then the elements that did not fit a group of four */
    for (remaining = total % 0x4u; remaining > 0u; remaining--) {
        *pC++ = (q15_t) __QSUB16(*pA++, *pB++);
    }

#else

    /* Cortex-M0 path: one element per pass */
    for (remaining = total; remaining > 0u; remaining--) {
        *pC++ = (q15_t) __SSAT(((q31_t) * pA++ - *pB++), 16);
    }

#endif /* #ifndef ARM_MATH_CM0 */

    /* All elements written */
    return (ARM_MATH_SUCCESS);
}
Beispiel #8
0
/**
\brief Test case: TC_CoreSimd_ParAddSub16
\details
- Exercises each parallel 16-bit add/subtract intrinsic with one fixed pair of
  operands and compares the result against a fixed expected word:
  - signed, modular:   __SADD16, __SSUB16, __SASX, __SSAX
  - signed, halving:   __SHADD16, __SHSUB16, __SHASX, __SHSAX
  - signed, saturating: __QADD16, __QSUB16, __QASX, __QSAX
  - unsigned, modular: __UADD16, __USUB16, __UASX, __USAX
  - unsigned, halving: __UHADD16, __UHSUB16, __UHASX, __UHSAX
  - unsigned, saturating: __UQADD16, __UQSUB16, __UQASX, __UQSAX
- The body is compiled only when __ARM_ARCH_7EM__ or __ARM_FEATURE_DSP is
  defined as 1; otherwise the function is empty.
*/
void TC_CoreSimd_ParAddSub16 (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  /* Signed operands and result */
  volatile int32_t sa, sb;
  volatile int32_t sr;

  /* Unsigned operands and result */
  volatile uint32_t ua, ub;
  volatile uint32_t ur;

  /* ===== Signed, modular ===== */

  /* __SADD16 */
  sa = (int32_t)0x80038001;
  sb = (int32_t)0x00040002;
  sr = __SADD16(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0x80078003);

  /* __SSUB16 */
  sa = (int32_t)0x80078003;
  sb = (int32_t)0x00040002;
  sr = __SSUB16(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0x80038001);

  /* __SASX */
  sa = (int32_t)0x80078003;
  sb = (int32_t)0x00040002;
  sr = __SASX(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0x80097FFF);

  /* __SSAX */
  sa = (int32_t)0x80038007;
  sb = (int32_t)0x00020004;
  sr = __SSAX(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0x7FFF8009);

  /* ===== Signed, halving ===== */

  /* __SHADD16 */
  sa = (int32_t)0x80038001;
  sb = (int32_t)0x00040002;
  sr = __SHADD16(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0xC003C001);

  /* __SHSUB16 */
  sa = (int32_t)0x80078003;
  sb = (int32_t)0x00040002;
  sr = __SHSUB16(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0xC001C000);

  /* __SHASX */
  sa = (int32_t)0x80078003;
  sb = (int32_t)0x00040002;
  sr = __SHASX(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0xC004BFFF);

  /* __SHSAX */
  sa = (int32_t)0x80038007;
  sb = (int32_t)0x00020004;
  sr = __SHSAX(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0xBFFFC004);

  /* ===== Signed, saturating ===== */

  /* __QADD16 */
  sa = (int32_t)0x80038000;
  sb = (int32_t)0x00048002;
  sr = __QADD16(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0x80078000);

  /* __QSUB16 */
  sa = (int32_t)0x80038003;
  sb = (int32_t)0x00040002;
  sr = __QSUB16(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0x80008001);

  /* __QASX */
  sa = (int32_t)0x80078003;
  sb = (int32_t)0x00040002;
  sr = __QASX(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0x80098000);

  /* __QSAX */
  sa = (int32_t)0x80038007;
  sb = (int32_t)0x00020004;
  sr = __QSAX(sa, sb);
  ASSERT_TRUE(sr == (int32_t)0x80008009);

  /* ===== Unsigned, modular ===== */

  /* __UADD16 */
  ua = 0x00010002;
  ub = 0x00020004;
  ur = __UADD16(ua, ub);
  ASSERT_TRUE(ur == 0x00030006);

  /* __USUB16 */
  ua = 0x00030006;
  ub = 0x00020004;
  ur = __USUB16(ua, ub);
  ASSERT_TRUE(ur == 0x00010002);

  /* __UASX */
  ua = 0x80078003;
  ub = 0x00040002;
  ur = __UASX(ua, ub);
  ASSERT_TRUE(ur == 0x80097FFF);

  /* __USAX */
  ua = 0x80038007;
  ub = 0x00020004;
  ur = __USAX(ua, ub);
  ASSERT_TRUE(ur == 0x7FFF8009);

  /* ===== Unsigned, halving ===== */

  /* __UHADD16 */
  ua = 0x00010002;
  ub = 0x00020004;
  ur = __UHADD16(ua, ub);
  ASSERT_TRUE(ur == 0x00010003);

  /* __UHSUB16 */
  ua = 0x00030006;
  ub = 0x00020004;
  ur = __UHSUB16(ua, ub);
  ASSERT_TRUE(ur == 0x00000001);

  /* __UHASX */
  ua = 0x80078003;
  ub = 0x00040002;
  ur = __UHASX(ua, ub);
  ASSERT_TRUE(ur == 0x40043FFF);

  /* __UHSAX */
  ua = 0x80038007;
  ub = 0x00020004;
  ur = __UHSAX(ua, ub);
  ASSERT_TRUE(ur == 0x3FFF4004);

  /* ===== Unsigned, saturating ===== */

  /* __UQADD16 */
  ua = 0xFFFE0002;
  ub = 0x00020004;
  ur = __UQADD16(ua, ub);
  ASSERT_TRUE(ur == 0xFFFF0006);

  /* __UQSUB16 */
  ua = 0x00020006;
  ub = 0x00030004;
  ur = __UQSUB16(ua, ub);
  ASSERT_TRUE(ur == 0x00000002);

  /* __UQASX */
  ua = 0xFFF80003;
  ub = 0x00040009;
  ur = __UQASX(ua, ub);
  ASSERT_TRUE(ur == 0xFFFF0000);

  /* __UQSAX */
  ua = 0x0003FFF8;
  ub = 0x00090004;
  ur = __UQSAX(ua, ub);
  ASSERT_TRUE(ur == 0x0000FFFF);
#endif
}
Beispiel #9
0
/**
 * @brief Q15 vector absolute value: pDst[n] = |pSrc[n]|.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[out] pDst       points to the output vector
 * @param[in]  blockSize  number of samples in the vector
 *
 * Eight samples are handled per iteration of the unrolled loop, with loads,
 * computations and stores interleaved; a scalar loop then handles the
 * remaining blockSize % 8 samples.
 *
 * Negative inputs are negated with the 16-bit saturating __QSUB16 in both
 * loops, so the most negative value 0x8000 becomes 0x7FFF.  The unrolled
 * loop previously used the 32-bit __QSUB on the sign-extended sample, which
 * yields +32768 for 0x8000 and wraps back to 0x8000 when narrowed to q15_t,
 * disagreeing with the tail loop.
 */
void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  q31_t in1, in2, in3, in4;                      /* temporary input variables */
  q31_t out1, out2, out3, out4;                  /* temporary output variables */

  /* Loop unrolling: number of groups of eight samples */
  blkCnt = blockSize >> 3u;

  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
   ** A second loop below computes the remaining 1 to 7 samples. */
  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read inputs */
    in1 = (q31_t)*pSrc;
    in2 = (q31_t)*(pSrc + 1);
    in3 = (q31_t)*(pSrc + 2);

    /* Absolute value; the q15_t cast keeps only the saturated low halfword */
    out1 = (in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1);

    /* Read input */
    in4 = (q31_t)*(pSrc + 3);

    /* Absolute value */
    out2 = (in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2);

    /* Store result to destination */
    *pDst = (q15_t)out1;

    /* Absolute value */
    out3 = (in3 > 0) ? in3 : (q15_t)__QSUB16(0, in3);

    /* Read input */
    in1 = (q31_t)*(pSrc + 4);

    /* Absolute value */
    out4 = (in4 > 0) ? in4 : (q15_t)__QSUB16(0, in4);

    /* Store result to destination */
    *(pDst + 1) = (q15_t)out2;

    /* Read input */
    in2 = (q31_t)*(pSrc + 5);

    /* Absolute value */
    out1 = (in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1);

    /* Store result to destination */
    *(pDst + 2) = (q15_t)out3;

    /* Absolute value */
    out2 = (in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2);

    /* Read input */
    in3 = (q31_t)*(pSrc + 6);

    /* Store result to destination */
    *(pDst + 3) = (q15_t)out4;

    /* Read input */
    in4 = (q31_t)*(pSrc + 7);

    /* Absolute value */
    out3 = (in3 > 0) ? in3 : (q15_t)__QSUB16(0, in3);

    /* Store result to destination */
    *(pDst + 4) = (q15_t)out1;

    /* Absolute value */
    out4 = (in4 > 0) ? in4 : (q15_t)__QSUB16(0, in4);

    /* Store results to destination */
    *(pDst + 5) = (q15_t)out2;
    *(pDst + 6) = (q15_t)out3;

    /* Increment source pointer by 8 */
    pSrc += 8u;

    /* Store result to destination */
    *(pDst + 7) = (q15_t)out4;

    /* Increment destination pointer by 8 */
    pDst += 8u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read the input */
    in1 = *pSrc++;

    /* Calculate absolute value of input and then store the result in the destination buffer. */
    *pDst++ = (in1 > 0) ? (q15_t)in1 : (q15_t)__QSUB16(0, in1);

    /* Decrement the loop counter */
    blkCnt--;
  }
}