/**
 * Element-wise Q15 vector subtraction: pDst[n] = pSrcA[n] - pSrcB[n].
 *
 * pSrcA     points to the first input vector (minuend).
 * pSrcB     points to the second input vector (subtrahend).
 * pDst      points to the output vector.
 * blockSize number of samples to process.
 *
 * Results are produced with saturating operations (__QSUB16 on the
 * Cortex-M4/M3 build, __SSAT(..., 16) on the Cortex-M0 build).
 */
void arm_sub_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* iterations left in the current loop */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3 build: four samples per pass. */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Each statement moves one 32-bit word, i.e. two packed Q15 samples,
     * from each source through __QSUB16 into the destination. */
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
  }

  /* Whatever is left when blockSize is not a multiple of 4 (0 to 3 samples),
   * handled one sample at a time. */
  for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
  }

#else

  /* Cortex-M0 build: plain sample-by-sample loop. The difference is formed
   * in 32 bits and then limited to 16 bits by __SSAT. */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);
  }

#endif /* #ifndef ARM_MATH_CM0 */
}
/**
 * Q15 vector subtraction, pDst[n] = pSrcA[n] - pSrcB[n], using the packed
 * __QSUB16 operation for every sample.
 *
 * pSrcA     first input vector.
 * pSrcB     second input vector, subtracted from the first.
 * pDst      output vector.
 * blockSize number of samples in each vector.
 */
void arm_sub_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t quadCount;                            /* groups of four samples */
  uint32_t tailCount;                            /* samples left after the groups */

  /* Main loop: two 32-bit transfers per pass, each carrying two Q15 samples. */
  for (quadCount = blockSize >> 2u; quadCount > 0u; quadCount--)
  {
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
    *__SIMD32(pDst)++ = __QSUB16(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
  }

  /* Tail loop: the 0 to 3 samples that do not fill a group of four. */
  for (tailCount = blockSize % 0x4u; tailCount > 0u; tailCount--)
  {
    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
  }
}
/**
 * Negates every element of a Q15 vector: pDst[n] = -pSrc[n].
 *
 * pSrc      input vector.
 * pDst      output vector.
 * blockSize number of samples to process.
 *
 * In the scalar loop the input 0x8000 is mapped to 0x7fff instead of being
 * negated; the packed loop negates through __QSUB16(0, x).
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* iterations left in the current loop */
  q15_t sample;                                  /* one input value (scalar loop) */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3 build */
  q31_t firstPair, secondPair;                   /* two packed Q15 samples each */

  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch four samples as two 32-bit words. */
    firstPair = _SIMD32_OFFSET(pSrc);
    secondPair = _SIMD32_OFFSET(pSrc + 2);

    /* 0 - x on both halfwords of each word. */
    firstPair = __QSUB16(0, firstPair);
    secondPair = __QSUB16(0, secondPair);

    /* Write the four results back as two 32-bit words. */
    _SIMD32_OFFSET(pDst) = firstPair;
    _SIMD32_OFFSET(pDst + 2) = secondPair;

    pSrc += 4u;
    pDst += 4u;
  }

  /* 0 to 3 samples remain for the scalar loop below. */
  remaining = blockSize % 0x4u;

#else

  /* Cortex-M0 build: the scalar loop below handles every sample. */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for (; remaining > 0u; remaining--)
  {
    sample = *pSrc++;

    if (sample == (q15_t) 0x8000)
    {
      /* Most negative input: store the largest positive value. */
      *pDst++ = 0x7fff;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
}
/**
 * Q15 vector subtraction: pDst[n] = pSrcA[n] - pSrcB[n].
 *
 * pSrcA     first input vector (read only).
 * pSrcB     second input vector (read only), subtracted from the first.
 * pDst      output vector.
 * blockSize number of samples in each vector.
 *
 * Build switches:
 *   ARM_MATH_LOOPUNROLL  process four samples per pass before the scalar loop.
 *   ARM_MATH_DSP         use __QSUB16 instead of a 32-bit difference fed to __SSAT.
 */
void arm_sub_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t blockSize)
{
  uint32_t remaining;                            /* iterations left in the current loop */

#if defined (ARM_MATH_LOOPUNROLL)

#if defined (ARM_MATH_DSP)
  q31_t a01, a23;                                /* packed sample pairs from pSrcA */
  q31_t b01, b23;                                /* packed sample pairs from pSrcB */
#endif

  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
#if defined (ARM_MATH_DSP)
    /* Two packed reads per source; the helper is handed the address of the
     * pointer (presumably so it can advance it - confirm in its definition). */
    a01 = read_q15x2_ia ((q15_t **) &pSrcA);
    a23 = read_q15x2_ia ((q15_t **) &pSrcA);

    b01 = read_q15x2_ia ((q15_t **) &pSrcB);
    b23 = read_q15x2_ia ((q15_t **) &pSrcB);

    /* Subtract pairwise and store two samples per write. */
    write_q15x2_ia (&pDst, __QSUB16(a01, b01));
    write_q15x2_ia (&pDst, __QSUB16(a23, b23));
#else
    /* Same four outputs without packed arithmetic. */
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
  }

  /* 0 to 3 samples are left for the scalar loop. */
  remaining = blockSize % 0x4U;

#else

  /* No unrolling: the scalar loop processes the whole vector. */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; remaining > 0U; remaining--)
  {
#if defined (ARM_MATH_DSP)
    *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
#else
    *pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
  }
}
/**
 * Q15 vector absolute value: pDst[n] = |pSrc[n]|.
 *
 * pSrc      input vector.
 * pDst      output vector.
 * blockSize number of samples to process.
 *
 * Positive inputs are copied unchanged. Other inputs are negated, either
 * through __QSUB16(0, x) (Cortex-M4/M3 build) or through an explicit
 * 0x8000 -> 0x7fff special case followed by a plain negation (Cortex-M0 build).
 */
void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* iterations left in the current loop */

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3 build */
  q15_t first;                                   /* earlier sample of a pair */
  q15_t second;                                  /* later sample of a pair */

  /* Four samples per pass, written as two packed 32-bit words. */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    first = *pSrc++;
    second = *pSrc++;

    /* __PKHBT merges the two 16-bit results into one word; the operand
     * order is swapped for big-endian builds. */
#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ =
      __PKHBT(((first > 0) ? first : __QSUB16(0, first)),
              ((second > 0) ? second : __QSUB16(0, second)), 16);
#else
    *__SIMD32(pDst)++ =
      __PKHBT(((second > 0) ? second : __QSUB16(0, second)),
              ((first > 0) ? first : __QSUB16(0, first)), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    first = *pSrc++;
    second = *pSrc++;

#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ =
      __PKHBT(((first > 0) ? first : __QSUB16(0, first)),
              ((second > 0) ? second : __QSUB16(0, second)), 16);
#else
    *__SIMD32(pDst)++ =
      __PKHBT(((second > 0) ? second : __QSUB16(0, second)),
              ((first > 0) ? first : __QSUB16(0, first)), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  }

  /* The 0 to 3 samples that do not fill a group of four. */
  for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    first = *pSrc++;
    *pDst++ = (first > 0) ? first : __QSUB16(0, first);
  }

#else

  /* Cortex-M0 build */
  q15_t sample;                                  /* current input value */

  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    sample = *pSrc++;

    /* 0x8000 has no positive counterpart in 16 bits, so it becomes 0x7fff. */
    *pDst++ = (sample > 0) ? sample : ((sample == (q15_t) 0x8000) ? 0x7fff : -sample);
  }

#endif /* #ifndef ARM_MATH_CM0 */
}
buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
	/* Translates incoming complex<int8_t> samples by -fs/4,
	 * decimates by two using a non-recursive third-order CIC filter.
	 *
	 * Returns a buffer descriptor built from dst.p with half of src's sample
	 * count and half of src's sampling rate.
	 */

	/* Derivation of algorithm:
	 * Original CIC filter (decimating by two):
	 * 	D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
	 * 	D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
	 *
	 * 	D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
	 * 	D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
	 *
	 * Translate -fs/4, phased 180 degrees, accomplished by complex multiplication
	 * of complex length-4 sequence:
	 *
	 * Substitute:
	 * 	i0 = -i0, q0 = -q0
	 * 	i1 = -q1, q1 =  i1
	 * 	i2 =  i2, q2 =  q2
	 * 	i3 =  q3, q3 = -i3
	 * 	i4 = -i4, q4 = -q4
	 * 	i5 = -q5, q5 =  i5
	 *
	 * Resulting taps (with decimation by 2, four samples in, two samples out):
	 * 	D_I0 =  q3 * 1 +  i2 * 3 + -q1 * 3 + -i0 * 1
	 * 	D_Q0 = -i3 * 1 +  q2 * 3 +  i1 * 3 + -q0 * 1
	 *
	 * 	D_I1 = -q5 * 1 + -i4 * 3 +  q3 * 3 +  i2 * 1
	 * 	D_Q1 =  i5 * 1 + -q4 * 3 + -i3 * 3 +  q2 * 1
	 */

	// Original author's budget: 6 cycles per complex input sample, not including loop overhead.

	// Naming convention for packed values: the name lists the upper halfword
	// (or byte) first, so "i2_q3" holds i2 in bits [31:16] and q3 in bits [15:0].

	// Filter history saved by the previous call.
	uint32_t q1_i0 = _q1_i0;
	uint32_t q0_i1 = _q0_i1;

	/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
	constexpr uint32_t scale_factor = 32;
	const uint32_t k_3_1 = 0x00030001 * scale_factor;

	// Input and output are walked as 32-bit words.
	uint32_t* in_word = reinterpret_cast<uint32_t*>(&src.p[0]);
	uint32_t* const in_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
	uint32_t* out_word = reinterpret_cast<uint32_t*>(&dst.p[0]);

	while( in_word < in_end ) {
		// Two input words in, two output words out per iteration.
		const uint32_t q3_i3_q2_i2 = in_word[0];
		const uint32_t q5_i5_q4_i4 = in_word[1];
		in_word += 2;

		// Sign-extend bytes to halfwords: after rotating right by the given
		// amount, bits [23:16] and [7:0] become the two 16-bit lanes.
		const uint32_t i2_i3 = __SXTB16(q3_i3_q2_i2, 16);
		const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2,  8);
		const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4,  0);
		const uint32_t q4_q5 = __SXTB16(q5_i5_q4_i4, 24);

		// Regroup lanes.
		// __PKHTB(Rn, Rm, 16) -> Rn[31:16] : (Rm >> 16)[15:0]
		// __PKHBT(Rn, Rm, 16) -> (Rm << 16)[31:16] : Rn[15:0]
		const uint32_t i2_q3 = __PKHTB(i2_i3, q3_q2, 16);
		const uint32_t i3_q2 = __PKHBT(q3_q2, i2_i3, 16);
		const uint32_t q4_i5 = __PKHTB(q4_q5, i5_i4, 16);
		const uint32_t q5_i4 = __PKHBT(i5_i4, q4_q5, 16);

		// First output sample.
		// D_I0 = 3 * (i2 - q1) + (q3 - i0)
		const uint32_t i2_m_q1_q3_m_i0 = __QSUB16(i2_q3, q1_i0);	// lane-wise Rn - Rm
		const uint32_t d_i0 = __SMUAD(k_3_1, i2_m_q1_q3_m_i0);		// lo*lo + hi*hi

		// D_Q0 = 3 * (q2 + i1) - (i3 + q0)
		const uint32_t i3_p_q0_q2_p_i1 = __QADD16(i3_q2, q0_i1);	// lane-wise Rn + Rm
		const uint32_t d_q0 = __SMUSDX(i3_p_q0_q2_p_i1, k_3_1);		// lo*hi - hi*lo

		const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16);

		// Second output sample.
		// D_I1 = (i2 - q5) + 3 * (q3 - i4)
		const uint32_t i2_m_q5_q3_m_i4 = __QSUB16(i2_q3, q5_i4);	// lane-wise Rn - Rm
		const uint32_t d_i1 = __SMUADX(i2_m_q5_q3_m_i4, k_3_1);		// lo*hi + hi*lo

		// D_Q1 = (i5 + q2) - 3 * (q4 + i3)
		const uint32_t q4_p_i3_i5_p_q2 = __QADD16(q4_i5, i3_q2);	// lane-wise Rn + Rm
		const uint32_t d_q1 = __SMUSD(k_3_1, q4_p_i3_i5_p_q2);		// lo*lo - hi*hi

		const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16);

		out_word[0] = d_q0_i0;
		out_word[1] = d_q1_i1;
		out_word += 2;

		// The newest two input samples become the history for the next pass.
		q1_i0 = q5_i4;
		q0_i1 = q4_i5;
	}

	// Persist the history for the next call.
	_q1_i0 = q1_i0;
	_q0_i1 = q0_i1;

	return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
/**
 * Q15 matrix subtraction: C(m,n) = A(m,n) - B(m,n) for every element.
 *
 * pSrcA first input matrix instance.
 * pSrcB second input matrix instance, subtracted from the first.
 * pDst  output matrix instance.
 *
 * Returns ARM_MATH_SUCCESS after processing. When ARM_MATH_MATRIX_CHECK is
 * defined and the row/column counts of the three matrices do not all agree,
 * returns ARM_MATH_SIZE_MISMATCH without touching the output data.
 */
arm_status arm_mat_sub_q15(
  const arm_matrix_instance_q15 * pSrcA,
  const arm_matrix_instance_q15 * pSrcB,
  arm_matrix_instance_q15 * pDst)
{
  q15_t *pInA = pSrcA->pData;                    /* walks the data of matrix A */
  q15_t *pInB = pSrcB->pData;                    /* walks the data of matrix B */
  q15_t *pOut = pDst->pData;                     /* walks the output data */
  uint32_t numSamples;                           /* element count of the matrix */
  uint32_t remaining;                            /* iterations left in the current loop */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Reject operands whose dimensions differ. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows) ||
      (pSrcA->numCols != pDst->numCols))
  {
    return (ARM_MATH_SIZE_MISMATCH);
  }

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  /* The matrix is processed as one flat vector of rows * columns samples. */
  numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#ifndef ARM_MATH_CM0

  /* Cortex-M4 / Cortex-M3 build: four samples per pass, moved as two
   * 32-bit words of two packed Q15 values each. */
  for (remaining = numSamples >> 2u; remaining > 0u; remaining--)
  {
    *__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
    *__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
  }

  /* The 0 to 3 samples that do not fill a group of four. */
  for (remaining = numSamples % 0x4u; remaining > 0u; remaining--)
  {
    *pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
  }

#else

  /* Cortex-M0 build: 32-bit difference limited to 16 bits by __SSAT. */
  for (remaining = numSamples; remaining > 0u; remaining--)
  {
    *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
  }

#endif /* #ifndef ARM_MATH_CM0 */

  return (ARM_MATH_SUCCESS);
}
/**
\brief Test case: TC_CoreSimd_ParAddSub16
\details
- Check Parallel 16-bit addition and subtraction:
  __SADD16   __SSUB16   __SASX   __SSAX
  __SHADD16  __SHSUB16  __SHASX  __SHSAX
  __QADD16   __QSUB16   __QASX   __QSAX
  __UADD16   __USUB16   __UASX   __USAX
  __UHADD16  __UHSUB16  __UHASX  __UHSAX
  __UQSUB16  __UQADD16  __UQASX  __UQSAX

  Each check loads two fixed operands, calls one intrinsic and compares the
  result against a fixed expected word. The body is compiled only when
  __ARM_ARCH_7EM__ or __ARM_FEATURE_DSP is defined as 1.
*/
void TC_CoreSimd_ParAddSub16 (void)
{
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  /* Operands and results are volatile objects, as in every check below. */
  volatile uint32_t lhs_u, rhs_u;
  volatile uint32_t out_u;
  volatile int32_t  lhs_s, rhs_s;
  volatile int32_t  out_s;

  /* ===== signed, modular ===== */

  /* __SADD16 */
  lhs_s = (int32_t)0x80038001;
  rhs_s = (int32_t)0x00040002;
  out_s = __SADD16(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0x80078003);

  /* __SSUB16 */
  lhs_s = (int32_t)0x80078003;
  rhs_s = (int32_t)0x00040002;
  out_s = __SSUB16(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0x80038001);

  /* __SASX */
  lhs_s = (int32_t)0x80078003;
  rhs_s = (int32_t)0x00040002;
  out_s = __SASX(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0x80097FFF);

  /* __SSAX */
  lhs_s = (int32_t)0x80038007;
  rhs_s = (int32_t)0x00020004;
  out_s = __SSAX(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0x7FFF8009);

  /* ===== signed, halving ===== */

  /* __SHADD16 */
  lhs_s = (int32_t)0x80038001;
  rhs_s = (int32_t)0x00040002;
  out_s = __SHADD16(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0xC003C001);

  /* __SHSUB16 */
  lhs_s = (int32_t)0x80078003;
  rhs_s = (int32_t)0x00040002;
  out_s = __SHSUB16(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0xC001C000);

  /* __SHASX */
  lhs_s = (int32_t)0x80078003;
  rhs_s = (int32_t)0x00040002;
  out_s = __SHASX(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0xC004BFFF);

  /* __SHSAX */
  lhs_s = (int32_t)0x80038007;
  rhs_s = (int32_t)0x00020004;
  out_s = __SHSAX(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0xBFFFC004);

  /* ===== signed, saturating ===== */

  /* __QADD16 */
  lhs_s = (int32_t)0x80038000;
  rhs_s = (int32_t)0x00048002;
  out_s = __QADD16(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0x80078000);

  /* __QSUB16 */
  lhs_s = (int32_t)0x80038003;
  rhs_s = (int32_t)0x00040002;
  out_s = __QSUB16(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0x80008001);

  /* __QASX */
  lhs_s = (int32_t)0x80078003;
  rhs_s = (int32_t)0x00040002;
  out_s = __QASX(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0x80098000);

  /* __QSAX */
  lhs_s = (int32_t)0x80038007;
  rhs_s = (int32_t)0x00020004;
  out_s = __QSAX(lhs_s, rhs_s);
  ASSERT_TRUE(out_s == (int32_t)0x80008009);

  /* ===== unsigned, modular ===== */

  /* __UADD16 */
  lhs_u = 0x00010002;
  rhs_u = 0x00020004;
  out_u = __UADD16(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x00030006);

  /* __USUB16 */
  lhs_u = 0x00030006;
  rhs_u = 0x00020004;
  out_u = __USUB16(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x00010002);

  /* __UASX */
  lhs_u = 0x80078003;
  rhs_u = 0x00040002;
  out_u = __UASX(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x80097FFF);

  /* __USAX */
  lhs_u = 0x80038007;
  rhs_u = 0x00020004;
  out_u = __USAX(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x7FFF8009);

  /* ===== unsigned, halving ===== */

  /* __UHADD16 */
  lhs_u = 0x00010002;
  rhs_u = 0x00020004;
  out_u = __UHADD16(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x00010003);

  /* __UHSUB16 */
  lhs_u = 0x00030006;
  rhs_u = 0x00020004;
  out_u = __UHSUB16(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x00000001);

  /* __UHASX */
  lhs_u = 0x80078003;
  rhs_u = 0x00040002;
  out_u = __UHASX(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x40043FFF);

  /* __UHSAX */
  lhs_u = 0x80038007;
  rhs_u = 0x00020004;
  out_u = __UHSAX(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x3FFF4004);

  /* ===== unsigned, saturating ===== */

  /* __UQADD16 */
  lhs_u = 0xFFFE0002;
  rhs_u = 0x00020004;
  out_u = __UQADD16(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0xFFFF0006);

  /* __UQSUB16 */
  lhs_u = 0x00020006;
  rhs_u = 0x00030004;
  out_u = __UQSUB16(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x00000002);

  /* __UQASX */
  lhs_u = 0xFFF80003;
  rhs_u = 0x00040009;
  out_u = __UQASX(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0xFFFF0000);

  /* __UQSAX */
  lhs_u = 0x0003FFF8;
  rhs_u = 0x00090004;
  out_u = __UQSAX(lhs_u, rhs_u);
  ASSERT_TRUE(out_u == 0x0000FFFF);
#endif
}
/**
 * Q15 vector absolute value: pDst[n] = |pSrc[n]|.
 *
 * pSrc      input vector.
 * pDst      output vector.
 * blockSize number of samples to process.
 *
 * Positive inputs are copied unchanged; all other inputs are negated with
 * the 16-bit saturating subtract __QSUB16(0, x), so the most negative input
 * 0x8000 produces 0x7FFF.
 *
 * Fix relative to the previous revision: the 8-way unrolled loop negated
 * with the 32-bit __QSUB. For an input of 0x8000 that yields +32768, which
 * does not fit in q15_t, so the value written back was 0x8000 again, while
 * the remainder loop (which already used __QSUB16) wrote 0x7FFF. The result
 * for the same input therefore depended on its position in the buffer. Both
 * loops now use the same 16-bit saturating negation.
 */
void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  q31_t in1, in2, in3, in4;                      /* temporary input variables */
  q31_t out1, out2, out3, out4;                  /* temporary output variables */

  /* First part: eight outputs per pass. The remaining 0 to 7 samples are
   * handled by the second loop. Loads, computations and stores stay
   * interleaved in their original order. */
  blkCnt = blockSize >> 3u;

  while (blkCnt > 0u)
  {
    /* C = |A| */

    /* Read inputs (sign-extended to 32 bits) */
    in1 = (q31_t)*pSrc;
    in2 = (q31_t)*(pSrc + 1);
    in3 = (q31_t)*(pSrc + 2);

    /* Absolute value. __QSUB16 saturates each 16-bit halfword; the q15_t
     * cast keeps the low halfword, which is the result for this sample. */
    out1 = (in1 > 0) ? in1 : (q15_t) __QSUB16(0, in1);

    in4 = (q31_t)*(pSrc + 3);

    out2 = (in2 > 0) ? in2 : (q15_t) __QSUB16(0, in2);

    *pDst = (q15_t)out1;

    out3 = (in3 > 0) ? in3 : (q15_t) __QSUB16(0, in3);

    in1 = (q31_t)*(pSrc + 4);

    out4 = (in4 > 0) ? in4 : (q15_t) __QSUB16(0, in4);

    *(pDst + 1) = (q15_t)out2;

    in2 = (q31_t)*(pSrc + 5);

    out1 = (in1 > 0) ? in1 : (q15_t) __QSUB16(0, in1);

    *(pDst + 2) = (q15_t)out3;

    out2 = (in2 > 0) ? in2 : (q15_t) __QSUB16(0, in2);

    in3 = (q31_t)*(pSrc + 6);

    *(pDst + 3) = (q15_t)out4;

    in4 = (q31_t)*(pSrc + 7);

    out3 = (in3 > 0) ? in3 : (q15_t) __QSUB16(0, in3);

    *(pDst + 4) = (q15_t)out1;

    out4 = (in4 > 0) ? in4 : (q15_t) __QSUB16(0, in4);

    *(pDst + 5) = (q15_t)out2;
    *(pDst + 6) = (q15_t)out3;

    /* Advance the source pointer by 8 */
    pSrc += 8u;

    *(pDst + 7) = (q15_t)out4;

    /* Advance the destination pointer by 8 */
    pDst += 8u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If blockSize is not a multiple of 8, compute the remaining output
   * samples here, one at a time. */
  blkCnt = blockSize % 0x8u;

  while (blkCnt > 0u)
  {
    /* C = |A| */
    in1 = *pSrc++;

    /* Same 16-bit saturating negation as in the unrolled loop. */
    *pDst++ = (in1 > 0) ? in1 : (q15_t) __QSUB16(0, in1);

    /* Decrement the loop counter */
    blkCnt--;
  }
}