/**
 * Adds a constant offset to each element of a Q7 vector, saturating to
 * [-128, 127].
 *
 * pSrc      - input vector (Q7)
 * offset    - Q7 offset added to every sample
 * pDst      - output vector (Q7); may alias pSrc
 * blockSize - number of samples to process
 */
void arm_offset_q7(
  q7_t * pSrc,
  q7_t offset,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t sampleCount;                          /* remaining-sample counter */

#ifndef ARM_MATH_CM0

  /* Cortex-M3/M4 path: process four Q7 samples per iteration using SIMD. */
  q31_t packedOffset;                            /* offset replicated into all four byte lanes */

  /* Replicate the 8-bit offset across a 32-bit word so one __QADD8
     handles four samples at once. */
  packedOffset = __PACKq7(offset, offset, offset, offset);

  /* Main loop: blockSize / 4 SIMD iterations. */
  for (sampleCount = blockSize >> 2u; sampleCount > 0u; sampleCount--)
  {
    /* C = A + offset, four saturating byte additions in parallel. */
    *__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrc)++, packedOffset);
  }

  /* Tail loop: the 0..3 samples left over from the unrolled loop. */
  for (sampleCount = blockSize & 0x3u; sampleCount > 0u; sampleCount--)
  {
    /* C = A + offset, saturated to 8 bits. */
    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
  }

#else

  /* Cortex-M0 path: plain scalar loop, no SIMD support. */
  for (sampleCount = blockSize; sampleCount > 0u; sampleCount--)
  {
    /* Widen to q15_t before adding, then saturate back to 8 bits. */
    *pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
  }

#endif /* #ifndef ARM_MATH_CM0 */
}
/**
 * Element-wise addition of two Q7 vectors with saturation to [-128, 127].
 *
 * pSrcA     - first input vector (Q7)
 * pSrcB     - second input vector (Q7)
 * pDst      - output vector (Q7); may alias either input
 * blockSize - number of samples to process
 */
void arm_add_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t sampleCount;                          /* remaining-sample counter */

#ifndef ARM_MATH_CM0

  /* Cortex-M3/M4 path: four Q7 additions per iteration via SIMD. */
  for (sampleCount = blockSize >> 2u; sampleCount > 0u; sampleCount--)
  {
    /* C = A + B, four saturating byte additions in parallel. */
    *__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
  }

  /* Tail loop: the 0..3 samples left over from the unrolled loop. */
  for (sampleCount = blockSize & 0x3u; sampleCount > 0u; sampleCount--)
  {
    /* C = A + B, saturated to 8 bits. */
    *pDst++ = (q7_t) __SSAT(*pSrcA++ + *pSrcB++, 8);
  }

#else

  /* Cortex-M0 path: plain scalar loop, no SIMD support. */
  for (sampleCount = blockSize; sampleCount > 0u; sampleCount--)
  {
    /* Widen to q15_t before adding, then saturate back to 8 bits. */
    *pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
  }

#endif /* #ifndef ARM_MATH_CM0 */
}
/**
\brief Test case: TC_CoreSimd_ParAddSub8
\details
- Check Parallel 8-bit addition and subtraction:   S   Signed
  __SADD8,  __SSUB8                                Q   Signed Saturating
  __SHADD8, __SHSUB8                               SH  Signed Halving
  __QADD8,  __QSUB8                                U   Unsigned
  __UADD8,  __USUB8                                UQ  Unsigned Saturating
  __UHADD8, __UHSUB8                               UH  Unsigned Halving
  __UQADD8, __UQSUB8
*/
void TC_CoreSimd_ParAddSub8 (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) )
  volatile uint32_t a_u32, b_u32;   /* unsigned operands */
  volatile uint32_t out_u32;        /* unsigned result */
  volatile int32_t  a_s32, b_s32;   /* signed operands */
  volatile int32_t  out_s32;        /* signed result */

  /* --- __SADD8 Test ---------------------------------------------- */
  /* Byte-wise signed add, no saturation. */
  a_s32 = (int32_t)0x87858381;
  b_s32 = (int32_t)0x08060402;
  out_s32 = __SADD8(a_s32, b_s32);
  ASSERT_TRUE(out_s32 == (int32_t)0x8F8B8783);

  /* --- __SSUB8 Test ---------------------------------------------- */
  /* Byte-wise signed subtract, no saturation. */
  a_s32 = (int32_t)0x8F8B8783;
  b_s32 = (int32_t)0x08060402;
  out_s32 = __SSUB8(a_s32, b_s32);
  ASSERT_TRUE(out_s32 == (int32_t)0x87858381);

  /* --- __SHADD8 Test ---------------------------------------------- */
  /* Byte-wise signed add, result halved per lane. */
  a_s32 = 0x07050302;
  b_s32 = 0x08060402;
  out_s32 = __SHADD8(a_s32, b_s32);
  ASSERT_TRUE(out_s32 == 0x07050302);

  /* --- __SHSUB8 Test ---------------------------------------------- */
  /* Byte-wise signed subtract, result halved per lane. */
  a_s32 = (int32_t)0x8F8B8783;
  b_s32 = 0x08060402;
  out_s32 = __SHSUB8(a_s32, b_s32);
  ASSERT_TRUE(out_s32 == (int32_t)0xC3C2C1C0);

  /* --- __QADD8 Test ---------------------------------------------- */
  /* Byte-wise signed add, saturating to [-128, 127] per lane. */
  a_s32 = (int32_t)0x8085837F;
  b_s32 = (int32_t)0xFF060402;
  out_s32 = __QADD8(a_s32, b_s32);
  ASSERT_TRUE(out_s32 == (int32_t)0x808B877F);

  /* --- __QSUB8 Test ---------------------------------------------- */
  /* Byte-wise signed subtract, saturating to [-128, 127] per lane. */
  a_s32 = (int32_t)0x808B8783;
  b_s32 = (int32_t)0x08060402;
  out_s32 = __QSUB8(a_s32, b_s32);
  ASSERT_TRUE(out_s32 == (int32_t)0x80858381);

  /* --- __UADD8 Test ---------------------------------------------- */
  /* Byte-wise unsigned add, no saturation. */
  a_u32 = 0x07050301;
  b_u32 = 0x08060402;
  out_u32 = __UADD8(a_u32, b_u32);
  ASSERT_TRUE(out_u32 == 0x0F0B0703);

  /* --- __USUB8 Test ---------------------------------------------- */
  /* Byte-wise unsigned subtract, no saturation. */
  a_u32 = 0x0F0B0703;
  b_u32 = 0x08060402;
  out_u32 = __USUB8(a_u32, b_u32);
  ASSERT_TRUE(out_u32 == 0x07050301);

  /* --- __UHADD8 Test ---------------------------------------------- */
  /* Byte-wise unsigned add, result halved per lane. */
  a_u32 = 0x07050302;
  b_u32 = 0x08060402;
  out_u32 = __UHADD8(a_u32, b_u32);
  ASSERT_TRUE(out_u32 == 0x07050302);

  /* --- __UHSUB8 Test ---------------------------------------------- */
  /* Byte-wise unsigned subtract, result halved per lane. */
  a_u32 = 0x0F0B0703;
  b_u32 = 0x08060402;
  out_u32 = __UHSUB8(a_u32, b_u32);
  ASSERT_TRUE(out_u32 == 0x03020100);

  /* --- __UQADD8 Test ---------------------------------------------- */
  /* Byte-wise unsigned add, saturating to [0, 255] per lane. */
  a_u32 = 0xFF050301;
  b_u32 = 0x08060402;
  out_u32 = __UQADD8(a_u32, b_u32);
  ASSERT_TRUE(out_u32 == 0xFF0B0703);

  /* --- __UQSUB8 Test ---------------------------------------------- */
  /* Byte-wise unsigned subtract, saturating to [0, 255] per lane. */
  a_u32 = 0x080B0702;
  b_u32 = 0x0F060408;
  out_u32 = __UQSUB8(a_u32, b_u32);
  ASSERT_TRUE(out_u32 == 0x00050300);
#endif
}