/**
 * @brief Sum of the squares of the elements of a Q15 vector.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    receives the 64-bit sum of squares (34.30 format)
 *
 * The bulk of the vector is consumed four samples per iteration using the
 * dual 16-bit multiply-accumulate intrinsic __SMLALD on packed sample pairs.
 * The remaining 0..3 samples are handled by a scalar loop.
 */
void arm_power_q15(
  q15_t * pSrc,
  uint32_t blockSize,
  q63_t * pResult)
{
  q63_t sum = 0;                                 /* 64-bit accumulator */
  q31_t in32;                                    /* Two packed Q15 samples */
  q15_t in16;                                    /* Single Q15 sample (tail loop) */
  uint32_t blkCnt;                               /* Loop counter */

  /* Loop unrolling: four samples (two packed words) per iteration. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    in32 = *__SIMD32(pSrc)++;
    sum = __SMLALD(in32, in32, sum);

    in32 = *__SIMD32(pSrc)++;
    sum = __SMLALD(in32, in32, sum);

    blkCnt--;
  }

  /* Remaining samples when blockSize is not a multiple of 4. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    in16 = *pSrc++;

    /* Do NOT use __SMLALD here: a q15_t argument is sign-extended to 32 bits,
     * so for a negative sample the upper halfword is 0xFFFF (-1) and the dual
     * multiply would add a spurious (-1) * (-1) = +1 to the result. */
    sum += (q63_t) ((q31_t) in16 * in16);

    blkCnt--;
  }

  /* Store the result in 34.30 format */
  *pResult = sum;
}
/**
 * @brief Dot product of two Q15 vectors.
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[in]  blockSize  number of samples in each vector
 * @param[out] result     receives the 64-bit accumulated sum (34.30 format)
 *
 * When ARM_MATH_CM0_FAMILY is not defined, four products per iteration are
 * accumulated with the dual 16-bit multiply-accumulate intrinsic __SMLALD,
 * followed by a scalar loop for the 0..3 left-over samples. Otherwise a
 * plain scalar loop processes every sample.
 */
void arm_dot_prod_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  uint32_t blockSize,
  q63_t * result)
{
  q63_t sum = 0;                                 /* 64-bit accumulator */
  uint32_t blkCnt;                               /* Loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /* Loop unrolling: four samples (two packed words) per iteration. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + ... + A[blockSize-1]* B[blockSize-1] */
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);

    blkCnt--;
  }

  /* Remaining samples when blockSize is not a multiple of 4. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* Do NOT use __SMLALD here: single q15_t arguments are sign-extended to
     * 32 bits, so when both samples are negative the upper halfwords are
     * both 0xFFFF (-1) and the dual multiply would add a spurious +1. */
    sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);

    blkCnt--;
  }

#else

  /* Run the below code for Cortex-M0 */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + ... + A[blockSize-1]* B[blockSize-1] */
    sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);

    blkCnt--;
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  /* Store the result in the destination buffer in 34.30 format */
  *result = sum;
}
/**
 * @brief Variance of the elements of a Q15 vector.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    receives the variance
 *
 * Computes
 *   (sumOfSquares / (blockSize - 1)
 *      - sum * sum / (blockSize * (blockSize - 1))) >> 15
 * where sum is accumulated in 32 bits and sumOfSquares in 64 bits.
 * For blockSize of 0 or 1 the result is defined as 0; this also avoids a
 * division by zero in the formula above.
 */
void arm_var_q15(
  q15_t * pSrc,
  uint32_t blockSize,
  q15_t * pResult)
{
  q31_t sum = 0;                                 /* Sum of the samples */
  q31_t meanOfSquares, squareOfMean;             /* Terms of the variance formula */
  uint32_t blkCnt;                               /* Loop counter */
  q63_t sumOfSquares = 0;                        /* Sum of the squared samples */
  q15_t in;                                      /* Single Q15 sample */

#ifndef ARM_MATH_CM0_FAMILY
  q31_t in32;                                    /* Two packed Q15 samples */
#endif

  /* Fewer than two samples: blockSize * (blockSize - 1) would be zero. */
  if(blockSize <= 1u)
  {
    *pResult = 0;
    return;
  }

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /* Loop unrolling: four samples (two packed words) per iteration. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    in32 = *__SIMD32(pSrc)++;
    sum += ((in32 << 16) >> 16);                 /* sign-extended low halfword  */
    sum += (in32 >> 16);                         /* sign-extended high halfword */
    sumOfSquares = __SMLALD(in32, in32, sumOfSquares);

    in32 = *__SIMD32(pSrc)++;
    sum += ((in32 << 16) >> 16);
    sum += (in32 >> 16);
    sumOfSquares = __SMLALD(in32, in32, sumOfSquares);

    blkCnt--;
  }

  /* Remaining samples when blockSize is not a multiple of 4. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0: every sample goes through the scalar loop. */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    in = *pSrc++;

    /* Plain multiply on purpose: feeding a sign-extended q15_t to __SMLALD
     * would add (-1) * (-1) = +1 for every negative sample, because the
     * upper halfword of the widened operand is 0xFFFF. */
    sumOfSquares += ((q31_t) in * in);
    sum += in;

    blkCnt--;
  }

  /* Mean of squares (unbiased: divided by blockSize - 1). */
  meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1));

  /* Square of mean. The divisor is formed in 64 bits so that
   * blockSize * (blockSize - 1) cannot wrap for blockSize > 65536. */
  squareOfMean = (q31_t)((q63_t)sum * sum /
                         ((q63_t)blockSize * (q63_t)(blockSize - 1)));

  /* mean of the squares minus the square of the mean. */
  *pResult = (meanOfSquares - squareOfMean) >> 15;
}
/**
 * @brief Dot product of two Q15 vectors (8x unrolled variant).
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[in]  blockSize  number of samples in each vector
 * @param[out] result     receives the 64-bit accumulated sum (34.30 format)
 *
 * Eight samples are processed per iteration: packed sample pairs are read
 * with _SIMD32_OFFSET and accumulated with the dual 16-bit multiply-accumulate
 * intrinsic __SMLALD. Loads and MACs are interleaved as in the original code.
 * A scalar loop handles the remaining 0..7 samples.
 */
void arm_dot_prod_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  uint32_t blockSize,
  q63_t * result)
{
  q63_t sum = 0;                                 /* 64-bit accumulator */
  uint32_t blkCnt;                               /* Loop counter */
  q31_t inA1, inA2, inB1, inB2;                  /* Packed sample pairs */
  q31_t inA3, inA4, inB3, inB4;

  /* Loop unrolling: eight samples per iteration. */
  blkCnt = blockSize >> 3u;

  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + ... + A[blockSize-1]* B[blockSize-1] */

    /* samples 0..1 */
    inA1 = _SIMD32_OFFSET(pSrcA);
    inB1 = _SIMD32_OFFSET(pSrcB);

    /* samples 2..3 (source A read ahead of the first MAC) */
    inA2 = _SIMD32_OFFSET(pSrcA+2);

    sum = __SMLALD(inA1, inB1, sum);

    inB2 = _SIMD32_OFFSET(pSrcB+2);

    /* samples 4..5 */
    inA3 = _SIMD32_OFFSET(pSrcA+4);
    inB3 = _SIMD32_OFFSET(pSrcB+4);

    sum = __SMLALD(inA2, inB2, sum);

    /* samples 6..7 */
    inA4 = _SIMD32_OFFSET(pSrcA+6);
    inB4 = _SIMD32_OFFSET(pSrcB+6);

    /* advance both sources by the eight samples just read */
    pSrcA += 8u;
    pSrcB += 8u;

    sum = __SMLALD(inA3, inB3, sum);
    sum = __SMLALD(inA4, inB4, sum);

    blkCnt--;
  }

  /* Remaining samples when blockSize is not a multiple of 8. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    /* Do NOT use __SMLALD here: single q15_t arguments are sign-extended to
     * 32 bits, so when both samples are negative the upper halfwords are
     * both 0xFFFF (-1) and the dual multiply would add a spurious +1. */
    sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);

    blkCnt--;
  }

  /* Store the result in the destination buffer in 34.30 format */
  *result = sum;
}
/**
\brief Test case: TC_CoreSimd_ParMul16
\details
- Check Parallel 16-bit multiplication:
  __SMLAD
  __SMLADX
  __SMLALD
  __SMLALDX
  __SMLSD
  __SMLSDX
  __SMLSLD
  __SMLSLDX
  __SMUAD
  __SMUADX
  __SMUSD
  __SMUSDX

Each intrinsic is fed fixed packed halfword operands through volatile
variables and its result is compared against a precomputed constant.
The body is compiled only when __ARM_ARCH_7EM__ or __ARM_FEATURE_DSP is 1.
*/
void TC_CoreSimd_ParMul16 (void)
{
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  volatile int32_t lhs;        /* first packed operand  (hi:lo halfwords) */
  volatile int32_t rhs;        /* second packed operand (hi:lo halfwords) */
  volatile int32_t acc32;      /* 32-bit accumulator input */
  volatile int64_t acc64;      /* 64-bit accumulator input */
  volatile int32_t res_s32;
  volatile int64_t res_s64;

  /* __SMLAD: acc + lo*lo + hi*hi = 0x20000000 + 2*4 + 3*5 */
  lhs   = 0x00030002;
  rhs   = 0x00050004;
  acc32 = 0x20000000;
  res_s32 = __SMLAD(lhs, rhs, acc32);
  ASSERT_TRUE(res_s32 == 0x20000017);

  /* __SMLADX: acc + lo*hi + hi*lo = 0x800 + 2*5 + 3*4 */
  lhs   = 0x00030002;
  rhs   = 0x00050004;
  acc32 = 0x00000800;
  res_s32 = __SMLADX(lhs, rhs, acc32);
  ASSERT_TRUE(res_s32 == 0x00000816);

  /* __SMLALD: 64-bit acc + lo*lo + hi*hi */
  lhs   = 0x00030002;
  rhs   = 0x00050004;
  acc64 = 0x0000000200000000LL;
  res_s64 = __SMLALD(lhs, rhs, acc64);
  ASSERT_TRUE(res_s64 == 0x0000000200000017LL);

  /* __SMLALDX: 64-bit acc + lo*hi + hi*lo */
  lhs   = 0x00030002;
  rhs   = 0x00050004;
  acc64 = 0x0000000200000000LL;
  res_s64 = __SMLALDX(lhs, rhs, acc64);
  ASSERT_TRUE(res_s64 == 0x0000000200000016LL);

  /* __SMLSD: acc + lo*lo - hi*hi = 0x800 + 6*4 - 3*5 */
  lhs   = 0x00030006;
  rhs   = 0x00050004;
  acc32 = 0x00000800;
  res_s32 = __SMLSD(lhs, rhs, acc32);
  ASSERT_TRUE(res_s32 == 0x00000809);

  /* __SMLSDX: acc + lo*hi - hi*lo = 0x800 + 2*5 - 3*4 */
  lhs   = 0x00030002;
  rhs   = 0x00050004;
  acc32 = 0x00000800;
  res_s32 = __SMLSDX(lhs, rhs, acc32);
  ASSERT_TRUE(res_s32 == 0x000007FE);

  /* __SMLSLD: 64-bit acc + lo*lo - hi*hi */
  lhs   = 0x00030006;
  rhs   = 0x00050004;
  acc64 = 0x0000000200000000LL;
  res_s64 = __SMLSLD(lhs, rhs, acc64);
  ASSERT_TRUE(res_s64 == 0x0000000200000009LL);

  /* __SMLSLDX: 64-bit acc + lo*hi - hi*lo */
  lhs   = 0x00030006;
  rhs   = 0x00050004;
  acc64 = 0x0000000200000000LL;
  res_s64 = __SMLSLDX(lhs, rhs, acc64);
  ASSERT_TRUE(res_s64 == 0x0000000200000012LL);

  /* __SMUAD: lo*lo + hi*hi, positive then negative operand */
  lhs = 0x00030001;
  rhs = 0x00040002;
  res_s32 = __SMUAD(lhs,rhs);
  ASSERT_TRUE(res_s32 == 0x0000000E);

  lhs = (int32_t)0xFFFDFFFF;
  rhs = (int32_t)0x00040002;
  res_s32 = __SMUAD(lhs,rhs);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF2);

  /* __SMUADX: lo*hi + hi*lo, positive then negative operand */
  lhs = 0x00030001;
  rhs = 0x00040002;
  res_s32 = __SMUADX(lhs,rhs);
  ASSERT_TRUE(res_s32 == 0x0000000A);

  lhs = (int32_t)0xFFFDFFFF;
  rhs = (int32_t)0x00040002;
  res_s32 = __SMUADX(lhs,rhs);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF6);

  /* __SMUSD: lo*lo - hi*hi, positive then negative operand */
  lhs = (int32_t)0x00030001;
  rhs = (int32_t)0x00040002;
  res_s32 = __SMUSD(lhs,rhs);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF6);

  lhs = (int32_t)0xFFFDFFFF;
  rhs = (int32_t)0x00040002;
  res_s32 = __SMUSD(lhs,rhs);
  ASSERT_TRUE(res_s32 == 0x0000000A);

  /* __SMUSDX: lo*hi - hi*lo, positive then negative operand */
  lhs = 0x00030001;
  rhs = 0x00040002;
  res_s32 = __SMUSDX(lhs,rhs);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFFE);

  lhs = (int32_t)0xFFFDFFFF;
  rhs = (int32_t)0x00040002;
  res_s32 = __SMUSDX(lhs,rhs);
  ASSERT_TRUE(res_s32 == (int32_t)0x00000002);
#endif
}
/**
 * @brief Root Mean Square of the elements of a Q15 vector.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    receives the RMS value
 *
 * Accumulates the sum of squares in 64 bits, divides by blockSize, shifts
 * right by 15, saturates to 16 bits with __SSAT and passes that value to
 * arm_sqrt_q15, which writes the result. An empty input (blockSize == 0)
 * yields 0 instead of dividing by zero.
 */
void arm_rms_q15(
  q15_t * pSrc,
  uint32_t blockSize,
  q15_t * pResult)
{
  q63_t sum = 0;                                 /* 64-bit accumulator */
  q15_t in;                                      /* Single Q15 sample */
  uint32_t blkCnt;                               /* Loop counter */

#ifndef ARM_MATH_CM0_FAMILY
  q31_t in32;                                    /* Two packed Q15 samples */
#endif

  /* Nothing to average: avoid sum / 0 below. */
  if(blockSize == 0u)
  {
    *pResult = 0;
    return;
  }

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  /* Loop unrolling: four samples (two packed words) per iteration. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
    in32 = *__SIMD32(pSrc)++;
    sum = __SMLALD(in32, in32, sum);

    in32 = *__SIMD32(pSrc)++;
    sum = __SMLALD(in32, in32, sum);

    blkCnt--;
  }

  /* Remaining samples when blockSize is not a multiple of 4. */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0: every sample goes through the scalar loop. */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    in = *pSrc++;

    /* Plain multiply on purpose: feeding a sign-extended q15_t to __SMLALD
     * would add (-1) * (-1) = +1 for every negative sample, because the
     * upper halfword of the widened operand is 0xFFFF. */
    sum += ((q31_t) in * in);

    blkCnt--;
  }

  /* Truncating and saturating the accumulator to 1.15 format */
  /* Store the result in the destination */
  arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
}
/**
 * @brief Variance of the elements of a Q15 vector.
 *
 * @param[in]  pSrc       points to the input vector
 * @param[in]  blockSize  number of samples in the input vector
 * @param[out] pResult    receives the variance
 *
 * Computes
 *   (sumOfSquares / (blockSize - 1)
 *      - sum * sum / (blockSize * (blockSize - 1))) >> 15
 * where sum is accumulated in 32 bits and sumOfSquares in 64 bits.
 * For blockSize <= 1 the result is 0.
 *
 * ARM_MATH_LOOPUNROLL enables a 4x unrolled main loop; ARM_MATH_DSP
 * additionally selects packed reads (read_q15x2_ia) and the dual 16-bit
 * multiply-accumulate intrinsic __SMLALD inside that loop.
 */
void arm_var_q15(
  const q15_t * pSrc,
        uint32_t blockSize,
        q15_t * pResult)
{
        uint32_t blkCnt;                               /* Loop counter */
        q31_t sum = 0;                                 /* Sum of the samples */
        q31_t meanOfSquares, squareOfMean;             /* Terms of the variance formula */
        q63_t sumOfSquares = 0;                        /* Sum of the squared samples */
        q15_t in;                                      /* Single Q15 sample */

#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
        q31_t in32;                                    /* Two packed Q15 samples */
#endif

  if (blockSize <= 1U)
  {
    *pResult = 0;
    return;
  }

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    /* C = A[0] + A[1] + ... + A[blockSize-1] */

#if defined (ARM_MATH_DSP)
    in32 = read_q15x2_ia ((q15_t **) &pSrc);
    sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
    sum += ((in32 << 16U) >> 16U);               /* sign-extended low halfword  */
    sum +=  (in32 >> 16U);                       /* sign-extended high halfword */

    in32 = read_q15x2_ia ((q15_t **) &pSrc);
    sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
    sum += ((in32 << 16U) >> 16U);
    sum +=  (in32 >> 16U);
#else
    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;
#endif /* #if defined (ARM_MATH_DSP) */

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    in = *pSrc++;

    /* Plain multiply for every configuration: feeding a sign-extended q15_t
     * to __SMLALD would add (-1) * (-1) = +1 for every negative sample,
     * because the upper halfword of the widened operand is 0xFFFF. */
    sumOfSquares += (in * in);

    /* Compute sum and store result in a temporary variable, sum. */
    sum += in;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
  meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));

  /* Compute square of mean. The divisor is formed in 64 bits so that
   * blockSize * (blockSize - 1) cannot wrap for blockSize > 65536. */
  squareOfMean = (q31_t) ((q63_t) sum * sum /
                          ((q63_t) blockSize * (q63_t)(blockSize - 1U)));

  /* mean of squares minus the square of mean. */
  *pResult = (meanOfSquares - squareOfMean) >> 15U;
}
/**
 * @brief Dot product of two Q15 vectors.
 *
 * @param[in]  pSrcA      points to the first input vector
 * @param[in]  pSrcB      points to the second input vector
 * @param[in]  blockSize  number of samples in each vector
 * @param[out] result     receives the 64-bit accumulated sum (34.30 format)
 *
 * With ARM_MATH_LOOPUNROLL, groups of four samples are processed first
 * (using packed reads and __SMLALD when ARM_MATH_DSP is also defined);
 * whatever is left is handled one sample at a time with a plain multiply.
 */
void arm_dot_prod_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        uint32_t blockSize,
        q63_t * result)
{
        q63_t acc = 0;                                 /* Running 64-bit sum */
        uint32_t remaining;                            /* Samples left for the scalar loop */

#if defined (ARM_MATH_LOOPUNROLL)
        uint32_t groups;                               /* Groups of four samples still to do */
#if defined (ARM_MATH_DSP)
        q31_t pairA, pairB;                            /* Two packed Q15 samples from each source */
#endif

  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + ... + A[blockSize-1]* B[blockSize-1] */

#if defined (ARM_MATH_DSP)
    /* Two packed pairs per source, one dual MAC per pair. */
    pairA = read_q15x2_ia ((q15_t **) &pSrcA);
    pairB = read_q15x2_ia ((q15_t **) &pSrcB);
    acc = __SMLALD(pairA, pairB, acc);

    pairA = read_q15x2_ia ((q15_t **) &pSrcA);
    pairB = read_q15x2_ia ((q15_t **) &pSrcB);
    acc = __SMLALD(pairA, pairB, acc);
#else
    /* Four scalar products, then step both sources past them. */
    acc += (q63_t)((q31_t) pSrcA[0] * pSrcB[0]);
    acc += (q63_t)((q31_t) pSrcA[1] * pSrcB[1]);
    acc += (q63_t)((q31_t) pSrcA[2] * pSrcB[2]);
    acc += (q63_t)((q31_t) pSrcA[3] * pSrcB[3]);
    pSrcA += 4;
    pSrcB += 4;
#endif
  }

  /* Left-over samples after the unrolled groups. */
  remaining = blockSize % 0x4U;

#else

  /* No unrolling: every sample goes through the scalar loop. */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; remaining > 0U; remaining--)
  {
    /* One widening multiply per sample pair. */
    acc += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
  }

  /* Store result in destination buffer in 34.30 format */
  *result = acc;
}