buffer_c16_t Complex8DecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
    /* Decimates by two using a third-order CIC filter evaluated in its
     * non-recursive form (taps 1, 3, 3, 1).
     *
     * With i0/i1 (q0/q1) being the two samples carried over from the
     * previous pass, each pass produces:
     *   D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
     *   D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
     *
     *   D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
     *   D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
     *
     * Every pass reads two 32-bit input words (four packed 8-bit I/Q pairs)
     * and writes two 32-bit output words (two packed 16-bit I/Q pairs).
     */

    /* Tap pair 3:1 packed as two halfwords, scaled by 32 to normalize
     * output to +/-32768-ish.
     */
    constexpr uint32_t scale_factor = 32;
    constexpr uint32_t k_3_1 = 0x00030001 * scale_factor;

    /* History from the previous call: sign-extended (i1:i0) and (q1:q0). */
    uint32_t hist_i = _i1_i0;
    uint32_t hist_q = _q1_q0;

    uint32_t* in_p = reinterpret_cast<uint32_t*>(&src.p[0]);
    uint32_t* const in_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
    uint32_t* out_p = reinterpret_cast<uint32_t*>(&dst.p[0]);

    while (in_p < in_end) {
        /* Byte layout, most to least significant: q3:i3:q2:i2 and q5:i5:q4:i4. */
        const uint32_t word_23 = *(in_p++);
        const uint32_t word_45 = *(in_p++);

        /* Split each word into sign-extended halfword pairs: bytes 2 and 0
         * give the I pair, the same bytes after a rotate by 8 give the Q pair.
         */
        const uint32_t pair_i3_i2 = __SXTB16(word_23, 0);
        const uint32_t pair_q3_q2 = __SXTB16(word_23, 8);
        const uint32_t pair_i5_i4 = __SXTB16(word_45, 0);
        const uint32_t pair_q5_q4 = __SXTB16(word_45, 8);

        /* First output: 3 * x1 + 1 * x0, then + 3 * x2 + 1 * x3. */
        const uint32_t out0_i_head = __SMUAD(k_3_1, hist_i);
        const uint32_t out0_i = __SMLADX(k_3_1, pair_i3_i2, out0_i_head);
        const uint32_t out0_q_head = __SMUAD(k_3_1, hist_q);
        const uint32_t out0_q = __SMLADX(k_3_1, pair_q3_q2, out0_q_head);

        /* Second output: 3 * x3 + 1 * x2, then + 3 * x4 + 1 * x5. */
        const uint32_t out1_i_head = __SMUAD(k_3_1, pair_i3_i2);
        const uint32_t out1_i = __SMLADX(k_3_1, pair_i5_i4, out1_i_head);
        const uint32_t out1_q_head = __SMUAD(k_3_1, pair_q3_q2);
        const uint32_t out1_q = __SMLADX(k_3_1, pair_q5_q4, out1_q_head);

        /* Pack as Q in the upper halfword, I in the lower halfword. */
        *(out_p++) = __PKHBT(out0_i, out0_q, 16);
        *(out_p++) = __PKHBT(out1_i, out1_q, 16);

        /* The newest two samples become (x1:x0) of the next pass. */
        hist_i = pair_i5_i4;
        hist_q = pair_q5_q4;
    }

    _i1_i0 = hist_i;
    _q1_q0 = hist_q;

    return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
/**
\brief Test case: TC_CoreSimd_PackUnpack
\details
- Checks the byte-to-halfword extend intrinsics, with and without accumulate:
    __SXTB16
    __SXTAB16
    __UXTB16
    __UXTAB16
*/
void TC_CoreSimd_PackUnpack (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  /* volatile keeps every operand load and result store in the generated code */
  volatile int32_t first_s32, second_s32;
  volatile int32_t out_s32;

  /* --- __SXTB16: bytes 2 and 0 sign-extended to halfwords -------------- */
  first_s32 = (int32_t)0x80830168;
  out_s32   = __SXTB16(first_s32);
  ASSERT_TRUE(out_s32 == (int32_t)0xFF830068);

  /* --- __SXTAB16: sign-extended bytes of the 2nd operand added per halfword to the 1st */
  first_s32  = (int32_t)0x000D0008;
  second_s32 = (int32_t)0x80830168;
  out_s32    = __SXTAB16(first_s32, second_s32);
  ASSERT_TRUE(out_s32 == (int32_t)0xFF900070);

  /* --- __UXTB16: bytes 2 and 0 zero-extended to halfwords -------------- */
  first_s32 = (int32_t)0x80830168;
  out_s32   = __UXTB16(first_s32);
  ASSERT_TRUE(out_s32 == 0x00830068);

  /* --- __UXTAB16: zero-extended bytes of the 2nd operand added per halfword to the 1st */
  first_s32  = 0x000D0008;
  second_s32 = (int32_t)0x80830168;
  out_s32    = __UXTAB16(first_s32, second_s32);
  ASSERT_TRUE(out_s32 == 0x00900070);
#endif
}
/**
 * Dot product of two Q7 vectors.
 *
 * Computes sum = A[0]*B[0] + A[1]*B[1] + ... + A[blockSize-1]*B[blockSize-1]
 * in a 32-bit accumulator without saturation and stores it to *result.
 *
 * @param pSrcA     first input vector
 * @param pSrcB     second input vector
 * @param blockSize number of samples in each vector
 * @param result    receives the accumulated sum (18.14 format)
 */
void arm_dot_prod_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  uint32_t blockSize,
  q31_t * result)
{
  uint32_t blkCnt;                               /* loop counter */
  q31_t sum = 0;                                 /* accumulator */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t input1, input2;                          /* four packed q7 samples each */
  q31_t inA1, inA2, inB1, inB2;                  /* sign-extended halfword pairs */

  /* Loop unrolling: four samples per pass; the shared loop at the end of
   * the function handles the remaining 0 to 3 samples. */
  blkCnt = blockSize >> 2u;

  while(blkCnt > 0u)
  {
    /* read 4 samples at a time from sourceA */
    input1 = *__SIMD32(pSrcA)++;
    /* read 4 samples at a time from sourceB */
    input2 = *__SIMD32(pSrcB)++;

    /* bytes 3 and 1 of each word, sign-extended to halfwords */
    inA1 = __SXTB16(__ROR(input1, 8));
    /* bytes 2 and 0 of each word, sign-extended to halfwords */
    inA2 = __SXTB16(input1);
    inB1 = __SXTB16(__ROR(input2, 8));
    inB2 = __SXTB16(input2);

    /* multiply and accumulate two samples at a time */
    sum = __SMLAD(inA1, inB1, sum);
    sum = __SMLAD(inA2, inB2, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Samples left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize % 0x4u;

#else

  /* Run the below code for Cortex-M0 */

  /* Process every sample in the scalar loop */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* A plain multiply-accumulate is required here. Feeding a single
     * sign-extended q7 value to __SMLAD also multiplies the upper halfwords,
     * which are 0xFFFF for negative values, so a pair of negative samples
     * would add an extra (-1)*(-1) = +1 to the sum. */
    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Store the result in the destination buffer in 18.14 format */
  *result = sum;
}
/**
 * Converts a Q7 vector to Q15: each output is the input shifted left by 8.
 *
 * @param pSrc      input Q7 vector
 * @param pDst      output Q15 vector
 * @param blockSize number of samples to convert
 */
void arm_q7_to_q15(
  q7_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q7_t *pIn = pSrc;                              /* read cursor */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

  /* Packed path for Cortex-M4 and Cortex-M3: four samples per pass */

  q31_t packed;                                  /* four q7 samples */
  q31_t bytes31, bytes20;                        /* bytes 3,1 and bytes 2,0 as halfword pairs */
  q31_t firstWord, secondWord;                   /* output words in store order */

  for(blkCnt = blockSize >> 2u; blkCnt > 0u; blkCnt--)
  {
    /* C = (q15_t) A << 8 */
    packed = *__SIMD32(pIn)++;

    /* rotate by 8 so bytes 3 and 1 land where SXTB16 picks them up */
    bytes31 = __SXTB16(__ROR(packed, 8));
    /* bytes 2 and 0 need no rotation */
    bytes20 = __SXTB16(packed);

    /* Move each sample into the top byte of its halfword, then clear the
     * low byte, which the 32-bit shift filled from the neighbouring lane. */
    bytes31 = bytes31 << 8u;
    bytes20 = bytes20 << 8u;
    bytes31 = bytes31 & 0xFF00FF00;
    bytes20 = bytes20 & 0xFF00FF00;

    /* Re-interleave the halfwords into memory order */
#ifndef ARM_MATH_BIG_ENDIAN
    secondWord = __PKHTB(bytes31, bytes20, 16);
    firstWord = __PKHBT(bytes20, bytes31, 16);
#else
    firstWord = __PKHTB(bytes31, bytes20, 16);
    secondWord = __PKHBT(bytes20, bytes31, 16);
#endif

    *__SIMD32(pDst)++ = firstWord;
    *__SIMD32(pDst)++ = secondWord;
  }

  /* Samples left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize % 0x4u;

#else

  /* Cortex-M0: every sample goes through the scalar loop */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  for(; blkCnt > 0u; blkCnt--)
  {
    /* C = (q15_t) A << 8 */
    *pDst++ = (q15_t) * pIn++ << 8;
  }
}
/**
 * Sum of squares of a Q7 vector.
 *
 * Computes A[0]*A[0] + A[1]*A[1] + ... + A[blockSize-1]*A[blockSize-1]
 * in a 32-bit accumulator and stores it to *pResult.
 *
 * @param pSrc      input Q7 vector
 * @param blockSize number of samples
 * @param pResult   receives the sum of squares (18.14 format)
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t total = 0;                               /* running sum of squares */
  q7_t sample;                                   /* one input sample (scalar loop) */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0

  /* Packed path for Cortex-M4 and Cortex-M3: four samples per pass */

  q31_t packed;                                  /* four q7 samples */
  q31_t bytes31, bytes20;                        /* sign-extended halfword pairs */

  for(blkCnt = blockSize >> 2u; blkCnt > 0u; blkCnt--)
  {
    packed = *__SIMD32(pSrc)++;

    /* bytes 3,1 and bytes 2,0 widened to 16 bit */
    bytes31 = __SXTB16(__ROR(packed, 8));
    bytes20 = __SXTB16(packed);

    /* each dual MAC squares two samples and adds both to the total */
    total = __SMLAD(bytes31, bytes31, total);
    total = __SMLAD(bytes20, bytes20, total);
  }

  /* Samples left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize % 0x4u;

#else

  /* Cortex-M0: every sample goes through the scalar loop */
  blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for(; blkCnt > 0u; blkCnt--)
  {
    /* square one sample and accumulate */
    sample = *pSrc++;
    total += ((q15_t) sample * sample);
  }

  /* Store the result in 18.14 format */
  *pResult = total;
}
buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(const buffer_c8_t& src, const buffer_c16_t& dst) {
    /* Translates incoming complex<int8_t> samples by -fs/4 and decimates by
     * two using a non-recursive third-order CIC filter, in a single pass.
     *
     * Derivation:
     *
     * Plain CIC filter (decimating by two):
     *   D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
     *   D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
     *
     *   D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
     *   D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
     *
     * The -fs/4 translation (phased 180 degrees) is a complex multiplication
     * by a length-4 sequence, which reduces to swaps and negations:
     *   i0 = -i0, q0 = -q0
     *   i1 = -q1, q1 =  i1
     *   i2 =  i2, q2 =  q2
     *   i3 =  q3, q3 = -i3
     *   i4 = -i4, q4 = -q4
     *   i5 = -q5, q5 =  i5
     *
     * Substituting gives the taps actually computed (four samples in, two
     * samples out):
     *   D_I0 =  q3 * 1 +  i2 * 3 + -q1 * 3 + -i0 * 1
     *   D_Q0 = -i3 * 1 +  q2 * 3 +  i1 * 3 + -q0 * 1
     *
     *   D_I1 = -q5 * 1 + -i4 * 3 +  q3 * 3 +  i2 * 1
     *   D_Q1 =  i5 * 1 + -q4 * 3 + -i3 * 3 +  q2 * 1
     *
     * Original author's estimate: 6 cycles per complex input sample, not
     * including loop overhead.
     */

    /* Tap pair 3:1 packed as two halfwords, scaled by 32 to normalize
     * output to +/-32768-ish.
     */
    constexpr uint32_t scale_factor = 32;
    const uint32_t k_3_1 = 0x00030001 * scale_factor;

    /* History from the previous call, already paired the way the taps use it
     * (upper halfword : lower halfword).
     */
    uint32_t hist_q1_i0 = _q1_i0;
    uint32_t hist_q0_i1 = _q0_i1;

    uint32_t* in_p = reinterpret_cast<uint32_t*>(&src.p[0]);
    uint32_t* const in_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
    uint32_t* out_p = reinterpret_cast<uint32_t*>(&dst.p[0]);

    while (in_p < in_end) {
        /* Byte layout, most to least significant: q3:i3:q2:i2 and q5:i5:q4:i4. */
        const uint32_t word_23 = *(in_p++);
        const uint32_t word_45 = *(in_p++);

        /* Sign-extend bytes to halfword pairs; the rotate amount selects
         * which two bytes end up in the pair and in which order.
         */
        const uint32_t pair_i2_i3 = __SXTB16(word_23, 16);
        const uint32_t pair_q3_q2 = __SXTB16(word_23, 8);
        const uint32_t pair_i5_i4 = __SXTB16(word_45, 0);
        const uint32_t pair_q4_q5 = __SXTB16(word_45, 24);

        /* Regroup into the mixed I/Q pairs the translated taps need. */
        const uint32_t pair_i2_q3 = __PKHTB(pair_i2_i3, pair_q3_q2, 16);
        const uint32_t pair_i3_q2 = __PKHBT(pair_q3_q2, pair_i2_i3, 16);
        const uint32_t pair_q4_i5 = __PKHTB(pair_q4_q5, pair_i5_i4, 16);
        const uint32_t pair_q5_i4 = __PKHBT(pair_i5_i4, pair_q4_q5, 16);

        // D_I0 = 3 * (i2 - q1) + (q3 - i0)
        const uint32_t diff_i0 = __QSUB16(pair_i2_q3, hist_q1_i0);
        const uint32_t out_i0 = __SMUAD(k_3_1, diff_i0);

        // D_Q0 = 3 * (q2 + i1) - (i3 + q0)
        const uint32_t sum_q0 = __QADD16(pair_i3_q2, hist_q0_i1);
        const uint32_t out_q0 = __SMUSDX(sum_q0, k_3_1);

        // D_I1 = (i2 - q5) + 3 * (q3 - i4)
        const uint32_t diff_i1 = __QSUB16(pair_i2_q3, pair_q5_i4);
        const uint32_t out_i1 = __SMUADX(diff_i1, k_3_1);

        // D_Q1 = (i5 + q2) - 3 * (q4 + i3)
        const uint32_t sum_q1 = __QADD16(pair_q4_i5, pair_i3_q2);
        const uint32_t out_q1 = __SMUSD(k_3_1, sum_q1);

        /* Pack as Q in the upper halfword, I in the lower halfword. */
        *(out_p++) = __PKHBT(out_i0, out_q0, 16);
        *(out_p++) = __PKHBT(out_i1, out_q1, 16);

        /* Samples 4 and 5 take the roles of samples 0 and 1 in the next pass. */
        hist_q1_i0 = pair_q5_i4;
        hist_q0_i1 = pair_q4_i5;
    }

    _q1_i0 = hist_q1_i0;
    _q0_i1 = hist_q0_i1;

    return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
/**
 * Converts a Q7 vector to Q15: each output is the input shifted left by 8.
 *
 * Eight samples are converted per pass of the unrolled loop; a scalar loop
 * handles the remaining 0 to 7 samples.
 *
 * @param pSrc      input Q7 vector
 * @param pDst      output Q15 vector
 * @param blockSize number of samples to convert
 */
void arm_q7_to_q15(
  q7_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q7_t *pIn = pSrc;                              /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */
  q31_t in, inNext;                              /* two words of four packed q7 samples */
  q31_t in1, in2;                                /* bytes 3,1 and bytes 2,0 as halfword pairs */
  q31_t out1, out2;                              /* packed q15 output words */
  /* Keeps the top byte of each halfword. Named "mask" because "and" is a
   * reserved alternative token in C++. */
  q31_t mask = 0xFF00FF00;

  /*loop Unrolling */
  blkCnt = blockSize >> 3u;

  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */

    /* read 8 samples; both loads are issued before any store */
    in = *__SIMD32(pIn)++;
    inNext = *__SIMD32(pIn)++;

    /* ---- samples 0..3 ---- */

#ifdef CCS
    /* rotate by 8 and extend two q7_t values to q15_t values */
    in1 = __SXTB16(in, 8);
    /* extend remaining two q7_t values to q15_t values */
    in2 = __SXTB16(in, 0);
#else
    /* rotate by 8 and extend two q7_t values to q15_t values */
    in1 = __SXTB16(__ROR(in, 8));
    /* extend remaining two q7_t values to q15_t values */
    in2 = __SXTB16(in);
#endif

    /* Shift each sample into the top byte of its halfword. The 32-bit shift
     * also pushes the lower lane's sign byte into the upper lane's low byte,
     * so both words must be masked before packing. */
    in1 = (in1 << 8u) & mask;
    in2 = (in2 << 8u) & mask;

    /* pack two 16 bit values */
    out1 = __PKHTB(in1, in2, 16);
    out2 = __PKHBT(in2, in1, 16);

    /* store two q15_t samples at a time to destination */
#ifndef ARM_MATH_BIG_ENDIAN
    _SIMD32_OFFSET(pDst + 2) = out1;
    _SIMD32_OFFSET(pDst) = out2;
#else
    _SIMD32_OFFSET(pDst) = out1;
    _SIMD32_OFFSET(pDst + 2) = out2;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* ---- samples 4..7 ---- */

#ifdef CCS
    in1 = __SXTB16(inNext, 8);
    in2 = __SXTB16(inNext, 0);
#else
    in1 = __SXTB16(__ROR(inNext, 8));
    in2 = __SXTB16(inNext);
#endif

    /* Mask in1 as well as in2. Packing the unmasked in1 leaves the sign byte
     * of sample 5 in the low byte of output 7 whenever sample 5 is negative. */
    in1 = (in1 << 8u) & mask;
    in2 = (in2 << 8u) & mask;

    /* pack two 16 bit values */
    out1 = __PKHTB(in1, in2, 16);
    out2 = __PKHBT(in2, in1, 16);

    /* store two q15_t samples at a time to destination */
#ifndef ARM_MATH_BIG_ENDIAN
    _SIMD32_OFFSET(pDst + 6) = out1;
    _SIMD32_OFFSET(pDst + 4) = out2;
#else
    _SIMD32_OFFSET(pDst + 4) = out1;
    _SIMD32_OFFSET(pDst + 6) = out2;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* increment destination pointer */
    pDst += 8u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */
    /* convert from q7 to q15 and then store the results in the destination buffer */
    *pDst++ = (q15_t) * pIn++ << 8;

    /* Decrement the loop counter */
    blkCnt--;
  }
}
/**
 * Sum of squares of a Q7 vector.
 *
 * Computes A[0]*A[0] + A[1]*A[1] + ... + A[blockSize-1]*A[blockSize-1]
 * in 32-bit accumulators and stores the total to *pResult. Eight samples
 * are handled per pass of the unrolled loop; a scalar loop handles the
 * remaining 0 to 7 samples.
 *
 * @param pSrc      input Q7 vector
 * @param blockSize number of samples
 * @param pResult   receives the sum of squares (18.14 format)
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t accA = 0;                                /* accumulator for bytes 3,1 of each word; holds the total afterwards */
  q31_t accB = 0;                                /* accumulator for bytes 2,0 of each word */
  q31_t wordFirst, wordSecond;                   /* two words of four packed q7 samples */
  q31_t firstHi, firstLo;                        /* halfword pairs from the first word */
  q31_t secondHi, secondLo;                      /* halfword pairs from the second word */
  q7_t sample;                                   /* one input sample (scalar loop) */
  uint32_t blkCnt;                               /* loop counter */

  for(blkCnt = blockSize >> 3u; blkCnt > 0u; blkCnt--)
  {
    /* read eight samples from the source buffer */
    wordFirst = _SIMD32_OFFSET(pSrc);
    wordSecond = _SIMD32_OFFSET(pSrc + 4);

    /* extend q7_t values to q15_t halfword pairs */
#ifdef CCS
    firstHi = __SXTB16(wordFirst, 8);
    firstLo = __SXTB16(wordFirst, 0);
    secondHi = __SXTB16(wordSecond, 8);
    secondLo = __SXTB16(wordSecond, 0);
#else
    firstHi = __SXTB16(__ROR(wordFirst, 8));
    firstLo = __SXTB16(wordFirst);
    secondHi = __SXTB16(__ROR(wordSecond, 8));
    secondLo = __SXTB16(wordSecond);
#endif /* #ifdef CCS */

    /* each dual MAC squares two samples; the two accumulators are kept
     * separate and summed after the loop */
    accA = __SMLAD(firstHi, firstHi, accA);
    accB = __SMLAD(firstLo, firstLo, accB);
    accA = __SMLAD(secondHi, secondHi, accA);
    accB = __SMLAD(secondLo, secondLo, accB);

    /* advance to the next eight samples */
    pSrc += 8u;
  }

  /* combine the two partial sums */
  accA = accA + accB;

  /* Samples left over when blockSize is not a multiple of 8 */
  for(blkCnt = blockSize % 0x8u; blkCnt > 0u; blkCnt--)
  {
    /* square one sample and accumulate */
    sample = *pSrc++;
    accA += ((q15_t) sample * sample);
  }

  /* Store the result in 18.14 format */
  *pResult = accA;
}