C++ (Cpp) __SMLALD 예제들

예제 #1

0

파일 보기

파일: arm_power_q15.c 프로젝트: Frehner1/CMSIS_LPC17xx

void arm_power_q15( 
  q15_t * pSrc, 
  uint32_t blockSize, 
  q63_t * pResult) 
{ 
  q63_t sum = 0;                                 /* Temporary result storage */ 
  q31_t in32;                                    /* Temporary variable to store input value */ 
  q15_t in16;                                    /* Temporary variable to store input value */ 
  uint32_t blkCnt;                               /* loop counter */ 
 
 
  /* loop Unrolling */ 
  blkCnt = blockSize >> 2u; 
 
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
   ** a second loop below computes the remaining 1 to 3 samples. */ 
  while(blkCnt > 0u) 
  { 
    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ 
    /* Compute Power and then store the result in a temporary variable, sum. */ 
    in32 = *__SIMD32(pSrc)++; 
    sum = __SMLALD(in32, in32, sum); 
    in32 = *__SIMD32(pSrc)++; 
    sum = __SMLALD(in32, in32, sum); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
   ** No loop unrolling is used. */ 
  blkCnt = blockSize % 0x4u; 
 
  while(blkCnt > 0u) 
  { 
    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */ 
    /* Compute Power and then store the result in a temporary variable, sum. */ 
    in16 = *pSrc++; 
    sum = __SMLALD(in16, in16, sum); 
 
    /* Decrement the loop counter */ 
    blkCnt--; 
  } 
 
  /* Store the results in 34.30 format  */ 
  *pResult = sum; 
}

예제 #2

0

파일 보기

파일: arm_dot_prod_q15.c 프로젝트: szymon2103/Stm32

void arm_dot_prod_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  uint32_t blockSize,
  q63_t * result)
{
  q63_t sum = 0;                                 /* Temporary result storage */
  uint32_t blkCnt;                               /* loop counter */

#ifndef ARM_MATH_CM0_FAMILY

/* Run the below code for Cortex-M4 and Cortex-M3 */


  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* Calculate dot product and then store the result in a temporary buffer. */
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* Calculate dot product and then store the results in a temporary buffer. */
    sum = __SMLALD(*pSrcA++, *pSrcB++, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }


#else

  /* Run the below code for Cortex-M0 */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* Calculate dot product and then store the results in a temporary buffer. */
    sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);

    /* Decrement the loop counter */
    blkCnt--;
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  /* Store the result in the destination buffer in 34.30 format */
  *result = sum;

}

예제 #3

0

파일 보기

파일: arm_var_q15.c 프로젝트: AndrewCapon/axoloti

void arm_var_q15(
  q15_t * pSrc,
  uint32_t blockSize,
  q15_t * pResult)
{

  q31_t sum = 0;                                 /* Accumulator */
  q31_t meanOfSquares, squareOfMean;             /* square of mean and mean of square */
  uint32_t blkCnt;                               /* loop counter */
  q63_t sumOfSquares = 0;                        /* Accumulator */
   
#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t in;                                      /* input value */
  q15_t in1;                                     /* input value */

	if(blockSize == 1)
	{
		*pResult = 0;
		return;
	}

  /*loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1])  */
    /* Compute Sum of squares of the input samples    
     * and then store the result in a temporary variable, sum. */
    in = *__SIMD32(pSrc)++;
    sum += ((in << 16) >> 16);
    sum += (in >> 16);
    sumOfSquares = __SMLALD(in, in, sumOfSquares);
    in = *__SIMD32(pSrc)++;
    sum += ((in << 16) >> 16);
    sum += (in >> 16);
    sumOfSquares = __SMLALD(in, in, sumOfSquares);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
    /* Compute Sum of squares of the input samples    
     * and then store the result in a temporary variable, sum. */
    in1 = *pSrc++;
    sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
    sum += in1;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Compute Mean of squares of the input samples    
   * and then store the result in a temporary variable, meanOfSquares. */
  meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1));

  /* Compute square of mean */
  squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1)));

  /* mean of the squares minus the square of the mean. */
  *pResult = (meanOfSquares - squareOfMean) >> 15;

#else

  /* Run the below code for Cortex-M0 */
  q15_t in;                                      /* input value */

	if(blockSize == 1)
	{
		*pResult = 0;
		return;
	}

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
    /* Compute Sum of squares of the input samples     
     * and then store the result in a temporary variable, sumOfSquares. */
    in = *pSrc++;
    sumOfSquares += (in * in);

    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
    /* Compute sum of all input values and then store the result in a temporary variable, sum. */
    sum += in;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Compute Mean of squares of the input samples     
   * and then store the result in a temporary variable, meanOfSquares. */
  meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1));

  /* Compute square of mean */
  squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1)));

  /* mean of the squares minus the square of the mean. */
  *pResult = (meanOfSquares - squareOfMean) >> 15;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}

예제 #4

0

파일 보기

파일: arm_dot_prod_q15.c 프로젝트: JGSuw/DIP

void arm_dot_prod_q15(     
  q15_t * pSrcA,     
  q15_t * pSrcB,     
  uint32_t blockSize,     
  q63_t * result)     
{     
  q63_t sum = 0;                                 /* Temporary result storage */     
  uint32_t blkCnt;                               /* loop counter */     
  q31_t inA1, inA2, inB1, inB2; 				 /* Temporary variables to store input data */  
  q31_t inA3, inA4, inB3, inB4;  
     
  /*loop Unrolling */     
  blkCnt = blockSize >> 3u;     
     
  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
   ** a second loop below computes the remaining 1 to 7 samples. */     
  while(blkCnt > 0u)     
  {     
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */     
    /* Calculate dot product and then store the result in a temporary buffer. */     
	/* read two samples at a time from soruceA buffer */  
	inA1 = _SIMD32_OFFSET(pSrcA);  
	/* read two samples at a time from soruceB buffer */  
	inB1 = _SIMD32_OFFSET(pSrcB);  
	/* read two samples at a time from soruceA buffer */  
	inA2 = _SIMD32_OFFSET(pSrcA+2);  
  
	/* multiply and accumulate two samples at a time */  
    sum = __SMLALD(inA1, inB1, sum);
	    
	/* read two samples at a time from soruceB buffer */  
	inB2 = _SIMD32_OFFSET(pSrcB+2);  
	/* read two samples at a time from soruceA buffer */  
	inA3 = _SIMD32_OFFSET(pSrcA+4);  
	/* read two samples at a time from soruceB buffer */  
	inB3 = _SIMD32_OFFSET(pSrcB+4);  
  
	/* multiply and accumulate two samples at a time */  
    sum = __SMLALD(inA2, inB2, sum);
  
	/* read two samples at a time from soruceA buffer */  
	inA4 = _SIMD32_OFFSET(pSrcA+6);  
	/* read two samples at a time from soruceB buffer */  
	inB4 = _SIMD32_OFFSET(pSrcB+6);  
  
	/* increment source A buffer by 8 */  
	pSrcA += 8u;  
	/* increment sourceB buffer by 8 */  
	pSrcB += 8u;  
  
	/* multiply and accumulate two samples at a time */  
    sum = __SMLALD(inA3, inB3, sum);
    sum = __SMLALD(inA4, inB4, sum);
  
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.      
   ** No loop unrolling is used. */     
  blkCnt = blockSize % 0x8u;     
     
  while(blkCnt > 0u)     
  {     
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */     
    /* Calculate dot product and then store the results in a temporary buffer. */     
    sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
     
    /* Decrement the loop counter */     
    blkCnt--;     
  }     
     
  /* Store the result in the destination buffer in 34.30 format */     
  *result = sum;     
}

예제 #5

0

파일 보기

파일: CV_CoreSimd.c 프로젝트: ARM-software/CMSIS_5

/**
\brief Test case: TC_CoreSimd_ParMul16
\details
- Check Parallel 16-bit multiplication:
  __SMLAD
  __SMLADX
  __SMLALD
  __SMLALDX
  __SMLSD
  __SMLSDX
  __SMLSLD
  __SMLSLDX
  __SMUAD
  __SMUADX
  __SMUSD
  __SMUSDX
*/
void TC_CoreSimd_ParMul16 (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  volatile int32_t op1_s32, op2_s32, op3_s32;
  volatile int32_t res_s32;

  volatile int64_t op1_s64;
  volatile int64_t res_s64;

  /* --- __SMLAD Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x20000000;
  res_s32 = __SMLAD(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x20000017);

  /* --- __SMLADX Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLADX(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x00000816);

  /* --- __SMLALD Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLALD(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000017LL);

  /* --- __SMLALDX Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLALDX(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000016LL);

  /* --- __SMLSD Test ---------------------------------------------- */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLSD(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x00000809);

  /* --- __SMLSDX Test ---------------------------------------------- */
  op1_s32 = 0x00030002;
  op2_s32 = 0x00050004;
  op3_s32 = 0x00000800;
  res_s32 = __SMLSDX(op1_s32, op2_s32, op3_s32);
  ASSERT_TRUE(res_s32 == 0x000007FE);

  /* --- __SMLSLD Test ---------------------------------------------- */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLSLD(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000009LL);

  /* --- __SMLSLDX Test ---------------------------------------------- */
  op1_s32 = 0x00030006;
  op2_s32 = 0x00050004;
  op1_s64 = 0x00000000200000000LL;
  res_s64 = __SMLSLDX(op1_s32, op2_s32, op1_s64);
  ASSERT_TRUE(res_s64 == 0x0000000200000012LL);

  /* --- __SMUAD Test ---------------------------------------------- */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUAD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000E);

  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUAD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF2);

  /* --- __SMUADX Test ---------------------------------------------- */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUADX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000A);

  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUADX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF6);

  /* --- __SMUSD Test ---------------------------------------------- */
  op1_s32 = (int32_t)0x00030001;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFF6);

  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSD(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == 0x0000000A);

  /* --- __SMUSDX Test ---------------------------------------------- */
  op1_s32 = 0x00030001;
  op2_s32 = 0x00040002;
  res_s32 = __SMUSDX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0xFFFFFFFE);

  op1_s32 = (int32_t)0xFFFDFFFF;
  op2_s32 = (int32_t)0x00040002;
  res_s32 = __SMUSDX(op1_s32,op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0x00000002);
#endif
}

예제 #6

0

파일 보기

파일: arm_rms_q15.c 프로젝트: 0x4C4A/stm32f072-discovery-basic-template

void arm_rms_q15(
  q15_t * pSrc,
  uint32_t blockSize,
  q15_t * pResult)
{
  q63_t sum = 0;                                 /* accumulator */

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t in;                                      /* temporary variable to store the input value */
  q15_t in1;                                     /* temporary variable to store the input value */
  uint32_t blkCnt;                               /* loop counter */

  /* loop Unrolling */
  blkCnt = blockSize >> 2u;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
    /* Compute sum of the squares and then store the results in a temporary variable, sum */
    in = *__SIMD32(pSrc)++;
    sum = __SMLALD(in, in, sum);
    in = *__SIMD32(pSrc)++;
    sum = __SMLALD(in, in, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
    /* Compute sum of the squares and then store the results in a temporary variable, sum */
    in1 = *pSrc++;
    sum = __SMLALD(in1, in1, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Truncating and saturating the accumulator to 1.15 format */
  /* Store the result in the destination */
  arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);

#else

  /* Run the below code for Cortex-M0 */

  q15_t in;                                      /* temporary variable to store the input value */
  uint32_t blkCnt;                               /* loop counter */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
    /* Compute sum of the squares and then store the results in a temporary variable, sum */
    in = *pSrc++;
    sum += ((q31_t) in * in);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Truncating and saturating the accumulator to 1.15 format */
  /* Store the result in the destination */
  arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}

예제 #7

0

파일 보기

파일: arm_var_q15.c 프로젝트: ARM-software/CMSIS_5

void arm_var_q15(
  const q15_t * pSrc,
        uint32_t blockSize,
        q15_t * pResult)
{
        uint32_t blkCnt;                               /* Loop counter */
        q31_t sum = 0;                                 /* Accumulator */
        q31_t meanOfSquares, squareOfMean;             /* Square of mean and mean of square */
        q63_t sumOfSquares = 0;                        /* Sum of squares */
        q15_t in;                                      /* Temporary variable to store input value */

#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
        q31_t in32;                                    /* Temporary variable to store input value */
#endif

  if (blockSize <= 1U)
  {
    *pResult = 0;
    return;
  }

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    /* C = A[0] + A[1] + ... + A[blockSize-1] */

    /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
    /* Compute sum and store result in a temporary variable, sum. */
#if defined (ARM_MATH_DSP)
    in32 = read_q15x2_ia ((q15_t **) &pSrc);
    sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
    sum += ((in32 << 16U) >> 16U);
    sum +=  (in32 >> 16U);

    in32 = read_q15x2_ia ((q15_t **) &pSrc);
    sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
    sum += ((in32 << 16U) >> 16U);
    sum +=  (in32 >> 16U);
#else
    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;

    in = *pSrc++;
    sumOfSquares += (in * in);
    sum += in;
#endif /* #if defined (ARM_MATH_DSP) */

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    /* C = A[0] + A[1] + ... + A[blockSize-1] */

    in = *pSrc++;
    /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
#if defined (ARM_MATH_DSP)
    sumOfSquares = __SMLALD(in, in, sumOfSquares);
#else
    sumOfSquares += (in * in);
#endif /* #if defined (ARM_MATH_DSP) */
    /* Compute sum and store result in a temporary variable, sum. */
    sum += in;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
  meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));

  /* Compute square of mean */
  squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));

  /* mean of squares minus the square of mean. */
  *pResult = (meanOfSquares - squareOfMean) >> 15U;
}

예제 #8

0

파일 보기

파일: arm_dot_prod_q15.c 프로젝트: ARM-software/CMSIS_5

void arm_dot_prod_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        uint32_t blockSize,
        q63_t * result)
{
        uint32_t blkCnt;                               /* Loop counter */
        q63_t sum = 0;                                 /* Temporary return variable */

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */

#if defined (ARM_MATH_DSP)
    /* Calculate dot product and store result in a temporary buffer. */
    sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
    sum = __SMLALD(read_q15x2_ia ((q15_t **) &pSrcA), read_q15x2_ia ((q15_t **) &pSrcB), sum);
#else
    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
#endif

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */

    /* Calculate dot product and store result in a temporary buffer. */
//#if defined (ARM_MATH_DSP)
//    sum  = __SMLALD(*pSrcA++, *pSrcB++, sum);
//#else
    sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
//#endif

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Store result in destination buffer in 34.30 format */
  *result = sum;
}