Beispiel #1
0
/**
\brief Test case: TC_CoreSimd_SatAddSub
\details
- Check Saturating addition and subtraction:
  __QADD
  __QSUB
- The body is compiled only when __ARM_ARCH_7EM__ or __ARM_FEATURE_DSP is
  defined as 1; otherwise this test case is empty.
- Each intrinsic is exercised twice: once with operands whose exact result
  fits in 32 bits, and once with operands whose exact result lies below
  INT32_MIN, where the expected value is 0x80000000.
*/
void TC_CoreSimd_SatAddSub (void) {
#if ((defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__  == 1)) || \
     (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))     )
  /* NOTE(review): operands and result are volatile, presumably so the
     intrinsic calls are not folded at compile time - confirm. */
  volatile int32_t op1_s32, op2_s32;
  volatile int32_t res_s32;

  /* --- __QADD Test ---------------------------------------------- */
  /* In range: 0x80000003 + 4 is representable, so no clamping is expected */
  op1_s32 = (int32_t)0x80000003;
  op2_s32 = (int32_t)0x00000004;
  res_s32 = __QADD(op1_s32, op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0x80000007);

  /* Negative overflow: the exact sum is below INT32_MIN, expect 0x80000000 */
  op1_s32 = (int32_t)0x80000000;
  op2_s32 = (int32_t)0x80000002;
  res_s32 = __QADD(op1_s32, op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0x80000000);

  /* --- __QSUB Test ---------------------------------------------- */
  /* Negative overflow: the exact difference is below INT32_MIN, expect 0x80000000 */
  op1_s32 = (int32_t)0x80000003;
  op2_s32 = (int32_t)0x00000004;
  res_s32 = __QSUB(op1_s32, op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0x80000000);

  /* In range: 0x80000003 - 2 is representable, so no clamping is expected */
  op1_s32 = (int32_t)0x80000003;
  op2_s32 = (int32_t)0x00000002;
  res_s32 = __QSUB(op1_s32, op2_s32);
  ASSERT_TRUE(res_s32 == (int32_t)0x80000001);
#endif
}
/**
 * Element-wise saturating subtraction of two Q31 vectors:
 * pDst[n] = pSrcA[n] - pSrcB[n].
 *
 * @param pSrcA      minuend samples
 * @param pSrcB      subtrahend samples
 * @param pDst       destination for the differences
 * @param blockSize  number of samples to process
 */
void arm_sub_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* iterations still to run */

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M3/M4 path: groups of four samples, then a scalar tail */
  q31_t a0, a1, a2, a3;
  q31_t b0, b1, b2, b3;

  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch both operands of all four pairs before any result is written */
    a0 = *pSrcA++;
    a1 = *pSrcA++;
    b0 = *pSrcB++;
    b1 = *pSrcB++;

    a2 = *pSrcA++;
    a3 = *pSrcA++;
    b2 = *pSrcB++;
    b3 = *pSrcB++;

    /* C = A - B, saturated */
    *pDst++ = __QSUB(a0, b0);
    *pDst++ = __QSUB(a1, b1);
    *pDst++ = __QSUB(a2, b2);
    *pDst++ = __QSUB(a3, b3);
  }

  /* Tail: the 0..3 samples that did not fill a group of four */
  for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
  }

#else

  /* Cortex-M0 path: one sample per iteration, widened to 64 bits and clipped */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ - *pSrcB++);
  }

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

}
Beispiel #3
0
/**
 * Element-wise saturating subtraction of two Q31 matrices:
 * C(m,n) = A(m,n) - B(m,n).
 *
 * The matrices are processed as flat arrays of numRows * numCols elements,
 * eight at a time, followed by a scalar loop for the remainder.
 *
 * @param pSrcA  first input matrix (minuend)
 * @param pSrcB  second input matrix (subtrahend)
 * @param pDst   output matrix
 * @return ARM_MATH_SUCCESS once all elements are written. When
 *         ARM_MATH_MATRIX_CHECK is defined and the row/column counts of the
 *         three matrices differ, ARM_MATH_SIZE_MISMATCH is returned and
 *         nothing is written.
 */
arm_status arm_mat_sub_q31(
  const arm_matrix_instance_q31 * pSrcA,
  const arm_matrix_instance_q31 * pSrcB,
  arm_matrix_instance_q31 * pDst)
{
  q31_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
  q31_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
  q31_t *pOut = pDst->pData;                     /* output data matrix pointer */
  q31_t inA1, inA2, inB1, inB2;                  /* operand temporaries, reused for successive element pairs */
  q31_t out1, out2;                              /* result temporaries, reused for successive element pairs */
  uint32_t numSamples;                           /* total number of elements in the matrix  */
  uint32_t blkCnt;                               /* loop counters */
  arm_status status;                             /* status of matrix subtraction */


#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition  */
  if((pSrcA->numRows != pSrcB->numRows) ||
     (pSrcA->numCols != pSrcB->numCols) ||
     (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif
  {
    /* Total number of samples in the input matrix */
    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;

    /* Loop Unrolling */
    blkCnt = numSamples >> 3u;

    /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
     ** A second loop below computes the remaining 0 to 7 samples.
     ** Loads, subtractions and stores of neighbouring elements are interleaved;
     ** every element's operands are read before its result is stored. */
    while(blkCnt > 0u)
    {
      /* C(m,n) = A(m,n) - B(m,n) */
      /* Subtract, saturate and then store the results in the destination buffer. */
	  /* Read element 0 of source A */
	  inA1 = pIn1[0];

	  /* Read element 0 of source B */
	  inB1 = pIn2[0];

	  /* Read element 1 of source A */
	  inA2 = pIn1[1];

	  /* Subtract and saturate (element 0) */
	  out1 = __QSUB(inA1, inB1);

	  /* Read element 1 of source B */
	  inB2 = pIn2[1];

	  /* Read element 2 of source A */
	  inA1 = pIn1[2];

	  /* Subtract and saturate (element 1) */
	  out2 = __QSUB(inA2, inB2);

	  /* Read element 2 of source B */
	  inB1 = pIn2[2];

	  /* Store results 0 and 1 in destination */
	  pOut[0] = out1;
	  pOut[1] = out2;

	  /* Read element 3 of source A */
	  inA2 = pIn1[3];

	  /* Read element 3 of source B */
	  inB2 = pIn2[3];

	  /* Subtract and saturate (element 2) */
	  out1 = __QSUB(inA1, inB1);

	  /* Read element 4 of source A */
	  inA1 = pIn1[4];

	  /* Subtract and saturate (element 3) */
	  out2 = __QSUB(inA2, inB2);

	  /* Read element 4 of source B */
	  inB1 = pIn2[4];

	  /* Store result 2 in destination */
	  pOut[2] = out1;

	  /* Read element 5 of source A */
	  inA2 = pIn1[5];

	  /* Subtract and saturate (element 4) */
	  out1 = __QSUB(inA1, inB1);

	  /* Read element 5 of source B */
	  inB2 = pIn2[5];

	  /* Store result 3 in destination */
	  pOut[3] = out2;

	  /* Read element 6 of source A */
	  inA1 = pIn1[6];

	  /* Subtract and saturate (element 5) */
	  out2 = __QSUB(inA2, inB2);

	  /* Read element 7 of source A */
	  inA2 = pIn1[7];

	  /* Read elements 6 and 7 of source B */
	  inB1 = pIn2[6];
	  inB2 = pIn2[7];

	  /* Store result 4 in destination */
	  pOut[4] = out1;

	  /* Subtract and saturate (element 6) */
	  out1 = __QSUB(inA1, inB1);

	  /* Store result 5 in destination */
	  pOut[5] = out2;

	  /* Subtract and saturate (element 7) */
	  out2 = __QSUB(inA2, inB2);

	  /* Store result 6 in destination */
	  pOut[6] = out1;

	  /* Increment Source A pointer */
	  pIn1 += 8u;

	  /* Store result 7 in destination */
	  pOut[7] = out2;

	  /* Increment Source B pointer */
	  pIn2 += 8u;

	  /* Increment Destination pointer */
	  pOut += 8u;

      /* Decrement the loop counter */
      blkCnt--;
    }

    /* If the numSamples is not a multiple of 8, compute any remaining output samples here.
     ** No loop unrolling is used. */
    blkCnt = numSamples % 0x8u;

    while(blkCnt > 0u)
    {
      /* C(m,n) = A(m,n) - B(m,n) */
      /* Subtract, saturate and then store the results in the destination buffer. */
	  inA1 = *pIn1++;
	  inB1 = *pIn2++;

      inA1 = __QSUB(inA1, inB1);

	  *pOut++ = inA1;

      /* Decrement the loop counter */
      blkCnt--;
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
/**
 * Element-wise absolute value of a Q31 vector: pDst[n] = |pSrc[n]|.
 *
 * The most negative input (INT32_MIN, i.e. -1.0 in Q31) has no positive
 * counterpart and is mapped to INT32_MAX.
 *
 * @param pSrc       input samples
 * @param pDst       output samples
 * @param blockSize  number of samples to process
 */
void arm_abs_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* iterations still to run */
  q31_t sample;                                  /* input of the scalar loop */

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M3/M4 path: four samples per pass, remainder handled below */
  q31_t s0, s1, s2, s3;

  for (count = blockSize >> 2u; count > 0u; count--)
  {
    /* Fetch the four inputs before any output is written */
    s0 = *pSrc++;
    s1 = *pSrc++;
    s2 = *pSrc++;
    s3 = *pSrc++;

    /* Positive values pass through; the rest are negated as the saturating difference 0 - x */
    *pDst++ = (s0 > 0) ? s0 : (q31_t)__QSUB(0, s0);
    *pDst++ = (s1 > 0) ? s1 : (q31_t)__QSUB(0, s1);
    *pDst++ = (s2 > 0) ? s2 : (q31_t)__QSUB(0, s2);
    *pDst++ = (s3 > 0) ? s3 : (q31_t)__QSUB(0, s3);
  }

  /* 0..3 samples are left for the scalar loop */
  count = blockSize % 0x4u;

#else

  /* Cortex-M0 path: the scalar loop processes every sample */
  count = blockSize;

#endif /*   #ifndef ARM_MATH_CM0_FAMILY   */

  for (; count > 0u; count--)
  {
    sample = *pSrc++;

    if (sample > 0)
    {
      *pDst++ = sample;
    }
    else if (sample == INT32_MIN)
    {
      /* -INT32_MIN is not representable: clamp to the largest value */
      *pDst++ = INT32_MAX;
    }
    else
    {
      *pDst++ = -sample;
    }
  }

}
/**
 * Complex conjugate of a Q31 vector stored as interleaved (real, imag) pairs:
 * the real part is copied, the imaginary part is negated with saturation
 * (an imaginary part of INT32_MIN becomes INT32_MAX).
 *
 * @param pSrc        interleaved complex input, 2 * numSamples values
 * @param pDst        interleaved complex output, 2 * numSamples values
 * @param numSamples  number of complex samples to process
 */
void arm_cmplx_conj_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t numSamples)
{
  uint32_t count;                                /* iterations still to run */
  q31_t imag;                                    /* imaginary part in the scalar loop */

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M3/M4 path: four complex samples (eight values) per pass */
  q31_t re0, re1, re2, re3;                      /* real parts */
  q31_t im0, im1, im2, im3;                      /* imaginary parts */

  for (count = numSamples >> 2u; count > 0u; count--)
  {
    /* C[0] + jC[1] = A[0] - jA[1] for each of the four samples.
     * Reads, negations and writes are interleaved across the samples. */

    /* sample 0: copy real part */
    re0 = pSrc[0];
    pDst[0] = re0;

    /* sample 0: fetch imaginary part */
    im0 = pSrc[1];

    /* sample 1: copy real part */
    re1 = pSrc[2];
    pDst[2] = re1;

    /* sample 1: fetch imaginary part */
    im1 = pSrc[3];

    /* sample 0: imag = 0 - imag, saturated */
    im0 = __QSUB(0, im0);

    /* sample 2: copy real part */
    re2 = pSrc[4];
    pDst[4] = re2;

    /* sample 2: fetch imaginary part */
    im2 = pSrc[5];

    /* sample 1: imag = 0 - imag, saturated */
    im1 = __QSUB(0, im1);

    /* sample 3: copy real part */
    re3 = pSrc[6];
    pDst[6] = re3;

    /* sample 2: imag = 0 - imag, saturated */
    im2 = __QSUB(0, im2);

    /* sample 3: fetch imaginary part */
    im3 = pSrc[7];

    /* sample 0: write negated imaginary part */
    pDst[1] = im0;

    /* sample 3: imag = 0 - imag, saturated */
    im3 = __QSUB(0, im3);

    /* sample 1: write negated imaginary part */
    pDst[3] = im1;

    /* advance the input by four complex samples */
    pSrc += 8u;

    /* samples 2 and 3: write negated imaginary parts */
    pDst[5] = im2;
    pDst[7] = im3;

    /* advance the output by four complex samples */
    pDst += 8u;
  }

  /* 0..3 complex samples are left for the scalar loop */
  count = numSamples % 0x4u;

#else

  /* Cortex-M0 path: the scalar loop processes every sample */
  count = numSamples;


#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  for (; count > 0u; count--)
  {
    /* real part is copied unchanged */
    *pDst++ = *pSrc++;

    /* imaginary part is negated; INT32_MIN has no positive counterpart and clamps */
    imag = *pSrc++;
    if (imag == INT32_MIN)
    {
      *pDst++ = INT32_MAX;
    }
    else
    {
      *pDst++ = -imag;
    }
  }
}
Beispiel #6
0
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License

// Opcode "isub": pops the right operand, then the left operand, and pushes
// the difference left - right back onto the stack.
#define OP_NAME isub

#include <ngl_opcode_begin.c>

#ifdef OPCODE_BODY
#ifdef NGL_ARM
// ARM build: the subtraction is delegated to the __QSUB intrinsic.
// NOTE(review): this path reads the .uinteger field and pushes with
// ngl_val_uint, while the portable path below reads .integer and pushes
// with ngl_val_int -- confirm both produce equivalent stack values.
{
  ngl_val right = ngl_stack_pop(&stack);  // top of stack is the right-hand operand
  ngl_val left = ngl_stack_pop(&stack);
  ngl_stack_push(&stack, ngl_val_uint(__QSUB(left.uinteger, right.uinteger)));
}
#else
// Portable build: subtract in 64 bits so the intermediate cannot overflow,
// then clamp the result to [INT_MIN, INT_MAX] before pushing it.
{
  ngl_val right = ngl_stack_pop(&stack);  // top of stack is the right-hand operand
  ngl_val left = ngl_stack_pop(&stack);
  // Despite its name, 'sum' holds the difference left - right.
  int64_t sum = (int64_t)(left.integer) - (int64_t)(right.integer);
  if (sum > INT_MAX) {
    sum = INT_MAX;
  }
  if (sum < INT_MIN) {
    sum = INT_MIN;
  }
  ngl_stack_push(&stack, ngl_val_int(sum));
}
#endif
Beispiel #7
0
/**
 * Negates every element of a Q31 vector: pDst[n] = -pSrc[n].
 *
 * Each negation is computed as the saturating difference __QSUB(0, x).
 * Samples are processed eight at a time, followed by a scalar loop for the
 * remaining blockSize % 8 samples.
 *
 * @param pSrc       input samples
 * @param pDst       output samples
 * @param blockSize  number of samples to process
 */
void arm_negate_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q31_t in1, in2, in3, in4;                      /* Temporary variables, reused for both halves of a group of 8 */
  uint32_t blkCnt;                               /* loop counter */


  /*loop Unrolling */
  blkCnt = blockSize >> 3u;

  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
   ** A second loop below computes the remaining 0 to 7 samples.
   ** Reads, negations and writes are interleaved; the four inputs of each
   ** half are all read before the first output of that half is written. */
  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Negate and then store the results in the destination buffer. */
	/* read samples 0 and 1 from source */
	in1 = *pSrc;
	in2 = *(pSrc + 1);

	/* negate sample 0 */
	in1 = __QSUB(0, in1);

	/* read sample 2 from source */
	in3 = *(pSrc + 2);

	/* negate sample 1 */
	in2 = __QSUB(0, in2);

	/* read sample 3 from source */
	in4 = *(pSrc + 3);

	/* negate sample 2 */
	in3 = __QSUB(0, in3);

	/* store result 0 to destination */
	*pDst = in1;

	/* negate sample 3 */
	in4 = __QSUB(0, in4);

	/* store results 1 to 3 to destination */
	*(pDst + 1) = in2;
	*(pDst + 2) = in3;
	*(pDst + 3) = in4;

	/* read samples 4 and 5 from source */
	in1 = *(pSrc + 4);
	in2 = *(pSrc + 5);

	/* negate sample 4 */
	in1 = __QSUB(0, in1);

	/* read sample 6 from source */
	in3 = *(pSrc + 6);

	/* negate sample 5 */
	in2 = __QSUB(0, in2);

	/* read sample 7 from source */
	in4 = *(pSrc + 7);

	/* negate sample 6 */
	in3 = __QSUB(0, in3);

	/* store result 4 to destination */
	*(pDst + 4) = in1;

	/* negate sample 7 */
	in4 = __QSUB(0, in4);

	/* store result 5 to destination */
	*(pDst + 5) = in2;

	/* increment source by 8 to process next samples */
	pSrc += 8u;

	/* store results 6 and 7 to destination */
	*(pDst + 6) = in3;
	*(pDst + 7) = in4;

	/* increment destination by 8 */
	pDst += 8u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    /* C = -A */
    /* Negate and then store the result in the destination buffer. */
    in1 = *pSrc++;
    *pDst++ = __QSUB(0, in1);

    /* Decrement the loop counter */
    blkCnt--;
  }
}
Beispiel #8
0
/**
 * Negates every element of a Q31 vector: pDst[n] = -pSrc[n].
 *
 * The most negative input (0x80000000) has no positive counterpart and is
 * mapped to 0x7fffffff.
 *
 * @param pSrc       input samples
 * @param pDst       output samples
 * @param blockSize  number of samples to process
 */
void arm_negate_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q31_t sample;                                  /* input of the scalar loop */
  uint32_t count;                                /* iterations still to run */

#ifndef ARM_MATH_CM0

  /* Cortex-M3/M4 path: four samples per pass, remainder handled below */
  q31_t s0, s1, s2, s3;

  for (count = blockSize >> 2u; count > 0u; count--)
  {
    /* Fetch the four inputs before any output is written */
    s0 = *pSrc++;
    s1 = *pSrc++;
    s2 = *pSrc++;
    s3 = *pSrc++;

    /* C = -A, computed as the saturating difference 0 - A */
    *pDst++ = __QSUB(0, s0);
    *pDst++ = __QSUB(0, s1);
    *pDst++ = __QSUB(0, s2);
    *pDst++ = __QSUB(0, s3);
  }

  /* 0..3 samples are left for the scalar loop */
  count = blockSize % 0x4u;

#else

  /* Cortex-M0 path: the scalar loop processes every sample */
  count = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */


  for (; count > 0u; count--)
  {
    sample = *pSrc++;

    if (sample == 0x80000000)
    {
      /* negating the minimum would overflow: clamp to the maximum */
      *pDst++ = 0x7fffffff;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
}
Beispiel #9
0
/**
 * Element-wise absolute value of a Q15 vector: pDst[n] = |pSrc[n]|.
 *
 * The result saturates: an input of 0x8000 (-1.0 in Q15) yields 0x7FFF for
 * every position in the buffer.
 *
 * Negative samples are negated with __QSUB16 in BOTH loops. The unrolled loop
 * previously used the 32-bit __QSUB on the sign-extended sample, which
 * returns +32768 for 0x8000; narrowing that to q15_t wrapped it back to
 * 0x8000, so the result depended on whether the sample fell in the unrolled
 * part or in the tail. __QSUB16 saturates each halfword, so the low halfword
 * kept by the store is the clamped value.
 *
 * @param pSrc       input samples
 * @param pDst       output samples
 * @param blockSize  number of samples to process
 */
void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  q31_t in1, in2, in3, in4;                      /* temporary input variables */
  q31_t out1, out2, out3, out4;                  /* temporary output variables */

  /* Loop unrolling */
  blkCnt = blockSize >> 3u;

  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
   ** A second loop below computes the remaining 0 to 7 samples.
   ** The original interleaving of reads and writes is kept unchanged. */
  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read inputs 0..2 */
    in1 = (q31_t)*pSrc;
    in2 = (q31_t)*(pSrc + 1);
    in3 = (q31_t)*(pSrc + 2);

    /* absolute value of sample 0; only the low halfword of __QSUB16 is kept by the store */
    out1 = (in1 > 0) ? in1 : __QSUB16(0, in1);

    /* read input 3 */
    in4 = (q31_t)*(pSrc + 3);

    /* absolute value of sample 1 */
    out2 = (in2 > 0) ? in2 : __QSUB16(0, in2);

    /* store result 0 to destination */
    *pDst = (q15_t)out1;

    /* absolute value of sample 2 */
    out3 = (in3 > 0) ? in3 : __QSUB16(0, in3);

    /* read input 4 */
    in1 = (q31_t)*(pSrc + 4);

    /* absolute value of sample 3 */
    out4 = (in4 > 0) ? in4 : __QSUB16(0, in4);

    /* store result 1 to destination */
    *(pDst + 1) = (q15_t)out2;

    /* read input 5 */
    in2 = (q31_t)*(pSrc + 5);

    /* absolute value of sample 4 */
    out1 = (in1 > 0) ? in1 : __QSUB16(0, in1);

    /* store result 2 to destination */
    *(pDst + 2) = (q15_t)out3;

    /* absolute value of sample 5 */
    out2 = (in2 > 0) ? in2 : __QSUB16(0, in2);

    /* read input 6 */
    in3 = (q31_t)*(pSrc + 6);

    /* store result 3 to destination */
    *(pDst + 3) = (q15_t)out4;

    /* read input 7 */
    in4 = (q31_t)*(pSrc + 7);

    /* absolute value of sample 6 */
    out3 = (in3 > 0) ? in3 : __QSUB16(0, in3);

    /* store result 4 to destination */
    *(pDst + 4) = (q15_t)out1;

    /* absolute value of sample 7 */
    out4 = (in4 > 0) ? in4 : __QSUB16(0, in4);

    /* store results 5 and 6 to destination */
    *(pDst + 5) = (q15_t)out2;
    *(pDst + 6) = (q15_t)out3;

    /* increment source pointer by 8 */
    pSrc += 8u;

    /* store result 7 to destination */
    *(pDst + 7) = (q15_t)out4;

    /* increment destination pointer by 8 */
    pDst += 8u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    /* C = |A| */
    /* Read the input */
    in1 = *pSrc++;

    /* Calculate absolute value of input and then store the result in the destination buffer. */
    *pDst++ = (in1 > 0) ? in1 : __QSUB16(0, in1);

    /* Decrement the loop counter */
    blkCnt--;
  }
}
Beispiel #10
0
/**
 * Element-wise saturating subtraction of two Q31 matrices:
 * C(m,n) = A(m,n) - B(m,n).
 *
 * @param pSrcA  first input matrix (minuend)
 * @param pSrcB  second input matrix (subtrahend)
 * @param pDst   output matrix
 * @return ARM_MATH_SUCCESS once all elements are written. When
 *         ARM_MATH_MATRIX_CHECK is defined and the row/column counts of the
 *         three matrices differ, ARM_MATH_SIZE_MISMATCH is returned and
 *         nothing is written.
 */
arm_status arm_mat_sub_q31(
  const arm_matrix_instance_q31 * pSrcA,
  const arm_matrix_instance_q31 * pSrcB,
  arm_matrix_instance_q31 * pDst)
{
  q31_t *pA = pSrcA->pData;                       /* read cursor in matrix A */
  q31_t *pB = pSrcB->pData;                       /* read cursor in matrix B */
  q31_t *pC = pDst->pData;                        /* write cursor in the output matrix */
  q31_t a0, b0;                                   /* operand pair (also used by the scalar loop) */

#ifndef ARM_MATH_CM0_FAMILY

  q31_t a1, b1;                                   /* second operand pair of the unrolled loop */
  q31_t diff0, diff1;                             /* pending results of the unrolled loop */

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  uint32_t total;                                 /* number of elements in each matrix */
  uint32_t count;                                 /* iterations still to run */

#ifdef ARM_MATH_MATRIX_CHECK
  /* All three matrices must have identical dimensions */
  if((pSrcA->numRows != pSrcB->numRows) ||
     (pSrcA->numCols != pSrcB->numCols) ||
     (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
  {
    return (ARM_MATH_SIZE_MISMATCH);
  }
#endif

  /* The matrices are walked as flat arrays */
  total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#ifndef ARM_MATH_CM0_FAMILY

  /* Cortex-M3/M4 path: four elements per pass, with reads, subtractions
   * and writes of neighbouring elements interleaved. */
  for (count = total >> 2u; count > 0u; count--)
  {
    /* element 0: read both operands */
    a0 = pA[0];
    b0 = pB[0];

    /* element 1: read A */
    a1 = pA[1];

    /* element 0: saturating difference */
    diff0 = __QSUB(a0, b0);

    /* element 1: read B */
    b1 = pB[1];

    /* element 2: read A */
    a0 = pA[2];

    /* element 1: saturating difference */
    diff1 = __QSUB(a1, b1);

    /* element 2: read B */
    b0 = pB[2];

    /* elements 0 and 1: write results */
    pC[0] = diff0;
    pC[1] = diff1;

    /* element 3: read both operands */
    a1 = pA[3];
    b1 = pB[3];

    /* elements 2 and 3: saturating differences */
    diff0 = __QSUB(a0, b0);
    diff1 = __QSUB(a1, b1);

    /* elements 2 and 3: write results */
    pC[2] = diff0;
    pC[3] = diff1;

    /* move all cursors to the next group of four */
    pA += 4u;
    pB += 4u;
    pC += 4u;
  }

  /* 0..3 elements are left for the scalar loop */
  count = total % 0x4u;

#else

  /* Cortex-M0 path: the scalar loop processes every element */
  count = total;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  for (; count > 0u; count--)
  {
    /* C(m,n) = A(m,n) - B(m,n), saturated */
    a0 = *pA++;
    b0 = *pB++;

    a0 = __QSUB(a0, b0);

    *pC++ = a0;
  }

  return (ARM_MATH_SUCCESS);
}
Beispiel #11
0
/**
 * Element-wise absolute value of a Q7 vector: pDst[n] = |pSrc[n]|.
 *
 * The result saturates: an input of 0x80 (-1.0 in Q7) yields 0x7f on both the
 * DSP and the portable path.
 *
 * The DSP path negates with __QSUB8, which saturates each byte lane, so the
 * low byte kept by the (q7_t) cast is the clamped value. The 32-bit __QSUB
 * used previously returned +128 for 0x80, which the cast wrapped back to
 * 0x80, contradicting the portable path.
 *
 * @param pSrc       input samples
 * @param pDst       output samples
 * @param blockSize  number of samples to process
 */
void arm_abs_q7(
  const q7_t * pSrc,
        q7_t * pDst,
        uint32_t blockSize)
{
        uint32_t blkCnt;                               /* Loop counter */
        q7_t in;                                       /* Temporary input variable */

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* C = |A| */

    /* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
    in = *pSrc++;
#if defined (ARM_MATH_DSP)
    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif

    in = *pSrc++;
#if defined (ARM_MATH_DSP)
    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif

    in = *pSrc++;
#if defined (ARM_MATH_DSP)
    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif

    in = *pSrc++;
#if defined (ARM_MATH_DSP)
    *pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C = |A| */

    /* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
    in = *pSrc++;
#if defined (ARM_MATH_DSP)
    *pDst++ = (in > 0) ? in : (q7_t) __QSUB8(0, in);
#else
    *pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif

    /* Decrement loop counter */
    blkCnt--;
  }

}