/**
 * Converts blockSize q7_t samples read from pSrc into q31_t samples written
 * to pDst.  Every output holds its input sample in bits 31..24 (sample << 24)
 * with the lower 24 bits cleared.
 */
void arm_q7_to_q31(
  q7_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q7_t *pIn = pSrc;                              /* read cursor over the source */
  uint32_t remaining;                            /* samples left for the scalar loop */

#ifndef ARM_MATH_CM0_FAMILY

  /* Word-at-a-time path; the original source labels it as the Cortex-M4/M3 build. */
  q31_t packed;                                  /* four q7 samples fetched as one word */
  uint32_t groups;                               /* number of 4-sample groups left */

  for(groups = blockSize >> 2u; groups > 0u; groups--)
  {
    /* Fetch four samples at once and advance the read cursor by four bytes */
    packed = *__SIMD32(pIn)++;

#ifndef ARM_MATH_BIG_ENDIAN

    /* Rotate each byte lane into bits 31..24 in turn, then mask off the rest */
    pDst[0] = (__ROR(packed, 8)) & 0xFF000000;
    pDst[1] = (__ROR(packed, 16)) & 0xFF000000;
    pDst[2] = (__ROR(packed, 24)) & 0xFF000000;
    pDst[3] = (packed & 0xFF000000);

#else

    /* Same lanes, emitted in the opposite order for big-endian builds */
    pDst[0] = (packed & 0xFF000000);
    pDst[1] = (__ROR(packed, 24)) & 0xFF000000;
    pDst[2] = (__ROR(packed, 16)) & 0xFF000000;
    pDst[3] = (__ROR(packed, 8)) & 0xFF000000;

#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    pDst += 4;
  }

  /* Whatever does not fill a whole group of four (0..3 samples) */
  remaining = blockSize & 0x3u;

#else

  /* Build with ARM_MATH_CM0_FAMILY defined: every sample goes through the scalar loop */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

  for(; remaining > 0u; remaining--)
  {
    /* C = (q31_t) A << 24 */
    *pDst++ = (q31_t) * pIn++ << 24;
  }

}
/* Example #2 */
/* 0 */
/**
 * Converts blockSize q7_t samples read from pSrc into q15_t samples written
 * to pDst.  Every output is its input sample shifted left by 8 bits.
 */
void arm_q7_to_q15(
    q7_t * pSrc,
    q15_t * pDst,
    uint32_t blockSize)
{
    q7_t *pIn = pSrc;                              /* read cursor over the source */
    uint32_t remaining;                            /* samples left for the scalar loop */

#ifndef ARM_MATH_CM0_FAMILY
    /* Word-at-a-time path; the original source labels it as the Cortex-M4/M3 build. */
    q31_t packed;                                  /* four q7 samples fetched as one word */
    q31_t lanes13, lanes02;                        /* byte lanes 1,3 and 0,2 widened to halfwords */
    q31_t first, second;                           /* output words in store order */
    uint32_t groups;                               /* number of 4-sample groups left */

    for(groups = blockSize >> 2u; groups > 0u; groups--)
    {
        /* Fetch four samples at once and advance the read cursor by four bytes */
        packed = *__SIMD32(pIn)++;

        /* Widen byte lanes 1 and 3 (via the rotate) and lanes 0 and 2 to 16 bits each */
        lanes13 = __SXTB16(__ROR(packed, 8));
        lanes02 = __SXTB16(packed);

        /* Move each sample into the upper byte of its halfword; the mask removes
         * the bits that the 32-bit shift carried across the halfword boundary. */
        lanes13 = (lanes13 << 8u) & 0xFF00FF00;
        lanes02 = (lanes02 << 8u) & 0xFF00FF00;

        /* Re-interleave the halfwords into memory order */
#ifndef ARM_MATH_BIG_ENDIAN

        second = __PKHTB(lanes13, lanes02, 16);
        first = __PKHBT(lanes02, lanes13, 16);

#else

        first = __PKHTB(lanes13, lanes02, 16);
        second = __PKHBT(lanes02, lanes13, 16);

#endif

        /* Each store writes two q15 samples and advances pDst by two */
        *__SIMD32(pDst)++ = first;
        *__SIMD32(pDst)++ = second;
    }

    /* Whatever does not fill a whole group of four (0..3 samples) */
    remaining = blockSize & 0x3u;

#else

    /* Build with ARM_MATH_CM0_FAMILY defined: every sample goes through the scalar loop */
    remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

    for(; remaining > 0u; remaining--)
    {
        /* C = (q15_t) A << 8 */
        *pDst++ = (q15_t) * pIn++ << 8;
    }

}
/**
 * Dot product of two q7_t vectors.
 *
 * @param pSrcA      first input vector (blockSize samples)
 * @param pSrcB      second input vector (blockSize samples)
 * @param blockSize  number of samples in each vector
 * @param result     receives sum(A[i] * B[i]); the original source describes
 *                   the stored value as being in 18.14 format
 */
void arm_dot_prod_q7(
    q7_t * pSrcA,
    q7_t * pSrcB,
    uint32_t blockSize,
    q31_t * result)
{
    uint32_t blkCnt;                               /* loop counter */

    q31_t sum = 0;                                 /* running dot-product accumulator */

#ifndef ARM_MATH_CM0_FAMILY

    /* Run the below code for Cortex-M4 and Cortex-M3 */

    q31_t input1, input2;                          /* four packed q7 samples from A and B */
    q31_t inA1, inA2, inB1, inB2;                  /* sample pairs widened to 16 bits */

    /* Process four samples per iteration */
    blkCnt = blockSize >> 2u;

    while(blkCnt > 0u)
    {
        /* read 4 samples at a time from each source and advance both cursors */
        input1 = *__SIMD32(pSrcA)++;
        input2 = *__SIMD32(pSrcB)++;

        /* widen byte lanes 1,3 (via the rotate) and lanes 0,2 to 16 bits each */
        inA1 = __SXTB16(__ROR(input1, 8));
        inA2 = __SXTB16(input1);
        inB1 = __SXTB16(__ROR(input2, 8));
        inB2 = __SXTB16(input2);

        /* dual 16x16 multiply-accumulate: two products per call */
        sum = __SMLAD(inA1, inB1, sum);
        sum = __SMLAD(inA2, inB2, sum);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Samples that do not fill a whole group of four (0..3) */
    blkCnt = blockSize % 0x4u;

#else

    /* Run the below code for Cortex-M0: every sample takes the scalar loop */
    blkCnt = blockSize;

#endif /* #ifndef ARM_MATH_CM0_FAMILY */

    while(blkCnt > 0u)
    {
        /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
        /* A plain multiply is required here.  __SMLAD must not be fed single
         * samples: a negative q7 value promoted to 32 bits has 0xFFFF (-1) in
         * its upper halfword, and SMLAD would add that upper-halfword product
         * as well, giving an extra +1 whenever both samples are negative. */
        sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Store the result in the destination buffer in 18.14 format */
    *result = sum;
}
/* Example #4 */
/* 0 */
/**
 * Sum of squares of a q7_t vector.
 *
 * Reads blockSize samples from pSrc and stores sum(A[i] * A[i]) in *pResult;
 * the original source describes the stored value as being in 18.14 format.
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t total = 0;                               /* running sum of squares */
  q7_t sample;                                   /* one sample in the scalar loop */
  uint32_t remaining;                            /* samples left for the scalar loop */

#ifndef ARM_MATH_CM0

  /* Word-at-a-time path; the original source labels it as the Cortex-M4/M3 build. */
  q31_t packed;                                  /* four q7 samples fetched as one word */
  q31_t lanes13, lanes02;                        /* byte lanes 1,3 and 0,2 widened to halfwords */
  uint32_t groups;                               /* number of 4-sample groups left */

  for(groups = blockSize >> 2u; groups > 0u; groups--)
  {
    /* Fetch four samples at once and advance pSrc by four bytes */
    packed = *__SIMD32(pSrc)++;

    lanes13 = __SXTB16(__ROR(packed, 8));
    lanes02 = __SXTB16(packed);

    /* Square both halfwords of each operand and add them to the total */
    total = __SMLAD(lanes13, lanes13, total);
    total = __SMLAD(lanes02, lanes02, total);
  }

  /* Whatever does not fill a whole group of four (0..3 samples) */
  remaining = blockSize & 0x3u;

#else

  /* Build with ARM_MATH_CM0 defined: every sample goes through the scalar loop */
  remaining = blockSize;

#endif /* #ifndef ARM_MATH_CM0 */

  for(; remaining > 0u; remaining--)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    sample = *pSrc++;
    total += ((q15_t) sample * sample);
  }

  /* Store the result in 18.14 format  */
  *pResult = total;
}
/**
  * @brief  AES CCM Authentication TAG generation.
  * @note   Must be called with the handle in the READY state and in the
  *         PROCESS phase; otherwise an error flag is added to
  *         hcryp->ErrorCode and HAL_ERROR is returned.
  * @param  hcryp: pointer to a CRYP_HandleTypeDef structure that contains
  *         the configuration information for CRYP module
  * @param  AuthTag: Pointer to the authentication buffer; four 32-bit words
  *         are written to it
  * @param  Timeout: Timeout duration (HAL_MAX_DELAY waits without limit)
  * @retval HAL status
  */
HAL_StatusTypeDef HAL_CRYPEx_AESCCM_GenerateAuthTAG(CRYP_HandleTypeDef *hcryp, uint32_t *AuthTag, uint32_t Timeout)
{
  uint32_t ctr0[4] = {0};
  uint32_t word;
  uint32_t tickstart;
  uint32_t i;

  if(hcryp->State != HAL_CRYP_STATE_READY)
  {
    /* Busy error code field: accumulate so earlier error flags are kept */
    hcryp->ErrorCode |= HAL_CRYP_ERROR_BUSY;
    return HAL_ERROR;
  }

  /* Process locked */
  __HAL_LOCK(hcryp);

  /* Change the CRYP peripheral state */
  hcryp->State = HAL_CRYP_STATE_BUSY;

  /* The tag can only be generated once the process phase has been reached */
  if(hcryp->Phase != CRYPEx_PHASE_PROCESS)
  {
    /* Disable the peripheral */
    __HAL_CRYP_DISABLE(hcryp);

    /* Sequence error code field */
    hcryp->ErrorCode |= HAL_CRYP_ERROR_AUTH_TAG_SEQUENCE;

    /* Change the CRYP peripheral state */
    hcryp->State = HAL_CRYP_STATE_READY;

    /* Process unlocked */
    __HAL_UNLOCK(hcryp);
    return HAL_ERROR;
  }

  /* Change the CRYP phase */
  hcryp->Phase = CRYPEx_PHASE_FINAL;

  /* Disable CRYP to start the final phase */
  __HAL_CRYP_DISABLE(hcryp);

  /* Select final phase & ALGODIR bit must be set to '0'. */
  MODIFY_REG(hcryp->Instance->CR, CRYP_CR_GCM_CCMPH|CRYP_CR_ALGODIR, CRYP_PHASE_FINAL|CRYP_OPERATINGMODE_ENCRYPT);

  /* Enable the CRYP peripheral */
  __HAL_CRYP_ENABLE(hcryp);

  /* Build the counter block CTR0 from B0 (first and last words are masked) */
  ctr0[0] = (hcryp->Init.B0[0]) & CRYP_CCM_CTR0_0;
  ctr0[1] = hcryp->Init.B0[1];
  ctr0[2] = hcryp->Init.B0[2];
  ctr0[3] = hcryp->Init.B0[3] & CRYP_CCM_CTR0_3;

  /* Write the counter block in the IN FIFO; each word has to be swapped
     according to the DATATYPE */
  for(i = 0U; i < 4U; i++)
  {
    word = ctr0[i];

    if(hcryp->Init.DataType == CRYP_DATATYPE_8B)
    {
      word = __REV(word);
    }
    else if(hcryp->Init.DataType == CRYP_DATATYPE_16B)
    {
      word = __ROR(word, 16U);
    }
    else if(hcryp->Init.DataType == CRYP_DATATYPE_1B)
    {
      word = __RBIT(word);
    }
    else
    {
      /* Any other data type: the word is written unchanged */
    }

    hcryp->Instance->DIN = word;
  }

  /* Wait for OFNE flag to be raised */
  tickstart = HAL_GetTick();
  while(HAL_IS_BIT_CLR(hcryp->Instance->SR, CRYP_FLAG_OFNE))
  {
    /* Check for the Timeout */
    if((Timeout != HAL_MAX_DELAY) &&
       ((Timeout == 0U) || ((HAL_GetTick() - tickstart) > Timeout)))
    {
      /* Disable the CRYP peripheral */
      __HAL_CRYP_DISABLE(hcryp);

      /* Change state */
      hcryp->ErrorCode |= HAL_CRYP_ERROR_TIMEOUT;
      hcryp->State = HAL_CRYP_STATE_READY;

      /* Process unlocked */
      __HAL_UNLOCK(hcryp);
      return HAL_ERROR;
    }
  }

  /* Read the four words of the authentication TAG from DOUT */
  for(i = 0U; i < 4U; i++)
  {
    AuthTag[i] = hcryp->Instance->DOUT;
  }

  /* Disable CRYP before the handle is released to other users */
  __HAL_CRYP_DISABLE(hcryp);

  /* Change the CRYP peripheral state */
  hcryp->State = HAL_CRYP_STATE_READY;

  /* Process unlocked */
  __HAL_UNLOCK(hcryp);

  /* Return function status */
  return HAL_OK;
}
/**
  * @brief  generate the GCM authentication TAG.
  * @note   Must be called with the handle in the READY state and in the
  *         PROCESS phase; otherwise an error flag is added to
  *         hcryp->ErrorCode and HAL_ERROR is returned.
  * @param  hcryp: pointer to a CRYP_HandleTypeDef structure that contains
  *         the configuration information for CRYP module
  * @param  AuthTag: Pointer to the authentication buffer; four 32-bit words
  *         are written to it
  * @param  Timeout: Timeout duration (HAL_MAX_DELAY waits without limit)
  * @retval HAL status
  */
HAL_StatusTypeDef HAL_CRYPEx_AESGCM_GenerateAuthTAG(CRYP_HandleTypeDef *hcryp, uint32_t *AuthTag, uint32_t Timeout)
{
  uint64_t headerlength = hcryp->Init.HeaderSize * 32U; /* Header length in bits */
  uint64_t inputlength = (hcryp->Size) * 32U;           /* input length in bits */
  uint32_t *tagword = AuthTag;                          /* write cursor over the tag buffer */
  uint32_t headerword = 0U;                             /* header length as written to DIN */
  uint32_t payloadword = 0U;                            /* payload length as written to DIN */
  uint32_t knowntype = 1U;                              /* cleared when DataType matches no case */
  uint32_t tickstart = 0U;
  uint32_t remaining;

  if(hcryp->State != HAL_CRYP_STATE_READY)
  {
    /* Busy error code field */
    hcryp->ErrorCode |= HAL_CRYP_ERROR_BUSY;
    return HAL_ERROR;
  }

  /* Process locked */
  __HAL_LOCK(hcryp);

  /* Change the CRYP peripheral state */
  hcryp->State = HAL_CRYP_STATE_BUSY;

  /* The tag can only be generated once the process phase has been reached */
  if(hcryp->Phase != CRYPEx_PHASE_PROCESS)
  {
    /* Disable the Peripheral */
    __HAL_CRYP_DISABLE(hcryp);

    /* Sequence error code field */
    hcryp->ErrorCode |= HAL_CRYP_ERROR_AUTH_TAG_SEQUENCE;

    /* Change the CRYP peripheral state */
    hcryp->State = HAL_CRYP_STATE_READY;

    /* Process unlocked */
    __HAL_UNLOCK(hcryp);
    return HAL_ERROR;
  }

  /* Change the CRYP phase */
  hcryp->Phase = CRYPEx_PHASE_FINAL;

  /* Disable CRYP to start the final phase */
  __HAL_CRYP_DISABLE(hcryp);

  /* Select final phase */
  MODIFY_REG(hcryp->Instance->CR, CRYP_CR_GCM_CCMPH, CRYP_PHASE_FINAL);

  /* ALGODIR bit must be set to '0'. */
  hcryp->Instance->CR &=  ~CRYP_CR_ALGODIR;

  /* Enable the CRYP peripheral */
  __HAL_CRYP_ENABLE(hcryp);

  /* Prepare the low 32 bits of both lengths, swapped according to DataType */
  if(hcryp->Init.DataType == CRYP_DATATYPE_1B)
  {
    headerword = __RBIT((uint32_t)(headerlength));
    payloadword = __RBIT((uint32_t)(inputlength));
  }
  else if(hcryp->Init.DataType == CRYP_DATATYPE_8B)
  {
    headerword = __REV((uint32_t)(headerlength));
    payloadword = __REV((uint32_t)(inputlength));
  }
  else if(hcryp->Init.DataType == CRYP_DATATYPE_16B)
  {
    headerword = __ROR((uint32_t)headerlength, 16U);
    payloadword = __ROR((uint32_t)inputlength, 16U);
  }
  else if(hcryp->Init.DataType == CRYP_DATATYPE_32B)
  {
    headerword = (uint32_t)(headerlength);
    payloadword = (uint32_t)(inputlength);
  }
  else
  {
    /* Unrecognised data type: nothing is written to the input register */
    knowntype = 0U;
  }

  /* Write the number of bits in header (64 bits) followed by the number of bits
  in the payload; the upper 32 bits of each length are always written as zero */
  if(knowntype != 0U)
  {
    hcryp->Instance->DIN = 0U;
    hcryp->Instance->DIN = headerword;
    hcryp->Instance->DIN = 0U;
    hcryp->Instance->DIN = payloadword;
  }

  /* Wait for OFNE flag to be raised */
  tickstart = HAL_GetTick();
  while(HAL_IS_BIT_CLR(hcryp->Instance->SR, CRYP_FLAG_OFNE))
  {
    /* Check for the Timeout */
    if((Timeout != HAL_MAX_DELAY) &&
       ((Timeout == 0U) || ((HAL_GetTick() - tickstart) > Timeout)))
    {
      /* Disable the CRYP Peripheral */
      __HAL_CRYP_DISABLE(hcryp);

      /* Change state */
      hcryp->ErrorCode |= HAL_CRYP_ERROR_TIMEOUT;
      hcryp->State = HAL_CRYP_STATE_READY;

      /* Process unlocked */
      __HAL_UNLOCK(hcryp);
      return HAL_ERROR;
    }
  }

  /* Read the four words of the authentication TAG from DOUT */
  for(remaining = 4U; remaining > 0U; remaining--)
  {
    *tagword = hcryp->Instance->DOUT;
    tagword++;
  }

  /* Disable the peripheral */
  __HAL_CRYP_DISABLE(hcryp);

  /* Change the CRYP peripheral state */
  hcryp->State = HAL_CRYP_STATE_READY;

  /* Process unlocked */
  __HAL_UNLOCK(hcryp);

  /* Return function status */
  return HAL_OK;
}
/* Example #7 */
/* 0 */
/* File: arm_q7_to_q15.c  Project: JGSuw/DIP */
/**
 * Converts blockSize q7_t samples read from pSrc into q15_t samples written
 * to pDst.  Every output is its input sample shifted left by 8 bits.
 *
 * Eight samples are converted per iteration of the main loop; the 0..7
 * samples left over are converted one at a time.
 */
void arm_q7_to_q15(
  q7_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q7_t *pIn = pSrc;                              /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */
  q31_t in, next;                                /* first and second packed group of four samples */
  q31_t in1, in2;                                /* byte lanes 1,3 and 0,2 widened to halfwords */
  q31_t out1, out2;                              /* packed output words */
  const q31_t mask = 0xFF00FF00;                 /* keeps the upper byte of each halfword */

  /* loop unrolling: 8 outputs per iteration */
  blkCnt = blockSize >> 3u;

  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */
    /* Read all eight samples before anything is stored */
    in = *__SIMD32(pIn)++;
    next = *__SIMD32(pIn)++;

    /* ---- samples 0..3 ---- */

#ifdef CCS

    in1 = __SXTB16(in, 8);
    in2 = __SXTB16(in, 0);

#else

    /* rotate by 8 so lanes 1 and 3 are widened; lanes 0 and 2 are widened directly */
    in1 = __SXTB16(__ROR(in, 8));
    in2 = __SXTB16(in);

#endif /* #ifdef CCS */

    /* Shift each sample into the upper byte of its halfword.  The 32-bit
     * shift pushes the low halfword's sign-extension bits into the high
     * halfword, so BOTH operands must be masked before packing. */
    in1 = (in1 << 8u) & mask;
    in2 = (in2 << 8u) & mask;

    /* pack two 16 bit values */
    out1 = __PKHTB(in1, in2, 16);
    out2 = __PKHBT(in2, in1, 16);

    /* store two q15_t samples at a time to destination */
#ifndef ARM_MATH_BIG_ENDIAN

    _SIMD32_OFFSET(pDst + 2) = out1;
    _SIMD32_OFFSET(pDst) = out2;

#else

    _SIMD32_OFFSET(pDst) = out1;
    _SIMD32_OFFSET(pDst + 2) = out2;

#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* ---- samples 4..7 ---- */

#ifdef CCS

    in1 = __SXTB16(next, 8);
    in2 = __SXTB16(next, 0);

#else

    in1 = __SXTB16(__ROR(next, 8));
    in2 = __SXTB16(next);

#endif /* #ifdef CCS */

    /* Mask in1 itself here: packing the unmasked value would leave 0xFF in
     * the low byte of one output whenever the lane-1 sample is negative. */
    in1 = (in1 << 8u) & mask;
    in2 = (in2 << 8u) & mask;

    /* pack two 16 bit values */
    out1 = __PKHTB(in1, in2, 16);
    out2 = __PKHBT(in2, in1, 16);

    /* store two q15_t samples at a time to destination */
#ifndef ARM_MATH_BIG_ENDIAN

    _SIMD32_OFFSET(pDst + 6) = out1;
    _SIMD32_OFFSET(pDst + 4) = out2;

#else

    _SIMD32_OFFSET(pDst + 4) = out1;
    _SIMD32_OFFSET(pDst + 6) = out2;

#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* increment destination pointer */
    pDst += 8u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    /* C = (q15_t) A << 8 */
    /* convert from q7 to q15 and then store the results in the destination buffer */
    *pDst++ = (q15_t) * pIn++ << 8;

    /* Decrement the loop counter */
    blkCnt--;
  }

}
/* Example #8 */
/* 0 */
/* File: arm_power_q7.c  Project: JGSuw/DIP */
/**
 * Sum of squares of a q7_t vector.
 *
 * Reads blockSize samples from pSrc and stores sum(A[i] * A[i]) in *pResult;
 * the original source describes the stored value as being in 18.14 format.
 * Eight samples are consumed per iteration of the main loop, using two
 * accumulators that are added together before the leftover samples.
 */
void arm_power_q7(
  q7_t * pSrc,
  uint32_t blockSize,
  q31_t * pResult)
{
  q31_t sum13 = 0;                               /* squares of byte lanes 1 and 3; also the final total */
  q31_t sum02 = 0;                               /* squares of byte lanes 0 and 2 */
  q31_t packed;                                  /* four q7 samples fetched as one word */
  q31_t lanes13, lanes02;                        /* byte lanes widened to halfwords */
  q7_t sample;                                   /* one sample in the scalar loop */
  uint32_t groups;                               /* number of 8-sample groups left */
  uint32_t remaining;                            /* samples left for the scalar loop */

  for(groups = blockSize >> 3u; groups > 0u; groups--)
  {
    /* ---- samples 0..3 ---- */
    packed = _SIMD32_OFFSET(pSrc);

#ifdef CCS

    lanes13 = __SXTB16(packed, 8);
    lanes02 = __SXTB16(packed, 0);

#else

    lanes13 = __SXTB16(__ROR(packed, 8));
    lanes02 = __SXTB16(packed);

#endif /* #ifdef CCS */

    /* Square both halfwords of each operand and add them to its accumulator */
    sum13 = __SMLAD(lanes13, lanes13, sum13);
    sum02 = __SMLAD(lanes02, lanes02, sum02);

    /* ---- samples 4..7 ---- */
    packed = _SIMD32_OFFSET(pSrc + 4);

#ifdef CCS

    lanes13 = __SXTB16(packed, 8);
    lanes02 = __SXTB16(packed, 0);

#else

    lanes13 = __SXTB16(__ROR(packed, 8));
    lanes02 = __SXTB16(packed);

#endif /* #ifdef CCS */

    sum13 = __SMLAD(lanes13, lanes13, sum13);
    sum02 = __SMLAD(lanes02, lanes02, sum02);

    /* update source buffer to process next samples */
    pSrc += 8u;
  }

  /* add accumulators */
  sum13 = sum13 + sum02;

  /* Whatever does not fill a whole group of eight (0..7 samples) */
  for(remaining = blockSize & 0x7u; remaining > 0u; remaining--)
  {
    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
    sample = *pSrc++;
    sum13 += ((q15_t) sample * sample);
  }

  /* Store the result in 18.14 format  */
  *pResult = sum13;
}