Exemplo n.º 1
0
void test_vst1s32 (void)
{
  int32_t *arg0_int32_t;
  int32x2_t arg1_int32x2_t;

  vst1_s32 (arg0_int32_t, arg1_int32x2_t);
}
Exemplo n.º 2
0
test_vdup_lane_s32 ()
{
  int32x2_t a;
  int32x2_t b;
  int i;
  int32_t c[2] = { 0, 1 };
  int32_t d[2];

  a = vld1_s32 (c);
  b = wrap_vdup_lane_s32_0 (a);
  vst1_s32 (d, b);
  for (i = 0; i < 2; i++)
    if (c[0] != d[i])
      return 1;

  b = wrap_vdup_lane_s32_1 (a);
  vst1_s32 (d, b);
  for (i = 0; i < 2; i++)
    if (c[1] != d[i])
      return 1;
  return 0;
}
Exemplo n.º 3
0
test_vreinterpret_s32_f64 ()
{
  float64x1_t a;
  int32x2_t b;
  float64_t c[1] = { PI_F64 };
  int32_t d[2] = { 0x54442D18, 0x400921FB };
  int32_t e[2];
  int i;

  a = vld1_f64 (c);
  b = wrap_vreinterpret_s32_f64 (a);
  vst1_s32 (e, b);
  for (i = 0; i < 2; i++)
    if (d[i] != e[i])
      return 1;
  return 0;
};
Exemplo n.º 4
0
void test6(float *p, int32x2_t v) {
  return vst1_s32(p, v); // expected-warning {{incompatible pointer types}}
}
Exemplo n.º 5
0
inline void vst1(s32 * ptr, const int32x2_t   & v) { return vst1_s32(ptr, v); }
Exemplo n.º 6
0
// Contains a function for the core loop in the normalized lattice MA
// filter routine for iSAC codec, optimized for ARM Neon platform.
// It does:
//  for 0 <= n < HALF_SUBFRAMELEN - 1:
//    *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
//    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
// Output is not bit-exact with the reference C code, due to the replacement
// of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
// instructions. The difference should not be bigger than 1.
void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
                                    int16_t input1,  // Filter coefficient
                                    int32_t input2,  // Inverse coefficient
                                    int32_t* ptr0,   // Sample buffer
                                    int32_t* ptr1,   // Sample buffer
                                    int32_t* ptr2)   // Sample buffer
{
  int n = 0;
  int loop = (HALF_SUBFRAMELEN - 1) >> 3;
  int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;

  int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
  int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
  int32x4_t input2_v = vdupq_n_s32(input2);
  int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
  int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
  int32x4_t ptr0va, ptr1va, ptr2va;
  int32x4_t ptr0vb, ptr1vb, ptr2vb;

  // Unroll to process 8 samples at once.
  for (n = 0; n < loop; n++) {
    ptr0va = vld1q_s32(ptr0);
    ptr0vb = vld1q_s32(ptr0 + 4);
    ptr0 += 8;

    ptr2va = vld1q_s32(ptr2);
    ptr2vb = vld1q_s32(ptr2 + 4);

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
    tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
    tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2b = vaddq_s32(tmp0b, ptr2vb);
    tmp2a = vshlq_n_s32(tmp2a, 15);
    tmp2b = vshlq_n_s32(tmp2b, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
    ptr2vb = vqrdmulhq_s32(tmp2b, input2_v);

    vst1q_s32(ptr2, ptr2va);
    vst1q_s32(ptr2 + 4, ptr2vb);
    ptr2 += 8;

    // Calculate tmp3 = ptr2v * input0.
    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
    tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1va = vaddq_s32(tmp1a, tmp3a);
    ptr1vb = vaddq_s32(tmp1b, tmp3b);

    vst1q_s32(ptr1, ptr1va);
    vst1q_s32(ptr1 + 4, ptr1vb);
    ptr1 += 8;
  }

  // Process four more samples.
  if (loop_tail & 0x4) {
    ptr0va = vld1q_s32(ptr0);
    ptr2va = vld1q_s32(ptr2);
    ptr0 += 4;

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2a = vshlq_n_s32(tmp2a, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);

    vst1q_s32(ptr2, ptr2va);
    ptr2 += 4;

    // Calculate tmp3 = *(ptr2) * input0.
    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1va = vaddq_s32(tmp1a, tmp3a);

    vst1q_s32(ptr1, ptr1va);
    ptr1 += 4;
  }

  // Process two more samples.
  if (loop_tail & 0x2) {
    int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
    int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
    ptr0v_tail = vld1_s32(ptr0);
    ptr2v_tail = vld1_s32(ptr2);
    ptr0 += 2;

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
    tmp2_tail = vshl_n_s32(tmp2_tail, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v));

    vst1_s32(ptr2, ptr2v_tail);
    ptr2 += 2;

    // Calculate tmp3 = *(ptr2) * input0.
    tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);

    vst1_s32(ptr1, ptr1v_tail);
    ptr1 += 2;
  }

  // Process one more sample.
  if (loop_tail & 0x1) {
    int16_t t16a = (int16_t)(input2 >> 16);
    int16_t t16b = (int16_t)input2;
    if (t16b < 0) t16a++;
    int32_t tmp32a;
    int32_t tmp32b;

    // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
    tmp32b = *ptr2 + tmp32a;
    *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
                       (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));

    // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
    tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
    *ptr1 = tmp32a + tmp32b;
  }
Exemplo n.º 7
0
void BQ_2I_D32F32C30_TRC_WRA_01 (           Biquad_Instance_t       *pInstance,
                                            LVM_INT32                    *pDataIn,
                                            LVM_INT32                    *pDataOut,
                                            LVM_INT16                    NrSamples)


    {
#if !(defined  __ARM_HAVE_NEON)
        LVM_INT32 ynL,ynR,templ,tempd;
        LVM_INT16 ii;
        PFilter_State pBiquadState = (PFilter_State) pInstance;

         for (ii = NrSamples; ii != 0; ii--)
         {


            /**************************************************************************
                            PROCESSING OF THE LEFT CHANNEL
            ***************************************************************************/
            /* ynL= ( A2 (Q30) * x(n-2)L (Q0) ) >>30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[2],ynL,30)

            /* ynL+= ( A1 (Q30) * x(n-1)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[0],templ,30)
            ynL+=templ;

            /* ynL+= ( A0 (Q30) * x(n)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[2],*pDataIn,templ,30)
            ynL+=templ;

             /* ynL+= (-B2 (Q30) * y(n-2)L (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[6],templ,30)
            ynL+=templ;

            /* ynL+= (-B1 (Q30) * y(n-1)L (Q0) ) >> 30 in Q0 */
            MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[4],templ,30)
            ynL+=templ;

            /**************************************************************************
                            PROCESSING OF THE RIGHT CHANNEL
            ***************************************************************************/
            /* ynR= ( A2 (Q30) * x(n-2)R (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[0],pBiquadState->pDelays[3],ynR,30)

            /* ynR+= ( A1 (Q30) * x(n-1)R (Q0) ) >> 30  in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[1],pBiquadState->pDelays[1],templ,30)
            ynR+=templ;

            /* ynR+= ( A0 (Q30) * x(n)R (Q0) ) >> 30 in Q0*/
            tempd=*(pDataIn+1);
            MUL32x32INTO32(pBiquadState->coefs[2],tempd,templ,30)
            ynR+=templ;

            /* ynR+= (-B2 (Q30) * y(n-2)R (Q0) ) >> 30 in Q0*/
            MUL32x32INTO32(pBiquadState->coefs[3],pBiquadState->pDelays[7],templ,30)
            ynR+=templ;

            /* ynR+= (-B1 (Q30) * y(n-1)R (Q0) ) >> 30 in Q0 */
            MUL32x32INTO32(pBiquadState->coefs[4],pBiquadState->pDelays[5],templ,30)
            ynR+=templ;

            /**************************************************************************
                            UPDATING THE DELAYS
            ***************************************************************************/
            pBiquadState->pDelays[7]=pBiquadState->pDelays[5]; /* y(n-2)R=y(n-1)R*/
            pBiquadState->pDelays[6]=pBiquadState->pDelays[4]; /* y(n-2)L=y(n-1)L*/
            pBiquadState->pDelays[3]=pBiquadState->pDelays[1]; /* x(n-2)R=x(n-1)R*/
            pBiquadState->pDelays[2]=pBiquadState->pDelays[0]; /* x(n-2)L=x(n-1)L*/
            pBiquadState->pDelays[5]=(LVM_INT32)ynR; /* Update y(n-1)R in Q0*/
            pBiquadState->pDelays[4]=(LVM_INT32)ynL; /* Update y(n-1)L in Q0*/
            pBiquadState->pDelays[0]=(*pDataIn); /* Update x(n-1)L in Q0*/
            pDataIn++;
            pBiquadState->pDelays[1]=(*pDataIn); /* Update x(n-1)R in Q0*/
            pDataIn++;

            /**************************************************************************
                            WRITING THE OUTPUT
            ***************************************************************************/
            *pDataOut=(LVM_INT32)ynL; /* Write Left output in Q0*/
            pDataOut++;
            *pDataOut=(LVM_INT32)ynR; /* Write Right ouput in Q0*/
            pDataOut++;


        }
#else
        LVM_INT16 ii=0;
	      
		PFilter_State pBiquadState = (PFilter_State) pInstance;

		int32x2_t A2 = vdup_n_s32(pBiquadState->coefs[0]);
		int32x2_t A1 = vdup_n_s32(pBiquadState->coefs[1]);
		int32x2_t A0 = vdup_n_s32(pBiquadState->coefs[2]);
		int32x2_t B2 = vdup_n_s32(pBiquadState->coefs[3]);
		int32x2_t B1 = vdup_n_s32(pBiquadState->coefs[4]);
		
		int32x2_t X_2 = vld1_s32(&pBiquadState->pDelays[2]);
		int32x2_t X_1 = vld1_s32(&pBiquadState->pDelays[0]);
		int32x2_t Y_2 = vld1_s32(&pBiquadState->pDelays[6]);
		int32x2_t Y_1 = vld1_s32(&pBiquadState->pDelays[4]);

		for(ii=0; ii<NrSamples; ii++){
		  int32x2_t s = vld1_s32(pDataIn);
		  int64x2_t r = vmull_s32(A2, X_2);
		  r = vmlal_s32(r, A1, X_1);
		  r = vmlal_s32(r, A0, s);
		  r = vmlal_s32(r, B2, Y_2);
		  r = vmlal_s32(r, B1, Y_1);
		  int32_t ll =(int32_t)( vgetq_lane_s64(r, 0) >> 30);
		  int32_t rr =(int32_t)( vgetq_lane_s64(r, 1) >> 30);
		  pDataIn += 2;
		  *pDataOut ++ = ll;
		  *pDataOut ++ = rr;
		  int32_t tmp1, tmp2;
		  tmp1 = vget_lane_s32(X_1, 0);
		  tmp2 = vget_lane_s32(X_1, 1);
		  vset_lane_s32(tmp1, X_2, 0);
		  vset_lane_s32(tmp2, X_2, 1);
		  tmp1 = vget_lane_s32(Y_1, 0);
		  tmp2 = vget_lane_s32(Y_1, 1);
		  vset_lane_s32(tmp1, Y_2, 0);
		  vset_lane_s32(tmp2, Y_2, 1);

		  vset_lane_s32(ll, Y_1, 0);
		  vset_lane_s32(rr, Y_1, 1);
		  
		  tmp1 = vget_lane_s32(s, 0);
		  tmp2 = vget_lane_s32(s, 1);
		  vset_lane_s32(tmp1, X_1, 0);
		  vset_lane_s32(tmp2, X_1, 1);
		}
        vst1_s32(&pBiquadState->pDelays[2], X_2);
        vst1_s32(&pBiquadState->pDelays[0], X_1);
        vst1_s32(&pBiquadState->pDelays[6], Y_2);
        vst1_s32(&pBiquadState->pDelays[4], Y_1);
#endif         

    }