Exemplo n.º 1
0
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  // CHECK-LABEL: test_vuzpq_f32
  return vuzpq_f32(a, b);
  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
// __INLINE
void arm_cmplx_mult_cmplx_f32_dot(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t numSamples)
{
  float32_t a, b, c, d;                          /* Temporary variables to store real and imaginary values */
  float32x4_t A1, A2;                            /* Temporary variables to store real and imaginary values of source buffer A */
  float32x4_t B1, B2;                            /* Temporary variables to store real and imaginary values of source buffer B */
  float32x4_t C1, C2, C3, C4;                    /* Temporary variables to store multiplication output */
  float32x4x2_t out1, out2, out3, out4;          /* Temporary variables to stroe output result */
  float32x4x2_t acc1, acc2, acc3, acc4;            /* Accumulators */
  float 	 sum_real, sum_img;            		/*  */
  uint32_t blkCnt;                               /* loop counters */

  /* Clear accumulators   VDUP.32 q0,r0
	Vector Duplicate duplicates a scalar into every element of the destination vector.
    */
  acc1.val[0] = vdupq_n_f32(0.0f);
  acc1.val[1] = vdupq_n_f32(0.0f);
  acc2.val[0] = vdupq_n_f32(0.0f);
  acc2.val[1] = vdupq_n_f32(0.0f);
  acc3.val[0] = vdupq_n_f32(0.0f);
  acc3.val[1] = vdupq_n_f32(0.0f);
  acc4.val[0] = vdupq_n_f32(0.0f);
  acc4.val[1] = vdupq_n_f32(0.0f);

  /* Loop over blockSize number of values */
  blkCnt = numSamples >> 4u;

  while(blkCnt > 0u)
  {
  	/*  A1, A2, B1, B2 each has two complex data. */
	
	/******************************************************/
	/* Step 1: Load data A1, A2, B1, B2 for group a:*/
    /* read 2 complex values at a time from source A buffer 
	float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr);
		 VLD1.32 {d0, d1}, [r0]
	*/
    A1 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;
    /* read 2 complex values at a time from source A buffer */
    A2 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;

    /* read 2 complex values at a time from source B buffer */
    B1 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;
    /* read 2 complex values at a time from source B buffer */
    B2 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;

	/******************************************************/
	/* Step 2: Unzip data Out1, Out2 for group a:*/
    /* unzip real and imag values
	A1: reala0, imga0, reala1, imga1
	A2: realb0, imgb0, realb1, imgb1
	out1.val0: reala0, reala1, realb0, realb1;
	out1.val1: imga0, imga1, imgb0, imgb1

	vuzpq_f32:
    	float32x4x2_t vuzpq_f32 (float32x4_t, float32x4_t) 
	Form of expected instruction(s): vuzp.32 q0, q1
	Vector Unzip de-interleaves the elements of two vectors. 
	*/
    out1 = vuzpq_f32(A1, A2);
    out2 = vuzpq_f32(B1, B2);

	/******************************************************/
	/* Step 1: Load data A1, A2, B1, B2 for group b:*/
    /* read 2 complex values at a time from source A buffer */
    A1 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;
    /* read 2 complex values at a time from source A buffer */
    A2 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;

    /* read 2 complex values at a time from source B buffer */
    B1 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;
    /* read 2 complex values at a time from source B buffer */
    B2 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;

	/******************************************************/
	/* Step 3: Compute data C1,C2,C3,C4 for group a:*/
    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
    /* vmulq_f32: VMUL.F32 q0,q0,q0
		val[0]: real
		val[1]: img
		C1 = a.real*b.real;  		C2 = a.img*b.img
		C3 = a.img*b.real;  		C4 = a.real*b.img
	*/    
	/* multiply 4 samples at a time from A1 real input with B1 real input 	*/
    C1 = vmulq_f32(out1.val[0], out2.val[0]);  
    /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */
    C2 = vmulq_f32(out1.val[1], out2.val[1]);
    /* multiply 4 samples at a time from A1 imaginary input with B1 real input */
    C3 = vmulq_f32(out1.val[1], out2.val[0]);
    /* multiply 4 samples at a time from A1 real input with B1 imaginary input */
    C4 = vmulq_f32(out1.val[0], out2.val[1]);
	/*  real: c1-c2; img: c3+c4 */

	/******************************************************/
	/* Step 2: Unzip data Out2, Out3 for group b:*/
    out2 = vuzpq_f32(A1, A2);
    out3 = vuzpq_f32(B1, B2);

	/******************************************************/
	/* Step 1: Load data A1, A2 for group c:*/
    /* read 2 complex values at a time from source A buffer */
    A1 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;
    /* read 2 complex values at a time from source A buffer */
    A2 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;

	/******************************************************/
	/* Step 4: Output or accumlate data for group a:*/
	/*  (a+bi)*(c+di) = (ac-bd)+(ad+bc)i*/
	/*  real: c1-c2; img: c3+c4 */
    /* subtract 4 samples at time from real result to imaginary result, got four real part */
	/*  
		C1 = a.real*b.real; 		C2 = a.img*b.img
		C3 = a.img*b.real;		C4 = a.real*b.img

		vaddq_f32: 
		VADD.F32 q0,q0,q0
	*/
    out1.val[0] = vsubq_f32(C1, C2);
	acc1.val[0] = vaddq_f32(out1.val[0], acc1.val[0]);  /* add by Hank */
    /* add real*imaginary result with imaginary*real result 4 at a time */
    out1.val[1] = vaddq_f32(C3, C4);
	acc1.val[1] = vaddq_f32(out1.val[1], acc1.val[1]); /* add by Hank */
	/* out1 is four complex product. */

	/******************************************************/
	/* Step 1: Load data B1, B2 for group c:*/
    /* read 2 complex values at a time from source B buffer */
    B1 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;
    /* read 2 complex values at a time from source B buffer */
    B2 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;

	/******************************************************/
	/* Step 3: Compute data C1,C2   for group b:*/
    /* multiply 4 samples at a time from A1 real input with B1 real input */
    C1 = vmulq_f32(out2.val[0], out3.val[0]);
    /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */
    C2 = vmulq_f32(out2.val[1], out3.val[1]);

	/******************************************************/
	/* Step 5: Store data for group a:*/
    /* Store 4 complex samples to destination buffer
             VST2.32 {d0, d2}, [r0]   */
    //vst2q_f32(pDst, out1);
    /* increment destination buffer by 8 */
    //pDst += 8u;

	/******************************************************/
	/* Step 3: Compute data  C3,C4 for group b:*/
    /* multiply 4 samples at a time from A1 imaginary input with B1 real input */
    C3 = vmulq_f32(out2.val[1], out3.val[0]);
    
                           /* multiply 4 samples at a time from A1 real input with B1 imaginary input */
	C4 = vmulq_f32(out2.val[0], out3.val[1]);

	/******************************************************/
	/* Step 2: Unzip data Out1, Out2 for group C:*/
    out3 = vuzpq_f32(A1, A2);
    out4 = vuzpq_f32(B1, B2);

	/******************************************************/
	/* Step 1: Load data A1, A2, B1, B2 for group d:*/
    /* read 4 complex values from source A buffer */
    A1 = vld1q_f32(pSrcA);
    pSrcA += 4u;
    A2 = vld1q_f32(pSrcA);
    pSrcA += 4u;

    /* read 4 complex values from source B buffer */
    B1 = vld1q_f32(pSrcB);
    pSrcB += 4u;
    B2 = vld1q_f32(pSrcB);
    pSrcB += 4u;

	/******************************************************/
	/* Step 4: Output or accumlate data for group b:*/
    /* subtract 4 samples at time from real result to imaginary result */
    out2.val[0] = vsubq_f32(C1, C2);
    /* add real*imaginary result with imaginary*real result 4 at a time */
    out2.val[1] = vaddq_f32(C3, C4);
	acc2.val[0] = vaddq_f32(out2.val[0], acc2.val[0]);  /* add by Hank */
	acc2.val[1] = vaddq_f32(out2.val[1], acc2.val[1]); /* add by Hank */

	/******************************************************/
	/* Step 3: Compute data C1,C2,C3,C4 for group c:*/
    /* multiply 4 samples at a time from A3 real input with B3 real input */
    C1 = vmulq_f32(out3.val[0], out4.val[0]);
    /* multiply 4 samples at a time from A3 imaginary input with B3 imaginary input */
    C2 = vmulq_f32(out3.val[1], out4.val[1]);
    /* multiply 4 samples at a time from A3 imaginary input with B3 real input */
    C3 = vmulq_f32(out3.val[1], out4.val[0]);
    /* multiply 4 samples at a time from A3 real input with B3 imaginary input */
    C4 = vmulq_f32(out3.val[0], out4.val[1]);

	/******************************************************/
	/* Step 2: Unzip data Out1, Out2 for group D:*/
    out1 = vuzpq_f32(A1, A2);
    out4 = vuzpq_f32(B1, B2);

	/******************************************************/
	/* Step 5: Store data for group b:*/
    /* Store 4 complex samples to destination buffer */
    //vst2q_f32(pDst, out2);
    /* increment destination buffer by 8 */
    //pDst += 8u;

	/******************************************************/
	/* Step 4: Output or accumlate data for group c:*/
    /* subtract 4 samples at time from real result to imaginary result */
    out3.val[0] = vsubq_f32(C1, C2);
    /* add real*imaginary result with imaginary*real result 4 at a time */
    out3.val[1] = vaddq_f32(C3, C4);
	acc3.val[0] = vaddq_f32(out3.val[0], acc3.val[0]);  /* add by Hank */
	acc3.val[1] = vaddq_f32(out3.val[1], acc3.val[1]); /* add by Hank */

	/******************************************************/
	/* Step 3: Compute data C1,C2,C3,C4 for group d:*/
    /* multiply 4 samples at a time from A1 real input with B1 real input */
    C1 = vmulq_f32(out1.val[0], out4.val[0]);
    /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */
    C2 = vmulq_f32(out1.val[1], out4.val[1]);

    /* multiply 4 samples at a time from A1 imaginary input with B1 real input */
    C3 = vmulq_f32(out1.val[1], out4.val[0]);
    /* multiply 4 samples at a time from A1 real input with B1 imaginary input */
    C4 = vmulq_f32(out1.val[0], out4.val[1]);

	/******************************************************/
	/* Step 5: Store data for group c:*/
    /* Store 4 complex samples to destination buffer */
    //vst2q_f32(pDst, out3);

	/******************************************************/
	/* Step 4: Output or accumlate data for group d:*/
    /* subtract 4 samples at time from real result to imaginary result */
    out4.val[0] = vsubq_f32(C1, C2);

    /* increment destination buffer by 8 */
    //pDst += 8u;

	/******************************************************/
	/* Step 4: Output or accumlate data for group d:*/
    /* add real*imaginary result with imaginary*real result 4 at a time */
    out4.val[1] = vaddq_f32(C3, C4);
	acc4.val[0] = vaddq_f32(out4.val[0], acc4.val[0]);  /* add by Hank */
	acc4.val[1] = vaddq_f32(out4.val[1], acc4.val[1]); /* add by Hank */

    /* zip real and imag values */
    //out4 = vzipq_f32(out4.val[0], out4.val[1]);

	/******************************************************/
	/* Step 5: Store data for group d:*/
    /* Store 4 complex samples to destination buffer */
    //vst1q_f32(pDst, out4.val[0]);
    //pDst += 4u;
    //vst1q_f32(pDst, out4.val[1]);
    //pDst += 4u;

    /* Decrement the numSamples loop counter */
    blkCnt--;
  }

  blkCnt = numSamples & 15u;
  blkCnt = blkCnt >> 2u;

  /* If the blockSize is not a multiple of 16, compute remaining output samples.     
   ** Compute multiple of 4 samples at a time in second loop.  
   ** and remaining 1 to 3 samples in third loop. */
  while(blkCnt > 0u)
  {
	/* Step 1: Load data A1, A2, B1, B2 */
	    /* read 4 complex values at a time from source A buffer */
	    A1 = vld1q_f32(pSrcA);
	    /* increment source A buffer by 8 */
	    pSrcA += 4u;
	    A2 = vld1q_f32(pSrcA);
	    pSrcA += 4u;
	    /* read 4 complex values at a time from source B buffer */
	    B1 = vld1q_f32(pSrcB);
	    /* increment source B buffer by 8 */
	    pSrcB += 4u;
	    B2 = vld1q_f32(pSrcB);
	    pSrcB += 4u;

	/* Step 2: Unzip data Out1, Out2 */
		/* Unzip data */
	    out1 = vuzpq_f32(A1, A2);
	    out2 = vuzpq_f32(B1, B2);

	/* Step 3: Compute data C1,C2,C3,C4 */
	    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
	    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
	    /* multiply 4 samples at a time from A1 real input with B1 real input */
	    C1 = vmulq_f32(out1.val[0], out2.val[0]);
	    /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */
	    C2 = vmulq_f32(out1.val[1], out2.val[1]);
	    /* multiply 4 samples at a time from A1 imaginary input with B1 real input */
	    C3 = vmulq_f32(out1.val[1], out2.val[0]);
	    /* multiply 4 samples at a time from A1 real input with B1 imaginary input */
	    C4 = vmulq_f32(out1.val[0], out2.val[1]);

	/* Step 4: Output or accumlate data for group d:*/
	    /* subtract 4 samples at time from real result to imaginary result */
	    out1.val[0] = vsubq_f32(C1, C2);
	    /* add real*imaginary result with imaginary*real result 4 at a time */
	    out1.val[1] = vaddq_f32(C3, C4);
		acc1.val[0] = vaddq_f32(out1.val[0], acc1.val[0]);  /* add by Hank */
		acc1.val[1] = vaddq_f32(out1.val[1], acc1.val[1]); /* add by Hank */

	    //out1 = vzipq_f32(out1.val[0], out1.val[1]);

	/* Step 5: Store data */
	    /* Store 4 complex samples to destination buffer */
	    //vst1q_f32(pDst, out1.val[0]);
	    //pDst += 4u;
	    //vst1q_f32(pDst, out1.val[1]);
	    //pDst += 4u;

    /* Decrement the numSamples loop counter */
    blkCnt--;
  }

  blkCnt = numSamples & 3u;

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.     
   ** No intrinsics is used. */
  sum_real =0;
  sum_img =0;
  while(blkCnt > 0u)
  {
    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store the result in the destination buffer. */
    sum_real += ((a * c) - (b * d));
    sum_img += ((a * d) + (b * c));

    /* Decrement the numSamples loop counter */
    blkCnt--;
  }

	/* add 4 accumulators */
	acc1.val[0] = vaddq_f32(acc1.val[0], acc2.val[0]);
	acc1.val[1] = vaddq_f32(acc1.val[1], acc2.val[1]);
	acc2.val[0] = vaddq_f32(acc3.val[0], acc4.val[0]);
	acc2.val[1] = vaddq_f32(acc3.val[1], acc4.val[1]);
	acc1.val[0] = vaddq_f32(acc1.val[0], acc2.val[0]);
	acc1.val[1] = vaddq_f32(acc1.val[1], acc2.val[1]);

	sum_real += vgetq_lane_f32(acc1.val[0], 0) + vgetq_lane_f32(acc1.val[0], 1)
		+ vgetq_lane_f32(acc1.val[0], 2) + vgetq_lane_f32(acc1.val[0], 3);
	sum_img += vgetq_lane_f32(acc1.val[1], 0) + vgetq_lane_f32(acc1.val[1], 1)
		+ vgetq_lane_f32(acc1.val[1], 2) + vgetq_lane_f32(acc1.val[1], 3);

	*pDst++=sum_real;
	*pDst++=sum_img;
Exemplo n.º 3
0
static void FilterAdaptationNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  float fft[PART_LEN2];
  int i;
  for (i = 0; i < num_partitions; i++) {
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    int j;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load x_fft_buf and e_fft.
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]);
      const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]);
      // Calculate the product of conjugate(x_fft_buf) by e_fft.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b)=  aRe * bIm - aIm * bRe
      const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re);
      const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im);
      const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re);
      // Interleave real and imaginary parts.
      const float32x4x2_t g_n_h = vzipq_f32(e, f);
      // Store
      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
    }
    // ... and fixup the first imaginary entry.
    fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN],
                   -x_fft_buf[1][xPos + PART_LEN],
                   e_fft[0][PART_LEN],
                   e_fft[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      const float scale = 2.0f / PART_LEN2;
      const float32x4_t scale_ps = vmovq_n_f32(scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
        vst1q_f32(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      const float wt1 = h_fft_buf[1][pos];
      h_fft_buf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
        float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);

        vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re);
        vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im);
      }
      h_fft_buf[1][pos] = wt1;
    }
  }
}