// Codegen test for vuzpq_f32: the directive comments inside the body expect a
// uzp1/uzp2 pair operating on .4s vectors. Each directive must stay on its own
// line; when the body is collapsed onto one line, the first line comment
// swallows the return statement and the closing brace.
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  // CHECK-LABEL: test_vuzpq_f32
  return vuzpq_f32(a, b);
  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
/**
 * @brief Adds the products of four complex pairs to split accumulators.
 *
 * Loads eight floats (four interleaved re/im pairs) from each input,
 * de-interleaves them and adds a[k] * b[k] lane by lane to the accumulators.
 *
 * @param[in]     pA      four complex values of the first operand
 * @param[in]     pB      four complex values of the second operand
 * @param[in,out] pAccRe  lane-wise running sum of the real parts
 * @param[in,out] pAccIm  lane-wise running sum of the imaginary parts
 */
static void arm_cmplx_dot_accum4_f32(
  const float32_t * pA,
  const float32_t * pB,
  float32x4_t * pAccRe,
  float32x4_t * pAccIm)
{
  /* After the unzip, val[0] holds the four real parts and val[1] the four
   * imaginary parts. */
  const float32x4x2_t a = vuzpq_f32(vld1q_f32(pA), vld1q_f32(pA + 4));
  const float32x4x2_t b = vuzpq_f32(vld1q_f32(pB), vld1q_f32(pB + 4));

  /* (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ai*br + ar*bi) */
  const float32x4_t re = vsubq_f32(vmulq_f32(a.val[0], b.val[0]),
                                   vmulq_f32(a.val[1], b.val[1]));
  const float32x4_t im = vaddq_f32(vmulq_f32(a.val[1], b.val[0]),
                                   vmulq_f32(a.val[0], b.val[1]));

  *pAccRe = vaddq_f32(re, *pAccRe);
  *pAccIm = vaddq_f32(im, *pAccIm);
}

/**
 * @brief Sum of the element-wise complex products of two vectors.
 *
 * Computes sum over i of A[i] * B[i] (no conjugation) for interleaved
 * (real, imaginary) inputs and writes a single complex result.
 *
 * Sixteen samples per iteration are spread over four independent vector
 * accumulators, remaining groups of four go to the first accumulator, and
 * the last one to three samples are handled in scalar code. The accumulator
 * assignment and the reduction order are part of the numerical behaviour
 * (float addition is not associative), so they must not be rearranged.
 *
 * NOTE(review): the snippet this definition was recovered from begins with
 * "// __INLINE", so the qualifier is treated as commented out here. Restore
 * it if the original definition was meant to be inline.
 *
 * @param[in]  pSrcA       first input, 2 * numSamples floats
 * @param[in]  pSrcB       second input, 2 * numSamples floats
 * @param[out] pDst        receives two floats: real part, then imaginary part
 * @param[in]  numSamples  number of complex samples in each input
 */
void arm_cmplx_mult_cmplx_f32_dot(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t numSamples)
{
  float32_t a, b, c, d;            /* real/imaginary parts in the scalar tail */
  float32x4_t acc1Re, acc1Im;      /* accumulators, one pair per group of four */
  float32x4_t acc2Re, acc2Im;
  float32x4_t acc3Re, acc3Im;
  float32x4_t acc4Re, acc4Im;
  float sum_real, sum_img;         /* scalar totals */
  uint32_t blkCnt;                 /* loop counter */

  acc1Re = vdupq_n_f32(0.0f);
  acc1Im = vdupq_n_f32(0.0f);
  acc2Re = vdupq_n_f32(0.0f);
  acc2Im = vdupq_n_f32(0.0f);
  acc3Re = vdupq_n_f32(0.0f);
  acc3Im = vdupq_n_f32(0.0f);
  acc4Re = vdupq_n_f32(0.0f);
  acc4Im = vdupq_n_f32(0.0f);

  /* Main loop: 16 complex samples (32 floats per input) per iteration.
   * Instruction scheduling across the four groups is left to the compiler. */
  for (blkCnt = numSamples >> 4u; blkCnt > 0u; blkCnt--)
  {
    arm_cmplx_dot_accum4_f32(pSrcA,       pSrcB,       &acc1Re, &acc1Im);
    arm_cmplx_dot_accum4_f32(pSrcA + 8u,  pSrcB + 8u,  &acc2Re, &acc2Im);
    arm_cmplx_dot_accum4_f32(pSrcA + 16u, pSrcB + 16u, &acc3Re, &acc3Im);
    arm_cmplx_dot_accum4_f32(pSrcA + 24u, pSrcB + 24u, &acc4Re, &acc4Im);
    pSrcA += 32u;
    pSrcB += 32u;
  }

  /* Remaining groups of four complex samples. */
  for (blkCnt = (numSamples & 15u) >> 2u; blkCnt > 0u; blkCnt--)
  {
    arm_cmplx_dot_accum4_f32(pSrcA, pSrcB, &acc1Re, &acc1Im);
    pSrcA += 8u;
    pSrcB += 8u;
  }

  /* Last one to three complex samples, without intrinsics. */
  sum_real = 0;
  sum_img = 0;
  for (blkCnt = numSamples & 3u; blkCnt > 0u; blkCnt--)
  {
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    sum_real += ((a * c) - (b * d));
    sum_img += ((a * d) + (b * c));
  }

  /* Combine the accumulators as (acc1 + acc2) + (acc3 + acc4). */
  acc1Re = vaddq_f32(acc1Re, acc2Re);
  acc1Im = vaddq_f32(acc1Im, acc2Im);
  acc2Re = vaddq_f32(acc3Re, acc4Re);
  acc2Im = vaddq_f32(acc3Im, acc4Im);
  acc1Re = vaddq_f32(acc1Re, acc2Re);
  acc1Im = vaddq_f32(acc1Im, acc2Im);

  /* Horizontal sum of the four lanes, added to the scalar tail result. */
  sum_real += vgetq_lane_f32(acc1Re, 0) + vgetq_lane_f32(acc1Re, 1)
            + vgetq_lane_f32(acc1Re, 2) + vgetq_lane_f32(acc1Re, 3);
  sum_img += vgetq_lane_f32(acc1Im, 0) + vgetq_lane_f32(acc1Im, 1)
           + vgetq_lane_f32(acc1Im, 2) + vgetq_lane_f32(acc1Im, 3);

  pDst[0] = sum_real;
  pDst[1] = sum_img;
}
// Adds a per-partition correction to h_fft_buf, computed from x_fft_buf and
// e_fft. Index [0] of each buffer holds real parts, index [1] imaginary parts.
//
// For every partition:
//   1. conj(x) * e is formed bin by bin and packed, interleaved, into fft;
//   2. fft goes through aec_rdft_inverse_128, its upper PART_LEN entries are
//      zeroed, and the lower PART_LEN entries are scaled by 2 / PART_LEN2;
//   3. fft goes through aec_rdft_forward_128 and is added to h_fft_buf.
static void FilterAdaptationNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  const float32x4_t scale_ps = vmovq_n_f32(2.0f / PART_LEN2);
  float fft[PART_LEN2];
  int part;

  for (part = 0; part < num_partitions; part++) {
    const float *x_re;
    const float *x_im;
    float *h_re = &h_fft_buf[0][part * PART_LEN1];
    float *h_im = &h_fft_buf[1][part * PART_LEN1];
    int x_block = part + x_fft_buf_block_pos;
    int k;
    float saved_h_im0;

    // x_fft_buf is read starting at block x_fft_buf_block_pos and wraps
    // around after num_partitions blocks.
    if (x_block >= num_partitions) {
      x_block -= num_partitions;
    }
    x_re = &x_fft_buf[0][x_block * PART_LEN1];
    x_im = &x_fft_buf[1][x_block * PART_LEN1];

    // Bins 0 .. PART_LEN - 1, four at a time.
    for (k = 0; k < PART_LEN; k += 4) {
      const float32x4_t xr = vld1q_f32(x_re + k);
      const float32x4_t xi = vld1q_f32(x_im + k);
      const float32x4_t er = vld1q_f32(&e_fft[0][k]);
      const float32x4_t ei = vld1q_f32(&e_fft[1][k]);
      // conj(x) * e = (xr * er + xi * ei) + j * (xr * ei - xi * er)
      const float32x4_t prod_re = vmlaq_f32(vmulq_f32(xr, er), xi, ei);
      const float32x4_t prod_im = vmlsq_f32(vmulq_f32(xr, ei), xi, er);
      // Re-interleave as re0, im0, re1, im1, ... before storing.
      const float32x4x2_t packed = vzipq_f32(prod_re, prod_im);
      vst1q_f32(fft + 2 * k, packed.val[0]);
      vst1q_f32(fft + 2 * k + 4, packed.val[1]);
    }
    // fft[1] is overwritten with the MulRe() result for bin PART_LEN (x with
    // its imaginary part negated). NOTE(review): MulRe presumably returns the
    // real part of the complex product - confirm against its definition.
    fft[1] = MulRe(x_re[PART_LEN], -x_im[PART_LEN],
                   e_fft[0][PART_LEN], e_fft[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(&fft[PART_LEN], 0, sizeof(float) * PART_LEN);

    // Scale the lower half that survived the zeroing.
    for (k = 0; k < PART_LEN; k += 4) {
      vst1q_f32(&fft[k], vmulq_f32(vld1q_f32(&fft[k]), scale_ps));
    }

    aec_rdft_forward_128(fft);

    // fft[1] is added to the real part at bin PART_LEN. The vector loop below
    // would also add it to the imaginary part at bin 0, so that entry is
    // saved first and restored afterwards.
    saved_h_im0 = h_im[0];
    h_re[PART_LEN] += fft[1];
    for (k = 0; k < PART_LEN; k += 4) {
      const float32x4x2_t delta =
          vuzpq_f32(vld1q_f32(&fft[2 * k]), vld1q_f32(&fft[2 * k + 4]));
      vst1q_f32(h_re + k, vaddq_f32(vld1q_f32(h_re + k), delta.val[0]));
      vst1q_f32(h_im + k, vaddq_f32(vld1q_f32(h_im + k), delta.val[1]));
    }
    h_im[0] = saved_h_im0;
  }
}