void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL = vmovq_n_f32(aGainL);
  float32x4_t vscaleR = vmovq_n_f32(aGainR);

  if (aIsOnTheLeft) {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmulq_f32(vinR0, vscaleR);
      voutR1 = vmulq_f32(vinR1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  } else {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmulq_f32(vinL0, vscaleL);
      voutL1 = vmulq_f32(vinL1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  }
}
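For reference, the fixed-gain NEON loop above reduces to the following per-sample scalar form. This is only an illustrative sketch (the function name is made up, and WEBAUDIO_BLOCK_SIZE is assumed to be the usual 128-frame Web Audio block), not the library's actual scalar fallback.

#include <stdbool.h>
#include <stdint.h>

#ifndef WEBAUDIO_BLOCK_SIZE
#define WEBAUDIO_BLOCK_SIZE 128  // assumed Web Audio block size
#endif

// Illustrative scalar equivalent of the fixed-gain NEON loop above.
static void PanStereoToStereoFixedGain_Scalar(const float* aInputL, const float* aInputR,
                                              float aGainL, float aGainR, bool aIsOnTheLeft,
                                              float* aOutputL, float* aOutputR) {
  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; ++i) {
    if (aIsOnTheLeft) {
      aOutputL[i] = aInputL[i] + aInputR[i] * aGainL;  // fold the right channel into the left
      aOutputR[i] = aInputR[i] * aGainR;
    } else {
      aOutputL[i] = aInputL[i] * aGainL;
      aOutputR[i] = aInputR[i] + aInputL[i] * aGainR;  // fold the left channel into the right
    }
  }
}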
static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled ?
      kExtendedErrorThreshold : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger),
                                     vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger),
                                     vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;
    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (abs_ef > error_threshold) {
      abs_ef = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= abs_ef;
      ef[1][i] *= abs_ef;
    }

    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
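vdivq_f32 and vsqrtq_f32 are real intrinsics only on AArch64; 32-bit ARM NEON has no vector divide or square-root instruction, so a build for ARMv7 typically supplies local helpers built from reciprocal estimates plus Newton-Raphson refinement. A minimal sketch of such helpers (the _approx names are made up):

#include <arm_neon.h>

// Approximate a / b: refine a reciprocal estimate of b, then multiply.
static float32x4_t vdivq_f32_approx(float32x4_t a, float32x4_t b) {
  float32x4_t r = vrecpeq_f32(b);       // rough estimate of 1/b
  r = vmulq_f32(vrecpsq_f32(b, r), r);  // Newton-Raphson step: r *= (2 - b*r)
  r = vmulq_f32(vrecpsq_f32(b, r), r);  // second refinement step
  return vmulq_f32(a, r);
}

// Approximate sqrt(x) as x * (1/sqrt(x)). Assumes x > 0; a zero input
// yields 0 * inf = NaN with this naive refinement.
static float32x4_t vsqrtq_f32_approx(float32x4_t x) {
  float32x4_t r = vrsqrteq_f32(x);                     // rough estimate of 1/sqrt(x)
  r = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, r), r), r);  // r *= (3 - x*r*r)/2
  r = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, r), r), r);  // second refinement step
  return vmulq_f32(x, r);
}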
void
AudioBlockCopyChannelWithScale_NEON(const float* aInput,
                                    float aScale,
                                    float* aOutput)
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i+4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i+8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i+12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i+4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i+8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i+12), vout3);
  }
}
Example #4
void test_vmovQ_nf32 (void)
{
  float32x4_t out_float32x4_t;
  float32_t arg0_float32_t;

  out_float32x4_t = vmovq_n_f32 (arg0_float32_t);
}
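The GCC test above only checks that the intrinsic compiles (the argument is deliberately left uninitialized). Semantically, vmovq_n_f32 broadcasts a single float32_t into all four lanes of a float32x4_t, the same operation as vdupq_n_f32. A small standalone sketch that makes the result visible:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  float out[4];
  float32x4_t v = vmovq_n_f32(2.5f);  // every lane becomes 2.5f
  vst1q_f32(out, v);
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  // prints 2.5 four times
  return 0;
}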
void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
  ASSERT_ALIGNED(aBlock);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  uint32_t vectorSize = aSize - dif;
  uint32_t i = 0;
  for (; i < vectorSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aBlock[i] *= aScale;
  }
}
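A hedged usage sketch: aSize does not have to be a multiple of 16, since the last aSize % 16 samples go through the scalar tail loop. The setup below is illustrative only; ASSERT_ALIGNED presumably requires 16-byte alignment for the vld1q/vst1q accesses.

#include <stdint.h>

static void ScaleExample(void) {
  // 100 is deliberately not a multiple of 16: 96 samples take the NEON
  // path, the remaining 4 take the scalar tail loop.
  float buffer[100] __attribute__((aligned(16)));
  for (uint32_t i = 0; i < 100; ++i) {
    buffer[i] = 1.0f;
  }
  AudioBufferInPlaceScale_NEON(buffer, 0.5f, 100);  // every sample becomes 0.5f
}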
Example #6
/* f32x4 mm mul */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
	int i, k, j;

	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_f32(0);

			for (j = 0; j < T; j+=4)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				neon_a0 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_f32(A + j_T);

				neon_b = vld1q_f32(B + k_Row + j);
				neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
				neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
				neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
				neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

				neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);

				vst1q_lane_f32(C + k_Row + i, neon_c, 0);
				vst1q_lane_f32(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_f32(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_f32(C + k_Row + i + 3, neon_c, 3);

			}
		}
	}
}
void AudioBufferAddWithScale_NEON(const float* aInput,
                                  float aScale,
                                  float* aOutput,
                                  uint32_t aSize)
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  aSize -= dif;
  unsigned i = 0;
  for (; i < aSize; i+=16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i+4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i+8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i+12));

    vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
    vout1 = vld1q_f32(ADDRESS_OF(aOutput, i+4));
    vout2 = vld1q_f32(ADDRESS_OF(aOutput, i+8));
    vout3 = vld1q_f32(ADDRESS_OF(aOutput, i+12));

    vout0 = vmlaq_f32(vout0, vin0, vscale);
    vout1 = vmlaq_f32(vout1, vin1, vscale);
    vout2 = vmlaq_f32(vout2, vin2, vscale);
    vout3 = vmlaq_f32(vout3, vin3, vscale);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i+4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i+8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i+12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aOutput[i] += aInput[i]*aScale;
  }
}
Example #8
/* f32x4 mv mul */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;

	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_f32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_f32(A + j);
			neon_a1 = vld1q_f32(A + j + Row);
			neon_a2 = vld1q_f32(A + j + 2 * Row);
			neon_a3 = vld1q_f32(A + j + 3 * Row);

			neon_b = vld1q_f32(B + k);
			neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
			neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
			neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
			neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

			neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);

		}

		vst1q_f32(C + i, neon_c);
	}
}
Example #9
float32x4_t test_vmovq_n_f32(float32_t v1) {
  // CHECK: test_vmovq_n_f32
  return vmovq_n_f32(v1);
  // CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
}
static void OverdriveAndSuppressNEON(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve,
                                                        vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve,
                                                          vec_hNl);
    const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger),
                                         vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    {
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
      const float32x4_t vec_overDriveSm_overDriveCurve =
          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
      vst1q_f32(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
      vst1q_f32(&efw[0][i], vec_efw_re);
      vst1q_f32(&efw[1][i], vec_efw_im);
    }
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }

    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
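vpowq_f32 is not a standard NEON intrinsic; the code above assumes a local helper (WebRTC's aec_core_neon.c builds one from log2/exp2 polynomial approximations). A simple, unoptimized stand-in with the same element-wise semantics (illustrative only):

#include <arm_neon.h>
#include <math.h>

// Element-wise base[i]^expo[i], done via scalar powf per lane. Correct but
// slow; a production helper would approximate exp2(b * log2(a)) in vector form.
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  float base[4], expo[4];
  vst1q_f32(base, a);
  vst1q_f32(expo, b);
  for (int i = 0; i < 4; ++i) {
    base[i] = powf(base[i], expo[i]);
  }
  return vld1q_f32(base);
}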
static void FilterAdaptationNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  float fft[PART_LEN2];
  int i;
  for (i = 0; i < num_partitions; i++) {
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    int j;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load x_fft_buf and e_fft.
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]);
      const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]);
      // Calculate the product of conjugate(x_fft_buf) by e_fft.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b) = aRe * bIm - aIm * bRe
      const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re);
      const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im);
      const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re);
      // Interleave real and imaginary parts.
      const float32x4x2_t g_n_h = vzipq_f32(e, f);
      // Store
      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
    }
    // ... and fixup the first imaginary entry.
    fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN],
                   -x_fft_buf[1][xPos + PART_LEN],
                   e_fft[0][PART_LEN],
                   e_fft[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      const float scale = 2.0f / PART_LEN2;
      const float32x4_t scale_ps = vmovq_n_f32(scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
        vst1q_f32(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      const float wt1 = h_fft_buf[1][pos];
      h_fft_buf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
        float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);

        vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re);
        vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im);
      }
      h_fft_buf[1][pos] = wt1;
    }
  }
}
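MulRe, used above to fix up the first imaginary entry (fft[1]), is assumed to be the usual complex-multiply helper; calling it with a negated imaginary part yields the conjugate product that the vector loop computes. A sketch of the pair it normally comes with:

// Assumed helpers: real and imaginary parts of (aRe + i*aIm) * (bRe + i*bIm).
static float MulRe(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bRe - aIm * bIm;
}

static float MulIm(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bIm + aIm * bRe;
}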
void AudioBlockPanStereoToStereo_NEON(
    const float aInputL[WEBAUDIO_BLOCK_SIZE],
    const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL[WEBAUDIO_BLOCK_SIZE],
    float aGainR[WEBAUDIO_BLOCK_SIZE],
    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aGainL);
  ASSERT_ALIGNED(aGainR);
  ASSERT_ALIGNED(aIsOnTheLeft);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL0, vscaleL1;
  float32x4_t vscaleR0, vscaleR1;
  float32x4_t onleft0, onleft1, notonleft0, notonleft1;

  float32x4_t zero = vmovq_n_f32(0);
  uint8x8_t isOnTheLeft;

  // MSVC warns that voutL0 and voutL1 may be used uninitialized. Every lane
  // is written by vsetq_lane_f32 below, so the warning is spurious, but
  // zero-initialize them anyway to keep the build quiet.
  voutL0 = zero;
  voutL1 = zero;

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
    vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
    vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

    vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
    vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

    vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
    vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));

    vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
    vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));

    // Load output with boolean "on the left" values. This assumes that
    // bools are stored as a single byte.
    isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3);

    // Convert the boolean values into masks by setting all bits to 1
    // if true.
    voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero);
    voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero);

    // The right output masks are the same as the left masks
    voutR0 = voutL0;
    voutR1 = voutL1;

    // Calculate left channel assuming isOnTheLeft
    onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
    onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL1);

    // Calculate left channel assuming not isOnTheLeft
    notonleft0 = vmulq_f32(vinL0, vscaleL0);
    notonleft1 = vmulq_f32(vinL1, vscaleL1);

    // Write results using previously stored masks
    voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0);
    voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1);

    // Calculate right channel assuming isOnTheLeft
    onleft0 = vmulq_f32(vinR0, vscaleR0);
    onleft1 = vmulq_f32(vinR1, vscaleR1);

    // Calculate right channel assuming not isOnTheLeft
    notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
    notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1);

    // Write results using previously stored masks
    voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0);
    voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1);

    vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
    vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
    vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
    vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
  }
}
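As with the fixed-gain version, the mask-and-blend logic above reduces to a simple per-sample scalar form; a sketch (hypothetical name, same WEBAUDIO_BLOCK_SIZE and header assumptions as before):

// Illustrative scalar equivalent of the per-sample panning loop above.
static void PanStereoToStereoPerSample_Scalar(
    const float aInputL[WEBAUDIO_BLOCK_SIZE], const float aInputR[WEBAUDIO_BLOCK_SIZE],
    const float aGainL[WEBAUDIO_BLOCK_SIZE], const float aGainR[WEBAUDIO_BLOCK_SIZE],
    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE], float aOutputL[WEBAUDIO_BLOCK_SIZE],
    float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; ++i) {
    if (aIsOnTheLeft[i]) {
      aOutputL[i] = aInputL[i] + aInputR[i] * aGainL[i];
      aOutputR[i] = aInputR[i] * aGainR[i];
    } else {
      aOutputL[i] = aInputL[i] * aGainL[i];
      aOutputR[i] = aInputR[i] + aInputL[i] * aGainR[i];
    }
  }
}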