static void SubbandCoherenceSSE2(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd) {
  float dfw[2][PART_LEN1];
  int i;

  if (aec->delayEstCtr == 0)
    aec->delayIdx = PartitionDelay(aec);

  // Use delayed far.
  memcpy(xfw,
         aec->xfwBuf + aec->delayIdx * PART_LEN1,
         sizeof(xfw[0][0]) * 2 * PART_LEN1);

  // Windowed near fft
  WindowData(fft, aec->dBuf);
  aec_rdft_forward_128(fft);
  StoreAsComplex(fft, dfw);

  // Windowed error fft
  WindowData(fft, aec->eBuf);
  aec_rdft_forward_128(fft);
  StoreAsComplex(fft, efw);

  SmoothedPSD(aec, efw, dfw, xfw);

  {
    const __m128 vec_1eminus10 =  _mm_set1_ps(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]);
      const __m128 vec_se = _mm_loadu_ps(&aec->se[i]);
      const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]);
      const __m128 vec_sdse = _mm_add_ps(vec_1eminus10,
                                         _mm_mul_ps(vec_sd, vec_se));
      const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10,
                                         _mm_mul_ps(vec_sd, vec_sx));
      const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]);
      const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
      const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
      const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
      const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
                                              _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
                                              _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
                                              _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
                                              _MM_SHUFFLE(3, 1, 3, 1));
      __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
      __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
      vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
      vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
      vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
      vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
      _mm_storeu_ps(&cohde[i], vec_cohde);
      _mm_storeu_ps(&cohxd[i], vec_cohxd);
    }

    // scalar code for the remaining items.
    for (; i < PART_LEN1; i++) {
      cohde[i] =
          (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
          (aec->sd[i] * aec->se[i] + 1e-10f);
      cohxd[i] =
          (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
          (aec->sx[i] * aec->sd[i] + 1e-10f);
    }
  }
}
Beispiel #2
0
static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd) {
  float dfw[2][PART_LEN1];
  int i;

  if (aec->delayEstCtr == 0)
    aec->delayIdx = PartitionDelay(aec);

  // Use delayed far.
  memcpy(xfw,
         aec->xfwBuf + aec->delayIdx * PART_LEN1,
         sizeof(xfw[0][0]) * 2 * PART_LEN1);

  // Windowed near fft
  WindowData(fft, aec->dBuf);
  aec_rdft_forward_128(fft);
  StoreAsComplex(fft, dfw);

  // Windowed error fft
  WindowData(fft, aec->eBuf);
  aec_rdft_forward_128(fft);
  StoreAsComplex(fft, efw);

  SmoothedPSD(aec, efw, dfw, xfw);

  {
    const float32x4_t vec_1eminus10 =  vdupq_n_f32(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);

      vst1q_f32(&cohde[i], vec_cohde);
      vst1q_f32(&cohxd[i], vec_cohxd);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] =
        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
        (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}