static void SubbandCoherenceSSE2(AecCore* aec, float efw[2][PART_LEN1], float xfw[2][PART_LEN1], float* fft, float* cohde, float* cohxd) { float dfw[2][PART_LEN1]; int i; if (aec->delayEstCtr == 0) aec->delayIdx = PartitionDelay(aec); // Use delayed far. memcpy(xfw, aec->xfwBuf + aec->delayIdx * PART_LEN1, sizeof(xfw[0][0]) * 2 * PART_LEN1); // Windowed near fft WindowData(fft, aec->dBuf); aec_rdft_forward_128(fft); StoreAsComplex(fft, dfw); // Windowed error fft WindowData(fft, aec->eBuf); aec_rdft_forward_128(fft); StoreAsComplex(fft, efw); SmoothedPSD(aec, efw, dfw, xfw); { const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); // Subband coherence for (i = 0; i + 3 < PART_LEN1; i += 4) { const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); const __m128 vec_sdse = _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1)); const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1)); __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0); __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0); vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1)); vec_cohde = _mm_div_ps(vec_cohde, vec_sdse); vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1)); vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx); _mm_storeu_ps(&cohde[i], vec_cohde); _mm_storeu_ps(&cohxd[i], vec_cohxd); } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { cohde[i] = (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / (aec->sd[i] * aec->se[i] + 1e-10f); cohxd[i] = (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / (aec->sx[i] * aec->sd[i] + 1e-10f); } } }
static void SubbandCoherenceNEON(AecCore* aec, float efw[2][PART_LEN1], float xfw[2][PART_LEN1], float* fft, float* cohde, float* cohxd) { float dfw[2][PART_LEN1]; int i; if (aec->delayEstCtr == 0) aec->delayIdx = PartitionDelay(aec); // Use delayed far. memcpy(xfw, aec->xfwBuf + aec->delayIdx * PART_LEN1, sizeof(xfw[0][0]) * 2 * PART_LEN1); // Windowed near fft WindowData(fft, aec->dBuf); aec_rdft_forward_128(fft); StoreAsComplex(fft, dfw); // Windowed error fft WindowData(fft, aec->eBuf); aec_rdft_forward_128(fft); StoreAsComplex(fft, efw); SmoothedPSD(aec, efw, dfw, xfw); { const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); // Subband coherence for (i = 0; i + 3 < PART_LEN1; i += 4) { const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); const float32x4_t vec_se = vld1q_f32(&aec->se[i]); const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]); float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]); vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]); vec_cohde = vdivq_f32(vec_cohde, vec_sdse); vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]); vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx); vst1q_f32(&cohde[i], vec_cohde); vst1q_f32(&cohxd[i], vec_cohxd); } } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { cohde[i] = (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / (aec->sd[i] * aec->se[i] + 1e-10f); cohxd[i] = (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / (aec->sx[i] * aec->sd[i] + 1e-10f); } }