/**
 * @brief Vector scale and accumulate: A[] = alpha * B[] + beta * A[].
 *
 * @param dst[out]     the accumulating matrix A.
 * @param src[in]      the input matrix B.
 * @param alpha[in]    scale of B.
 * @param beta[in]     scale of A.
 * @param elemCnt[in]  number of elements to process.
 *
 * @return void.
 */
void neon_axpby(float *dst, const float *src, const float alpha,
                const float beta, const int elemCnt) {
  int i;
  // Main loop: 16 floats (four 128-bit NEON registers) per iteration.
  for (i = 0; i <= elemCnt - 16; i += 16) {
    float32x4_t q0 = vld1q_f32(src + i);
    float32x4_t q1 = vld1q_f32(src + i + 4);
    float32x4_t q2 = vld1q_f32(src + i + 8);
    float32x4_t q3 = vld1q_f32(src + i + 12);
    float32x4_t q4 = vld1q_f32(dst + i);
    float32x4_t q5 = vld1q_f32(dst + i + 4);
    float32x4_t q6 = vld1q_f32(dst + i + 8);
    float32x4_t q7 = vld1q_f32(dst + i + 12);

    // alpha * B
    q0 = vmulq_n_f32(q0, alpha);
    q1 = vmulq_n_f32(q1, alpha);
    q2 = vmulq_n_f32(q2, alpha);
    q3 = vmulq_n_f32(q3, alpha);

    // alpha * B + beta * A
    q0 = vmlaq_n_f32(q0, q4, beta);
    q1 = vmlaq_n_f32(q1, q5, beta);
    q2 = vmlaq_n_f32(q2, q6, beta);
    q3 = vmlaq_n_f32(q3, q7, beta);

    vst1q_f32(dst + i, q0);
    vst1q_f32(dst + i + 4, q1);
    vst1q_f32(dst + i + 8, q2);
    vst1q_f32(dst + i + 12, q3);
  }
  // Scalar tail for the remaining elements.
  for (; i < elemCnt; i++) {
    dst[i] = src[i] * alpha + dst[i] * beta;
  }
}
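// Illustrative scalar reference (not part of the original code; the name
// axpby_reference is invented here): computes the same
// A[] = alpha * B[] + beta * A[] update element by element. It can serve as a
// correctness check for neon_axpby; element counts that are not multiples of
// 16 exercise both the vectorized loop and its scalar tail.
static void axpby_reference(float *dst, const float *src, const float alpha,
                            const float beta, const int elemCnt) {
  int i;
  for (i = 0; i < elemCnt; i++) {
    dst[i] = alpha * src[i] + beta * dst[i];
  }
}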
// Updates the following smoothed Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is determined
// and acted upon.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh =
      aec->extended_filter_enabled
          ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
          : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);
    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }

  {
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total =
        vpadd_f32(vget_low_f32(vec_sdSum), vget_high_f32(vec_sdSum));
    vec_seSum_total =
        vpadd_f32(vget_low_f32(vec_seSum), vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // Scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] =
        ptrGCoh[0] * aec->sx[i] +
        ptrGCoh[1] * WEBRTC_SPL_MAX(
                         xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
                         WebRtcAec_kMinFarendPSD);
    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] =
        ptrGCoh[0] * aec->sxd[i][1] +
        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
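// Illustrative helper (not part of WebRTC; the name horizontal_sum_f32x4 is
// invented here): isolates the horizontal-add pattern used above to collapse
// a float32x4_t accumulator into a single float. vpadd_f32 adds adjacent
// pairs, so two passes reduce {A, B, C, D} to A+B+C+D in both lanes, and
// lane 0 is extracted. On AArch64, vaddvq_f32(v) performs the same reduction
// in one intrinsic.
#include <arm_neon.h>

static float horizontal_sum_f32x4(float32x4_t v) {
  float32x2_t sum = vpadd_f32(vget_low_f32(v), vget_high_f32(v));  // {A+B, C+D}
  sum = vpadd_f32(sum, sum);  // {A+B+C+D, A+B+C+D}
  return vget_lane_f32(sum, 0);
}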
/**
 * @brief Matrix element-wise multiply with a vector (broadcast across rows):
 *        C[] = alpha * A[] * B[] + beta * C[].
 *
 * @param dst[out]      the result matrix C.
 * @param src1[in]      the input matrix A.
 * @param src2[in]      the input vector B.
 * @param alpha[in]     scale of A * B.
 * @param beta[in]      scale of C.
 * @param dimM[in]      row count.
 * @param dimN[in]      column count.
 * @param leadingN[in]  the aligned column count (row stride).
 *
 * @return void.
 */
void neon_dotMulVec(float *dst, const float *src1, const float *src2,
                    const float alpha, const float beta, const int dimM,
                    const int dimN, const int leadingN) {
  float *mat0 = (float *)src1;
  float *mat1 = (float *)src1 + leadingN;
  float *mat2 = (float *)src1 + leadingN * 2;
  float *mat3 = (float *)src1 + leadingN * 3;
  float *mat4 = (float *)src2;
  float *mat8 = dst;
  float *mat9 = dst + leadingN;
  float *matA = dst + leadingN * 2;
  float *matB = dst + leadingN * 3;
  int i = 0;

  // Main loop: four rows at a time, four columns per NEON register.
  for (i = 0; i <= dimM - 4; i += 4) {
    int j = 0;
    for (j = 0; j <= dimN - 4; j += 4) {
      float32x4_t q0 = vld1q_f32(mat0 + j);
      float32x4_t q1 = vld1q_f32(mat1 + j);
      float32x4_t q2 = vld1q_f32(mat2 + j);
      float32x4_t q3 = vld1q_f32(mat3 + j);
      float32x4_t q4 = vld1q_f32(mat4 + j);
      float32x4_t q8 = vld1q_f32(mat8 + j);
      float32x4_t q9 = vld1q_f32(mat9 + j);
      float32x4_t qA = vld1q_f32(matA + j);
      float32x4_t qB = vld1q_f32(matB + j);

      // alpha * A
      q0 = vmulq_n_f32(q0, alpha);
      q1 = vmulq_n_f32(q1, alpha);
      q2 = vmulq_n_f32(q2, alpha);
      q3 = vmulq_n_f32(q3, alpha);

      // alpha * A * B
      q0 = vmulq_f32(q0, q4);
      q1 = vmulq_f32(q1, q4);
      q2 = vmulq_f32(q2, q4);
      q3 = vmulq_f32(q3, q4);

      // alpha * A * B + beta * C
      q0 = vmlaq_n_f32(q0, q8, beta);
      q1 = vmlaq_n_f32(q1, q9, beta);
      q2 = vmlaq_n_f32(q2, qA, beta);
      q3 = vmlaq_n_f32(q3, qB, beta);

      vst1q_f32(mat8 + j, q0);
      vst1q_f32(mat9 + j, q1);
      vst1q_f32(matA + j, q2);
      vst1q_f32(matB + j, q3);
    }
    // Scalar tail over the remaining columns.
    for (; j < dimN; j++) {
      mat8[j] = mat0[j] * mat4[j] * alpha + mat8[j] * beta;
      mat9[j] = mat1[j] * mat4[j] * alpha + mat9[j] * beta;
      matA[j] = mat2[j] * mat4[j] * alpha + matA[j] * beta;
      matB[j] = mat3[j] * mat4[j] * alpha + matB[j] * beta;
    }
    mat0 += leadingN * 4;
    mat1 += leadingN * 4;
    mat2 += leadingN * 4;
    mat3 += leadingN * 4;
    mat8 += leadingN * 4;
    mat9 += leadingN * 4;
    matA += leadingN * 4;
    matB += leadingN * 4;
  }

  // Remaining rows, one at a time.
  for (; i < dimM; i++) {
    int j = 0;
    for (j = 0; j <= dimN - 4; j += 4) {
      float32x4_t q0 = vld1q_f32(mat0 + j);
      float32x4_t q4 = vld1q_f32(mat4 + j);
      float32x4_t q8 = vld1q_f32(mat8 + j);
      q0 = vmulq_n_f32(q0, alpha);
      q0 = vmulq_f32(q0, q4);
      q0 = vmlaq_n_f32(q0, q8, beta);
      vst1q_f32(mat8 + j, q0);
    }
    for (; j < dimN; j++) {
      mat8[j] = mat0[j] * mat4[j] * alpha + mat8[j] * beta;
    }
    mat0 += leadingN;
    mat8 += leadingN;
  }
}
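// Usage sketch (illustrative, not from the original sources; the test data
// and sizes are invented here): applies neon_dotMulVec to a 3x5 matrix stored
// with a padded leading dimension of 8 floats per row. The vector B is
// broadcast across rows, so C[i][j] = alpha * A[i][j] * B[j] + beta * C[i][j].
// Assumes the neon_dotMulVec definition above is in scope.
#include <stdio.h>
#include <string.h>

int main(void) {
  enum { M = 3, N = 5, LD = 8 };  // leadingN >= dimN; padding is untouched.
  float A[M * LD], C[M * LD], B[LD];
  memset(A, 0, sizeof(A));
  memset(C, 0, sizeof(C));
  memset(B, 0, sizeof(B));
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      A[i * LD + j] = (float)(i + 1);
      C[i * LD + j] = 1.0f;
    }
  }
  for (int j = 0; j < N; j++) B[j] = (float)j;

  // C[i][j] = 2 * A[i][j] * B[j] + 0.5 * C[i][j]
  neon_dotMulVec(C, A, B, 2.0f, 0.5f, M, N, LD);

  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) printf("%6.1f ", C[i * LD + j]);
    printf("\n");
  }
  return 0;
}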