Example #1
/**
 * @brief   vector scale & accumulate: A[] = alpha * B[] + beta * A[].
 *
 * @param   dst[out]    the accumulating matrix A.
 *          src[in]     the input matrix B.
 *          alpha[in]   scale of B.
 *          beta[in]    scale of A.
 *          elemCnt[in] number of elements to process.
 *
 * @return  void.
 */
void neon_axpby(float *dst,
                const float *src,
                const float alpha,
                const float beta,
                const int elemCnt)
{
    int i;
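    // Vectorized loop: 16 floats per iteration using four 128-bit NEON registers.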
    for (i = 0; i <= elemCnt - 16; i += 16)
    {
        float32x4_t q0 = vld1q_f32(src + i);
        float32x4_t q1 = vld1q_f32(src + i + 4);
        float32x4_t q2 = vld1q_f32(src + i + 8);
        float32x4_t q3 = vld1q_f32(src + i + 12);
        float32x4_t q4 = vld1q_f32(dst + i);
        float32x4_t q5 = vld1q_f32(dst + i + 4);
        float32x4_t q6 = vld1q_f32(dst + i + 8);
        float32x4_t q7 = vld1q_f32(dst + i + 12);
        q0 = vmulq_n_f32(q0, alpha);
        q1 = vmulq_n_f32(q1, alpha);
        q2 = vmulq_n_f32(q2, alpha);
        q3 = vmulq_n_f32(q3, alpha);
        q0 = vmlaq_n_f32(q0, q4, beta);
        q1 = vmlaq_n_f32(q1, q5, beta);
        q2 = vmlaq_n_f32(q2, q6, beta);
        q3 = vmlaq_n_f32(q3, q7, beta);
        vst1q_f32(dst + i,      q0);
        vst1q_f32(dst + i + 4,  q1);
        vst1q_f32(dst + i + 8,  q2);
        vst1q_f32(dst + i + 12, q3);
    }
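    // Scalar tail for the remaining elements (fewer than 16).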
    for (; i < elemCnt; i++)
    {
        float a = src[i] * alpha + dst[i] * beta;
        dst[i] = a;
    }
}
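A minimal call-site sketch for the routine above. The buffer size, fill values, and the main() wrapper are illustrative assumptions, not part of the original; it assumes neon_axpby is in scope and the code is built for a NEON-capable target.

#include <stdio.h>

int main(void)
{
    /* 20 elements exercises both the 16-wide NEON loop and the scalar tail. */
    float a[20], b[20];
    int i;
    for (i = 0; i < 20; i++)
    {
        a[i] = 1.0f;
        b[i] = (float)i;
    }
    neon_axpby(a, b, 2.0f, 0.5f, 20);               /* a[i] = 2*b[i] + 0.5*old a[i] */
    printf("a[0]=%.1f a[19]=%.1f\n", a[0], a[19]);  /* expect 0.5 and 38.5 */
    return 0;
}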
Example #2
// Updates the following smoothed Power Spectral Densities (PSDs):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is
// determined and acted upon.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh = aec->extended_filter_enabled
      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

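  // Vectorized main loop: smooth four frequency bins per iteration; any
  // leftover bins are handled by the scalar loop further down.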
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);

    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
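      // Smoothed complex cross-PSD of near-end and error: the real part
      // accumulates dfw[0]*efw[0] + dfw[1]*efw[1], the imaginary part
      // dfw[0]*efw[1] - dfw[1]*efw[0].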
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
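      // Smoothed complex cross-PSD of near-end and far-end, computed the
      // same way with xfw in place of efw.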
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }
  {
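    // Horizontal add: reduce the four lanes of the NEON partial sums to the
    // scalar totals sdSum and seSum.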
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
                                vget_high_f32(vec_sdSum));
    vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
                                vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // Scalar code for the remaining frequency bins.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] =
        ptrGCoh[0] * aec->sx[i] +
        ptrGCoh[1] * WEBRTC_SPL_MAX(
            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
            WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] =
        ptrGCoh[0] * aec->sxd[i][1] +
        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
Example #3
/**
 * @brief   matrix dot-mul vector (element-wise per row): C[] = alpha * A[] * B[] + beta * C[].
 *
 * @param   dst[out]    the result matrix C.
 *          src1[in]    the input matrix A.
 *          src2[in]    the input vector B.
 *          alpha[in]   scale of A * B.
 *          beta[in]    scale of C.
 *          dimM[in]    number of rows.
 *          dimN[in]    number of columns.
 *          leadingN[in] the aligned number of columns (row stride), in elements.
 *
 * @return  void.
 */
void neon_dotMulVec(float *dst,
                    const float *src1,
                    const float *src2,
                    const float alpha,
                    const float beta,
                    const int dimM,
                    const int dimN,
                    const int leadingN)
{
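    // Row pointers: mat0..mat3 walk four consecutive rows of A (src1) and
    // mat8..matB the matching rows of C (dst); mat4 points at the vector B.
    // leadingN is the row stride in elements.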
    float *mat0 = (float *)src1;
    float *mat1 = (float *)src1 + leadingN;
    float *mat2 = (float *)src1 + leadingN*2;
    float *mat3 = (float *)src1 + leadingN*3;
    float *mat4 = (float *)src2;
    float *mat8 = dst;
    float *mat9 = dst + leadingN;
    float *matA = dst + leadingN*2;
    float *matB = dst + leadingN*3;
    int i = 0;
    for (i = 0; i <= dimM - 4; i += 4)
    {
        int j = 0;
        for (j = 0; j <= dimN - 4; j += 4)
        {
            float32x4_t q0 = vld1q_f32(mat0 + j);
            float32x4_t q1 = vld1q_f32(mat1 + j);
            float32x4_t q2 = vld1q_f32(mat2 + j);
            float32x4_t q3 = vld1q_f32(mat3 + j);
            float32x4_t q4 = vld1q_f32(mat4 + j);
            float32x4_t q8 = vld1q_f32(mat8 + j);
            float32x4_t q9 = vld1q_f32(mat9 + j);
            float32x4_t qA = vld1q_f32(matA + j);
            float32x4_t qB = vld1q_f32(matB + j);
            q0 = vmulq_n_f32(q0, alpha);
            q1 = vmulq_n_f32(q1, alpha);
            q2 = vmulq_n_f32(q2, alpha);
            q3 = vmulq_n_f32(q3, alpha);
            q0 = vmulq_f32(q0, q4);
            q1 = vmulq_f32(q1, q4);
            q2 = vmulq_f32(q2, q4);
            q3 = vmulq_f32(q3, q4);
            q0 = vmlaq_n_f32(q0, q8, beta);
            q1 = vmlaq_n_f32(q1, q9, beta);
            q2 = vmlaq_n_f32(q2, qA, beta);
            q3 = vmlaq_n_f32(q3, qB, beta);
            vst1q_f32(mat8 + j, q0);
            vst1q_f32(mat9 + j, q1);
            vst1q_f32(matA + j, q2);
            vst1q_f32(matB + j, q3);
        }
        for (; j < dimN; j++)
        {
            float a0 = mat8[j] * beta;
            float a1 = mat9[j] * beta;
            float a2 = matA[j] * beta;
            float a3 = matB[j] * beta;
            a0 += mat0[j] * mat4[j] * alpha;
            a1 += mat1[j] * mat4[j] * alpha;
            a2 += mat2[j] * mat4[j] * alpha;
            a3 += mat3[j] * mat4[j] * alpha;
            mat8[j] = a0;
            mat9[j] = a1;
            matA[j] = a2;
            matB[j] = a3;
        }
        mat0 += leadingN * 4;
        mat1 += leadingN * 4;
        mat2 += leadingN * 4;
        mat3 += leadingN * 4;
        mat8 += leadingN * 4;
        mat9 += leadingN * 4;
        matA += leadingN * 4;
        matB += leadingN * 4;
    }
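    // Handle the remaining rows (fewer than 4) one at a time.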
    for (; i < dimM; i++)
    {
        int j = 0;
        for (j = 0; j <= dimN - 4; j += 4)
        {
            float32x4_t q0 = vld1q_f32(mat0 + j);
            float32x4_t q4 = vld1q_f32(mat4 + j);
            float32x4_t q8 = vld1q_f32(mat8 + j);
            q0 = vmulq_n_f32(q0, alpha);
            q0 = vmulq_f32(q0, q4);
            q0 = vmlaq_n_f32(q0, q8, beta);
            vst1q_f32(mat8 + j, q0);
        }
        for (; j < dimN; j++)
        {
            float a0 = mat0[j] * mat4[j] * alpha + mat8[j] * beta;
            mat8[j] = a0;
        }
        mat0 += leadingN;
        mat8 += leadingN;
    }
}
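A hypothetical call-site sketch for the routine above, assuming a 5x6 matrix stored without row padding (leadingN == dimN) so that both the blocked 4-row/4-column path and the scalar remainders run; the sizes, values, and main() wrapper are illustrative only.

#include <stdio.h>

int main(void)
{
    enum { M = 5, N = 6 };
    float A[M * N], C[M * N], B[N];
    int i;
    for (i = 0; i < M * N; i++)
    {
        A[i] = 1.0f;
        C[i] = 1.0f;
    }
    for (i = 0; i < N; i++)
    {
        B[i] = (float)i;
    }
    /* C[m][n] = 2 * A[m][n] * B[n] + 0.5 * C[m][n] */
    neon_dotMulVec(C, A, B, 2.0f, 0.5f, M, N, N);
    printf("C[0][0]=%.1f C[0][5]=%.1f\n", C[0], C[N - 1]);  /* expect 0.5 and 10.5 */
    return 0;
}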