static void ScaleErrorSignalNEON(int extended_filter_enabled, float normal_mu, float normal_error_threshold, float x_pow[PART_LEN1], float ef[2][PART_LEN1]) { const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; const float error_threshold = extended_filter_enabled ? kExtendedErrorThreshold : normal_error_threshold; const float32x4_t k1e_10f = vdupq_n_f32(1e-10f); const float32x4_t kMu = vmovq_n_f32(mu); const float32x4_t kThresh = vmovq_n_f32(error_threshold); int i; // vectorized code (four at once) for (i = 0; i + 3 < PART_LEN1; i += 4) { const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]); const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]); const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]); const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f); float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus); float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus); const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re); const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im); const float32x4_t absEf = vsqrtq_f32(ef_sum2); const uint32x4_t bigger = vcgtq_f32(absEf, kThresh); const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f); const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus); uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv)); uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv)); uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re)); uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im)); ef_re_if = vandq_u32(bigger, ef_re_if); ef_im_if = vandq_u32(bigger, ef_im_if); ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if); ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if); ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu); ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu); vst1q_f32(&ef[0][i], ef_re); vst1q_f32(&ef[1][i], ef_im); } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { float abs_ef; ef[0][i] /= (x_pow[i] + 1e-10f); ef[1][i] /= (x_pow[i] + 1e-10f); abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); if (abs_ef > error_threshold) { abs_ef = error_threshold / (abs_ef + 1e-10f); ef[0][i] *= abs_ef; ef[1][i] *= abs_ef; } // Stepsize factor ef[0][i] *= mu; ef[1][i] *= mu; } }
static float32x4_t vsqrtq_f32(float32x4_t s) { int i; float32x4_t x = vrsqrteq_f32(s); // Code to handle sqrt(0). // If the input to sqrtf() is zero, a zero will be returned. // If the input to vrsqrteq_f32() is zero, positive infinity is returned. const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000); // check for divide by zero const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x)); // zero out the positive infinity results x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(x))); // from arm documentation // The Newton-Raphson iteration: // x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2) // converges to (1/√d) if x0 is the result of VRSQRTE applied to d. // // Note: The precision did not improve after 2 iterations. for (i = 0; i < 2; i++) { x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x); } // sqrt(s) = s * 1/sqrt(s) return vmulq_f32(s, x);; }
void test_vmvnQu32 (void) { uint32x4_t out_uint32x4_t; uint32x4_t arg0_uint32x4_t; out_uint32x4_t = vmvnq_u32 (arg0_uint32x4_t); }
inline uint32x4_t vmvnq(const uint32x4_t & v) { return vmvnq_u32(v); }
static void OverdriveAndSuppressNEON(AecCore* aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]) { int i; const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb); const float32x4_t vec_one = vdupq_n_f32(1.0f); const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f); const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm); // vectorized code (four at once) for (i = 0; i + 3 < PART_LEN1; i += 4) { // Weight subbands float32x4_t vec_hNl = vld1q_f32(&hNl[i]); const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]); const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb); const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve, vec_hNlFb); const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve); const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve, vec_hNl); const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl)); const float32x4_t vec_one_weightCurve_add = vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl); const uint32x4_t vec_if1 = vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add)); vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1)); { const float32x4_t vec_overDriveCurve = vld1q_f32(&WebRtcAec_overDriveCurve[i]); const float32x4_t vec_overDriveSm_overDriveCurve = vmulq_f32(vec_overDriveSm, vec_overDriveCurve); vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve); vst1q_f32(&hNl[i], vec_hNl); } // Suppress error signal { float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]); float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]); vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl); vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl); // Ooura fft returns incorrect sign on imaginary component. It matters // here because we are making an additive change with comfort noise. vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one); vst1q_f32(&efw[0][i], vec_efw_re); vst1q_f32(&efw[1][i], vec_efw_im); } } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { // Weight subbands if (hNl[i] > hNlFb) { hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + (1 - WebRtcAec_weightCurve[i]) * hNl[i]; } hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]); // Suppress error signal efw[0][i] *= hNl[i]; efw[1][i] *= hNl[i]; // Ooura fft returns incorrect sign on imaginary component. It matters // here because we are making an additive change with comfort noise. efw[1][i] *= -1; } }