static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled
                                    ? kExtendedErrorThreshold
                                    : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);

    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;
    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (abs_ef > error_threshold) {
      abs_ef = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= abs_ef;
      ef[1][i] *= abs_ef;
    }

    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
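The conditional limiting above is branchless: the per-lane comparison result bigger is an all-ones/all-zeros mask, and the two candidate values are blended with vandq_u32/vmvnq_u32/vorrq_u32. A minimal standalone sketch of that select idiom follows; the function and variable names are illustrative, not taken from the original source, and vbslq_f32 is the dedicated intrinsic that performs the same blend in one step.

#include <arm_neon.h>

// Per-lane select: where 'mask' is all-ones take 'then_val', else 'else_val'.
// Equivalent to vbslq_f32(mask, then_val, else_val).
static float32x4_t select_f32(uint32x4_t mask,
                              float32x4_t then_val,
                              float32x4_t else_val) {
  const uint32x4_t then_bits =
      vandq_u32(mask, vreinterpretq_u32_f32(then_val));
  const uint32x4_t else_bits =
      vandq_u32(vmvnq_u32(mask), vreinterpretq_u32_f32(else_val));
  return vreinterpretq_f32_u32(vorrq_u32(then_bits, else_bits));
}

// Example: clamp each lane of 'x' to 'limit' without branching.
static float32x4_t clamp_to_limit(float32x4_t x, float32x4_t limit) {
  const uint32x4_t bigger = vcgtq_f32(x, limit);  // lanes where x > limit
  return select_f32(bigger, limit, x);
}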
void test_vorrQu32 (void)
{
  uint32x4_t out_uint32x4_t;
  uint32x4_t arg0_uint32x4_t;
  uint32x4_t arg1_uint32x4_t;

  out_uint32x4_t = vorrq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
}
static inline float32x4_t floor_neon(float32x4_t a) {
#if __ARM_ARCH >= 8
  // ARMv8 has a direct round-toward-minus-infinity instruction.
  return vrndmq_f32(a);
#else
  const float32x4_t round32 = vdupq_n_f32(12582912.0f);  // 1.5 * 2^23
  const float32x4_t vhalf = vdupq_n_f32(0.5f);

  // Round to nearest by pushing the value through the 1.5 * 2^23 magic
  // constant, then select: keep 'a' where it was already integral,
  // otherwise take round-to-nearest of (a - 0.5), which is the floor.
  float32x4_t rounded = vsubq_f32(vaddq_f32(a, round32), round32);
  uint32x4_t mask = vceqq_f32(a, rounded);
  float32x4_t floored =
      vsubq_f32(vaddq_f32(vsubq_f32(a, vhalf), round32), round32);
  return vreinterpretq_f32_u32(
      vorrq_u32(vandq_u32(vreinterpretq_u32_f32(a), mask),
                vbicq_u32(vreinterpretq_u32_f32(floored), mask)));
#endif
}
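A small harness like the following can sanity-check floor_neon against the scalar floorf. It is only a sketch and assumes the floor_neon definition above is visible in the same translation unit.

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  // Mix of integral, half-way, and negative inputs.
  const float in[4] = {2.5f, -2.5f, 3.0f, -0.75f};
  float out[4];
  vst1q_f32(out, floor_neon(vld1q_f32(in)));
  for (int i = 0; i < 4; i++) {
    printf("floor_neon(%+.2f) = %+.2f (floorf: %+.2f)\n",
           in[i], out[i], floorf(in[i]));
  }
  return 0;
}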
static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4],
                  struct rte_mbuf **rx_pkts)
{
        uint32x4_t vlan0, vlan1, rss, l3_l4e;
        const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
        uint64x2_t rearm0, rearm1, rearm2, rearm3;

        /* mask everything except RSS, flow director and VLAN flags
         * bit2 is for VLAN tag, bit11 for flow director indication
         * bit13:12 for RSS indication.
         */
        const uint32x4_t rss_vlan_msk = {
                        0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};

        const uint32x4_t cksum_mask = {
                        PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
                        PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
                        PKT_RX_EIP_CKSUM_BAD,
                        PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
                        PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
                        PKT_RX_EIP_CKSUM_BAD,
                        PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
                        PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
                        PKT_RX_EIP_CKSUM_BAD,
                        PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
                        PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
                        PKT_RX_EIP_CKSUM_BAD};

        /* map rss and vlan type to rss hash and vlan flag */
        const uint8x16_t vlan_flags = {
                        0, 0, 0, 0,
                        PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
                        0, 0, 0, 0,
                        0, 0, 0, 0};

        const uint8x16_t rss_flags = {
                        0, PKT_RX_FDIR, 0, 0,
                        0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
                        0, 0, 0, 0,
                        0, 0, 0, 0};

        const uint8x16_t l3_l4e_flags = {
                        (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
                        PKT_RX_IP_CKSUM_BAD >> 1,
                        (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
                        (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
                        (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
                        (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
                         PKT_RX_L4_CKSUM_BAD) >> 1,
                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
                         PKT_RX_IP_CKSUM_BAD) >> 1,
                        0, 0, 0, 0, 0, 0, 0, 0};

        vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
                          vreinterpretq_u32_u64(descs[2])).val[1];
        vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
                          vreinterpretq_u32_u64(descs[3])).val[1];
        vlan0 = vzipq_u32(vlan0, vlan1).val[0];

        vlan1 = vandq_u32(vlan0, rss_vlan_msk);
        vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags,
                                                vreinterpretq_u8_u32(vlan1)));

        rss = vshrq_n_u32(vlan1, 11);
        rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags,
                                              vreinterpretq_u8_u32(rss)));

        l3_l4e = vshrq_n_u32(vlan1, 22);
        l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags,
                                              vreinterpretq_u8_u32(l3_l4e)));
        /* then we shift left 1 bit */
        l3_l4e = vshlq_n_u32(l3_l4e, 1);
        /* we need to mask out the redundant bits */
        l3_l4e = vandq_u32(l3_l4e, cksum_mask);

        vlan0 = vorrq_u32(vlan0, rss);
        vlan0 = vorrq_u32(vlan0, l3_l4e);

        rearm0 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 0), mbuf_init, 1);
        rearm1 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 1), mbuf_init, 1);
        rearm2 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 2), mbuf_init, 1);
        rearm3 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 3), mbuf_init, 1);

        vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
        vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
        vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
        vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
}
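The flag translation above uses vqtbl1q_u8 as a 16-entry byte lookup table: each byte of the index vector selects one byte of the table, and indices of 16 or more yield zero. A standalone sketch of that idiom is below; the table contents and names are illustrative, not taken from the DPDK source.

#include <arm_neon.h>

// Map four small status codes (0..15, one per 32-bit lane) to flag bytes
// with a single table lookup.
static uint32x4_t map_status_to_flags(uint32x4_t status) {
  // Illustrative table: entry i holds the flag byte for status code i.
  // Entry 0 is deliberately 0 so the zero upper bytes of each lane map to 0.
  const uint8x16_t flag_table = {
      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
      0x20, 0x21, 0x22, 0x23, 0x30, 0x31, 0x32, 0x33};
  // Each byte of 'status' indexes the table; only the low byte of each
  // lane carries a code, so the result holds one flag byte per lane.
  return vreinterpretq_u32_u8(
      vqtbl1q_u8(flag_table, vreinterpretq_u8_u32(status)));
}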
uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
  // CHECK-LABEL: test_vorrq_u32
  return vorrq_u32(a, b);
  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
static void OverdriveAndSuppressNEON(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb =
        vmulq_f32(vec_weightCurve, vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl =
        vmulq_f32(vec_one_weightCurve, vec_hNl);
    const uint32x4_t vec_if0 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    {
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
      const float32x4_t vec_overDriveSm_overDriveCurve =
          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
      vst1q_f32(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
      vst1q_f32(&efw[0][i], vec_efw_re);
      vst1q_f32(&efw[1][i], vec_efw_im);
    }
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }

    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated; this code uses an
    //             order five polynomial approximation. The coefficients have
    //             been estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //   This is done by masking the exponent, shifting it into the top bit of
    //   the mantissa, putting eight into the biased exponent (to
    //   shift/compensate the fact that the exponent has been shifted into the
    //   top/fractional part), and finally getting rid of the implicit leading
    //   one from the mantissa by subtracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n =
        vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));

    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa =
        vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask);
    const float32x4_t y = vreinterpretq_f32_u32(
        vorrq_u32(mantissa, vec_zero_biased_exponent_is_one));

    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //   pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated; this code uses an order
    //         two polynomial approximation. The coefficients have been
    //         estimated with the Remez algorithm and the resulting polynomial
    //         has a maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);

    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n = vreinterpretq_f32_s32(
        vshlq_n_s32(two_n_exponent, kFloatExponentShift));

    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }
  return a_exp_b;
}
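For reference, the vectorized approximation above follows the same identity a^b = 2^(b * log2(a)) that a scalar version would use. A minimal scalar sketch is shown below; it is illustrative only, relies on libm instead of the Remez polynomials, and so differs slightly from the NEON result.

#include <math.h>

// Scalar reference for one lane of vpowq_f32: a^b = 2^(b * log2(a)).
// Assumes a > 0, as in the suppressor's use above; accuracy comes from
// libm rather than the polynomial approximations of the NEON version.
static float pow_ref(float a, float b) {
  return exp2f(b * log2f(a));
}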