f64 dotProduct(const Size2D &_size, const f32 * src0Base, ptrdiff_t src0Stride, const f32 * src1Base, ptrdiff_t src1Stride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON Size2D size(_size); if (src0Stride == src1Stride && src0Stride == (ptrdiff_t)(size.width * sizeof(f32))) { size.width *= size.height; size.height = 1; } #define DOT_FLOAT_BLOCKSIZE (1 << 13) f64 result = 0.0; for (size_t row = 0; row < size.height; ++row) { const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row); const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row); size_t i = 0; while(i + 4 <= size.width) { size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4; float32x4_t v_sum = vdupq_n_f32(0.0f); for( ; i <= lim; i += 4 ) { internal::prefetch(src0 + i); internal::prefetch(src1 + i); v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i)); } float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum)); result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); } if(i + 2 <= size.width) { float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i)); result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); i += 2; } for (; i < size.width; ++i) result += src0[i] * src1[i]; } return result; #else (void)_size; (void)src0Base; (void)src0Stride; (void)src1Base; (void)src1Stride; return 0; #endif }
void test_square_root_v2sf () { const float32_t pool[] = {4.0f, 9.0f}; float32x2_t val; float32x2_t res; val = vld1_f32 (pool); res = vsqrt_f32 (val); if (vget_lane_f32 (res, 0) != 2.0f) abort (); if (vget_lane_f32 (res, 1) != 3.0f) abort (); }
/* Intrinsic signature test: checks that vget_lane_f32 accepts a
   float32x2_t and lane index 1, and that its result converts to
   float32_t.  The vector argument is deliberately left uninitialized --
   presumably this is a compile-only test where only the types matter,
   not the values.  NOTE(review): confirm this function is never
   executed, since reading the uninitialized vector would be UB.  */
void test_vget_lanef32 (void)
{
  float32_t out_float32_t;
  float32x2_t arg0_float32x2_t;

  out_float32_t = vget_lane_f32 (arg0_float32x2_t, 1);
}
// Returns the index of the adaptive-filter partition whose coefficients in
// aec->wfBuf carry the most energy (sum of squares over both wfBuf[0] and
// wfBuf[1] components), used as a delay estimate.
static int PartitionDelayNEON(const AecCore* aec) {
  // Measures the energy in each filter partition and returns the partition
  // with highest energy.
  // TODO(bjornv): Spread computational cost by computing one partition per
  // block?
  float wfEnMax = 0;
  int i;
  int delay = 0;

  for (i = 0; i < aec->num_partitions; i++) {
    int j;
    int pos = i * PART_LEN1;    // Start bin of this partition.
    float wfEn = 0;
    float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
    // vectorized code (four at once): accumulate squares of both components.
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
    }

    {
      // Horizontal add of the four partial sums.
      float32x2_t vec_total;
      // A B C D
      vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
      // A+B C+D
      vec_total = vpadd_f32(vec_total, vec_total);
      // A+B+C+D A+B+C+D
      wfEn = vget_lane_f32(vec_total, 0);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
    }

    // Track the most energetic partition seen so far.
    if (wfEn > wfEnMax) {
      wfEnMax = wfEn;
      delay = i;
    }
  }
  return delay;
}
// CHECK-LABEL: test_vget_lane_f32: float32_t test_vget_lane_f32(float32x2_t v) { return vget_lane_f32(v, 0); // CHECK-NEXT: ret }
// CHECK-LABEL: test_vget_lane_f32 float32_t test_vget_lane_f32_1(float32x2_t v) { return vget_lane_f32(v, 1); // CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1] }
// FileCheck codegen test: lane-1 extraction should lower to a single lane
// move into s0 followed by `ret` (the CHECK lines are test directives).
float32_t test_vget_lane_f32(float32x2_t a) {
  // CHECK-LABEL: test_vget_lane_f32:
  // CHECK-NEXT: mov s0, v0[1]
  // CHECK-NEXT: ret
  return vget_lane_f32(a, 1);
}
// Evaluates a small three-stage network on 48 int16 input samples and writes
// a single byte decision to d[0].  Despite the float pointer types, inputf
// and weightsf actually point at packed int16 data for the first stage; the
// later stages read genuine floats at byte offsets 384 onward.
// sign_bits_f_zero_l, sign_bits_f, ones_f and reciprocal() are defined
// elsewhere in this file.
void computeNetwork0_i16_neon(const float *inputf, const float *weightsf, uint8_t *d) {
    // Reinterpret the buffers as int16 for the integer dot products.
    const int16_t *input = (const int16_t *)inputf;
    const int16_t *weights = (const int16_t *)weightsf;

    int32x4_t accum0 = { 0, 0, 0, 0 };
    int32x4_t accum1 = accum0;
    int32x4_t accum2 = accum0;
    int32x4_t accum3 = accum0;

    // Stage 1: four int32 dot products of the 48 input samples against four
    // weight rows; vld2_s16 de-interleaves even/odd sample pairs.
    for (int i = 0; i < 96/2; i += 8) {
        int16x4x2_t d0 = vld2_s16(input + i);
        int16x4x2_t w0 = vld2_s16(weights + i * 4);
        int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8);
        int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16);
        int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24);
        accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
        accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);
        accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
        accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);
        accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
        accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);
        accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
        accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);
    }

    // Horizontal reduction: collapse each accumulator to one int32 and pack
    // the four results into a single vector.
    int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
    int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
    int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
    int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
    sum0 = vpadd_s32(sum0, sum1);
    sum1 = vpadd_s32(sum2, sum3);
    int32x4_t sum = vcombine_s32(sum0, sum1);

    // Convert to float, then apply per-lane scale (offset 384 bytes) and
    // bias (offset 400 bytes).
    float32x4_t m0 = vcvtq_f32_s32(sum);
    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 384/4));
    m0 = vaddq_f32(m0, vld1q_f32(weightsf + 400/4));

    float32x4_t m1, m2, m3, m4, m5, m6, m7;

    // Squash activations: m0 = m0 / (1 + |m0|), with the sign mask applied
    // only to the lanes selected by sign_bits_f_zero_l.
    m1 = m0;
    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    // Stage 2: broadcast each of the four activations, multiply by its
    // weight row (offsets 416..480 bytes), sum, then add the bias row.
    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);
    m1 = vmulq_f32(m1, vld1q_f32(weightsf + 416/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + (416+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + (416+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weightsf + (416+48)/4));
    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weightsf + (416+64)/4));

    // Squash the stage-2 activations the same way (full sign mask).
    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    // Stage 3: eight broadcast activations (four from m0, four from m7)
    // times eight weight rows (offsets 496..608 bytes), summed into m0.
    m3 = m0;
    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);
    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 496/4));
    m1 = vmulq_f32(m1, vld1q_f32(weightsf + (496+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + (496+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + (496+48)/4));
    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);
    m4 = vmulq_f32(m4, vld1q_f32(weightsf + (496+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weightsf + (496+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weightsf + (496+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weightsf + (496+112)/4));
    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);
    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);
    m0 = vaddq_f32(m0, vld1q_f32(weightsf + (496+128)/4));

    // Decision: maximum = {max(lane0,lane2), max(lane1,lane3)};
    // d[0] is 1 when max(lane0,lane2) >= max(lane1,lane3), else 0.
    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
// Float counterpart of computeNetwork0_i16_neon: evaluates the same
// three-stage network on 192 float input samples and writes a single byte
// decision to d[0].  sign_bits_f_zero_l, sign_bits_f, ones_f and
// reciprocal() are defined elsewhere in this file.
void computeNetwork0_neon(const float *input, const float *weights, uint8_t *d) {
    float32x4_t m0 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float32x4_t m1 = m0;
    float32x4_t m2 = m0;
    float32x4_t m3 = m0;
    float32x4_t m4, m5, m6, m7;

    // Stage 1: four dot products of the 192 input floats against four
    // weight rows, four samples per iteration.
    for (int i = 0; i < 192/4; i += 4) {
        m4 = vld1q_f32(input + i);
        m5 = m4;
        m6 = m4;
        m7 = m4;
        m4 = vmulq_f32(m4, vld1q_f32(weights + i * 4));
        m5 = vmulq_f32(m5, vld1q_f32(weights + i * 4 + 4));
        m6 = vmulq_f32(m6, vld1q_f32(weights + i * 4 + 8));
        m7 = vmulq_f32(m7, vld1q_f32(weights + i * 4 + 12));
        m0 = vaddq_f32(m0, m4);
        m1 = vaddq_f32(m1, m5);
        m2 = vaddq_f32(m2, m6);
        m3 = vaddq_f32(m3, m7);
    }

    // Horizontal reduction: collapse each accumulator to one float, pack
    // the four sums into m0, then add the bias row (byte offset 768).
    float32x2_t sum0 = vpadd_f32(vget_low_f32(m0), vget_high_f32(m0));
    float32x2_t sum1 = vpadd_f32(vget_low_f32(m1), vget_high_f32(m1));
    float32x2_t sum2 = vpadd_f32(vget_low_f32(m2), vget_high_f32(m2));
    float32x2_t sum3 = vpadd_f32(vget_low_f32(m3), vget_high_f32(m3));
    sum0 = vpadd_f32(sum0, sum1);
    sum1 = vpadd_f32(sum2, sum3);
    m0 = vcombine_f32(sum0, sum1);
    m0 = vaddq_f32(m0, vld1q_f32(weights + 768/4));

    // Squash activations: m0 = m0 / (1 + |m0|), sign mask applied only to
    // the lanes selected by sign_bits_f_zero_l.
    m1 = m0;
    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    // Stage 2: broadcast each activation, multiply by its weight row
    // (byte offsets 784..848), sum, then add the bias row.
    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);
    m1 = vmulq_f32(m1, vld1q_f32(weights + 784/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (784+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (784+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weights + (784+48)/4));
    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weights + (784+64)/4));

    // Squash the stage-2 activations the same way (full sign mask).
    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    // Stage 3: eight broadcast activations (four from m0, four from m7)
    // times eight weight rows (byte offsets 864..976), summed into m0.
    m3 = m0;
    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);
    m0 = vmulq_f32(m0, vld1q_f32(weights + 864/4));
    m1 = vmulq_f32(m1, vld1q_f32(weights + (864+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (864+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (864+48)/4));
    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);
    m4 = vmulq_f32(m4, vld1q_f32(weights + (864+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weights + (864+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weights + (864+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weights + (864+112)/4));
    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);
    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);
    m0 = vaddq_f32(m0, vld1q_f32(weights + (864+128)/4));

    // Decision: maximum = {max(lane0,lane2), max(lane1,lane3)};
    // d[0] is 1 when max(lane0,lane2) >= max(lane1,lane3), else 0.
    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
// Updates the following smoothed Power Spectral Densities (PSD):
// - sd : near-end
// - se : residual echo
// - sx : far-end
// - sde : cross-PSD of near-end and residual echo
// - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, also the filter diverge state is determined
// upon actions are taken.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh =
      aec->extended_filter_enabled
          ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
          : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  // Vectorized path: four spectrum bins per iteration.  The [0]/[1] planes
  // of efw/dfw/xfw hold the two components of each spectrum -- presumably
  // real and imaginary parts; confirm against the callers' FFT layout.
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    // Exponential smoothing: new = gcoh[0] * old + gcoh[1] * power.
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);
    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    // Far-end power is clamped from below; rationale in the scalar loop.
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);
    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      // Cross-PSD of near-end and residual echo: products dfw0*efw0 +
      // dfw1*efw1 and dfw0*efw1 - dfw1*efw0 go to the de-interleaved
      // [0]/[1] components of aec->sde, smoothed with the same coefficients.
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      // Cross-PSD of near-end and far-end, same structure as the sde block.
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    // Running totals of the smoothed near-end and echo powers.
    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }

  {
    // Horizontal add of the four-lane running sums into the scalar totals.
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total =
        vpadd_f32(vget_low_f32(vec_sdSum), vget_high_f32(vec_sdSum));
    vec_seSum_total =
        vpadd_f32(vget_low_f32(vec_seSum), vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] = ptrGCoh[0] * aec->sx[i] +
                 ptrGCoh[1] * WEBRTC_SPL_MAX(
                                  xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
                                  WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] +
                     ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] +
                     ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] +
                     ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] +
                     ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 { // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> // CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 // CHECK: ret float [[VGET_LANE]] float32_t test_vget_lane_f32(float32x2_t a) { return vget_lane_f32(a, 1); }