static float32x4_t vsqrtq_f32(float32x4_t s) {
  int i;
  float32x4_t x = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
  // check for divide by zero
  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
  // zero out the positive infinity results
  x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero),
                                      vreinterpretq_u32_f32(x)));
  // from arm documentation
  // The Newton-Raphson iteration:
  //   x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2
  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
  }
  // sqrt(s) = s * 1/sqrt(s)
  return vmulq_f32(s, x);
}
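// Illustrative sketch (not part of the original source): a quick way to
// cross-check vsqrtq_f32() against libm's sqrtf(). Assumes <arm_neon.h>,
// <math.h> and <stdio.h> are available in this translation unit.
static void check_vsqrtq_f32_sketch(void) {
  const float in[4] = {0.0f, 0.25f, 2.0f, 1e6f};
  float out[4];
  vst1q_f32(out, vsqrtq_f32(vld1q_f32(in)));
  for (int i = 0; i < 4; i++) {
    // After two Newton-Raphson steps the relative error versus sqrtf() is
    // small but not zero, so compare with a tolerance rather than ==.
    printf("sqrt(%g): neon=%g libm=%g\n", in[i], out[i], sqrtf(in[i]));
  }
}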
static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled
                                    ? kExtendedErrorThreshold
                                    : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;
    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (abs_ef > error_threshold) {
      abs_ef = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= abs_ef;
      ef[1][i] *= abs_ef;
    }

    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
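// Aside (illustrative, not from the original file): the vmvnq/vandq/vorrq
// sequence above is a branchless per-lane select driven by the comparison
// mask. NEON also has vbslq_f32, which expresses the same operation directly;
// a minimal sketch of that alternative:
static float32x4_t select_f32_sketch(uint32x4_t mask,
                                     float32x4_t if_set,
                                     float32x4_t if_clear) {
  // Per bit: result = (mask & if_set) | (~mask & if_clear).
  return vbslq_f32(mask, if_set, if_clear);
}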
static inline float32x4_t floor_neon(float32x4_t a) {
#if __ARM_ARCH >= 8
  // ARMv8 has a direct round-toward-minus-infinity instruction.
  return vrndmq_f32(a);
#else
  const float32x4_t round32 = vdupq_n_f32(12582912.0f);
  const float32x4_t vhalf = vdupq_n_f32(0.5f);

  // Round to the nearest integer by adding and subtracting 1.5 * 2^23.
  float32x4_t rounded = vsubq_f32(vaddq_f32(a, round32), round32);
  uint32x4_t mask = vceqq_f32(a, rounded);
  // For non-integer inputs, bias by -0.5 before rounding so the result is
  // the floor of a.
  float32x4_t floored =
      vsubq_f32(vaddq_f32(vsubq_f32(a, vhalf), round32), round32);
  // Keep a itself where it was already an integer, the floored value otherwise.
  return vreinterpretq_f32_u32(
      vorrq_u32(vandq_u32(vreinterpretq_u32_f32(a), mask),
                vbicq_u32(vreinterpretq_u32_f32(floored), mask)));
#endif
}
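// Note (added for clarity; assumes the default round-to-nearest FP mode):
// 12582912.0f is 1.5 * 2^23. Adding it to a float of magnitude below about
// 2^22 pushes the fraction bits out of the mantissa, so subtracting it back
// yields the input rounded to the nearest integer (ties to even). A scalar
// sketch of the same trick:
static float round_via_bias_sketch(float a) {
  const float bias = 12582912.0f;  /* 1.5 * 2^23 */
  volatile float t = a + bias;     /* volatile: keep the compiler from folding */
  return t - bias;                 /* a rounded to the nearest integer */
}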
static void thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = _src.ptr<float>();
    float* dst = _dst.ptr<float>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        switch( type )
        {
        case THRESH_TRUNC:
            if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
            if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
            if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
    }
#endif

    switch( type )
    {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);
                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;

        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);
                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;

        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                    vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);
            }
            break;

        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
                                                 vreinterpretq_u32_f32(v_src));
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;

        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
                                                 vreinterpretq_u32_f32(v_src));
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;

        default:
            return CV_Error( CV_StsBadArg, "" );
    }
}
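// Aside (illustrative sketch, not OpenCV code): both the SSE2 and NEON
// THRESH_BINARY paths rely on the fact that SIMD compares produce all-ones or
// all-zeros lane masks, so ANDing the mask with the bit pattern of maxval
// yields either maxval or +0.0f per lane. The same idea per scalar element,
// assuming <string.h>:
static float binary_thresh_lane_sketch(float v, float thresh, float maxval) {
  unsigned mask = (v > thresh) ? 0xFFFFFFFFu : 0u;  /* what vcgtq/cmpgt return */
  unsigned bits;
  memcpy(&bits, &maxval, sizeof(bits));
  bits &= mask;                                     /* maxval bits or all zero */
  float out;
  memcpy(&out, &bits, sizeof(out));
  return out;                                       /* maxval or +0.0f */
}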
void computeNetwork0new_neon(const float *dataf, const float *weightsf, uint8_t *d) {
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;

    int32x4_t accum0 = { 0, 0, 0, 0 };
    int32x4_t accum1 = accum0;
    int32x4_t accum2 = accum0;
    int32x4_t accum3 = accum0;

    for (int i = 0; i < 128/2; i += 8) {
        int16x4x2_t d0 = vld2_s16(data + i);

        int16x4x2_t w0 = vld2_s16(weights + i * 4);
        int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8);
        int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16);
        int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24);

        accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
        accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

        accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
        accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

        accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
        accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

        accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
        accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);
    }

    int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
    int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
    int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
    int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
    sum0 = vpadd_s32(sum0, sum1);
    sum1 = vpadd_s32(sum2, sum3);
    int32x4_t sum = vcombine_s32(sum0, sum1);

    float32x4_t m0 = vcvtq_f32_s32(sum);
    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 512/4));
    m0 = vaddq_f32(m0, vld1q_f32(weightsf + 528/4));

    float32x4_t m1, m2, m3, m4;

    m1 = m0;

    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weightsf + 544/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + 560/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + 576/4));
    m4 = vmulq_f32(m4, vld1q_f32(weightsf + 592/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weightsf + 608/4));

    uint32x4_t gte = vcgeq_f32(m1, zeroes_f);
    uint16x4_t gte_u16 = vmovn_u32(gte);
    uint8x8_t gte_u8 = vmovn_u16(vcombine_u16(gte_u16, vget_low_u16(vreinterpretq_u16_u32(sign_bits_f))));
    gte_u8 = vshr_n_u8(gte_u8, 7);
    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(gte_u8), 0);
}
void computeNetwork0_i16_neon(const float *inputf, const float *weightsf, uint8_t *d) {
    const int16_t *input = (const int16_t *)inputf;
    const int16_t *weights = (const int16_t *)weightsf;

    int32x4_t accum0 = { 0, 0, 0, 0 };
    int32x4_t accum1 = accum0;
    int32x4_t accum2 = accum0;
    int32x4_t accum3 = accum0;

    for (int i = 0; i < 96/2; i += 8) {
        int16x4x2_t d0 = vld2_s16(input + i);

        int16x4x2_t w0 = vld2_s16(weights + i * 4);
        int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8);
        int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16);
        int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24);

        accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
        accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

        accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
        accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

        accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
        accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

        accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
        accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);
    }

    int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
    int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
    int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
    int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
    sum0 = vpadd_s32(sum0, sum1);
    sum1 = vpadd_s32(sum2, sum3);
    int32x4_t sum = vcombine_s32(sum0, sum1);

    float32x4_t m0 = vcvtq_f32_s32(sum);
    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 384/4));
    m0 = vaddq_f32(m0, vld1q_f32(weightsf + 400/4));

    float32x4_t m1, m2, m3, m4, m5, m6, m7;

    m1 = m0;

    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weightsf + 416/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + (416+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + (416+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weightsf + (416+48)/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weightsf + (416+64)/4));

    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    m3 = m0;

    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);

    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 496/4));
    m1 = vmulq_f32(m1, vld1q_f32(weightsf + (496+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + (496+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + (496+48)/4));

    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);

    m4 = vmulq_f32(m4, vld1q_f32(weightsf + (496+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weightsf + (496+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weightsf + (496+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weightsf + (496+112)/4));

    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);

    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);

    m0 = vaddq_f32(m0, vld1q_f32(weightsf + (496+128)/4));

    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
void computeNetwork0_neon(const float *input, const float *weights, uint8_t *d) {
    float32x4_t m0 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float32x4_t m1 = m0;
    float32x4_t m2 = m0;
    float32x4_t m3 = m0;

    float32x4_t m4, m5, m6, m7;

    for (int i = 0; i < 192/4; i += 4) {
        m4 = vld1q_f32(input + i);
        m5 = m4;
        m6 = m4;
        m7 = m4;

        m4 = vmulq_f32(m4, vld1q_f32(weights + i * 4));
        m5 = vmulq_f32(m5, vld1q_f32(weights + i * 4 + 4));
        m6 = vmulq_f32(m6, vld1q_f32(weights + i * 4 + 8));
        m7 = vmulq_f32(m7, vld1q_f32(weights + i * 4 + 12));

        m0 = vaddq_f32(m0, m4);
        m1 = vaddq_f32(m1, m5);
        m2 = vaddq_f32(m2, m6);
        m3 = vaddq_f32(m3, m7);
    }

    float32x2_t sum0 = vpadd_f32(vget_low_f32(m0), vget_high_f32(m0));
    float32x2_t sum1 = vpadd_f32(vget_low_f32(m1), vget_high_f32(m1));
    float32x2_t sum2 = vpadd_f32(vget_low_f32(m2), vget_high_f32(m2));
    float32x2_t sum3 = vpadd_f32(vget_low_f32(m3), vget_high_f32(m3));
    sum0 = vpadd_f32(sum0, sum1);
    sum1 = vpadd_f32(sum2, sum3);
    m0 = vcombine_f32(sum0, sum1);

    m0 = vaddq_f32(m0, vld1q_f32(weights + 768/4));

    m1 = m0;
    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weights + 784/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (784+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (784+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weights + (784+48)/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weights + (784+64)/4));

    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    m3 = m0;

    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);

    m0 = vmulq_f32(m0, vld1q_f32(weights + 864/4));
    m1 = vmulq_f32(m1, vld1q_f32(weights + (864+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (864+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (864+48)/4));

    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);

    m4 = vmulq_f32(m4, vld1q_f32(weights + (864+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weights + (864+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weights + (864+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weights + (864+112)/4));

    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);

    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);

    m0 = vaddq_f32(m0, vld1q_f32(weights + (864+128)/4));

    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
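// The computeNetwork0* functions above reference a reciprocal() helper plus
// the constants sign_bits_f, sign_bits_f_zero_l, ones_f and zeroes_f that are
// defined elsewhere in the file. For reference, a plausible NEON reciprocal
// built from vrecpeq_f32 with two Newton-Raphson refinement steps might look
// like this (an assumption for illustration, not the project's actual helper):
static inline float32x4_t reciprocal_sketch(float32x4_t x) {
    float32x4_t r = vrecpeq_f32(x);        // rough initial estimate of 1/x
    r = vmulq_f32(vrecpsq_f32(x, r), r);   // refinement step 1: r *= (2 - x*r)
    r = vmulq_f32(vrecpsq_f32(x, r), r);   // refinement step 2
    return r;
}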
static void OverdriveAndSuppressNEON(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb =
        vmulq_f32(vec_weightCurve, vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl =
        vmulq_f32(vec_one_weightCurve, vec_hNl);
    const uint32x4_t vec_if0 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    {
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
      const float32x4_t vec_overDriveSm_overDriveCurve =
          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
      vst1q_f32(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
      vst1q_f32(&efw[0][i], vec_efw_re);
      vst1q_f32(&efw[1][i], vec_efw_im);
    }
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }

    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated; this code uses an
    //             order five polynomial approximation. The coefficients have
    //             been estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //   This is done by masking the exponent, shifting it into the top bit of
    //   the mantissa, putting eight into the biased exponent (to shift/
    //   compensate the fact that the exponent has been shifted into the top/
    //   fractional part) and finally getting rid of the implicit leading one
    //   from the mantissa by subtracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);

    const uint32x4_t two_n =
        vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa =
        vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask);
    const float32x4_t y =
        vreinterpretq_f32_u32(vorrq_u32(mantissa,
                                        vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //   pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated; this code uses an order
    //         two polynomial approximation. The coefficients have been
    //         estimated with the Remez algorithm and the resulting polynomial
    //         has a maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }
  return a_exp_b;
}
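// A scalar reference (illustrative only) of the decomposition used above:
// a^b = 2^(b * log2(a)). Here the accuracy comes from libm (<math.h>) rather
// than the polynomial approximations, so it is useful mainly for spot-checking
// the vectorized result on a handful of values.
static float pow_ref_sketch(float a, float b) {
  return exp2f(b * log2f(a));  /* assumes a > 0, as the NEON version does */
}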