void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE], const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL, float aGainR, bool aIsOnTheLeft, float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) { ASSERT_ALIGNED(aInputL); ASSERT_ALIGNED(aInputR); ASSERT_ALIGNED(aOutputL); ASSERT_ALIGNED(aOutputR); float32x4_t vinL0, vinL1; float32x4_t vinR0, vinR1; float32x4_t voutL0, voutL1; float32x4_t voutR0, voutR1; float32x4_t vscaleL = vmovq_n_f32(aGainL); float32x4_t vscaleR = vmovq_n_f32(aGainR); if (aIsOnTheLeft) { for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) { vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i)); vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4)); vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i)); vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4)); voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL); voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL); vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0); vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1); voutR0 = vmulq_f32(vinR0, vscaleR); voutR1 = vmulq_f32(vinR1, vscaleR); vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0); vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1); } } else { for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) { vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i)); vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4)); vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i)); vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4)); voutL0 = vmulq_f32(vinL0, vscaleL); voutL1 = vmulq_f32(vinL1, vscaleL); vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0); vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1); voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR); voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR); vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0); vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1); } } }
static void test_fma() { for(int i=0; i<1020 * 4; i++) { data_f[i] = i; } float32x4_t c0_02 = vdupq_n_f32(0.02f); float32x4_t c0_04 = vdupq_n_f32(0.04f); float32x4_t c0_05 = vdupq_n_f32(0.05f); float32x4_t c0_10 = vdupq_n_f32(0.1f); float32x4_t c0_20 = vdupq_n_f32(0.2f); float32x4_t c1_00 = vdupq_n_f32(1.0f); startTime(); // Do ~1 billion ops for (int ct=0; ct < (1000 * (1000 / 80)); ct++) { for (int i=0; i < 1000; i++) { float32x4_t t; t = vmulq_f32(vld1q_f32((float32_t *)&data_f[i]), c0_02); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+4]), c0_04); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+8]), c0_05); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+12]), c0_10); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+16]), c0_20); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+20]), c0_20); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+24]), c0_10); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+28]), c0_05); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+32]), c0_04); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+36]), c0_02); t = vaddq_f32(t, c1_00); vst1q_f32((float32_t *)&data_f[i], t); } } endTime("neon fma", 1e9); }
static void cft1st_128_neon(float* a) { const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); int j, k2; for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { float32x4_t a00v = vld1q_f32(&a[j + 0]); float32x4_t a04v = vld1q_f32(&a[j + 4]); float32x4_t a08v = vld1q_f32(&a[j + 8]); float32x4_t a12v = vld1q_f32(&a[j + 12]); float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v)); float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v)); float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v)); float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v)); const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]); const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]); const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]); const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]); const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]); const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]); float32x4_t x0v = vaddq_f32(a01v, a23v); const float32x4_t x1v = vsubq_f32(a01v, a23v); const float32x4_t x2v = vaddq_f32(a45v, a67v); const float32x4_t x3v = vsubq_f32(a45v, a67v); const float32x4_t x3w = vrev64q_f32(x3v); float32x4_t x0w; a01v = vaddq_f32(x0v, x2v); x0v = vsubq_f32(x0v, x2v); x0w = vrev64q_f32(x0v); a45v = vmulq_f32(wk2rv, x0v); a45v = vmlaq_f32(a45v, wk2iv, x0w); x0v = vmlaq_f32(x1v, x3w, vec_swap_sign); x0w = vrev64q_f32(x0v); a23v = vmulq_f32(wk1rv, x0v); a23v = vmlaq_f32(a23v, wk1iv, x0w); x0v = vmlsq_f32(x1v, x3w, vec_swap_sign); x0w = vrev64q_f32(x0v); a67v = vmulq_f32(wk3rv, x0v); a67v = vmlaq_f32(a67v, wk3iv, x0w); a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v)); a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v)); a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v)); a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v)); vst1q_f32(&a[j + 0], a00v); vst1q_f32(&a[j + 4], a04v); vst1q_f32(&a[j + 8], a08v); vst1q_f32(&a[j + 12], a12v); } }
void * scaled_sumf_thread_NEON(void * argument) { jsize i = 0; struct scaled_sumfneon_thread_data * data = (struct scaled_sumfneon_thread_data *) argument; float32_t * r = (float32_t *)data->r; const float32_t * x = (const float32_t *)data->x; const float32_t * y = (const float32_t *)data->y; const float32_t a = (const float32_t)data->a; const jsize size = data->size; float32x4_t rx4, xx4, yx4, ax4; ax4 = vdupq_n_f32(a); for(i; i < size ; i += 4) { xx4 = vld1q_f32(&(x[i])); yx4 = vld1q_f32(&(y[i])); rx4 = vmlaq_f32(xx4, ax4, yx4); vst1q_f32(&(r[i]), rx4); } }
template <bool align> SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t offset, float32x4_t & sum) { float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf<align>(a + offset)); float32x4_t _b = vcvt_f32_f16((float16x4_t)LoadHalf<align>(b + offset)); float32x4_t _d = vsubq_f32(_a, _b); sum = vmlaq_f32(sum, _d, _d); }
template <bool align> SIMD_INLINE void SquaredDifferenceSum32f(const float * a, const float * b, size_t offset, float32x4_t & sum) { float32x4_t _a = Load<align>(a + offset); float32x4_t _b = Load<align>(b + offset); float32x4_t _d = vsubq_f32(_a, _b); sum = vmlaq_f32(sum, _d, _d); }
static void ScaleErrorSignalNEON(int extended_filter_enabled, float normal_mu, float normal_error_threshold, float x_pow[PART_LEN1], float ef[2][PART_LEN1]) { const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; const float error_threshold = extended_filter_enabled ? kExtendedErrorThreshold : normal_error_threshold; const float32x4_t k1e_10f = vdupq_n_f32(1e-10f); const float32x4_t kMu = vmovq_n_f32(mu); const float32x4_t kThresh = vmovq_n_f32(error_threshold); int i; // vectorized code (four at once) for (i = 0; i + 3 < PART_LEN1; i += 4) { const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]); const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]); const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]); const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f); float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus); float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus); const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re); const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im); const float32x4_t absEf = vsqrtq_f32(ef_sum2); const uint32x4_t bigger = vcgtq_f32(absEf, kThresh); const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f); const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus); uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv)); uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv)); uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re)); uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im)); ef_re_if = vandq_u32(bigger, ef_re_if); ef_im_if = vandq_u32(bigger, ef_im_if); ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if); ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if); ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu); ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu); vst1q_f32(&ef[0][i], ef_re); vst1q_f32(&ef[1][i], ef_im); } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { float abs_ef; ef[0][i] /= (x_pow[i] + 1e-10f); ef[1][i] /= (x_pow[i] + 1e-10f); abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); if (abs_ef > error_threshold) { abs_ef = error_threshold / (abs_ef + 1e-10f); ef[0][i] *= abs_ef; ef[1][i] *= abs_ef; } // Stepsize factor ef[0][i] *= mu; ef[1][i] *= mu; } }
f64 dotProduct(const Size2D &_size, const f32 * src0Base, ptrdiff_t src0Stride, const f32 * src1Base, ptrdiff_t src1Stride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON Size2D size(_size); if (src0Stride == src1Stride && src0Stride == (ptrdiff_t)(size.width * sizeof(f32))) { size.width *= size.height; size.height = 1; } #define DOT_FLOAT_BLOCKSIZE (1 << 13) f64 result = 0.0; for (size_t row = 0; row < size.height; ++row) { const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row); const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row); size_t i = 0; while(i + 4 <= size.width) { size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4; float32x4_t v_sum = vdupq_n_f32(0.0f); for( ; i <= lim; i += 4 ) { internal::prefetch(src0 + i); internal::prefetch(src1 + i); v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i)); } float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum)); result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); } if(i + 2 <= size.width) { float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i)); result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); i += 2; } for (; i < size.width; ++i) result += src0[i] * src1[i]; } return result; #else (void)_size; (void)src0Base; (void)src0Stride; (void)src1Base; (void)src1Stride; return 0; #endif }
static void SubbandCoherenceNEON(AecCore* aec, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], float* fft, float* cohde, float* cohxd, int* extreme_filter_divergence) { int i; SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); { const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); // Subband coherence for (i = 0; i + 3 < PART_LEN1; i += 4) { const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); const float32x4_t vec_se = vld1q_f32(&aec->se[i]); const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]); float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]); vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]); vec_cohde = vdivq_f32(vec_cohde, vec_sdse); vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]); vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx); vst1q_f32(&cohde[i], vec_cohde); vst1q_f32(&cohxd[i], vec_cohxd); } } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { cohde[i] = (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / (aec->sd[i] * aec->se[i] + 1e-10f); cohxd[i] = (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / (aec->sx[i] * aec->sd[i] + 1e-10f); } }
void test_vmlaQf32 (void) { float32x4_t out_float32x4_t; float32x4_t arg0_float32x4_t; float32x4_t arg1_float32x4_t; float32x4_t arg2_float32x4_t; out_float32x4_t = vmlaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t); }
static int PartitionDelayNEON(const AecCore* aec) { // Measures the energy in each filter partition and returns the partition with // highest energy. // TODO(bjornv): Spread computational cost by computing one partition per // block? float wfEnMax = 0; int i; int delay = 0; for (i = 0; i < aec->num_partitions; i++) { int j; int pos = i * PART_LEN1; float wfEn = 0; float32x4_t vec_wfEn = vdupq_n_f32(0.0f); // vectorized code (four at once) for (j = 0; j + 3 < PART_LEN1; j += 4) { const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]); const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]); vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0); vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1); } { float32x2_t vec_total; // A B C D vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn)); // A+B C+D vec_total = vpadd_f32(vec_total, vec_total); // A+B+C+D A+B+C+D wfEn = vget_lane_f32(vec_total, 0); } // scalar code for the remaining items. for (; j < PART_LEN1; j++) { wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] + aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j]; } if (wfEn > wfEnMax) { wfEnMax = wfEn; delay = i; } } return delay; }
int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; int size = w * h; top_blob.create(w, h, channels); if (top_blob.empty()) return -100; if (bias_term) { const float* scale_ptr = scale_data; const float* bias_ptr = bias_data; #pragma omp parallel for for (int q=0; q<channels; q++) { const float* ptr = bottom_blob.channel(q); float* outptr = top_blob.channel(q); float s = scale_ptr[q]; float bias = bias_ptr[q]; #if __ARM_NEON int nn = size >> 2; int remain = size - (nn << 2); #else int remain = size; #endif // __ARM_NEON #if __ARM_NEON float32x4_t _s = vdupq_n_f32(s); float32x4_t _bias = vdupq_n_f32(bias); for (; nn>0; nn--) { float32x4_t _p = vld1q_f32(ptr); _p = vmlaq_f32(_bias, _p, _s); vst1q_f32(outptr, _p); ptr += 4; outptr += 4; } #endif // __ARM_NEON for (; remain>0; remain--) { *outptr = *ptr * s + bias; ptr++; outptr++; } } } else {
template <bool align> SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t offset, float32x4_t & sum, float32x4_t & correction) { float32x4_t _a = Load<align>(a + offset); float32x4_t _b = Load<align>(b + offset); float32x4_t _d = vsubq_f32(_a, _b); float32x4_t term = vmlaq_f32(correction, _d, _d); float32x4_t temp = vaddq_f32(sum, term); correction = vsubq_f32(vmulq_f32(temp, sum), term); sum = temp; }
void AudioBufferAddWithScale_NEON(const float* aInput, float aScale, float* aOutput, uint32_t aSize) { ASSERT_ALIGNED(aInput); ASSERT_ALIGNED(aOutput); float32x4_t vin0, vin1, vin2, vin3; float32x4_t vout0, vout1, vout2, vout3; float32x4_t vscale = vmovq_n_f32(aScale); uint32_t dif = aSize % 16; aSize -= dif; unsigned i = 0; for (; i < aSize; i+=16) { vin0 = vld1q_f32(ADDRESS_OF(aInput, i)); vin1 = vld1q_f32(ADDRESS_OF(aInput, i+4)); vin2 = vld1q_f32(ADDRESS_OF(aInput, i+8)); vin3 = vld1q_f32(ADDRESS_OF(aInput, i+12)); vout0 = vld1q_f32(ADDRESS_OF(aOutput, i)); vout1 = vld1q_f32(ADDRESS_OF(aOutput, i+4)); vout2 = vld1q_f32(ADDRESS_OF(aOutput, i+8)); vout3 = vld1q_f32(ADDRESS_OF(aOutput, i+12)); vout0 = vmlaq_f32(vout0, vin0, vscale); vout1 = vmlaq_f32(vout1, vin1, vscale); vout2 = vmlaq_f32(vout2, vin2, vscale); vout3 = vmlaq_f32(vout3, vin3, vscale); vst1q_f32(ADDRESS_OF(aOutput, i), vout0); vst1q_f32(ADDRESS_OF(aOutput, i+4), vout1); vst1q_f32(ADDRESS_OF(aOutput, i+8), vout2); vst1q_f32(ADDRESS_OF(aOutput, i+12), vout3); } for (unsigned j = 0; j < dif; ++i, ++j) { aOutput[i] += aInput[i]*aScale; } }
static void FilterFarNEON( int num_partitions, int x_fft_buf_block_pos, float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], float y_fft[2][PART_LEN1]) { int i; for (i = 0; i < num_partitions; i++) { int j; int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; int pos = i * PART_LEN1; // Check for wrap if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // vectorized code (four at once) for (j = 0; j + 3 < PART_LEN1; j += 4) { const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]); const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]); const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]); const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]); const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re); const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im); const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im); const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re); const float32x4_t g = vaddq_f32(y_fft_re, e); const float32x4_t h = vaddq_f32(y_fft_im, f); vst1q_f32(&y_fft[0][j], g); vst1q_f32(&y_fft[1][j], h); } // scalar code for the remaining items. for (; j < PART_LEN1; j++) { y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); } } }
static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) { int i; const int num_partitions = aec->num_partitions; for (i = 0; i < num_partitions; i++) { int j; int xPos = (i + aec->xfBufBlockPos) * PART_LEN1; int pos = i * PART_LEN1; // Check for wrap if (i + aec->xfBufBlockPos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // vectorized code (four at once) for (j = 0; j + 3 < PART_LEN1; j += 4) { const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]); const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]); const float32x4_t wfBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]); const float32x4_t wfBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]); const float32x4_t yf_re = vld1q_f32(&yf[0][j]); const float32x4_t yf_im = vld1q_f32(&yf[1][j]); const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re); const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im); const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im); const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re); const float32x4_t g = vaddq_f32(yf_re, e); const float32x4_t h = vaddq_f32(yf_im, f); vst1q_f32(&yf[0][j], g); vst1q_f32(&yf[1][j], h); } // scalar code for the remaining items. for (; j < PART_LEN1; j++) { yf[0][j] += MulRe(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j], aec->wfBuf[0][pos + j], aec->wfBuf[1][pos + j]); yf[1][j] += MulIm(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j], aec->wfBuf[0][pos + j], aec->wfBuf[1][pos + j]); } } }
/** * @brief vector_dot_vector. * * @param dst[out] the output element(1*1) * @param src1[in] the input vector(1*n) * src2[in] the input vector(1*n) * dimN[in] size of vector * * @return void */ void neon_VecdotVec(float *dst, const float *src1, const float *src2, const int dimN) { float *mat0 = (float *)src1; float *mat1 = (float *)src2; float32x4_t q0 = vld1q_f32(mat0); float32x4_t q1 = vld1q_f32(mat1); q0 = vmulq_f32(q0, q1); int j = 4; for (; j <= dimN - 4; j += 4) { float32x4_t q2 = vld1q_f32(mat0 + j); float32x4_t q3 = vld1q_f32(mat1 + j); q0 = vmlaq_f32(q0, q2, q3); } float32x2_t d0 = vpadd_f32(vget_low_f32(q0), vget_high_f32(q0)); d0 = vpadd_f32(d0, d0); *dst = *((float *)&d0); for (; j < dimN; j++) { *dst += src1[j] * src2[j]; } }
static void ne10_fft16_backward_float32_neon (ne10_fft_cpx_float32_t * Fout, ne10_fft_cpx_float32_t * Fin, ne10_fft_cpx_float32_t * twiddles) { ne10_fft_cpx_float32_t *tw1, *tw2, *tw3; // the first stage float32_t *p_src0, *p_src4, *p_src8, *p_src12; float32x4x2_t q2_in_0123, q2_in_4567, q2_in_89ab, q2_in_cdef; float32x4_t q_t0_r, q_t0_i, q_t1_r, q_t1_i, q_t2_r, q_t2_i, q_t3_r, q_t3_i; float32x4_t q_out_r048c, q_out_i048c, q_out_r159d, q_out_i159d; float32x4_t q_out_r26ae, q_out_i26ae, q_out_r37bf, q_out_i37bf; p_src0 = (float32_t*) (& (Fin[0])); p_src4 = (float32_t*) (& (Fin[4])); p_src8 = (float32_t*) (& (Fin[8])); p_src12 = (float32_t*) (& (Fin[12])); q2_in_0123 = vld2q_f32 (p_src0); q2_in_4567 = vld2q_f32 (p_src4); q2_in_89ab = vld2q_f32 (p_src8); q2_in_cdef = vld2q_f32 (p_src12); q_t2_r = vsubq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]); q_t2_i = vsubq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]); q_t3_r = vaddq_f32 (q2_in_0123.val[0], q2_in_89ab.val[0]); q_t3_i = vaddq_f32 (q2_in_0123.val[1], q2_in_89ab.val[1]); q_t0_r = vaddq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]); q_t0_i = vaddq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]); q_t1_r = vsubq_f32 (q2_in_4567.val[0], q2_in_cdef.val[0]); q_t1_i = vsubq_f32 (q2_in_4567.val[1], q2_in_cdef.val[1]); q_out_r26ae = vsubq_f32 (q_t3_r, q_t0_r); q_out_i26ae = vsubq_f32 (q_t3_i, q_t0_i); q_out_r048c = vaddq_f32 (q_t3_r, q_t0_r); q_out_i048c = vaddq_f32 (q_t3_i, q_t0_i); q_out_r159d = vsubq_f32 (q_t2_r, q_t1_i); q_out_i159d = vaddq_f32 (q_t2_i, q_t1_r); q_out_r37bf = vaddq_f32 (q_t2_r, q_t1_i); q_out_i37bf = vsubq_f32 (q_t2_i, q_t1_r); // second stages float32_t *p_dst0, *p_dst1, *p_dst2, *p_dst3; float32_t *p_tw1, *p_tw2, *p_tw3; float32x4_t q_s0_r, q_s0_i, q_s1_r, q_s1_i, q_s2_r, q_s2_i; float32x4_t q_s3_r, q_s3_i, q_s4_r, q_s4_i, q_s5_r, q_s5_i; float32x4x2_t q2_tmp_0, q2_tmp_1, q2_tmp_2, q2_tmp_3; float32x4_t q_in_r0123, q_in_r4567, q_in_r89ab, q_in_rcdef; float32x4_t q_in_i0123, q_in_i4567, q_in_i89ab, q_in_icdef; float32x4x2_t q2_tw1, q2_tw2, q2_tw3; float32x4x2_t q2_out_0123, q2_out_4567, q2_out_89ab, q2_out_cdef; float32x4_t q_one_by_nfft; tw1 = twiddles; tw2 = twiddles + 4; tw3 = twiddles + 8; p_dst0 = (float32_t*) (&Fout[0]); p_dst1 = (float32_t*) (&Fout[4]); p_dst2 = (float32_t*) (&Fout[8]); p_dst3 = (float32_t*) (&Fout[12]); p_tw1 = (float32_t*) tw1; p_tw2 = (float32_t*) tw2; p_tw3 = (float32_t*) tw3; q2_tmp_0 = vzipq_f32 (q_out_r048c, q_out_r159d); q2_tmp_1 = vzipq_f32 (q_out_i048c, q_out_i159d); q2_tmp_2 = vzipq_f32 (q_out_r26ae, q_out_r37bf); q2_tmp_3 = vzipq_f32 (q_out_i26ae, q_out_i37bf); q_in_r0123 = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[0]), vget_low_f32 (q2_tmp_2.val[0])); q_in_i0123 = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[0]), vget_low_f32 (q2_tmp_3.val[0])); q_in_r4567 = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[0]), vget_high_f32 (q2_tmp_2.val[0])); q_in_i4567 = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[0]), vget_high_f32 (q2_tmp_3.val[0])); q_in_r89ab = vcombine_f32 (vget_low_f32 (q2_tmp_0.val[1]), vget_low_f32 (q2_tmp_2.val[1])); q_in_i89ab = vcombine_f32 (vget_low_f32 (q2_tmp_1.val[1]), vget_low_f32 (q2_tmp_3.val[1])); q_in_rcdef = vcombine_f32 (vget_high_f32 (q2_tmp_0.val[1]), vget_high_f32 (q2_tmp_2.val[1])); q_in_icdef = vcombine_f32 (vget_high_f32 (q2_tmp_1.val[1]), vget_high_f32 (q2_tmp_3.val[1])); q2_tw1 = vld2q_f32 (p_tw1); q2_tw2 = vld2q_f32 (p_tw2); q2_tw3 = vld2q_f32 (p_tw3); q_s0_r = vmulq_f32 (q_in_r4567, q2_tw1.val[0]); q_s0_i = vmulq_f32 (q_in_i4567, q2_tw1.val[0]); q_s1_r = vmulq_f32 (q_in_r89ab, q2_tw2.val[0]); q_s1_i = vmulq_f32 (q_in_i89ab, q2_tw2.val[0]); q_s2_r = vmulq_f32 (q_in_rcdef, q2_tw3.val[0]); q_s2_i = vmulq_f32 (q_in_icdef, q2_tw3.val[0]); q_s0_r = vmlaq_f32 (q_s0_r, q_in_i4567, q2_tw1.val[1]); q_s0_i = vmlsq_f32 (q_s0_i, q_in_r4567, q2_tw1.val[1]); q_s1_r = vmlaq_f32 (q_s1_r, q_in_i89ab, q2_tw2.val[1]); q_s1_i = vmlsq_f32 (q_s1_i, q_in_r89ab, q2_tw2.val[1]); q_s2_r = vmlaq_f32 (q_s2_r, q_in_icdef, q2_tw3.val[1]); q_s2_i = vmlsq_f32 (q_s2_i, q_in_rcdef, q2_tw3.val[1]); q_s5_r = vsubq_f32 (q_in_r0123, q_s1_r); q_s5_i = vsubq_f32 (q_in_i0123, q_s1_i); q2_out_0123.val[0] = vaddq_f32 (q_in_r0123, q_s1_r); q2_out_0123.val[1] = vaddq_f32 (q_in_i0123, q_s1_i); q_s3_r = vaddq_f32 (q_s0_r, q_s2_r); q_s3_i = vaddq_f32 (q_s0_i, q_s2_i); q_s4_r = vsubq_f32 (q_s0_r, q_s2_r); q_s4_i = vsubq_f32 (q_s0_i, q_s2_i); q_one_by_nfft = vdupq_n_f32 (0.0625f); q2_out_89ab.val[0] = vsubq_f32 (q2_out_0123.val[0], q_s3_r); q2_out_89ab.val[1] = vsubq_f32 (q2_out_0123.val[1], q_s3_i); q2_out_0123.val[0] = vaddq_f32 (q2_out_0123.val[0], q_s3_r); q2_out_0123.val[1] = vaddq_f32 (q2_out_0123.val[1], q_s3_i); q2_out_4567.val[0] = vsubq_f32 (q_s5_r, q_s4_i); q2_out_4567.val[1] = vaddq_f32 (q_s5_i, q_s4_r); q2_out_cdef.val[0] = vaddq_f32 (q_s5_r, q_s4_i); q2_out_cdef.val[1] = vsubq_f32 (q_s5_i, q_s4_r); q2_out_89ab.val[0] = vmulq_f32 (q2_out_89ab.val[0], q_one_by_nfft); q2_out_89ab.val[1] = vmulq_f32 (q2_out_89ab.val[1], q_one_by_nfft); q2_out_0123.val[0] = vmulq_f32 (q2_out_0123.val[0], q_one_by_nfft); q2_out_0123.val[1] = vmulq_f32 (q2_out_0123.val[1], q_one_by_nfft); q2_out_4567.val[0] = vmulq_f32 (q2_out_4567.val[0], q_one_by_nfft); q2_out_4567.val[1] = vmulq_f32 (q2_out_4567.val[1], q_one_by_nfft); q2_out_cdef.val[0] = vmulq_f32 (q2_out_cdef.val[0], q_one_by_nfft); q2_out_cdef.val[1] = vmulq_f32 (q2_out_cdef.val[1], q_one_by_nfft); vst2q_f32 (p_dst0, q2_out_0123); vst2q_f32 (p_dst1, q2_out_4567); vst2q_f32 (p_dst2, q2_out_89ab); vst2q_f32 (p_dst3, q2_out_cdef); }
void AudioBlockPanStereoToStereo_NEON( const float aInputL[WEBAUDIO_BLOCK_SIZE], const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL[WEBAUDIO_BLOCK_SIZE], float aGainR[WEBAUDIO_BLOCK_SIZE], const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE], float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) { ASSERT_ALIGNED(aInputL); ASSERT_ALIGNED(aInputR); ASSERT_ALIGNED(aGainL); ASSERT_ALIGNED(aGainR); ASSERT_ALIGNED(aIsOnTheLeft); ASSERT_ALIGNED(aOutputL); ASSERT_ALIGNED(aOutputR); float32x4_t vinL0, vinL1; float32x4_t vinR0, vinR1; float32x4_t voutL0, voutL1; float32x4_t voutR0, voutR1; float32x4_t vscaleL0, vscaleL1; float32x4_t vscaleR0, vscaleR1; float32x4_t onleft0, onleft1, notonleft0, notonleft1; float32x4_t zero = vmovq_n_f32(0); uint8x8_t isOnTheLeft; // Although MSVC throws uninitialized value warning for voutL0 and voutL1, // since we fill all lanes by vsetq_lane_f32, we can ignore it. But to avoid // compiler warning, set zero. voutL0 = zero; voutL1 = zero; for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) { vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i)); vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4)); vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i)); vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4)); vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i)); vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4)); vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i)); vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4)); // Load output with boolean "on the left" values. This assumes that // bools are stored as a single byte. isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]); voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0); voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1); voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2); voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3); voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0); voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1); voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2); voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3); // Convert the boolean values into masks by setting all bits to 1 // if true. voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero); voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero); // The right output masks are the same as the left masks voutR0 = voutL0; voutR1 = voutL1; // Calculate left channel assuming isOnTheLeft onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0); onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL0); // Calculate left channel assuming not isOnTheLeft notonleft0 = vmulq_f32(vinL0, vscaleL0); notonleft1 = vmulq_f32(vinL1, vscaleL1); // Write results using previously stored masks voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0); voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1); // Calculate right channel assuming isOnTheLeft onleft0 = vmulq_f32(vinR0, vscaleR0); onleft1 = vmulq_f32(vinR1, vscaleR1); // Calculate right channel assuming not isOnTheLeft notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0); notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1); // Write results using previously stored masks voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0); voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1); vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0); vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1); vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0); vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1); } }
static void cftmdl_128_neon(float* a) { int j; const int l = 8; const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r); for (j = 0; j < l; j += 2) { const float32x2_t a_00 = vld1_f32(&a[j + 0]); const float32x2_t a_08 = vld1_f32(&a[j + 8]); const float32x2_t a_32 = vld1_f32(&a[j + 32]); const float32x2_t a_40 = vld1_f32(&a[j + 40]); const float32x4_t a_00_32 = vcombine_f32(a_00, a_32); const float32x4_t a_08_40 = vcombine_f32(a_08, a_40); const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40); const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40); const float32x2_t a_16 = vld1_f32(&a[j + 16]); const float32x2_t a_24 = vld1_f32(&a[j + 24]); const float32x2_t a_48 = vld1_f32(&a[j + 48]); const float32x2_t a_56 = vld1_f32(&a[j + 56]); const float32x4_t a_16_48 = vcombine_f32(a_16, a_48); const float32x4_t a_24_56 = vcombine_f32(a_24, a_56); const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56); const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56); const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1); const float32x4_t x1_x3_add = vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); const float32x4_t x1_x3_sub = vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0); const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0); const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s); const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1); const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1); const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s); const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as); const float32x4_t yy4 = vmulq_f32(wk1rv, yy0); const float32x4_t xx1_rev = vrev64q_f32(xx1); const float32x4_t yy4_rev = vrev64q_f32(yy4); vst1_f32(&a[j + 0], vget_low_f32(xx0)); vst1_f32(&a[j + 32], vget_high_f32(xx0)); vst1_f32(&a[j + 16], vget_low_f32(xx1)); vst1_f32(&a[j + 48], vget_high_f32(xx1_rev)); a[j + 48] = -a[j + 48]; vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add)); vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub)); vst1_f32(&a[j + 40], vget_low_f32(yy4)); vst1_f32(&a[j + 56], vget_high_f32(yy4_rev)); } { const int k = 64; const int k1 = 2; const int k2 = 2 * k1; const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]); const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]); const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]); const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]); const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]); wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]); for (j = k; j < l + k; j += 2) { const float32x2_t a_00 = vld1_f32(&a[j + 0]); const float32x2_t a_08 = vld1_f32(&a[j + 8]); const float32x2_t a_32 = vld1_f32(&a[j + 32]); const float32x2_t a_40 = vld1_f32(&a[j + 40]); const float32x4_t a_00_32 = vcombine_f32(a_00, a_32); const float32x4_t a_08_40 = vcombine_f32(a_08, a_40); const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40); const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40); const float32x2_t a_16 = vld1_f32(&a[j + 16]); const float32x2_t a_24 = vld1_f32(&a[j + 24]); const float32x2_t a_48 = vld1_f32(&a[j + 48]); const float32x2_t a_56 = vld1_f32(&a[j + 56]); const float32x4_t a_16_48 = vcombine_f32(a_16, a_48); const float32x4_t a_24_56 = vcombine_f32(a_24, a_56); const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56); const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56); const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1); const float32x4_t x1_x3_add = vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); const float32x4_t x1_x3_sub = vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); float32x4_t xx4 = vmulq_f32(wk2rv, xx1); float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add); float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub); xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1)); xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add)); xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub)); vst1_f32(&a[j + 0], vget_low_f32(xx)); vst1_f32(&a[j + 32], vget_high_f32(xx)); vst1_f32(&a[j + 16], vget_low_f32(xx4)); vst1_f32(&a[j + 48], vget_high_f32(xx4)); vst1_f32(&a[j + 8], vget_low_f32(xx12)); vst1_f32(&a[j + 40], vget_high_f32(xx12)); vst1_f32(&a[j + 24], vget_low_f32(xx22)); vst1_f32(&a[j + 56], vget_high_f32(xx22)); } } }
/******************************************************************************* * PROCEDURE: gaussian_smooth * PURPOSE: Blur an image with a gaussian filter. * NAME: Mike Heath * DATE: 2/15/96 *******************************************************************************/ short int* gaussian_smooth(unsigned char *image, int rows, int cols, float sigma) { int r, c, rr, cc, /* Counter variables. */ windowsize, /* Dimension of the gaussian kernel. */ center; /* Half of the windowsize. */ float *tempim,*tempim1, /* Buffer for separable filter gaussian smoothing. */ *kernel, /* A one dimensional gaussian kernel. */ dot, /* Dot product summing variable. */ sum; /* Sum of the kernel weights variable. */ /**************************************************************************** * Create a 1-dimensional gaussian smoothing kernel. ****************************************************************************/ if(VERBOSE) printf(" Computing the gaussian smoothing kernel.\n"); make_gaussian_kernel(sigma, &kernel, &windowsize); center = windowsize / 2; /**************************************************************************** * Allocate a temporary buffer image and the smoothed image. ****************************************************************************/ if((tempim = (float *) malloc(rows*cols* sizeof(float))) == NULL) { fprintf(stderr, "Error allocating the buffer image.\n"); exit(1); } short int* smoothedim; if(((smoothedim) = (short int *) malloc(rows*cols*sizeof(short int))) == NULL) { fprintf(stderr, "Error allocating the smoothed image.\n"); exit(1); } startTimer(&totalTime); //Neon impelementation of gaussian smooth starts here /**************************************************************************** * Blur in the x - direction. ****************************************************************************/ int loop; int floop; //Modification of input image for neon implementation //For Filter 1 float * new_image; //For Filter 2 float *new_image_col; //kernel is changed to 17 from 15 for neon (two 0s at the beginning and the end) float new_kernel[17]; //Generating now kernel filter for (floop = 0 ; floop < 17 ; floop++) { if(floop == 0 || floop == 16 ) new_kernel[floop] = 0 ; else new_kernel [floop] = kernel[floop -1]; } //For filter 1, new cols number for neon unsigned int new_cols; new_cols=cols+16; unsigned int i, k; unsigned int a; unsigned int m; unsigned int n, j; //Malloc of new image used by neon new_image = (float*)malloc(new_cols*rows*sizeof(float)); for( i =0; i<rows; i++){ memset(&new_image[i*new_cols],0,8*sizeof(float)); for( k=0; k<cols;k++){ new_image[i*new_cols+8+k] = (float)image[i*cols+k]; } memset(&new_image[i*new_cols+8+cols],0,8*sizeof(float)); } // Neon handles four piexel at a time float32x4_t neon_input; float32x4_t neon_filter; float32x4_t temp_sum; float32x2_t tempUpper; float32x2_t tempLower; float32_t zero = 0; float32_t temp_output; float Basekernel = 0.0f; float kernelSum; //When using the new filter, we always assume the image has more than 9 pixels in a row //Base sum for the filter for( a=8; a<=16; a++){ Basekernel += new_kernel[a]; } //Filter 1, filtering row by row for(m=0; m<rows; m++){ for( n=0; n<cols; n++){ temp_sum = vdupq_n_f32(0); if(n==0){ kernelSum = Basekernel; } else if(n <=8){ kernelSum += new_kernel[8-n]; } else if(n>=cols-8){ kernelSum -=new_kernel[cols-n+8]; } //For each pixel, filtering is performed four times for( j=0; j<4; j++) { int kk=0; if(j>=2) { kk=1; } neon_input = vld1q_f32(&new_image[m*new_cols+n+j*4+kk]); neon_filter = vld1q_f32(&new_kernel[j*4+kk]); temp_sum = vmlaq_f32(temp_sum,neon_input,neon_filter); } unsigned int t; for( t=0; t<=3; t++){ temp_output += vgetq_lane_f32(temp_sum,t ); } temp_output += new_image[m*new_cols+n+8] * new_kernel[8]; temp_output /= kernelSum; tempim[m*cols+n] = temp_output; temp_output=0; } } for(r=0; r<rows; r++) { for(c=0; c<cols; c++) { dot = 0.0; sum = 0.0; for(cc=(-center); cc<=center; cc++) { if(((c+cc) >= 0) && ((c+cc) < cols)) { dot += (float)image[r*cols+(c+cc)] * kernel[center+cc]; sum += kernel[center+cc]; } } tempim1[r*cols+c] = dot/sum; } } /**************************************************************************** * Blur in the y - direction. ****************************************************************************/ unsigned int new_rows; new_rows=rows+16; new_image_col = (float*)malloc(new_rows*cols*sizeof(float)); if(VERBOSE) printf(" Bluring the image in the Y-direction.\n"); for( i =0; i<cols; i++){//actually nember of new rows are the number of columns here memset(&new_image_col[i*new_rows],0,8*sizeof(float)); for( k=0; k<rows;k++){ new_image_col[i*new_rows+8+k] = tempim[k*cols+i]; //new_image_col[i*new_rows+8+k] = imagetest1[k*cols+i]; } memset(&new_image_col[i*new_rows+8+rows],0,8*sizeof(float)); } Basekernel = 0.0; for( a=8; a<=16; a++){ Basekernel += new_kernel[a]; } for(m=0; m<cols; m++){// it was rows at br for( n=0; n<rows; n++){ temp_sum = vdupq_n_f32(0); if(n==0){ kernelSum = Basekernel; } else if(n <=8){ kernelSum += new_kernel[8-n]; } else if(n>=rows-8){ kernelSum -=new_kernel[rows-n+8]; } for( j=0; j<4; j++) { int kk=0; if(j>=2) { kk=1; } neon_input = vld1q_f32(&new_image_col[m*new_rows+n+j*4+kk]); neon_filter = vld1q_f32(&new_kernel[j*4+kk]); temp_sum = vmlaq_f32(temp_sum,neon_input,neon_filter); } unsigned int t; for( t=0; t<=3; t++){ temp_output += vgetq_lane_f32(temp_sum,t ); } temp_output += new_image_col[m*new_rows+n+8] * new_kernel[8]; temp_output = (temp_output * BOOSTBLURFACTOR) / kernelSum + 0.5; smoothedim[n*cols+m] = (short int )temp_output; temp_output=0; } } stopTimer(&totalTime); printTimer(&totalTime); free(tempim); free(kernel); return smoothedim; }
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) { return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); }
void meanStdDev(const Size2D &size, const u16 * srcBase, ptrdiff_t srcStride, f32 * pMean, f32 * pStdDev) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3; f64 fsum = 0.0f, fsqsum = 0.0f; f32 arsum[8]; uint32x4_t v_zero = vdupq_n_u32(0u), v_sum; float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum; for (size_t i = 0; i < size.height; ++i) { const u16 * src = internal::getRowPtr(srcBase, srcStride, i); size_t j = 0u; while (j < roiw4) { size_t blockSize = std::min(roiw4 - j, blockSize0) + j; v_sum = v_zero; v_sqsum = v_zero_f; for ( ; j + 16 < blockSize ; j += 16) { internal::prefetch(src + j); uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); // 0 uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0)); uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0)); v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo); float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi); v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); // 1 v_srclo = vmovl_u16(vget_low_u16(v_src1)); v_srchi = vmovl_u16(vget_high_u16(v_src1)); v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); v_srclo_f = vcvtq_f32_u32(v_srclo); v_srchi_f = vcvtq_f32_u32(v_srchi); v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); } for ( ; j < blockSize; j += 4) { uint32x4_t v_src = vmovl_u16(vld1_u16(src + j)); float32x4_t v_src_f = vcvtq_f32_u32(v_src); v_sum = vaddq_u32(v_sum, v_src); v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f); } vst1q_f32(arsum, vcvtq_f32_u32(v_sum)); vst1q_f32(arsum + 4, v_sqsum); fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3]; fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7]; } // collect a few last elements in the current row for ( ; j < size.width; ++j) { f32 srcval = src[j]; fsum += srcval; fsqsum += srcval * srcval; } } // calc mean and stddev f64 itotal = 1.0 / size.total(); f64 mean = fsum * itotal; f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); if (pMean) *pMean = mean; if (pStdDev) *pStdDev = stddev; #else (void)size; (void)srcBase; (void)srcStride; (void)pMean; (void)pStdDev; #endif }
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); }
static void SubbandCoherenceNEON(AecCore* aec, float efw[2][PART_LEN1], float xfw[2][PART_LEN1], float* fft, float* cohde, float* cohxd) { float dfw[2][PART_LEN1]; int i; if (aec->delayEstCtr == 0) aec->delayIdx = PartitionDelay(aec); // Use delayed far. memcpy(xfw, aec->xfwBuf + aec->delayIdx * PART_LEN1, sizeof(xfw[0][0]) * 2 * PART_LEN1); // Windowed near fft WindowData(fft, aec->dBuf); aec_rdft_forward_128(fft); StoreAsComplex(fft, dfw); // Windowed error fft WindowData(fft, aec->eBuf); aec_rdft_forward_128(fft); StoreAsComplex(fft, efw); SmoothedPSD(aec, efw, dfw, xfw); { const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); // Subband coherence for (i = 0; i + 3 < PART_LEN1; i += 4) { const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); const float32x4_t vec_se = vld1q_f32(&aec->se[i]); const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]); float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]); vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]); vec_cohde = vdivq_f32(vec_cohde, vec_sdse); vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]); vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx); vst1q_f32(&cohde[i], vec_cohde); vst1q_f32(&cohxd[i], vec_cohxd); } } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { cohde[i] = (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / (aec->sd[i] * aec->se[i] + 1e-10f); cohxd[i] = (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / (aec->sx[i] * aec->sd[i] + 1e-10f); } }
void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy) /* constant times a vector plus a vector. jack dongarra, linpack, 3/11/78. */ { int i,ix,iy,m,mp1; mp1 = 0; m = 0; if(n <= 0) return; if (da == ZERO) return; if(incx != 1 || incy != 1) { /* code for unequal increments or equal increments not equal to 1 */ ix = 0; iy = 0; if(incx < 0) ix = (-n+1)*incx; if(incy < 0)iy = (-n+1)*incy; for (i = 0;i < n; i++) { dy[iy] = dy[iy] + da*dx[ix]; ix = ix + incx; iy = iy + incy; } return; } /* code for both increments equal to 1 */ #ifdef ROLL for (i = 0;i < n; i++) { dy[i] = dy[i] + da*dx[i]; } #endif #ifdef UNROLL m = n % 4; if ( m != 0) { for (i = 0; i < m; i++) dy[i] = dy[i] + da*dx[i]; if (n < 4) return; } for (i = m; i < n; i = i + 4) { dy[i] = dy[i] + da*dx[i]; dy[i+1] = dy[i+1] + da*dx[i+1]; dy[i+2] = dy[i+2] + da*dx[i+2]; dy[i+3] = dy[i+3] + da*dx[i+3]; } #endif #ifdef NEON float cf[4]; float32x4_t x41, y41, c41, r41; float32_t *ptrx1 = (float32_t *)dx; float32_t *ptry1 = (float32_t *)dy; float32_t *ptrc1 = (float32_t *)cf; for (i=0; i<4; i++) { cf[i] = da; } m = n % 4; if ( m != 0) { for (i = 0; i < m; i++) dy[i] = dy[i] + da*dx[i]; if (n < 4) return; } ptrx1 = ptrx1 + m; ptry1 = ptry1 + m; c41 = vld1q_f32(ptrc1); for (i = m; i < n; i=i+4) { x41 = vld1q_f32(ptrx1); y41 = vld1q_f32(ptry1); r41 = vmlaq_f32(y41, x41, c41); vst1q_f32(ptry1, r41); ptrx1 = ptrx1 + 4; ptry1 = ptry1 + 4; } #endif return; }
// Updates the following smoothed Power Spectral Densities (PSD): // - sd : near-end // - se : residual echo // - sx : far-end // - sde : cross-PSD of near-end and residual echo // - sxd : cross-PSD of near-end and far-end // // In addition to updating the PSDs, also the filter diverge state is determined // upon actions are taken. static void SmoothedPSD(AecCore* aec, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], int* extreme_filter_divergence) { // Power estimate smoothing coefficients. const float* ptrGCoh = aec->extended_filter_enabled ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; int i; float sdSum = 0, seSum = 0; const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD); float32x4_t vec_sdSum = vdupq_n_f32(0.0f); float32x4_t vec_seSum = vdupq_n_f32(0.0f); for (i = 0; i + 3 < PART_LEN1; i += 4) { const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]); const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]); const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]); const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]); const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]); const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]); float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]); float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]); float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]); float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0); float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0); float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0); vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1); vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1); vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1); vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15); vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]); vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]); vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]); vst1q_f32(&aec->sd[i], vec_sd); vst1q_f32(&aec->se[i], vec_se); vst1q_f32(&aec->sx[i], vec_sx); { float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0); float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1); vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]); vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]); vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1); vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0); vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]); vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]); vst2q_f32(&aec->sde[i][0], vec_sde); } { float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0); float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1); vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]); vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]); vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1); vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0); vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]); vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]); vst2q_f32(&aec->sxd[i][0], vec_sxd); } vec_sdSum = vaddq_f32(vec_sdSum, vec_sd); vec_seSum = vaddq_f32(vec_seSum, vec_se); } { float32x2_t vec_sdSum_total; float32x2_t vec_seSum_total; // A B C D vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum), vget_high_f32(vec_sdSum)); vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum), vget_high_f32(vec_seSum)); // A+B C+D vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total); vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total); // A+B+C+D A+B+C+D sdSum = vget_lane_f32(vec_sdSum_total, 0); seSum = vget_lane_f32(vec_seSum_total, 0); } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { aec->sd[i] = ptrGCoh[0] * aec->sd[i] + ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); aec->se[i] = ptrGCoh[0] * aec->se[i] + ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); // We threshold here to protect against the ill-effects of a zero farend. // The threshold is not arbitrarily chosen, but balances protection and // adverse interaction with the algorithm's tuning. // TODO(bjornv): investigate further why this is so sensitive. aec->sx[i] = ptrGCoh[0] * aec->sx[i] + ptrGCoh[1] * WEBRTC_SPL_MAX( xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], WebRtcAec_kMinFarendPSD); aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] + ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] + ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] + ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] + ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); sdSum += aec->sd[i]; seSum += aec->se[i]; } // Divergent filter safeguard update. aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; // Signal extreme filter divergence if the error is significantly larger // than the nearend (13 dB). *extreme_filter_divergence = (seSum > (19.95f * sdSum)); }
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) { // a^b = exp2(b * log2(a)) // exp2(x) and log2(x) are calculated using polynomial approximations. float32x4_t log2_a, b_log2_a, a_exp_b; // Calculate log2(x), x = a. { // To calculate log2(x), we decompose x like this: // x = y * 2^n // n is an integer // y is in the [1.0, 2.0) range // // log2(x) = log2(y) + n // n can be evaluated by playing with float representation. // log2(y) in a small range can be approximated, this code uses an order // five polynomial approximation. The coefficients have been // estimated with the Remez algorithm and the resulting // polynomial has a maximum relative error of 0.00086%. // Compute n. // This is done by masking the exponent, shifting it into the top bit of // the mantissa, putting eight into the biased exponent (to shift/ // compensate the fact that the exponent has been shifted in the top/ // fractional part and finally getting rid of the implicit leading one // from the mantissa by substracting it out. const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask); const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); const float32x4_t n = vsubq_f32(vreinterpretq_f32_u32(n_0), vreinterpretq_f32_u32(vec_implicit_leading_one)); // Compute y. const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask); const float32x4_t y = vreinterpretq_f32_u32(vorrq_u32(mantissa, vec_zero_biased_exponent_is_one)); // Approximate log2(y) ~= (y - 1) * pol5(y). // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); const float32x4_t C3 = vdupq_n_f32(-1.2315303f); const float32x4_t C2 = vdupq_n_f32(2.5988452f); const float32x4_t C1 = vdupq_n_f32(-3.3241990f); const float32x4_t C0 = vdupq_n_f32(3.1157899f); float32x4_t pol5_y = C5; pol5_y = vmlaq_f32(C4, y, pol5_y); pol5_y = vmlaq_f32(C3, y, pol5_y); pol5_y = vmlaq_f32(C2, y, pol5_y); pol5_y = vmlaq_f32(C1, y, pol5_y); pol5_y = vmlaq_f32(C0, y, pol5_y); const float32x4_t y_minus_one = vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one)); const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y); // Combine parts. log2_a = vaddq_f32(n, log2_y); } // b * log2(a) b_log2_a = vmulq_f32(b, log2_a); // Calculate exp2(x), x = b * log2(a). { // To calculate 2^x, we decompose x like this: // x = n + y // n is an integer, the value of x - 0.5 rounded down, therefore // y is in the [0.5, 1.5) range // // 2^x = 2^n * 2^y // 2^n can be evaluated by playing with float representation. // 2^y in a small range can be approximated, this code uses an order two // polynomial approximation. The coefficients have been estimated // with the Remez algorithm and the resulting polynomial has a // maximum relative error of 0.17%. // To avoid over/underflow, we reduce the range of input to ]-127, 129]. const float32x4_t max_input = vdupq_n_f32(129.f); const float32x4_t min_input = vdupq_n_f32(-126.99999f); const float32x4_t x_min = vminq_f32(b_log2_a, max_input); const float32x4_t x_max = vmaxq_f32(x_min, min_input); // Compute n. const float32x4_t half = vdupq_n_f32(0.5f); const float32x4_t x_minus_half = vsubq_f32(x_max, half); const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half); // Compute 2^n. const int32x4_t float_exponent_bias = vdupq_n_s32(127); const int32x4_t two_n_exponent = vaddq_s32(x_minus_half_floor, float_exponent_bias); const float32x4_t two_n = vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift)); // Compute y. const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor)); // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f); const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f); const float32x4_t C0 = vdupq_n_f32(1.0017247f); float32x4_t exp2_y = C2; exp2_y = vmlaq_f32(C1, y, exp2_y); exp2_y = vmlaq_f32(C0, y, exp2_y); // Combine parts. a_exp_b = vmulq_f32(exp2_y, two_n); } return a_exp_b; }
static void FilterAdaptationNEON( int num_partitions, int x_fft_buf_block_pos, float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], float e_fft[2][PART_LEN1], float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { float fft[PART_LEN2]; int i; for (i = 0; i < num_partitions; i++) { int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; int pos = i * PART_LEN1; int j; // Check for wrap if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // Process the whole array... for (j = 0; j < PART_LEN; j += 4) { // Load x_fft_buf and e_fft. const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]); const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]); // Calculate the product of conjugate(x_fft_buf) by e_fft. // re(conjugate(a) * b) = aRe * bRe + aIm * bIm // im(conjugate(a) * b)= aRe * bIm - aIm * bRe const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re); const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im); const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im); const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re); // Interleave real and imaginary parts. const float32x4x2_t g_n_h = vzipq_f32(e, f); // Store vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]); vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); } // ... and fixup the first imaginary entry. fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN], e_fft[0][PART_LEN], e_fft[1][PART_LEN]); aec_rdft_inverse_128(fft); memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); // fft scaling { const float scale = 2.0f / PART_LEN2; const float32x4_t scale_ps = vmovq_n_f32(scale); for (j = 0; j < PART_LEN; j += 4) { const float32x4_t fft_ps = vld1q_f32(&fft[j]); const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps); vst1q_f32(&fft[j], fft_scale); } } aec_rdft_forward_128(fft); { const float wt1 = h_fft_buf[1][pos]; h_fft_buf[0][pos + PART_LEN] += fft[1]; for (j = 0; j < PART_LEN; j += 4) { float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]); float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]); const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]); const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]); const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4); wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]); wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]); vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re); vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im); } h_fft_buf[1][pos] = wt1; } } }