// Computes aOutput[i] += aInput[i] * aScale, 16 floats per iteration, with a
// scalar tail loop for the remaining aSize % 16 elements.
void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
                                  float* aOutput, uint32_t aSize) {
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  aSize -= dif;
  unsigned i = 0;
  for (; i < aSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));

    vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
    vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
    vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
    vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));

    vout0 = vmlaq_f32(vout0, vin0, vscale);
    vout1 = vmlaq_f32(vout1, vin1, vscale);
    vout2 = vmlaq_f32(vout2, vin2, vscale);
    vout3 = vmlaq_f32(vout3, vin3, vscale);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aOutput[i] += aInput[i] * aScale;
  }
}
// Scales an interleaved block of aChannelCount channels in place:
// aBlock[i] *= aScale.
void AudioBufferInPlaceScale_NEON(float* aBlock, uint32_t aChannelCount,
                                  float aScale, uint32_t aSize) {
  ASSERT_ALIGNED(aBlock);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t totalSize = aSize * aChannelCount;
  uint32_t dif = totalSize % 16;
  totalSize -= dif;
  uint32_t i = 0;
  for (; i < totalSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aBlock[i] *= aScale;
  }
}
// Per-sample variant: aBlock[i] *= aScale[i].
void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale,
                                  uint32_t aSize) {
  ASSERT_ALIGNED(aBlock);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale0, vscale1, vscale2, vscale3;

  uint32_t dif = aSize % 16;
  uint32_t vectorSize = aSize - dif;
  uint32_t i = 0;
  for (; i < vectorSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));

    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));

    vout0 = vmulq_f32(vin0, vscale0);
    vout1 = vmulq_f32(vin1, vscale1);
    vout2 = vmulq_f32(vin2, vscale2);
    vout3 = vmulq_f32(vin3, vscale3);

    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aBlock[i] *= aScale[i];
  }
}
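/* The ASSERT_ALIGNED and ADDRESS_OF macros used by the functions above (and
 * by the AudioBlockPanStereoToStereo_NEON variants further down) come from
 * the surrounding codebase and are not shown here. A minimal sketch of
 * plausible definitions, assuming 16-byte alignment is what the aligned NEON
 * accesses require: */
#include <assert.h>
#include <stdint.h>
#define ASSERT_ALIGNED(ptr) assert((((uintptr_t)(ptr)) & 15) == 0)
#define ADDRESS_OF(array, index) ((array) + (index))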
// Fixed-point dot products: the first n*len int16 weights are combined with
// the int16 data using widening multiply-accumulates; the results are then
// converted to float and scaled/biased with the float parameters stored
// after the int16 block.
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals,
                      const int n, const int len, const float *istd) {
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;
    weightsf += n * len / 2; // skip the int16 block: sizeof(float) / sizeof(int16_t) == 2

    for (int i = 0; i < n; i += 4) {
        int32x4_t accum0 = { 0, 0, 0, 0 };
        int32x4_t accum1 = accum0;
        int32x4_t accum2 = accum0;
        int32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 8) {
            int16x4x2_t d0 = vld2_s16(data + j);
            int16x4x2_t w0 = vld2_s16(weights);
            int16x4x2_t w1 = vld2_s16(weights + 8);
            int16x4x2_t w2 = vld2_s16(weights + 16);
            int16x4x2_t w3 = vld2_s16(weights + 24);

            accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
            accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

            accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
            accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

            accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
            accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

            accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
            accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);

            weights += 32;
        }

        // Horizontal sums of the four accumulators, packed into one vector.
        int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
        int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
        int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
        sum0 = vpadd_s32(sum0, sum1);
        sum1 = vpadd_s32(sum2, sum3);
        int32x4_t sum = vcombine_s32(sum0, sum1);

        float32x4_t val = vcvtq_f32_s32(sum);
        val = vmulq_f32(val, vld1q_f32(weightsf + i*2));
        val = vmulq_n_f32(val, istd[0]);
        val = vaddq_f32(val, vld1q_f32(weightsf + i*2 + 4));
        vst1q_f32(vals + i, val);
    }
}
/**
 * @brief Vector scale and accumulate: A[] = alpha * B[] + beta * A[].
 *
 * @param dst     [out] the accumulating matrix A.
 * @param src     [in]  the input matrix B.
 * @param alpha   [in]  scale of B.
 * @param beta    [in]  scale of A.
 * @param elemCnt [in]  number of elements to process.
 *
 * @return void.
 */
void neon_axpby(float *dst, const float *src, const float alpha,
                const float beta, const int elemCnt)
{
    int i;
    for (i = 0; i <= elemCnt - 16; i += 16) {
        float32x4_t q0 = vld1q_f32(src + i);
        float32x4_t q1 = vld1q_f32(src + i + 4);
        float32x4_t q2 = vld1q_f32(src + i + 8);
        float32x4_t q3 = vld1q_f32(src + i + 12);

        float32x4_t q4 = vld1q_f32(dst + i);
        float32x4_t q5 = vld1q_f32(dst + i + 4);
        float32x4_t q6 = vld1q_f32(dst + i + 8);
        float32x4_t q7 = vld1q_f32(dst + i + 12);

        q0 = vmulq_n_f32(q0, alpha);
        q1 = vmulq_n_f32(q1, alpha);
        q2 = vmulq_n_f32(q2, alpha);
        q3 = vmulq_n_f32(q3, alpha);

        q0 = vmlaq_n_f32(q0, q4, beta);
        q1 = vmlaq_n_f32(q1, q5, beta);
        q2 = vmlaq_n_f32(q2, q6, beta);
        q3 = vmlaq_n_f32(q3, q7, beta);

        vst1q_f32(dst + i, q0);
        vst1q_f32(dst + i + 4, q1);
        vst1q_f32(dst + i + 8, q2);
        vst1q_f32(dst + i + 12, q3);
    }

    /* scalar tail for the last elemCnt % 16 elements */
    for (; i < elemCnt; i++) {
        float a = src[i] * alpha + dst[i] * beta;
        dst[i] = a;
    }
}
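/* A minimal usage sketch for neon_axpby (buffer names and sizes below are
 * illustrative only): dst = 2*src + 0.5*dst over 100 floats. The vector loop
 * covers the first 96 elements; the scalar tail handles the last 4. */
void neon_axpby_example(float dst[100], const float src[100])
{
    neon_axpby(dst, src, 2.0f, 0.5f, 100);
}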
// Vectorized natural logarithm: y[i] = ln(a[i]) for i in [0, n). The b, z and
// other_params arguments are part of the common kernel signature but unused
// here; simd_ln4f() is the 4-wide log approximation provided elsewhere in
// this library.
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y,
                 VML_FLOAT * z, VML_FLOAT * other_params)
{
    unsigned int m = n >> 2;      // number of full 4-float groups
    unsigned int k = n & 3, j;    // remainder count
    unsigned int l = n & (~3);    // index of the first remainder element

    for (j = 0; j < m; j++) {
        v4sf src = vld1q_f32(a + 4 * j);
        v4sf tem = simd_ln4f(src);
        vst1q_f32(y + 4 * j, tem);
    }

    for (j = 0; j < k; j++) {
        y[j + l] = logf(a[j + l]);
    }
}
int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        float bias = bias_ptr[q];

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _bias = vdupq_n_f32(bias);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vaddq_f32(_p, _bias);
            vst1q_f32(ptr, _outp);

            ptr += 4;
        }
#endif // __ARM_NEON

        for (; remain>0; remain--)
        {
            *ptr = *ptr + bias;
            ptr++;
        }
    }

    return 0;
}
static void neon_vector_mul(const std::vector<float>& vec_a,
                            const std::vector<float>& vec_b,
                            std::vector<float>& vec_result)
{
    assert(vec_a.size() == vec_b.size());
    assert(vec_a.size() == vec_result.size());

    int i = 0;

    // NEON process: 4 floats per iteration
    for (; i < (int)vec_result.size() - 3; i += 4)
    {
        const auto data_a = vld1q_f32(&vec_a[i]);
        const auto data_b = vld1q_f32(&vec_b[i]);

        float* dst_ptr = &vec_result[i];

        const auto data_res = vmulq_f32(data_a, data_b);

        vst1q_f32(dst_ptr, data_res);
    }

    // normal process: scalar tail
    for (; i < (int)vec_result.size(); i++)
    {
        vec_result[i] = vec_a[i] * vec_b[i];
    }
}
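// A minimal usage sketch for neon_vector_mul (values are illustrative only;
// vec_result must be pre-sized to match, and <vector>/<cassert> are assumed
// to be included by the surrounding file):
static void neon_vector_mul_example()
{
    std::vector<float> a(1000, 1.5f), b(1000, 2.0f), r(1000);
    neon_vector_mul(a, b, r); // every r[i] becomes 3.0f
}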
//Kernel function: saxpy
void saxpy_vector(KernelArgs* args) {

    //Setup
    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);
    const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE));

    //Uniforms

    //Fuses

    //Literals

    //Stack variables (result is zero-initialized so the masked blend below
    //never reads an indeterminate value)
    float32x4_t scale, x, y, result = vdupq_n_f32(0.f), var060, var061;

    //Loop over input (assumes args->N is a multiple of 4)
    uint64_t index;
    for(index = 0; index < args->N; index += 4) {

        //Inputs
        scale = vld1q_f32(&args->scale[index]);
        x = vld1q_f32(&args->x[index]);
        y = vld1q_f32(&args->y[index]);

        //Begin kernel logic
        {

            //>>> result = scale * x + y
            var061 = vmulq_f32(scale, x);
            var060 = vaddq_f32(var061, y);
            result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result);

        }
        //End kernel logic

        //Outputs
        vst1q_f32(&args->result[index], result);

    }
}
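/* Scalar reference for the generated kernel above, as a sketch assuming the
 * same KernelArgs layout. Note that, like the vector loop, it treats scale
 * as a per-element array (classic saxpy would use a single scalar), but it
 * handles any N rather than only multiples of 4: */
void saxpy_scalar(KernelArgs* args) {
    for (uint64_t index = 0; index < args->N; index++) {
        args->result[index] = args->scale[index] * args->x[index] + args->y[index];
    }
}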
// Dot products of `data` (length len) against n filters whose weights are
// stored interleaved in blocks of four rows; the four sums are scaled by
// istd[0] and biased.
void dotProd_neon(const float *data, const float *weights, float *vals,
                  const int n, const int len, const float *istd) {
    for (int i = 0; i < n; i += 4) {
        float32x4_t accum0 = { 0.0f, 0.0f, 0.0f, 0.0f };
        float32x4_t accum1 = accum0;
        float32x4_t accum2 = accum0;
        float32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 4) {
            float32x4_t d0 = vld1q_f32(data + j);
            float32x4_t d1 = d0;
            float32x4_t d2 = d0;
            float32x4_t d3 = d0;

            float32x4_t w0 = vld1q_f32(weights);
            float32x4_t w1 = vld1q_f32(weights + 4);
            float32x4_t w2 = vld1q_f32(weights + 8);
            float32x4_t w3 = vld1q_f32(weights + 12);

            accum0 = vaddq_f32(accum0, vmulq_f32(d0, w0));
            accum1 = vaddq_f32(accum1, vmulq_f32(d1, w1));
            accum2 = vaddq_f32(accum2, vmulq_f32(d2, w2));
            accum3 = vaddq_f32(accum3, vmulq_f32(d3, w3));

            weights += 16;
        }

        // Horizontal sums of the four accumulators, packed into one vector.
        float32x2_t sum0 = vpadd_f32(vget_low_f32(accum0), vget_high_f32(accum0));
        float32x2_t sum1 = vpadd_f32(vget_low_f32(accum1), vget_high_f32(accum1));
        float32x2_t sum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        float32x2_t sum3 = vpadd_f32(vget_low_f32(accum3), vget_high_f32(accum3));
        sum0 = vpadd_f32(sum0, sum1);
        sum1 = vpadd_f32(sum2, sum3);
        float32x4_t sum = vcombine_f32(sum0, sum1);

        sum = vmulq_n_f32(sum, istd[0]);
        // NOTE: `weights` has already been advanced past this filter group by
        // the inner loop, so this bias load relies on the caller's weight
        // layout placing the bias block accordingly (in the scalar reference
        // the bias for filter i lives at weightsBase[n*len + i]).
        sum = vaddq_f32(sum, vld1q_f32(weights + n*len + i));
        vst1q_f32(vals + i, sum);
    }
}
/* f32x4 add: C = A + B, elementwise over a Row x Col matrix. */
void mw_neon_mm_add_f32x4(float * A, int Row, int Col, float * B, float * C)
{
    float32x4_t neon_a, neon_b, neon_c;
    int size = Row * Col;
    int i = 0;
    int k = 0;

    for (i = 4; i <= size; i += 4)
    {
        k = i - 4;
        neon_a = vld1q_f32(A + k);
        neon_b = vld1q_f32(B + k);
        neon_c = vaddq_f32(neon_a, neon_b);
        vst1q_f32(C + k, neon_c);
    }

    /* After the loop, i - 4 is the index of the first element the vector
     * loop did not process; handle the remaining size % 4 elements. */
    k = i - 4;
    for (i = 0; i < size % 4; i++)
    {
        C[k + i] = A[k + i] + B[k + i];
    }
}
/* f32x4 mv mul: C = A * B, where A is a column-major Row x T matrix and B is
 * a T-element vector. Assumes Row and T are multiples of 4. */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
    int i = 0;
    int k = 0;

    float32x4_t neon_b, neon_c;
    float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
    float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

    for (i = 0; i < Row; i += 4)
    {
        neon_c = vmovq_n_f32(0);

        for (k = 0; k < T; k += 4)
        {
            /* column-major A: element (i, k) lives at k * Row + i, and the
             * following columns are Row floats apart */
            int j = k * Row + i;

            neon_a0 = vld1q_f32(A + j);
            neon_a1 = vld1q_f32(A + j + Row);
            neon_a2 = vld1q_f32(A + j + 2 * Row);
            neon_a3 = vld1q_f32(A + j + 3 * Row);

            neon_b = vld1q_f32(B + k);
            neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
            neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
            neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
            neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

            neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
            neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
            neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
            neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
        }

        vst1q_f32(C + i, neon_c);
    }
}
static void FilterAdaptationNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  float fft[PART_LEN2];
  int i;
  for (i = 0; i < num_partitions; i++) {
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    int j;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load x_fft_buf and e_fft.
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]);
      const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]);
      // Calculate the product of conjugate(x_fft_buf) by e_fft.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b) = aRe * bIm - aIm * bRe
      const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re);
      const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im);
      const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re);
      // Interleave real and imaginary parts.
      const float32x4x2_t g_n_h = vzipq_f32(e, f);
      // Store
      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
    }
    // ... and fixup the first imaginary entry.
    fft[1] =
        MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN],
              e_fft[0][PART_LEN], e_fft[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      const float scale = 2.0f / PART_LEN2;
      const float32x4_t scale_ps = vmovq_n_f32(scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
        vst1q_f32(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      const float wt1 = h_fft_buf[1][pos];
      h_fft_buf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
        float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);

        vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re);
        vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im);
      }
      h_fft_buf[1][pos] = wt1;
    }
  }
}
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL = vmovq_n_f32(aGainL);
  float32x4_t vscaleR = vmovq_n_f32(aGainR);

  if (aIsOnTheLeft) {
    // Panned left: L = inL + inR * gainL, R = inR * gainR.
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmulq_f32(vinR0, vscaleR);
      voutR1 = vmulq_f32(vinR1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  } else {
    // Panned right: L = inL * gainL, R = inR + inL * gainR.
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmulq_f32(vinL0, vscaleL);
      voutL1 = vmulq_f32(vinL1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  }
}
static void OverdriveAndSuppressNEON(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb =
        vmulq_f32(vec_weightCurve, vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl =
        vmulq_f32(vec_one_weightCurve, vec_hNl);
    const uint32x4_t vec_if0 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    {
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
      const float32x4_t vec_overDriveSm_overDriveCurve =
          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
      vst1q_f32(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
      vst1q_f32(&efw[0][i], vec_efw_re);
      vst1q_f32(&efw[1][i], vec_efw_im);
    }
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }

    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
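/* vpowq_f32 used above is not a standard NEON intrinsic; WebRTC defines it
 * alongside this function using log2/exp2 polynomial approximations. A slow
 * but exact lane-by-lane stand-in with the same signature, useful only for
 * testing (this sketch assumes per-lane powf semantics are acceptable): */
#include <math.h>
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  float base[4], expn[4], out[4];
  vst1q_f32(base, a);
  vst1q_f32(expn, b);
  for (int k = 0; k < 4; ++k) out[k] = powf(base[k], expn[k]);
  return vld1q_f32(out);
}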
static void rftbsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);

  a[1] = -a[1];

  // Vectorized code (four at once).
  // Note: commented numbers are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);

    // Calculate 'x'.
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,

    // Calculate product into 'y' (matching the scalar tail below).
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,

    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] = yi - a[k2 + 1];
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
    // Shuffle in right order and store.
    //   2,   3,   4,   5,   6,   7,   8,   9,
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    //   2,   4,   6,   8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    //   3,   5,   7,   9,
    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
    //   2,   3,   4,   5,   6,   7,   8,   9,
    vst2q_f32(&a[0 + j2], a_j2_p);
    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }

  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}
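/* rftbsub_128_neon above relies on a reverse_order_f32x4 helper that is not
 * shown in this collection. A minimal sketch of the usual implementation,
 * which reverses within each 64-bit half and then swaps the halves: */
static inline float32x4_t reverse_order_f32x4(float32x4_t in) {
  // {a, b, c, d} -> vrev64q: {b, a, d, c} -> swap halves: {d, c, b, a}
  const float32x4_t rev = vrev64q_f32(in);
  return vcombine_f32(vget_high_f32(rev), vget_low_f32(rev));
}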
static void thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = _src.ptr<float>();
    float* dst = _dst.ptr<float>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        switch( type )
        {
        case THRESH_TRUNC:
            if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
            if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
            if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
    }
#endif

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_cmpgt_ps( v0, thresh4 );
                    v1 = _mm_cmpgt_ps( v1, thresh4 );
                    v0 = _mm_and_ps( v0, maxval4 );
                    v1 = _mm_and_ps( v1, maxval4 );
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);
            uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

            for( ; j <= roi.width - 4; j += 4 )
            {
                float32x4_t v_src = vld1q_f32(src + j);
                uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
            }
#endif

            for( ; j < roi.width; j++ )
                dst[j] = src[j] > thresh ? maxval : 0;
        }
        break;

    case THRESH_BINARY_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_cmple_ps( v0, thresh4 );
                    v1 = _mm_cmple_ps( v1, thresh4 );
                    v0 = _mm_and_ps( v0, maxval4 );
                    v1 = _mm_and_ps( v1, maxval4 );
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);
            uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

            for( ; j <= roi.width - 4; j += 4 )
            {
                float32x4_t v_src = vld1q_f32(src + j);
                uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
            }
#endif

            for( ; j < roi.width; j++ )
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
        break;

    case THRESH_TRUNC:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_min_ps( v0, thresh4 );
                    v1 = _mm_min_ps( v1, thresh4 );
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);

            for( ; j <= roi.width - 4; j += 4 )
                vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
#endif

            for( ; j < roi.width; j++ )
                dst[j] = std::min(src[j], thresh);
        }
        break;

    case THRESH_TOZERO:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                    v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float32x4_t v_src = vld1q_f32(src + j);
                uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
                                             vreinterpretq_u32_f32(v_src));
                vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
            }
#endif

            for( ; j < roi.width; j++ )
            {
                float v = src[j];
                dst[j] = v > thresh ? v : 0;
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                    v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float32x4_t v_src = vld1q_f32(src + j);
                uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
                                             vreinterpretq_u32_f32(v_src));
                vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
            }
#endif

            for( ; j < roi.width; j++ )
            {
                float v = src[j];
                dst[j] = v <= thresh ? v : 0;
            }
        }
        break;

    default:
        return CV_Error( CV_StsBadArg, "" );
    }
}
inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); }
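// The store overload above pairs naturally with a load overload in the same
// style; a sketch, assuming the surrounding header defines f32 and the rest
// of the vld1q/vst1q wrapper family:
inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }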
// use ARM Neon extensions (unrolled loop)
// NOTE: unrolling doesn't show any appreciable performance difference
void dotprod_cccf_execute_neon4(dotprod_cccf _q,
                                float complex * _x,
                                float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // first cut: ...
    float32x4_t v0, v1, v2, v3;     // input vectors
    float32x4_t hi0, hi1, hi2, hi3; // coefficients vectors (real)
    float32x4_t hq0, hq1, hq2, hq3; // coefficients vectors (imag)
    float32x4_t ci0, ci1, ci2, ci3; // output multiplications (v * hi)
    float32x4_t cq0, cq1, cq2, cq3; // output multiplications (v * hq)

    // load zeros into sum registers
    float zeros[4] = {0,0,0,0};
    float32x4_t sumi = vld1q_f32(zeros);
    float32x4_t sumq = vld1q_f32(zeros);

    // r = 4*floor(n/16)
    unsigned int r = (n >> 4) << 2;

    //
    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = vld1q_f32(&x[4*i+0]);
        v1 = vld1q_f32(&x[4*i+4]);
        v2 = vld1q_f32(&x[4*i+8]);
        v3 = vld1q_f32(&x[4*i+12]);

        // load real coefficients into registers (aligned)
        hi0 = vld1q_f32(&_q->hi[4*i+0]);
        hi1 = vld1q_f32(&_q->hi[4*i+4]);
        hi2 = vld1q_f32(&_q->hi[4*i+8]);
        hi3 = vld1q_f32(&_q->hi[4*i+12]);

        // load imaginary coefficients into registers (aligned)
        hq0 = vld1q_f32(&_q->hq[4*i+0]);
        hq1 = vld1q_f32(&_q->hq[4*i+4]);
        hq2 = vld1q_f32(&_q->hq[4*i+8]);
        hq3 = vld1q_f32(&_q->hq[4*i+12]);

        // compute parallel multiplications (real)
        ci0 = vmulq_f32(v0, hi0);
        ci1 = vmulq_f32(v1, hi1);
        ci2 = vmulq_f32(v2, hi2);
        ci3 = vmulq_f32(v3, hi3);

        // compute parallel multiplications (imag)
        cq0 = vmulq_f32(v0, hq0);
        cq1 = vmulq_f32(v1, hq1);
        cq2 = vmulq_f32(v2, hq2);
        cq3 = vmulq_f32(v3, hq3);

        // accumulate
        sumi = vaddq_f32(sumi, ci0);    sumq = vaddq_f32(sumq, cq0);
        sumi = vaddq_f32(sumi, ci1);    sumq = vaddq_f32(sumq, cq1);
        sumi = vaddq_f32(sumi, ci2);    sumq = vaddq_f32(sumq, cq2);
        sumi = vaddq_f32(sumi, ci3);    sumq = vaddq_f32(sumq, cq3);
    }

    // unload
    float wi[4];
    float wq[4];
    vst1q_f32(wi, sumi);
    vst1q_f32(wq, sumq);

    // fold down (add/sub)
    float complex total =
        ((wi[0] - wq[1]) + (wi[2] - wq[3])) +
        ((wi[1] + wq[0]) + (wi[3] + wq[2])) * _Complex_I;

    // cleanup (note: n _must_ be even)
    // TODO : clean this method up
    for (i=2*r; i<_q->n; i++) {
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );
    }

    // set return value
    *_y = total;
}
// Updates the following smoothed Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is
// determined and the corresponding actions are taken.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh = aec->extended_filter_enabled
      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);

    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }

  {
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
                                vget_high_f32(vec_sdSum));
    vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
                                vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] = ptrGCoh[0] * aec->sx[i] +
                 ptrGCoh[1] * WEBRTC_SPL_MAX(
                     xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
                     WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] +
                     ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] +
                     ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] +
                     ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] +
                     ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
void nnp_conv1x1_upto_4x4__neon(
    uint32_t input_channels_subblock_size,
    uint32_t output_channels_subblock_size,
    size_t input_channels,
    size_t image_size,
    const float* input,
    const float* kernel,
    float* output)
{
    const float*restrict input0 = input;
    const float*restrict input1 =
        input_channels_subblock_size > 1 ? input0 + image_size : input0;
    const float*restrict input2 =
        input_channels_subblock_size > 2 ? input1 + image_size : input1;
    const float*restrict input3 =
        input_channels_subblock_size > 3 ? input2 + image_size : input2;

    const float*restrict kernel0 = kernel;
    const float*restrict kernel1 =
        output_channels_subblock_size > 1 ? kernel0 + input_channels : kernel0;
    const float*restrict kernel2 =
        output_channels_subblock_size > 2 ? kernel1 + input_channels : kernel1;
    const float*restrict kernel3 =
        output_channels_subblock_size > 3 ? kernel2 + input_channels : kernel2;

    float32x4_t vkernel0x = vld1q_dup_f32(kernel0);
    float32x4_t vkernel1x = vld1q_dup_f32(kernel1);
    float32x4_t vkernel2x = vld1q_dup_f32(kernel2);
    float32x4_t vkernel3x = vld1q_dup_f32(kernel3);
    if (input_channels_subblock_size > 1) {
        vkernel0x = vld1q_lane_f32(kernel0 + 1, vkernel0x, 1);
        vkernel1x = vld1q_lane_f32(kernel1 + 1, vkernel1x, 1);
        vkernel2x = vld1q_lane_f32(kernel2 + 1, vkernel2x, 1);
        vkernel3x = vld1q_lane_f32(kernel3 + 1, vkernel3x, 1);
        if (input_channels_subblock_size > 2) {
            vkernel0x = vld1q_lane_f32(kernel0 + 2, vkernel0x, 2);
            vkernel1x = vld1q_lane_f32(kernel1 + 2, vkernel1x, 2);
            vkernel2x = vld1q_lane_f32(kernel2 + 2, vkernel2x, 2);
            vkernel3x = vld1q_lane_f32(kernel3 + 2, vkernel3x, 2);
            if (input_channels_subblock_size > 3) {
                vkernel0x = vld1q_lane_f32(kernel0 + 3, vkernel0x, 3);
                vkernel1x = vld1q_lane_f32(kernel1 + 3, vkernel1x, 3);
                vkernel2x = vld1q_lane_f32(kernel2 + 3, vkernel2x, 3);
                vkernel3x = vld1q_lane_f32(kernel3 + 3, vkernel3x, 3);
            }
        }
    }

    float*restrict output0 = output;
    float*restrict output1 =
        output_channels_subblock_size > 1 ? output0 + image_size : output0;
    float*restrict output2 =
        output_channels_subblock_size > 2 ? output1 + image_size : output1;
    float*restrict output3 =
        output_channels_subblock_size > 3 ? output2 + image_size : output2;

    while (image_size >= 4) {
        float32x4_t voutput0 = vld1q_f32(output0);
        float32x4_t voutput1 = vld1q_f32(output1);
        float32x4_t voutput2 = vld1q_f32(output2);
        float32x4_t voutput3 = vld1q_f32(output3);

        const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4;
        voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
        voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
        voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
        voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

        if (input_channels_subblock_size > 1) {
            const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4;
            voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
            voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
            voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
            voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

            if (input_channels_subblock_size > 2) {
                const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4;
                voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
                voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
                voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
                voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

                if (input_channels_subblock_size > 3) {
                    const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4;
                    voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
                    voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
                    voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
                    voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
                }
            }
        }

        vst1q_f32(output0, voutput0); output0 += 4;
        if (output_channels_subblock_size > 1) {
            vst1q_f32(output1, voutput1); output1 += 4;
            if (output_channels_subblock_size > 2) {
                vst1q_f32(output2, voutput2); output2 += 4;
                if (output_channels_subblock_size > 3) {
                    vst1q_f32(output3, voutput3); output3 += 4;
                }
            }
        }

        image_size -= 4;
    }
    if (image_size >= 2) {
        float32x2_t voutput0 = vld1_f32(output0);
        float32x2_t voutput1 = vld1_f32(output1);
        float32x2_t voutput2 = vld1_f32(output2);
        float32x2_t voutput3 = vld1_f32(output3);

        const float32x2_t vinput0 = vld1_f32(input0); input0 += 2;
        voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
        voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
        voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
        voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

        if (input_channels_subblock_size > 1) {
            const float32x2_t vinput1 = vld1_f32(input1); input1 += 2;
            voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
            voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
            voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
            voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

            if (input_channels_subblock_size > 2) {
                const float32x2_t vinput2 = vld1_f32(input2); input2 += 2;
                voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
                voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
                voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
                voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

                if (input_channels_subblock_size > 3) {
                    const float32x2_t vinput3 = vld1_f32(input3); input3 += 2;
                    voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
                    voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
                    voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
                    voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
                }
            }
        }

        vst1_f32(output0, voutput0); output0 += 2;
        if (output_channels_subblock_size > 1) {
            vst1_f32(output1, voutput1); output1 += 2;
            if (output_channels_subblock_size > 2) {
                vst1_f32(output2, voutput2); output2 += 2;
                if (output_channels_subblock_size > 3) {
                    vst1_f32(output3, voutput3); output3 += 2;
                }
            }
        }

        image_size -= 2;
    }
    if (image_size != 0) {
        float32x2_t voutput0 = vld1_dup_f32(output0);
        float32x2_t voutput1 = vld1_dup_f32(output1);
        float32x2_t voutput2 = vld1_dup_f32(output2);
        float32x2_t voutput3 = vld1_dup_f32(output3);

        const float32x2_t vinput0 = vld1_dup_f32(input0);
        voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
        voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
        voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
        voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

        if (input_channels_subblock_size > 1) {
            const float32x2_t vinput1 = vld1_dup_f32(input1);
            voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
            voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
            voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
            voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

            if (input_channels_subblock_size > 2) {
                const float32x2_t vinput2 = vld1_dup_f32(input2);
                voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
                voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
                voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
                voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

                if (input_channels_subblock_size > 3) {
                    const float32x2_t vinput3 = vld1_dup_f32(input3);
                    voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
                    voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
                    voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
                    voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
                }
            }
        }

        vst1_lane_f32(output0, voutput0, 0);
        if (output_channels_subblock_size > 1) {
            vst1_lane_f32(output1, voutput1, 0);
            if (output_channels_subblock_size > 2) {
                vst1_lane_f32(output2, voutput2, 0);
                if (output_channels_subblock_size > 3) {
                    vst1_lane_f32(output3, voutput3, 0);
                }
            }
        }
    }
}
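/* The vmuladdq_lane{0,1}_f32 and vmuladd_lane{0,1}_f32 helpers used above
 * (and in nnp_conv1x1_only_4x4__neon below) are NNPACK-internal wrappers, not
 * standard intrinsics. A plausible sketch of what they expand to on ARMv7,
 * namely a multiply-accumulate by one lane of a float32x2_t (on AArch64 the
 * real versions would likely use the fused vfmaq_lane_f32/vfma_lane_f32): */
static inline float32x4_t vmuladdq_lane0_f32(float32x4_t acc, float32x4_t x, float32x2_t c) {
    return vmlaq_lane_f32(acc, x, c, 0);
}
static inline float32x4_t vmuladdq_lane1_f32(float32x4_t acc, float32x4_t x, float32x2_t c) {
    return vmlaq_lane_f32(acc, x, c, 1);
}
static inline float32x2_t vmuladd_lane0_f32(float32x2_t acc, float32x2_t x, float32x2_t c) {
    return vmla_lane_f32(acc, x, c, 0);
}
static inline float32x2_t vmuladd_lane1_f32(float32x2_t acc, float32x2_t x, float32x2_t c) {
    return vmla_lane_f32(acc, x, c, 1);
}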
void sEnv_process(HvBase *_c, SignalEnvelope *o, hv_bInf_t bIn,
    void (*sendMessage)(HvBase *, int, const HvMessage *)) {
#if HV_SIMD_AVX
  _mm256_stream_ps(o->buffer+o->numSamplesInBuffer, _mm256_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m256 sum = _mm256_setzero_ps();
    while (n4) {
      __m256 x = _mm256_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m256 h = _mm256_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm256_mul_ps(x, h);
      sum = _mm256_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm256_hadd_ps(sum,sum); // horizontal sum
    sum = _mm256_hadd_ps(sum,sum);
    sEnv_sendMessage(_c, o, sum[0]+sum[4], sendMessage); // updates numSamplesInBuffer
  }
#elif HV_SIMD_SSE
  _mm_stream_ps(o->buffer+o->numSamplesInBuffer, _mm_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m128 sum = _mm_setzero_ps();
    while (n4) {
      __m128 x = _mm_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m128 h = _mm_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm_mul_ps(x, h);
      sum = _mm_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm_hadd_ps(sum,sum); // horizontal sum
    sum = _mm_hadd_ps(sum,sum);
    sEnv_sendMessage(_c, o, sum[0], sendMessage);
  }
#elif HV_SIMD_NEON
  vst1q_f32(o->buffer+o->numSamplesInBuffer, vmulq_f32(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    float32x4_t sum = vdupq_n_f32(0.0f);
    while (n4) {
      float32x4_t x = vld1q_f32(o->buffer + n4 - HV_N_SIMD);
      float32x4_t h = vld1q_f32(o->hanningWeights + n4 - HV_N_SIMD);
      x = vmulq_f32(x, h);
      sum = vaddq_f32(sum, x);
      n4 -= HV_N_SIMD;
    }
    sEnv_sendMessage(_c, o, sum[0]+sum[1]+sum[2]+sum[3], sendMessage);
  }
#else // HV_SIMD_NONE
  o->buffer[o->numSamplesInBuffer] = (bIn*bIn);
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    float sum = 0.0f;
    for (int i = 0; i < o->windowSize; ++i) {
      sum += (o->hanningWeights[i] * o->buffer[i]);
    }
    sEnv_sendMessage(_c, o, sum, sendMessage);
  }
#endif
}
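// The lane subscripting used above (sum[0] + sum[1] + sum[2] + sum[3]) relies
// on the GCC/Clang vector-extension syntax. A portable NEON horizontal sum,
// shown here as a reference sketch (the helper name is illustrative):
static inline float hv_hsum_f32x4(float32x4_t v) {
  float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v)); // {a+c, b+d}
  s = vpadd_f32(s, s);                                         // {a+b+c+d, a+b+c+d}
  return vget_lane_f32(s, 0);
}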
void meanStdDev(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
    f64 fsum = 0.0f, fsqsum = 0.0f;

    f32 arsum[8];
    uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
    float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        while (j < roiw4)
        {
            size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
            v_sum = v_zero;
            v_sqsum = v_zero_f;

            for ( ; j + 16 < blockSize ; j += 16)
            {
                internal::prefetch(src + j);
                uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

                // 0
                uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
                uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
                float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);

                // 1
                v_srclo = vmovl_u16(vget_low_u16(v_src1));
                v_srchi = vmovl_u16(vget_high_u16(v_src1));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                v_srclo_f = vcvtq_f32_u32(v_srclo);
                v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
            }

            for ( ; j < blockSize; j += 4)
            {
                uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
                float32x4_t v_src_f = vcvtq_f32_u32(v_src);

                v_sum = vaddq_u32(v_sum, v_src);
                v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
            }

            vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
            vst1q_f32(arsum + 4, v_sqsum);

            fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
            fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
        }

        // collect a few last elements in the current row
        for ( ; j < size.width; ++j)
        {
            f32 srcval = src[j];
            fsum += srcval;
            fsqsum += srcval * srcval;
        }
    }

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = mean;
    if (pStdDev)
        *pStdDev = stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
void AudioBlockPanStereoToStereo_NEON(
    const float aInputL[WEBAUDIO_BLOCK_SIZE],
    const float aInputR[WEBAUDIO_BLOCK_SIZE],
    float aGainL[WEBAUDIO_BLOCK_SIZE], float aGainR[WEBAUDIO_BLOCK_SIZE],
    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aGainL);
  ASSERT_ALIGNED(aGainR);
  ASSERT_ALIGNED(aIsOnTheLeft);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL0, vscaleL1;
  float32x4_t vscaleR0, vscaleR1;
  float32x4_t onleft0, onleft1, notonleft0, notonleft1;

  float32x4_t zero = vmovq_n_f32(0);
  uint8x8_t isOnTheLeft;

  // Although MSVC throws uninitialized value warning for voutL0 and voutL1,
  // since we fill all lanes by vsetq_lane_f32, we can ignore it. But to avoid
  // compiler warning, set zero.
  voutL0 = zero;
  voutL1 = zero;

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
    vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
    vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

    vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
    vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

    vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
    vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));

    vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
    vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));

    // Load output with boolean "on the left" values. This assumes that
    // bools are stored as a single byte.
    isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3);

    // Convert the boolean values into masks by setting all bits to 1
    // if true.
    voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero);
    voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero);

    // The right output masks are the same as the left masks
    voutR0 = voutL0;
    voutR1 = voutL1;

    // Calculate left channel assuming isOnTheLeft
    onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
    onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL1);

    // Calculate left channel assuming not isOnTheLeft
    notonleft0 = vmulq_f32(vinL0, vscaleL0);
    notonleft1 = vmulq_f32(vinL1, vscaleL1);

    // Write results using previously stored masks
    voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0);
    voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1);

    // Calculate right channel assuming isOnTheLeft
    onleft0 = vmulq_f32(vinR0, vscaleR0);
    onleft1 = vmulq_f32(vinR1, vscaleR1);

    // Calculate right channel assuming not isOnTheLeft
    notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
    notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1);

    // Write results using previously stored masks
    voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0);
    voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1);

    vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
    vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
    vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
    vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
  }
}
// use ARM Neon extensions
//
// (a + jb)(c + jd) = (ac - bd) + j(ad + bc)
//
// mm_x  = { x[0].real, x[0].imag, x[1].real, x[1].imag }
// mm_hi = { h[0].real, h[0].real, h[1].real, h[1].real }
// mm_hq = { h[0].imag, h[0].imag, h[1].imag, h[1].imag }
//
// mm_y0 = mm_x * mm_hi
//       = { x[0].real * h[0].real,
//           x[0].imag * h[0].real,
//           x[1].real * h[1].real,
//           x[1].imag * h[1].real };
//
// mm_y1 = mm_x * mm_hq
//       = { x[0].real * h[0].imag,
//           x[0].imag * h[0].imag,
//           x[1].real * h[1].imag,
//           x[1].imag * h[1].imag };
//
void dotprod_cccf_execute_neon(dotprod_cccf _q,
                               float complex * _x,
                               float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // temporary buffers
    float32x4_t v;   // input vector
    float32x4_t hi;  // coefficients vector (real)
    float32x4_t hq;  // coefficients vector (imag)
    float32x4_t ci;  // output multiplication (v * hi)
    float32x4_t cq;  // output multiplication (v * hq)

    // output accumulators
    float zeros[4] = {0,0,0,0};
    float32x4_t sumi = vld1q_f32(zeros);
    float32x4_t sumq = vld1q_f32(zeros);

    // t = 4*(floor(n/4))
    unsigned int t = (n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        // {x[0].real, x[0].imag, x[1].real, x[1].imag}
        v = vld1q_f32(&x[i]);

        // load coefficients into register (aligned)
        // {hi[0].real, hi[0].imag, hi[1].real, hi[1].imag}
        // {hq[0].real, hq[0].imag, hq[1].real, hq[1].imag}
        hi = vld1q_f32(&_q->hi[i]);
        hq = vld1q_f32(&_q->hq[i]);

        // compute parallel multiplications
        ci = vmulq_f32(v, hi);
        cq = vmulq_f32(v, hq);

        // parallel addition
        sumi = vaddq_f32(sumi, ci);
        sumq = vaddq_f32(sumq, cq);
    }

    // unload and combine
    float wi[4];
    float wq[4];
    vst1q_f32(wi, sumi);
    vst1q_f32(wq, sumq);

    // fold down (add/sub)
    float complex total =
        ((wi[0] - wq[1]) + (wi[2] - wq[3])) +
        ((wi[1] + wq[0]) + (wi[3] + wq[2])) * _Complex_I;

    // cleanup
    for (i=t/2; i<_q->n; i++)
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );

    // set return value
    *_y = total;
}
int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // squared values with local_size padding
    Mat square_blob;
    square_blob.create(w, h, channels);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* outptr = square_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vmulq_f32(_p, _p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
#endif // __ARM_NEON

        for (; remain>0; remain--)
        {
            *outptr = *ptr * *ptr;

            ptr++;
            outptr++;
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        Mat square_sum;
        square_sum.create(w, h, channels);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        const float alpha_div_size = alpha / local_size;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // square sum
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                float* ssptr = square_sum.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                for (; nn>0; nn--)
                {
                    float32x4_t _sp = vld1q_f32(sptr);
                    float32x4_t _ssp = vld1q_f32(ssptr);
                    _ssp = vaddq_f32(_ssp, _sp);
                    vst1q_f32(ssptr, _ssp);

                    sptr += 4;
                    ssptr += 4;
                }
#endif // __ARM_NEON

                for (; remain>0; remain--)
                {
                    *ssptr += *sptr;
                    sptr++;
                    ssptr++;
                }
            }

            float* ptr = bottom_top_blob.channel(q);
            float* ssptr = square_sum.channel(q);

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _bias = vdupq_n_f32(bias);
            float32x4_t _ads = vdupq_n_f32(alpha_div_size);
            float32x4_t _mb = vdupq_n_f32(-beta);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _ssp = vld1q_f32(ssptr);
                _ssp = vmulq_f32(_ssp, _ads);
                _ssp = vaddq_f32(_ssp, _bias);
                _ssp = pow_ps(_ssp, _mb);
                _p = vmulq_f32(_p, _ssp);
                vst1q_f32(ptr, _p);

                ssptr += 4;
                ptr += 4;
            }
#endif // __ARM_NEON

            for (; remain>0; remain--)
            {
                *ptr = *ptr * pow(bias + alpha_div_size * *ssptr, -beta);

                ssptr++;
                ptr++;
            }
        }
    }

    // NOTE: the NormRegion_WITHIN_CHANNEL path of the original implementation
    // is not part of this excerpt.
    return 0;
}
// Converts a block of 48 bytes (four rows of 12 pixels, rows spaced
// pitch * 2 apart) into 48 floats at p.
void byte2float48_neon(const uint8_t *t, const int pitch, float *p) {
    uint16x8_t m0, m1, m2, m3, m4, m5;
    // Zero-initialized so the lane loads below never read an indeterminate
    // value.
    uint32x2_t temp1 = vdup_n_u32(0), temp4 = vdup_n_u32(0);

    m0 = vmovl_u8(vld1_u8(t));
    temp1 = vld1_lane_u32((const uint32_t *)(t + 8), temp1, 0);
    temp1 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp1, 1);
    m1 = vmovl_u8(vreinterpret_u8_u32(temp1));
    m2 = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

    t += pitch * 4;

    m3 = vmovl_u8(vld1_u8(t));
    temp4 = vld1_lane_u32((const uint32_t *)(t + 8), temp4, 0);
    temp4 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp4, 1);
    m4 = vmovl_u8(vreinterpret_u8_u32(temp4));
    m5 = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

    vst1q_f32(p, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m0))));
    vst1q_f32(p + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m0))));
    vst1q_f32(p + 8, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m1))));
    vst1q_f32(p + 12, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m1))));
    vst1q_f32(p + 16, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m2))));
    vst1q_f32(p + 20, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m2))));
    vst1q_f32(p + 24, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m3))));
    vst1q_f32(p + 28, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m3))));
    vst1q_f32(p + 32, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m4))));
    vst1q_f32(p + 36, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m4))));
    vst1q_f32(p + 40, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m5))));
    vst1q_f32(p + 44, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m5))));
}
void phase(const Size2D &size,
           const f32 * src0Base, ptrdiff_t src0Stride,
           const f32 * src1Base, ptrdiff_t src1Stride,
           f32 * dstBase, ptrdiff_t dstStride,
           f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(scale)
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        for (; j < roiw8; j += 8)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);

            float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4);
            float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4);

            float32x4_t v_dst32f;
            // 0
            FASTATAN2VECTOR(v_src10, v_src00, v_dst32f)
            vst1q_f32(dst + j, v_dst32f);
            // 1
            FASTATAN2VECTOR(v_src11, v_src01, v_dst32f)
            vst1q_f32(dst + j + 4, v_dst32f);
        }
        if(j + 4 <= size.width)
        {
            float32x4_t v_src0 = vld1q_f32(src0 + j);
            float32x4_t v_src1 = vld1q_f32(src1 + j);

            float32x4_t v_dst32f;
            FASTATAN2VECTOR(v_src1, v_src0, v_dst32f)
            vst1q_f32(dst + j, v_dst32f);
            j += 4;
        }

        for (; j < size.width; j++)
        {
            f32 a;
            FASTATAN2SCALAR(src1[j], src0[j], a)
            dst[j] = a;
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
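/* The FASTATAN2* macros used above are internal fast-atan2 approximations
 * defined elsewhere in this library. Slow but exact stand-ins with the same
 * calling shape, for reference only (the names and the meaning of `scale`
 * here are assumptions): */
#include <math.h>
#define FASTATAN2CONST(scale) const f32 _atan2_scale = (scale);
#define FASTATAN2SCALAR(y, x, a) { (a) = atan2f((y), (x)) * _atan2_scale; }
#define FASTATAN2VECTOR(vy, vx, vdst) { \
    f32 _ty[4], _tx[4], _td[4]; \
    vst1q_f32(_ty, (vy)); vst1q_f32(_tx, (vx)); \
    for (int _k = 0; _k < 4; ++_k) _td[_k] = atan2f(_ty[_k], _tx[_k]) * _atan2_scale; \
    (vdst) = vld1q_f32(_td); }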
void nnp_conv1x1_only_4x4__neon(
    size_t input_channels,
    size_t image_size,
    const float* input,
    const float* kernel,
    float* output)
{
    const float* input0 = input;
    const float* input1 = input0 + image_size;
    const float* input2 = input1 + image_size;
    const float* input3 = input2 + image_size;

    const float32x4_t vkernel0x = vld1q_f32(kernel);
    kernel += input_channels;
    const float32x4_t vkernel1x = vld1q_f32(kernel);
    kernel += input_channels;
    const float32x4_t vkernel2x = vld1q_f32(kernel);
    kernel += input_channels;
    const float32x4_t vkernel3x = vld1q_f32(kernel);

    float* output0 = output;
    float* output1 = output0 + image_size;
    float* output2 = output1 + image_size;
    float* output3 = output2 + image_size;

    while (image_size >= 4) {
        float32x4_t voutput0 = vld1q_f32(output0);
        float32x4_t voutput1 = vld1q_f32(output1);
        float32x4_t voutput2 = vld1q_f32(output2);
        float32x4_t voutput3 = vld1q_f32(output3);

        const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4;
        voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
        voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
        voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
        voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

        const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4;
        voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
        voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
        voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
        voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

        const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4;
        voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
        voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
        voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
        voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

        const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4;
        voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
        voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
        voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
        voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

        vst1q_f32(output0, voutput0); output0 += 4;
        vst1q_f32(output1, voutput1); output1 += 4;
        vst1q_f32(output2, voutput2); output2 += 4;
        vst1q_f32(output3, voutput3); output3 += 4;

        image_size -= 4;
    }
    if (image_size >= 2) {
        float32x2_t voutput0 = vld1_f32(output0);
        float32x2_t voutput1 = vld1_f32(output1);
        float32x2_t voutput2 = vld1_f32(output2);
        float32x2_t voutput3 = vld1_f32(output3);

        const float32x2_t vinput0 = vld1_f32(input0); input0 += 2;
        voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
        voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
        voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
        voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

        const float32x2_t vinput1 = vld1_f32(input1); input1 += 2;
        voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
        voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
        voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
        voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

        const float32x2_t vinput2 = vld1_f32(input2); input2 += 2;
        voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
        voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
        voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
        voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

        const float32x2_t vinput3 = vld1_f32(input3); input3 += 2;
        voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
        voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
        voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
        voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

        vst1_f32(output0, voutput0); output0 += 2;
        vst1_f32(output1, voutput1); output1 += 2;
        vst1_f32(output2, voutput2); output2 += 2;
        vst1_f32(output3, voutput3); output3 += 2;

        image_size -= 2;
    }
    if (image_size != 0) {
        float32x2_t voutput0 = vld1_dup_f32(output0);
        float32x2_t voutput1 = vld1_dup_f32(output1);
        float32x2_t voutput2 = vld1_dup_f32(output2);
        float32x2_t voutput3 = vld1_dup_f32(output3);

        const float32x2_t vinput0 = vld1_dup_f32(input0);
        voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
        voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
        voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
        voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

        const float32x2_t vinput1 = vld1_dup_f32(input1);
        voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
        voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
        voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
        voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

        const float32x2_t vinput2 = vld1_dup_f32(input2);
        voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
        voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
        voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
        voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

        const float32x2_t vinput3 = vld1_dup_f32(input3);
        voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
        voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
        voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
        voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

        vst1_lane_f32(output0, voutput0, 0);
        vst1_lane_f32(output1, voutput1, 0);
        vst1_lane_f32(output2, voutput2, 0);
        vst1_lane_f32(output3, voutput3, 0);
    }
}
static forcedinline void storeU (Type* dest, ParallelType a) noexcept { vst1q_f32 (dest, a); }
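// A matching unaligned load in the same style; a sketch assuming the
// surrounding JUCE-like wrapper defines Type, ParallelType and forcedinline:
static forcedinline ParallelType loadU (const Type* src) noexcept { return vld1q_f32 (src); }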