// Pans one stereo block and writes the result as stereo.
//
// When aIsOnTheLeft is true:
//   outL = inL + inR * aGainL      outR = inR * aGainR
// otherwise:
//   outL = inL * aGainL            outR = inR + inL * aGainR
//
// All four buffers go through ASSERT_ALIGNED. The block is walked eight
// samples (two NEON vectors per channel) at a time.
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  // Broadcast the scalar gains once; they are loop invariant.
  const float32x4_t gainL = vmovq_n_f32(aGainL);
  const float32x4_t gainR = vmovq_n_f32(aGainR);

  if (aIsOnTheLeft) {
    for (uint32_t offset = 0; offset < WEBAUDIO_BLOCK_SIZE; offset += 8) {
      // Every input vector is loaded before the first store of the iteration.
      const float32x4_t leftLo = vld1q_f32(ADDRESS_OF(aInputL, offset));
      const float32x4_t leftHi = vld1q_f32(ADDRESS_OF(aInputL, offset + 4));
      const float32x4_t rightLo = vld1q_f32(ADDRESS_OF(aInputR, offset));
      const float32x4_t rightHi = vld1q_f32(ADDRESS_OF(aInputR, offset + 4));

      // Left output: left input plus the scaled right input.
      vst1q_f32(ADDRESS_OF(aOutputL, offset),
                vmlaq_f32(leftLo, rightLo, gainL));
      vst1q_f32(ADDRESS_OF(aOutputL, offset + 4),
                vmlaq_f32(leftHi, rightHi, gainL));

      // Right output: scaled right input only.
      vst1q_f32(ADDRESS_OF(aOutputR, offset), vmulq_f32(rightLo, gainR));
      vst1q_f32(ADDRESS_OF(aOutputR, offset + 4), vmulq_f32(rightHi, gainR));
    }
    return;
  }

  for (uint32_t offset = 0; offset < WEBAUDIO_BLOCK_SIZE; offset += 8) {
    const float32x4_t leftLo = vld1q_f32(ADDRESS_OF(aInputL, offset));
    const float32x4_t leftHi = vld1q_f32(ADDRESS_OF(aInputL, offset + 4));
    const float32x4_t rightLo = vld1q_f32(ADDRESS_OF(aInputR, offset));
    const float32x4_t rightHi = vld1q_f32(ADDRESS_OF(aInputR, offset + 4));

    // Left output: scaled left input only.
    vst1q_f32(ADDRESS_OF(aOutputL, offset), vmulq_f32(leftLo, gainL));
    vst1q_f32(ADDRESS_OF(aOutputL, offset + 4), vmulq_f32(leftHi, gainL));

    // Right output: right input plus the scaled left input.
    vst1q_f32(ADDRESS_OF(aOutputR, offset),
              vmlaq_f32(rightLo, leftLo, gainR));
    vst1q_f32(ADDRESS_OF(aOutputR, offset + 4),
              vmlaq_f32(rightHi, leftHi, gainR));
  }
}
// Element-wise product of one block with a per-sample scale block:
//   aOutput[k] = aInput[k] * aScale[k]
// All three buffers go through ASSERT_ALIGNED. Sixteen samples (four NEON
// vectors) are handled per iteration.
void AudioBlockCopyChannelWithScale_NEON(const float aInput[WEBAUDIO_BLOCK_SIZE],
                                         const float aScale[WEBAUDIO_BLOCK_SIZE],
                                         float aOutput[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aScale);
  ASSERT_ALIGNED(aOutput);

  for (uint32_t base = 0; base < WEBAUDIO_BLOCK_SIZE; base += 16) {
    float32x4_t samples[4];
    float32x4_t gains[4];

    // All loads of the iteration happen before the first store.
    for (uint32_t v = 0; v < 4; ++v) {
      samples[v] = vld1q_f32(ADDRESS_OF(aInput, base + 4 * v));
    }
    for (uint32_t v = 0; v < 4; ++v) {
      gains[v] = vld1q_f32(ADDRESS_OF(aScale, base + 4 * v));
    }
    for (uint32_t v = 0; v < 4; ++v) {
      vst1q_f32(ADDRESS_OF(aOutput, base + 4 * v),
                vmulq_f32(samples[v], gains[v]));
    }
  }
}
// Approximate per-lane square root built from the NEON reciprocal square
// root estimate: sqrt(s) = s * (1 / sqrt(s)).
static float32x4_t vsqrtq_f32(float32x4_t s) {
  float32x4_t rsqrt = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
  const uint32x4_t positive_infinity = vdupq_n_u32(0x7F800000);
  // Lanes whose estimate is +inf (i.e. the divide-by-zero case).
  const uint32x4_t is_infinite =
      vceqq_u32(positive_infinity, vreinterpretq_u32_f32(rsqrt));
  // Zero out the positive infinity results so those lanes end up as 0.
  rsqrt = vreinterpretq_f32_u32(
      vandq_u32(vmvnq_u32(is_infinite), vreinterpretq_u32_f32(rsqrt)));

  // From the ARM documentation, the Newton-Raphson iteration
  //     x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2
  // converges to (1/sqrt(d)) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations, so exactly two
  // refinement steps are applied.
  rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(rsqrt, rsqrt), s), rsqrt);
  rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(rsqrt, rsqrt), s), rsqrt);

  // sqrt(s) = s * 1/sqrt(s)
  return vmulq_f32(s, rsqrt);
}
// Per-lane reciprocal (1 / val): the NEON reciprocal estimate refined by two
// Newton-Raphson steps (vrecpsq_f32 supplies the correction factor).
inline float32x4_t cv_vrecpq_f32(float32x4_t val)
{
    float32x4_t estimate = vrecpeq_f32(val);
    for (int step = 0; step < 2; ++step)
    {
        estimate = vmulq_f32(vrecpsq_f32(val, estimate), estimate);
    }
    return estimate;
}
// Multiplies the first aSize samples of aBlock by aScale, in place.
// The largest multiple of 16 samples is processed with NEON (four vectors
// per iteration); the remaining aSize % 16 samples are scaled one by one.
void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
  ASSERT_ALIGNED(aBlock);

  const float32x4_t vscale = vmovq_n_f32(aScale);
  const uint32_t remainder = aSize % 16;
  const uint32_t vectorEnd = aSize - remainder;

  uint32_t pos = 0;
  for (; pos < vectorEnd; pos += 16) {
    float32x4_t chunk[4];
    // Load the whole 16-sample group before writing any of it back.
    for (uint32_t v = 0; v < 4; ++v) {
      chunk[v] = vld1q_f32(ADDRESS_OF(aBlock, pos + 4 * v));
    }
    for (uint32_t v = 0; v < 4; ++v) {
      vst1q_f32(ADDRESS_OF(aBlock, pos + 4 * v), vmulq_f32(chunk[v], vscale));
    }
  }

  // Scalar tail.
  for (; pos < aSize; ++pos) {
    aBlock[pos] *= aScale;
  }
}
//-----------------------------------------------------------------------------------
void MathlibNEON::SinCos4( ArrayReal x, ArrayReal &outSin, ArrayReal &outCos )
{
    // TODO: Improve accuracy by mapping to the range [-pi/4, pi/4] and swap
    // between cos & sin depending on which quadrant it fell:
    // Quadrant | sin     |  cos
    // n = 0 ->  sin( x ),  cos( x )
    // n = 1 ->  cos( x ), -sin( x )
    // n = 2 -> -sin( x ), -cos( x )
    // n = 3 -> -cos( x ),  sin( x )
    // See ARGUMENT REDUCTION FOR HUGE ARGUMENTS:
    // Good to the Last Bit
    // K. C. Ng and the members of the FP group of SunPro
    // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf

    // -- Perhaps we can leave this to GSoC students? --

    // Map arbitrary angle x to the range [-pi; +pi] without using division.
    // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON)
    // can replace the add, the sub, & the two muls for two mad
    ArrayReal wholeTurns; // integral part written by Modf4; not used afterwards

    // x / (2*pi) + 0.5
    ArrayReal turns = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF );
    // Keep what Modf4 returns (the integral part goes to wholeTurns)
    ArrayReal fraction = Modf4( turns, wholeTurns );
    // fraction * 2*pi - pi
    ArrayReal wrapped = vsubq_f32( vmulq_f32( fraction, TWO_PI ), PI );

    sincos_ps( wrapped, &outSin, &outCos );
}
// Per-lane reciprocal square root (1 / sqrt(val)): the NEON estimate refined
// by two Newton-Raphson steps (vrsqrtsq_f32 supplies the correction factor).
inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
{
    float32x4_t estimate = vrsqrteq_f32(val);
    for (int step = 0; step < 2; ++step)
    {
        const float32x4_t squared = vmulq_f32(estimate, estimate);
        estimate = vmulq_f32(vrsqrtsq_f32(squared, val), estimate);
    }
    return estimate;
}
// Normalizes the complex error signal ef (ef[0] = real, ef[1] = imaginary
// parts) by x_pow + 1e-10, limits its magnitude to the error threshold and
// finally applies the step size mu. When extended_filter_enabled is non-zero
// kExtendedMu / kExtendedErrorThreshold replace the normal_* arguments.
// ef is updated in place.
static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled
                                    ? kExtendedErrorThreshold
                                    : normal_error_threshold;
  const float32x4_t vec_eps = vdupq_n_f32(1e-10f);
  const float32x4_t vec_mu = vmovq_n_f32(mu);
  const float32x4_t vec_threshold = vmovq_n_f32(error_threshold);
  int i = 0;

  // vectorized code (four at once)
  while (i + 3 < PART_LEN1) {
    // Divide the error by the (regularized) power.
    const float32x4_t power = vaddq_f32(vld1q_f32(&x_pow[i]), vec_eps);
    const float32x4_t re = vdivq_f32(vld1q_f32(&ef[0][i]), power);
    const float32x4_t im = vdivq_f32(vld1q_f32(&ef[1][i]), power);

    // |ef| and the lanes where it exceeds the threshold.
    const float32x4_t magnitude =
        vsqrtq_f32(vmlaq_f32(vmulq_f32(re, re), im, im));
    const uint32x4_t over = vcgtq_f32(magnitude, vec_threshold);
    const uint32x4_t within = vmvnq_u32(over);

    // threshold / (|ef| + 1e-10): the factor applied to over-threshold lanes.
    const float32x4_t shrink =
        vdivq_f32(vec_threshold, vaddq_f32(magnitude, vec_eps));
    const uint32x4_t re_limited = vreinterpretq_u32_f32(vmulq_f32(re, shrink));
    const uint32x4_t im_limited = vreinterpretq_u32_f32(vmulq_f32(im, shrink));

    // Bitwise select: keep the original lane where within the threshold,
    // take the limited lane otherwise.
    const uint32x4_t re_bits =
        vorrq_u32(vandq_u32(within, vreinterpretq_u32_f32(re)),
                  vandq_u32(over, re_limited));
    const uint32x4_t im_bits =
        vorrq_u32(vandq_u32(within, vreinterpretq_u32_f32(im)),
                  vandq_u32(over, im_limited));

    // Stepsize factor
    vst1q_f32(&ef[0][i], vmulq_f32(vreinterpretq_f32_u32(re_bits), vec_mu));
    vst1q_f32(&ef[1][i], vmulq_f32(vreinterpretq_f32_u32(im_bits), vec_mu));

    i += 4;
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;

    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (abs_ef > error_threshold) {
      const float limiter = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= limiter;
      ef[1][i] *= limiter;
    }

    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
/*
 * Transforms `length` RGBA pixels from src to dest.
 *
 * For every pixel the three colour bytes are looked up in the input gamma
 * tables, multiplied with the transform matrix, clamped to
 * [zero, clampMaxValue], scaled by floatScale and converted to integer
 * indices into the output tables. The alpha byte is copied unchanged.
 */
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    size_t pixel;
    float32_t (*matrix)[4] = transform->matrix;

    /* input gamma tables, one per channel */
    const float32_t *gamma_r = (float32_t*)transform->input_gamma_table_r;
    const float32_t *gamma_g = (float32_t*)transform->input_gamma_table_g;
    const float32_t *gamma_b = (float32_t*)transform->input_gamma_table_b;

    /* output lookup tables, one per channel */
    const uint8_t *out_r = &transform->output_table_r->data[0];
    const uint8_t *out_g = &transform->output_table_g->data[0];
    const uint8_t *out_b = &transform->output_table_b->data[0];

    /* the three matrix rows and the clamp/scale constants, kept in registers */
    const float32x4_t row0 = vld1q_f32(matrix[0]);
    const float32x4_t row1 = vld1q_f32(matrix[1]);
    const float32x4_t row2 = vld1q_f32(matrix[2]);
    const float32x4_t upper = vld1q_dup_f32(&clampMaxValue);
    const float32x4_t lower = vld1q_dup_f32(&zero);
    const float32x4_t scale = vld1q_dup_f32(&floatScale);

    /* CYA */
    if (!length)
        return;

    for (pixel = 0; pixel < length; pixel++, src += 4, dest += 4) {
        float32x4_t acc;
        int32x4_t indices;

        /* gamma lookup (broadcast to all lanes) times the matching matrix row */
        const float32x4_t r = vmulq_f32(vld1q_dup_f32(&gamma_r[src[0]]), row0);
        const float32x4_t g = vmulq_f32(vld1q_dup_f32(&gamma_g[src[1]]), row1);
        const float32x4_t b = vmulq_f32(vld1q_dup_f32(&gamma_b[src[2]]), row2);
        /* read alpha before anything is written to dest */
        const unsigned char alpha = src[3];

        /* sum the contributions, then clamp */
        acc = vaddq_f32(r, vaddq_f32(g, b));
        acc = vmaxq_f32(lower, acc);
        acc = vminq_f32(upper, acc);
        indices = vcvtq_s32_f32(vmulq_f32(acc, scale));

        /* use calc'd indices to output RGB values */
        dest[0] = out_r[vgetq_lane_s32(indices, 0)];
        dest[1] = out_g[vgetq_lane_s32(indices, 1)];
        dest[2] = out_b[vgetq_lane_s32(indices, 2)];
        dest[3] = alpha;
    }
}
/*
 * Converts one macropixel [Y0, U, Y1, V] into two RGBA vectors.
 *
 *   rgba = [Y + 1.4075*V,
 *           Y - 0.34455*U - 0.7169*V,
 *           Y + 1.7790*U,
 *           255]
 *
 * *rgba0p is built from Y0 and *rgba1p from Y1; both share the same U and V.
 */
static inline void neon_make_rgb(float32x4_t macropixel, float32x4_t *rgba0p,
                                 float32x4_t *rgba1p)
{
  /* per-channel chroma weights; lane 3 (alpha) gets no chroma contribution */
  const float32x4_t u_weights = {0.0, -0.34455, 1.7790, 0.0 };
  const float32x4_t v_weights = {1.4075, -0.7169, 0.0, 0.0 };
  const float32_t opaque = 255.0;

  /* vdupq_lane_f32 only accepts two element vectors, so split the */
  /* macropixel into its halves: [Y0, U] and [Y1, V]. */
  const float32x2_t luma0_u = vget_low_f32(macropixel);
  const float32x2_t luma1_v = vget_high_f32(macropixel);

  /* broadcast each luma value, then overwrite lane 3 with alpha */
  const float32x4_t base0 =
      vsetq_lane_f32(opaque, vdupq_lane_f32(luma0_u, 0), 3);
  const float32x4_t base1 =
      vsetq_lane_f32(opaque, vdupq_lane_f32(luma1_v, 0), 3);

  /* broadcast U and V (lane 1 of each half) and weight them per channel */
  const float32x4_t u_part = vmulq_f32(vdupq_lane_f32(luma0_u, 1), u_weights);
  const float32x4_t v_part = vmulq_f32(vdupq_lane_f32(luma1_v, 1), v_weights);

  /* the chroma contribution is shared by both output pixels */
  const float32x4_t chroma = vaddq_f32(u_part, v_part);

  *rgba0p = vaddq_f32(base0, chroma);
  *rgba1p = vaddq_f32(base1, chroma);
}
//-----------------------------------------------------------------------------------
ArrayReal MathlibNEON::Cos4( ArrayReal x )
{
    // Map arbitrary angle x to the range [-pi; +pi] without using division.
    // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON)
    // can replace the add, the sub, & the two muls for two mad
    ArrayReal wholeTurns; // integral part written by Modf4; not used afterwards

    // x / (2*pi) + 0.5
    ArrayReal turns = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF );
    // Keep what Modf4 returns (the integral part goes to wholeTurns)
    ArrayReal fraction = Modf4( turns, wholeTurns );
    // fraction * 2*pi - pi
    ArrayReal wrapped = vsubq_f32( vmulq_f32( fraction, TWO_PI ), PI );

    return cos_ps( wrapped );
}
/* f32x4 mm mul
 *
 * Computes C = A * B, four rows of C at a time.
 *
 *   A : Row x T    B : T x Col    C : Row x Col
 *
 * The indexing implies column-major storage: consecutive rows of a column
 * are contiguous, and columns of A and C are Row floats apart. Columns of B
 * are therefore T floats apart.
 *
 * Fixes over the previous version:
 *   - column j of A starts at j * Row (was j * T, although the following
 *     columns were reached by adding Row);
 *   - column k of B starts at k * T (was k * Row);
 *   Both were only correct for Row == T.
 *   - C is stored once per (i, k) after the accumulation instead of on
 *     every inner iteration (for T <= 0 the column block is now written as
 *     zeros rather than left untouched).
 *
 * Row and T are processed in steps of four with full vector loads/stores,
 * so both are expected to be multiples of 4.
 */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
	int i, k, j;

	for (i = 0; i < Row; i += 4)
	{
		for (k = 0; k < Col; k += 1)
		{
			const float * b_col = B + k * T;      /* column k of B */
			float32x4_t neon_c = vmovq_n_f32(0);  /* C[i..i+3][k] accumulator */

			for (j = 0; j < T; j += 4)
			{
				/* rows i..i+3 of columns j..j+3 of A */
				const float * a_col = A + j * Row + i;
				const float32x4_t neon_a0 = vld1q_f32(a_col);
				const float32x4_t neon_a1 = vld1q_f32(a_col + Row);
				const float32x4_t neon_a2 = vld1q_f32(a_col + 2 * Row);
				const float32x4_t neon_a3 = vld1q_f32(a_col + 3 * Row);

				/* B[j..j+3][k], each element broadcast to a full vector */
				const float32x4_t neon_b = vld1q_f32(b_col + j);
				const float32x4_t neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
				const float32x4_t neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
				const float32x4_t neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
				const float32x4_t neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

				neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
			}

			vst1q_f32(C + k * Row + i, neon_c);
		}
	}
}
// Approximate per-lane division a / b, computed as a * (1 / b) where the
// reciprocal comes from the NEON estimate plus Newton-Raphson refinement.
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
  float32x4_t reciprocal = vrecpeq_f32(b);

  // From the ARM documentation, the Newton-Raphson iteration
  //     x[n+1] = x[n] * (2 - d * x[n])
  // converges to (1/d) if x0 is the result of VRECPE applied to d.
  //
  // Note: The precision did not improve after 2 iterations, so exactly two
  // refinement steps are applied.
  reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
  reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);

  // a/b = a*(1/b)
  return vmulq_f32(a, reciprocal);
}
// In-place pass over the 128 floats of `a`, processed as eight groups of 16.
// For each group, k2 (advancing by 4) selects the matching entries of the
// rdft_wk1r/wk1i/wk2r/wk2i/wk3r/wk3i tables.
// NOTE(review): judging by the name and the table names this is the first
// radix-4 stage of a 128-point complex FFT operating on interleaved
// (re, im) pairs with twiddle factors wk1..wk3 - confirm against the scalar
// reference implementation.
static void cft1st_128_neon(float* a) {
  // Per-lane multipliers loaded from k_swap_sign (defined elsewhere); used
  // below to combine x1v with the pair-swapped x3v.
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  int j, k2;
  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
    // Load the 16 floats of this group.
    float32x4_t a00v = vld1q_f32(&a[j + 0]);
    float32x4_t a04v = vld1q_f32(&a[j + 4]);
    float32x4_t a08v = vld1q_f32(&a[j + 8]);
    float32x4_t a12v = vld1q_f32(&a[j + 12]);
    // Regroup: each aXYv holds one float pair from the first 8 floats of the
    // group (low half) and the corresponding pair from the last 8 (high half).
    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
    // Table entries for this group.
    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
    // Sums and differences of the regrouped pairs.
    float32x4_t x0v = vaddq_f32(a01v, a23v);
    const float32x4_t x1v = vsubq_f32(a01v, a23v);
    const float32x4_t x2v = vaddq_f32(a45v, a67v);
    const float32x4_t x3v = vsubq_f32(a45v, a67v);
    // vrev64q_f32 swaps the two floats inside each 64-bit half.
    const float32x4_t x3w = vrev64q_f32(x3v);
    float32x4_t x0w;
    // First output: plain sum, no table factor.
    a01v = vaddq_f32(x0v, x2v);
    // Output scaled by the wk2 tables: wk2r * x0 + wk2i * swap(x0).
    // x0v is overwritten here and reused as scratch below; order matters.
    x0v = vsubq_f32(x0v, x2v);
    x0w = vrev64q_f32(x0v);
    a45v = vmulq_f32(wk2rv, x0v);
    a45v = vmlaq_f32(a45v, wk2iv, x0w);
    // Output scaled by the wk1 tables, from x1 + swap(x3) * swap_sign.
    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a23v = vmulq_f32(wk1rv, x0v);
    a23v = vmlaq_f32(a23v, wk1iv, x0w);
    // Output scaled by the wk3 tables, from x1 - swap(x3) * swap_sign.
    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a67v = vmulq_f32(wk3rv, x0v);
    a67v = vmlaq_f32(a67v, wk3iv, x0w);
    // Undo the regrouping and write the group back in place.
    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
    vst1q_f32(&a[j + 0], a00v);
    vst1q_f32(&a[j + 4], a04v);
    vst1q_f32(&a[j + 8], a08v);
    vst1q_f32(&a[j + 12], a12v);
  }
}
// Micro-benchmark for NEON multiply / multiply-accumulate throughput.
// Fills data_f with its own indices, then repeatedly runs a 10-tap weighted
// sum (one vmulq_f32 followed by nine vmlaq_f32 and a final add of 1.0)
// over a sliding window, storing each result back into data_f. The timing
// is reported through startTime()/endTime().
static void test_fma() {
    for (int idx = 0; idx < 1020 * 4; idx++) {
        data_f[idx] = idx;
    }

    // Tap weights, broadcast once.
    const float32x4_t w02 = vdupq_n_f32(0.02f);
    const float32x4_t w04 = vdupq_n_f32(0.04f);
    const float32x4_t w05 = vdupq_n_f32(0.05f);
    const float32x4_t w10 = vdupq_n_f32(0.1f);
    const float32x4_t w20 = vdupq_n_f32(0.2f);
    const float32x4_t one = vdupq_n_f32(1.0f);

    startTime();

    // Do ~1 billion ops
    const int passes = 1000 * (1000 / 80);
    for (int pass = 0; pass < passes; pass++) {
        for (int pos = 0; pos < 1000; pos++) {
            // The weights are symmetric: 0.02 0.04 0.05 0.1 0.2 | 0.2 0.1 0.05 0.04 0.02
            float32x4_t acc = vmulq_f32(vld1q_f32((float32_t *)&data_f[pos]), w02);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 4]), w04);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 8]), w05);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 12]), w10);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 16]), w20);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 20]), w20);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 24]), w10);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 28]), w05);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 32]), w04);
            acc = vmlaq_f32(acc, vld1q_f32((float32_t *)&data_f[pos + 36]), w02);
            acc = vaddq_f32(acc, one);
            vst1q_f32((float32_t *)&data_f[pos], acc);
        }
    }

    endTime("neon fma", 1e9);
}
// Sums the squared differences of two half-precision arrays:
//   *sum = sum over i in [0, size) of (a[i] - b[i])^2
//
// a, b  - arrays of `size` 16-bit half floats (stored as uint16_t).
// size  - element count; must be at least F (asserted).
// sum   - receives the result.
//
// The bulk is handled DF elements at a time with two accumulators, then F
// elements at a time. A tail that is not a multiple of F is handled by
// re-reading the last F elements and masking with RightNotZero so that only
// the not-yet-counted elements contribute.
//
// Fix: the tail previously loaded both operands from `a`, which made the
// tail difference identically zero and silently dropped the last
// (size % F) elements from the result.
template <bool align> SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum)
{
    assert(size >= F);
    if (align)
        assert(Aligned(a) && Aligned(b));

    size_t partialAlignedSize = AlignLo(size, F);
    size_t fullAlignedSize = AlignLo(size, DF);
    size_t i = 0;
    float32x4_t sums[2] = { vdupq_n_f32(0), vdupq_n_f32(0) };
    if (fullAlignedSize)
    {
        for (; i < fullAlignedSize; i += DF)
        {
            SquaredDifferenceSum16f<align>(a, b, i + F * 0, sums[0]);
            SquaredDifferenceSum16f<align>(a, b, i + F * 1, sums[1]);
        }
        sums[0] = vaddq_f32(sums[0], sums[1]);
    }
    for (; i < partialAlignedSize; i += F)
        SquaredDifferenceSum16f<align>(a, b, i, sums[0]);
    if (partialAlignedSize != size)
    {
        // NOTE(review): a + size - F is generally not F-aligned here; if
        // LoadHalf<true> requires alignment this load should use
        // LoadHalf<false> - confirm against LoadHalf's definition.
        float32x4_t tailMask = RightNotZero(size - partialAlignedSize);
        float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf<align>(a + size - F));
        float32x4_t _b = vcvt_f32_f16((float16x4_t)LoadHalf<align>(b + size - F));
        float32x4_t _d = And(vsubq_f32(_a, _b), tailMask);
        sums[0] = vaddq_f32(sums[0], vmulq_f32(_d, _d));
    }
    *sum = ExtractSum32f(sums[0]);
}
// Accumulates into y_fft the complex products of the buffered far-end
// spectra (x_fft_buf) with the filter partitions (h_fft_buf):
//   y_fft += x_fft_buf[partition] * h_fft_buf[partition]   (complex multiply)
// Index 0 of each array holds the real parts, index 1 the imaginary parts.
// The x partition index starts at x_fft_buf_block_pos and wraps around at
// num_partitions.
static void FilterFarNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float y_fft[2][PART_LEN1]) {
  int i;
  for (i = 0; i < num_partitions; i++) {
    int j = 0;
    const int h_base = i * PART_LEN1;
    // Check for wrap
    int x_block = i + x_fft_buf_block_pos;
    if (x_block >= num_partitions) {
      x_block -= num_partitions;
    }
    const int x_base = x_block * PART_LEN1;

    // vectorized code (four at once)
    for (; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t x_re = vld1q_f32(&x_fft_buf[0][x_base + j]);
      const float32x4_t x_im = vld1q_f32(&x_fft_buf[1][x_base + j]);
      const float32x4_t h_re = vld1q_f32(&h_fft_buf[0][h_base + j]);
      const float32x4_t h_im = vld1q_f32(&h_fft_buf[1][h_base + j]);
      const float32x4_t acc_re = vld1q_f32(&y_fft[0][j]);
      const float32x4_t acc_im = vld1q_f32(&y_fft[1][j]);

      // re = x_re * h_re - x_im * h_im
      const float32x4_t prod_re = vmlsq_f32(vmulq_f32(x_re, h_re), x_im, h_im);
      // im = x_re * h_im + x_im * h_re
      const float32x4_t prod_im = vmlaq_f32(vmulq_f32(x_re, h_im), x_im, h_re);

      vst1q_f32(&y_fft[0][j], vaddq_f32(acc_re, prod_re));
      vst1q_f32(&y_fft[1][j], vaddq_f32(acc_im, prod_im));
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      y_fft[0][j] += MulRe(x_fft_buf[0][x_base + j], x_fft_buf[1][x_base + j],
                           h_fft_buf[0][h_base + j], h_fft_buf[1][h_base + j]);
      y_fft[1][j] += MulIm(x_fft_buf[0][x_base + j], x_fft_buf[1][x_base + j],
                           h_fft_buf[0][h_base + j], h_fft_buf[1][h_base + j]);
    }
  }
}
/* Compile-level exercise of the vmulq_f32 intrinsic. The operands are
   deliberately left uninitialized: only the call itself is of interest and
   the product is not used afterwards. */
void test_vmulQf32 (void)
{
  float32x4_t arg0_float32x4_t;
  float32x4_t arg1_float32x4_t;
  float32x4_t out_float32x4_t;

  out_float32x4_t = vmulq_f32 (arg0_float32x4_t, arg1_float32x4_t);
}
// Updates the smoothed spectra through SmoothedPSD() and then computes the
// two per-bin coherence values
//   cohde[i] = |sde[i]|^2 / (sd[i] * se[i] + 1e-10)
//   cohxd[i] = |sxd[i]|^2 / (sx[i] * sd[i] + 1e-10)
// where sde/sxd hold (re, im) pairs. The `fft` argument is not used by this
// function.
static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  const float32x4_t vec_eps = vdupq_n_f32(1e-10f);
  int i = 0;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  // Subband coherence, four bins at once.
  for (; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t sd = vld1q_f32(&aec->sd[i]);
    const float32x4_t se = vld1q_f32(&aec->se[i]);
    const float32x4_t sx = vld1q_f32(&aec->sx[i]);

    // Denominators: sd * se + eps and sd * sx + eps.
    const float32x4_t denom_de = vmlaq_f32(vec_eps, sd, se);
    const float32x4_t denom_xd = vmlaq_f32(vec_eps, sd, sx);

    // De-interleaving loads: val[0] = first components, val[1] = second.
    const float32x4x2_t sde = vld2q_f32(&aec->sde[i][0]);
    const float32x4x2_t sxd = vld2q_f32(&aec->sxd[i][0]);

    // Squared magnitudes.
    const float32x4_t power_de =
        vmlaq_f32(vmulq_f32(sde.val[0], sde.val[0]), sde.val[1], sde.val[1]);
    const float32x4_t power_xd =
        vmlaq_f32(vmulq_f32(sxd.val[0], sxd.val[0]), sxd.val[1], sxd.val[1]);

    vst1q_f32(&cohde[i], vdivq_f32(power_de, denom_de));
    vst1q_f32(&cohxd[i], vdivq_f32(power_xd, denom_xd));
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] =
        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
        (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}
/* Performs one rotation/translation on four coordinates at once:
 *   result[k] = (int)(a[k] * x[k] + b[k] * y[k] + pos[k] + point5[k])
 * The float to int conversion is vcvtq_s32_f32 (rounds toward zero); the
 * four integers are stored to `result`. */
static void neon_coord_4(
    float32x4_t a_4,
    float32x4_t b_4,
    float32x4_t x_4,
    float32x4_t y_4,
    float32x4_t pos_4f,
    float32x4_t point5_4,
    int * result)
{
    const float32x4_t ax = vmulq_f32(a_4, x_4);
    const float32x4_t by = vmulq_f32(b_4, y_4);

    /* same association as before: ((ax + by) + pos) + point5 */
    const float32x4_t rotated = vaddq_f32(ax, by);
    const float32x4_t translated = vaddq_f32(rotated, pos_4f);
    const float32x4_t biased = vaddq_f32(translated, point5_4);

    vst1q_s32(result, vcvtq_s32_f32(biased));
}
// Window time domain data to be used by the fft.
// The first PART_LEN samples are multiplied by WebRtcAec_sqrtHanning read
// forwards, the second PART_LEN samples by the same table read backwards.
static void WindowDataNEON(float* x_windowed, const float* x) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    // Both input vectors are loaded before anything is stored.
    const float32x4_t first_half = vld1q_f32(&x[i]);
    const float32x4_t second_half = vld1q_f32(&x[PART_LEN + i]);
    const float32x4_t window_fwd = vld1q_f32(&WebRtcAec_sqrtHanning[i]);

    // Reverse four table entries:
    //   loaded            A B C D
    //   after vrev64q     B A D C
    //   halves swapped    D C B A
    const float32x4_t pairs_swapped =
        vrev64q_f32(vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]));
    const float32x4_t window_rev = vcombine_f32(vget_high_f32(pairs_swapped),
                                                vget_low_f32(pairs_swapped));

    vst1q_f32(&x_windowed[i], vmulq_f32(first_half, window_fwd));
    vst1q_f32(&x_windowed[PART_LEN + i], vmulq_f32(second_half, window_rev));
  }
}
// One compensated (Kahan) summation step for four lanes:
// adds (a[offset + k] - b[offset + k])^2 to sum[k] while tracking the
// rounding error of that addition in correction[k].
//
// sum, correction - running state, updated in place. As in the `term`
// computation below, `correction` holds the amount that still has to be
// ADDED to the sum (the part lost to rounding so far).
//
// Fix: the correction was previously computed as (temp * sum) - term, i.e.
// with a multiplication where the algorithm needs the difference
// (temp - sum). The lost part of this step is term - (temp - sum).
template <bool align> SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t offset, float32x4_t & sum, float32x4_t & correction)
{
    float32x4_t _a = Load<align>(a + offset);
    float32x4_t _b = Load<align>(b + offset);
    float32x4_t _d = vsubq_f32(_a, _b);
    // Value we want to add: the new square plus what was lost before.
    float32x4_t term = vmlaq_f32(correction, _d, _d);
    float32x4_t temp = vaddq_f32(sum, term);
    // (temp - sum) is what actually got added; the remainder of term was lost.
    correction = vsubq_f32(term, vsubq_f32(temp, sum));
    sum = temp;
}
// For four gradient vectors (dx, dy) at once, finds among the buffer.size
// tabulated directions (buffer.cos[i], buffer.sin[i]) and their negations
// the one with the largest dot product. The winning index (buffer.pos[i] or
// buffer.neg[i]) is stored to buffer.index + col and the gradient magnitude
// sqrt(dx^2 + dy^2) to buffer.value + col.
template <bool align> SIMD_INLINE void HogDirectionHistograms(const float32x4_t & dx, const float32x4_t & dy, Buffer & buffer, size_t col)
{
    float32x4_t maxDot = vdupq_n_f32(0);
    int32x4_t maxIndex = vdupq_n_s32(0);
    for (int i = 0; i < buffer.size; ++i)
    {
        // Projection onto direction i.
        const float32x4_t forward = vaddq_f32(vmulq_f32(dx, buffer.cos[i]), vmulq_f32(dy, buffer.sin[i]));
        const uint32x4_t forwardWins = vcgtq_f32(forward, maxDot);
        maxDot = vmaxq_f32(forward, maxDot);
        maxIndex = vbslq_s32(forwardWins, buffer.pos[i], maxIndex);

        // The opposite direction has the negated projection.
        const float32x4_t backward = vnegq_f32(forward);
        const uint32x4_t backwardWins = vcgtq_f32(backward, maxDot);
        maxDot = vmaxq_f32(backward, maxDot);
        maxIndex = vbslq_s32(backwardWins, buffer.neg[i], maxIndex);
    }
    Store<align>(buffer.index + col, maxIndex);

    const float32x4_t squaredLength = vaddq_f32(vmulq_f32(dx, dx), vmulq_f32(dy, dy));
    Store<align>(buffer.value + col, Sqrt<SIMD_NEON_RCP_ITER>(squaredLength));
}
// Accumulates into yf the complex products of the buffered far-end spectra
// (aec->xfBuf) with the filter partitions (aec->wfBuf):
//   yf += xfBuf[partition] * wfBuf[partition]   (complex multiply)
// Index 0 of each array holds the real parts, index 1 the imaginary parts.
// The xfBuf partition index starts at aec->xfBufBlockPos and wraps around at
// aec->num_partitions.
static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) {
  const int num_partitions = aec->num_partitions;
  int i;
  for (i = 0; i < num_partitions; i++) {
    int j = 0;
    const int w_base = i * PART_LEN1;
    // Check for wrap
    int x_block = i + aec->xfBufBlockPos;
    if (x_block >= num_partitions) {
      x_block -= num_partitions;
    }
    const int x_base = x_block * PART_LEN1;

    // vectorized code (four at once)
    for (; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t x_re = vld1q_f32(&aec->xfBuf[0][x_base + j]);
      const float32x4_t x_im = vld1q_f32(&aec->xfBuf[1][x_base + j]);
      const float32x4_t w_re = vld1q_f32(&aec->wfBuf[0][w_base + j]);
      const float32x4_t w_im = vld1q_f32(&aec->wfBuf[1][w_base + j]);
      const float32x4_t acc_re = vld1q_f32(&yf[0][j]);
      const float32x4_t acc_im = vld1q_f32(&yf[1][j]);

      // re = x_re * w_re - x_im * w_im
      const float32x4_t prod_re = vmlsq_f32(vmulq_f32(x_re, w_re), x_im, w_im);
      // im = x_re * w_im + x_im * w_re
      const float32x4_t prod_im = vmlaq_f32(vmulq_f32(x_re, w_im), x_im, w_re);

      vst1q_f32(&yf[0][j], vaddq_f32(acc_re, prod_re));
      vst1q_f32(&yf[1][j], vaddq_f32(acc_im, prod_im));
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      yf[0][j] += MulRe(aec->xfBuf[0][x_base + j], aec->xfBuf[1][x_base + j],
                        aec->wfBuf[0][w_base + j], aec->wfBuf[1][w_base + j]);
      yf[1][j] += MulIm(aec->xfBuf[0][x_base + j], aec->xfBuf[1][x_base + j],
                        aec->wfBuf[0][w_base + j], aec->wfBuf[1][w_base + j]);
    }
  }
}
// Computes n dot products of `data` (len floats) with n weight rows and
// applies scale and bias:
//   vals[i] = dot(data, row_i) * istd[0] + bias[i]
//
// Layout of `weights`, as consumed below: for every group of four outputs
// and every step of four data elements there are 16 floats (four for each of
// the four outputs), n * len floats in total, followed by the n bias values.
// n and len are processed in steps of four.
//
// Fix: the bias used to be read from `weights + n*len + i` after `weights`
// had already been advanced through the current group's weights by the
// inner loop, so it was fetched from the wrong address (past the table).
// The bias base is now computed once from the unmodified pointer.
void dotProd_neon(const float *data, const float *weights, float *vals, const int n, const int len, const float *istd) {
    const float *bias = weights + n * len;

    for (int i = 0; i < n; i += 4) {
        float32x4_t accum0 = vdupq_n_f32(0.0f);
        float32x4_t accum1 = accum0;
        float32x4_t accum2 = accum0;
        float32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 4) {
            // The same four data elements feed all four outputs of the group.
            const float32x4_t d = vld1q_f32(data + j);

            accum0 = vaddq_f32(accum0, vmulq_f32(d, vld1q_f32(weights)));
            accum1 = vaddq_f32(accum1, vmulq_f32(d, vld1q_f32(weights + 4)));
            accum2 = vaddq_f32(accum2, vmulq_f32(d, vld1q_f32(weights + 8)));
            accum3 = vaddq_f32(accum3, vmulq_f32(d, vld1q_f32(weights + 12)));

            weights += 16;
        }

        // Horizontal reduction: fold each accumulator to one scalar and pack
        // the four totals into a single vector [t0, t1, t2, t3].
        float32x2_t sum0 = vpadd_f32(vget_low_f32(accum0), vget_high_f32(accum0));
        float32x2_t sum1 = vpadd_f32(vget_low_f32(accum1), vget_high_f32(accum1));
        float32x2_t sum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        float32x2_t sum3 = vpadd_f32(vget_low_f32(accum3), vget_high_f32(accum3));
        sum0 = vpadd_f32(sum0, sum1);
        sum1 = vpadd_f32(sum2, sum3);
        float32x4_t sum = vcombine_f32(sum0, sum1);

        sum = vmulq_n_f32(sum, istd[0]);
        sum = vaddq_f32(sum, vld1q_f32(bias + i));

        vst1q_f32(vals + i, sum);
    }
}
/* f32x4 mv mul
 *
 * Computes C = A * B, four rows of C at a time.
 *
 *   A : Row x T matrix    B : vector of T floats    C : vector of Row floats
 *
 * The indexing implies column-major storage for A: consecutive rows of a
 * column are contiguous and columns are Row floats apart.
 *
 * Fix: column k of A starts at k * Row. The previous code used k * T while
 * stepping to the following columns by Row, which was only correct for
 * Row == T and read out of bounds otherwise.
 *
 * Row and T are processed in steps of four with full vector loads/stores,
 * so both are expected to be multiples of 4.
 */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;

	for (i = 0; i < Row; i += 4)
	{
		float32x4_t neon_c = vmovq_n_f32(0);  /* C[i..i+3] accumulator */

		for (k = 0; k < T; k += 4)
		{
			/* rows i..i+3 of columns k..k+3 of A */
			const float * a_col = A + k * Row + i;
			const float32x4_t neon_a0 = vld1q_f32(a_col);
			const float32x4_t neon_a1 = vld1q_f32(a_col + Row);
			const float32x4_t neon_a2 = vld1q_f32(a_col + 2 * Row);
			const float32x4_t neon_a3 = vld1q_f32(a_col + 3 * Row);

			/* B[k..k+3], each element broadcast to a full vector */
			const float32x4_t neon_b = vld1q_f32(B + k);
			const float32x4_t neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
			const float32x4_t neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
			const float32x4_t neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
			const float32x4_t neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

			neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
		}

		vst1q_f32(C + i, neon_c);
	}
}
// 16-bit integer variant of the dot product kernel. Both `dataf` and the
// leading part of `weightsf` are reinterpreted as int16_t arrays:
//   vals[i] = float(dot_i16(data, row_i)) * scale[i] * istd[0] + bias[i]
//
// Layout of `weightsf`, as consumed below: n * len int16 weights (32 per
// group of four outputs and step of eight data elements), followed by float
// data holding, for each group of four outputs, four scales and then four
// biases. n is processed in steps of four, len in steps of eight.
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) {
    const int16_t *samples = (const int16_t *)dataf;
    const int16_t *coeffs = (const int16_t *)weightsf;
    // The float section starts after n * len int16 values, i.e. n * len / 2
    // floats (sizeof(float) / sizeof(int16_t) == 2).
    const float *scale_bias = weightsf + n * len / 2;

    for (int group = 0; group < n; group += 4) {
        int32x4_t acc0 = { 0, 0, 0, 0 };
        int32x4_t acc1 = acc0;
        int32x4_t acc2 = acc0;
        int32x4_t acc3 = acc0;

        for (int pos = 0; pos < len; pos += 8) {
            // De-interleaving loads: val[0] = even elements, val[1] = odd.
            // Data and weights are split the same way, so pairs still match.
            const int16x4x2_t d = vld2_s16(samples + pos);
            const int16x4x2_t w0 = vld2_s16(coeffs);
            const int16x4x2_t w1 = vld2_s16(coeffs + 8);
            const int16x4x2_t w2 = vld2_s16(coeffs + 16);
            const int16x4x2_t w3 = vld2_s16(coeffs + 24);

            // Widening multiply-accumulate into 32-bit lanes.
            acc0 = vmlal_s16(acc0, d.val[0], w0.val[0]);
            acc0 = vmlal_s16(acc0, d.val[1], w0.val[1]);
            acc1 = vmlal_s16(acc1, d.val[0], w1.val[0]);
            acc1 = vmlal_s16(acc1, d.val[1], w1.val[1]);
            acc2 = vmlal_s16(acc2, d.val[0], w2.val[0]);
            acc2 = vmlal_s16(acc2, d.val[1], w2.val[1]);
            acc3 = vmlal_s16(acc3, d.val[0], w3.val[0]);
            acc3 = vmlal_s16(acc3, d.val[1], w3.val[1]);

            coeffs += 32;
        }

        // Horizontal reduction: one total per output, packed [t0, t1, t2, t3].
        const int32x2_t pair0 = vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0));
        const int32x2_t pair1 = vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1));
        const int32x2_t pair2 = vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2));
        const int32x2_t pair3 = vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3));
        const int32x4_t totals = vcombine_s32(vpadd_s32(pair0, pair1), vpadd_s32(pair2, pair3));

        // Per-output scale, global istd, then per-output bias.
        float32x4_t out = vcvtq_f32_s32(totals);
        out = vmulq_f32(out, vld1q_f32(scale_bias + group * 2));
        out = vmulq_n_f32(out, istd[0]);
        out = vaddq_f32(out, vld1q_f32(scale_bias + group * 2 + 4));

        vst1q_f32(vals + group, out);
    }
}
// Element-wise product: vec_result[k] = vec_a[k] * vec_b[k].
// All three vectors must have the same size (asserted). Groups of four
// elements are multiplied with NEON; the remaining 0-3 elements are handled
// by plain scalar code.
static void neon_vector_mul(const std::vector<float>& vec_a,
                            const std::vector<float>& vec_b,
                            std::vector<float>& vec_result)
{
    assert(vec_a.size() == vec_b.size());
    assert(vec_a.size() == vec_result.size());

    const int count = (int)vec_result.size();
    const int vector_limit = count - 3;   // last start index + 1 for a full group
    const float* lhs = vec_a.data();
    const float* rhs = vec_b.data();
    float* out = vec_result.data();

    int pos = 0;

    // neon process
    while (pos < vector_limit)
    {
        const float32x4_t product = vmulq_f32(vld1q_f32(lhs + pos), vld1q_f32(rhs + pos));
        vst1q_f32(out + pos, product);
        pos += 4;
    }

    // normal process
    while (pos < count)
    {
        vec_result[pos] = vec_a[pos] * vec_b[pos];
        ++pos;
    }
}
//Kernel function: saxpy
// Computes, for every index below args->N:
//   args->result[index] = args->scale[index] * args->x[index] + args->y[index]
//
// Changes over the previous version (results for full groups of four are
// unchanged):
//   - The always-true select has been removed. It blended the new value into
//     an uninitialized `result` vector, and its all-ones mask was produced
//     by converting 0xFFFFFFFF to float and back, which only yields all-ones
//     because the float -> u32 conversion saturates.
//   - When N is not a multiple of four, the remaining elements are now
//     handled one by one instead of with a full vector load/store that
//     touched up to three elements past the end.
void saxpy_vector(KernelArgs* args) {
  const uint64_t count = args->N;
  const uint64_t vector_end = count - (count % 4);
  uint64_t index = 0;

  //Loop over input, four elements at a time
  for (; index < vector_end; index += 4) {
    //Inputs
    const float32x4_t scale = vld1q_f32(&args->scale[index]);
    const float32x4_t x = vld1q_f32(&args->x[index]);
    const float32x4_t y = vld1q_f32(&args->y[index]);

    //>>> result = scale * x + y
    const float32x4_t result = vaddq_f32(vmulq_f32(scale, x), y);

    //Outputs
    vst1q_f32(&args->result[index], result);
  }

  //Scalar tail for the last N % 4 elements
  for (; index < count; ++index) {
    args->result[index] = args->scale[index] * args->x[index] + args->y[index];
  }
}
/**
 * @brief vector_dot_vector.
 *
 * Computes the dot product of two float vectors. Groups of four elements
 * are accumulated with NEON; the remaining 0-3 elements are added with
 * scalar code.
 *
 * @param dst[out]  the output element(1*1)
 * @param src1[in]  the input vector(1*n)
 * @param src2[in]  the input vector(1*n)
 * @param dimN[in]  size of vector; values <= 0 yield 0
 *
 * @return void
 *
 * Fixes over the previous version:
 *  - the first four elements were loaded and multiplied unconditionally, so
 *    dimN < 4 read past the end of both inputs and added the products of
 *    elements that are not part of the vectors;
 *  - the reduced value is extracted with vget_lane_f32 instead of a pointer
 *    cast of the vector object;
 *  - the inputs are no longer cast to non-const pointers.
 */
void neon_VecdotVec(float *dst, const float *src1, const float *src2, const int dimN)
{
    float32x4_t acc = vdupq_n_f32(0.0f);
    int j = 0;

    if (dimN >= 4)
    {
        /* First group: a plain multiply, exactly as before. */
        acc = vmulq_f32(vld1q_f32(src1), vld1q_f32(src2));
        for (j = 4; j <= dimN - 4; j += 4)
        {
            acc = vmlaq_f32(acc, vld1q_f32(src1 + j), vld1q_f32(src2 + j));
        }
    }

    /* Horizontal add of the four lanes. */
    float32x2_t pair = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    pair = vpadd_f32(pair, pair);
    float total = vget_lane_f32(pair, 0);

    /* Scalar tail. */
    for (; j < dimN; j++)
    {
        total += src1[j] * src2[j];
    }

    *dst = total;
}