static void test_fma() {
    for (int i = 0; i < 1020 * 4; i++) {
        data_f[i] = i;
    }

    float32x4_t c0_02 = vdupq_n_f32(0.02f);
    float32x4_t c0_04 = vdupq_n_f32(0.04f);
    float32x4_t c0_05 = vdupq_n_f32(0.05f);
    float32x4_t c0_10 = vdupq_n_f32(0.1f);
    float32x4_t c0_20 = vdupq_n_f32(0.2f);
    float32x4_t c1_00 = vdupq_n_f32(1.0f);

    startTime();
    // Do ~1 billion ops: each inner iteration is 1 mul + 9 mla + 1 add over
    // 4 lanes = 80 flops (counting mla as two), so 12000 * 1000 * 80 ~= 1e9.
    for (int ct = 0; ct < (1000 * (1000 / 80)); ct++) {
        for (int i = 0; i < 1000; i++) {
            float32x4_t t;
            t = vmulq_f32(vld1q_f32((float32_t *)&data_f[i]), c0_02);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 4]), c0_04);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 8]), c0_05);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 12]), c0_10);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 16]), c0_20);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 20]), c0_20);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 24]), c0_10);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 28]), c0_05);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 32]), c0_04);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 36]), c0_02);
            t = vaddq_f32(t, c1_00);
            vst1q_f32((float32_t *)&data_f[i], t);
        }
    }
    endTime("neon fma", 1e9);
}
void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize)
{
  ASSERT_ALIGNED(aBlock);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  uint32_t vectorSize = aSize - dif;
  uint32_t i = 0;
  for (; i < vectorSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aBlock[i] *= aScale;
  }
}
void * scaled_sumf_thread_NEON(void * argument)
{
    jsize i = 0;
    struct scaled_sumfneon_thread_data * data =
        (struct scaled_sumfneon_thread_data *) argument;

    float32_t * r = (float32_t *)data->r;
    const float32_t * x = (const float32_t *)data->x;
    const float32_t * y = (const float32_t *)data->y;
    const float32_t a = (float32_t)data->a;
    const jsize size = data->size;

    float32x4_t rx4, xx4, yx4, ax4;
    ax4 = vdupq_n_f32(a);

    /* r[i..i+3] = x[i..i+3] + a * y[i..i+3]; size is assumed to be a
       multiple of 4. */
    for (; i < size; i += 4) {
        xx4 = vld1q_f32(&(x[i]));
        yx4 = vld1q_f32(&(y[i]));
        rx4 = vmlaq_f32(xx4, ax4, yx4);
        vst1q_f32(&(r[i]), rx4);
    }

    return NULL;
}
/**
 * @brief vector scale: A[] = alpha * B[].
 *
 * @param dst[out]      the result matrix A.
 *        src[in]       the input matrix B.
 *        alpha[in]     scale of B.
 *        elemCnt[in]   number of elements to calc.
 *
 * @return void.
 */
void neon_scale(float *dst, const float *src, const float alpha, const int elemCnt)
{
    int i;
    for (i = 0; i <= elemCnt - 16; i += 16) {
        float32x4_t q0 = vld1q_f32(src + i);
        float32x4_t q1 = vld1q_f32(src + i + 4);
        float32x4_t q2 = vld1q_f32(src + i + 8);
        float32x4_t q3 = vld1q_f32(src + i + 12);

        q0 = vmulq_n_f32(q0, alpha);
        q1 = vmulq_n_f32(q1, alpha);
        q2 = vmulq_n_f32(q2, alpha);
        q3 = vmulq_n_f32(q3, alpha);

        vst1q_f32(dst + i, q0);
        vst1q_f32(dst + i + 4, q1);
        vst1q_f32(dst + i + 8, q2);
        vst1q_f32(dst + i + 12, q3);
    }

    for (; i < elemCnt; i++) {
        dst[i] = src[i] * alpha;
    }
}
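/*
 * Editorial usage sketch (not from the original source; assumes
 * <arm_neon.h> is included and neon_scale is visible). With 100 elements,
 * the vector loop runs for i = 0, 16, ..., 80 and covers elements 0..95;
 * the last four elements fall through to the scalar tail.
 */
void neon_scale_demo(void)
{
    float src[100], dst[100];
    for (int i = 0; i < 100; i++)
        src[i] = (float)i;

    neon_scale(dst, src, 2.5f, 100);
    /* now dst[k] == 2.5f * k for every k */
}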
void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
                                         float* aOutput)
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
  }
}
static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled
                                    ? kExtendedErrorThreshold
                                    : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;
    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
    if (abs_ef > error_threshold) {
      abs_ef = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= abs_ef;
      ef[1][i] *= abs_ef;
    }
    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
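/*
 * Editorial sketch (not the original WebRTC code): the vand/vmvn/vorr
 * sequence above is a hand-rolled bitwise select. NEON's vbslq_f32 does
 * the same per-bit select in one step, so the conditional rescale of one
 * component could be written like this, using the same AArch64 vdivq_f32
 * as the loop above:
 */
static inline float32x4_t clamp_error_lane(float32x4_t ef_re,
                                           float32x4_t absEf,
                                           float32x4_t kThresh,
                                           float32x4_t k1e_10f)
{
  const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
  const float32x4_t absEfInv = vdivq_f32(kThresh, vaddq_f32(absEf, k1e_10f));
  /* lanes with |ef| > threshold get rescaled; the rest pass through */
  return vbslq_f32(bigger, vmulq_f32(ef_re, absEfInv), ef_re);
}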
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

// Accumulate at most this many products in f32 before flushing the partial
// sum into the f64 result, to bound the single-precision rounding error.
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row) {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        while (i + 4 <= size.width) {
            size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);

            for ( ; i <= lim; i += 4) {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
            }

            float32x2_t vres = vpadd_f32(vget_low_f32(v_sum), vget_high_f32(v_sum));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
        }

        if (i + 2 <= size.width) {
            float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
            i += 2;
        }

        for (; i < size.width; ++i)
            result += src0[i] * src1[i];
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  size_t i;
  unsigned char alpha;
  float32_t (*mat)[4] = transform->matrix;
  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;
  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];
  const float32x4_t mat0 = vld1q_f32(mat[0]);
  const float32x4_t mat1 = vld1q_f32(mat[1]);
  const float32x4_t mat2 = vld1q_f32(mat[2]);
  const float32x4_t max = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t min = vld1q_dup_f32(&zero);
  const float32x4_t scale = vld1q_dup_f32(&floatScale);
  float32x4_t vec_r, vec_g, vec_b;
  int32x4_t result;

  /* CYA */
  if (!length)
    return;

  for (i = 0; i < length; i++) {
    /* setup for transforming the pixel */
    vec_r = vld1q_dup_f32(&igtbl_r[*src++]);
    vec_g = vld1q_dup_f32(&igtbl_g[*src++]);
    vec_b = vld1q_dup_f32(&igtbl_b[*src++]);
    alpha = *src++;

    /* gamma * matrix */
    vec_r = vmulq_f32(vec_r, mat0);
    vec_g = vmulq_f32(vec_g, mat1);
    vec_b = vmulq_f32(vec_b, mat2);

    /* crunch, crunch, crunch */
    vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
    vec_r = vmaxq_f32(min, vec_r);
    vec_r = vminq_f32(max, vec_r);
    result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));

    /* use calc'd indices to output RGB values */
    *dest++ = otdata_r[vgetq_lane_s32(result, 0)];
    *dest++ = otdata_g[vgetq_lane_s32(result, 1)];
    *dest++ = otdata_b[vgetq_lane_s32(result, 2)];
    *dest++ = alpha;
  }
}
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE])
{
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL = vmovq_n_f32(aGainL);
  float32x4_t vscaleR = vmovq_n_f32(aGainR);

  if (aIsOnTheLeft) {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmulq_f32(vinR0, vscaleR);
      voutR1 = vmulq_f32(vinR1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  } else {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmulq_f32(vinL0, vscaleL);
      voutL1 = vmulq_f32(vinL1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  }
}
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals,
                      const int n, const int len, const float *istd)
{
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;
    weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t)

    for (int i = 0; i < n; i += 4) {
        int32x4_t accum0 = { 0, 0, 0, 0 };
        int32x4_t accum1 = accum0;
        int32x4_t accum2 = accum0;
        int32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 8) {
            int16x4x2_t d0 = vld2_s16(data + j);
            int16x4x2_t w0 = vld2_s16(weights);
            int16x4x2_t w1 = vld2_s16(weights + 8);
            int16x4x2_t w2 = vld2_s16(weights + 16);
            int16x4x2_t w3 = vld2_s16(weights + 24);

            accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
            accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

            accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
            accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

            accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
            accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

            accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
            accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);

            weights += 32;
        }

        int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
        int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
        int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
        sum0 = vpadd_s32(sum0, sum1);
        sum1 = vpadd_s32(sum2, sum3);
        int32x4_t sum = vcombine_s32(sum0, sum1);

        float32x4_t val = vcvtq_f32_s32(sum);
        val = vmulq_f32(val, vld1q_f32(weightsf + i * 2));
        val = vmulq_n_f32(val, istd[0]);
        val = vaddq_f32(val, vld1q_f32(weightsf + i * 2 + 4));
        vst1q_f32(vals + i, val);
    }
}
/* f32x4 mm mul: C(Row x Col) = A(Row x T) * B(T x Col), column-major.
   Row and T are assumed to be multiples of 4. */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
	int i, k, j;
	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4) {
		for (k = 0; k < Col; k += 1) {
			neon_c = vmovq_n_f32(0);

			for (j = 0; j < T; j += 4) {
				/* Column j of a column-major A starts at j * Row.
				   (The original used j * T + i here, which is only
				   correct when Row == T.) */
				int a_idx = j * Row + i;

				neon_a0 = vld1q_f32(A + a_idx);
				a_idx += Row;
				neon_a1 = vld1q_f32(A + a_idx);
				a_idx += Row;
				neon_a2 = vld1q_f32(A + a_idx);
				a_idx += Row;
				neon_a3 = vld1q_f32(A + a_idx);

				/* Column k of B (T rows) starts at k * T. */
				neon_b = vld1q_f32(B + k * T + j);
				neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
				neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
				neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
				neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

				neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
			}

			/* Store the finished 4-row chunk once, after the reduction.
			   (The original stored lane-by-lane inside the j loop, which
			   is redundant but not incorrect.) */
			vst1q_f32(C + k * Row + i, neon_c);
		}
	}
}
void t1(AlignedAddr *addr1, AlignedAddr *addr2) {
// CHECK: @t1
// CHECK: call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %{{.*}}, i32 16)
  float32x4_t a = vld1q_f32(addr1);
// CHECK: call void @llvm.arm.neon.vst1.v4f32(i8* %{{.*}}, <4 x float> %{{.*}}, i32 16)
  vst1q_f32(addr2, a);
}
/*
 * Scale FFT data by 2/|length|. |length| must be a power of two.
 */
static inline void ScaleRFFTData(OMX_F32* fftData, unsigned length) {
  float32_t* data = (float32_t*)fftData;
  float32_t scale = 2.0f / length;

  if (length >= 4) {
    /*
     * Do 4 float elements at a time because |length| is always a
     * multiple of 4 when |length| >= 4.
     *
     * TODO(rtoy): Figure out how to process 8 elements at a time
     * using intrinsics or replace this with inline assembly.
     */
    do {
      float32x4_t x = vld1q_f32(data);

      length -= 4;
      x = vmulq_n_f32(x, scale);
      vst1q_f32(data, x);
      data += 4;
    } while (length > 0);
  } else if (length == 2) {
    float32x2_t x = vld1_f32(data);

    x = vmul_n_f32(x, scale);
    vst1_f32(data, x);
  } else {
    fftData[0] *= scale;
  }
}
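/*
 * Editorial usage sketch (not from the original; assumes OMX_F32 is a
 * 32-bit float typedef, as in OpenMAX DL). An 8-point buffer is scaled
 * by 2/8 = 0.25, so each 8.0f input becomes 2.0f.
 */
static void ScaleRFFTData_demo(void) {
  OMX_F32 buf[8] = {8, 8, 8, 8, 8, 8, 8, 8};
  ScaleRFFTData(buf, 8);
  /* every element of buf is now 2.0f */
}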
static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);

      vst1q_f32(&cohde[i], vec_cohde);
      vst1q_f32(&cohxd[i], vec_cohxd);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    cohde[i] = (aec->sde[i][0] * aec->sde[i][0] +
                aec->sde[i][1] * aec->sde[i][1]) /
               (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] = (aec->sxd[i][0] * aec->sxd[i][0] +
                aec->sxd[i][1] * aec->sxd[i][1]) /
               (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}
static int PartitionDelayNEON(const AecCore* aec) {
  // Measures the energy in each filter partition and returns the partition
  // with highest energy.
  // TODO(bjornv): Spread computational cost by computing one partition per
  // block?
  float wfEnMax = 0;
  int i;
  int delay = 0;

  for (i = 0; i < aec->num_partitions; i++) {
    int j;
    int pos = i * PART_LEN1;
    float wfEn = 0;
    float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
    }
    {
      float32x2_t vec_total;
      // A B C D
      vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
      // A+B C+D
      vec_total = vpadd_f32(vec_total, vec_total);
      // A+B+C+D A+B+C+D
      wfEn = vget_lane_f32(vec_total, 0);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
    }

    if (wfEn > wfEnMax) {
      wfEnMax = wfEn;
      delay = i;
    }
  }
  return delay;
}
int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (bias_term)
    {
        const float* scale_ptr = scale_data;
        const float* bias_ptr = bias_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float s = scale_ptr[q];
            float bias = bias_ptr[q];

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            float32x4_t _bias = vdupq_n_f32(bias);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmlaq_f32(_bias, _p, _s);
                vst1q_f32(outptr, _p);

                ptr += 4;
                outptr += 4;
            }
#endif // __ARM_NEON

            for (; remain>0; remain--)
            {
                *outptr = *ptr * s + bias;

                ptr++;
                outptr++;
            }
        }
    }
    else
    {
// Window time domain data to be used by the fft.
static void WindowDataNEON(float* x_windowed, const float* x) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
    const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
    const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
    // A B C D
    float32x4_t vec_sqrtHanning_rev =
        vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
    // B A D C
    vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
    // D C B A
    vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
                                       vget_low_f32(vec_sqrtHanning_rev));
    vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
    vst1q_f32(&x_windowed[PART_LEN + i],
              vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
  }
}
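/*
 * Editorial sketch (not from the original): the reversal idiom above,
 * factored into a helper. vrev64q_f32 reverses lanes within each 64-bit
 * half, and swapping the halves with vget_high/vget_low/vcombine then
 * yields the fully reversed vector.
 */
static inline float32x4_t reverse_f32x4(float32x4_t v) {
  v = vrev64q_f32(v);  // A B C D -> B A D C
  return vcombine_f32(vget_high_f32(v), vget_low_f32(v));  // -> D C B A
}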
void compare_neon_ge(float *psrc1, float src2, uchar *pdst, int size)
{
    int remainder = size - 7;
    float32x4_t vscalar = vdupq_n_f32(src2);
    int i = 0;

    /* Process 8 floats per iteration while at least 8 remain. (The original
       named the second psrc1 load "vsrc2_32x4", which collided with the
       scalar operand; renamed here for clarity.) */
    for (; i < remainder; i += 8) {
        float32x4_t vin_lo = vld1q_f32(psrc1 + i);
        float32x4_t vin_hi = vld1q_f32(psrc1 + i + 4);

        uint32x4_t vdst1 = vcgeq_f32(vin_lo, vscalar);
        uint32x4_t vdst2 = vcgeq_f32(vin_hi, vscalar);

        /* Narrow the two 32-bit lane masks down to one 8-bit mask vector
           (0xFF where >=, 0x00 otherwise). */
        uint16x4_t vdst1_16x4 = vmovn_u32(vdst1);
        uint16x4_t vdst2_16x4 = vmovn_u32(vdst2);
        uint16x8_t vdst_16x8 = vcombine_u16(vdst1_16x4, vdst2_16x4);
        uint8x8_t vdst_8x8 = vmovn_u16(vdst_16x8);

        vst1_u8(pdst + i, vdst_8x8);
    }

    for (; i < size; i++) {
        pdst[i] = (psrc1[i] >= src2) ? 255 : 0;
    }
}
static void neon_vector_mul(const std::vector<float>& vec_a,
                            const std::vector<float>& vec_b,
                            std::vector<float>& vec_result)
{
    assert(vec_a.size() == vec_b.size());
    assert(vec_a.size() == vec_result.size());

    int i = 0;

    // neon process
    for (; i < (int)vec_result.size() - 3; i += 4)
    {
        const auto data_a = vld1q_f32(&vec_a[i]);
        const auto data_b = vld1q_f32(&vec_b[i]);
        float* dst_ptr = &vec_result[i];
        const auto data_res = vmulq_f32(data_a, data_b);
        vst1q_f32(dst_ptr, data_res);
    }

    // normal process
    for (; i < (int)vec_result.size(); i++)
    {
        vec_result[i] = vec_a[i] * vec_b[i];
    }
}
//Kernel function: saxpy
void saxpy_vector(KernelArgs* args) {

    //Setup
    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);
    const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE));
    //Uniforms
    //Fuses
    //Literals
    //Stack variables
    float32x4_t scale, x, y, result, var060, var061;

    //Loop over input
    uint64_t index;
    for(index = 0; index < args->N; index += 4) {

        //Inputs
        scale = vld1q_f32(&args->scale[index]);
        x = vld1q_f32(&args->x[index]);
        y = vld1q_f32(&args->y[index]);

        //Begin kernel logic
        {

            //>>> result = scale * x + y
            var061 = vmulq_f32(scale, x);
            var060 = vaddq_f32(var061, y);
            result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result);

        }
        //End kernel logic

        //Outputs
        vst1q_f32(&args->result[index], result);
    }
}
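/*
 * Editorial note on the generated kernel above: MASK_TRUE round-trips an
 * all-ones lane mask through f32 and back (the u32->f32->u32 conversion
 * saturates back to 0xFFFFFFFF), so the vbslq_f32 select always picks
 * var060 and the three-statement body collapses to one fused
 * multiply-add. A hand-written equivalent of the kernel logic:
 *
 *     result = vmlaq_f32(y, scale, x);
 */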
/* f32x4 add */
void mw_neon_mm_add_f32x4(float * A, int Row, int Col, float * B, float * C)
{
	float32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	/* Vector loop: i is the (exclusive) end of the current 4-element
	   chunk, so the chunk itself starts at k = i - 4. */
	for (i = 4; i <= size; i += 4) {
		k = i - 4;
		neon_a = vld1q_f32(A + k);
		neon_b = vld1q_f32(B + k);
		neon_c = vaddq_f32(neon_a, neon_b);
		vst1q_f32(C + k, neon_c);
	}

	/* Scalar tail: after the loop, i - 4 is the first unprocessed
	   element (this also holds when size < 4 and the loop never ran). */
	k = i - 4;
	for (i = 0; i < size % 4; i++) {
		C[k + i] = A[k + i] + B[k + i];
	}
}
/* f32x4 mv mul: C(Row) = A(Row x T, column-major) * B(T).
   Row and T are assumed to be multiples of 4. */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;
	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4) {
		neon_c = vmovq_n_f32(0);

		for (k = 0; k < T; k += 4) {
			/* Column k of a column-major A starts at k * Row. (The
			   original used k * T + i, which is only correct for
			   square matrices.) */
			int j = k * Row + i;

			neon_a0 = vld1q_f32(A + j);
			neon_a1 = vld1q_f32(A + j + Row);
			neon_a2 = vld1q_f32(A + j + 2 * Row);
			neon_a3 = vld1q_f32(A + j + 3 * Row);

			neon_b = vld1q_f32(B + k);
			neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
			neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
			neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
			neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

			neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
		}

		vst1q_f32(C + i, neon_c);
	}
}
/**
 * @brief vector_dot_vector.
 *
 * @param dst[out]   the output element (1*1)
 * @param src1[in]   the input vector (1*n)
 *        src2[in]   the input vector (1*n)
 *        dimN[in]   size of vector; assumed to be >= 4, since the first
 *                   four elements are loaded unconditionally.
 *
 * @return void
 */
void neon_VecdotVec(float *dst, const float *src1, const float *src2,
                    const int dimN)
{
    const float *mat0 = src1;
    const float *mat1 = src2;

    float32x4_t q0 = vld1q_f32(mat0);
    float32x4_t q1 = vld1q_f32(mat1);
    q0 = vmulq_f32(q0, q1);

    int j = 4;
    for (; j <= dimN - 4; j += 4) {
        float32x4_t q2 = vld1q_f32(mat0 + j);
        float32x4_t q3 = vld1q_f32(mat1 + j);
        q0 = vmlaq_f32(q0, q2, q3);
    }

    /* Horizontal sum of the four partial sums. (The original extracted
       the lane by type-punning a pointer; vget_lane_f32 is the idiomatic
       equivalent.) */
    float32x2_t d0 = vpadd_f32(vget_low_f32(q0), vget_high_f32(q0));
    d0 = vpadd_f32(d0, d0);
    *dst = vget_lane_f32(d0, 0);

    /* Scalar tail for the last dimN % 4 elements. */
    for (; j < dimN; j++) {
        *dst += src1[j] * src2[j];
    }
}
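/*
 * Editorial sketch (not from the original): the pairwise reduction used
 * above, factored into a reusable helper. vpadd_f32 adds adjacent lane
 * pairs, so two rounds collapse four partial sums into one.
 */
static inline float hsum_f32x4(float32x4_t v)
{
    float32x2_t d = vpadd_f32(vget_low_f32(v), vget_high_f32(v)); /* a+b, c+d */
    d = vpadd_f32(d, d);                                          /* a+b+c+d */
    return vget_lane_f32(d, 0);
}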
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y,
                 VML_FLOAT * z, VML_FLOAT * other_params)
{
    unsigned int m = n >> 2;
    unsigned int k = n & 3, j;
    unsigned int l = n & (~3);

    for (j = 0; j < m; j++) {
        v4sf src = vld1q_f32(a + 4 * j);
        v4sf tem = simd_ln4f(src);
        vst1q_f32(y + 4 * j, tem);
    }

    for (j = 0; j < k; j++) {
        y[j + l] = logf(a[j + l]);
    }
}
int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        float bias = bias_ptr[q];

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _bias = vdupq_n_f32(bias);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vaddq_f32(_p, _bias);
            vst1q_f32(ptr, _outp);

            ptr += 4;
        }
#endif // __ARM_NEON

        for (; remain>0; remain--)
        {
            *ptr = *ptr + bias;

            ptr++;
        }
    }

    return 0;
}
void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
                                  float* aOutput, uint32_t aSize)
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  aSize -= dif;

  unsigned i = 0;
  for (; i < aSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));

    vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
    vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
    vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
    vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));

    vout0 = vmlaq_f32(vout0, vin0, vscale);
    vout1 = vmlaq_f32(vout1, vin1, vscale);
    vout2 = vmlaq_f32(vout2, vin2, vscale);
    vout3 = vmlaq_f32(vout3, vin3, vscale);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aOutput[i] += aInput[i] * aScale;
  }
}
void test_square_root_v4sf ()
{
  const float32_t pool[] = {4.0f, 9.0f, 16.0f, 25.0f};
  float32x4_t val;
  float32x4_t res;

  val = vld1q_f32 (pool);
  res = vsqrtq_f32 (val);

  if (vgetq_lane_f32 (res, 0) != 2.0f)
    abort ();
  if (vgetq_lane_f32 (res, 1) != 3.0f)
    abort ();
  if (vgetq_lane_f32 (res, 2) != 4.0f)
    abort ();
  if (vgetq_lane_f32 (res, 3) != 5.0f)
    abort ();
}
int main (void)
{
  volatile float32_t minus_e, pi, ln2, sqrt2, phi;
  float32_t expected, actual;
  float32x4_t arg2;
  float32_t arr[4];

  pi = 3.14159265359;
  arr[0] = minus_e = -2.71828;
  arr[1] = ln2 = 0.69314718056;
  arr[2] = sqrt2 = 1.41421356237;
  arr[3] = phi = 1.61803398874;
  arg2 = vld1q_f32 (arr);

  expected = pi * minus_e;
  actual = vmuls_laneq_f32 (pi, arg2, 0);
  if (expected != actual)
    abort ();

  expected = pi * ln2;
  actual = vmuls_laneq_f32 (pi, arg2, 1);
  if (expected != actual)
    abort ();

  expected = pi * sqrt2;
  actual = vmuls_laneq_f32 (pi, arg2, 2);
  if (expected != actual)
    abort ();

  expected = pi * phi;
  actual = vmuls_laneq_f32 (pi, arg2, 3);
  if (expected != actual)
    abort ();

  return 0;
}
/**
 * @brief vector scale & accu: A[] = alpha * B[] + beta * A[].
 *
 * @param dst[out]      the accumulating matrix A.
 *        src[in]       the input matrix B.
 *        alpha[in]     scale of B.
 *        beta[in]      scale of A.
 *        elemCnt[in]   number of elements to calc.
 *
 * @return void.
 */
void neon_axpby(float *dst, const float *src, const float alpha,
                const float beta, const int elemCnt)
{
    int i;
    for (i = 0; i <= elemCnt - 16; i += 16) {
        float32x4_t q0 = vld1q_f32(src + i);
        float32x4_t q1 = vld1q_f32(src + i + 4);
        float32x4_t q2 = vld1q_f32(src + i + 8);
        float32x4_t q3 = vld1q_f32(src + i + 12);
        float32x4_t q4 = vld1q_f32(dst + i);
        float32x4_t q5 = vld1q_f32(dst + i + 4);
        float32x4_t q6 = vld1q_f32(dst + i + 8);
        float32x4_t q7 = vld1q_f32(dst + i + 12);

        q0 = vmulq_n_f32(q0, alpha);
        q1 = vmulq_n_f32(q1, alpha);
        q2 = vmulq_n_f32(q2, alpha);
        q3 = vmulq_n_f32(q3, alpha);

        q0 = vmlaq_n_f32(q0, q4, beta);
        q1 = vmlaq_n_f32(q1, q5, beta);
        q2 = vmlaq_n_f32(q2, q6, beta);
        q3 = vmlaq_n_f32(q3, q7, beta);

        vst1q_f32(dst + i, q0);
        vst1q_f32(dst + i + 4, q1);
        vst1q_f32(dst + i + 8, q2);
        vst1q_f32(dst + i + 12, q3);
    }

    for (; i < elemCnt; i++) {
        dst[i] = src[i] * alpha + dst[i] * beta;
    }
}
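/*
 * Editorial usage sketch (not from the original; assumes <arm_neon.h> and
 * neon_axpby are visible). With 20 elements, the first 16 take the vector
 * path and the last 4 take the scalar tail.
 */
void neon_axpby_demo(void)
{
    float src[20], dst[20];
    for (int i = 0; i < 20; i++) {
        src[i] = 1.0f;
        dst[i] = 2.0f;
    }

    neon_axpby(dst, src, 2.0f, 0.5f, 20);
    /* every dst[i] is now 2.0f * 1.0f + 0.5f * 2.0f == 3.0f */
}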
static void FilterFarNEON(int num_partitions,
                          int x_fft_buf_block_pos,
                          float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
                          float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
                          float y_fft[2][PART_LEN1]) {
  int i;
  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
      const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
      const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]);
      const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]);
      const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re);
      const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im);
      const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re);
      const float32x4_t g = vaddq_f32(y_fft_re, e);
      const float32x4_t h = vaddq_f32(y_fft_im, f);
      vst1q_f32(&y_fft[0][j], g);
      vst1q_f32(&y_fft[1][j], h);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
    }
  }
}