f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        while (i + 4 <= size.width)
        {
            size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);

            for ( ; i <= lim; i += 4)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
            }

            float32x2_t vres = vpadd_f32(vget_low_f32(v_sum), vget_high_f32(v_sum));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
        }

        if (i + 2 <= size.width)
        {
            float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
            i += 2;
        }

        for ( ; i < size.width; ++i)
            result += src0[i] * src1[i];
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
void test_vpaddf32 (void)
{
  float32x2_t out_float32x2_t;
  float32x2_t arg0_float32x2_t;
  float32x2_t arg1_float32x2_t;

  out_float32x2_t = vpadd_f32 (arg0_float32x2_t, arg1_float32x2_t);
}
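// A minimal, self-contained sketch (not taken from any of the projects in this
// collection) illustrating what vpadd_f32 computes and how the snippets below
// use it to reduce a float32x4_t accumulator to a scalar. The helper name
// horizontal_sum_f32x4 and the main() driver are made up for illustration.
//
// vpadd_f32(a, b) adds adjacent pairs: { a[0] + a[1], b[0] + b[1] }.

#include <arm_neon.h>
#include <stdio.h>

static float horizontal_sum_f32x4(float32x4_t v)
{
    // {A, B, C, D} -> {A+B, C+D}
    float32x2_t sum = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
    // {A+B, C+D} -> {A+B+C+D, A+B+C+D}
    sum = vpadd_f32(sum, sum);
    return vget_lane_f32(sum, 0);
}

int main(void)
{
    const float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float32x4_t v = vld1q_f32(a);
    printf("%f\n", horizontal_sum_f32x4(v)); // prints 10.000000
    return 0;
}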
static int PartitionDelayNEON(const AecCore* aec) {
  // Measures the energy in each filter partition and returns the partition
  // with highest energy.
  // TODO(bjornv): Spread computational cost by computing one partition per
  //               block?
  float wfEnMax = 0;
  int i;
  int delay = 0;

  for (i = 0; i < aec->num_partitions; i++) {
    int j;
    int pos = i * PART_LEN1;
    float wfEn = 0;
    float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
    }
    {
      float32x2_t vec_total;
      // A B C D
      vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
      // A+B C+D
      vec_total = vpadd_f32(vec_total, vec_total);
      // A+B+C+D A+B+C+D
      wfEn = vget_lane_f32(vec_total, 0);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
    }

    if (wfEn > wfEnMax) {
      wfEnMax = wfEn;
      delay = i;
    }
  }
  return delay;
}
/**
 * @brief vector_dot_vector.
 *
 * @param dst[out]   the output element (1*1)
 *        src1[in]   the input vector (1*n)
 *        src2[in]   the input vector (1*n)
 *        dimN[in]   size of vector (assumed to be >= 4)
 *
 * @return void
 */
void neon_VecdotVec(float *dst, const float *src1, const float *src2,
                    const int dimN)
{
    float *mat0 = (float *)src1;
    float *mat1 = (float *)src2;

    // First block of four elements (the function assumes dimN >= 4).
    float32x4_t q0 = vld1q_f32(mat0);
    float32x4_t q1 = vld1q_f32(mat1);
    q0 = vmulq_f32(q0, q1);

    int j = 4;
    for (; j <= dimN - 4; j += 4) {
        float32x4_t q2 = vld1q_f32(mat0 + j);
        float32x4_t q3 = vld1q_f32(mat1 + j);
        q0 = vmlaq_f32(q0, q2, q3);
    }

    // Horizontal reduction of the four partial sums.
    float32x2_t d0 = vpadd_f32(vget_low_f32(q0), vget_high_f32(q0));
    d0 = vpadd_f32(d0, d0);
    *dst = vget_lane_f32(d0, 0);

    // Scalar tail for the remaining elements.
    for (; j < dimN; j++) {
        *dst += src1[j] * src2[j];
    }
}
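// A plain scalar reference (not part of the original snippet) that the NEON
// version above can be checked against; ref_vecdotvec is a hypothetical name.
// Unlike neon_VecdotVec, which loads the first four elements unconditionally
// and therefore requires dimN >= 4, this reference is valid for any
// non-negative dimN.

void ref_vecdotvec(float *dst, const float *src1, const float *src2,
                   const int dimN)
{
    float sum = 0.0f;
    for (int j = 0; j < dimN; j++) {
        sum += src1[j] * src2[j];
    }
    *dst = sum;
}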
void dotProd_neon(const float *data, const float *weights,
                  float *vals, const int n, const int len, const float *istd)
{
    for (int i = 0; i < n; i += 4) {
        float32x4_t accum0 = { 0.0f, 0.0f, 0.0f, 0.0f };
        float32x4_t accum1 = accum0;
        float32x4_t accum2 = accum0;
        float32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 4) {
            float32x4_t d0 = vld1q_f32(data + j);
            float32x4_t d1 = d0;
            float32x4_t d2 = d0;
            float32x4_t d3 = d0;

            float32x4_t w0 = vld1q_f32(weights);
            float32x4_t w1 = vld1q_f32(weights + 4);
            float32x4_t w2 = vld1q_f32(weights + 8);
            float32x4_t w3 = vld1q_f32(weights + 12);

            accum0 = vaddq_f32(accum0, vmulq_f32(d0, w0));
            accum1 = vaddq_f32(accum1, vmulq_f32(d1, w1));
            accum2 = vaddq_f32(accum2, vmulq_f32(d2, w2));
            accum3 = vaddq_f32(accum3, vmulq_f32(d3, w3));

            weights += 16;
        }

        float32x2_t sum0 = vpadd_f32(vget_low_f32(accum0), vget_high_f32(accum0));
        float32x2_t sum1 = vpadd_f32(vget_low_f32(accum1), vget_high_f32(accum1));
        float32x2_t sum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        float32x2_t sum3 = vpadd_f32(vget_low_f32(accum3), vget_high_f32(accum3));
        sum0 = vpadd_f32(sum0, sum1);
        sum1 = vpadd_f32(sum2, sum3);
        float32x4_t sum = vcombine_f32(sum0, sum1);

        sum = vmulq_n_f32(sum, istd[0]);
        sum = vaddq_f32(sum, vld1q_f32(weights + n*len + i));
        vst1q_f32(vals + i, sum);
    }
}
void computeNetwork0_neon(const float *input, const float *weights, uint8_t *d)
{
    float32x4_t m0 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float32x4_t m1 = m0;
    float32x4_t m2 = m0;
    float32x4_t m3 = m0;

    float32x4_t m4, m5, m6, m7;

    for (int i = 0; i < 192/4; i += 4) {
        m4 = vld1q_f32(input + i);
        m5 = m4;
        m6 = m4;
        m7 = m4;

        m4 = vmulq_f32(m4, vld1q_f32(weights + i * 4));
        m5 = vmulq_f32(m5, vld1q_f32(weights + i * 4 + 4));
        m6 = vmulq_f32(m6, vld1q_f32(weights + i * 4 + 8));
        m7 = vmulq_f32(m7, vld1q_f32(weights + i * 4 + 12));

        m0 = vaddq_f32(m0, m4);
        m1 = vaddq_f32(m1, m5);
        m2 = vaddq_f32(m2, m6);
        m3 = vaddq_f32(m3, m7);
    }

    float32x2_t sum0 = vpadd_f32(vget_low_f32(m0), vget_high_f32(m0));
    float32x2_t sum1 = vpadd_f32(vget_low_f32(m1), vget_high_f32(m1));
    float32x2_t sum2 = vpadd_f32(vget_low_f32(m2), vget_high_f32(m2));
    float32x2_t sum3 = vpadd_f32(vget_low_f32(m3), vget_high_f32(m3));
    sum0 = vpadd_f32(sum0, sum1);
    sum1 = vpadd_f32(sum2, sum3);
    m0 = vcombine_f32(sum0, sum1);

    m0 = vaddq_f32(m0, vld1q_f32(weights + 768/4));

    m1 = m0;
    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weights + 784/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (784+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (784+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weights + (784+48)/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weights + (784+64)/4));

    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    m3 = m0;
    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);

    m0 = vmulq_f32(m0, vld1q_f32(weights + 864/4));
    m1 = vmulq_f32(m1, vld1q_f32(weights + (864+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (864+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (864+48)/4));

    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);

    m4 = vmulq_f32(m4, vld1q_f32(weights + (864+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weights + (864+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weights + (864+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weights + (864+112)/4));

    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);

    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);

    m0 = vaddq_f32(m0, vld1q_f32(weights + (864+128)/4));

    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
// Updates the following smoothed Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is
// determined, upon which actions are taken.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh = aec->extended_filter_enabled
      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);

    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }

  {
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
                                vget_high_f32(vec_sdSum));
    vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
                                vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] = ptrGCoh[0] * aec->sx[i] +
                 ptrGCoh[1] * WEBRTC_SPL_MAX(
                     xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
                     WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] +
                     ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] +
                     ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] +
                     ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] +
                     ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
/**
 * @brief vector_mul_matrix.
 *
 * @param src1[in]   the input vector (1*k)
 *        src2[in]   the input matrix (k*n)
 *        dst[out]   the output vector (1*n)
 *        kn[in]     DIM_K & DIM_N
 *
 * @return void
 */
void neon_vectormulmatrix_float(float * dst, const float * src1,
                                const float * src2, int *kn)
{
    int j, l;
    int k = kn[0];
    int n = kn[1];
    const float * src1_p = src1;
    const float * src2_p = src2;
    float * dst_p = dst;

    for (j = 0; j <= n - 4; j += 4) {
        float32x2_t d16 = {0};
        float32x2_t d17 = {0};
        float32x2_t d18 = {0};
        float32x2_t d19 = {0};
        float32x2_t d20 = {0};
        float32x2_t d21 = {0};
        float32x4_t q0;

        src1_p = src1;
        src2_p = src2 + j * k;

        for (l = 0; l <= k - 4; l += 4) {
            // Matrix A
            float32x4_t q8 = vld1q_f32(src1_p);
            float32x2_t d0 = vget_low_f32(q8);
            float32x2_t d1 = vget_high_f32(q8);

            // Matrix B
            float32x4_t q12 = vld1q_f32(src2_p);
            float32x4_t q13 = vld1q_f32(src2_p + k);
            float32x4_t q14 = vld1q_f32(src2_p + k * 2);
            float32x4_t q15 = vld1q_f32(src2_p + k * 3);

            float32x2_t d8  = vget_low_f32(q12);
            float32x2_t d9  = vget_high_f32(q12);
            float32x2_t d10 = vget_low_f32(q13);
            float32x2_t d11 = vget_high_f32(q13);
            float32x2_t d12 = vget_low_f32(q14);
            float32x2_t d13 = vget_high_f32(q14);
            float32x2_t d14 = vget_low_f32(q15);
            float32x2_t d15 = vget_high_f32(q15);

            d16 = vmla_f32(d16, d0, d8);
            d17 = vmla_f32(d17, d0, d10);
            d18 = vmla_f32(d18, d0, d12);
            d19 = vmla_f32(d19, d0, d14);
            d16 = vmla_f32(d16, d1, d9);
            d17 = vmla_f32(d17, d1, d11);
            d18 = vmla_f32(d18, d1, d13);
            d19 = vmla_f32(d19, d1, d15);

            src1_p += 4;
            src2_p += 4;
        } // end for l

        d16 = vpadd_f32(d16, d17);
        d18 = vpadd_f32(d18, d19);

        // Scalar tail when k is not a multiple of 4.
        float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
        for (; l < k; l++) {
            float src1_d = *src1_p;
            sum0 += src1_d * *src2_p;
            sum1 += src1_d * *(src2_p + k);
            sum2 += src1_d * *(src2_p + 2 * k);
            sum3 += src1_d * *(src2_p + 3 * k);
            src1_p++;
            src2_p++;
        }

        d20 = vset_lane_f32(sum0, d20, 0);
        d20 = vset_lane_f32(sum1, d20, 1);
        d21 = vset_lane_f32(sum2, d21, 0);
        d21 = vset_lane_f32(sum3, d21, 1);

        q0 = vaddq_f32(vcombine_f32(d16, d18), vcombine_f32(d20, d21));
        vst1q_f32(dst_p, q0);
        dst_p += 4;
    } // end for j
}
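// Scalar reference (not part of the original) for checking
// neon_vectormulmatrix_float; ref_vectormulmatrix_float is a hypothetical
// name. Note the layout implied by the NEON loads above (src2_p + k,
// src2_p + k*2, ...): the k*n matrix is stored column by column, so element
// (row l, column j) lives at src2[j * k + l]. The NEON routine only writes
// the first n - (n % 4) outputs; this reference writes all n.

void ref_vectormulmatrix_float(float * dst, const float * src1,
                               const float * src2, int *kn)
{
    int k = kn[0];
    int n = kn[1];
    for (int j = 0; j < n; j++) {
        float sum = 0.0f;
        for (int l = 0; l < k; l++) {
            sum += src1[l] * src2[j * k + l];
        }
        dst[j] = sum;
    }
}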
/**
 * @brief Multiply a matrix A of type Elem_t by a matrix B of type Elem_t.
 *
 * @param dst[out]   the output matrix C.
 *        src1[in]   the input matrix A.
 *        src2[in]   the input matrix B.
 *        mkn[in]    the matrix dimensions (DIM_M, DIM_K, DIM_N).
 *
 * @return void
 */
void neon_matrixmul_4x4float(Elem_t * dst, Elem_t * src1, Elem_t * src2, int *mkn)
{
    int m = mkn[0];
    int k = mkn[1];
    int n = mkn[2];

    for (int i = 0; i < m; i += 4) {
        for (int j = 0; j < n; j += 4) {
            float32x2_t d16 = {0};
            float32x2_t d17 = {0};
            float32x2_t d18 = {0};
            float32x2_t d19 = {0};
            float32x2_t d20 = {0};
            float32x2_t d21 = {0};
            float32x2_t d22 = {0};
            float32x2_t d23 = {0};
            float32x2_t d24 = {0};
            float32x2_t d25 = {0};
            float32x2_t d26 = {0};
            float32x2_t d27 = {0};
            float32x2_t d28 = {0};
            float32x2_t d29 = {0};
            float32x2_t d30 = {0};
            float32x2_t d31 = {0};

            for (int l = 0; l < k; l += 4) {
                // Matrix A
                float32x4_t q8  = vld1q_f32(src1      );
                float32x4_t q9  = vld1q_f32(src1 + k  );
                float32x4_t q10 = vld1q_f32(src1 + k*2);
                float32x4_t q11 = vld1q_f32(src1 + k*3);
                float32x2_t d0 = vget_low_f32(q8);
                float32x2_t d1 = vget_high_f32(q8);
                float32x2_t d2 = vget_low_f32(q9);
                float32x2_t d3 = vget_high_f32(q9);
                float32x2_t d4 = vget_low_f32(q10);
                float32x2_t d5 = vget_high_f32(q10);
                float32x2_t d6 = vget_low_f32(q11);
                float32x2_t d7 = vget_high_f32(q11);

                // Matrix B
                float32x4_t q12 = vld1q_f32(src2      );
                float32x4_t q13 = vld1q_f32(src2 + k  );
                float32x4_t q14 = vld1q_f32(src2 + k*2);
                float32x4_t q15 = vld1q_f32(src2 + k*3);
                float32x2_t d8  = vget_low_f32(q12);
                float32x2_t d9  = vget_high_f32(q12);
                float32x2_t d10 = vget_low_f32(q13);
                float32x2_t d11 = vget_high_f32(q13);
                float32x2_t d12 = vget_low_f32(q14);
                float32x2_t d13 = vget_high_f32(q14);
                float32x2_t d14 = vget_low_f32(q15);
                float32x2_t d15 = vget_high_f32(q15);

                d16 = vmla_f32(d16, d0, d8);
                d17 = vmla_f32(d17, d0, d10);
                d18 = vmla_f32(d18, d0, d12);
                d19 = vmla_f32(d19, d0, d14);
                d16 = vmla_f32(d16, d1, d9);
                d17 = vmla_f32(d17, d1, d11);
                d18 = vmla_f32(d18, d1, d13);
                d19 = vmla_f32(d19, d1, d15);

                d20 = vmla_f32(d20, d2, d8);
                d21 = vmla_f32(d21, d2, d10);
                d22 = vmla_f32(d22, d2, d12);
                d23 = vmla_f32(d23, d2, d14);
                d20 = vmla_f32(d20, d3, d9);
                d21 = vmla_f32(d21, d3, d11);
                d22 = vmla_f32(d22, d3, d13);
                d23 = vmla_f32(d23, d3, d15);

                d24 = vmla_f32(d24, d4, d8);
                d25 = vmla_f32(d25, d4, d10);
                d26 = vmla_f32(d26, d4, d12);
                d27 = vmla_f32(d27, d4, d14);
                d24 = vmla_f32(d24, d5, d9);
                d25 = vmla_f32(d25, d5, d11);
                d26 = vmla_f32(d26, d5, d13);
                d27 = vmla_f32(d27, d5, d15);

                d28 = vmla_f32(d28, d6, d8);
                d29 = vmla_f32(d29, d6, d10);
                d30 = vmla_f32(d30, d6, d12);
                d31 = vmla_f32(d31, d6, d14);
                d28 = vmla_f32(d28, d7, d9);
                d29 = vmla_f32(d29, d7, d11);
                d30 = vmla_f32(d30, d7, d13);
                d31 = vmla_f32(d31, d7, d15);

                src1 += 4;
                src2 += 4;
            } // end for l

            d16 = vpadd_f32(d16, d17);
            d18 = vpadd_f32(d18, d19);
            d20 = vpadd_f32(d20, d21);
            d22 = vpadd_f32(d22, d23);
            d24 = vpadd_f32(d24, d25);
            d26 = vpadd_f32(d26, d27);
            d28 = vpadd_f32(d28, d29);
            d30 = vpadd_f32(d30, d31);

            vst1q_f32(dst      , vcombine_f32(d16, d18));
            vst1q_f32(dst + n  , vcombine_f32(d20, d22));
            vst1q_f32(dst + n*2, vcombine_f32(d24, d26));
            vst1q_f32(dst + n*3, vcombine_f32(d28, d30));

            src1 -= k;
            src2 += k*3;
            dst += 4;
        } // end for j

        src1 += k*4;
        src2 -= k*n;
        dst += n*3;
    } // end for i
}
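// Scalar reference (not part of the original) describing the contract of
// neon_matrixmul_4x4float above; ref_matrixmul is a hypothetical name.
// The NEON code reads src2 in rows of length k (src2 + k, src2 + k*2, ...),
// so the k*n matrix B is expected in transposed storage: element
// B(row l, column j) lives at src2[j * k + l]. A is row-major m*k and C is
// row-major m*n. m, k and n are assumed to be multiples of 4, matching the
// 4x4 blocking above.

void ref_matrixmul(Elem_t * dst, const Elem_t * src1, const Elem_t * src2,
                   int *mkn)
{
    int m = mkn[0];
    int k = mkn[1];
    int n = mkn[2];
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            Elem_t sum = 0;
            for (int l = 0; l < k; l++) {
                sum += src1[i * k + l] * src2[j * k + l];
            }
            dst[i * n + j] = sum;
        }
    }
}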