Пример #1
0
// Dot product of two 2D f32 arrays, accumulated into an f64 result.
// Partial sums are kept in f32 NEON registers only for bounded blocks
// (see DOT_FLOAT_BLOCKSIZE) to limit single-precision rounding error.
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // If both sources are dense (stride equals the row width in bytes) and
    // identically strided, collapse the 2D region into one long row so the
    // vector loop below runs without per-row restarts.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

// Maximum number of elements accumulated in a single f32 SIMD register
// before the partial sum is flushed into the f64 total; this bounds the
// f32 rounding error of the running sum.
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        while(i + 4 <= size.width)
        {
            // Process at most DOT_FLOAT_BLOCKSIZE elements with one f32
            // accumulator, then spill it into the f64 result.
            size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);

            for( ; i <= lim; i += 4 )
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
            }

            // Horizontal add of the 4 partial sums in v_sum.
            float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
        }

        // Two leftover elements handled with a 64-bit vector multiply.
        if(i + 2 <= size.width)
        {
            float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
            i += 2;
        }

        // Scalar tail for the final element, if any.
        for (; i < size.width; ++i)
            result += src0[i] * src1[i];
    }
    return result;
#else
    // NEON not enabled in this build: parameters intentionally unused.
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
Пример #2
0
// Compile-only smoke test for the vpadd_f32 intrinsic: verifies that the
// intrinsic accepts two float32x2_t operands and yields a float32x2_t.
// The arguments are deliberately left uninitialized and the result is
// never read -- only successful code generation matters here (this is the
// typical shape of compiler intrinsic testsuites; presumably from one --
// TODO confirm provenance before reusing as runtime code).
void test_vpaddf32 (void)
{
  float32x2_t out_float32x2_t;
  float32x2_t arg0_float32x2_t;
  float32x2_t arg1_float32x2_t;

  out_float32x2_t = vpadd_f32 (arg0_float32x2_t, arg1_float32x2_t);
}
Пример #3
0
// Returns the index of the adaptive-filter partition with the highest
// energy.  wfBuf[0]/wfBuf[1] presumably hold the real/imaginary parts of
// the frequency-domain filter (consistent with the complex arithmetic in
// SmoothedPSD), so the energy per bin is wfBuf[0]^2 + wfBuf[1]^2.
static int PartitionDelayNEON(const AecCore* aec) {
  // Measures the energy in each filter partition and returns the partition with
  // highest energy.
  // TODO(bjornv): Spread computational cost by computing one partition per
  // block?
  float wfEnMax = 0;
  int i;
  int delay = 0;

  for (i = 0; i < aec->num_partitions; i++) {
    int j;
    int pos = i * PART_LEN1;  // start offset of this partition's bins
    float wfEn = 0;
    float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
      // Accumulate squared magnitudes of four bins at once.
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
    }
    {
      // Horizontal reduction of the four partial sums to a scalar.
      float32x2_t vec_total;
      // A B C D
      vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
      // A+B C+D
      vec_total = vpadd_f32(vec_total, vec_total);
      // A+B+C+D A+B+C+D
      wfEn = vget_lane_f32(vec_total, 0);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
    }

    // Track the partition with the largest energy seen so far.
    if (wfEn > wfEnMax) {
      wfEnMax = wfEn;
      delay = i;
    }
  }
  return delay;
}
Пример #4
0
/**
 * @brief   vector_dot_vector: dot product of two float vectors.
 *
 * @param   dst[out]     the output element(1*1)
 * @param   src1[in]     the input  vector(1*n)
 *          src2[in]     the input  vector(1*n)
 *          dimN[in]     size of vector
 *
 * @return  void
 *
 * @note    Fixes over the original: (1) vectors shorter than 4 elements no
 *          longer trigger an out-of-bounds 4-float load; (2) the result is
 *          extracted with vget_lane_f32 instead of a pointer type-pun,
 *          which was undefined behavior.
 */
void neon_VecdotVec(float *dst,
                    const float *src1,
                    const float *src2,
                    const int dimN)
{
    // Scalar fallback for short vectors: the SIMD path below loads four
    // floats unconditionally, which would read past the end for dimN < 4.
    if (dimN < 4) {
        float acc = 0.0f;
        for (int j = 0; j < dimN; j++)
            acc += src1[j] * src2[j];
        *dst = acc;
        return;
    }
    // Seed the accumulator with the first four products, then fold in the
    // remaining full groups of four.
    float32x4_t q0 = vmulq_f32(vld1q_f32(src1), vld1q_f32(src2));
    int j = 4;
    for (; j <= dimN - 4; j += 4)
    {
        q0 = vmlaq_f32(q0, vld1q_f32(src1 + j), vld1q_f32(src2 + j));
    }
    // Horizontal add of the four partial sums.
    float32x2_t d0 = vpadd_f32(vget_low_f32(q0), vget_high_f32(q0));
    d0 = vpadd_f32(d0, d0);
    // Extract lane 0 directly (no pointer cast / type punning).
    *dst = vget_lane_f32(d0, 0);
    // Scalar tail for the final dimN % 4 elements.
    for (; j < dimN; j++) {
        *dst += src1[j] * src2[j];
    }
}
Пример #5
0
// Computes n dot products of the single input vector `data` (length `len`)
// against n interleaved weight vectors, applies a bias and scales by
// istd[0], writing four results per iteration to `vals`.
// Assumes n and len are multiples of 4 (no tail handling).
// NOTE(review): the bias load is taken relative to the *advanced* weight
// pointer (`w + n*len + i`), exactly as in the original -- verify this
// matches the caller's weight-buffer layout.
void dotProd_neon(const float *data, const float *weights, float *vals, const int n, const int len, const float *istd) {
    const float *w = weights;

    for (int i = 0; i < n; i += 4) {
        float32x4_t acc0 = vdupq_n_f32(0.0f);
        float32x4_t acc1 = vdupq_n_f32(0.0f);
        float32x4_t acc2 = vdupq_n_f32(0.0f);
        float32x4_t acc3 = vdupq_n_f32(0.0f);

        for (int j = 0; j < len; j += 4, w += 16) {
            // The same four input values feed all four accumulators; the
            // four weight rows are interleaved in blocks of four floats.
            const float32x4_t d = vld1q_f32(data + j);

            // Separate multiply + add (not vmla) to keep rounding behavior
            // identical to the original on all targets.
            acc0 = vaddq_f32(acc0, vmulq_f32(d, vld1q_f32(w)));
            acc1 = vaddq_f32(acc1, vmulq_f32(d, vld1q_f32(w + 4)));
            acc2 = vaddq_f32(acc2, vmulq_f32(d, vld1q_f32(w + 8)));
            acc3 = vaddq_f32(acc3, vmulq_f32(d, vld1q_f32(w + 12)));
        }

        // Horizontal reduction: collapse each accumulator pairwise, then
        // pack the four scalar sums into one vector.
        float32x2_t p0 = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
        float32x2_t p1 = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
        float32x2_t p2 = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
        float32x2_t p3 = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
        p0 = vpadd_f32(p0, p1);
        p1 = vpadd_f32(p2, p3);
        float32x4_t sums = vcombine_f32(p0, p1);

        // Scale by the inverse standard deviation and add the bias row.
        sums = vmulq_n_f32(sums, istd[0]);
        sums = vaddq_f32(sums, vld1q_f32(w + n * len + i));
        vst1q_f32(vals + i, sums);
    }
}
Пример #6
0
// Evaluates a small fixed-topology neural network over 48 input floats and
// writes a single 0/1 decision byte to d[0].
// NOTE(review): relies on file-scope constants/helpers defined elsewhere
// (sign_bits_f_zero_l, sign_bits_f, ones_f, reciprocal()).  The repeated
// sequence "mask sign bits, add one, multiply by reciprocal" looks like a
// softsign-style activation x / (|x| + 1) -- confirm against the scalar
// reference.  Weight offsets written as N/4 (768/4, 784/4, 864/4, ...) are
// presumably byte offsets divided by sizeof(float) -- verify layout.
void computeNetwork0_neon(const float *input, const float *weights, uint8_t *d) {
    // Layer 1 accumulators: four parallel dot products over the 48 inputs.
    float32x4_t m0 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float32x4_t m1 = m0;
    float32x4_t m2 = m0;
    float32x4_t m3 = m0;

    float32x4_t m4, m5, m6, m7;

    // Consume input[0..47] in groups of four; each group multiplies against
    // four interleaved 4-float weight rows.
    for (int i = 0; i < 192/4; i += 4) {
        m4 = vld1q_f32(input + i);
        m5 = m4;
        m6 = m4;
        m7 = m4;

        m4 = vmulq_f32(m4, vld1q_f32(weights + i * 4));
        m5 = vmulq_f32(m5, vld1q_f32(weights + i * 4 + 4));
        m6 = vmulq_f32(m6, vld1q_f32(weights + i * 4 + 8));
        m7 = vmulq_f32(m7, vld1q_f32(weights + i * 4 + 12));

        m0 = vaddq_f32(m0, m4);
        m1 = vaddq_f32(m1, m5);
        m2 = vaddq_f32(m2, m6);
        m3 = vaddq_f32(m3, m7);
    }

    // Horizontal reduction: collapse each accumulator to one lane and pack
    // the four sums into m0.
    float32x2_t sum0 = vpadd_f32(vget_low_f32(m0), vget_high_f32(m0));
    float32x2_t sum1 = vpadd_f32(vget_low_f32(m1), vget_high_f32(m1));
    float32x2_t sum2 = vpadd_f32(vget_low_f32(m2), vget_high_f32(m2));
    float32x2_t sum3 = vpadd_f32(vget_low_f32(m3), vget_high_f32(m3));
    sum0 = vpadd_f32(sum0, sum1);
    sum1 = vpadd_f32(sum2, sum3);
    m0 = vcombine_f32(sum0, sum1);

    // Add the layer-1 bias vector.
    m0 = vaddq_f32(m0, vld1q_f32(weights + 768/4));

    // Activation: mask sign bits, add 1, multiply original by reciprocal
    // (presumably x / (|x| + 1); zero mask on the low lanes per the
    // constant's name -- confirm).
    m1 = m0;
    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    // Layer 2: broadcast each of the four activated values ...
    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    // ... and multiply by its 4-wide weight row.
    m1 = vmulq_f32(m1, vld1q_f32(weights + 784/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (784+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (784+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weights + (784+48)/4));

    // Sum the contributions and add the layer-2 bias.
    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weights + (784+64)/4));

    // Second activation (same masked-reciprocal pattern, full sign mask).
    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    m3 = m0;

    // Output layer: broadcast the four layer-1 activations (m0) ...
    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);

    m0 = vmulq_f32(m0, vld1q_f32(weights + 864/4));
    m1 = vmulq_f32(m1, vld1q_f32(weights + (864+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (864+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (864+48)/4));

    // ... and the four layer-2 activations (m7) against their weight rows.
    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);

    m4 = vmulq_f32(m4, vld1q_f32(weights + (864+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weights + (864+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weights + (864+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weights + (864+112)/4));

    // Tree-sum all eight contributions.
    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);

    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);

    // Output bias.
    m0 = vaddq_f32(m0, vld1q_f32(weights + (864+128)/4));

    // Decision: compare the per-lane maxima of the two output halves;
    // d[0] = 1 when lane 0's max is >= lane 1's max.
    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
Пример #7
0
// Updates the following smoothed  Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, also the filter diverge state is determined
// upon actions are taken.
//
// All PSDs are exponentially smoothed: new = GCoh[0]*old + GCoh[1]*estimate.
// dfw/efw/xfw are complex spectra split as [0]=real, [1]=imaginary
// (consistent with the |.|^2 and conjugate-product math below).
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh = aec->extended_filter_enabled
      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 =  vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  // Vectorized loop: four frequency bins per iteration.
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    // old * GCoh[0] terms of the smoothing update.
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    // Squared magnitudes: real^2 + imag^2 per bin.
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);

    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    // Clamp the farend PSD from below (see the explanation in the scalar
    // loop further down).
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      // Cross-PSD sde: complex product conj(dfw) * efw, stored interleaved
      // as [real, imag] pairs (hence vld2q/vst2q de-interleaving loads).
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      // Cross-PSD sxd: same structure as sde, against the far-end spectrum.
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    // Running totals of the updated near-end and echo PSDs (used for the
    // divergence decision after the loops).
    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }
  {
    // Horizontal reduction of the vector totals into scalars.
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
                                vget_high_f32(vec_sdSum));
    vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
                                vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] =
        ptrGCoh[0] * aec->sx[i] +
        ptrGCoh[1] * WEBRTC_SPL_MAX(
            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
            WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] =
        ptrGCoh[0] * aec->sxd[i][1] +
        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
Пример #8
0
/**
 * @brief   vector_mul_matrix: dst(1*n) = src1(1*k) x src2.
 *
 * @param   dst[out]     the output vector(1*n)
 * @param   src1[in]     the input  vector(1*k)
 * @param   src2[in]     the input  matrix; output column j reads k
 *                       consecutive floats starting at src2 + j*k
 * @param   kn[in]       kn[0] = DIM_K, kn[1] = DIM_N
 *
 * @return  void
 *
 * @note    Only the first n - (n % 4) outputs are written (the outer loop
 *          has no tail), matching the original behavior; callers must pass
 *          n as a multiple of 4 for full coverage.
 *          Fix over the original: d20/d21 were declared uninitialized and
 *          then read by vset_lane_f32, which is undefined behavior; they
 *          are now zero-initialized.
 */
void neon_vectormulmatrix_float(float * dst,
                                const float * src1,
                                const float * src2,
                                int *kn)
{
    int j ,l;
    int k = kn[0];
    int n = kn[1];
    const float * src1_p = src1;
    const float * src2_p = src2;
    float * dst_p = dst;
    for (j = 0; j <= n - 4; j += 4) {
        // Two-lane accumulators, one per output column of this group.
        float32x2_t d16 = {0};
        float32x2_t d17 = {0};
        float32x2_t d18 = {0};
        float32x2_t d19 = {0};
        // Zero-initialize the scalar-tail lanes before vset_lane_f32 below.
        float32x2_t d20 = vdup_n_f32(0.0f);
        float32x2_t d21 = vdup_n_f32(0.0f);
        float32x4_t q0;
        src1_p = src1;
        src2_p = src2 + j * k;
        for (l = 0; l <= k - 4; l += 4) {
            // Four elements of the input vector.
            float32x4_t q8  = vld1q_f32(src1_p);
            float32x2_t d0 = vget_low_f32(q8);
            float32x2_t d1 = vget_high_f32(q8);
            // Matching elements of the four matrix columns (stride k).
            float32x4_t q12 = vld1q_f32(src2_p);
            float32x4_t q13 = vld1q_f32(src2_p + k);
            float32x4_t q14 = vld1q_f32(src2_p + k * 2);
            float32x4_t q15 = vld1q_f32(src2_p + k * 3);
            float32x2_t d8  = vget_low_f32(q12);
            float32x2_t d9  = vget_high_f32(q12);
            float32x2_t d10 = vget_low_f32(q13);
            float32x2_t d11 = vget_high_f32(q13);
            float32x2_t d12 = vget_low_f32(q14);
            float32x2_t d13 = vget_high_f32(q14);
            float32x2_t d14 = vget_low_f32(q15);
            float32x2_t d15 = vget_high_f32(q15);
            d16 = vmla_f32(d16, d0, d8);
            d17 = vmla_f32(d17, d0, d10);
            d18 = vmla_f32(d18, d0, d12);
            d19 = vmla_f32(d19, d0, d14);
            d16 = vmla_f32(d16, d1, d9);
            d17 = vmla_f32(d17, d1, d11);
            d18 = vmla_f32(d18, d1, d13);
            d19 = vmla_f32(d19, d1, d15);
            src1_p += 4;
            src2_p += 4;
        }// end for l
        // Pairwise-add reduces each accumulator pair to one lane per column.
        d16 = vpadd_f32(d16, d17);
        d18 = vpadd_f32(d18, d19);
        // Scalar tail for the k % 4 leftover elements of each column.
        float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
        for(; l < k; l ++) {
            float src1_d;
            src1_d = *src1_p;
            sum0 +=  src1_d * *src2_p;
            sum1 +=  src1_d * *(src2_p + k);
            sum2 +=  src1_d * *(src2_p + 2 * k);
            sum3 +=  src1_d * *(src2_p + 3 * k);
            src1_p++;
            src2_p++;
        }
        d20 = vset_lane_f32(sum0, d20, 0);
        d20 = vset_lane_f32(sum1, d20, 1);
        d21 = vset_lane_f32(sum2, d21, 0);
        d21 = vset_lane_f32(sum3, d21, 1);
        // Combine vector partial sums with the scalar tail sums and store.
        q0 = vaddq_f32(vcombine_f32(d16, d18), vcombine_f32(d20, d21));
        vst1q_f32(dst_p, q0);
        dst_p  += 4;
    }// end for j
}
Пример #9
0
/**
 * @brief   Multiplies matrix A (m*k) by a second operand into matrix
 *          C (m*n) in 4x4 tiles using NEON 64-bit multiply-accumulate.
 *          (Header reconstructed in English; the original comment text
 *          was mojibake-encoded.)
 *
 * @param   dst[out]     output matrix C (m*n).
 *          src1[in]     input matrix A (m*k), row-major with row stride k.
 *          src2[in]     input matrix B; addressed with k consecutive
 *                       floats per loaded row of the tile (src2 + k,
 *                       src2 + k*2, ...), i.e. effectively A * B^T layout
 *                       -- TODO confirm the intended storage of B against
 *                       the caller.
 *          mkn[in]      dimensions: mkn[0]=m, mkn[1]=k, mkn[2]=n.
 *
 * @return  void
 *
 * @note    m, k and n must all be multiples of 4: none of the three loops
 *          has tail handling.  src1/src2/dst are advanced and rewound as
 *          the tiles move, so the pointers end past their buffers.
 */
void neon_matrixmul_4x4float(Elem_t * dst,
                             Elem_t * src1,
                             Elem_t * src2,
                             int *mkn)
{
    int m = mkn[0];
    int k = mkn[1];
    int n = mkn[2];
    
    // One 4x4 output tile per (i, j) iteration.
    for (int i = 0; i < m; i += 4)
    {
        for (int j = 0; j < n; j += 4)
        {
            // Sixteen two-lane accumulators: d16..d31 hold the 4x4 tile,
            // one pair of partial sums per output element (pairwise-added
            // after the l loop).
            float32x2_t d16 = {0};
            float32x2_t d17 = {0};
            float32x2_t d18 = {0};
            float32x2_t d19 = {0};
            float32x2_t d20 = {0};
            float32x2_t d21 = {0};
            float32x2_t d22 = {0};
            float32x2_t d23 = {0};
            float32x2_t d24 = {0};
            float32x2_t d25 = {0};
            float32x2_t d26 = {0};
            float32x2_t d27 = {0};
            float32x2_t d28 = {0};
            float32x2_t d29 = {0};
            float32x2_t d30 = {0};
            float32x2_t d31 = {0};
            
            // Walk the shared dimension four elements at a time.
            for (int l = 0; l < k; l += 4)
            {
                // Matrix A
                float32x4_t q8  = vld1q_f32(src1      );
                float32x4_t q9  = vld1q_f32(src1 + k  );
                float32x4_t q10 = vld1q_f32(src1 + k*2);
                float32x4_t q11 = vld1q_f32(src1 + k*3);
                float32x2_t d0 = vget_low_f32(q8);
                float32x2_t d1 = vget_high_f32(q8);
                float32x2_t d2 = vget_low_f32(q9);
                float32x2_t d3 = vget_high_f32(q9);
                float32x2_t d4 = vget_low_f32(q10);
                float32x2_t d5 = vget_high_f32(q10);
                float32x2_t d6 = vget_low_f32(q11);
                float32x2_t d7 = vget_high_f32(q11);
                
                // Matrix B
                float32x4_t q12 = vld1q_f32(src2      );
                float32x4_t q13 = vld1q_f32(src2 + k  );
                float32x4_t q14 = vld1q_f32(src2 + k*2);
                float32x4_t q15 = vld1q_f32(src2 + k*3);
                float32x2_t d8  = vget_low_f32(q12);
                float32x2_t d9  = vget_high_f32(q12);
                float32x2_t d10 = vget_low_f32(q13);
                float32x2_t d11 = vget_high_f32(q13);
                float32x2_t d12 = vget_low_f32(q14);
                float32x2_t d13 = vget_high_f32(q14);
                float32x2_t d14 = vget_low_f32(q15);
                float32x2_t d15 = vget_high_f32(q15);
                
                // Row 0 of the tile: A row 0 against all four B rows.
                d16 = vmla_f32(d16, d0, d8);
                d17 = vmla_f32(d17, d0, d10);
                d18 = vmla_f32(d18, d0, d12);
                d19 = vmla_f32(d19, d0, d14);
                d16 = vmla_f32(d16, d1, d9);
                d17 = vmla_f32(d17, d1, d11);
                d18 = vmla_f32(d18, d1, d13);
                d19 = vmla_f32(d19, d1, d15);
                
                // Row 1 of the tile.
                d20 = vmla_f32(d20, d2, d8);
                d21 = vmla_f32(d21, d2, d10);
                d22 = vmla_f32(d22, d2, d12);
                d23 = vmla_f32(d23, d2, d14);
                d20 = vmla_f32(d20, d3, d9);
                d21 = vmla_f32(d21, d3, d11);
                d22 = vmla_f32(d22, d3, d13);
                d23 = vmla_f32(d23, d3, d15);
                
                // Row 2 of the tile.
                d24 = vmla_f32(d24, d4, d8);
                d25 = vmla_f32(d25, d4, d10);
                d26 = vmla_f32(d26, d4, d12);
                d27 = vmla_f32(d27, d4, d14);
                d24 = vmla_f32(d24, d5, d9);
                d25 = vmla_f32(d25, d5, d11);
                d26 = vmla_f32(d26, d5, d13);
                d27 = vmla_f32(d27, d5, d15);
                
                // Row 3 of the tile.
                d28 = vmla_f32(d28, d6, d8);
                d29 = vmla_f32(d29, d6, d10);
                d30 = vmla_f32(d30, d6, d12);
                d31 = vmla_f32(d31, d6, d14);
                d28 = vmla_f32(d28, d7, d9);
                d29 = vmla_f32(d29, d7, d11);
                d30 = vmla_f32(d30, d7, d13);
                d31 = vmla_f32(d31, d7, d15);
                
                src1 += 4;
                src2 += 4;
            }// end for l
            // Pairwise-add collapses each accumulator pair to the final
            // scalar per tile element; combine into 4-wide rows and store.
            d16 = vpadd_f32(d16, d17);
            d18 = vpadd_f32(d18, d19);
            d20 = vpadd_f32(d20, d21);
            d22 = vpadd_f32(d22, d23);
            d24 = vpadd_f32(d24, d25);
            d26 = vpadd_f32(d26, d27);
            d28 = vpadd_f32(d28, d29);
            d30 = vpadd_f32(d30, d31);
            vst1q_f32(dst      , vcombine_f32(d16, d18));
            vst1q_f32(dst + n  , vcombine_f32(d20, d22));
            vst1q_f32(dst + n*2, vcombine_f32(d24, d26));
            vst1q_f32(dst + n*3, vcombine_f32(d28, d30));
            
            // Rewind A to the start of its current row block; advance B to
            // the next group of four rows; move to the next output tile.
            src1 -= k;
            src2 += k*3;
            dst  += 4;
        }// end for j
        // Next block of four A rows; rewind B; skip the three dst rows
        // already written inside the j loop.
        src1 += k*4;
        src2 -= k*n;
        dst  += n*3;
    }// end for i
}