static void test_fma() {
    for (int i = 0; i < 1020 * 4; i++) {
        data_f[i] = i;
    }

    float32x4_t c0_02 = vdupq_n_f32(0.02f);
    float32x4_t c0_04 = vdupq_n_f32(0.04f);
    float32x4_t c0_05 = vdupq_n_f32(0.05f);
    float32x4_t c0_10 = vdupq_n_f32(0.1f);
    float32x4_t c0_20 = vdupq_n_f32(0.2f);
    float32x4_t c1_00 = vdupq_n_f32(1.0f);

    startTime();
    // Do ~1 billion ops: each inner iteration is 1 mul + 9 mla + 1 add over
    // 4 lanes = 80 flops (counting mla as two), so 12000 * 1000 * 80 ~= 1e9.
    for (int ct = 0; ct < (1000 * (1000 / 80)); ct++) {
        for (int i = 0; i < 1000; i++) {
            float32x4_t t;
            t = vmulq_f32(vld1q_f32((float32_t *)&data_f[i]), c0_02);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 4]), c0_04);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 8]), c0_05);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 12]), c0_10);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 16]), c0_20);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 20]), c0_20);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 24]), c0_10);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 28]), c0_05);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 32]), c0_04);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i + 36]), c0_02);
            t = vaddq_f32(t, c1_00);
            vst1q_f32((float32_t *)&data_f[i], t);
        }
    }
    endTime("neon fma", 1e9);
}
void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize)
{
  ASSERT_ALIGNED(aBlock);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  uint32_t vectorSize = aSize - dif;
  uint32_t i = 0;
  for (; i < vectorSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aBlock[i] *= aScale;
  }
}
void * scaled_sumf_thread_NEON(void * argument)
{
    jsize i = 0;
    struct scaled_sumfneon_thread_data * data =
        (struct scaled_sumfneon_thread_data *) argument;

    float32_t * r = (float32_t *)data->r;
    const float32_t * x = (const float32_t *)data->x;
    const float32_t * y = (const float32_t *)data->y;
    const float32_t a = (float32_t)data->a;
    const jsize size = data->size;

    float32x4_t rx4, xx4, yx4, ax4;
    ax4 = vdupq_n_f32(a);

    /* r[i..i+3] = x[i..i+3] + a * y[i..i+3]; size is assumed to be a
       multiple of 4. */
    for (; i < size; i += 4) {
        xx4 = vld1q_f32(&(x[i]));
        yx4 = vld1q_f32(&(y[i]));
        rx4 = vmlaq_f32(xx4, ax4, yx4);
        vst1q_f32(&(r[i]), rx4);
    }

    return NULL;
}
/**
 * @brief vector scale: A[] = alpha * B[].
 *
 * @param dst[out]      the result matrix A.
 *        src[in]       the input matrix B.
 *        alpha[in]     scale of B.
 *        elemCnt[in]   number of elements to calc.
 *
 * @return void.
 */
void neon_scale(float *dst, const float *src, const float alpha, const int elemCnt)
{
    int i;
    for (i = 0; i <= elemCnt - 16; i += 16) {
        float32x4_t q0 = vld1q_f32(src + i);
        float32x4_t q1 = vld1q_f32(src + i + 4);
        float32x4_t q2 = vld1q_f32(src + i + 8);
        float32x4_t q3 = vld1q_f32(src + i + 12);

        q0 = vmulq_n_f32(q0, alpha);
        q1 = vmulq_n_f32(q1, alpha);
        q2 = vmulq_n_f32(q2, alpha);
        q3 = vmulq_n_f32(q3, alpha);

        vst1q_f32(dst + i, q0);
        vst1q_f32(dst + i + 4, q1);
        vst1q_f32(dst + i + 8, q2);
        vst1q_f32(dst + i + 12, q3);
    }

    for (; i < elemCnt; i++) {
        dst[i] = src[i] * alpha;
    }
}
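/*
 * Editorial usage sketch (not from the original source; assumes
 * <arm_neon.h> is included and neon_scale is visible). With 100 elements,
 * the vector loop runs for i = 0, 16, ..., 80 and covers elements 0..95;
 * the last four elements fall through to the scalar tail.
 */
void neon_scale_demo(void)
{
    float src[100], dst[100];
    for (int i = 0; i < 100; i++)
        src[i] = (float)i;

    neon_scale(dst, src, 2.5f, 100);
    /* now dst[k] == 2.5f * k for every k */
}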
void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
                                         float* aOutput)
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
  }
}
static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled
                                    ? kExtendedErrorThreshold
                                    : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 =
        vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;
    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
    if (abs_ef > error_threshold) {
      abs_ef = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= abs_ef;
      ef[1][i] *= abs_ef;
    }
    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
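/*
 * Editorial sketch (not the original WebRTC code): the vand/vmvn/vorr
 * sequence above is a hand-rolled bitwise select. NEON's vbslq_f32 does
 * the same per-bit select in one step, so the conditional rescale of one
 * component could be written like this, using the same AArch64 vdivq_f32
 * as the loop above:
 */
static inline float32x4_t clamp_error_lane(float32x4_t ef_re,
                                           float32x4_t absEf,
                                           float32x4_t kThresh,
                                           float32x4_t k1e_10f)
{
  const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
  const float32x4_t absEfInv = vdivq_f32(kThresh, vaddq_f32(absEf, k1e_10f));
  /* lanes with |ef| > threshold get rescaled; the rest pass through */
  return vbslq_f32(bigger, vmulq_f32(ef_re, absEfInv), ef_re);
}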
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

// Accumulate at most this many products in f32 before flushing the partial
// sum into the f64 result, to bound the single-precision rounding error.
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row) {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        while (i + 4 <= size.width) {
            size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);

            for ( ; i <= lim; i += 4) {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
            }

            float32x2_t vres = vpadd_f32(vget_low_f32(v_sum), vget_high_f32(v_sum));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
        }

        if (i + 2 <= size.width) {
            float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
            i += 2;
        }

        for (; i < size.width; ++i)
            result += src0[i] * src1[i];
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  size_t i;
  unsigned char alpha;
  float32_t (*mat)[4] = transform->matrix;
  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;
  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];
  const float32x4_t mat0 = vld1q_f32(mat[0]);
  const float32x4_t mat1 = vld1q_f32(mat[1]);
  const float32x4_t mat2 = vld1q_f32(mat[2]);
  const float32x4_t max = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t min = vld1q_dup_f32(&zero);
  const float32x4_t scale = vld1q_dup_f32(&floatScale);
  float32x4_t vec_r, vec_g, vec_b;
  int32x4_t result;

  /* CYA */
  if (!length)
    return;

  for (i = 0; i < length; i++) {
    /* setup for transforming the pixel */
    vec_r = vld1q_dup_f32(&igtbl_r[*src++]);
    vec_g = vld1q_dup_f32(&igtbl_g[*src++]);
    vec_b = vld1q_dup_f32(&igtbl_b[*src++]);
    alpha = *src++;

    /* gamma * matrix */
    vec_r = vmulq_f32(vec_r, mat0);
    vec_g = vmulq_f32(vec_g, mat1);
    vec_b = vmulq_f32(vec_b, mat2);

    /* crunch, crunch, crunch */
    vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
    vec_r = vmaxq_f32(min, vec_r);
    vec_r = vminq_f32(max, vec_r);
    result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));

    /* use calc'd indices to output RGB values */
    *dest++ = otdata_r[vgetq_lane_s32(result, 0)];
    *dest++ = otdata_g[vgetq_lane_s32(result, 1)];
    *dest++ = otdata_b[vgetq_lane_s32(result, 2)];
    *dest++ = alpha;
  }
}
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE])
{
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL = vmovq_n_f32(aGainL);
  float32x4_t vscaleR = vmovq_n_f32(aGainR);

  if (aIsOnTheLeft) {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmulq_f32(vinR0, vscaleR);
      voutR1 = vmulq_f32(vinR1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  } else {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmulq_f32(vinL0, vscaleL);
      voutL1 = vmulq_f32(vinL1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  }
}
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals,
                      const int n, const int len, const float *istd)
{
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;
    weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t)

    for (int i = 0; i < n; i += 4) {
        int32x4_t accum0 = { 0, 0, 0, 0 };
        int32x4_t accum1 = accum0;
        int32x4_t accum2 = accum0;
        int32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 8) {
            int16x4x2_t d0 = vld2_s16(data + j);
            int16x4x2_t w0 = vld2_s16(weights);
            int16x4x2_t w1 = vld2_s16(weights + 8);
            int16x4x2_t w2 = vld2_s16(weights + 16);
            int16x4x2_t w3 = vld2_s16(weights + 24);

            accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
            accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

            accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
            accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

            accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
            accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

            accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
            accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);

            weights += 32;
        }

        int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
        int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
        int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
        sum0 = vpadd_s32(sum0, sum1);
        sum1 = vpadd_s32(sum2, sum3);
        int32x4_t sum = vcombine_s32(sum0, sum1);

        float32x4_t val = vcvtq_f32_s32(sum);
        val = vmulq_f32(val, vld1q_f32(weightsf + i * 2));
        val = vmulq_n_f32(val, istd[0]);
        val = vaddq_f32(val, vld1q_f32(weightsf + i * 2 + 4));
        vst1q_f32(vals + i, val);
    }
}
/* f32x4 mm mul: C(Row x Col) = A(Row x T) * B(T x Col), column-major.
   Row and T are assumed to be multiples of 4. */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
	int i, k, j;
	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4) {
		for (k = 0; k < Col; k += 1) {
			neon_c = vmovq_n_f32(0);

			for (j = 0; j < T; j += 4) {
				/* Column j of a column-major A starts at j * Row.
				   (The original used j * T + i here, which is only
				   correct when Row == T.) */
				int a_idx = j * Row + i;

				neon_a0 = vld1q_f32(A + a_idx);
				a_idx += Row;
				neon_a1 = vld1q_f32(A + a_idx);
				a_idx += Row;
				neon_a2 = vld1q_f32(A + a_idx);
				a_idx += Row;
				neon_a3 = vld1q_f32(A + a_idx);

				/* Column k of B (T rows) starts at k * T. */
				neon_b = vld1q_f32(B + k * T + j);
				neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
				neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
				neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
				neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

				neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
			}

			/* Store the finished 4-row chunk once, after the reduction.
			   (The original stored lane-by-lane inside the j loop, which
			   is redundant but not incorrect.) */
			vst1q_f32(C + k * Row + i, neon_c);
		}
	}
}
void t1(AlignedAddr *addr1, AlignedAddr *addr2) {
// CHECK: @t1
// CHECK: call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %{{.*}}, i32 16)
  float32x4_t a = vld1q_f32(addr1);
// CHECK: call void @llvm.arm.neon.vst1.v4f32(i8* %{{.*}}, <4 x float> %{{.*}}, i32 16)
  vst1q_f32(addr2, a);
}
/*
 * Scale FFT data by 2/|length|. |length| must be a power of two.
 */
static inline void ScaleRFFTData(OMX_F32* fftData, unsigned length) {
  float32_t* data = (float32_t*)fftData;
  float32_t scale = 2.0f / length;

  if (length >= 4) {
    /*
     * Do 4 float elements at a time because |length| is always a
     * multiple of 4 when |length| >= 4.
     *
     * TODO(rtoy): Figure out how to process 8 elements at a time
     * using intrinsics or replace this with inline assembly.
     */
    do {
      float32x4_t x = vld1q_f32(data);

      length -= 4;
      x = vmulq_n_f32(x, scale);
      vst1q_f32(data, x);
      data += 4;
    } while (length > 0);
  } else if (length == 2) {
    float32x2_t x = vld1_f32(data);

    x = vmul_n_f32(x, scale);
    vst1_f32(data, x);
  } else {
    fftData[0] *= scale;
  }
}
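/*
 * Editorial usage sketch (not from the original; assumes OMX_F32 is a
 * 32-bit float typedef, as in OpenMAX DL). An 8-point buffer is scaled
 * by 2/8 = 0.25, so each 8.0f input becomes 2.0f.
 */
static void ScaleRFFTData_demo(void) {
  OMX_F32 buf[8] = {8, 8, 8, 8, 8, 8, 8, 8};
  ScaleRFFTData(buf, 8);
  /* every element of buf is now 2.0f */
}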
static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);

      vst1q_f32(&cohde[i], vec_cohde);
      vst1q_f32(&cohxd[i], vec_cohxd);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    cohde[i] = (aec->sde[i][0] * aec->sde[i][0] +
                aec->sde[i][1] * aec->sde[i][1]) /
               (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] = (aec->sxd[i][0] * aec->sxd[i][0] +
                aec->sxd[i][1] * aec->sxd[i][1]) /
               (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}
static int PartitionDelayNEON(const AecCore* aec) {
  // Measures the energy in each filter partition and returns the partition
  // with highest energy.
  // TODO(bjornv): Spread computational cost by computing one partition per
  // block?
  float wfEnMax = 0;
  int i;
  int delay = 0;

  for (i = 0; i < aec->num_partitions; i++) {
    int j;
    int pos = i * PART_LEN1;
    float wfEn = 0;
    float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
      vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
    }
    {
      float32x2_t vec_total;
      // A B C D
      vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
      // A+B C+D
      vec_total = vpadd_f32(vec_total, vec_total);
      // A+B+C+D A+B+C+D
      wfEn = vget_lane_f32(vec_total, 0);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
              aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
    }

    if (wfEn > wfEnMax) {
      wfEnMax = wfEn;
      delay = i;
    }
  }
  return delay;
}
int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (bias_term)
    {
        const float* scale_ptr = scale_data;
        const float* bias_ptr = bias_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float s = scale_ptr[q];
            float bias = bias_ptr[q];

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            float32x4_t _bias = vdupq_n_f32(bias);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmlaq_f32(_bias, _p, _s);
                vst1q_f32(outptr, _p);

                ptr += 4;
                outptr += 4;
            }
#endif // __ARM_NEON

            for (; remain>0; remain--)
            {
                *outptr = *ptr * s + bias;

                ptr++;
                outptr++;
            }
        }
    }
    else
    {
// Window time domain data to be used by the fft.
static void WindowDataNEON(float* x_windowed, const float* x) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
    const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
    const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
    // A B C D
    float32x4_t vec_sqrtHanning_rev =
        vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
    // B A D C
    vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
    // D C B A
    vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
                                       vget_low_f32(vec_sqrtHanning_rev));
    vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
    vst1q_f32(&x_windowed[PART_LEN + i],
              vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
  }
}
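/*
 * Editorial sketch (not from the original): the reversal idiom above,
 * factored into a helper. vrev64q_f32 reverses lanes within each 64-bit
 * half, and swapping the halves with vget_high/vget_low/vcombine then
 * yields the fully reversed vector.
 */
static inline float32x4_t reverse_f32x4(float32x4_t v) {
  v = vrev64q_f32(v);  // A B C D -> B A D C
  return vcombine_f32(vget_high_f32(v), vget_low_f32(v));  // -> D C B A
}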
void compare_neon_ge(float *psrc1, float src2, uchar *pdst, int size)
{
    int remainder = size - 7;
    float32x4_t vscalar = vdupq_n_f32(src2);
    int i = 0;

    /* Process 8 floats per iteration while at least 8 remain. (The original
       named the second psrc1 load "vsrc2_32x4", which collided with the
       scalar operand; renamed here for clarity.) */
    for (; i < remainder; i += 8) {
        float32x4_t vin_lo = vld1q_f32(psrc1 + i);
        float32x4_t vin_hi = vld1q_f32(psrc1 + i + 4);

        uint32x4_t vdst1 = vcgeq_f32(vin_lo, vscalar);
        uint32x4_t vdst2 = vcgeq_f32(vin_hi, vscalar);

        /* Narrow the two 32-bit lane masks down to one 8-bit mask vector
           (0xFF where >=, 0x00 otherwise). */
        uint16x4_t vdst1_16x4 = vmovn_u32(vdst1);
        uint16x4_t vdst2_16x4 = vmovn_u32(vdst2);
        uint16x8_t vdst_16x8 = vcombine_u16(vdst1_16x4, vdst2_16x4);
        uint8x8_t vdst_8x8 = vmovn_u16(vdst_16x8);

        vst1_u8(pdst + i, vdst_8x8);
    }

    for (; i < size; i++) {
        pdst[i] = (psrc1[i] >= src2) ? 255 : 0;
    }
}
static void neon_vector_mul(const std::vector<float>& vec_a,
                            const std::vector<float>& vec_b,
                            std::vector<float>& vec_result)
{
    assert(vec_a.size() == vec_b.size());
    assert(vec_a.size() == vec_result.size());

    int i = 0;

    // neon process
    for (; i < (int)vec_result.size() - 3; i += 4)
    {
        const auto data_a = vld1q_f32(&vec_a[i]);
        const auto data_b = vld1q_f32(&vec_b[i]);
        float* dst_ptr = &vec_result[i];
        const auto data_res = vmulq_f32(data_a, data_b);
        vst1q_f32(dst_ptr, data_res);
    }

    // normal process
    for (; i < (int)vec_result.size(); i++)
    {
        vec_result[i] = vec_a[i] * vec_b[i];
    }
}
//Kernel function: saxpy
void saxpy_vector(KernelArgs* args) {

    //Setup
    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);
    const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE));
    //Uniforms
    //Fuses
    //Literals
    //Stack variables
    float32x4_t scale, x, y, result, var060, var061;

    //Loop over input
    uint64_t index;
    for(index = 0; index < args->N; index += 4) {

        //Inputs
        scale = vld1q_f32(&args->scale[index]);
        x = vld1q_f32(&args->x[index]);
        y = vld1q_f32(&args->y[index]);

        //Begin kernel logic
        {

            //>>> result = scale * x + y
            var061 = vmulq_f32(scale, x);
            var060 = vaddq_f32(var061, y);
            result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result);

        }
        //End kernel logic

        //Outputs
        vst1q_f32(&args->result[index], result);
    }
}
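/*
 * Editorial note on the generated kernel above: MASK_TRUE round-trips an
 * all-ones lane mask through f32 and back (the u32->f32->u32 conversion
 * saturates back to 0xFFFFFFFF), so the vbslq_f32 select always picks
 * var060 and the three-statement body collapses to one fused
 * multiply-add. A hand-written equivalent of the kernel logic:
 *
 *     result = vmlaq_f32(y, scale, x);
 */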
/* f32x4 add */
void mw_neon_mm_add_f32x4(float * A, int Row, int Col, float * B, float * C)
{
	float32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	/* Vector loop: i is the (exclusive) end of the current 4-element
	   chunk, so the chunk itself starts at k = i - 4. */
	for (i = 4; i <= size; i += 4) {
		k = i - 4;
		neon_a = vld1q_f32(A + k);
		neon_b = vld1q_f32(B + k);
		neon_c = vaddq_f32(neon_a, neon_b);
		vst1q_f32(C + k, neon_c);
	}

	/* Scalar tail: after the loop, i - 4 is the first unprocessed
	   element (this also holds when size < 4 and the loop never ran). */
	k = i - 4;
	for (i = 0; i < size % 4; i++) {
		C[k + i] = A[k + i] + B[k + i];
	}
}
/* f32x4 mv mul: C(Row) = A(Row x T, column-major) * B(T).
   Row and T are assumed to be multiples of 4. */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;
	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4) {
		neon_c = vmovq_n_f32(0);

		for (k = 0; k < T; k += 4) {
			/* Column k of a column-major A starts at k * Row. (The
			   original used k * T + i, which is only correct for
			   square matrices.) */
			int j = k * Row + i;

			neon_a0 = vld1q_f32(A + j);
			neon_a1 = vld1q_f32(A + j + Row);
			neon_a2 = vld1q_f32(A + j + 2 * Row);
			neon_a3 = vld1q_f32(A + j + 3 * Row);

			neon_b = vld1q_f32(B + k);
			neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
			neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
			neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
			neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

			neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
		}

		vst1q_f32(C + i, neon_c);
	}
}
/**
 * @brief vector_dot_vector.
 *
 * @param dst[out]   the output element (1*1)
 * @param src1[in]   the input vector (1*n)
 *        src2[in]   the input vector (1*n)
 *        dimN[in]   size of vector; assumed to be >= 4, since the first
 *                   four elements are loaded unconditionally.
 *
 * @return void
 */
void neon_VecdotVec(float *dst, const float *src1, const float *src2,
                    const int dimN)
{
    const float *mat0 = src1;
    const float *mat1 = src2;

    float32x4_t q0 = vld1q_f32(mat0);
    float32x4_t q1 = vld1q_f32(mat1);
    q0 = vmulq_f32(q0, q1);

    int j = 4;
    for (; j <= dimN - 4; j += 4) {
        float32x4_t q2 = vld1q_f32(mat0 + j);
        float32x4_t q3 = vld1q_f32(mat1 + j);
        q0 = vmlaq_f32(q0, q2, q3);
    }

    /* Horizontal sum of the four partial sums. (The original extracted
       the lane by type-punning a pointer; vget_lane_f32 is the idiomatic
       equivalent.) */
    float32x2_t d0 = vpadd_f32(vget_low_f32(q0), vget_high_f32(q0));
    d0 = vpadd_f32(d0, d0);
    *dst = vget_lane_f32(d0, 0);

    /* Scalar tail for the last dimN % 4 elements. */
    for (; j < dimN; j++) {
        *dst += src1[j] * src2[j];
    }
}
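/*
 * Editorial sketch (not from the original): the pairwise reduction used
 * above, factored into a reusable helper. vpadd_f32 adds adjacent lane
 * pairs, so two rounds collapse four partial sums into one.
 */
static inline float hsum_f32x4(float32x4_t v)
{
    float32x2_t d = vpadd_f32(vget_low_f32(v), vget_high_f32(v)); /* a+b, c+d */
    d = vpadd_f32(d, d);                                          /* a+b+c+d */
    return vget_lane_f32(d, 0);
}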
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y,
                 VML_FLOAT * z, VML_FLOAT * other_params)
{
    unsigned int m = n >> 2;
    unsigned int k = n & 3, j;
    unsigned int l = n & (~3);

    for (j = 0; j < m; j++) {
        v4sf src = vld1q_f32(a + 4 * j);
        v4sf tem = simd_ln4f(src);
        vst1q_f32(y + 4 * j, tem);
    }

    for (j = 0; j < k; j++) {
        y[j + l] = logf(a[j + l]);
    }
}
int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        float bias = bias_ptr[q];

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _bias = vdupq_n_f32(bias);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vaddq_f32(_p, _bias);
            vst1q_f32(ptr, _outp);

            ptr += 4;
        }
#endif // __ARM_NEON

        for (; remain>0; remain--)
        {
            *ptr = *ptr + bias;

            ptr++;
        }
    }

    return 0;
}
void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
                                  float* aOutput, uint32_t aSize)
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  aSize -= dif;

  unsigned i = 0;
  for (; i < aSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));

    vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
    vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
    vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
    vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));

    vout0 = vmlaq_f32(vout0, vin0, vscale);
    vout1 = vmlaq_f32(vout1, vin1, vscale);
    vout2 = vmlaq_f32(vout2, vin2, vscale);
    vout3 = vmlaq_f32(vout3, vin3, vscale);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aOutput[i] += aInput[i] * aScale;
  }
}
void test_square_root_v4sf ()
{
  const float32_t pool[] = {4.0f, 9.0f, 16.0f, 25.0f};
  float32x4_t val;
  float32x4_t res;

  val = vld1q_f32 (pool);
  res = vsqrtq_f32 (val);

  if (vgetq_lane_f32 (res, 0) != 2.0f)
    abort ();
  if (vgetq_lane_f32 (res, 1) != 3.0f)
    abort ();
  if (vgetq_lane_f32 (res, 2) != 4.0f)
    abort ();
  if (vgetq_lane_f32 (res, 3) != 5.0f)
    abort ();
}
int main (void)
{
  volatile float32_t minus_e, pi, ln2, sqrt2, phi;
  float32_t expected, actual;
  float32x4_t arg2;
  float32_t arr[4];

  pi = 3.14159265359;
  arr[0] = minus_e = -2.71828;
  arr[1] = ln2 = 0.69314718056;
  arr[2] = sqrt2 = 1.41421356237;
  arr[3] = phi = 1.61803398874;
  arg2 = vld1q_f32 (arr);

  expected = pi * minus_e;
  actual = vmuls_laneq_f32 (pi, arg2, 0);
  if (expected != actual)
    abort ();

  expected = pi * ln2;
  actual = vmuls_laneq_f32 (pi, arg2, 1);
  if (expected != actual)
    abort ();

  expected = pi * sqrt2;
  actual = vmuls_laneq_f32 (pi, arg2, 2);
  if (expected != actual)
    abort ();

  expected = pi * phi;
  actual = vmuls_laneq_f32 (pi, arg2, 3);
  if (expected != actual)
    abort ();

  return 0;
}
/**
 * @brief vector scale & accu: A[] = alpha * B[] + beta * A[].
 *
 * @param dst[out]      the accumulating matrix A.
 *        src[in]       the input matrix B.
 *        alpha[in]     scale of B.
 *        beta[in]      scale of A.
 *        elemCnt[in]   number of elements to calc.
 *
 * @return void.
 */
void neon_axpby(float *dst, const float *src, const float alpha,
                const float beta, const int elemCnt)
{
    int i;
    for (i = 0; i <= elemCnt - 16; i += 16) {
        float32x4_t q0 = vld1q_f32(src + i);
        float32x4_t q1 = vld1q_f32(src + i + 4);
        float32x4_t q2 = vld1q_f32(src + i + 8);
        float32x4_t q3 = vld1q_f32(src + i + 12);
        float32x4_t q4 = vld1q_f32(dst + i);
        float32x4_t q5 = vld1q_f32(dst + i + 4);
        float32x4_t q6 = vld1q_f32(dst + i + 8);
        float32x4_t q7 = vld1q_f32(dst + i + 12);

        q0 = vmulq_n_f32(q0, alpha);
        q1 = vmulq_n_f32(q1, alpha);
        q2 = vmulq_n_f32(q2, alpha);
        q3 = vmulq_n_f32(q3, alpha);

        q0 = vmlaq_n_f32(q0, q4, beta);
        q1 = vmlaq_n_f32(q1, q5, beta);
        q2 = vmlaq_n_f32(q2, q6, beta);
        q3 = vmlaq_n_f32(q3, q7, beta);

        vst1q_f32(dst + i, q0);
        vst1q_f32(dst + i + 4, q1);
        vst1q_f32(dst + i + 8, q2);
        vst1q_f32(dst + i + 12, q3);
    }

    for (; i < elemCnt; i++) {
        dst[i] = src[i] * alpha + dst[i] * beta;
    }
}
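/*
 * Editorial usage sketch (not from the original; assumes <arm_neon.h> and
 * neon_axpby are visible). With 20 elements, the first 16 take the vector
 * path and the last 4 take the scalar tail.
 */
void neon_axpby_demo(void)
{
    float src[20], dst[20];
    for (int i = 0; i < 20; i++) {
        src[i] = 1.0f;
        dst[i] = 2.0f;
    }

    neon_axpby(dst, src, 2.0f, 0.5f, 20);
    /* every dst[i] is now 2.0f * 1.0f + 0.5f * 2.0f == 3.0f */
}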
static void FilterFarNEON(int num_partitions,
                          int x_fft_buf_block_pos,
                          float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
                          float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
                          float y_fft[2][PART_LEN1]) {
  int i;
  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
      const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
      const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]);
      const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]);
      const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re);
      const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im);
      const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re);
      const float32x4_t g = vaddq_f32(y_fft_re, e);
      const float32x4_t h = vaddq_f32(y_fft_im, f);
      vst1q_f32(&y_fft[0][j], g);
      vst1q_f32(&y_fft[1][j], h);
    }

    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
    }
  }
}