Example #1
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL = vmovq_n_f32(aGainL);
  float32x4_t vscaleR = vmovq_n_f32(aGainR);

  if (aIsOnTheLeft) {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmulq_f32(vinR0, vscaleR);
      voutR1 = vmulq_f32(vinR1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  } else {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

      voutL0 = vmulq_f32(vinL0, vscaleL);
      voutL1 = vmulq_f32(vinL1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);

      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
    }
  }
}
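For reference, a minimal scalar sketch of what the two NEON branches above compute; it reuses only the names from the signature above and is not part of the original source.

static void AudioBlockPanStereoToStereo_ScalarRef(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                                  const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                                  float aGainL, float aGainR,
                                                  bool aIsOnTheLeft,
                                                  float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                                  float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; ++i) {
    if (aIsOnTheLeft) {
      aOutputL[i] = aInputL[i] + aInputR[i] * aGainL;  // the vmlaq_f32(vinL, vinR, vscaleL) path
      aOutputR[i] = aInputR[i] * aGainR;               // the vmulq_f32(vinR, vscaleR) path
    } else {
      aOutputL[i] = aInputL[i] * aGainL;
      aOutputR[i] = aInputR[i] + aInputL[i] * aGainR;
    }
  }
}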
Example #2
void
AudioBlockCopyChannelWithScale_NEON(const float aInput[WEBAUDIO_BLOCK_SIZE],
                                    const float aScale[WEBAUDIO_BLOCK_SIZE],
                                    float aOutput[WEBAUDIO_BLOCK_SIZE])
{
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aScale);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale0, vscale1, vscale2, vscale3;

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i+4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i+8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i+12));

    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i+4));
    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i+8));
    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i+12));

    vout0 = vmulq_f32(vin0, vscale0);
    vout1 = vmulq_f32(vin1, vscale1);
    vout2 = vmulq_f32(vin2, vscale2);
    vout3 = vmulq_f32(vin3, vscale3);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i+4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i+8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i+12), vout3);
  }
}
Example #3
static float32x4_t vsqrtq_f32(float32x4_t s) {
  int i;
  float32x4_t x = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, positive infinity is returned.
  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
  // check for divide by zero
  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
  // zero out the positive infinity results
  x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero),
                                      vreinterpretq_u32_f32(x)));
  // from arm documentation
  // The Newton-Raphson iteration:
  //     x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2
  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
  }
  // sqrt(s) = s * 1/sqrt(s)
  return vmulq_f32(s, x);
}
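A small, hypothetical sanity check of vsqrtq_f32() against libm's sqrtf(); the helper name and the tolerance are assumptions, not part of the original source.

#include <arm_neon.h>
#include <assert.h>
#include <math.h>

static void check_vsqrtq_f32(void) {
  const float in[4] = {0.0f, 1.0f, 2.0f, 100.0f};
  float out[4];
  vst1q_f32(out, vsqrtq_f32(vld1q_f32(in)));
  for (int i = 0; i < 4; i++) {
    // vrsqrteq_f32 plus two Newton-Raphson steps is close to, but not exactly,
    // IEEE sqrtf(), so compare with a small tolerance.
    assert(fabsf(out[i] - sqrtf(in[i])) < 1e-3f);
  }
}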
Example #4
inline float32x4_t cv_vrecpq_f32(float32x4_t val)
{
    float32x4_t reciprocal = vrecpeq_f32(val);
    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
    return reciprocal;
}
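A hypothetical division helper built on top of cv_vrecpq_f32(); the cv_vdivq_f32 name is an assumption here, not an OpenCV API. It approximates a / b as a * (1 / b).

inline float32x4_t cv_vdivq_f32(float32x4_t a, float32x4_t b)
{
    // The two vrecpsq_f32 steps above are Newton-Raphson refinements of the
    // initial vrecpeq_f32 estimate, giving roughly single-precision accuracy.
    return vmulq_f32(a, cv_vrecpq_f32(b));
}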
Example #5
void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
  ASSERT_ALIGNED(aBlock);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  uint32_t vectorSize = aSize - dif;
  uint32_t i = 0;
  for (; i < vectorSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aBlock[i] *= aScale;
  }
}
Example #6
    //-----------------------------------------------------------------------------------
    void MathlibNEON::SinCos4( ArrayReal x, ArrayReal &outSin, ArrayReal &outCos )
    {
        // TODO: Improve accuracy by mapping to the range [-pi/4, pi/4] and swapping
        // between cos & sin depending on which quadrant the angle fell in:
        // Quadrant | sin     |  cos
        // n = 0 ->  sin( x ),  cos( x )
        // n = 1 ->  cos( x ), -sin( x )
        // n = 2 -> -sin( x ), -cos( x )
        // n = 3 -> -cos( x ),  sin( x )
        // See ARGUMENT REDUCTION FOR HUGE ARGUMENTS:
        // Good to the Last Bit
        // K. C. Ng and the members of the FP group of SunPro
        // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf

        // -- Perhaps we can leave this to GSoC students? --

        // Map arbitrary angle x to the range [-pi; +pi] without using division.
        // Code taken from MSDN's HLSL trick. Architectures with fused multiply-add
        // (e.g. NEON) can replace the add, the sub, and the two muls with two mads.
        ArrayReal integralPart;
        x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF );
        x = Modf4( x, integralPart );
        x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI );

        sincos_ps( x, &outSin, &outCos );
    }
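A scalar sketch of the same range mapping, written with a floor-based fractional part; Modf4's exact behaviour is not shown in this snippet, so this illustrates the idea rather than reproducing the original code.

#include <math.h>

// Wrap an arbitrary angle into [-pi, +pi) without a division:
//   x' = 2*pi * frac(x / (2*pi) + 0.5) - pi
static inline float WrapToPi_ScalarSketch(float x)
{
    const float kTwoPi = 6.28318530718f;
    float t = x * (1.0f / kTwoPi) + 0.5f;
    t -= floorf(t);                      // fractional part in [0, 1)
    return t * kTwoPi - 3.14159265359f;
}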
Example #7
inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
{
    float32x4_t e = vrsqrteq_f32(val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
    return e;
}
Example #8
static void ScaleErrorSignalNEON(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
  const float error_threshold = extended_filter_enabled ?
      kExtendedErrorThreshold : normal_error_threshold;
  const float32x4_t k1e_10f = vdupq_n_f32(1e-10f);
  const float32x4_t kMu = vmovq_n_f32(mu);
  const float32x4_t kThresh = vmovq_n_f32(error_threshold);
  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]);
    const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]);
    const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]);
    const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f);
    float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus);
    float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus);
    const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re);
    const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im);
    const float32x4_t absEf = vsqrtq_f32(ef_sum2);
    const uint32x4_t bigger = vcgtq_f32(absEf, kThresh);
    const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f);
    const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus);
    uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv));
    uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv));
    uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger),
                                     vreinterpretq_u32_f32(ef_re));
    uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger),
                                     vreinterpretq_u32_f32(ef_im));
    ef_re_if = vandq_u32(bigger, ef_re_if);
    ef_im_if = vandq_u32(bigger, ef_im_if);
    ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if);
    ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if);
    ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu);
    ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu);
    vst1q_f32(&ef[0][i], ef_re);
    vst1q_f32(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    float abs_ef;
    ef[0][i] /= (x_pow[i] + 1e-10f);
    ef[1][i] /= (x_pow[i] + 1e-10f);
    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (abs_ef > error_threshold) {
      abs_ef = error_threshold / (abs_ef + 1e-10f);
      ef[0][i] *= abs_ef;
      ef[1][i] *= abs_ef;
    }

    // Stepsize factor
    ef[0][i] *= mu;
    ef[1][i] *= mu;
  }
}
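The vmvnq/vandq/vorrq sequence above is a branchless select: it keeps ef where absEf is at or below the threshold and substitutes the rescaled value elsewhere. The same selection could be written with vbslq_f32 (an equivalent-form sketch, not the original WebRTC code):

// Per lane: ef = bigger ? ef * absEfInv : ef
float32x4_t ef_re_sel = vbslq_f32(bigger, vmulq_f32(ef_re, absEfInv), ef_re);
float32x4_t ef_im_sel = vbslq_f32(bigger, vmulq_f32(ef_im, absEfInv), ef_im);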
Example #9
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  size_t i;
  unsigned char alpha;
  float32_t (*mat)[4] = transform->matrix;

  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;

  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];

  const float32x4_t mat0 = vld1q_f32(mat[0]);
  const float32x4_t mat1 = vld1q_f32(mat[1]);
  const float32x4_t mat2 = vld1q_f32(mat[2]);

  const float32x4_t max   = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t min   = vld1q_dup_f32(&zero);
  const float32x4_t scale = vld1q_dup_f32(&floatScale);

  float32x4_t vec_r, vec_g, vec_b;
  int32x4_t result;

  /* CYA */
  if (!length)
    return;

  for (i = 0; i < length; i++) {
    /* setup for transforming the pixel */
    vec_r = vld1q_dup_f32(&igtbl_r[*src++]);
    vec_g = vld1q_dup_f32(&igtbl_g[*src++]);
    vec_b = vld1q_dup_f32(&igtbl_b[*src++]);
    alpha = *src++;

    /* gamma * matrix */
    vec_r = vmulq_f32(vec_r, mat0);
    vec_g = vmulq_f32(vec_g, mat1);
    vec_b = vmulq_f32(vec_b, mat2);

    /* crunch, crunch, crunch */
    vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
    vec_r = vmaxq_f32(min, vec_r);
    vec_r = vminq_f32(max, vec_r);
    result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));

    /* use calc'd indices to output RGB values */
    *dest++ = otdata_r[vgetq_lane_s32(result, 0)];
    *dest++ = otdata_g[vgetq_lane_s32(result, 1)];
    *dest++ = otdata_b[vgetq_lane_s32(result, 2)];
    *dest++ = alpha;
  }
}
Example #10
static inline void neon_make_rgb(float32x4_t macropixel, float32x4_t *rgba0p,
				 float32x4_t  *rgba1p)
{
  const float32x4_t  u_coeff = {0.0, -0.34455,  1.7790, 0.0 };
  const float32x4_t  v_coeff = {1.4075, -0.7169, 0.0, 0.0 };
  float32x4_t  y0_vec, y1_vec, u_vec, v_vec,  uv_vec;
  float32x2_t y0_u, y1_v;
  const float32_t alpha = 255.0;

  
  /* macropixel is [Y0, U, Y1, V].   */

  /* since vdupq_lane_f32 will only take two element vectors we */
  /* need to pick macropixel apart to build vectors of the components.  */
  /* so make y0_u be the first half of macropixel [Y0, U] and  */
  /* y1_v be the second half [Y1, V]. */
 
  y0_u =  vget_low_f32(macropixel);
  y1_v =  vget_high_f32(macropixel);

  /* now copy Y0 to all elements of y0_vec, then overwrite element 3  */
  /* with alpha.   */
  y0_vec = vdupq_lane_f32(y0_u, 0);
  y0_vec =  vsetq_lane_f32(alpha, y0_vec, 3);

  /* make u_vec be [U, U, U, U]. we'll do that using  */
  /* vdupq_lane_f32 and selecting U (element 1) from y0_u  */
  
  u_vec  = vdupq_lane_f32(y0_u, 1);

  /* now copy Y1 to all elements of y1_vec, then overwrite element 3  */
  /* with alpha.   */
  
  y1_vec  = vdupq_lane_f32(y1_v, 0);
  y1_vec =  vsetq_lane_f32(alpha, y1_vec, 3);

  /* make v_vec be [V, V, V, V]. we'll do that using  */
  /* vdupq_lane_f32 and selecting V (element 1) from y1_v  */
  
  v_vec = vdupq_lane_f32(y1_v, 1);

  /* now multiply u_vec * u_coeff and v_vec by v_coeff.  */
  u_vec =  vmulq_f32(u_vec, u_coeff);
  v_vec =  vmulq_f32(v_vec, v_coeff);

  /* add u_vec and v_vec to form uv_vec. use that to build  */
  /*  rgba0 and  rgba1 by adding y0_vec, y1_vec*/
  
  uv_vec = vaddq_f32(u_vec, v_vec);
  *rgba0p =  vaddq_f32(y0_vec, uv_vec);
  *rgba1p =  vaddq_f32(y1_vec, uv_vec);
  
  
}
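Reading the coefficient vectors back, the per-pixel math is (an interpretation of the code above; it assumes U and V already have their chroma bias removed, since no offset is applied here):

    R = Y + 1.4075 * V
    G = Y - 0.34455 * U - 0.7169 * V
    B = Y + 1.7790 * U
    A = 255

so *rgba0p and *rgba1p come out in [R, G, B, A] lane order.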
Example #11
    //-----------------------------------------------------------------------------------
    ArrayReal MathlibNEON::Cos4( ArrayReal x )
    {
        // Map arbitrary angle x to the range [-pi; +pi] without using division.
        // Code taken from MSDN's HLSL trick. Architectures with fused multiply-add
        // (e.g. NEON) can replace the add, the sub, and the two muls with two mads.
        ArrayReal integralPart;
        x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF );
        x = Modf4( x, integralPart );
        x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI );

        return cos_ps( x );
    }
Example #12
File: mw_neon.c  Project: Daniel-Hwang/eeg
/* f32x4 mm mul */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
	int i, k, j;

	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_f32(0);

			for (j = 0; j < T; j+=4)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				neon_a0 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_f32(A + j_T);

				neon_b = vld1q_f32(B + k_Row + j);
				neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
				neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
				neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
				neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

				neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);

				vst1q_lane_f32(C + k_Row + i, neon_c, 0);
				vst1q_lane_f32(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_f32(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_f32(C + k_Row + i + 3, neon_c, 3);

			}
		}
	}
}
Example #13
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
  int i;
  float32x4_t x = vrecpeq_f32(b);
  // from arm documentation
  // The Newton-Raphson iteration:
  //     x[n+1] = x[n] * (2 - d * x[n])
  // converges to (1/d) if x0 is the result of VRECPE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (i = 0; i < 2; i++) {
    x = vmulq_f32(vrecpsq_f32(b, x), x);
  }
  // a/b = a*(1/b)
  return vmulq_f32(a, x);
}
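A small, hypothetical spot check of vdivq_f32() against scalar division; the helper name and the tolerance are assumptions, not part of the original source.

#include <arm_neon.h>
#include <assert.h>
#include <math.h>

static void check_vdivq_f32(void) {
  const float a[4] = {1.0f, 2.0f, -3.0f, 10.0f};
  const float b[4] = {3.0f, 7.0f, 0.5f, -4.0f};
  float out[4];
  vst1q_f32(out, vdivq_f32(vld1q_f32(a), vld1q_f32(b)));
  for (int i = 0; i < 4; i++) {
    assert(fabsf(out[i] - a[i] / b[i]) < 1e-4f);
  }
}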
Example #14
static void cft1st_128_neon(float* a) {
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  int j, k2;

  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
    float32x4_t a00v = vld1q_f32(&a[j + 0]);
    float32x4_t a04v = vld1q_f32(&a[j + 4]);
    float32x4_t a08v = vld1q_f32(&a[j + 8]);
    float32x4_t a12v = vld1q_f32(&a[j + 12]);
    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
    float32x4_t x0v = vaddq_f32(a01v, a23v);
    const float32x4_t x1v = vsubq_f32(a01v, a23v);
    const float32x4_t x2v = vaddq_f32(a45v, a67v);
    const float32x4_t x3v = vsubq_f32(a45v, a67v);
    const float32x4_t x3w = vrev64q_f32(x3v);
    float32x4_t x0w;
    a01v = vaddq_f32(x0v, x2v);
    x0v = vsubq_f32(x0v, x2v);
    x0w = vrev64q_f32(x0v);
    a45v = vmulq_f32(wk2rv, x0v);
    a45v = vmlaq_f32(a45v, wk2iv, x0w);
    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a23v = vmulq_f32(wk1rv, x0v);
    a23v = vmlaq_f32(a23v, wk1iv, x0w);
    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
    x0w = vrev64q_f32(x0v);
    a67v = vmulq_f32(wk3rv, x0v);
    a67v = vmlaq_f32(a67v, wk3iv, x0w);
    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
    vst1q_f32(&a[j + 0], a00v);
    vst1q_f32(&a[j + 4], a04v);
    vst1q_f32(&a[j + 8], a08v);
    vst1q_f32(&a[j + 12], a12v);
  }
}
Example #15
static void test_fma() {
    for(int i=0; i<1020 * 4; i++) {
        data_f[i] = i;
    }
    float32x4_t c0_02 = vdupq_n_f32(0.02f);
    float32x4_t c0_04 = vdupq_n_f32(0.04f);
    float32x4_t c0_05 = vdupq_n_f32(0.05f);
    float32x4_t c0_10 = vdupq_n_f32(0.1f);
    float32x4_t c0_20 = vdupq_n_f32(0.2f);
    float32x4_t c1_00 = vdupq_n_f32(1.0f);

    startTime();

    // Do ~1 billion ops
    for (int ct=0; ct < (1000 * (1000 / 80)); ct++) {
        for (int i=0; i < 1000; i++) {
            float32x4_t t;
            t = vmulq_f32(vld1q_f32((float32_t *)&data_f[i]), c0_02);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+4]), c0_04);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+8]), c0_05);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+12]), c0_10);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+16]), c0_20);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+20]), c0_20);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+24]), c0_10);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+28]), c0_05);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+32]), c0_04);
            t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+36]), c0_02);
            t = vaddq_f32(t, c1_00);
            vst1q_f32((float32_t *)&data_f[i], t);
        }
    }

    endTime("neon fma", 1e9);
}
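The iteration counts appear to be chosen so that the total work is about one billion floating-point operations (an interpretation, not stated in the source): each inner iteration issues 1 vmulq_f32, 9 vmlaq_f32 and 1 vaddq_f32 over 4 lanes, i.e. 4 + 9*8 + 4 = 80 flop, and 1000 * (1000 / 80) = 12,000 outer iterations x 1,000 inner iterations x 80 flop is roughly 0.96e9, matching the 1e9 passed to endTime().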
Example #16
        template <bool align> SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum)
        {
            assert(size >= F);
            if (align)
                assert(Aligned(a) && Aligned(b));

            size_t partialAlignedSize = AlignLo(size, F);
            size_t fullAlignedSize = AlignLo(size, DF);
            size_t i = 0;
            float32x4_t sums[2] = { vdupq_n_f32(0), vdupq_n_f32(0) };
            if (fullAlignedSize)
            {
                for (; i < fullAlignedSize; i += DF)
                {
                    SquaredDifferenceSum16f<align>(a, b, i + F * 0, sums[0]);
                    SquaredDifferenceSum16f<align>(a, b, i + F * 1, sums[1]);
                }
                sums[0] = vaddq_f32(sums[0], sums[1]);
            }
            for (; i < partialAlignedSize; i += F)
                SquaredDifferenceSum16f<align>(a, b, i, sums[0]);
            if (partialAlignedSize != size)
            {
                float32x4_t tailMask = RightNotZero(size - partialAlignedSize);
                float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf<align>(a + size - F));
                float32x4_t _b = vcvt_f32_f16((float16x4_t)LoadHalf<align>(b + size - F));
                float32x4_t _d = And(vsubq_f32(_a, _b), tailMask);
                sums[0] = vaddq_f32(sums[0], vmulq_f32(_d, _d));
            }
            *sum = ExtractSum32f(sums[0]);
        }
Example #17
static void FilterFarNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float y_fft[2][PART_LEN1]) {
  int i;
  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
      const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
      const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]);
      const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]);
      const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re);
      const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im);
      const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re);
      const float32x4_t g = vaddq_f32(y_fft_re, e);
      const float32x4_t h = vaddq_f32(y_fft_im, f);
      vst1q_f32(&y_fft[0][j], g);
      vst1q_f32(&y_fft[1][j], h);
    }
    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j],
                           x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j],
                           h_fft_buf[1][pos + j]);
      y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j],
                           x_fft_buf[1][xPos + j],
                           h_fft_buf[0][pos + j],
                           h_fft_buf[1][pos + j]);
    }
  }
}
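MulRe() and MulIm() are not shown in this snippet. Judging from the vectorized path above (real part via vmlsq_f32, imaginary part via vmlaq_f32), they presumably implement a complex multiply along these lines (a hypothetical sketch, not copied from the WebRTC source):

static float MulRe(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bRe - aIm * bIm;  // matches vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im)
}

static float MulIm(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bIm + aIm * bRe;  // matches vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re)
}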
Example #18
void test_vmulQf32 (void)
{
  float32x4_t out_float32x4_t;
  float32x4_t arg0_float32x4_t;
  float32x4_t arg1_float32x4_t;

  out_float32x4_t = vmulq_f32 (arg0_float32x4_t, arg1_float32x4_t);
}
Example #19
static void SubbandCoherenceNEON(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const float32x4_t vec_1eminus10 =  vdupq_n_f32(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);

      vst1q_f32(&cohde[i], vec_cohde);
      vst1q_f32(&cohxd[i], vec_cohxd);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    cohde[i] =
        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
        (aec->sd[i] * aec->se[i] + 1e-10f);
    cohxd[i] =
        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
        (aec->sx[i] * aec->sd[i] + 1e-10f);
  }
}
Example #20
/* Performs one rotation/translation */
static void 
neon_coord_4(
    float32x4_t a_4, 
    float32x4_t b_4,
    float32x4_t x_4, 
    float32x4_t y_4,
    float32x4_t pos_4f, 
    float32x4_t point5_4, 
    int * result)
{
    float32x4_t tmp1 = vmulq_f32(a_4, x_4);
    float32x4_t tmp2 = vmulq_f32(b_4, y_4);
    tmp2 = vaddq_f32(tmp1, tmp2);
    tmp2 = vaddq_f32(tmp2, pos_4f);
    tmp2 = vaddq_f32(tmp2, point5_4);
    int32x4_t c_4 = vcvtq_s32_f32(tmp2);
    vst1q_s32(result, c_4);
}
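Per lane this computes result[i] = (int)(a*x + b*y + pos + 0.5f); since vcvtq_s32_f32 truncates toward zero, adding 0.5 first gives round-to-nearest behaviour for non-negative results (a reading of the code above, not a comment from the original source).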
Example #21
// Window time domain data to be used by the fft.
static void WindowDataNEON(float* x_windowed, const float* x) {
  int i;
  for (i = 0; i < PART_LEN; i += 4) {
    const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
    const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
    const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
    // A B C D
    float32x4_t vec_sqrtHanning_rev =
        vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
    // B A D C
    vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
    // D C B A
    vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
                                       vget_low_f32(vec_sqrtHanning_rev));
    vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
    vst1q_f32(&x_windowed[PART_LEN + i],
            vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
  }
}
Example #22
 template <bool align> SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t offset, float32x4_t & sum, float32x4_t & correction)
 {
     float32x4_t _a = Load<align>(a + offset);
     float32x4_t _b = Load<align>(b + offset);
     float32x4_t _d = vsubq_f32(_a, _b);
     // Standard Kahan step: fold the running correction into the new term, then
     // recover the low-order bits lost in the addition.
     float32x4_t term = vsubq_f32(vmulq_f32(_d, _d), correction);
     float32x4_t temp = vaddq_f32(sum, term);
     correction = vsubq_f32(vsubq_f32(temp, sum), term);
     sum = temp;
 }
Example #23
File: SimdNeonHog.cpp  Project: 4144/Simd
        template <bool align> SIMD_INLINE void HogDirectionHistograms(const float32x4_t & dx, const float32x4_t & dy, Buffer & buffer, size_t col)
        {
            float32x4_t bestDot = vdupq_n_f32(0);
            int32x4_t bestIndex = vdupq_n_s32(0);
            for(int i = 0; i < buffer.size; ++i)
            {
                float32x4_t dot = vaddq_f32(vmulq_f32(dx, buffer.cos[i]), vmulq_f32(dy, buffer.sin[i]));
                uint32x4_t mask = vcgtq_f32(dot, bestDot);
                bestDot = vmaxq_f32(dot, bestDot);
                bestIndex = vbslq_s32(mask, buffer.pos[i], bestIndex);

                dot = vnegq_f32(dot);
                mask = vcgtq_f32(dot, bestDot);
                bestDot = vmaxq_f32(dot, bestDot);
                bestIndex = vbslq_s32(mask, buffer.neg[i], bestIndex);
            }
            Store<align>(buffer.index + col, bestIndex);
            Store<align>(buffer.value + col, Sqrt<SIMD_NEON_RCP_ITER>(vaddq_f32(vmulq_f32(dx, dx), vmulq_f32(dy, dy))));
        }
Example #24
static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) {
  int i;
  const int num_partitions = aec->num_partitions;
  for (i = 0; i < num_partitions; i++) {
    int j;
    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + aec->xfBufBlockPos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // vectorized code (four at once)
    for (j = 0; j + 3 < PART_LEN1; j += 4) {
      const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
      const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
      const float32x4_t wfBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
      const float32x4_t wfBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
      const float32x4_t yf_re = vld1q_f32(&yf[0][j]);
      const float32x4_t yf_im = vld1q_f32(&yf[1][j]);
      const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re);
      const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im);
      const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im);
      const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re);
      const float32x4_t g = vaddq_f32(yf_re, e);
      const float32x4_t h = vaddq_f32(yf_im, f);
      vst1q_f32(&yf[0][j], g);
      vst1q_f32(&yf[1][j], h);
    }
    // scalar code for the remaining items.
    for (; j < PART_LEN1; j++) {
      yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
                        aec->xfBuf[1][xPos + j],
                        aec->wfBuf[0][pos + j],
                        aec->wfBuf[1][pos + j]);
      yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
                        aec->xfBuf[1][xPos + j],
                        aec->wfBuf[0][pos + j],
                        aec->wfBuf[1][pos + j]);
    }
  }
}
Example #25
void dotProd_neon(const float *data, const float *weights, float *vals, const int n, const int len, const float *istd) {
    for (int i = 0; i < n; i += 4) {
        float32x4_t accum0 = { 0.0f, 0.0f, 0.0f, 0.0f };
        float32x4_t accum1 = accum0;
        float32x4_t accum2 = accum0;
        float32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 4) {
            float32x4_t d0 = vld1q_f32(data + j);
            float32x4_t d1 = d0;
            float32x4_t d2 = d0;
            float32x4_t d3 = d0;

            float32x4_t w0 = vld1q_f32(weights);
            float32x4_t w1 = vld1q_f32(weights + 4);
            float32x4_t w2 = vld1q_f32(weights + 8);
            float32x4_t w3 = vld1q_f32(weights + 12);

            accum0 = vaddq_f32(accum0, vmulq_f32(d0, w0));
            accum1 = vaddq_f32(accum1, vmulq_f32(d1, w1));
            accum2 = vaddq_f32(accum2, vmulq_f32(d2, w2));
            accum3 = vaddq_f32(accum3, vmulq_f32(d3, w3));

            weights += 16;
        }

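        // Horizontal reduction: each vpadd_f32 adds adjacent lane pairs, so two
        // rounds collapse the four 4-lane accumulators into the four scalars
        // [sum(accum0), sum(accum1), sum(accum2), sum(accum3)] after vcombine_f32.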
        float32x2_t sum0 = vpadd_f32(vget_low_f32(accum0), vget_high_f32(accum0));
        float32x2_t sum1 = vpadd_f32(vget_low_f32(accum1), vget_high_f32(accum1));
        float32x2_t sum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        float32x2_t sum3 = vpadd_f32(vget_low_f32(accum3), vget_high_f32(accum3));
        sum0 = vpadd_f32(sum0, sum1);
        sum1 = vpadd_f32(sum2, sum3);
        float32x4_t sum = vcombine_f32(sum0, sum1);
        
        sum = vmulq_n_f32(sum, istd[0]);
        sum = vaddq_f32(sum, vld1q_f32(weights + n*len + i));
        vst1q_f32(vals + i, sum);
    }
}
Example #26
File: mw_neon.c  Project: Daniel-Hwang/eeg
/* f32x4 mv mul */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;

	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_f32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_f32(A + j);
			neon_a1 = vld1q_f32(A + j + Row);
			neon_a2 = vld1q_f32(A + j + 2 * Row);
			neon_a3 = vld1q_f32(A + j + 3 * Row);

			neon_b = vld1q_f32(B + k);
			neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
			neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
			neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
			neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

			neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);

		}

		vst1q_f32(C + i, neon_c);
	}
}
Example #27
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) {
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;
    weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t)

    for (int i = 0; i < n; i += 4) {
        int32x4_t accum0 = { 0, 0, 0, 0 };
        int32x4_t accum1 = accum0;
        int32x4_t accum2 = accum0;
        int32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 8) {
            int16x4x2_t d0 = vld2_s16(data + j);

            int16x4x2_t w0 = vld2_s16(weights);
            int16x4x2_t w1 = vld2_s16(weights + 8);
            int16x4x2_t w2 = vld2_s16(weights + 16);
            int16x4x2_t w3 = vld2_s16(weights + 24);

            accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
            accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

            accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
            accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

            accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
            accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

            accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
            accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);

            weights += 32;
        }

        int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
        int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
        int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
        sum0 = vpadd_s32(sum0, sum1);
        sum1 = vpadd_s32(sum2, sum3);
        int32x4_t sum = vcombine_s32(sum0, sum1);

        float32x4_t val = vcvtq_f32_s32(sum);
        val = vmulq_f32(val, vld1q_f32(weightsf + i*2));
        val = vmulq_n_f32(val, istd[0]);
        val = vaddq_f32(val, vld1q_f32(weightsf + i*2 + 4));
        vst1q_f32(vals + i, val);
    }
}
Example #28
static void neon_vector_mul(const std::vector<float>& vec_a, const std::vector<float>& vec_b, std::vector<float>& vec_result)
{
	assert(vec_a.size() == vec_b.size());
	assert(vec_a.size() == vec_result.size());
	int i = 0;
	//neon process
	for (; i < (int)vec_result.size() - 3 ; i+=4)
	{
		const auto data_a = vld1q_f32(&vec_a[i]);
		const auto data_b = vld1q_f32(&vec_b[i]);
		float* dst_ptr = &vec_result[i];
		const auto data_res = vmulq_f32(data_a, data_b);
		vst1q_f32(dst_ptr, data_res);
	}
	//normal process
	for (; i < (int)vec_result.size(); i++)
	{
		vec_result[i] = vec_a[i] * vec_b[i];
	}
}
Example #29
//Kernel function: saxpy
void saxpy_vector(KernelArgs* args) {

    //Setup
    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);
    const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE));
    
    //Uniforms
    
    //Fuses
    
    //Literals
    
    //Stack variables
    float32x4_t scale, x, y, result, var060, var061;
    
    //Loop over input
    uint64_t index;
    for(index = 0; index < args->N; index += 4) {
    
        //Inputs
        scale = vld1q_f32(&args->scale[index]);
        x = vld1q_f32(&args->x[index]);
        y = vld1q_f32(&args->y[index]);
        
        //Begin kernel logic
        {
        
            //>>> result = scale * x + y
            var061 = vmulq_f32(scale, x);
            var060 = vaddq_f32(var061, y);
            result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result);
        
        }
        //End kernel logic
        
        //Outputs
        vst1q_f32(&args->result[index], result);
        
    }
}
Example #30
/**
 * @brief   vector_dot_vector.
 *
 * @param   dst[out]     the output element(1*1)
 * @param   src1[in]     the input  vector(1*n)
 *          src2[in]     the input  vector(1*n)
 *          dimN[in]     size of vector
 *
 * @return  void
 */
void neon_VecdotVec(float *dst,
                    const float *src1,
                    const float *src2,
                    const int dimN)
{
    float *mat0 = (float *)src1;
    float *mat1 = (float *)src2;
    float32x4_t q0 = vld1q_f32(mat0);
    float32x4_t q1 = vld1q_f32(mat1);
    q0 = vmulq_f32(q0, q1);
    int j = 4;
    for (; j <= dimN - 4; j += 4)
    {
        float32x4_t q2 = vld1q_f32(mat0 + j);
        float32x4_t q3 = vld1q_f32(mat1 + j);
        q0 = vmlaq_f32(q0, q2, q3);
    }
    float32x2_t d0 = vpadd_f32(vget_low_f32(q0), vget_high_f32(q0));
    d0 = vpadd_f32(d0, d0);
    *dst = vget_lane_f32(d0, 0);
    for (; j < dimN; j++) {
        *dst += src1[j] * src2[j];
    }
}
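A hypothetical usage example (illustration only, not from the source); the first four elements are loaded unconditionally, so dimN is assumed to be at least 4.

float a[6] = {1, 2, 3, 4, 5, 6};
float b[6] = {1, 1, 1, 1, 1, 1};
float dot;
neon_VecdotVec(&dot, a, b, 6);   /* dot == 1+2+3+4+5+6 == 21.0f */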