Example #1
 void SubVectorsSIMD(float* c, const float* a, const float* b, std::size_t n) {
     std::size_t i = 0;
     for (; i < ROUND_DOWN(n, 4); i += 4) {
         __m128 ma = _mm_loadu_ps(a + i);
         __m128 mb = _mm_loadu_ps(b + i);
         __m128 mc = _mm_sub_ps(ma, mb);
         _mm_storeu_ps(c + i, mc);
     }
     for (; i < n; i++) {
         c[i] = a[i] - b[i];
     }
 }
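ROUND_DOWN is not defined in the snippet; for the loop to advance in groups of four it presumably rounds n down to a multiple of the step, e.g. (a sketch, assuming the step is a power of two):

#define ROUND_DOWN(x, s) ((x) & ~((s) - 1))  // round x down to a multiple of s (s must be a power of two)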
Example #2
static void blendMatrices(MMatrix4x4 * matrix, const MMatrix4x4 * skinMatrix, const float weight)
{
	__m128 w = _mm_set1_ps(weight);
				
	for(int i=0; i<16; i+=4)
	{
		__m128 a = _mm_loadu_ps(matrix->entries + i);
		__m128 b = _mm_loadu_ps(skinMatrix->entries + i);
		__m128 c = _mm_mul_ps(b, w);
		__m128 d = _mm_add_ps(a, c);
		_mm_storeu_ps(matrix->entries + i, d);
	}
}
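Since each lane computes matrix->entries += skinMatrix->entries * weight, a typical caller presumably clears the destination and then adds one weighted skin matrix per influencing bone. A hedged usage sketch (boneCount, boneMatrices and boneWeights are hypothetical names):

MMatrix4x4 blended;
for (int e = 0; e < 16; ++e)
	blended.entries[e] = 0.0f;               // start from the zero matrix
for (int b = 0; b < boneCount; ++b)
	blendMatrices(&blended, &boneMatrices[b], boneWeights[b]);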
Example #3
static inline void
transpose_sse (gfloat *src, gfloat *dst, const int width, const int height)
{
    __m128 row1 = _mm_loadu_ps (src);
    __m128 row2 = _mm_loadu_ps (src + height);
    __m128 row3 = _mm_loadu_ps (src + 2 * height);
    __m128 row4 = _mm_loadu_ps (src + 3 * height);
    _MM_TRANSPOSE4_PS (row1, row2, row3, row4);
    _mm_storeu_ps (dst, row1);
    _mm_storeu_ps (dst + width, row2);
    _mm_storeu_ps (dst + 2 * width, row3);
    _mm_storeu_ps (dst + 3 * width, row4);
}
Example #4
static void process_sinc(rarch_resampler_t *resamp, float *out_buffer)
{
   __m128 sum_l = _mm_setzero_ps();
   __m128 sum_r = _mm_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned phase = resamp->time >> PHASES_SHIFT;
   unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
   __m128 delta_f = _mm_set1_ps(delta);

   const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
   const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];

   for (unsigned i = 0; i < TAPS; i += 4)
   {
      __m128 buf_l  = _mm_loadu_ps(buffer_l + i);
      __m128 buf_r  = _mm_loadu_ps(buffer_r + i);

      __m128 phases = _mm_load_ps(phase_table + i);
      __m128 deltas = _mm_load_ps(delta_table + i);

      __m128 sinc   = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));

      sum_l         = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
      sum_r         = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
   }

   // Them annoying shuffles :V
   // sum_l = { l3, l2, l1, l0 }
   // sum_r = { r3, r2, r1, r0 }

   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
         _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

   // sum   = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
   // sum   = { R1, R0, L1, L0 }

   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

   // sum   = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
   // sum   = { X,  R,  X,  L } 

   // Store L
   _mm_store_ss(out_buffer + 0, sum);

   // movehl { X, R, X, L } == { X, R, X, R }
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
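On SSE3-capable targets the final left/right reduction could also be written with horizontal adds instead of the shuffle pairs; a sketch (requires pmmintrin.h, not part of the original code):

__m128 sum = _mm_hadd_ps(sum_l, sum_r);   // { l0+l1, l2+l3, r0+r1, r2+r3 }
sum = _mm_hadd_ps(sum, sum);              // { L, R, L, R }
_mm_store_ss(out_buffer + 0, sum);                                               // L
_mm_store_ss(out_buffer + 1, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1))); // R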
Example #5
static void spline_n_4(int i, float t, float *knot, float *splineVal) {
	knot += i + 1;

#ifdef _M_SSE
	const __m128 knot012 = _mm_loadu_ps(&knot[0]);
	const __m128 knot345 = _mm_loadu_ps(&knot[3]);
	const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
	const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012));

	const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0));
	const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1));
	const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1));
	const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122));

	// It's still faster to use SSE, even with this.
	float MEMORY_ALIGNED16(ff30_41_52[4]);
	float MEMORY_ALIGNED16(ff31_42_32[4]);
	_mm_store_ps(ff30_41_52, f30_41_52);
	_mm_store_ps(ff31_42_32, f31_42_32);

	const float &f30 = ff30_41_52[0];
	const float &f41 = ff30_41_52[1];
	const float &f52 = ff30_41_52[2];
	const float &f31 = ff31_42_32[0];
	const float &f42 = ff31_42_32[1];
	const float &f32 = ff31_42_32[2];
#else
	// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
	float t0 = (t - knot[0]);
	float t1 = (t - knot[1]);
	float t2 = (t - knot[2]);
	// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
	float f30 = t0/(knot[3]-knot[0]);
	float f41 = t1/(knot[4]-knot[1]);
	float f52 = t2/(knot[5]-knot[2]);
	float f31 = t1/(knot[3]-knot[1]);
	float f42 = t2/(knot[4]-knot[2]);
	float f32 = t2/(knot[3]-knot[2]);
#endif

	float a = (1-f30)*(1-f31);
	float b = (f31*f41);
	float c = (1-f41)*(1-f42);
	float d = (f42*f52);

	splineVal[0] = a-(a*f32);
	splineVal[1] = 1-a-b+((a+b+c-1)*f32);
	splineVal[2] = b+((1-b-c-d)*f32);
	splineVal[3] = d*f32;
}
Example #6
void audio_mix_volume_SSE2(float *out, const float *in, float vol, size_t samples)
{
   size_t i;
   __m128 volume = _mm_set1_ps(vol);

   for (i = 0; i + 16 <= samples; i += 16, out += 16, in += 16)
   {
      unsigned j;
      __m128 input[4];
      __m128 additive[4];
      
      input[0]    = _mm_loadu_ps(out +  0);
      input[1]    = _mm_loadu_ps(out +  4);
      input[2]    = _mm_loadu_ps(out +  8);
      input[3]    = _mm_loadu_ps(out + 12);

      additive[0] = _mm_mul_ps(volume, _mm_loadu_ps(in +  0));
      additive[1] = _mm_mul_ps(volume, _mm_loadu_ps(in +  4));
      additive[2] = _mm_mul_ps(volume, _mm_loadu_ps(in +  8));
      additive[3] = _mm_mul_ps(volume, _mm_loadu_ps(in + 12));

      for (j = 0; j < 4; j++)
         _mm_storeu_ps(out + 4 * j, _mm_add_ps(input[j], additive[j]));
   }

   audio_mix_volume_C(out, in, vol, samples - i);
}
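The tail samples are handed to a scalar helper that is not shown; it presumably applies the same volume-scaled mix one sample at a time, along these lines (a sketch, not the actual implementation):

static void audio_mix_volume_C(float *out, const float *in, float vol, size_t samples)
{
   size_t i;
   for (i = 0; i < samples; i++)
      out[i] += in[i] * vol;
}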
Example #7
void sse_rgb2gray(float* ra, float* ga, float* ba, float* gray) {
    __m128 c1 =  _mm_set1_ps(0.3f);
    __m128 c2 =  _mm_set1_ps(0.59f);
    __m128 c3 =  _mm_set1_ps(0.11f);

    for(int i = 0; i < N; i+=4) {
        __m128 a = _mm_loadu_ps(ra+i);
        __m128 b = _mm_loadu_ps(ga+i);
        __m128 c = _mm_loadu_ps(ba+i);
        __m128 ab = _mm_add_ps(_mm_mul_ps(c1, a), _mm_mul_ps(c2, b));
        __m128 out = _mm_add_ps(ab, _mm_mul_ps(c3, c));
        _mm_storeu_ps(gray+i, out);
    }
}
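N is an external constant assumed to be a multiple of 4. For reference, the scalar form of the same 0.3/0.59/0.11 weighting (a sketch, not from the original source):

void rgb2gray_scalar(const float* ra, const float* ga, const float* ba, float* gray, int n) {
    for (int i = 0; i < n; i++)
        gray[i] = 0.3f * ra[i] + 0.59f * ga[i] + 0.11f * ba[i];
}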
Example #8
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 12;
	__m128 sum0, sum1, sum2;

	(void) lag;
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
}
Example #9
    // inline matrix multiplication is much faster than function calling,
    // we should only move it if we need size
    eMatrix4x4 & operator *= (const eMatrix4x4 &mtx)
    {
#ifdef eUSE_SSE

	    const __m128 in10 = _mm_loadu_ps(&mtx.m11);
	    const __m128 in11 = _mm_loadu_ps(&mtx.m21);
	    const __m128 in12 = _mm_loadu_ps(&mtx.m31);
	    const __m128 in13 = _mm_loadu_ps(&mtx.m41);

	    for (eU32 i=0; i<16; i+=4)
        {
		    const __m128 in2 = _mm_loadu_ps(&m[i]);

		    const __m128 e0 = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
		    const __m128 e1 = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
		    const __m128 e2 = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
		    const __m128 e3 = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));

		    const __m128 m0 = _mm_mul_ps(in10, e0);
		    const __m128 m1 = _mm_mul_ps(in11, e1);
		    const __m128 m2 = _mm_mul_ps(in12, e2);
		    const __m128 m3 = _mm_mul_ps(in13, e3);

		    const __m128 a0 = _mm_add_ps(m0, m1);
		    const __m128 a1 = _mm_add_ps(m2, m3);
		    const __m128 a2 = _mm_add_ps(a0, a1);

		    _mm_storeu_ps(&this->m[i], a2);
	    }
#else
        *this = eMatrix4x4(m11*mtx.m11+m12*mtx.m21+m13*mtx.m31+m14*mtx.m41,
                          m11*mtx.m12+m12*mtx.m22+m13*mtx.m32+m14*mtx.m42,
                          m11*mtx.m13+m12*mtx.m23+m13*mtx.m33+m14*mtx.m43,
                          m11*mtx.m14+m12*mtx.m24+m13*mtx.m34+m14*mtx.m44,
                          m21*mtx.m11+m22*mtx.m21+m23*mtx.m31+m24*mtx.m41,
                          m21*mtx.m12+m22*mtx.m22+m23*mtx.m32+m24*mtx.m42,
                          m21*mtx.m13+m22*mtx.m23+m23*mtx.m33+m24*mtx.m43,
                          m21*mtx.m14+m22*mtx.m24+m23*mtx.m34+m24*mtx.m44,
                          m31*mtx.m11+m32*mtx.m21+m33*mtx.m31+m34*mtx.m41,
                          m31*mtx.m12+m32*mtx.m22+m33*mtx.m32+m34*mtx.m42,
                          m31*mtx.m13+m32*mtx.m23+m33*mtx.m33+m34*mtx.m43,
                          m31*mtx.m14+m32*mtx.m24+m33*mtx.m34+m34*mtx.m44,
                          m41*mtx.m11+m42*mtx.m21+m43*mtx.m31+m44*mtx.m41,
                          m41*mtx.m12+m42*mtx.m22+m43*mtx.m32+m44*mtx.m42,
                          m41*mtx.m13+m42*mtx.m23+m43*mtx.m33+m44*mtx.m43,
                          m41*mtx.m14+m42*mtx.m24+m43*mtx.m34+m44*mtx.m44);
#endif
        return *this;
    }
Example #10
__attribute__((noinline)) float dot128fma(float *x1, float *x2, size_t len) {
  assert(len % 4 == 0);
  __m128 sum = _mm_setzero_ps();
  if (len > 3) {
    size_t limit = len - 3;
    for (size_t i = 0; i < limit; i += 4) {
      __m128 v1 = _mm_loadu_ps(x1 + i);
      __m128 v2 = _mm_loadu_ps(x2 + i);
      sum = _mm_fmadd_ps(v1, v2, sum);
    }
  }
  float buffer[4];
  _mm_storeu_ps(buffer, sum);
  return buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
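_mm_fmadd_ps needs FMA support (immintrin.h, built with FMA enabled). On plain SSE hardware the fused step can be split into a multiply plus an add; a sketch under the same len % 4 == 0 assumption:

__attribute__((noinline)) float dot128(float *x1, float *x2, size_t len) {
  assert(len % 4 == 0);
  __m128 sum = _mm_setzero_ps();
  for (size_t i = 0; i + 4 <= len; i += 4) {
    __m128 v1 = _mm_loadu_ps(x1 + i);
    __m128 v2 = _mm_loadu_ps(x2 + i);
    sum = _mm_add_ps(sum, _mm_mul_ps(v1, v2));  // mul + add instead of a fused multiply-add
  }
  float buffer[4];
  _mm_storeu_ps(buffer, sum);
  return buffer[0] + buffer[1] + buffer[2] + buffer[3];
}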
Example #11
static inline long
conv_yF_yHalf (const float *src, uint16_t *dst, long samples)
{
  const __v4sf *s_vec;
  uint64_t     *d_vec;

  long n = samples;

  s_vec = (const __v4sf *)src;
  d_vec = (uint64_t *)dst;

  while (n >= 4)
    {
      __m128 in_val = _mm_loadu_ps((float *)s_vec++);
      __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      _mm_storel_epi64((__m128i *)d_vec++, out_val);
      n -= 4;
    }

  src = (const float *)s_vec;
  dst = (uint16_t *)d_vec;

  while (n)
    {
      __m128 in_val = _mm_load_ss(src++);
      __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      *dst++ = _mm_extract_epi16(out_val, 0);
      n -= 1;
    }

  return samples;
}
Example #12
SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;
 
    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(y);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_sub_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] - y;
    case 2:
        z[n - 2] = x[n - 2] - y;
    case 1:
        z[n - 1] = x[n - 1] - y;
    }
}
Example #13
SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n)
{
    int i;
	static const uint32_t mask = 0x80000000;
	static const float *fmask = (float *) &mask;
    __m128 n1;
    __m128 n2;
 
    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(*fmask);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
		    n1 = _mm_xor_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = -x[n - 3];
    case 2:
        z[n - 2] = -x[n - 2];
    case 1:
        z[n - 1] = -x[n - 1];
    }
}
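Reading the 0x80000000 mask back through a casted float pointer relies on type punning; a union (or memcpy) expresses the same sign-bit constant without the strict-aliasing concern. A sketch:

union { uint32_t u; float f; } sign = { 0x80000000u };
__m128 n2 = _mm_set1_ps(sign.f);   // the sign bit set in every lane; XOR with it negates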
Example #14
void AngleQuaternion(vec_t *angles, vec_t *quaternion)
{
	static const ALIGN16_BEG int ps_signmask[4] ALIGN16_END = { 0x80000000, 0, 0x80000000, 0 };

	__m128 a = _mm_loadu_ps(angles);
	a = _mm_mul_ps(a, _mm_load_ps(_ps_0p5)); //a *= 0.5
	__m128 s, c;
	sincos_ps(a, &s, &c);

	__m128 im1 = _mm_shuffle_ps(s, c, _MM_SHUFFLE(1, 0, 1, 0)); //im1 =  {sin[0], sin[1], cos[0], cos[1] }
	__m128 im2 = _mm_shuffle_ps(c, s, _MM_SHUFFLE(2, 2, 2, 2)); //im2 =  {cos[2], cos[2], sin[2], sin[2] }

	__m128 part1 = _mm_mul_ps(
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(1, 2, 2, 0)),
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(0, 3, 1, 3))
		);
	part1 = _mm_mul_ps(part1, im2);

	__m128 part2 = _mm_mul_ps(
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(2, 1, 0, 2)),
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(3, 0, 3, 1))
		);

	part2 = _mm_mul_ps(part2, _mm_shuffle_ps(im2, im2, _MM_SHUFFLE(0, 0, 2, 2)));

	__m128 signmask = _mm_load_ps((float*)ps_signmask);
	part2 = _mm_xor_ps(part2, signmask);

	__m128 res = _mm_add_ps(part1, part2);
	_mm_storeu_ps(quaternion, res);
}
Example #15
static void
TEST (void)
{
  union128d s1;
  union128 u, s2;
  double source1[2] = {123.345, 67.3321};
  float  e[4] = {5633.098, 93.21, 3.34, 4555.2};

  s1.x = _mm_loadu_pd (source1);
  s2.x = _mm_loadu_ps (e);

  __asm("" : "+v"(s1.x), "+v"(s2.x));
  u.x = test(s2.x, s1.x);

  e[0] = (float)source1[0];

  if (check_union128(u, e))
#if DEBUG
  {
      printf ("sse2_test_cvtsd2ss_1; check_union128 failed\n");
      printf ("\t [%f,%f,%f,%f],[%f,%f]\n", s2.a[0], s2.a[1], s2.a[2], s2.a[3],
    		  s1.a[0], s1.a[1]);
      printf ("\t -> \t[%f,%f,%f,%f]\n", u.a[0], u.a[1], u.a[2], u.a[3]);
      printf ("\texpect\t[%f,%f,%f,%f]\n", e[0], e[1], e[2], e[3]);
  }
#else
    abort ();
#endif
}
Example #16
void FastResampler_FirFilter2_C2_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	Q_UNUSED(channels);
	__m128 sum = _mm_setzero_ps();
	__m128 v_frac = _mm_set1_ps(frac);
	for(unsigned int i = 0; i < filter_length / 4; ++i) {
		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
		coef1 += 4; coef2 += 4;
		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
		__m128 v_input1 = _mm_loadu_ps(input), v_input2 = _mm_loadu_ps(input + 4);
		input += 8;
		sum = _mm_add_ps(sum, _mm_mul_ps(v_input1, _mm_unpacklo_ps(filter_value, filter_value)));
		sum = _mm_add_ps(sum, _mm_mul_ps(v_input2, _mm_unpackhi_ps(filter_value, filter_value)));
	}
	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0xee));
	_mm_store_sd((double*) output, _mm_castps_pd(sum2));
}
Example #17
static void
clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  __m128 xmm1;
  float max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set_ps1(max);
  for (; n >= 4; n -= 4) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
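The complementary low clamp only swaps _mm_min_ps for _mm_max_ps in the vector loop; a sketch of that loop under the same aligned-destination convention (min_val standing in for *src2_1):

xmm1 = _mm_set_ps1(min_val);
for (; n >= 4; n -= 4) {
  __m128 xmm0 = _mm_loadu_ps(src1);
  xmm0 = _mm_max_ps(xmm0, xmm1);   // raise anything below the floor
  _mm_store_ps(dest, xmm0);
  dest += 4;
  src1 += 4;
}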
Example #18
File: dsp.cpp Project: taqu/opus
    void conv_Float1ToFloat2(void* dst, const void* s, s32 numSamples)
    {
        LSfloat* d = reinterpret_cast<LSfloat*>(dst);
        const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

        s32 num = numSamples >> 2; // process 4 floats at a time
        s32 offset = num << 2;
        s32 rem = numSamples - offset;

        const LSfloat* p = src;
        LSfloat* q = d;
        for(s32 i=0; i<num; ++i){
            __m128 f32_0 = _mm_loadu_ps(p);
            __m128 f32_1 = _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(1, 1, 0, 0));
            __m128 f32_2 = _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(3, 3, 2, 2));

            _mm_storeu_ps((q+0), f32_1);
            _mm_storeu_ps((q+4), f32_2);
            p += 4;
            q += 8;
        }

        for(s32 i=0; i<rem; ++i){
            s32 j=i<<1;
            q[j+0] = q[j+1] = p[i];
        }
    }
Example #19
static void GF_FUNC_ALIGN VS_CC
proc_horizontal(float *srcp, int radius, int length, int width, float *kernel,
                float *dstp)
{
    for (int i = 1; i <= radius; i++) {
        srcp[-i] = srcp[i];
        srcp[width - 1 + i] = srcp[width - 1 - i];
    }
    
    GF_ALIGN float ar_kernel[17][4];
    for (int i = 0; i < length; i++) {
        for (int j = 0; j < 4; j++) {
            ar_kernel[i][j] = kernel[i];
        }
    }
    
    for (int x = 0; x < width; x += 4) {
        __m128 sum = _mm_setzero_ps();
        
        for (int i = -radius; i <= radius; i++) {
            __m128 k = _mm_load_ps(ar_kernel[i + radius]);
            __m128 xmm0 = _mm_loadu_ps(srcp + x + i);
            sum = _mm_add_ps(sum, _mm_mul_ps(xmm0, k));
        }
        _mm_store_ps(dstp + x, sum);
    }
}
Example #20
File: dsp.cpp Project: taqu/opus
    void conv_Float2ToFloat1(void* dst, const void* s, s32 numSamples)
    {
        LSfloat* d = reinterpret_cast<LSfloat*>(dst);
        const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

        s32 num = numSamples >> 2; // process 4 floats at a time
        s32 offset = num << 2;
        s32 rem = numSamples - offset;
        __m128 coff = _mm_set1_ps(0.5f);

        const LSfloat* p = src;
        LSfloat* q = d;
        for(s32 i=0; i<num; ++i){
            __m128 f32_0 = _mm_loadu_ps(p);
            __m128 f32_1 = _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(2, 3, 1, 0));
            __m128 f32_2 = _mm_mul_ps(_mm_add_ps(f32_0, f32_1), coff);
            __m128 f32_3 = _mm_shuffle_ps(f32_2, f32_2, _MM_SHUFFLE(2, 0, 2, 0));

            _mm_storel_pi((__m64*)q, f32_3);
            p += 4;
            q += 2;
        }

        for(s32 i=0; i<rem; ++i){
            s32 j=i<<1;
            q[i] = 0.5f*(p[j+0]+p[j+1]);
        }
    }
Example #21
File: dsp.cpp Project: taqu/opus
    void conv_Float1ToShort1(void* dst, const void* s, s32 numSamples)
    {
        LSshort* d = reinterpret_cast<LSshort*>(dst);
        const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

        s32 num = numSamples >> 2; // process 4 floats at a time
        // leave one group of 4 samples out of the SIMD loop so the 16-byte store below cannot overrun
        if(0<num){
            --num;
        }
        s32 offset = num << 2;
        s32 rem = numSamples - offset;

        const __m128 fcoff = _mm_set1_ps(32768.0f);

        const LSfloat* p = src;
        LSshort* q = d;
        for(s32 i=0; i<num; ++i){
            __m128 f32_0 = _mm_mul_ps(_mm_loadu_ps(p), fcoff);
            __m128i s32_0 = _mm_cvtps_epi32(f32_0);
            __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0);

            _mm_storeu_si128((__m128i*)q, s16_0);
            p += 4;
            q += 4;
        }

        for(s32 i=0; i<rem; ++i){
            q[i] = toShort(p[i]);
        }
    }
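toShort is not shown; given the 32768.0f scale and the saturating _mm_packs_epi32 in the vector path, it is presumably a clamped float-to-int16 conversion along these lines (a sketch):

static inline LSshort toShort(LSfloat x)
{
    s32 v = static_cast<s32>(x * 32768.0f);
    if (v > 32767)  v = 32767;
    if (v < -32768) v = -32768;
    return static_cast<LSshort>(v);
}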
Example #22
static inline void AccumulateWeighted(Vec3f &out, const Vec3Packedf &in, const Vec4f &w) {
#ifdef _M_SSE
	out.vec = _mm_add_ps(out.vec, _mm_mul_ps(_mm_loadu_ps(in.AsArray()), w.vec));
#else
	out += in * w.x;
#endif
}
Example #23
float sumSSE(float* floats, int size) {

    int i,q,r;
    float f[4] = {0};

    //if(size<10) { float ax; while(size) { ax+=floats[size]; --size; } return ax; }

    q = 4*(size/4); // whole
    r = size - q; // remainder

    // load and sum the first 8 (assumes size >= 8 and that 'floats' is 16-byte aligned, since aligned loads are used)
    __m128 x = _mm_load_ps( floats);
    __m128 y = _mm_load_ps( floats + 4);
    x = _mm_add_ps(x,y);

    // sum remaining whole blocks one at a time (use j which is size-r...)
    for(i=8; i<q; i+=4) {
        y = _mm_load_ps(floats + i );
        x = _mm_add_ps(x,y);
    }

    //printf("size:%d q:%d r: %d\n",size,q,r);
    // if we have a remainder add it to our sum
    for(; r; --r) f[r] = floats[size-r];
    y = _mm_loadu_ps(f);
    x = _mm_add_ps(x,y);

    // move back into float array, and return the sum
    _mm_store_ps(f,x);
    return f[0] + f[1] + f[2] + f[3];
}
Example #24
void *calculate_row(void *thread_id)
{
	long id = (long)thread_id;
	int i, j, ii, jj, ii_limit, jj_limit, l, start_i, end_i, jj_limit_minus_3;
	__m128 a_line, b_line, r_line;
	const float *A = matrix_a, *B= matrix_b;
	const int n = matrix_n, m = matrix_m, k = matrix_k;
	const float *row_in_B;   /* points into the const matrix B */
	float *row_in_C, ii_l_in_A;

	start_i = row_per_thread * id;
	end_i = row_per_thread * (id + 1);
	if(end_i >= m)
		end_i = m;
	
	for(i = start_i ; i < end_i ; i += TILE_SIZE) // i: row block index in C 
	{
		ii_limit = min(end_i, i + TILE_SIZE);

		for(j = 0 ; j < n ; j += TILE_SIZE2) // j : col block index in C
		{
			jj_limit = min(n, j + TILE_SIZE2);
			jj_limit_minus_3 = jj_limit - 3;

			for(l = 0; l < k; ++l)
			{
				row_in_B = B + l*n;
				for(ii = i ; ii < ii_limit; ++ii)
				{
					ii_l_in_A = A[ii * k + l];
					a_line = _mm_set1_ps(ii_l_in_A); //A[ii * k + l]);  
					row_in_C = matrix_c + ii * n;
					for(jj = j; jj < jj_limit_minus_3 ; jj += 4)
					{
						b_line = _mm_loadu_ps(row_in_B + jj);
						r_line = _mm_loadu_ps(row_in_C + jj);
						_mm_storeu_ps(row_in_C + jj, _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line));
					}
					for(; jj < jj_limit; ++jj)
					{
						*(row_in_C + jj) += ii_l_in_A * *(row_in_B + jj);
					}
				}
			}
		}
	}
	pthread_exit(NULL);
}
Example #25
/* the fast arctan function adopted from OpenCV */
static void _ccv_atan2(float* x, float* y, float* angle, float* mag, int len)
{
	int i = 0;
	float scale = (float)(180.0 / CCV_PI);
#ifdef HAVE_SSE2
#ifndef _WIN32
	union { int i; float fl; } iabsmask; iabsmask.i = 0x7fffffff;
	__m128 eps = _mm_set1_ps((float)1e-6), absmask = _mm_set1_ps(iabsmask.fl);
	__m128 _90 = _mm_set1_ps((float)(3.141592654 * 0.5)), _180 = _mm_set1_ps((float)3.141592654), _360 = _mm_set1_ps((float)(3.141592654 * 2));
	__m128 zero = _mm_setzero_ps(), _0_28 = _mm_set1_ps(0.28f), scale4 = _mm_set1_ps(scale);
	
	for(; i <= len - 4; i += 4)
	{
		__m128 x4 = _mm_loadu_ps(x + i), y4 = _mm_loadu_ps(y + i);
		__m128 xq4 = _mm_mul_ps(x4, x4), yq4 = _mm_mul_ps(y4, y4);
		__m128 xly = _mm_cmplt_ps(xq4, yq4);
		__m128 z4 = _mm_div_ps(_mm_mul_ps(x4, y4), _mm_add_ps(_mm_add_ps(_mm_max_ps(xq4, yq4), _mm_mul_ps(_mm_min_ps(xq4, yq4), _0_28)), eps));

		// a4 <- x < y ? 90 : 0;
		__m128 a4 = _mm_and_ps(xly, _90);
		// a4 <- (y < 0 ? 360 - a4 : a4) == ((x < y ? y < 0 ? 270 : 90) : (y < 0 ? 360 : 0))
		__m128 mask = _mm_cmplt_ps(y4, zero);
		a4 = _mm_or_ps(_mm_and_ps(_mm_sub_ps(_360, a4), mask), _mm_andnot_ps(mask, a4));
		// a4 <- (x < 0 && !(x < y) ? 180 : a4)
		mask = _mm_andnot_ps(xly, _mm_cmplt_ps(x4, zero));
		a4 = _mm_or_ps(_mm_and_ps(_180, mask), _mm_andnot_ps(mask, a4));
		
		// a4 <- (x < y ? a4 - z4 : a4 + z4)
		a4 = _mm_mul_ps(_mm_add_ps(_mm_xor_ps(z4, _mm_andnot_ps(absmask, xly)), a4), scale4);
		__m128 m4 = _mm_sqrt_ps(_mm_add_ps(xq4, yq4));
		_mm_storeu_ps(angle + i, a4);
		_mm_storeu_ps(mag + i, m4);
	}
#endif
#endif
	for(; i < len; i++)
	{
		float xf = x[i], yf = y[i];
		float a, x2 = xf * xf, y2 = yf * yf;
		if(y2 <= x2)
			a = xf * yf / (x2 + 0.28f * y2 + (float)1e-6) + (float)(xf < 0 ? CCV_PI : yf >= 0 ? 0 : CCV_PI * 2);
		else
			a = (float)(yf >= 0 ? CCV_PI * 0.5 : CCV_PI * 1.5) - xf * yf / (y2 + 0.28f * x2 + (float)1e-6);
		angle[i] = a * scale;
		mag[i] = sqrtf(x2 + y2);
	}
}
Example #26
// Note: both operands are read row-wise, so this assumes B already holds the transposed
// right-hand matrix (otherwise the result is A * B^T), and SIZE is a multiple of 4.
void matrix_mult_simd(float A[SIZE][SIZE], float B[SIZE][SIZE], float ans[SIZE][SIZE]) {
  float temp[4] = {0};
  __m128 acc, a, b;

  for (int i = 0 ; i < SIZE ; i++) {
    for (int j = 0; j < SIZE; j ++) {
      acc = _mm_set1_ps(0.0);
      for (int k = 0; k < (SIZE - 3); k +=4) {
        a = _mm_loadu_ps(&A[i][k]);   // four elements of row i of A
        b = _mm_loadu_ps(&B[j][k]);   // four elements of row j of B
        acc = _mm_add_ps(acc, _mm_mul_ps(a, b));
      }
      _mm_storeu_ps(temp, acc);
      ans[i][j] = temp[0] + temp[1] + temp[2] + temp[3];
    }
  }
}
Example #27
static void
sse3_test_movsldup_reg (float *i1, float *r)
{
  __m128 t1 = _mm_loadu_ps (i1);
  __m128 t2 = _mm_moveldup_ps (t1);

  _mm_storeu_ps (r, t2);
}
Example #28
void LoadRenderParams(float inScaleVal,
                      float outScaleVal,
                      const RenderParams & renderParams,
                      __m128 & inScale,
                      __m128 & outScale,
                      __m128 & slope,
                      __m128 & offset,
                      __m128 & power,
                      __m128 & saturation)
{
    inScale    = _mm_set1_ps(inScaleVal);
    outScale   = _mm_set1_ps(outScaleVal);
    slope      = _mm_loadu_ps(renderParams.getSlope());
    offset     = _mm_loadu_ps(renderParams.getOffset());
    power      = _mm_loadu_ps(renderParams.getPower());
    saturation = _mm_set1_ps(renderParams.getSaturation());
}
Example #29
  void _Run(OutputPixelType aaOutput[ciHeight][ciWidth], InputPixelType_1 aaInput1[ciHeight][ciWidth], InputPixelType_2 aaInput2[ciHeight][ciWidth])
  {
    for (int iY = 0; iY < ciHeight; ++iY)
    {
      OutputPixelType   *pOutput = aaOutput[iY];
      InputPixelType_1  *pInput1 = aaInput1[iY];
      InputPixelType_2  *pInput2 = aaInput2[iY];

      for (int iX = 0; iX < ciWidth; iX += VectorWidth)
      {
        __m128 mmIn1 = _mm_loadu_ps( pInput1 + iX );
        __m128 mmIn2 = _mm_loadu_ps( pInput2 + iX );

        _mm_storeu_ps( pOutput + iX, _mm_add_ps(mmIn1, mmIn2) );
      }
    }
  }
Example #30
inline __m128 sse_load( float * i ) 
{
#ifdef CODE_ALIGNED_SIMD_INSTRUCTIONS
	return _mm_load_ps( i );
#else
	return _mm_loadu_ps( i );
#endif
}
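A matching store helper under the same alignment switch might look like this (a sketch, not part of the original source):

inline void sse_store( float * i, __m128 v )
{
#ifdef CODE_ALIGNED_SIMD_INSTRUCTIONS
	_mm_store_ps( i, v );
#else
	_mm_storeu_ps( i, v );
#endif
}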