template <> vector3d<float> vector3d<float>::transform(const matrix3d<float> &M) const
{
  float _x, _y, _z;
  __m128 v1, v2, rx, ry, rz;

  // The input vector is the same for all three rows; load it once.
  v1 = _mm_load_ps(&coefficients[0]);

  // dp_ps mask 0xF1: multiply all four lanes, write the sum to lane 0 only.
  v2 = _mm_load_ps(&M.elements[0]);
  rx = _mm_dp_ps(v1, v2, 0xF1);

  v2 = _mm_load_ps(&M.elements[4]);
  ry = _mm_dp_ps(v1, v2, 0xF1);

  v2 = _mm_load_ps(&M.elements[8]);
  rz = _mm_dp_ps(v1, v2, 0xF1);
  
  _mm_store_ss(&_x,rx);
  _mm_store_ss(&_y,ry);
  _mm_store_ss(&_z,rz);
  
  vector3d<float> q;
  q.set(_x,_y,_z);
  return q;
}
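Note that _mm_dp_ps requires SSE4.1. On SSE2-only targets the same per-row dot product can be assembled from one multiply and two shuffle-adds; a minimal sketch (the helper name is illustrative, not part of the class above):

static inline float dot4_sse2(const float *a, const float *b)
{
  __m128 m = _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b));  /* a0*b0 .. a3*b3 */
  __m128 s = _mm_add_ps(m, _mm_movehl_ps(m, m));          /* {m0+m2, m1+m3, ...} */
  s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));             /* lane 0 = m0+m1+m2+m3 */
  float r;
  _mm_store_ss(&r, s);
  return r;
}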
Example #2
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}
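For reference, this is the scalar loop being vectorized; in the float build of Opus, MAC16_16(c, a, b) reduces to c + a*b (a sketch, not the library's source):

static void dual_inner_prod_scalar(const float *x, const float *y01, const float *y02,
      int N, float *xy1, float *xy2)
{
   float acc1 = 0.f, acc2 = 0.f;
   for (int i = 0; i < N; i++)
   {
      acc1 += x[i] * y01[i];  /* one load of x[i] feeds both accumulators */
      acc2 += x[i] * y02[i];
   }
   *xy1 = acc1;
   *xy2 = acc2;
}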
Example #3
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m128 sum_l = _mm_setzero_ps();
   __m128 sum_r = _mm_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned taps = resamp->taps;
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   __m128 delta = _mm_set1_ps((float)(resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 4)
   {
      __m128 buf_l = _mm_loadu_ps(buffer_l + i);
      __m128 buf_r = _mm_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
      __m128 deltas = _mm_load_ps(delta_table + i);
      __m128 sinc = _mm_add_ps(_mm_load_ps(phase_table + i), _mm_mul_ps(deltas, delta));
#else
      __m128 sinc = _mm_load_ps(phase_table + i);
#endif
      sum_l       = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
      sum_r       = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
   }

   // Them annoying shuffles :V
   // sum_l = { l3, l2, l1, l0 }
   // sum_r = { r3, r2, r1, r0 }

   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
         _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

   // sum   = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
   // sum   = { R1, R0, L1, L0 }

   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

   // sum   = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
   // sum   = { X,  R,  X,  L } 

   // Store L
   _mm_store_ss(out_buffer + 0, sum);

   // movehl { X, R, X, L } == { X, R, X, R }
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
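The two shuffle-adds at the end reduce both channels in a single pass. Extracted as a standalone helper for reference (a sketch; the name is illustrative):

static inline void hsum_stereo(__m128 sum_l, __m128 sum_r, float *out_l, float *out_r)
{
   /* interleave: {l0,l1,r0,r1} + {l2,l3,r2,r3} = {L0,L1,R0,R1} */
   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
         _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
   /* fold the pairs: lane 0 = L0+L1, lane 2 = R0+R1 */
   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
   _mm_store_ss(out_l, sum);
   _mm_store_ss(out_r, _mm_movehl_ps(sum, sum));
}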
Example #4
static void process_sinc(rarch_resampler_t *resamp, float *out_buffer)
{
   __m128 sum_l = _mm_setzero_ps();
   __m128 sum_r = _mm_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned phase = resamp->time >> PHASES_SHIFT;
   unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
   __m128 delta_f = _mm_set1_ps(delta);

   const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
   const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];

   for (unsigned i = 0; i < TAPS; i += 4)
   {
      __m128 buf_l  = _mm_loadu_ps(buffer_l + i);
      __m128 buf_r  = _mm_loadu_ps(buffer_r + i);

      __m128 phases = _mm_load_ps(phase_table + i);
      __m128 deltas = _mm_load_ps(delta_table + i);

      __m128 sinc   = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));

      sum_l         = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
      sum_r         = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
   }

   // Them annoying shuffles :V
   // sum_l = { l3, l2, l1, l0 }
   // sum_r = { r3, r2, r1, r0 }

   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
         _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

   // sum   = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
   // sum   = { R1, R0, L1, L0 }

   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

   // sum   = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
   // sum   = { X,  R,  X,  L } 

   // Store L
   _mm_store_ss(out_buffer + 0, sum);

   // movehl { X, R, X, L } == { X, R, X, R }
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
Example #5
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
    unsigned i;
    __m256 sum_l             = _mm256_setzero_ps();
    __m256 sum_r             = _mm256_setzero_ps();

    const float *buffer_l    = resamp->buffer_l + resamp->ptr;
    const float *buffer_r    = resamp->buffer_r + resamp->ptr;

    unsigned taps            = resamp->taps;
    unsigned phase           = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
    const float *phase_table = resamp->phase_table + phase * taps * 2;
    const float *delta_table = phase_table + taps;
    __m256 delta             = _mm256_set1_ps((float)
                               (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
    const float *phase_table = resamp->phase_table + phase * taps;
#endif

    for (i = 0; i < taps; i += 8)
    {
        __m256 buf_l  = _mm256_loadu_ps(buffer_l + i);
        __m256 buf_r  = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
        __m256 deltas = _mm256_load_ps(delta_table + i);
        __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
                                      _mm256_mul_ps(deltas, delta));
#else
        __m256 sinc   = _mm256_load_ps(phase_table + i);
#endif
        sum_l         = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
        sum_r         = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
    }

    /* hadd on AVX is weird, and acts on low-lanes
     * and high-lanes separately. */
    __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
    __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
    res_l        = _mm256_hadd_ps(res_l, res_l);
    res_r        = _mm256_hadd_ps(res_r, res_r);
    res_l        = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
    res_r        = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

    /* This is optimized to mov %xmmN, [mem].
     * There doesn't seem to be any _mm256_store_ss intrinsic. */
    _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
    _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
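The hadd/permute tail generalizes to any 8-lane sum. As a standalone helper (a sketch, assuming AVX and immintrin.h; _mm256_castps256_ps128 is the zero-cost equivalent of extracting the low 128 bits):

static inline float hsum256(__m256 v)
{
    /* hadd operates within each 128-bit lane, so two rounds leave each
       half's subtotal broadcast across its own lane */
    v = _mm256_hadd_ps(v, v);
    v = _mm256_hadd_ps(v, v);
    /* swap the 128-bit halves and add to combine the two subtotals */
    v = _mm256_add_ps(_mm256_permute2f128_ps(v, v, 1), v);
    float r;
    _mm_store_ss(&r, _mm256_castps256_ps128(v));
    return r;
}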
Example #6
F32 round(F32 val) {
#ifdef USE_SSE4
    __m128 t = _mm_set_ss(val);
    t = _mm_round_ss(t, t, _MM_FROUND_TO_NEAREST_INT);
    _mm_store_ss(&val, t);
#elif defined(USE_SSE2)
    __m128 t = _mm_set_ss(val);
    U32 i = (U32)_mm_cvtss_si32(t);   /* uses MXCSR rounding: nearest, ties to even */
    t = _mm_cvtsi32_ss(t, (int32)i);
    _mm_store_ss(&val, t);
#else
    val = (F32)core_floor(val + 0.5f);
#endif
    return val;
}
Example #7
int intr_frustum_box(const float* frustum, const float* box_min, const float* box_max)
{
    const __m128 min = _mm_load_ps(box_min);
    const __m128 max = _mm_load_ps(box_max);

    for (int i = 0; i < 6; i++) {
        const __m128 plane = _mm_load_ps(frustum + 4 * i);

        const __m128 mask = _mm_cmplt_ps(plane, _mm_setzero_ps());
        const __m128 n = _mm_or_ps(_mm_and_ps(mask, max),
                                   _mm_andnot_ps(mask, min));

        const __m128 d = _mm_mul_ps(n, plane);
        const __m128 d0 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(2, 1, 0, 3));
        const __m128 d1 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(1, 0, 3, 2));
        const __m128 d2 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(0, 3, 2, 1));
        const __m128 dot = _mm_add_ss(_mm_add_ss(d0, d), _mm_add_ss(d1, d2));

        const __m128 ret = _mm_cmpgt_ss(dot, _mm_setzero_ps());
        
        float reti;
        _mm_store_ss(&reti, ret);
        if (reti != 0)
            return 0;
    }
    
    return 1;
}
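A usage sketch under the layout the loop implies: six contiguous (a,b,c,d) planes and box corners padded to four floats with w = 1, so each plane's d term folds into the dot product; everything 16-byte aligned for _mm_load_ps (GCC alignment syntax, placeholder values):

__attribute__((aligned(16))) float frustum[6][4] = {{0.f}}; /* fill with plane coefficients */
__attribute__((aligned(16))) float box_min[4] = { -1.f, -1.f, -1.f, 1.f };
__attribute__((aligned(16))) float box_max[4] = {  1.f,  1.f,  1.f, 1.f };

int visible = intr_frustum_box(&frustum[0][0], box_min, box_max); /* 0 = culled */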
Example #8
static inline long
conv_yHalf_yF (const uint16_t *src, float *dst, long samples)
{
  const uint64_t *s_vec;
  __v4sf         *d_vec;

  long n = samples;

  s_vec = (const uint64_t *)src;
  d_vec = (__v4sf *)dst;

  while (n >= 4)
    {
      __m128i in_val = _mm_insert_epi64((__m128i)_mm_setzero_ps(), *s_vec++, 0);
      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
      _mm_storeu_ps((float *)d_vec++, out_val);
      n -= 4;
    }

  src = (const uint16_t *)s_vec;
  dst = (float *)d_vec;

  while (n)
    {
      __m128i in_val = _mm_insert_epi16((__m128i)_mm_setzero_ps(), *src++, 0);
      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
      _mm_store_ss(dst++, out_val);
      n -= 1;
    }

  return samples;
}
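A usage sketch (the routine relies on F16C for _mm_cvtph_ps and SSE4.1 for _mm_insert_epi64; buffer names and sizes here are illustrative):

uint16_t half_gray[10];   /* IEEE binary16 samples, e.g. one EXR scanline */
float    float_gray[10];
/* ... fill half_gray ... */
conv_yHalf_yF(half_gray, float_gray, 10);   /* two vector iterations, two scalar */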
Example #9
void VertexFuncSmoothSphereMapAOS(
    const VERTEXINPUT &in,
    VERTEXOUTPUT &out)
{
    const FLOAT *pWorldMat = (FLOAT *)in.pConstants;
    const FLOAT *pLightPos = (FLOAT *)in.pConstants + CONST_OFFSET_LIGHT_POSITION;

    Mat4Vec3Multiply(out.pVertices, pWorldMat, (FLOAT *)in.pAttribs);

    // Transform normals in attrib buffer by world matrix.
    OSALIGNLINE(FLOAT) normals[4];
    Mat4Vec3Multiply(&normals[0], pWorldMat, &in.pAttribs[SHADER_INPUT_SLOT_NORMAL]);

    // Normalize normals
    __m128 normal = Vec3Normalize(&normals[0]);

    __m128 lightPos = _mm_load_ps(pLightPos);
    __m128 lightDir = _mm_load_ps(in.pAttribs);

    // Compute a light direction vector for each vertex.
    //     lightDir = Normalize(lightPos - vertexPos)
    lightDir = _mm_sub_ps(lightPos, lightDir);
    lightDir = Vec3Normalize(lightDir);

    __m128 ndotl0 = Vec3Diffuse(normal, lightDir);

    _mm_store_ps(&out.pAttribs[0], _mm_load_ps(&in.pAttribs[0]));

    _mm_store_ss(&out.pAttribs[0x2], ndotl0);
}
Example #10
/* Combines unpack and accumulate */
void vector_accumulate_8bit(float *out, const char *in, int n) {
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    float ftmp;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in  + 64, 0, 0);

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < n ; ii++) {  // Cast these without intrinsics
        ftmp = (float)(*in);
        out_ = _mm_load_ss(out);
        in_ = _mm_load_ss(&ftmp);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in  += 1;
        out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += (float)in[i]; }
#endif
}
Example #11
int
drid_moments(float* coords, int32_t index, int32_t* partners, int32_t n_partners, double* moments)
{
    int32_t i;
    float d;
    moments_t onlinemoments;
    __m128 x, y, r, r2, s;

    moments_clear(&onlinemoments);
    x = load_float3(&coords[3 * index]);

    for (i = 0; i < n_partners; i++) {
        y = load_float3(&coords[3 * partners[i]]);
        r = _mm_sub_ps(x, y);     /* x - y       */
        r2 = _mm_mul_ps(r, r);    /* (x - y)**2  */

        /* horizontal add the components of r2 with */
        /* two instructions. note: it's critical */
        /* here that the last entry of x and y was 0 */
        /* so that r2.w = 0 */
        s = _mm_add_ps(r2, _mm_movehl_ps(r2, r2));
        s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));
        /* store into a regular float. I tried using _mm_rsqrt_ps, but it's
           not accurate enough to pass the tests */
        _mm_store_ss(&d, s);
        moments_push(&onlinemoments, 1.0 / sqrt((double) d));
    }

    moments[0] = moments_mean(&onlinemoments);
    moments[1] = sqrt(moments_second(&onlinemoments));
    moments[2] = cbrt(moments_third(&onlinemoments));

    return 1;
}
Example #12
__inline static void _mm_add_ps_4x1(__m128 sum, float *dst) {
  // lanes: { A+C, B+D, ... }
  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)));
  // lane 0: A+B+C+D
  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
  _mm_store_ss(dst, sum);
}
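A usage sketch: the helper pairs naturally with a multiply-accumulate loop to finish a dot product (illustrative; assumes len is a multiple of 4):

static float dot_sse(const float *a, const float *b, int len) {
  __m128 acc = _mm_setzero_ps();
  for (int i = 0; i < len; i += 4)
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(a + i), _mm_loadu_ps(b + i)));
  float result;
  _mm_add_ps_4x1(acc, &result);
  return result;
}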
Example #13
void ColorModelView::paintEvent(QPaintEvent *)
{
    QPainter p(this);

    auto mainBounds = mainAreaBounds();
    auto sideBounds = sideAreaBounds();

    if (mainImage_.isNull()) {
        // FIXME: support other color model?
        QImage img(256, 256, QImage::Format_RGB32);
        auto *pixels = reinterpret_cast<quint32 *>(img.bits());
        auto basecolor = QColor::fromHsv(value_.hsvHue(), 255, 255);
        auto basecolorMM = _mm_setr_epi32(basecolor.blue(), basecolor.green(), basecolor.red(), 0);
        basecolorMM = _mm_add_epi32(basecolorMM, _mm_srli_epi32(basecolorMM, 7)); // map [0, 255] to [0, 256]
        auto white = _mm_set1_epi32(256 * 255);
        auto dX = _mm_sub_epi32(basecolorMM, _mm_set1_epi32(256));
        for (int y = 0; y < 256; ++y) {
            auto brightness = _mm_set1_epi32(256 - y - (y >> 7));
            auto col = white; // [0, 256 * 255]
            for (int x = 0; x < 256; ++x) {
                auto c = _mm_mullo_epi16(_mm_srli_epi32(col, 8), brightness);
                c = _mm_srli_epi16(c, 8); // [0, 255]
                c = _mm_packs_epi32(c, c);
                c = _mm_packus_epi16(c, c);

                _mm_store_ss(reinterpret_cast<float *>(&pixels[x + y * 256]),
                        _mm_castsi128_ps(c));

                col = _mm_add_epi32(col, dX);
            }
        }
        mainImage_ = QPixmap::fromImage(img);
    }
Example #14
F32 ceil(F32 val) {
#ifdef USE_SSE4
    __m128 t = _mm_set_ss(val);
    t = _mm_ceil_ss(t, t);
    _mm_store_ss(&val, t);
#elif defined(USE_SSE2)
    val += 0.5f;   /* round-to-nearest trick; note ties-to-even makes ceil(3.0f) return 4 */
    __m128 t = _mm_set_ss(val);
    U32 i = (U32)_mm_cvtss_si32(t);
    t = _mm_cvtsi32_ss(t, (int32)i);
    _mm_store_ss(&val, t);
#else
    val = (F32)core_ceil(val);
#endif
    return val;
}
void SubpixelMaximizer::fitUsingSSE3(float coef[FitMatrix::ROWS], const signed short data[3][3][3]) const
{
  assert(FitMatrix::PADDEDCOLS == 32);
  __m128 localFitMatrixScale = _mm_set_ss(fitMatrix.scale);
  const short* localFitMatrix = fitMatrix();
  // Load data into four SSE Registers
  __m128i x[4];
  signed short* dataFlat = (signed short*) data; // flat array of 27 signed shorts
  x[0] = _mm_loadu_si128((__m128i*)(dataFlat + 0));
  x[1] = _mm_loadu_si128((__m128i*)(dataFlat + 8));
  x[2] = _mm_loadu_si128((__m128i*)(dataFlat + 16));
  x[3] = _mm_loadu_si128((__m128i*)(dataFlat + 24));
  x[3] = _mm_srli_si128(_mm_slli_si128(x[3], 10), 10);   // Clear dataFlat[27..31]

  for(int i = 0; i < FitMatrix::ROWS; i++)
  {
    // Compute the scalar product between the 32 signed shorts in x and localFitMatrix
    __m128i sum =             _mm_madd_epi16(x[0], *(__m128i*)(localFitMatrix + 0));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[1], *(__m128i*)(localFitMatrix + 8)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[2], *(__m128i*)(localFitMatrix + 16)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[3], *(__m128i*)(localFitMatrix + 24)));
    sum = _mm_hadd_epi32(sum, sum);
    sum = _mm_hadd_epi32(sum, sum);
    _mm_store_ss(coef + i, _mm_mul_ss(_mm_cvtepi32_ps(sum), localFitMatrixScale));
    localFitMatrix += 32;
  }
}
Example #16
/* use compiler intrinsics for 4x parallel processing */
static inline float chi2_intrinsic_aligned_float(int n, const float* x, const float* y) {
    float result=0;
    const __m128 eps = _mm_set1_ps(FLT_MIN);
    const __m128 zero = _mm_setzero_ps();
    __m128 chi2 = _mm_setzero_ps();
    
    for (; n>3; n-=4) {
        const __m128 a = _mm_loadu_ps(x);
        const __m128 b = _mm_loadu_ps(y);
        const __m128 a_plus_eps = _mm_add_ps(a,eps);
        const __m128 a_plus_b_plus_eps = _mm_add_ps(a_plus_eps,b);
        const __m128 a_minus_b = _mm_sub_ps(a,b);
        const __m128 a_minus_b_sq = _mm_mul_ps(a_minus_b, a_minus_b);
        const __m128 prod = _mm_div_ps(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_ps(chi2, prod);
        x += 4;
        y += 4;
    }
    const __m128 shuffle1 = _mm_shuffle_ps(chi2, chi2, _MM_SHUFFLE(1,0,3,2));
    const __m128 sum1 = _mm_add_ps(chi2, shuffle1);
    const __m128 shuffle2 = _mm_shuffle_ps(sum1, sum1, _MM_SHUFFLE(2,3,0,1));
    const __m128 sum2 = _mm_add_ps(sum1, shuffle2);
    // with SSE3, we could use hadd_ps, but the difference is negligible

    _mm_store_ss(&result,sum2);
    _mm_empty();
    
    if (n)
        result += chi2_baseline_float(n, x, y);	// remaining 1-3 entries
    return result;
}
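chi2_baseline_float is not shown on this page; a plausible scalar version consistent with the vector loop's (a - b)^2 / (a + b + eps) formula would be (a sketch, not the project's source):

static float chi2_baseline_float(int n, const float* x, const float* y) {
    float chi2 = 0.f;
    for (int i = 0; i < n; i++) {
        const float diff = x[i] - y[i];
        chi2 += diff * diff / (x[i] + y[i] + FLT_MIN);
    }
    return chi2;
}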
Example #17
void vector_accumulate(float *out, const float *in, int n) {
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in  + 64, 0, 0);

        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < n ; ii++) {
        in_  = _mm_load_ss(in);
        out_ = _mm_load_ss(out);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in  += 1;
        out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += in[i]; }
#endif
}
Example #18
  float SSEVector3::operator *( const SSEVector3& v ) const
  {
    float result[ 4 ];

    // Multiply x, y, z (not w); store result in lowest float: 0111 0001
    const int mask = 0x71;
    _mm_store_ss( result, _mm_dp_ps( vec, v.vec, mask ) );
    return result[ 0 ];
  }
Example #19
  float SSEVector3::Length() const
  {
    float result[ 4 ];

    // Store in all floats, do not multiply fourth value: 0111 1111
    const int mask = 0x7F;
    _mm_store_ss( result, _mm_sqrt_ss( _mm_dp_ps( vec, vec, mask ) ) );
    return result[ 0 ];
  }
Example #20
HW_FORCE_INLINE float dot(const Vec<N>& a, const Vec<N>& b) {
	__m128 x = _mm_mul_ps(a.xmm, b.xmm);

	x = _mm_hadd_ps(x, x);
	x = _mm_hadd_ps(x, x);

	float tmp;
	_mm_store_ss(&tmp, x);
	return tmp;
}
Example #21
F32 root(F32 val) {
#ifdef USE_SSE2
    __m128 i = _mm_set_ss(val);
    i = _mm_sqrt_ss(i);
    _mm_store_ss(&val, i);
    return val;
#else
    return (F32)core_sqrt((F64)val);
#endif
}
Example #22
// use MMX/SSE extensions
void dotprod_rrrf_execute_mmx(dotprod_rrrf _q,
                              float *      _x,
                              float *      _y)
{
    // first cut: ...
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register

    // t = 4*floor(n/4), the number of inputs handled by the vector loop
    unsigned int t = (_q->n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&_x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute multiplication
        s = _mm_mul_ps(v, h);
       
        // parallel addition
        sum = _mm_add_ps( sum, s );
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

#if HAVE_PMMINTRIN_H
    // fold down into single value
    __m128 z = _mm_setzero_ps();
    sum = _mm_hadd_ps(sum, z);
    sum = _mm_hadd_ps(sum, z);
   
    // unload single (lower value)
    _mm_store_ss(w, sum);
    float total = w[0];
#else
    // unload packed array
    _mm_store_ps(w, sum);
    float total = w[0] + w[1] + w[2] + w[3];
#endif

    // cleanup
    for (; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
Example #23
// Update location by velocity, one time-step
void update_coords(uint32_t i, float* x, float* y, float* z, float* vx, float* vy, float* vz) {
	__m128 vec, flo, out;


	vec = _mm_set_ss(vx[i]);
	flo = _mm_set_ss(x[i]);
	out = _mm_add_ss(vec, flo);
	_mm_store_ss(&x[i], out);

	vec = _mm_set_ss(vy[i]);
	flo = _mm_set_ss(y[i]);
	out = _mm_add_ss(vec, flo);
	_mm_store_ss(&y[i], out);

	vec = _mm_set_ss(vz[i]);
	flo = _mm_set_ss(z[i]);
	out = _mm_add_ss(vec, flo);
	_mm_store_ss(&z[i], out);

}
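The _ss operations above move one lane per instruction, so this is no faster than plain scalar code. Since the data is already laid out SoA, a sketch that steps four particles per iteration (assumes n is a multiple of 4; _mm_loadu_ps tolerates unaligned arrays):

void update_coords_vec(uint32_t n, float* x, float* y, float* z,
                       float* vx, float* vy, float* vz) {
	for (uint32_t i = 0; i < n; i += 4) {
		_mm_storeu_ps(&x[i], _mm_add_ps(_mm_loadu_ps(&x[i]), _mm_loadu_ps(&vx[i])));
		_mm_storeu_ps(&y[i], _mm_add_ps(_mm_loadu_ps(&y[i]), _mm_loadu_ps(&vy[i])));
		_mm_storeu_ps(&z[i], _mm_add_ps(_mm_loadu_ps(&z[i]), _mm_loadu_ps(&vz[i])));
	}
}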
Example #24
HW_FORCE_INLINE float invLength(const Vec<N>& a) {
	__m128 x = _mm_mul_ps(a.xmm, a.xmm);

	x = _mm_hadd_ps(x, x);
	x = _mm_hadd_ps(x, x);

	x = _mm_rsqrt_ss(x);

	float tmp;
	_mm_store_ss(&tmp, x);
	return tmp;
}
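_mm_rsqrt_ss is only accurate to roughly 12 bits. When more precision is needed, one Newton-Raphson step nearly doubles it; a sketch of the same function with the refinement (same Vec<N>/HW_FORCE_INLINE assumptions as above):

HW_FORCE_INLINE float invLengthRefined(const Vec<N>& a) {
	__m128 x = _mm_mul_ps(a.xmm, a.xmm);
	x = _mm_hadd_ps(x, x);
	x = _mm_hadd_ps(x, x);                 // lane 0 = squared length s
	__m128 y = _mm_rsqrt_ss(x);            // y0 ~ 1/sqrt(s)
	// y1 = y0 * (1.5 - 0.5 * s * y0 * y0)
	__m128 y2 = _mm_mul_ss(y, y);
	__m128 t  = _mm_sub_ss(_mm_set_ss(1.5f),
	                       _mm_mul_ss(_mm_mul_ss(_mm_set_ss(0.5f), x), y2));
	y = _mm_mul_ss(y, t);

	float tmp;
	_mm_store_ss(&tmp, y);
	return tmp;
}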
Example #25
void R_LocalPointToGlobal( const float modelMatrix[16], const idVec3 &in, idVec3 &out ) {
#if defined(MACOS_X) && defined(__i386__)
	__m128 m0, m1, m2, m3;
	__m128 in0, in1, in2;
	float i0,i1,i2;
	i0 = in[0];
	i1 = in[1];
	i2 = in[2];
	
	m0 = _mm_loadu_ps(&modelMatrix[0]);
	m1 = _mm_loadu_ps(&modelMatrix[4]);
	m2 = _mm_loadu_ps(&modelMatrix[8]);
	m3 = _mm_loadu_ps(&modelMatrix[12]);
	
	in0 = _mm_load1_ps(&i0);
	in1 = _mm_load1_ps(&i1);
	in2 = _mm_load1_ps(&i2);
	
	m0 = _mm_mul_ps(m0, in0);
	m1 = _mm_mul_ps(m1, in1);
	m2 = _mm_mul_ps(m2, in2);

	m0 = _mm_add_ps(m0, m1);
	m0 = _mm_add_ps(m0, m2);
	m0 = _mm_add_ps(m0, m3);
	
	_mm_store_ss(&out[0], m0);
	m1 = (__m128) _mm_shuffle_epi32((__m128i)m0, 0x55);
	_mm_store_ss(&out[1], m1);
	m2 = _mm_movehl_ps(m2, m0);
	_mm_store_ss(&out[2], m2);
#else	
	out[0] = in[0] * modelMatrix[0] + in[1] * modelMatrix[4]
		+ in[2] * modelMatrix[8] + modelMatrix[12];
	out[1] = in[0] * modelMatrix[1] + in[1] * modelMatrix[5]
		+ in[2] * modelMatrix[9] + modelMatrix[13];
	out[2] = in[0] * modelMatrix[2] + in[1] * modelMatrix[6]
		+ in[2] * modelMatrix[10] + modelMatrix[14];
#endif
}
Example #26
/*
====================
idMD5Mesh::CalculateBounds
====================
*/
void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const {

	__m128 minX = vector_float_posInfinity;
	__m128 minY = vector_float_posInfinity;
	__m128 minZ = vector_float_posInfinity;
	__m128 maxX = vector_float_negInfinity;
	__m128 maxY = vector_float_negInfinity;
	__m128 maxZ = vector_float_negInfinity;
	for ( int i = 0; i < numMeshJoints; i++ ) {
		const idJointMat & joint = entJoints[meshJoints[i]];
		__m128 x = _mm_load_ps( joint.ToFloatPtr() + 0 * 4 );
		__m128 y = _mm_load_ps( joint.ToFloatPtr() + 1 * 4 );
		__m128 z = _mm_load_ps( joint.ToFloatPtr() + 2 * 4 );
		minX = _mm_min_ps( minX, x );
		minY = _mm_min_ps( minY, y );
		minZ = _mm_min_ps( minZ, z );
		maxX = _mm_max_ps( maxX, x );
		maxY = _mm_max_ps( maxY, y );
		maxZ = _mm_max_ps( maxZ, z );
	}
	__m128 expand = _mm_splat_ps( _mm_load_ss( & maxJointVertDist ), 0 );
	minX = _mm_sub_ps( minX, expand );
	minY = _mm_sub_ps( minY, expand );
	minZ = _mm_sub_ps( minZ, expand );
	maxX = _mm_add_ps( maxX, expand );
	maxY = _mm_add_ps( maxY, expand );
	maxZ = _mm_add_ps( maxZ, expand );
	_mm_store_ss( bounds.ToFloatPtr() + 0, _mm_splat_ps( minX, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 1, _mm_splat_ps( minY, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 2, _mm_splat_ps( minZ, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 3, _mm_splat_ps( maxX, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );

}
Example #27
static void
sse2_test (void)
{
  float a = 1.0f;
  float b = 2.0f;
  float c = 3.0f;
  float r;

  __m128 v = _mm_set_ps(a, b, c, 0);
  
  v = (__m128)_mm_srli_si128((__m128i)v, 4);
  _mm_store_ss(&r, v);
  if (r != 3.0f)
    abort ();
}
Example #28
// ~~~~~~~~~~~~~~~ Task2
void mulVectorSse(MATRIX_TYPE** matrix, MATRIX_TYPE* vector, MATRIX_TYPE* result, size_t size) {
	for (size_t i = 0; i < size; i++) {
		__m128 localSum = _mm_setzero_ps();

		for (size_t j = 0; j < size; j += 4) {
			__m128 tempMatrix = _mm_load_ps(&matrix[i][j]);
			__m128 tempVector = _mm_load_ps(&vector[j]);
			localSum = _mm_add_ps(localSum, _mm_mul_ps(tempMatrix, tempVector));
		}

		localSum = _mm_hadd_ps(localSum, localSum);
		localSum = _mm_hadd_ps(localSum, localSum);
		_mm_store_ss(&result[i], localSum);
	}
}
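_mm_load_ps requires each matrix row and the vector to be 16-byte aligned, and the inner loop assumes size is a multiple of 4. A usage sketch under those assumptions (taking MATRIX_TYPE to be float):

size_t size = 8;
MATRIX_TYPE** matrix = (MATRIX_TYPE**)malloc(size * sizeof(MATRIX_TYPE*));
for (size_t i = 0; i < size; i++)
	matrix[i] = (MATRIX_TYPE*)_mm_malloc(size * sizeof(MATRIX_TYPE), 16);
MATRIX_TYPE* vector = (MATRIX_TYPE*)_mm_malloc(size * sizeof(MATRIX_TYPE), 16);
MATRIX_TYPE* result = (MATRIX_TYPE*)_mm_malloc(size * sizeof(MATRIX_TYPE), 16);
/* ... fill matrix and vector, call mulVectorSse(matrix, vector, result, size),
   then release the buffers with _mm_free ... */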
Example #29
void FastResampler_FirFilter2_C1_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	Q_UNUSED(channels);
	__m128 sum = _mm_setzero_ps();
	__m128 v_frac = _mm_set1_ps(frac);
	for(unsigned int i = 0; i < filter_length / 4; ++i) {
		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
		coef1 += 4; coef2 += 4;
		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
		__m128 v_input = _mm_loadu_ps(input);
		input += 4;
		sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
	}
	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
	__m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
	_mm_store_ss(output, sum3);
}
Example #30
float reduction_sum_sse(float *v, int n)
{
    int i;
    float sum;
    __m128 *v4 = (__m128 *)v;
    __m128 vsum = _mm_set1_ps(0.0f);

    for (i = 0; i < n / 4; i++)   /* note: elements past 4*(n/4) are ignored */
        vsum = _mm_add_ps(vsum, v4[i]);

    vsum = _mm_hadd_ps(vsum, vsum);
    vsum = _mm_hadd_ps(vsum, vsum);

    _mm_store_ss(&sum, vsum);

    return sum;
}
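As written, any elements past the last multiple of four are silently dropped, and v must be 16-byte aligned for the __m128 loads. A variant with unaligned loads and a scalar tail, as a sketch:

float reduction_sum_sse_tail(float *v, int n)
{
    int i;
    float sum;
    __m128 vsum = _mm_setzero_ps();

    for (i = 0; i + 4 <= n; i += 4)
        vsum = _mm_add_ps(vsum, _mm_loadu_ps(v + i));   /* unaligned-safe */

    vsum = _mm_hadd_ps(vsum, vsum);
    vsum = _mm_hadd_ps(vsum, vsum);
    _mm_store_ss(&sum, vsum);

    for (; i < n; i++)   /* scalar tail */
        sum += v[i];

    return sum;
}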