예제 #1
0
			inline void Multiply(const PVector4df &v, PVector4df &out) {
#ifdef __SSE_AVAIL__

				__m128 v1 = _mm_load_ps(v._Vec);
				__m128 m0 = _mm_load_ps(Row1);
				__m128 m1 = _mm_load_ps(Row2);
				__m128 m2 = _mm_load_ps(Row3);
				__m128 m3 = _mm_load_ps(Row4);

				m0 = _mm_mul_ps(m0, v1);   //(e11 * v.X) , (e21 * v.Y), (e31 * v.Z), (e41 * v.W)
				m1 = _mm_mul_ps(m1, v1);	//(e12 * v.X) , (e22 * v.Y), (e32 * v.Z), (e42 * v.W)
				m2 = _mm_mul_ps(m2, v1);	//(e13 * v.X) , (e23 * v.Y), (e33 * v.Z), (e43 * v.W)
				m3 = _mm_mul_ps(m3, v1);	//(e14 * v.X) , (e24 * v.Y), (e34 * v.Z), (e44 * v.W)

				m0 = _mm_hadd_ps(m0, m1);
				m0 = _mm_hadd_ps(m0, m0);
				m2 = _mm_hadd_ps(m2, m3);
				m2 = _mm_hadd_ps(m2, m2);
				_mm_store_ps(out._Vec, _mm_movehl_ps(m2, m0));
#else
				out.X = v.X * e11 + v.Y * e21 + v.Z * e31 + v.W * e41;
				out.Y = v.X * e12 + v.Y * e22 + v.Z * e32 + v.W * e42;
				out.Z = v.X * e13 + v.Y * e23 + v.Z * e33 + v.W * e43;
				out.W = v.X * e14 + v.Y * e24 + v.Z * e34 + v.W * e44;
#endif
			}
예제 #2
0
/**
 * 4x4 single-precision product written to target (16 floats).
 *
 * With T the output of transpose(left, ...), the result is
 *   target[4*m + n] = sum over k of right[4*m + k] * T[4*n + k].
 * right is read with aligned loads, so it must be 16-byte aligned.
 */
void shz::math::matrix<shz::math::f32, 4, 4>::mul(const shz::math::f32* left, const shz::math::f32*right, shz::math::f32* target) {
	// Transposed copy of left, so each group of four floats can be loaded contiguously.
	shz::math::f32 _ALIGNED(16) columns[shz::math::matrix<shz::math::f32, 4, 4>::size];
	matrix<shz::math::f32, 4, 4>::transpose(left, columns);

	// Possible future work: sum partials of two groups per _mm_hadd_ps, or use AVX.
	for (size_t m = 0; m < 4; ++m, right += 4) {
		const __m128 rhs = _mm_load_ps(right);

		for (size_t n = 0; n < 4; ++n) {
			const __m128 products = _mm_mul_ps(_mm_load_ps(columns + 4 * n), rhs);
			const __m128 pair_sums = _mm_hadd_ps(products, products);
			// Every lane of the second hadd holds the full sum; take lane 0.
			*target++ = _mm_cvtss_f32(_mm_hadd_ps(pair_sums, pair_sums));
		}
	}
}
예제 #3
0
			/**
			 * Returns v transformed by this matrix.
			 *
			 * Result component i is the dot product of v with the i-th row buffer
			 * (Row1 = e11,e21,e31,e41; Row2 = e12,e22,e32,e42; and so on).
			 */
			inline PVector4df operator*(const PVector4df &v) {

#ifdef __SSE_AVAIL__

				// Aligned loads: v._Vec and Row1..Row4 must be 16-byte aligned.
				const __m128 vec = _mm_load_ps(v._Vec);

				// Lane-wise products of each row buffer with v.
				const __m128 p0 = _mm_mul_ps(_mm_load_ps(Row1), vec);
				const __m128 p1 = _mm_mul_ps(_mm_load_ps(Row2), vec);
				const __m128 p2 = _mm_mul_ps(_mm_load_ps(Row3), vec);
				const __m128 p3 = _mm_mul_ps(_mm_load_ps(Row4), vec);

				// hadd tree: lane i of the result is (pi[0]+pi[1]) + (pi[2]+pi[3]).
				const __m128 sums01 = _mm_hadd_ps(p0, p1);
				const __m128 sums23 = _mm_hadd_ps(p2, p3);

				PVector4df result;
				_mm_store_ps(result._Vec, _mm_hadd_ps(sums01, sums23));
				return result;
#else

				return PVector4df(
					v.X * e11 + v.Y * e21 + v.Z * e31 + v.W * e41,
					v.X * e12 + v.Y * e22 + v.Z * e32 + v.W * e42,
					v.X * e13 + v.Y * e23 + v.Z * e33 + v.W * e43,
					v.X * e14 + v.Y * e24 + v.Z * e34 + v.W * e44);
#endif
			}
예제 #4
0
파일: Vector.hpp 프로젝트: yuriks/cga-t2
/** Dot product of the four lanes of a and b, replicated into every lane of the result. */
HW_FORCE_INLINE Vec<N> spreadDot(const Vec<N>& a, const Vec<N>& b) {
	const __m128 products = _mm_mul_ps(a.xmm, b.xmm);
	// First hadd sums neighbouring lanes, the second sums those pair sums.
	const __m128 pair_sums = _mm_hadd_ps(products, products);
	return Vec<N>(_mm_hadd_ps(pair_sums, pair_sums));
}
예제 #5
0
/*
 * In-place exponentiate-and-normalise over m consecutive runs of n floats
 * starting at y. For run r every element becomes fastexp(y - shift[r]) and the
 * run is then scaled by 1/sum of those values through sscal.
 *
 * Each run is processed from its end towards its start: a scalar peel until
 * y+remaining is 16-byte aligned, aligned 8-float SSE steps, then a scalar tail.
 */
static void 
mexsoftmax(float* y, float* shift, mwSize m, mwSize n) {
  for (; m > 0; --m, ++shift, y += n)
    {
      mwSize remaining = n;
      float total = 0.0f;
      declconst128(zero, 0.0f);

      /* Scalar peel until the end of the unprocessed range is 16-byte aligned. */
      while (remaining > 0 && ((unsigned long)(y+remaining) & 15) != 0)
        {
          --remaining;
          y[remaining] = fastexp(y[remaining] - *shift);
          total += y[remaining];
        }

      const __m128 vshift = _mm_load1_ps (shift);
      __m128 vtotal = zero;

      /* Two aligned vectors (8 floats) per step. */
      for (; remaining > 7; remaining -= 8) {
        float *upper = y + remaining - 4;
        float *lower = y + remaining - 8;
        const __m128 in_upper = _mm_sub_ps (_mm_load_ps (upper), vshift);
        const __m128 in_lower = _mm_sub_ps (_mm_load_ps (lower), vshift);
        const __m128 exp_upper = vfastexp(in_upper);
        const __m128 exp_lower = vfastexp(in_lower);
        _mm_store_ps (upper, exp_upper);
        vtotal = _mm_add_ps (vtotal, exp_upper);
        _mm_store_ps (lower, exp_lower);
        vtotal = _mm_add_ps (vtotal, exp_lower);
      }

      /* Fold the four partial sums into lane 0. */
      vtotal = _mm_hadd_ps (vtotal, vtotal);
      vtotal = _mm_hadd_ps (vtotal, vtotal);
      total += _mm_cvtss_f32 (vtotal);

      /* Fewer than 8 elements left at the front of the run. */
      while (remaining > 0) {
        --remaining;
        y[remaining] = fastexp(y[remaining] - *shift);
        total += y[remaining];
      }

      total = 1.0f / total;

      /* sscal takes its length and stride by pointer. */
      ptrdiff_t n_pdt = n;
      ptrdiff_t one_pdt = 1;
      sscal (&n_pdt, &total, y, &one_pdt);
    }
}
예제 #6
0
파일: Vector.hpp 프로젝트: yuriks/cga-t2
/** Dot product of the four lanes of a and b, returned as a scalar. */
HW_FORCE_INLINE float dot(const Vec<N>& a, const Vec<N>& b) {
	const __m128 products = _mm_mul_ps(a.xmm, b.xmm);
	const __m128 pair_sums = _mm_hadd_ps(products, products);
	// Lane 0 of the second hadd holds the full sum.
	return _mm_cvtss_f32(_mm_hadd_ps(pair_sums, pair_sums));
}
예제 #7
0
/*
 * Sums the four lanes of a into lane 0 of the result using SSE1 only.
 * Lanes 1..3 of the result are those of the intermediate a + movehl(a, a),
 * i.e. (a1+a3, a2+a2, a3+a3); only lane 0 holds the full sum.
 */
static inline __m128 horizontal_add(const __m128 a)
{
#if 0 //!! needs SSE3
    const __m128 pair_sums = _mm_hadd_ps(a, a);
    return _mm_hadd_ps(pair_sums, pair_sums);
#else
    const __m128 upper_half = _mm_movehl_ps(a, a);               // a2, a3, a2, a3
    const __m128 halves = _mm_add_ps(a, upper_half);             // a0+a2, a1+a3, ...
    const __m128 lane1 = _mm_shuffle_ps(halves, halves, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_add_ss(halves, lane1);                            // (a0+a2)+(a1+a3)
#endif
}
예제 #8
0
파일: Vector.hpp 프로젝트: yuriks/cga-t2
/**
 * Returns a scaled by the reciprocal square root of the sum of its four squared lanes.
 * Uses _mm_rsqrt_ps, which is an approximation rather than an exact 1/sqrt.
 */
HW_FORCE_INLINE Vec<N> normalized(const Vec<N>& a) {
	const __m128 squares = _mm_mul_ps(a.xmm, a.xmm);
	const __m128 pair_sums = _mm_hadd_ps(squares, squares);
	// Squared length in every lane, so the reciprocal root is broadcast as well.
	const __m128 inv_length = _mm_rsqrt_ps(_mm_hadd_ps(pair_sums, pair_sums));

	return Vec<N>(_mm_mul_ps(a.xmm, inv_length));
}
예제 #9
0
    /** Sum of the squares of the four quaternion components. */
    _XOINL float QuaternionSquareSum(const Quaternion& q)
    {
#if defined(XO_SSE)
        const __m128 squares = _mm_mul_ps(q.xmm, q.xmm);
        const __m128 pair_sums = _mm_hadd_ps(squares, squares);
        // Lane 0 of the second hadd holds the total.
        return _mm_cvtss_f32(_mm_hadd_ps(pair_sums, pair_sums));
#else
        return q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w;
#endif
    }
예제 #10
0
float length2() const {
	Vec3 a = *this;
	a.w = 0.0f;

	__m128 &D = a.m128;
	D = _mm_mul_ps(D, D);
	D = _mm_hadd_ps(D, D);
	D = _mm_hadd_ps(D, D);

	return a.x;
}
예제 #11
0
    /**
     * Returns the sum of the four lanes of rhs.
     * The SSE3 path adds (r0+r1)+(r2+r3); the fallback adds (r0+r2)+(r1+r3).
     */
    inline float hadd(const vector4f& rhs)
    {
#if SSE_INSTR_SET >= 3  // SSE3
        const __m128 pair_sums = _mm_hadd_ps(rhs, rhs);
        const __m128 total = _mm_hadd_ps(pair_sums, pair_sums);
#else
        // halves = (r0+r2, r1+r3, ...); shuffle mask 1 brings lane 1 down to lane 0.
        const __m128 halves = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs));
        const __m128 total = _mm_add_ss(halves, _mm_shuffle_ps(halves, halves, 1));
#endif
        return _mm_cvtss_f32(total);
    }
예제 #12
0
// Dot product of _x with the coefficients _q->h over _q->n samples, using
// SSE for the part that fills whole 4-float vectors and scalar code for the
// rest. The result is written to *_y.
//  _q : dot product object (reads _q->n and _q->h; _q->h is loaded aligned)
//  _x : input samples (loaded unaligned)
//  _y : output, receives the scalar result
void dotprod_rrrf_execute_mmx(dotprod_rrrf _q,
                              float *      _x,
                              float *      _y)
{
    // number of samples covered by whole vectors: 4*floor(n/4)
    const unsigned int n_vectorized = (_q->n >> 2) << 2;

    // four running partial sums, one per lane
    __m128 acc = _mm_setzero_ps();

    unsigned int i = 0;
    while (i < n_vectorized) {
        const __m128 samples = _mm_loadu_ps(&_x[i]);     // unaligned input
        const __m128 coeffs  = _mm_load_ps(&_q->h[i]);   // aligned coefficients
        acc = _mm_add_ps(acc, _mm_mul_ps(samples, coeffs));
        i += 4;
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

#if HAVE_PMMINTRIN_H
    // fold the four lanes into lane 0; the zero operand fills the upper lanes
    const __m128 zeros = _mm_setzero_ps();
    acc = _mm_hadd_ps(acc, zeros);
    acc = _mm_hadd_ps(acc, zeros);

    // unload the single low value
    _mm_store_ss(w, acc);
    float total = w[0];
#else
    // unload all four lanes and add them in scalar code
    _mm_store_ps(w, acc);
    float total = w[0] + w[1] + w[2] + w[3];
#endif

    // remaining samples that did not fill a vector
    for (; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    *_y = total;
}
예제 #13
0
파일: Vector.hpp 프로젝트: yuriks/cga-t2
/**
 * Approximate reciprocal length of a: _mm_rsqrt_ss applied to the sum of the
 * four squared lanes.
 */
HW_FORCE_INLINE float invLength(const Vec<N>& a) {
	const __m128 squares = _mm_mul_ps(a.xmm, a.xmm);
	const __m128 pair_sums = _mm_hadd_ps(squares, squares);
	const __m128 length_sq = _mm_hadd_ps(pair_sums, pair_sums);

	// Only lane 0 is needed, so the scalar form of rsqrt is enough.
	return _mm_cvtss_f32(_mm_rsqrt_ss(length_sq));
}
예제 #14
0
// Four-lane dot product of a and b, returned as a scalar.
// The instruction sequence is chosen at compile time by the SSE4 / SSE3 macros.
inline float dot_product(__m128 a, __m128 b)
{
#if defined(SSE4)
	// 0xff: multiply all four lanes and broadcast the sum to all four lanes.
	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xff));
#elif defined(SSE3)
	const __m128 products = _mm_mul_ps(a, b);
	const __m128 pair_sums = _mm_hadd_ps(products, products);
	return _mm_cvtss_f32(_mm_hadd_ps(pair_sums, pair_sums));
#else
	// No horizontal instruction available: spill the products and add in scalar code.
	float lanes[4];
	_mm_storeu_ps(lanes, _mm_mul_ps(a, b));
	return lanes[0] + lanes[1] + lanes[2] + lanes[3];
#endif
}
예제 #15
0
// ~~~~~~~~~~~~~~~ Task2
/**
 * Matrix-vector product: result[i] = sum over j of matrix[i][j] * vector[j].
 *
 * Whole groups of four elements are processed with SSE; when size is not a
 * multiple of 4 the remaining elements are added in scalar code, so nothing
 * past index size-1 is read.
 *
 * @param matrix  size row pointers, each to size floats (rows loaded aligned: 16-byte alignment required)
 * @param vector  size floats (loaded aligned: 16-byte alignment required)
 * @param result  receives size floats
 * @param size    dimension of the square matrix and of the vectors
 */
void mulVectorSse(MATRIX_TYPE** matrix, MATRIX_TYPE* vector, MATRIX_TYPE* result, size_t size) {
	// Number of leading elements that fill whole 4-float vectors.
	const size_t vecEnd = size - (size % 4);

	for (size_t i = 0; i < size; i++) {
		__m128 localSum = _mm_setzero_ps();

		for (size_t j = 0; j < vecEnd; j += 4) {
			__m128 tempMatix = _mm_load_ps(&matrix[i][j]);
			__m128 tempVector = _mm_load_ps(&vector[j]);
			localSum = _mm_add_ps(localSum, _mm_mul_ps(tempMatix, tempVector));
		}

		// Fold the four partial sums into lane 0.
		localSum = _mm_hadd_ps(localSum, localSum);
		localSum = _mm_hadd_ps(localSum, localSum);
		float rowSum = _mm_cvtss_f32(localSum);

		// Scalar tail for the last size % 4 elements.
		for (size_t j = vecEnd; j < size; j++) {
			rowSum += matrix[i][j] * vector[j];
		}

		result[i] = rowSum;
	}
}
예제 #16
0
		/** Four-lane dot product of xmm1 and xmm2, replicated into every lane of the result. */
		inline float32x4_t dot(const float32x4_t xmm1, const float32x4_t xmm2)
		{
			const float32x4_t products = _mm_mul_ps(xmm1, xmm2);
#if TWIST_ARCH & TWIST_ARCH_SSE3_BIT
			const float32x4_t pair_sums = _mm_hadd_ps(products, products);
			return _mm_hadd_ps(pair_sums, pair_sums);
#else // SSE3
			// Swap neighbouring lanes and add: (p0+p1, p0+p1, p2+p3, p2+p3).
			const float32x4_t neighbours = _mm_shuffle_ps(products, products, _MM_SHUFFLE(2, 3, 0, 1));
			const float32x4_t pair_sums = _mm_add_ps(products, neighbours);
			// Reverse the lanes and add so every lane holds the full sum.
			const float32x4_t reversed = _mm_shuffle_ps(pair_sums, pair_sums, _MM_SHUFFLE(0, 1, 2, 3));
			return _mm_add_ps(pair_sums, reversed);
#endif // SSE
		}
예제 #17
0
    /**
     * Horizontal sums of four vectors at once: lane i of the result is the
     * sum of the four lanes of row[i]. row must point to at least 4 vectors.
     */
    inline vector4f haddp(const vector4f* row)
    {
#if SSE_INSTR_SET >= 3  // SSE3
        const __m128 sums01 = _mm_hadd_ps(row[0], row[1]);
        const __m128 sums23 = _mm_hadd_ps(row[2], row[3]);
        return _mm_hadd_ps(sums01, sums23);
#else
        // Interleave pairs of rows, then add the low and high interleavings:
        // part01 = (a0+a2, b0+b2, a1+a3, b1+b3), likewise part23 for rows 2 and 3.
        const __m128 lo01 = _mm_unpacklo_ps(row[0], row[1]);
        const __m128 hi01 = _mm_unpackhi_ps(row[0], row[1]);
        const __m128 hi23 = _mm_unpackhi_ps(row[2], row[3]);
        const __m128 part01 = _mm_add_ps(lo01, hi01);
        const __m128 lo23 = _mm_unpacklo_ps(row[2], row[3]);
        const __m128 part23 = _mm_add_ps(lo23, hi23);
        // Bring the two halves of each partial sum into matching lanes and add.
        const __m128 odd_halves = _mm_movehl_ps(part23, part01);
        const __m128 even_halves = _mm_movelh_ps(part01, part23);
        return _mm_add_ps(even_halves, odd_halves);
#endif
    }
예제 #18
0
/*
 * Returns the sum of the n floats at v.
 *
 * Whole groups of four are accumulated with SSE and folded with two
 * horizontal adds; the last n % 4 elements are then added in scalar code
 * (they used to be dropped). Loads are unaligned, so v needs no particular
 * alignment. Returns 0 when n <= 0.
 */
float reduction_sum_sse(float *v, int n)
{
    /* Number of leading elements that fill whole 4-float vectors. */
    const int vec_end = n > 0 ? n - n % 4 : 0;
    __m128 vsum = _mm_setzero_ps();
    float sum;
    int i;

    for (i = 0; i < vec_end; i += 4)
        vsum = _mm_add_ps(vsum, _mm_loadu_ps(v + i));

    vsum = _mm_hadd_ps(vsum, vsum);
    vsum = _mm_hadd_ps(vsum, vsum);
    sum = _mm_cvtss_f32(vsum);

    /* Scalar tail. */
    for (; i < n; i++)
        sum += v[i];

    return sum;
}
예제 #19
0
// Dot product of two 3-component vectors: a.x*b.x + a.y*b.y + a.z*b.z.
core::F32_t core::Vector3::dot( core::Vector3 &a, core::Vector3 &b )
{
	// Copy into aligned 4-float buffers; the fourth lane is zero so it adds nothing.
	ALIGNED_16 core::F32_t lhs[] = {a.x, a.y, a.z, 0};
	ALIGNED_16 core::F32_t rhs[] = {b.x, b.y, b.z, 0};

	const __m128 products = _mm_mul_ps(_mm_load_ps(lhs), _mm_load_ps(rhs));
	const __m128 pair_sums = _mm_hadd_ps(products, products);

	// Lane 0 of the second hadd holds the full sum.
	return _mm_cvtss_f32(_mm_hadd_ps(pair_sums, pair_sums));
}
예제 #20
0
// Normalises this vector in place and returns it. w is cleared first so it
// does not contribute to the length.
const Vec3 &normalize() {
	w = 0.f;

	// Squared length broadcast to every lane.
	const __m128 squares = _mm_mul_ps(m128, m128);
	const __m128 pair_sums = _mm_hadd_ps(squares, squares);
	const __m128 length_sq = _mm_hadd_ps(pair_sums, pair_sums);

	// Hardware estimate refined by one Newton-Raphson step (idea from Intel's Embree):
	//   r' = 1.5*r + ((len2 * -0.5) * r) * (r*r)
	const __m128 estimate = _mm_rsqrt_ps(length_sq);
	const __m128 linear_term = _mm_mul_ps(_mm_set1_ps(1.5f), estimate);
	const __m128 half_neg = _mm_mul_ps(length_sq, _mm_set1_ps(-0.5f));
	const __m128 cubic_term = _mm_mul_ps(_mm_mul_ps(half_neg, estimate),
	                                     _mm_mul_ps(estimate, estimate));
	const __m128 inv_length = _mm_add_ps(linear_term, cubic_term);

	m128 = _mm_mul_ps(m128, inv_length);

	return *this;
}
예제 #21
0
    /**
     * Dot product of the n floats at a with the n floats at b.
     *
     * The first ROUND_DOWN(n, 4) elements are processed four at a time with
     * unaligned SSE loads; the remaining elements are added in scalar code.
     */
    float DotProductSIMD(const float* a, const float* b, std::size_t n) {
        const std::size_t vec_end = ROUND_DOWN(n, 4);
        std::size_t i = 0;

        __m128 sum =  _mm_setzero_ps();
        for(; i < vec_end; i += 4) {
            const __m128 x = _mm_loadu_ps(a + i);
            // Second operand comes from b (it was previously loaded from a by mistake).
            const __m128 y = _mm_loadu_ps(b + i);
            sum = _mm_add_ps(_mm_mul_ps(x, y), sum);
        }

        // Fold the four partial sums into lane 0.
        sum = _mm_hadd_ps(sum, sum);
        sum = _mm_hadd_ps(sum, sum);
        float product = _mm_cvtss_f32(sum);

        // Scalar tail.
        for(; i < n; i++) {
            product += a[i] * b[i];
        }
        return product;
    }
예제 #22
0
/*
 * Scores how well a scan matches the map when placed at the given position.
 *
 * Every scan point whose value is OBSTACLE is rotated by position.theta_degrees,
 * scaled to pixels and translated by the position; points that land inside the
 * map contribute their pixel value. Returns the sum of those pixel values
 * times 1024 divided by the number of contributing points, or -1 if no point
 * landed inside the map.
 */
int 
distance_scan_to_map(
    map_t *  map,
    scan_t * scan,
    position_t position)
{    
    int matched = 0;        /* number of scan points that fell inside the map */
    int64_t total = 0;      /* sum of the map values at those points */

    /* Rotation terms, already scaled from mm to pixels */
    const double theta = radians(position.theta_degrees);
    const double costheta = cos(theta) * map->scale_pixels_per_mm;
    const double sintheta = sin(theta) * map->scale_pixels_per_mm;

    /* Translation in pixels */
    const double pos_x_pix = position.x_mm * map->scale_pixels_per_mm;
    const double pos_y_pix = position.y_mm * map->scale_pixels_per_mm;

    /* _mm_set_ps lists lanes from highest to lowest */
    const __m128 sincos128 = _mm_set_ps (costheta, -sintheta, sintheta, costheta);
    const __m128 posxy128  = _mm_set_ps (pos_x_pix, pos_y_pix, pos_x_pix, pos_y_pix);

    for (int i = 0; i < scan->npoints; i++)
    {
        /* Only obstacle points take part in the score */
        if (scan->value[i] != OBSTACLE)
            continue;

        /* Lane 0 becomes sin*x + cos*y + pos_y, lane 1 becomes cos*x - sin*y + pos_x */
        __m128 xy128 = _mm_set_ps (scan->x_mm[i], scan->y_mm[i], scan->x_mm[i], scan->y_mm[i]);
        xy128 = _mm_mul_ps(sincos128, xy128);
        xy128 = _mm_hadd_ps(xy128, xy128);
        xy128 = _mm_add_ps(xy128, posxy128);

        /* Convert the two low lanes to integers and read them through the union */
        cs_pos_mmx_t pos;
        pos.mmx = _mm_cvtps_pi32(xy128);
        const int x = pos.pos.x;
        const int y = pos.pos.y;

        /* Empty the multimedia state to avoid floating-point errors later */
        _mm_empty();

        /* Skip points outside the map */
        if (x < 0 || x >= map->size_pixels || y < 0 || y >= map->size_pixels)
            continue;

        total += map->pixels[y * map->size_pixels + x];
        matched++;
    }

    if (!matched)
        return -1;

    return (int)(total * 1024 / matched);
}
예제 #23
0
v4f step_t::operator () (float t) const
{
  // Evaluate the cubic f(t) = c0 + c1*t + (c2 + c3*t)*t*t by Estrin's method
  // and select by interval. With T laid out as (t0 t1 t1 inf), the result is
  //   (0 0 0 0)  if t < t0,
  //   (f f f f)  if t0 <= t < t1,
  //   (1 1 1 1)  if t1 <= t < inf.
  const v4f ones = { 1.0f, 1.0f, 1.0f, 1.0f };
  const v4f coeffs = load4f (c);
  const v4f t4 = _mm_set1_ps (t);                    // t t t t
  const v4f one_t = _mm_unpacklo_ps (ones, t4);      // 1 t 1 t

  const v4f terms = coeffs * one_t;                  // c0 c1*t c2 c3*t
  // (c0+c1*t) (c2+c3*t)*t*t, repeated in the upper two lanes
  const v4f halves = _mm_hadd_ps (terms, terms) * one_t * one_t;
  const v4f poly = _mm_hadd_ps (halves, halves);     // f f f f
  const v4f candidates = _mm_unpacklo_ps (poly, ones); // f 1 f 1

  const v4f bounds = load4f (T);                     // t0  t1 t1 inf
  const v4f lower = _mm_movelh_ps (bounds, bounds);  // t0  t1 t0  t1
  const v4f upper = _mm_movehl_ps (bounds, bounds);  // t1 inf t1 inf

  // A lane is all-ones exactly when lower <= t < upper for that lane.
  const v4f in_range = _mm_and_ps (_mm_cmpge_ps (t4, lower), _mm_cmplt_ps (t4, upper));
  const v4f picked = _mm_and_ps (in_range, candidates); // f? 1? f? 1?

  // At most one lane of each pair is non-zero, so the pair sum is the selected value.
  return _mm_hadd_ps (picked, picked);
}
예제 #24
0
// ~~~~~~~~~~~~~~~ Task3
/**
 * Square matrix product in i-j-k order:
 * result[i][j] = dot product of first[i] with second[j], evaluated between
 * the two transpose(second, size) calls.
 *
 * second is passed to transpose() before the loops and again afterwards
 * (presumably an in-place transpose that the second call undoes - confirm
 * against transpose's definition). Whole groups of four elements are
 * processed with SSE; when size is not a multiple of 4 the remaining elements
 * are added in scalar code, so nothing past index size-1 is read.
 *
 * @param first   size row pointers, each to size floats (rows loaded aligned: 16-byte alignment required)
 * @param second  size row pointers, each to size floats (rows loaded aligned: 16-byte alignment required)
 * @param result  size row pointers, each receiving size floats
 * @param size    dimension of the square matrices
 */
void mulMatrix_IJKAlgSse(MATRIX_TYPE** first, MATRIX_TYPE** second, MATRIX_TYPE** result, size_t size) {
	// Number of leading elements that fill whole 4-float vectors.
	const size_t vecEnd = size - (size % 4);

	transpose(second, size);

	for (size_t i = 0; i < size; i++) {
		for (size_t j = 0; j < size; j++) {
			__m128 temp = _mm_setzero_ps();

			for (size_t k = 0; k < vecEnd; k += 4) {
				__m128 tempFirst = _mm_load_ps(&first[i][k]);
				__m128 tempSecond = _mm_load_ps(&second[j][k]);
				temp = _mm_add_ps(temp, _mm_mul_ps(tempFirst, tempSecond));
			}

			// Fold the four partial sums into lane 0.
			temp = _mm_hadd_ps(temp, temp);
			temp = _mm_hadd_ps(temp, temp);
			float cell = _mm_cvtss_f32(temp);

			// Scalar tail for the last size % 4 elements.
			for (size_t k = vecEnd; k < size; k++) {
				cell += first[i][k] * second[j][k];
			}

			result[i][j] = cell;
		}
	}

	transpose(second, size);
}
예제 #25
0
/**
 * 4x4 matrix product of two row-major float[16] matrices:
 * flt[4*i + j] = sum over k of m1[4*i + k] * m2[4*k + j].
 *
 * Returns a pointer to a function-local static buffer, so the result is
 * overwritten by the next call and the function is not reentrant.
 * Neither input needs any particular alignment.
 */
SSH float* ssh_mtx_mtx(const float* m1, const float* m2)
{
	static float flt[16];

	// Gather the columns of m2. _mm_setr_ps places its first argument in lane 0,
	// so lane k holds m2[4*k + j] and lines up with lane k of a row of m1.
	// (_mm_set_ps takes its arguments highest lane first and would reverse the column.)
	__m128 _m[4];
	for(ssh_u j = 0; j < 4; j++)
		_m[j] = _mm_setr_ps(m2[j], m2[4 + j], m2[8 + j], m2[12 + j]);

	for(ssh_u i = 0; i < 4; i++)
	{
		// Load the row once, before any element of output row i is written.
		const __m128 row = _mm_loadu_ps(&m1[i * 4]);

		for(ssh_u j = 0; j < 4; j++)
		{
			__m128 _tmp = _mm_mul_ps(row, _m[j]);
			_tmp = _mm_hadd_ps(_tmp, _tmp);
			// Lane 0 of the second hadd holds the full dot product.
			flt[i * 4 + j] = _mm_cvtss_f32(_mm_hadd_ps(_tmp, _tmp));
		}
	}
	return flt;
}
예제 #26
0
/**
 * Transform vector by rigid transform.
 *
 * In the SSE3 and scalar paths, component i of the result is the dot product
 * of mat[i] with vec. The code path is chosen at compile time by
 * SIMPLE_GL_USE_SSE4 / SIMPLE_GL_USE_SSE3.
 */
inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec)
{
#ifdef SIMPLE_GL_USE_SSE4
    // NOTE(review): the 0xEE dot-product mask and the _MM_SHUFFLE-built blend
    // masks are kept exactly as they were; confirm they assemble the four row
    // dot products as intended.
    __m128 res;
    __m128 dotProd;

    res      = _mm_dp_ps(mat[0].m128, vec.m128, 0xEE);
    dotProd  = _mm_dp_ps(mat[1].m128, vec.m128, 0xEE);
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );
    dotProd  = _mm_dp_ps(mat[2].m128, vec.m128, 0xEE);
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );
    dotProd  = _mm_dp_ps(mat[3].m128, vec.m128, 0xEE);
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 0, 1) );

    return Matrix<float, 4, 1>(res);
#elif defined(SIMPLE_GL_USE_SSE3)
    __m128 res;

    // Each dotProdN ends up with the full dot product in all four lanes.
    __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);

    __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);

    __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);

    __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);

    // (d0, d1, d0, d1) and (d2, d3, d2, d3) combine to (d0, d1, d2, d3).
    __m128 vec01    = _mm_unpacklo_ps(dotProd0, dotProd1);
    __m128 vec23    = _mm_unpackhi_ps(dotProd2, dotProd3);
    res             = _mm_movelh_ps(vec01, vec23);

    return Matrix<float, 4, 1>(res);
#else // SSE2
    // TODO: Think about good sse optimization
    // The sums read the input vec; they previously read the uninitialised res.
    Matrix<float, 4, 1> res;
    res[0] = mat[0][0] * vec[0] + mat[0][1] * vec[1] + mat[0][2] * vec[2] + mat[0][3] * vec[3];
    res[1] = mat[1][0] * vec[0] + mat[1][1] * vec[1] + mat[1][2] * vec[2] + mat[1][3] * vec[3];
    res[2] = mat[2][0] * vec[0] + mat[2][1] * vec[1] + mat[2][2] * vec[2] + mat[2][3] * vec[3];
    res[3] = mat[3][0] * vec[0] + mat[3][1] * vec[1] + mat[3][2] * vec[2] + mat[3][3] * vec[3];
    return res;
#endif
}
예제 #27
0
파일: Play.cpp 프로젝트: zhangce/nn
/*
 * Returns the sum of the _n floats at a.
 *
 * The largest multiple-of-4 prefix is accumulated with aligned SSE loads and
 * the remaining 0..3 elements are added in scalar code. a must be 16-byte
 * aligned (asserted). Returns 0 when _n <= 0.
 */
float vsum(const float *a, int _n)
{
    /* Largest multiple of 4 not exceeding _n (the vector width is 4 floats). */
    const int n = _n > 0 ? _n - _n % 4 : 0;
    __m128 acc = _mm_setzero_ps();
    float sum;

    assert((n & 3) == 0);
    assert(((uintptr_t)a & 15) == 0);

    for (int i = 0; i < n; i += 4)
    {
        acc = _mm_add_ps(acc, _mm_load_ps(&a[i]));
    }

    /* Fold the four partial sums into lane 0. */
    acc = _mm_hadd_ps(acc, acc);
    acc = _mm_hadd_ps(acc, acc);
    sum = _mm_cvtss_f32(acc);

    /* Scalar tail. */
    for (int i = n; i < _n; i++)
    {
        sum += a[i];
    }

    return sum;
}
예제 #28
0
/*
 * Applies the 4x4 matrix in_A (16 floats, four groups of four) to n packed
 * 4-float vectors: component r of output vector i is the dot product of
 * group r of in_A with input vector i.
 *
 * All three pointers are accessed with aligned loads/stores and must be
 * 16-byte aligned. out_y is declared const but is written through.
 */
void lx_matmul_sse3_aligned(const float *in_A,
                            const float *in_x,
                            const float *out_y,
                            LXInteger n)
{
    const __m128 row0 = _mm_load_ps(in_A);
    const __m128 row1 = _mm_load_ps(in_A + 4);
    const __m128 row2 = _mm_load_ps(in_A + 8);
    const __m128 row3 = _mm_load_ps(in_A + 12);

    const float *src = in_x;
    float *dst = (float *)out_y;

    for (LXInteger i = 0; i < n; i++, src += 4, dst += 4) {
        const __m128 x = _mm_load_ps(src);

        /* hadd tree: lane r of the final vector is the lane sum of rowr * x. */
        const __m128 sums01 = _mm_hadd_ps(_mm_mul_ps(row0, x), _mm_mul_ps(row1, x));
        const __m128 sums23 = _mm_hadd_ps(_mm_mul_ps(row2, x), _mm_mul_ps(row3, x));

        _mm_store_ps(dst, _mm_hadd_ps(sums01, sums23));
    }
}
예제 #29
0
/*
 * SSE matrix-vector product over the file-level buffers:
 * vec_c[i] = sum over j of mat_a[MINDEX(i, j)] * vec_b[j] for 0 <= i, j < SIZE.
 *
 * SIZE must be a multiple of 4 (asserted). Both inputs are read with aligned
 * loads, so vec_b + j and mat_a + MINDEX(i, j) must be 16-byte aligned for
 * every j that is a multiple of 4.
 */
static void
matvec_sse()
{
        /* Assume that the data size is an even multiple of the 128 bit
         * SSE vectors (i.e. 4 floats) */
        assert(!(SIZE & 0x3));

        const __m128 zero=_mm_setzero_ps();
        for(int i=0;i<SIZE;++i){
            /* Four running partial sums for row i. */
            __m128 acc=_mm_setzero_ps();
            for(int j=0;j<SIZE;j+=4){
                /* _mm_load_ps takes a pointer to float, not to __m128. */
                const __m128 mm_vec_b=_mm_load_ps((const float *)(vec_b+j));
                const __m128 mm_matr=_mm_load_ps((const float *)(mat_a+MINDEX(i,j)));
                acc=_mm_add_ps(acc,_mm_mul_ps(mm_vec_b,mm_matr));
            }
            /* Two horizontal adds against zero leave the full sum in lane 0. */
            const __m128 res=_mm_hadd_ps(_mm_hadd_ps(acc,zero),zero);
            vec_c[i]=_mm_cvtss_f32(res);
        }

}
예제 #30
0
/*
 * Prints the result of haddps(x, y) next to the result of the hardware
 * _mm_hadd_ps(x, y) for the same two vectors so they can be compared by eye.
 */
int main(){
  /* Union used to read the individual floats of a vector. */
  typedef union{
    __m128 m128;
    float flt[4];
  } m128f;
  __m128 x = {1.0,2.0,3.0,4.0};
  __m128 y = {10.0,20.0,30.0,40.0};
  m128f s,h;
  s.m128=haddps(x,y);
  h.m128=_mm_hadd_ps(x,y);
  printf("Software hadd: %f %f %f %f\n",s.flt[0],s.flt[1],s.flt[2],s.flt[3]);
  printf("Hardware hadd: %f %f %f %f\n",h.flt[0],h.flt[1],h.flt[2],h.flt[3]);
  /* A function returning int must return a value. */
  return 0;
}