コード例 #1
0
ファイル: pitch_sse.c プロジェクト: 110Percent/BeeBot
/* Computes two inner products that share the operand x in a single pass:
 *   *xy1 = sum(x[i] * y01[i])  and  *xy2 = sum(x[i] * y02[i])  for i in [0, N).
 * Samples are consumed four at a time with unaligned SSE loads; whatever is
 * left over (fewer than four samples) is accumulated afterwards with
 * MAC16_16. */
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int n = 0;
   __m128 acc1 = _mm_setzero_ps();
   __m128 acc2 = _mm_setzero_ps();
   __m128 pair;

   /* Vector part: whole groups of four samples. */
   while (n < N-3)
   {
      const __m128 xv = _mm_loadu_ps(&x[n]);
      acc1 = _mm_add_ps(acc1, _mm_mul_ps(xv, _mm_loadu_ps(&y01[n])));
      acc2 = _mm_add_ps(acc2, _mm_mul_ps(xv, _mm_loadu_ps(&y02[n])));
      n += 4;
   }

   /* Reduce each accumulator to one scalar: add the upper half onto the
    * lower half, then add lane 1 onto lane 0. */
   pair = _mm_add_ps(acc1, _mm_movehl_ps(acc1, acc1));
   _mm_store_ss(xy1, _mm_add_ss(pair, _mm_shuffle_ps(pair, pair, 0x55)));
   pair = _mm_add_ps(acc2, _mm_movehl_ps(acc2, acc2));
   _mm_store_ss(xy2, _mm_add_ss(pair, _mm_shuffle_ps(pair, pair, 0x55)));

   /* Scalar tail: samples that did not fill a group of four. */
   while (n < N)
   {
      *xy1 = MAC16_16(*xy1, x[n], y01[n]);
      *xy2 = MAC16_16(*xy2, x[n], y02[n]);
      n++;
   }
}
コード例 #2
0
ファイル: util.cpp プロジェクト: shewu/raytracer
/* Computes the 4x4 matrix product ret = mat1 * mat2.
 *
 * mat2 is transposed into private scratch rows so that every entry of the
 * result is the dot product (computed by mul_asm) of one row of mat1 with one
 * row of the transposed copy.
 *
 * Differences from the previous implementation:
 *  - mat2 is no longer transposed in place, so the caller's matrix is left
 *    unmodified;
 *  - the product is assembled in a temporary and copied out at the end, so
 *    ret may be the same matrix as mat1 or mat2;
 *  - mat2 is read with unaligned loads, so it need not be 16-byte aligned.
 *
 * NOTE(review): the rows of mat1 are still handed to mul_asm as-is; whatever
 * alignment mul_asm requires of its arguments still applies to mat1 -- confirm
 * against its definition. The scratch rows passed as the second argument are
 * __m128 objects. */
void mulMatrix1(Matrix4x4 ret, Matrix4x4 mat1, Matrix4x4 mat2)
{
    /* 1. Transpose mat2 in registers. */
    const __m128 row0 = _mm_loadu_ps( mat2[0] );
    const __m128 row1 = _mm_loadu_ps( mat2[1] );
    const __m128 row2 = _mm_loadu_ps( mat2[2] );
    const __m128 row3 = _mm_loadu_ps( mat2[3] );

    /* Interleave the low and high halves of row pairs ... */
    const __m128 lo01 = _mm_unpacklo_ps( row0, row1 );
    const __m128 lo23 = _mm_unpacklo_ps( row2, row3 );
    const __m128 hi01 = _mm_unpackhi_ps( row0, row1 );
    const __m128 hi23 = _mm_unpackhi_ps( row2, row3 );

    /* ... then combine halves so that each register holds one column of mat2. */
    __m128 columns[4];
    columns[0] = _mm_movelh_ps( lo01, lo23 );
    columns[1] = _mm_movehl_ps( lo23, lo01 );
    columns[2] = _mm_movelh_ps( hi01, hi23 );
    columns[3] = _mm_movehl_ps( hi23, hi01 );

    /* 2. Sixteen dot products: row i of mat1 with column j of mat2. */
    float product[4][4];
    for (int i = 0; i < 4; ++i)
    {
        for (int j = 0; j < 4; ++j)
        {
            product[i][j] = mul_asm(mat1[i], reinterpret_cast<float *>(&columns[j]));
        }
    }

    /* Publish the result only after every input has been read. */
    for (int i = 0; i < 4; ++i)
    {
        for (int j = 0; j < 4; ++j)
        {
            ret[i][j] = product[i][j];
        }
    }
}
コード例 #3
0
/// Transform this box using the specified transform matrix.
///
/// @param[in] rTransform  Matrix by which to transform.
void Helium::Simd::AaBox::TransformBy( const Matrix44& rTransform )
{
    // Expand each corner position.
    Register minVec = m_minimum.GetSimdVector();
    Register maxVec = m_maximum.GetSimdVector();

    Vector3Soa corners0;
    corners0.m_x = _mm_shuffle_ps( minVec, minVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners0.m_y = _mm_shuffle_ps( minVec, maxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    corners0.m_z = _mm_unpackhi_ps( minVec, maxVec );
    corners0.m_z = _mm_movelh_ps( corners0.m_z, corners0.m_z );

    Vector3Soa corners1;
    corners1.m_x = _mm_shuffle_ps( maxVec, maxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners1.m_y = corners0.m_y;
    corners1.m_z = corners0.m_z;

    // Transform all corners by the provided transformation matrix.
    Matrix44Soa transformSplat( rTransform );
    transformSplat.TransformPoint( corners0, corners0 );
    transformSplat.TransformPoint( corners1, corners1 );

    // Compute the minimum.
    Register minX = Simd::MinF32( corners0.m_x, corners1.m_x );
    Register minY = Simd::MinF32( corners0.m_y, corners1.m_y );
    Register minXYLo = _mm_unpacklo_ps( minX, minY );
    Register minXYHi = _mm_unpackhi_ps( minX, minY );
    Register minXY = Simd::MinF32( minXYLo, minXYHi );

    Register minZ = Simd::MinF32( corners0.m_z, corners1.m_z );
    Register minZLo = _mm_unpacklo_ps( minZ, minZ );
    Register minZHi = _mm_unpackhi_ps( minZ, minZ );
    minZ = Simd::MinF32( minZLo, minZHi );

    Register minLo = _mm_movelh_ps( minXY, minZ );
    Register minHi = _mm_movehl_ps( minZ, minXY );

    m_minimum.SetSimdVector( Simd::MinF32( minLo, minHi ) );

    // Compute the maximum.
    Register maxX = Simd::MaxF32( corners0.m_x, corners1.m_x );
    Register maxY = Simd::MaxF32( corners0.m_y, corners1.m_y );
    Register maxXYLo = _mm_unpacklo_ps( maxX, maxY );
    Register maxXYHi = _mm_unpackhi_ps( maxX, maxY );
    Register maxXY = Simd::MaxF32( maxXYLo, maxXYHi );

    Register maxZ = Simd::MaxF32( corners0.m_z, corners1.m_z );
    Register maxZLo = _mm_unpacklo_ps( maxZ, maxZ );
    Register maxZHi = _mm_unpackhi_ps( maxZ, maxZ );
    maxZ = Simd::MaxF32( maxZLo, maxZHi );

    Register maxLo = _mm_movelh_ps( maxXY, maxZ );
    Register maxHi = _mm_movehl_ps( maxZ, maxXY );

    m_maximum.SetSimdVector( Simd::MaxF32( maxLo, maxHi ) );
}
コード例 #4
0
ファイル: noise.cpp プロジェクト: entropian/CRaytracer
float calcCubicNoiseValSSE(const vec3 p)    
{
    int ix, iy, iz;
    __m128 fx, fy;
    float fz;

    ix = (int)floor(p[0]);
    fx = _mm_set_ps1(p[0] - ix);    
    iy = (int)floor(p[1]);
    fy = _mm_set_ps1(p[1] - iy);
    iz = (int)floor(p[2]);
    fz = p[2] - iz;

    uSIMD k0, k1, k2, k3;
    __m128 out0, out1, out2, out3;

    for(int k = -1; k <= 2; k++)
    {
        for(int j = -1; j <= 2; j++)
        {
            k0.a[j+1] = getLatticeVal(ix-1, iy + j, iz + k);
            k1.a[j+1] = getLatticeVal(ix+0, iy + j, iz + k);
            k2.a[j+1] = getLatticeVal(ix+1, iy + j, iz + k);
            k3.a[j+1] = getLatticeVal(ix+2, iy + j, iz + k);            
        }
        switch(k)
        {
        case -1:
            out0 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
            break;
        case 0:
            out1 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));            
            break;
        case 1:
            out2 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));            
            break;
        case 2:
            out3 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));            
            break;
        }
    }
    // Transpose the matrix formed by the out vectors.
    __m128 t1 = _mm_movelh_ps(out1, out0);
    __m128 t2 = _mm_movehl_ps(out0, out1);
    __m128 t3 = _mm_movelh_ps(out3, out2);
    __m128 t4 = _mm_movehl_ps(out2, out3);
    k0.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(0, 2, 0, 2));
    k1.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 3, 1, 3));
    k2.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(0, 2, 0, 2));
    k3.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 3, 1, 3));                    

    uSIMD final_knots;
    final_knots.m  = fourKnotSplineSSE(&fy, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
    return clamp(fourKnotSpline(fz, final_knots.a), -1.0f, 1.0f);
}
コード例 #5
0
ファイル: sse-3.c プロジェクト: Scorpiion/Renux_cross_gcc
/* Checks both operand orders of _mm_movehl_ps against a zero vector.
   The result's low two lanes come from the upper half of the second operand
   and its high two lanes from the upper half of the first operand.  */
static inline
void
foo (__m128 *x)
{
  const __m128 zero = _mm_setzero_ps ();
  /* Upper half of *x moved down, zeros above.  */
  const __m128 x_high_then_zero = _mm_movehl_ps (zero, *x);
  /* Zeros below, upper half of *x kept in place.  */
  const __m128 zero_then_x_high = _mm_movehl_ps (*x, zero);

  check (*x, 9, 1, 2, -3);
  check (x_high_then_zero, 2, -3, 0, 0);
  check (zero_then_x_high, 0, 0, 2, -3);
}
コード例 #6
0
ファイル: dridkernels.c プロジェクト: huynhqu1/mdtraj
/* Accumulates the reciprocal distance from atom `index` to every atom listed
 * in `partners` and writes three summary values to `moments`:
 *   moments[0] = moments_mean,
 *   moments[1] = sqrt(moments_second),
 *   moments[2] = cbrt(moments_third).
 * `coords` is indexed as three consecutive floats per atom. Always returns 1. */
int
drid_moments(float* coords, int32_t index, int32_t* partners, int32_t n_partners, double* moments)
{
    int32_t k;
    moments_t acc;
    __m128 center, other, diff, squared, total;

    moments_clear(&acc);
    center = load_float3(&coords[3 * index]);

    for (k = 0; k < n_partners; k++) {
        other = load_float3(&coords[3 * partners[k]]);
        diff = _mm_sub_ps(center, other);       /* center - other        */
        squared = _mm_mul_ps(diff, diff);       /* (center - other)**2   */

        /* Horizontal add of all four lanes in two steps. Per the original
           author's note this relies on lane 3 of both loaded vectors being 0,
           so that lane 3 of `squared` contributes nothing to the sum. */
        total = _mm_add_ps(squared, _mm_movehl_ps(squared, squared));
        total = _mm_add_ss(total, _mm_shuffle_ps(total, total, 1));

        /* The square root is taken in double precision on the scalar; the
           original author found _mm_rsqrt_ps not accurate enough for the
           tests. */
        moments_push(&acc, 1.0 / sqrt((double) _mm_cvtss_f32(total)));
    }

    moments[0] = moments_mean(&acc);
    moments[1] = sqrt(moments_second(&acc));
    moments[2] = cbrt(moments_third(&acc));

    return 1;
}
コード例 #7
0
// Evaluates a folded (symmetric) FIR filter:
//
//   result = sum over k in [0, len) of fir_kernel[k] * (queue_head[k] + queue_tail[-k])
//
// queue_head walks forwards from the first sample and queue_tail walks
// backwards from the last sample, so each kernel coefficient is applied to a
// mirrored pair of samples.
//
// fir_kernel must be 16-byte aligned (it is read with _mm_load_ps); the queue
// pointers may be unaligned. Only whole groups of four taps are processed:
// len is rounded down to a multiple of 4.
//
// Portability: uses the standard `inline` keyword and _mm_cvtss_f32 instead of
// the MSVC-only `_inline` keyword and `__m128::m128_f32` member.
inline float process_folded_fir_sse2(const float *fir_kernel, const float *queue_head, const float *queue_tail, int len)
{
	__m128 acc = _mm_setzero_ps();

	// Point at the start of the last group of four samples.
	queue_tail -= 3;

	for (int groups = len >> 2; groups > 0; --groups)
	{
		const __m128 head = _mm_loadu_ps(queue_head);
		const __m128 tail = _mm_loadu_ps(queue_tail);
		const __m128 kern = _mm_load_ps(fir_kernel);

		// Reverse the tail group so that it mirrors the head group.
		const __m128 mirrored = _mm_shuffle_ps(tail, tail, 0x1b);
		const __m128 folded = _mm_add_ps(mirrored, head);
		acc = _mm_add_ps(acc, _mm_mul_ps(folded, kern));

		queue_head += 4;
		queue_tail -= 4;
		fir_kernel += 4;
	}

	// Horizontal sum: upper half onto lower half, then lane 1 onto lane 0.
	const __m128 halves = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
	const __m128 sum = _mm_add_ss(halves, _mm_shuffle_ps(halves, halves, 1));

	return _mm_cvtss_f32(sum);
}
コード例 #8
0
ファイル: PMatrix4x4.hpp プロジェクト: centicosm/PSTD
			inline void Multiply(const PVector4df &v, PVector4df &out) {
#ifdef __SSE_AVAIL__

				__m128 v1 = _mm_load_ps(v._Vec);
				__m128 m0 = _mm_load_ps(Row1);
				__m128 m1 = _mm_load_ps(Row2);
				__m128 m2 = _mm_load_ps(Row3);
				__m128 m3 = _mm_load_ps(Row4);

				m0 = _mm_mul_ps(m0, v1);   //(e11 * v.X) , (e21 * v.Y), (e31 * v.Z), (e41 * v.W)
				m1 = _mm_mul_ps(m1, v1);	//(e12 * v.X) , (e22 * v.Y), (e32 * v.Z), (e42 * v.W)
				m2 = _mm_mul_ps(m2, v1);	//(e13 * v.X) , (e23 * v.Y), (e33 * v.Z), (e43 * v.W)
				m3 = _mm_mul_ps(m3, v1);	//(e14 * v.X) , (e24 * v.Y), (e34 * v.Z), (e44 * v.W)

				m0 = _mm_hadd_ps(m0, m1);
				m0 = _mm_hadd_ps(m0, m0);
				m2 = _mm_hadd_ps(m2, m3);
				m2 = _mm_hadd_ps(m2, m2);
				_mm_store_ps(out._Vec, _mm_movehl_ps(m2, m0));
#else
				out.X = v.X * e11 + v.Y * e21 + v.Z * e31 + v.W * e41;
				out.Y = v.X * e12 + v.Y * e22 + v.Z * e32 + v.W * e42;
				out.Z = v.X * e13 + v.Y * e23 + v.Z * e33 + v.W * e43;
				out.W = v.X * e14 + v.Y * e24 + v.Z * e34 + v.W * e44;
#endif
			}
コード例 #9
0
// Rank-one update of the packed matrix: adds alpha * u * u^T to the stored
// 2x2 blocks.
//
// `data` is read and written as six groups of four floats, each group being
// one 2x2 block of the outer product u * u^T (u = (u1, ..., u6)):
//   data +  0: rows 1-2, cols 1-2      data + 12: rows 3-4, cols 3-4
//   data +  4: rows 1-2, cols 3-4      data + 16: rows 3-4, cols 5-6
//   data +  8: rows 1-2, cols 5-6      data + 20: rows 5-6, cols 5-6
// `data` is accessed with aligned loads/stores and must be 16-byte aligned.
void OptimizedSelfAdjointMatrix6x6f::rankUpdate(const Eigen::Matrix<float, 6, 1>& u, const float& alpha)
{
  __m128 s = _mm_set1_ps(alpha);
  __m128 v1234 = _mm_loadu_ps(u.data());
  // u has only six elements, so a full four-float load at u.data() + 4 would
  // read eight bytes past the end of the vector. Load exactly the last two
  // floats into the low half instead; the high half is never used below.
  __m128 v56xx = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(u.data() + 4));

  __m128 v1212 = _mm_movelh_ps(v1234, v1234);
  __m128 v3434 = _mm_movehl_ps(v1234, v1234);
  __m128 v5656 = _mm_movelh_ps(v56xx, v56xx);

  // alpha * (u1, u1, u2, u2): left factor for block rows 1-2.
  __m128 v1122 = _mm_mul_ps(s, _mm_unpacklo_ps(v1212, v1212));

  _mm_store_ps(data + 0, _mm_add_ps(_mm_load_ps(data + 0), _mm_mul_ps(v1122, v1212)));
  _mm_store_ps(data + 4, _mm_add_ps(_mm_load_ps(data + 4), _mm_mul_ps(v1122, v3434)));
  _mm_store_ps(data + 8, _mm_add_ps(_mm_load_ps(data + 8), _mm_mul_ps(v1122, v5656)));

  // alpha * (u3, u3, u4, u4): left factor for block rows 3-4.
  __m128 v3344 = _mm_mul_ps(s, _mm_unpacklo_ps(v3434, v3434));

  _mm_store_ps(data + 12, _mm_add_ps(_mm_load_ps(data + 12), _mm_mul_ps(v3344, v3434)));
  _mm_store_ps(data + 16, _mm_add_ps(_mm_load_ps(data + 16), _mm_mul_ps(v3344, v5656)));

  // alpha * (u5, u5, u6, u6): left factor for block rows 5-6.
  __m128 v5566 = _mm_mul_ps(s, _mm_unpacklo_ps(v5656, v5656));

  _mm_store_ps(data + 20, _mm_add_ps(_mm_load_ps(data + 20), _mm_mul_ps(v5566, v5656)));
}
コード例 #10
0
ファイル: PMatrix4x4.hpp プロジェクト: centicosm/PSTD
			// Returns the product of this matrix with the vector v:
			//   (dot(Row1, v), dot(Row2, v), dot(Row3, v), dot(Row4, v)).
			inline PVector4df operator*(const PVector4df &v) {
		
#ifdef __SSE_AVAIL__

				// Lane-wise products of every row with v (aligned loads).
				const __m128 vec = _mm_load_ps(v._Vec);
				const __m128 prod1 = _mm_mul_ps(_mm_load_ps(Row1), vec);	// e11*X, e21*Y, e31*Z, e41*W
				const __m128 prod2 = _mm_mul_ps(_mm_load_ps(Row2), vec);	// e12*X, e22*Y, e32*Z, e42*W
				const __m128 prod3 = _mm_mul_ps(_mm_load_ps(Row3), vec);	// e13*X, e23*Y, e33*Z, e43*W
				const __m128 prod4 = _mm_mul_ps(_mm_load_ps(Row4), vec);	// e14*X, e24*Y, e34*Z, e44*W

				// Two horizontal adds per row pair give (sum1, sum2, sum1, sum2)
				// and (sum3, sum4, sum3, sum4).
				__m128 sums12 = _mm_hadd_ps(prod1, prod2);
				sums12 = _mm_hadd_ps(sums12, sums12);
				__m128 sums34 = _mm_hadd_ps(prod3, prod4);
				sums34 = _mm_hadd_ps(sums34, sums34);

				// (sum1, sum2, sum3, sum4)
				PVector4df result;
				_mm_store_ps(result._Vec, _mm_movehl_ps(sums34, sums12));
				return result;
#else

				return PVector4df(
					v.X * e11 + v.Y * e21 + v.Z * e31 + v.W * e41,
					v.X * e12 + v.Y * e22 + v.Z * e32 + v.W * e42,
					v.X * e13 + v.Y * e23 + v.Z * e33 + v.W * e43,
					v.X * e14 + v.Y * e24 + v.Z * e34 + v.W * e44);
#endif
			}
コード例 #11
0
// Returns nonzero when the bitwise OR of all four lanes of m, interpreted as
// a float, compares not-equal to 0.0f (as reported by _mm_ucomineq_ss), and
// 0 otherwise.
int any_ps(__m128 m)
{
  // OR each lane with its neighbour: lanes 0|1 and 2|3.
  const __m128 pairs = _mm_or_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(2,3,0,1)));
  // OR the upper pair onto the lower pair: lane 0 now holds 0|1|2|3.
  const __m128 all = _mm_or_ps(pairs, _mm_movehl_ps(pairs, pairs));
  return _mm_ucomineq_ss(all, _mm_setzero_ps());
}
コード例 #12
0
ファイル: SimdFrustumSse.cpp プロジェクト: euler0/Helium
/// Compute the corners of this view frustum.
///
/// The frustum has eight corners when a far clip plane exists and four corners when an infinite far clip plane is
/// used.
///
/// This assumes that the frustum is properly defined, with each combination of neighboring clip planes
/// intersecting at a valid point.
///
/// @param[out] pCorners  Array in which the frustum corners will be stored.  This must point to a region of memory
///                       large enough for four points if this frustum has an infinite far clip plane, or eight
///                       points if this frustum has a normal far clip plane.
///
/// @return  Number of corners computed (either four or eight).
size_t Helium::Simd::Frustum::ComputeCorners( Vector3* pCorners ) const
{
    HELIUM_ASSERT( pCorners );

    // Corners are first computed in struct-of-arrays form (all x, all y, all z).
    HELIUM_SIMD_ALIGN_PRE float32_t cornersX[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersY[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersZ[ 8 ] HELIUM_SIMD_ALIGN_POST;

    size_t cornerCount = ComputeCornersSoa( cornersX, cornersY, cornersZ );
    HELIUM_ASSERT( cornerCount == 4 || cornerCount == 8 );

    // The first group of four corners is always written; the second group only when eight corners were produced.
    const size_t groupCount = ( cornerCount == 8 ) ? 2 : 1;
    for( size_t groupIndex = 0; groupIndex < groupCount; ++groupIndex )
    {
        const size_t base = groupIndex * 4;

        Helium::Simd::Register xVec = Helium::Simd::LoadAligned( cornersX + base );
        Helium::Simd::Register yVec = Helium::Simd::LoadAligned( cornersY + base );
        Helium::Simd::Register zVec = Helium::Simd::LoadAligned( cornersZ + base );

        // Interleave into (x0 y0 x1 y1), (x2 y2 x3 y3), (z0 z0 z1 z1), (z2 z2 z3 z3)...
        Helium::Simd::Register xyLow = _mm_unpacklo_ps( xVec, yVec );
        Helium::Simd::Register xyHigh = _mm_unpackhi_ps( xVec, yVec );
        Helium::Simd::Register zzLow = _mm_unpacklo_ps( zVec, zVec );
        Helium::Simd::Register zzHigh = _mm_unpackhi_ps( zVec, zVec );

        // ...and combine halves into one (x y z z) vector per corner.
        pCorners[ base + 0 ].SetSimdVector( _mm_movelh_ps( xyLow, zzLow ) );
        pCorners[ base + 1 ].SetSimdVector( _mm_movehl_ps( zzLow, xyLow ) );
        pCorners[ base + 2 ].SetSimdVector( _mm_movelh_ps( xyHigh, zzHigh ) );
        pCorners[ base + 3 ].SetSimdVector( _mm_movehl_ps( zzHigh, xyHigh ) );
    }

    return cornerCount;
}
コード例 #13
0
/* Test body: builds two vectors with setupa/setupb, replaces b with
   _mm_movehl_ps (a, b) and hands both to foo.
   NOTE(review): the return type is not visible in this excerpt; the
   definition appears to start on an earlier line.  */
sse2_test(void) {
  u a, b;
  a.v = setupa ();
  b.v = setupb ();
  /* Conditional call on `untrue`; presumably never taken at run time and only
     there to keep a and b live -- TODO confirm against the full test.  */
  if (untrue)
    bar(a.v, b.v);
  /* Low two lanes of the result come from the upper half of b, high two
     lanes from the upper half of a.  */
  b.v = (__v4sf) _mm_movehl_ps ((__m128)a.v, (__m128)b.v);
  foo (a, b);
}
コード例 #14
0
ファイル: src_sinc_opt.c プロジェクト: CarnyPriest/SAMbuild
/* Returns a vector whose lane 0 is a0 + a1 + a2 + a3.
   In the SSE-only path the remaining lanes hold the intermediate pair sums
   (a1+a3, a2+a2, a3+a3); callers should rely on lane 0 only. */
static inline __m128 horizontal_add(const __m128 a)
{
#if 0 //!! needs SSE3
    const __m128 pairs = _mm_hadd_ps(a, a);
    return _mm_hadd_ps(pairs, pairs);
#else    
    /* Fold the upper half onto the lower half: lane 0 = a0+a2, lane 1 = a1+a3. */
    const __m128 upper = _mm_movehl_ps(a, a);
    const __m128 halves = _mm_add_ps(a, upper);
    /* Broadcast lane 1 and add it to lane 0: (a0+a2)+(a1+a3). */
    const __m128 lane1 = _mm_shuffle_ps(halves, halves, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_add_ss(halves, lane1);
#endif
}
コード例 #15
0
/* Produces one stereo output frame: convolves the left and right input
 * buffers (starting at resamp->ptr) with the sinc coefficients of the current
 * phase and writes the left sum to out_buffer[0] and the right sum to
 * out_buffer[1]. With SINC_COEFF_LERP the coefficients are interpolated as
 * phase + delta_coeff * sub-phase fraction.
 * Taps are consumed four at a time; the coefficient tables are read with
 * aligned loads. */
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned tap;
   __m128 acc_l = _mm_setzero_ps();
   __m128 acc_r = _mm_setzero_ps();

   const float *in_l = resamp->buffer_l + resamp->ptr;
   const float *in_r = resamp->buffer_r + resamp->ptr;

   unsigned taps = resamp->taps;
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   /* Each phase stores `taps` coefficients followed by `taps` deltas. */
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   __m128 delta = _mm_set1_ps((float)(resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (tap = 0; tap < taps; tap += 4)
   {
      __m128 samples_l = _mm_loadu_ps(in_l + tap);
      __m128 samples_r = _mm_loadu_ps(in_r + tap);

#if SINC_COEFF_LERP
      __m128 deltas = _mm_load_ps(delta_table + tap);
      __m128 coeffs = _mm_add_ps(_mm_load_ps(phase_table + tap), _mm_mul_ps(deltas, delta));
#else
      __m128 coeffs = _mm_load_ps(phase_table + tap);
#endif
      acc_l = _mm_add_ps(acc_l, _mm_mul_ps(samples_l, coeffs));
      acc_r = _mm_add_ps(acc_r, _mm_mul_ps(samples_r, coeffs));
   }

   /* Reduce both accumulators together (lanes written highest first):
    *   acc_l = { l3, l2, l1, l0 },  acc_r = { r3, r2, r1, r0 } */
   __m128 low_pairs  = _mm_shuffle_ps(acc_l, acc_r, _MM_SHUFFLE(1, 0, 1, 0)); /* { r1, r0, l1, l0 } */
   __m128 high_pairs = _mm_shuffle_ps(acc_l, acc_r, _MM_SHUFFLE(3, 2, 3, 2)); /* { r3, r2, l3, l2 } */
   __m128 halves = _mm_add_ps(low_pairs, high_pairs);                         /* { R1, R0, L1, L0 } */

   /* { R1, R1, L1, L1 } + { R1, R0, L1, L0 } = { X, R, X, L } */
   __m128 totals = _mm_add_ps(_mm_shuffle_ps(halves, halves, _MM_SHUFFLE(3, 3, 1, 1)), halves);

   /* Lane 0 is the left total. */
   _mm_store_ss(out_buffer + 0, totals);

   /* movehl brings the right total (lane 2) down to lane 0. */
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(totals, totals));
}
コード例 #16
0
ファイル: debug.hpp プロジェクト: HazyResearch/EmptyHeaded
// Horizontal sum of the eight floats in x (reduction scheme adapted from a
// Stack Overflow answer).
static inline float _mm256_reduce_add_ps(__m256 x) {
    // Add the upper 128-bit half onto the lower half:
    // ( x3+x7, x2+x6, x1+x5, x0+x4 )
    const __m128 upper = _mm256_extractf128_ps(x, 1);
    const __m128 lower = _mm256_castps256_ps128(x);
    const __m128 quad = _mm_add_ps(upper, lower);
    // Fold the upper two lanes onto the lower two:
    // ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 )
    const __m128 pair = _mm_add_ps(quad, _mm_movehl_ps(quad, quad));
    // Add lane 1 onto lane 0:
    // ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 )
    const __m128 total = _mm_add_ss(pair, _mm_shuffle_ps(pair, pair, 0x55));
    // Extract lane 0 as a scalar.
    return _mm_cvtss_f32(total);
}
コード例 #17
0
ファイル: nx_sse_float.hpp プロジェクト: mywoodstock/nxsimd
    // Returns the sum of the four lanes of rhs.
    inline float hadd(const vector4f& rhs)
    {
#if SSE_INSTR_SET >= 3  // SSE3
        // Two horizontal adds leave the full sum in every lane.
        const __m128 pairs = _mm_hadd_ps(rhs, rhs);
        const __m128 total = _mm_hadd_ps(pairs, pairs);
#else
        // Fold the upper half onto the lower half, then lane 1 onto lane 0.
        const __m128 halves = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs));
        const __m128 total = _mm_add_ss(halves, _mm_shuffle_ps(halves, halves, 1));
#endif
        return _mm_cvtss_f32(total);
    }
コード例 #18
0
ファイル: matrix3.c プロジェクト: bradparks/obs-studio
/* Writes to dst the matrix whose x/y/z rows are the transposed 3x3 part of
 * m's x/y/z rows, and whose t row is the negated result of
 * vec3_transform(m->t, m). Lane 3 of every output row is copied from lane 3
 * of m->z. */
void matrix3_transpose(struct matrix3 *dst, const struct matrix3 *m)
{
	__m128 xy_low, xy_high;

	/* t row first: transform, then negate in place. */
	vec3_transform(&dst->t, &m->t, m);
	vec3_neg(&dst->t, &dst->t);

	xy_low  = _mm_movelh_ps(m->x.m, m->y.m);   /* x0 x1 y0 y1 */
	xy_high = _mm_movehl_ps(m->y.m, m->x.m);   /* x2 x3 y2 y3 */

	dst->x.m = _mm_shuffle_ps(xy_low,  m->z.m, _MM_SHUFFLE(3, 0, 2, 0));   /* x0 y0 z0 z3 */
	dst->y.m = _mm_shuffle_ps(xy_low,  m->z.m, _MM_SHUFFLE(3, 1, 3, 1));   /* x1 y1 z1 z3 */
	dst->z.m = _mm_shuffle_ps(xy_high, m->z.m, _MM_SHUFFLE(3, 2, 2, 0));   /* x2 y2 z2 z3 */
}
コード例 #19
0
/* Returns the sum of the eight floats in x.
 * Adapted from http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally
 * Despite the name, only AVX and SSE intrinsics are used. */
static inline float horizontal_sum_avx2(__m256 x)
{
  /* 8 -> 4 lanes: lower 128-bit half plus upper 128-bit half. */
  const __m128 quad = _mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1));
  /* 4 -> 2 lanes: lower pair plus upper pair. */
  const __m128 dual = _mm_add_ps(quad, _mm_movehl_ps(quad, quad));
  /* 2 -> 1 lane: lane 0 plus lane 1. */
  const __m128 single = _mm_add_ss(dual, _mm_shuffle_ps(dual, dual, 0x1));
  return _mm_cvtss_f32(single);
}
コード例 #20
0
ファイル: BoundingBox.cpp プロジェクト: boberfly/Urho3D
// Returns the axis-aligned box that encloses this box after applying `transform`.
// The center is transformed as a point; the new half-extent along each axis is the
// sum of the old half-extents weighted by the absolute values of the matching
// matrix row.
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    const __m128 zero = _mm_setzero_ps();
    const __m128 one = _mm_set_ss(1.f);
    const __m128 signClear = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));

    // Corners as (x, y, z, 1).
    __m128 lower = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
    __m128 upper = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));

    // Center keeps w = 1 (so the translation column applies); half size has w = 0.
    __m128 center = _mm_mul_ps(_mm_add_ps(lower, upper), _mm_set1_ps(0.5f));
    __m128 halfExtent = _mm_sub_ps(center, lower);

    __m128 row0 = _mm_loadu_ps(&transform.m00_);
    __m128 row1 = _mm_loadu_ps(&transform.m10_);
    __m128 row2 = _mm_loadu_ps(&transform.m20_);

    // Transformed center: three row-by-vector dot products, reduced with a partial transpose.
    __m128 c0 = _mm_mul_ps(row0, center);
    __m128 c1 = _mm_mul_ps(row1, center);
    __m128 c01 = _mm_add_ps(_mm_unpacklo_ps(c0, c1), _mm_unpackhi_ps(c0, c1));
    __m128 c2 = _mm_mul_ps(row2, center);
    __m128 c2z = _mm_add_ps(_mm_unpacklo_ps(c2, zero), _mm_unpackhi_ps(c2, zero));
    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(c01, c2z), _mm_movehl_ps(c2z, c01));

    // Transformed half extent: the same reduction over absolute values of the products.
    __m128 e0 = _mm_and_ps(signClear, _mm_mul_ps(row0, halfExtent));
    __m128 e1 = _mm_and_ps(signClear, _mm_mul_ps(row1, halfExtent));
    __m128 e2 = _mm_and_ps(signClear, _mm_mul_ps(row2, halfExtent));
    __m128 e01 = _mm_add_ps(_mm_unpacklo_ps(e0, e1), _mm_unpackhi_ps(e0, e1));
    __m128 e2z = _mm_add_ps(_mm_unpacklo_ps(e2, zero), _mm_unpackhi_ps(e2, zero));
    __m128 newExtent = _mm_add_ps(_mm_movelh_ps(e01, e2z), _mm_movehl_ps(e2z, e01));

    return BoundingBox(_mm_sub_ps(newCenter, newExtent), _mm_add_ps(newCenter, newExtent));
#else
    Vector3 movedCenter = transform * Center();
    Vector3 halfSize = Size() * 0.5f;
    Vector3 movedHalfSize = Vector3(
        Abs(transform.m00_) * halfSize.x_ + Abs(transform.m01_) * halfSize.y_ + Abs(transform.m02_) * halfSize.z_,
        Abs(transform.m10_) * halfSize.x_ + Abs(transform.m11_) * halfSize.y_ + Abs(transform.m12_) * halfSize.z_,
        Abs(transform.m20_) * halfSize.x_ + Abs(transform.m21_) * halfSize.y_ + Abs(transform.m22_) * halfSize.z_
    );

    return BoundingBox(movedCenter - movedHalfSize, movedCenter + movedHalfSize);
#endif
}
コード例 #21
0
ファイル: sinc.c プロジェクト: freakdave/RetroArch
/* Produces one stereo output frame: convolves TAPS samples of the left and
 * right input buffers (starting at resamp->ptr) with interpolated sinc
 * coefficients (phase + delta_coeff * sub-phase index) and writes the left sum
 * to out_buffer[0] and the right sum to out_buffer[1].
 * Taps are consumed four at a time; the coefficient tables are read with
 * aligned loads. */
static void process_sinc(rarch_resampler_t *resamp, float *out_buffer)
{
   __m128 acc_l = _mm_setzero_ps();
   __m128 acc_r = _mm_setzero_ps();

   const float *in_l = resamp->buffer_l + resamp->ptr;
   const float *in_r = resamp->buffer_r + resamp->ptr;

   unsigned phase = resamp->time >> PHASES_SHIFT;
   unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
   __m128 delta_f = _mm_set1_ps(delta);

   const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
   const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];

   for (unsigned tap = 0; tap < TAPS; tap += 4)
   {
      __m128 samples_l = _mm_loadu_ps(in_l + tap);
      __m128 samples_r = _mm_loadu_ps(in_r + tap);

      __m128 phases = _mm_load_ps(phase_table + tap);
      __m128 deltas = _mm_load_ps(delta_table + tap);

      /* Interpolated coefficient for this sub-phase. */
      __m128 coeffs = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));

      acc_l = _mm_add_ps(acc_l, _mm_mul_ps(samples_l, coeffs));
      acc_r = _mm_add_ps(acc_r, _mm_mul_ps(samples_r, coeffs));
   }

   /* Reduce both accumulators together (lanes written highest first):
    *   acc_l = { l3, l2, l1, l0 },  acc_r = { r3, r2, r1, r0 } */
   __m128 low_pairs  = _mm_shuffle_ps(acc_l, acc_r, _MM_SHUFFLE(1, 0, 1, 0)); /* { r1, r0, l1, l0 } */
   __m128 high_pairs = _mm_shuffle_ps(acc_l, acc_r, _MM_SHUFFLE(3, 2, 3, 2)); /* { r3, r2, l3, l2 } */
   __m128 halves = _mm_add_ps(low_pairs, high_pairs);                         /* { R1, R0, L1, L0 } */

   /* { R1, R1, L1, L1 } + { R1, R0, L1, L0 } = { X, R, X, L } */
   __m128 totals = _mm_add_ps(_mm_shuffle_ps(halves, halves, _MM_SHUFFLE(3, 3, 1, 1)), halves);

   /* Lane 0 is the left total. */
   _mm_store_ss(out_buffer + 0, totals);

   /* movehl brings the right total (lane 2) down to lane 0. */
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(totals, totals));
}
コード例 #22
0
ファイル: nx_sse_float.hpp プロジェクト: mywoodstock/nxsimd
    // Returns (sum(row[0]), sum(row[1]), sum(row[2]), sum(row[3])), where
    // sum() adds the four lanes of a vector. `row` must point to four vectors.
    inline vector4f haddp(const vector4f* row)
    {
#if SSE_INSTR_SET >= 3  // SSE3
        const __m128 sums01 = _mm_hadd_ps(row[0], row[1]);
        const __m128 sums23 = _mm_hadd_ps(row[2], row[3]);
        return _mm_hadd_ps(sums01, sums23);
#else
        // Interleave each row pair and add the halves:
        //   part01 = (r0.0+r0.2, r1.0+r1.2, r0.1+r0.3, r1.1+r1.3), part23 likewise.
        const __m128 part01 = _mm_add_ps(_mm_unpacklo_ps(row[0], row[1]),
                                         _mm_unpackhi_ps(row[0], row[1]));
        const __m128 part23 = _mm_add_ps(_mm_unpacklo_ps(row[2], row[3]),
                                         _mm_unpackhi_ps(row[2], row[3]));
        // Add the low halves of both parts to their high halves.
        return _mm_add_ps(_mm_movelh_ps(part01, part23), _mm_movehl_ps(part23, part01));
#endif
    }
コード例 #23
0
ファイル: miniRT_math.cpp プロジェクト: anirul/miniRT
	// ----------------------------------------------------------
	//  Name:   matrix::MinValue
	//  Desc:   Returns the minimum of the absolute values of the
	//				matrix elements.
	// ----------------------------------------------------------
	float matrix::MinValue() {
#ifdef _M_IX86
		// Lane-wise minimum of |row| over the four rows ...
		F32vec4 min1 = _mm_min_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2));
		F32vec4 min2 = _mm_min_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4));
		F32vec4 min = _mm_min_ps(min1, min2);
		// ... then across the four lanes: upper half vs lower half, lane 1 vs lane 0.
		min = _mm_min_ps(min, _mm_movehl_ps(min,min));
		min = _mm_min_ss(min, _mm_shuffle_ps(min,min,0x01));
		return min[0];
#else
		// Portable path. Compares absolute values so that it agrees with the
		// SSE path above (it previously compared the signed elements).
		float min = this->operator()(0, 0);
		if (min < 0)
			min = -min;
		for (int i = 0; i < 4; ++i)
			for (int j = 0; j < 4; ++j) {
				float magnitude = this->operator()(i, j);
				if (magnitude < 0)
					magnitude = -magnitude;
				if (magnitude < min)
					min = magnitude;
			}
		return min;
#endif // _M_IX86 
	}
コード例 #24
0
ファイル: matrix4.cpp プロジェクト: Nate1595/OpenTomb
// Returns the transpose of this matrix: row i of the result is made of
// component i of the rows x, y, z and w.
matrix4 matrix4::transposed() const
{
#ifdef __SSE__
    // Interleave row pairs ...
    const __m128 xyLow = _mm_unpacklo_ps(x.v, y.v);    // x.x y.x x.y y.y
    const __m128 xyHigh = _mm_unpackhi_ps(x.v, y.v);   // x.z y.z x.w y.w
    const __m128 zwLow = _mm_unpacklo_ps(z.v, w.v);    // z.x w.x z.y w.y
    const __m128 zwHigh = _mm_unpackhi_ps(z.v, w.v);   // z.z w.z z.w w.w

    // ... then join matching halves to form each transposed row.
    const __m128 row0 = _mm_movelh_ps(xyLow, zwLow);
    const __m128 row1 = _mm_movehl_ps(zwLow, xyLow);
    const __m128 row2 = _mm_movelh_ps(xyHigh, zwHigh);
    const __m128 row3 = _mm_movehl_ps(zwHigh, xyHigh);

    return matrix4(row0, row1, row2, row3);
#else
    const float4 row0(x.x, y.x, z.x, w.x);
    const float4 row1(x.y, y.y, z.y, w.y);
    const float4 row2(x.z, y.z, z.z, w.z);
    const float4 row3(x.w, y.w, z.w, w.w);
    return matrix4(row0, row1, row2, row3);
#endif
}
コード例 #25
0
ファイル: miniRT_math.cpp プロジェクト: anirul/miniRT
	// ----------------------------------------------------------
	//  Name:   matrix::MaxValue
	//  Desc:   Returns the maximum of the absolute values of the
	//				matrix elements.
	// ----------------------------------------------------------
	float matrix::MaxValue() {
#ifdef _M_IX86
		// Lane-wise maximum of |row| over the four rows ...
		F32vec4 max1 = _mm_max_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2));
		F32vec4 max2 = _mm_max_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4));
		F32vec4 max = _mm_max_ps(max1, max2);
		// ... then across the four lanes: upper half vs lower half, lane 1 vs lane 0.
		max = _mm_max_ps(max, _mm_movehl_ps(max,max));
		max = _mm_max_ss(max, _mm_shuffle_ps(max,max,0x01));
		return max[0];
#else
		// Portable path. Compares absolute values so that it agrees with the
		// SSE path above (it previously compared the signed elements).
		float max = this->operator()(0,0);
		if (max < 0)
			max = -max;
		for (int i = 0; i < 4; ++i)
			for (int j = 0; j < 4; ++j) {
				float magnitude = this->operator()(i, j);
				if (magnitude < 0)
					magnitude = -magnitude;
				if (magnitude > max)
					max = magnitude;
			}
		return max;
#endif // _M_IX86
	}
コード例 #26
0
ファイル: main.cpp プロジェクト: KubaO/stackoverflown
// Processes `num_elts` elements three at a time (num_elts must be a multiple of
// three) and stores one result per element in its `re` field. Each element is
// read with an aligned four-float load starting at its `x` field.
//
// The stated goal is
//   elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
//
// NOTE(review): the code does not compute that expression. It evaluates
// exp(v * log(a)), which is a^v, where pow(v, a) = exp(a * log(v)) is wanted,
// and likewise exp(sum * log(1/a)) in place of exp(log(sum) / a). Fixing this
// needs a vector logarithm, which is not visible from this block, so the
// arithmetic is left as written and only the undeclared identifier `sum1`
// (now `s1`) is corrected.
void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);
    v4sf log_1_a = _mm_load1_ps(&logf_1_a);
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        // (lane 3 of each loaded row is whatever follows z in element_t; it only
        // ever reaches the discarded fourth output row)
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,-
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,-
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,-
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); //  x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); //  x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); //  z1,z2,-, -
        v4sf t3 = _mm_unpackhi_ps(r2, r3); //  z3,0, -, 0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0
        // exponentiate each coordinate (see the NOTE(review) above: this is a^v)
        v4sf r0a = _mm_mul_ps(r0, log_a); // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a); // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a); // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a); // exp(x1*log(a)), ...
        v4sf ex1 = exp_ps(r1a); // exp(y1*log(a)), ...
        v4sf ex2 = exp_ps(r2a); // exp(z1*log(a)), ...
        // sum of the three per-coordinate terms
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(s1, ex2);
        // exp(sum * log(1/a))
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);
        // spill the vector and scatter lanes 0..2 to the three elements
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}
コード例 #27
0
ファイル: bump.cpp プロジェクト: bustercopley/polymorph
v4f step_t::operator () (float t) const
{
  // Evaluates the cubic f(t) = c0 + c1*t + (c2 + c3*t)*t*t (Estrin's method)
  // and returns, with t0 and t1 taken from T,
  //   (0 0 0 0)  if t < t0,
  //   (f f f f)  if t0 <= t < t1,
  //   (1 1 1 1)  if t >= t1.
  const v4f ones = { 1.0f, 1.0f, 1.0f, 1.0f };
  const v4f t4 = _mm_set1_ps (t);                     // t t t t
  const v4f one_t = _mm_unpacklo_ps (ones, t4);       // 1 t 1 t

  // Polynomial.
  const v4f terms = load4f (c) * one_t;               // c0 c1*t c2 c3*t
  const v4f halves = _mm_hadd_ps (terms, terms) * one_t * one_t;  // c0+c1*t (c2+c3*t)*t*t ...
  const v4f poly = _mm_hadd_ps (halves, halves);      // f f f f
  const v4f poly_one = _mm_unpacklo_ps (poly, ones);  // f 1 f 1

  // Interval tests: lane pairs select f on [t0, t1) and 1 on [t1, inf).
  const v4f bounds = load4f (T);                      // t0  t1 t1 inf
  const v4f lower = _mm_movelh_ps (bounds, bounds);   // t0  t1 t0  t1
  const v4f upper = _mm_movehl_ps (bounds, bounds);   // t1 inf t1 inf
  const v4f inside = _mm_and_ps (_mm_cmpge_ps (t4, lower), _mm_cmplt_ps (t4, upper));

  // At most one lane of each pair survives the mask; adding the pair yields the result.
  const v4f picked = _mm_and_ps (inside, poly_one);   // f? 1? f? 1?
  return _mm_hadd_ps (picked, picked);
}
コード例 #28
0
ファイル: tr_main.cpp プロジェクト: casey0102/iodoom3
// Transforms the point `in` by the 4x4 matrix `modelMatrix` (sixteen floats)
// and writes the first three components of the result to `out`:
//
//   out[r] = in[0]*modelMatrix[r] + in[1]*modelMatrix[4+r]
//          + in[2]*modelMatrix[8+r] + modelMatrix[12+r]        for r = 0, 1, 2
//
// `out` may refer to the same vector as `in`: both code paths read all of `in`
// before writing any component of `out`.
void R_LocalPointToGlobal( const float modelMatrix[16], const idVec3 &in, idVec3 &out ) {
#if defined(MACOS_X) && defined(__i386__)
	__m128 m0, m1, m2, m3;
	__m128 in0, in1, in2;
	float i0,i1,i2;
	i0 = in[0];
	i1 = in[1];
	i2 = in[2];
	
	// The four groups of four consecutive matrix floats.
	m0 = _mm_loadu_ps(&modelMatrix[0]);
	m1 = _mm_loadu_ps(&modelMatrix[4]);
	m2 = _mm_loadu_ps(&modelMatrix[8]);
	m3 = _mm_loadu_ps(&modelMatrix[12]);
	
	// Broadcast each input component across a register.
	in0 = _mm_load1_ps(&i0);
	in1 = _mm_load1_ps(&i1);
	in2 = _mm_load1_ps(&i2);
	
	m0 = _mm_mul_ps(m0, in0);
	m1 = _mm_mul_ps(m1, in1);
	m2 = _mm_mul_ps(m2, in2);

	// m0 = in[0]*group0 + in[1]*group1 + in[2]*group2 + group3
	m0 = _mm_add_ps(m0, m1);
	m0 = _mm_add_ps(m0, m2);
	m0 = _mm_add_ps(m0, m3);
	
	// Store lanes 0, 1 and 2 of the result.
	_mm_store_ss(&out[0], m0);
	m1 = (__m128) _mm_shuffle_epi32((__m128i)m0, 0x55);
	_mm_store_ss(&out[1], m1);
	m2 = _mm_movehl_ps(m2, m0);
	_mm_store_ss(&out[2], m2);
#else	
	// Read the input first so that the result is also correct when `out` and
	// `in` are the same vector (writing out[0] would otherwise change the
	// in[0] used for out[1] and out[2]).
	const float x = in[0];
	const float y = in[1];
	const float z = in[2];

	out[0] = x * modelMatrix[0] + y * modelMatrix[4]
		+ z * modelMatrix[8] + modelMatrix[12];
	out[1] = x * modelMatrix[1] + y * modelMatrix[5]
		+ z * modelMatrix[9] + modelMatrix[13];
	out[2] = x * modelMatrix[2] + y * modelMatrix[6]
		+ z * modelMatrix[10] + modelMatrix[14];
#endif
}
コード例 #29
0
/* Forward pass of a fully-connected layer. For every batch item i and every
 * output index j:
 *
 *   b[i][j] = bias[j] + sum over k in [0, adim[0]) of w[j][k] * a[i][k]
 *
 * Inputs are consumed eight at a time with SSE; any remaining inputs
 * (adim[0] % 8 of them) are accumulated by a scalar tail loop -- previously
 * they were silently left out of the sum.
 *
 * The vector loop uses aligned loads, so a row of `a` and a row of `w` must be
 * 16-byte aligned whenever adim[0] >= 8.
 * Always returns CCV_NNC_EXEC_SUCCESS. */
static int _ccv_nnc_gemm_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
{
	/* For tensors with more than one dimension, the first dimension is the
	 * batch and the per-item dimensions start one entry later. */
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
	assert(bdim[0] == bias->info.dim[0]);
	assert(bdim[0] == w->info.dim[0]);
	assert(adim[0] == w->info.dim[1]);
	/* Tensor views carry their own increments; plain tensors use their dimensions. */
	const int* ainc = CCV_IS_TENSOR_VIEW(a) ? (a_nd == 1 ? a->inc : a->inc + 1) : adim;
	const int* binc = CCV_IS_TENSOR_VIEW(b) ? (b_nd == 1 ? b->inc : b->inc + 1) : bdim;
	const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
	const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
	int i;
	for (i = 0; i < batch_size; i++)
	{
		const float* const ap = a->data.f32 + i * ainc[0];
		float* const bp = b->data.f32 + i * binc[0];
		parallel_for(j, bdim[0]) {
			const float* const wp = w->data.f32 + j * winc[1];
			int k;
			float sum;
			/* Two independent accumulators; the bias seeds lane 0 of the first. */
			__m128 v40 = _mm_set_ss(bias->data.f32[j]);
			__m128 v41 = _mm_setzero_ps();
			for (k = 0; k < adim[0] - 7; k += 8)
			{
				__m128 ap40 = _mm_load_ps(ap + k);
				__m128 ap41 = _mm_load_ps(ap + k + 4);
				__m128 w40 = _mm_load_ps(wp + k);
				__m128 w41 = _mm_load_ps(wp + k + 4);
				v40 =_mm_add_ps(_mm_mul_ps(w40, ap40), v40);
				v41 =_mm_add_ps(_mm_mul_ps(w41, ap41), v41);
			}
			/* Horizontal sum of both accumulators into lane 0. */
			v40 = _mm_add_ps(v40, v41);
			v41 = _mm_add_ps(v40, _mm_movehl_ps(v40, v40));
			v40 = _mm_add_ss(v41, _mm_shuffle_ps(v41, v41, 1));
			_mm_store_ss(&sum, v40);
			/* Scalar tail: inputs that did not fill a group of eight. */
			for (; k < adim[0]; k++)
				sum += wp[k] * ap[k];
			bp[j] = sum;
		} parallel_endfor
	}
	return CCV_NNC_EXEC_SUCCESS;
}
コード例 #30
0
ファイル: miniRT_math.cpp プロジェクト: anirul/miniRT
	// ----------------------------------------------------------
	//  Name:   matrix::Determinant
	//  Desc:   Return the matrix determinant. A = det[M].
	// ----------------------------------------------------------
	float matrix::Determinant() {
#ifdef _M_IX86
		__m128 Va,Vb,Vc;
		__m128 r1,r2,r3,t1,t2,sum;
		F32vec4 Det;

		// First, Let's calculate the first four minterms of
		// the first line
		t1 = _L4; t2 = _mm_ror_ps(_L3,1); 
		// V3'·V4
		Vc = _mm_mul_ps(t2,_mm_ror_ps(t1,0));
		// V3'·V4"
		Va = _mm_mul_ps(t2,_mm_ror_ps(t1,2));
		// V3'·V4^
		Vb = _mm_mul_ps(t2,_mm_ror_ps(t1,3));

		// V3"·V4^ - V3^·V4"
		r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));
		// V3^·V4' - V3'·V4^
		r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));
		// V3'·V4" - V3"·V4'
		r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));

		Va = _mm_ror_ps(_L2,1);
		sum = _mm_mul_ps(Va,r1);
		Vb = _mm_ror_ps(Va,1);
		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
		Vc = _mm_ror_ps(Vb,1);
		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));

		// Now we can calculate the determinant:
		Det = _mm_mul_ps(sum,_L1);
		Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
		Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
		return Det[0];
#else
		// TODO
		return 0.0f;
#endif // _M_IX86
	}