Example #1
inline matrix_t
transpose(const matrix_t& matrix)
{
	matrix_t result = matrix;
	_MM_TRANSPOSE4_PS(result[0], result[1], result[2], result[3]);
	return result;
}
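Every example below leans on the same building block: _MM_TRANSPOSE4_PS takes four __m128 registers holding the rows of a 4x4 float matrix and transposes them in place. The exact macro body varies between compilers, so the following is only an illustrative sketch of the classic unpack/move expansion, not the literal definition from any particular <xmmintrin.h> (lanes listed low to high in the comments):

#include <xmmintrin.h>

// Illustrative re-implementation of the 4x4 in-register transpose.
static inline void transpose4_sketch(__m128* r0, __m128* r1, __m128* r2, __m128* r3)
{
	__m128 t0 = _mm_unpacklo_ps(*r0, *r1); // a0 b0 a1 b1
	__m128 t1 = _mm_unpackhi_ps(*r0, *r1); // a2 b2 a3 b3
	__m128 t2 = _mm_unpacklo_ps(*r2, *r3); // c0 d0 c1 d1
	__m128 t3 = _mm_unpackhi_ps(*r2, *r3); // c2 d2 c3 d3
	*r0 = _mm_movelh_ps(t0, t2);           // a0 b0 c0 d0
	*r1 = _mm_movehl_ps(t2, t0);           // a1 b1 c1 d1
	*r2 = _mm_movelh_ps(t1, t3);           // a2 b2 c2 d2
	*r3 = _mm_movehl_ps(t3, t1);           // a3 b3 c3 d3
}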
Example #2
void dct4x4_1d_llm_fwd_sse_and_transpose(float* s, float* d)//8add, 4 mul
{
	const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2);
	const __m128 c6 = _mm_set1_ps(0.541196f);//cos(CV_PI*6/16.0)*sqrt(2);

	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	__m128 p03 = _mm_add_ps(s0, s3);
	__m128 p12 = _mm_add_ps(s1, s2);
	__m128 m03 = _mm_sub_ps(s0, s3);
	__m128 m12 = _mm_sub_ps(s1, s2);

	s0 = _mm_add_ps(p03, p12);
	s1 = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
	s2 = _mm_sub_ps(p03, p12);
	s3 = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
	_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
	_mm_store_ps(d, s0);
	_mm_store_ps(d + 4, s1);
	_mm_store_ps(d + 8, s2);
	_mm_store_ps(d + 12, s3);
}
Example #3
void dct4x4_1d_llm_inv_sse_and_transpose(float* s, float* d)
{
	const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2);
	const __m128 c6 = _mm_set1_ps(0.541196f);//cos(CV_PI*6/16.0)*sqrt(2);

	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	__m128 t10 = _mm_add_ps(s0, s2);
	__m128 t12 = _mm_sub_ps(s0, s2);

	__m128 t0 = _mm_add_ps(_mm_mul_ps(c2, s1), _mm_mul_ps(c6, s3));
	__m128 t2 = _mm_sub_ps(_mm_mul_ps(c6, s1), _mm_mul_ps(c2, s3));

	s0 = _mm_add_ps(t10, t0);
	s1 = _mm_add_ps(t12, t2);
	s2 = _mm_sub_ps(t12, t2);
	s3 = _mm_sub_ps(t10, t0);
	_MM_TRANSPOSE4_PS(s0, s1, s2, s3);
	_mm_store_ps(d, s0);
	_mm_store_ps(d + 4, s1);
	_mm_store_ps(d + 8, s2);
	_mm_store_ps(d + 12, s3);
}
Example #4
/** compose homogeneous transforms */
inline RigidTransform<float>& operator *= (RigidTransform<float>& lhs, const RigidTransform<float>& rhs)
{
#ifdef SIMPLE_GL_USE_SSE4
    // load transposed matrix
    __m128 matrix0 = _mm_load_ps(&rhs[0].x);
    __m128 matrix1 = _mm_load_ps(&rhs[1].x);
    __m128 matrix2 = _mm_load_ps(&rhs[2].x);
    __m128 matrix3 = _mm_load_ps(&rhs[3].x);
    _MM_TRANSPOSE4_PS(matrix0, matrix1, matrix2, matrix3);

    __m128 row, dotProd;
    #define __CALC_ROW(i)\
        row         = lhs[i].m128;\
        lhs[i].m128 = _mm_dp_ps(row, matrix0, 0xEE);\
        dotProd     = _mm_dp_ps(row, matrix1, 0xEE);\
        lhs[i].m128 = _mm_blend_ps( lhs[i].m128, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );\
        dotProd     = _mm_dp_ps(row, matrix2, 0xEE);\
        lhs[i].m128 = _mm_blend_ps( lhs[i].m128, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );\
        dotProd     = _mm_dp_ps(row, matrix3, 0xEE);\
        lhs[i].m128 = _mm_blend_ps( lhs[i].m128, dotProd, _MM_SHUFFLE(0, 0, 0, 1) );

    // calculate
    __CALC_ROW(0)
    __CALC_ROW(1)
    __CALC_ROW(2)
    // __CALC_ROW(3) - row 3 (the last row) is assumed to stay (0, 0, 0, 1)
    #undef __CALC_ROW

    return lhs;
#else // SSE2
    __m128 row0;
    __m128 row1;
    __m128 row2;
    __m128 row3;
    #define __CALC_ROW(i)\
        row0 = _mm_shuffle_ps( lhs[i].m128, lhs[i].m128, _MM_SHUFFLE(0, 0, 0, 0) );\
        row1 = _mm_shuffle_ps( lhs[i].m128, lhs[i].m128, _MM_SHUFFLE(1, 1, 1, 1) );\
        row2 = _mm_shuffle_ps( lhs[i].m128, lhs[i].m128, _MM_SHUFFLE(2, 2, 2, 2) );\
        row3 = _mm_shuffle_ps( lhs[i].m128, lhs[i].m128, _MM_SHUFFLE(3, 3, 3, 3) );\
        \
        row0 = _mm_mul_ps(row0, rhs[0].m128);\
        row1 = _mm_mul_ps(row1, rhs[1].m128);\
        row2 = _mm_mul_ps(row2, rhs[2].m128);\
        row3 = _mm_mul_ps(row3, rhs[3].m128);\
        \
        lhs[i].m128 = _mm_add_ps(row0, row1);\
        lhs[i].m128 = _mm_add_ps(lhs[i].m128, row2);\
        lhs[i].m128 = _mm_add_ps(lhs[i].m128, row3);

    // calculate
    __CALC_ROW(0)
    __CALC_ROW(1)
    __CALC_ROW(2)
    //__CALC_ROW(3) - row 3 (the last row) is assumed to stay (0, 0, 0, 1)
    #undef __CALC_ROW

    return lhs;
#endif
}
Example #5
vec4 mat4::operator*(const vec4& b) const {
    __m128 r0 = _mm_mul_ps(m[0], b.v);
    __m128 r1 = _mm_mul_ps(m[1], b.v);
    __m128 r2 = _mm_mul_ps(m[2], b.v);
    __m128 r3 = _mm_mul_ps(m[3], b.v);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    return _mm_add_ps(r0, _mm_add_ps(r1, _mm_add_ps(r2, r3)));
}
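Example #5 works because the transpose turns the four row-wise products into column vectors, so the final vertical adds produce the four dot products in one register. For comparison, if the four __m128 values held the matrix columns instead of rows, the transpose could be skipped by broadcasting each vector component; a minimal sketch under that column-major assumption (the helper name is hypothetical):

#include <xmmintrin.h>

// Hypothetical column-major variant: col0..col3 are the matrix columns and v
// is the input vector; result lane i is sum_j M[i][j] * v[j].
static inline __m128 mat4_mul_vec4_colmajor(__m128 col0, __m128 col1,
                                            __m128 col2, __m128 col3, __m128 v)
{
	__m128 x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); // broadcast v.x
	__m128 y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)); // broadcast v.y
	__m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)); // broadcast v.z
	__m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)); // broadcast v.w
	__m128 r = _mm_mul_ps(col0, x);
	r = _mm_add_ps(r, _mm_mul_ps(col1, y));
	r = _mm_add_ps(r, _mm_mul_ps(col2, z));
	return _mm_add_ps(r, _mm_mul_ps(col3, w));
}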
Example #6
static void transpose_block(float *in_data, float *out_data)
{
	int i;

	__m128 row1, row2, row3, row4;

	for (i = 0; i < 8; i += 4)
	{
		/* Transpose the 8x8 matrix one 8x4 column strip at a time by applying
		 * _MM_TRANSPOSE4_PS to two 4x4 blocks.
		 * First iteration: upper-left and lower-left blocks
		 * Second iteration: upper-right and lower-right blocks
		 */

		// Transpose the upper 4x4 matrix
		row1 = _mm_load_ps(in_data + i);
		row2 = _mm_load_ps(in_data + 8 + i);
		row3 = _mm_load_ps(in_data + 16 + i);
		row4 = _mm_load_ps(in_data + 24 + i);
		_MM_TRANSPOSE4_PS(row1, row2, row3, row4);

		// Store the first four elements of each row of the transposed 8x8 matrix
		_mm_store_ps(out_data + i * 8, row1);
		_mm_store_ps(out_data + (i + 1) * 8, row2);
		_mm_store_ps(out_data + (i + 2) * 8, row3);
		_mm_store_ps(out_data + (i + 3) * 8, row4);

		// Transpose the lower 4x4 matrix
		row1 = _mm_load_ps(in_data + 32 + i);
		row2 = _mm_load_ps(in_data + 40 + i);
		row3 = _mm_load_ps(in_data + 48 + i);
		row4 = _mm_load_ps(in_data + 56 + i);
		_MM_TRANSPOSE4_PS(row1, row2, row3, row4);

		// Store the last four elements of each row of the transposed 8x8 matrix
		_mm_store_ps(out_data + i * 8 + 4, row1);
		_mm_store_ps(out_data + (i + 1) * 8 + 4, row2);
		_mm_store_ps(out_data + (i + 2) * 8 + 4, row3);
		_mm_store_ps(out_data + (i + 3) * 8 + 4, row4);
	}
}
Example #7
Mat44 Mat44::Transpose() const
{
	__m128 mat1rows[] = { _mm_load_ps(mat), _mm_load_ps(mat+4), _mm_load_ps(mat+8), _mm_load_ps(mat+12) };
	_MM_TRANSPOSE4_PS(mat1rows[0], mat1rows[1], mat1rows[2], mat1rows[3]);

	Mat44 res;
	_mm_store_ps(res.mat   , mat1rows[0]);
	_mm_store_ps(res.mat+4 , mat1rows[1]);
	_mm_store_ps(res.mat+8 , mat1rows[2]);
	_mm_store_ps(res.mat+12, mat1rows[3]);
	return res;
};
Example #8
template <>
void transpose4x4<float, float>(const float* src, size_t lda, float* dst, size_t ldb) {
  __m128 row0, row1, row2, row3;
  row0 = _mm_loadu_ps(src);
  row1 = _mm_loadu_ps(src+lda);
  row2 = _mm_loadu_ps(src+2*lda);
  row3 = _mm_loadu_ps(src+3*lda);
  _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
  _mm_storeu_ps(dst, row0);
  _mm_storeu_ps(dst+ldb, row1);
  _mm_storeu_ps(dst+2*ldb, row2);
  _mm_storeu_ps(dst+3*ldb, row3);
}
Example #9
inline void transpose_4x4block_SSE_32(float* A, float* B, const size_t lda,
                                   const size_t ldb) {
    __m128 row1 = _mm_load_ps(&A[0*ldb]);
    __m128 row2 = _mm_load_ps(&A[1*ldb]);
    __m128 row3 = _mm_load_ps(&A[2*ldb]);
    __m128 row4 = _mm_load_ps(&A[3*ldb]);
    _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
    _mm_store_ps(&B[0*lda], row1);
    _mm_store_ps(&B[1*lda], row2);
    _mm_store_ps(&B[2*lda], row3);
    _mm_store_ps(&B[3*lda], row4);
}
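Examples #8 and #9 are the same strided 4x4 kernel, differing only in unaligned versus aligned loads and stores. Transposing a larger matrix is simply a matter of tiling such a kernel over 4x4 blocks; a minimal driver sketch, assuming a square row-major matrix whose side is a multiple of 4 (function name and layout are illustrative, not taken from either example's codebase):

#include <stddef.h>
#include <xmmintrin.h>

// Hypothetical blocked transpose of an n x n row-major matrix (n % 4 == 0):
// dst[j*n + i] = src[i*n + j], processed one 4x4 tile at a time.
static void transpose_blocked(const float* src, float* dst, size_t n)
{
	for (size_t i = 0; i < n; i += 4) {
		for (size_t j = 0; j < n; j += 4) {
			__m128 r0 = _mm_loadu_ps(src + (i + 0) * n + j);
			__m128 r1 = _mm_loadu_ps(src + (i + 1) * n + j);
			__m128 r2 = _mm_loadu_ps(src + (i + 2) * n + j);
			__m128 r3 = _mm_loadu_ps(src + (i + 3) * n + j);
			_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
			_mm_storeu_ps(dst + (j + 0) * n + i, r0);
			_mm_storeu_ps(dst + (j + 1) * n + i, r1);
			_mm_storeu_ps(dst + (j + 2) * n + i, r2);
			_mm_storeu_ps(dst + (j + 3) * n + i, r3);
		}
	}
}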
Example #10
			template <typename T> void Transpose(void) {
#ifdef __SSE_AVAIL__
				_MM_TRANSPOSE4_PS(ssem1, ssem2, ssem3, ssem4);
#else
				Matrix4x4<T> m = *this;

					Mat[0] = m.Mat[0]; Mat[1] = m.Mat[4]; Mat[2] = m.Mat[8]; Mat[3] = m.Mat[12];
					Mat[4] = m.Mat[1]; Mat[5] = m.Mat[5]; Mat[6] = m.Mat[9]; Mat[7] = m.Mat[13];
					Mat[8] = m.Mat[2]; Mat[9] = m.Mat[6]; Mat[10] = m.Mat[10]; Mat[11] = m.Mat[14];
					Mat[12] = m.Mat[3]; Mat[13] = m.Mat[7]; Mat[14] = m.Mat[11]; Mat[15] = m.Mat[15];
#endif
			};
Example #11
static inline void
transpose_sse (gfloat *src, gfloat *dst, const int width, const int height)
{
    __m128 row1 = _mm_loadu_ps (src);
    __m128 row2 = _mm_loadu_ps (src + height);
    __m128 row3 = _mm_loadu_ps (src + 2 * height);
    __m128 row4 = _mm_loadu_ps (src + 3 * height);
    _MM_TRANSPOSE4_PS (row1, row2, row3, row4);
    _mm_storeu_ps (dst, row1);
    _mm_storeu_ps (dst + width, row2);
    _mm_storeu_ps (dst + 2 * width, row3);
    _mm_storeu_ps (dst + 3 * width, row4);
}
Example #12
void
Matrix4x4<float>::transpose()
{
    __m128 row0 = m_rows[0].asSSE();
    __m128 row1 = m_rows[1].asSSE();
    __m128 row2 = m_rows[2].asSSE();
    __m128 row3 = m_rows[3].asSSE();
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
    m_rows[0] = row0;
    m_rows[1] = row1;
    m_rows[2] = row2;
    m_rows[3] = row3;
}
Example #13
//Binning
static void gather4Simd(VecF32Soa dest[3],VecF32 vertices[12]){
	for(uint32 i = 0;i<3;++i){
		__m128 v0 = vertices[i].simd; //x0, y0, z0, w0
		__m128 v1 = vertices[3+i].simd;//x1, y1, z1, w1
		__m128 v2 = vertices[6+i].simd;//x2, y2, z2, w2
		__m128 v3 = vertices[9+i].simd;//x3, y3, z3, w3
		_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
		dest[i].x = VecF32(v0);
		dest[i].y = VecF32(v1);
		dest[i].z = VecF32(v2);
		dest[i].w = VecF32(v3);
	}
}
Example #14
// It's 20% faster but wasn't the 2x speed up I was hoping for.
// Will have to investigate later.
mat4 mat4::operator*(const mat4& b) const {
    mat4 temp = b;
    temp.makeTranspose();
    __m128 r[4];
    for(int i = 0; i < 4; ++i) {
        __m128 c0 = _mm_mul_ps(m[i], temp.m[0]);
        __m128 c1 = _mm_mul_ps(m[i], temp.m[1]);
        __m128 c2 = _mm_mul_ps(m[i], temp.m[2]);
        __m128 c3 = _mm_mul_ps(m[i], temp.m[3]);
        _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
        r[i] = _mm_add_ps(c0, _mm_add_ps(c1, _mm_add_ps(c2, c3)));
    }
    return r;
}
Example #15
inline __m128 convolute_loop(__m128 bspline, __m128 m[4])
{
#if defined(SSE4)
	return _mm_set_ps(
		dot_product(bspline, m[0]),
		dot_product(bspline, m[1]),
		dot_product(bspline, m[2]),
		dot_product(bspline, m[3]));
#else
	_MM_TRANSPOSE4_PS(m[3], m[2], m[1], m[0]);
	return
		_mm_add_ps( _mm_add_ps( _mm_add_ps(
			_mm_mul_ps(m[0], _mm_shuffle_ps(bspline, bspline, _MM_SHUFFLE(3,3,3,3))),
			_mm_mul_ps(m[1], _mm_shuffle_ps(bspline, bspline, _MM_SHUFFLE(2,2,2,2)))),
			_mm_mul_ps(m[2], _mm_shuffle_ps(bspline, bspline, _MM_SHUFFLE(1,1,1,1)))),
			_mm_mul_ps(m[3], _mm_shuffle_ps(bspline, bspline, _MM_SHUFFLE(0,0,0,0))));
#endif
}
Example #16
void AABBTransformAsAABB_SIMD(AABB &aabb, const float4x4 &m)
{
	simd4f minPt = aabb.MinPoint_SSE();
	simd4f maxPt = aabb.MaxPoint_SSE();
	simd4f centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
	simd4f halfSize = _mm_sub_ps(centerPoint, minPt);
	simd4f newCenter = mat4x4_mul_vec4(m.row, centerPoint);

	simd4f x = abs_ps(_mm_mul_ps(m.row[0], halfSize));
	simd4f y = abs_ps(_mm_mul_ps(m.row[1], halfSize));
	simd4f z = abs_ps(_mm_mul_ps(m.row[2], halfSize));
	simd4f w = _mm_setzero_ps();
	_MM_TRANSPOSE4_PS(x, y, z, w); // Contains 2x unpacklo's, 2x unpackhi's, 2x movelh's and 2x movehl's. (or 8 shuffles, depending on the compiler)

	simd4f newDir = _mm_add_ps(_mm_add_ps(x, y), _mm_add_ps(z, w));

	aabb.MinPoint_SSE() = _mm_sub_ps(newCenter, newDir);
	aabb.MaxPoint_SSE() = _mm_add_ps(newCenter, newDir);
}
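Example #17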
void TransformedAABBoxSSE::Gather(vFloat4 pOut[3], UINT triId, const __m128 xformedPos[], UINT idx)
{
	for(int i = 0; i < 3; i++)
	{
		UINT ind0 = sBBIndexList[triId*3 + i + 0];
		UINT ind1 = sBBIndexList[triId*3 + i + 3];
		UINT ind2 = sBBIndexList[triId*3 + i + 6];
		UINT ind3 = sBBIndexList[triId*3 + i + 9];

		__m128 v0 = xformedPos[ind0];	// x0 y0 z0 w0
		__m128 v1 = xformedPos[ind1];	// x1 y1 z1 w1
		__m128 v2 = xformedPos[ind2];	// x2 y2 z2 w2
		__m128 v3 = xformedPos[ind3];	// x3 y3 z3 w3
		_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
		pOut[i].X = v0;
		pOut[i].Y = v1;
		pOut[i].Z = v2;
		pOut[i].W = v3;
	}
}
Example #18
inline wg_v4sf Recognizer::local_distance4(float *s,
                                           float *t0,
                                           float *t1,
                                           float *t2,
                                           float *t3) {
    wg_v4sf v_, v0, v1, v2, v3;
    v_.v = _mm_set_ps1(-0.0);
    v0.v = _mm_sub_ps(((wg_v4sf *)t0)->v, ((wg_v4sf *)s)->v);
    v0.v = _mm_andnot_ps(v_.v,v0.v); // absolute value
    v1.v = _mm_sub_ps(((wg_v4sf *)t1)->v, ((wg_v4sf *)s)->v);
    v1.v = _mm_andnot_ps(v_.v,v1.v); // absolute value
    v2.v = _mm_sub_ps(((wg_v4sf *)t2)->v, ((wg_v4sf *)s)->v);
    v2.v = _mm_andnot_ps(v_.v,v2.v); // absolute value
    v3.v = _mm_sub_ps(((wg_v4sf *)t3)->v, ((wg_v4sf *)s)->v);
    v3.v = _mm_andnot_ps(v_.v,v3.v); // absolute value
    // convert row vectors to column vectors
    _MM_TRANSPOSE4_PS(v0.v, v1.v, v2.v, v3.v);
    v3.v = _mm_add_ps(v3.v, v2.v);
    v3.v = _mm_add_ps(v3.v, v1.v);
    v3.v = _mm_add_ps(v3.v, v0.v);
    return v3;
}
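The transpose above converts four row vectors of per-feature absolute differences into columns, so the vertical adds yield one city-block distance per lane. With SSE3 available, the same four-way horizontal reduction could instead be written with _mm_hadd_ps; a sketch operating on plain __m128 values (the wg_v4sf wrapper from the example is not assumed):

#include <pmmintrin.h> // SSE3

// Hypothetical SSE3 reduction: given four difference vectors, return
// [sum(d0), sum(d1), sum(d2), sum(d3)] in one register.
static inline __m128 sum4_hadd(__m128 d0, __m128 d1, __m128 d2, __m128 d3)
{
	__m128 s01 = _mm_hadd_ps(d0, d1); // d0[0]+d0[1], d0[2]+d0[3], d1[0]+d1[1], d1[2]+d1[3]
	__m128 s23 = _mm_hadd_ps(d2, d3);
	return _mm_hadd_ps(s01, s23);     // sum(d0), sum(d1), sum(d2), sum(d3)
}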
Example #19
void mult_matrix_vecf(const float* ina,
	const float* const inv, float* dst)
{
#ifdef ARCAN_MATH_ALIGNED_SIMD
	assert((uintptr_t)dst % 16 == 0);
	assert((uintptr_t)ina % 16 == 0);
	assert((uintptr_t)inv % 16 == 0);
	__m128 r0 = _mm_load_ps(&ina[0]);
	__m128 r1 = _mm_load_ps(&ina[4]);
	__m128 r2 = _mm_load_ps(&ina[8]);
	__m128 r3 = _mm_load_ps(&ina[12]);
	const __m128 ir = _mm_load_ps(inv);
#else
	__m128 r0 = _mm_loadu_ps(&ina[0]);
	__m128 r1 = _mm_loadu_ps(&ina[4]);
	__m128 r2 = _mm_loadu_ps(&ina[8]);
	__m128 r3 = _mm_loadu_ps(&ina[12]);
	const __m128 ir = _mm_loadu_ps(inv);
#endif
/* column major curses .. */
	_MM_TRANSPOSE4_PS(r0, r1, r2, r3);

	__m128 m0 = _mm_mul_ps(r0, ir);
	__m128 m1 = _mm_mul_ps(r1, ir);
	__m128 m2 = _mm_mul_ps(r2, ir);
	__m128 m3 = _mm_mul_ps(r3, ir);

	__m128 a1 = _mm_hadd_ps(m0, m1);
	__m128 a2 = _mm_hadd_ps(m2, m3);
	__m128 rs = _mm_hadd_ps(a1, a2);

#ifdef ARCAN_MATH_ALIGNED_SIMD
	_mm_store_ps(dst, rs);
#else
	_mm_storeu_ps(dst, rs);
#endif
}
Example #20
void BrushToolEdit::drawSmoothen(const QPoint &pt, float amount)
{
    Terrain *tip = tool->tip(pt).data();

    // compute affected rectangle
    QRect dirtyRect(pt, tip->size());
    dirtyRect = dirtyRect.intersected(QRect(QPoint(0, 0), terrain->size()));

    if (!dirtyRect.isValid()) {
        return;
    }

    edit->beginEdit(dirtyRect, terrain);

    QSize tSize = terrain->size();
    QSize tipSize = tip->size();
    QSize blurBufferSize(dirtyRect.width() + 6, dirtyRect.height() + 6);
    TemporaryBuffer<float> blurBuffer1(blurBufferSize.width() * blurBufferSize.height(), 4);
    TemporaryBuffer<float> blurBuffer2(blurBufferSize.width() * blurBufferSize.height(), 4);
    TemporaryBuffer<float> tipBuffer(blurBufferSize.width() * blurBufferSize.height(), 4);

    for (int y = 0; y < blurBufferSize.height(); ++y) {
        int cy = y + dirtyRect.top() - 3;
        cy = std::max(std::min(cy, tSize.height() - 1), 0);
        for (int x = 0; x < blurBufferSize.width(); ++x) {
            int cx = x + dirtyRect.left() - 3;
            cx = std::max(std::min(cx, tSize.width() - 1), 0);
            blurBuffer1[x + y * blurBufferSize.width()] =
                    terrain->landform(cx, cy);
        }
    }
    for (int y = 0; y < blurBufferSize.height(); ++y) {
        int cy = y + dirtyRect.top() - 3;
        int ty = cy - pt.y();
        if (ty >= 0 && ty < tipSize.height()) {
            for (int x = 0; x < blurBufferSize.width(); ++x) {
                int cx = x + dirtyRect.left() - 3;
                int tx = cx - pt.x();
                tipBuffer[x + y * blurBufferSize.width()] =
                        tx >= 0 && tx < tipSize.width() ?
                            tip->landform(tx, ty) * amount :
                            0.f;
            }
        } else {
            std::fill(&tipBuffer[y * blurBufferSize.width()],
                    &tipBuffer[(y + 1) * blurBufferSize.width()], 0.f);
        }
    }

    // apply horizontal blur
    for (int y = 0; y < blurBufferSize.height(); ++y) {
        float *inBuf = blurBuffer1 + y * blurBufferSize.width();
        float *outBuf = blurBuffer2 + y * blurBufferSize.width();
        float *varBuf = tipBuffer + y * blurBufferSize.width();
        for (int x = 3; x < blurBufferSize.width() - 3; ++x) {
            float variance = varBuf[x];
            __m128 kernel = globalGaussianKernelTable.fetch(variance);

            // sample input
            __m128 p1 = _mm_loadu_ps(inBuf + x - 3);
            __m128 p2 = _mm_loadu_ps(inBuf + x);
            p1 = _mm_shuffle_ps(p1, p1, _MM_SHUFFLE(0, 1, 2, 3));
            auto p = _mm_add_ps(p1, p2);

            // apply kernel
            p = _mm_mul_ps(p, kernel);
            p = _mm_hadd_ps(p, p);
            p = _mm_hadd_ps(p, p);

            // write
            _mm_store_ss(outBuf + x, p);
        }
    }

    // apply vertical blur
    for (int y = 3; y < blurBufferSize.height() - 3; ++y) {
        float *inBuf = blurBuffer2 + y * blurBufferSize.width();
        float *outBuf = blurBuffer1 + y * blurBufferSize.width();
        float *varBuf = tipBuffer + y * blurBufferSize.width();
        for (int x = 0; x < blurBufferSize.width() - 3; x += 4) {
            // fetch kernel
            __m128 kernel1 = globalGaussianKernelTable.fetch(varBuf[x]);
            __m128 kernel2 = globalGaussianKernelTable.fetch(varBuf[x + 1]);
            __m128 kernel3 = globalGaussianKernelTable.fetch(varBuf[x + 2]);
            __m128 kernel4 = globalGaussianKernelTable.fetch(varBuf[x + 3]);
            _MM_TRANSPOSE4_PS(kernel1, kernel2, kernel3, kernel4);

            // load input
            __m128 p1 = _mm_loadu_ps(inBuf + x);
            p1 = _mm_add_ps(p1, p1);
            __m128 p2 = _mm_loadu_ps(inBuf + x - blurBufferSize.width());
            p2 = _mm_add_ps(p2, _mm_loadu_ps(inBuf + x + blurBufferSize.width()));
            __m128 p3 = _mm_loadu_ps(inBuf + x - blurBufferSize.width() * 2);
            p3 = _mm_add_ps(p3, _mm_loadu_ps(inBuf + x + blurBufferSize.width() * 2));
            __m128 p4 = _mm_loadu_ps(inBuf + x - blurBufferSize.width() * 3);
            p4 = _mm_add_ps(p4, _mm_loadu_ps(inBuf + x + blurBufferSize.width() * 3));

            // apply kernel
            p1 = _mm_mul_ps(p1, kernel1);
            p2 = _mm_mul_ps(p2, kernel2);
            p3 = _mm_mul_ps(p3, kernel3);
            p4 = _mm_mul_ps(p4, kernel4);
            p1 = _mm_add_ps(p1, p2);
            p3 = _mm_add_ps(p3, p4);
            auto p = _mm_add_ps(p1, p3);

            // store
            _mm_storeu_ps(outBuf + x, p);
        }
    }

    for (int y = 0; y < dirtyRect.height(); ++y) {
        float *inBuf = blurBuffer1 + (y + 3) * blurBufferSize.width() + 3;
        for (int x = 0; x < dirtyRect.width(); ++x) {
            int cx = x + dirtyRect.left();
            int cy = y + dirtyRect.top();
            terrain->landform(cx, cy) = inBuf[x];
        }
    }
    edit->endEdit(terrain);
}
Example #21
template <typename T>
void sINLINE RNMarchingCubesBase<T>::func(const sVector31 &v,typename T::FieldType &pot,const funcinfo &fi)
{
  __m128 vx = _mm_load_ps1(&v.x);
  __m128 vy = _mm_load_ps1(&v.y);
  __m128 vz = _mm_load_ps1(&v.z);
  __m128 po = _mm_setzero_ps();           // p
  __m128 nx = _mm_setzero_ps();
  __m128 ny = _mm_setzero_ps();
  __m128 nz = _mm_setzero_ps();
  __m128 akkur = _mm_setzero_ps();
  __m128 akkug = _mm_setzero_ps();
  __m128 akkub = _mm_setzero_ps();
  __m128 akkua = _mm_setzero_ps();
  __m128 s255 = _mm_set_ps1(255.0f);
  
  sBool good = 0;

  for(sInt i=0;i<fi.pn4;i++)
  {
    const typename T::SimdType *part = fi.parts4 + i;

    __m128 dx = _mm_sub_ps(vx,part->x);
    __m128 dy = _mm_sub_ps(vy,part->y);
    __m128 dz = _mm_sub_ps(vz,part->z);
    __m128 ddx = _mm_mul_ps(dx,dx);
    __m128 ddy = _mm_mul_ps(dy,dy);
    __m128 ddz = _mm_mul_ps(dz,dz);
    __m128 pp = _mm_add_ps(_mm_add_ps(ddx,ddy),ddz);

    if(_mm_movemask_ps(_mm_cmple_ps(pp,fi.treshf4))!=0)
    {
      __m128 pp2 = _mm_sub_ps(_mm_div_ps(fi.one,pp),fi.tresh4);
      __m128 pp3 = _mm_max_ps(pp2,_mm_setzero_ps());
      po = _mm_add_ps(po,pp3);                  // p = p+pp;
      __m128 pp4 = _mm_mul_ps(pp3,pp3);         // pp*pp
      nx = _mm_add_ps(nx,_mm_mul_ps(pp4,dx));   // n += d*(pp*pp)
      ny = _mm_add_ps(ny,_mm_mul_ps(pp4,dy));
      nz = _mm_add_ps(nz,_mm_mul_ps(pp4,dz));
      if(T::Color==1)
      {
        akkur = _mm_add_ps(akkur,_mm_mul_ps(pp3,part->cr));
        akkug = _mm_add_ps(akkug,_mm_mul_ps(pp3,part->cg));
        akkub = _mm_add_ps(akkub,_mm_mul_ps(pp3,part->cb));
        good = 1;
      }
    }
  }

  sF32 p = 0;
  sVector30 n;
  
  _MM_TRANSPOSE4_PS(po,nx,ny,nz);
  __m128 r = _mm_add_ps(_mm_add_ps(_mm_add_ps(nx,ny),nz),po);
  n.x = r.m128_f32[1];
  n.y = r.m128_f32[2];
  n.z = r.m128_f32[3];
  p = r.m128_f32[0];

  if(p==0)
    n.Init(0,0,0);
  else
    n.UnitFast();
  pot.x = n.x;
  pot.y = n.y;
  pot.z = n.z;
  pot.w = p-fi.iso;
  if(T::Color)
  {
    if(good)
    {
      r = _mm_mul_ss(s255,_mm_rcp_ss(r));
  //    r = _mm_rcp_ss(r);
      _MM_TRANSPOSE4_PS(akkub,akkug,akkur,akkua);
      __m128 r2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(akkur,akkug),akkub),akkua);

      r2 = _mm_mul_ps(r2,_mm_shuffle_ps(r,r,0x00));
      __m128i r3 = _mm_cvtps_epi32(r2);
      r3 = _mm_packs_epi32(r3,r3);
      __m128i r4 = _mm_packus_epi16(r3,r3);
      pot.c = r4.m128i_u32[0]|0xff000000;
    }
    else
    {
      pot.c = 0;
    }
  }
}
Example #22
void mat4::makeTranspose() {
    _MM_TRANSPOSE4_PS(m[0], m[1], m[2], m[3]);
}
Example #23
kw_mat4 kw_transpose(kw_mat4 m){
    kw_mat4 result = m;
    _MM_TRANSPOSE4_PS(result.simd[0], result.simd[1], result.simd[2], result.simd[3]);
    return result;
}
Example #24
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
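Example #25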
//-------------------------------------------------------------------------------
// For each tile go through all the bins and process all the triangles in it.
// Rasterize each triangle to the CPU depth buffer. 
//-------------------------------------------------------------------------------
void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_setr_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; 

	// Based on TaskId determine which tile to process
	UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS;
    UINT tileX = tileId % screenWidthInTiles;
    UINT tileY = tileId / screenWidthInTiles;

    int tileStartX = tileX * TILE_WIDTH_IN_PIXELS;
	int tileEndX   = tileStartX + TILE_WIDTH_IN_PIXELS - 1;
	
	int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS;
	int tileEndY   = tileStartY + TILE_HEIGHT_IN_PIXELS - 1;

	ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx);

	UINT bin = 0;
	UINT binIndex = 0;
	UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX;
	UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX;
	UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];

	__m128 gatherBuf[4][3];
	bool done = false;
	bool allBinsEmpty = true;
	mNumRasterizedTris[idx][tileId] = numTrisInBin;
	while(!done)
	{
		// Loop through all the bins and process 4 binned triangles at a time
		UINT ii;
		int numSimdTris = 0;
		for(ii = 0; ii < SSE; ii++)
		{
			while(numTrisInBin <= 0)
			{
				 // This bin is empty.  Move to next bin.
				if(++bin >= 1)
				{
					break;
				}
				numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];
				mNumRasterizedTris[idx][tileId] += numTrisInBin;
				binIndex = 0;
			}
			if(!numTrisInBin)
			{
				break; // No more tris in the bins
			}
			USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx);
			allBinsEmpty = false;
			numSimdTris++; 

			++binIndex;
			--numTrisInBin;
		}
		done = bin >= NUM_XFORMVERTS_TASKS;
		
		if(allBinsEmpty)
		{
			return;
		}

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		__m128 Z[3];
		for(int i = 0; i < 3; i++)
		{
			__m128 v0 = gatherBuf[0][i];
			__m128 v1 = gatherBuf[1][i];
			__m128 v2 = gatherBuf[2][i];
			__m128 v3 = gatherBuf[3][i];

  			// transpose into SoA layout
			_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
			fxPtX[i] = _mm_cvtps_epi32(v0);
			fxPtY[i] = _mm_cvtps_epi32(v1);
			Z[i] = v2;
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea);

		// Use bounding box traversal strategy to determine which pixels to rasterize 
		__m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX));

		__m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < numSimdTris; lane++)
        {
			// Extract this triangle's properties from the SIMD versions
            __m128 zz[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]);
			}
									
			int startXx = startX.m128i_i32[lane];
			int endXx	= endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy	= endY.m128i_i32[lane];
		
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			// Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better performance
			int rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane]));
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane]));
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane]));

			__m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row);
			__m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row);
			__m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			__m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											rowIdx += 2 * SCREENW,
											sum0Row = _mm_add_epi32(sum0Row, bb0Inc),
											sum1Row = _mm_add_epi32(sum1Row, bb1Inc),
											sum2Row = _mm_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m128i alpha = sum0Row;
				__m128i beta = sum1Row;
				__m128i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m128 depth = zz[0];
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

				for(int c = startXx; c < endXx; c += 2,
												index += 4,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc),
												depth = _mm_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama);
					
					__m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]);
					__m128 mergedDepth = _mm_max_ps(depth, previousDepthValue);
					__m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask));
					_mm_store_ps(&pDepthBuffer[index], finalDepth);
				}//for each column											
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}
Example #26
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax)
{
  assert(vertices.size() % 16 == 0);

  // Simple k-means clustering by normal direction to improve backface culling efficiency
  std::vector<__m128> quadNormals;
  for (auto i = 0; i < vertices.size(); i += 4)
  {
    auto v0 = vertices[i + 0];
    auto v1 = vertices[i + 1];
    auto v2 = vertices[i + 2];
    auto v3 = vertices[i + 3];

    quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3))));
  }

  std::vector<__m128> centroids;
  std::vector<uint32_t> centroidAssignment;
  centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f));

  centroidAssignment.resize(vertices.size() / 4);

  bool anyChanged = true;
  for (int iter = 0; iter < 10 && anyChanged; ++iter)
  {
    anyChanged = false;

    for (auto j = 0; j < quadNormals.size(); ++j)
    {
      __m128 normal = quadNormals[j];

      __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
      int bestCentroid = -1;
      for (int k = 0; k < centroids.size(); ++k)
      {
        __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
        if (_mm_comige_ss(distance, bestDistance))
        {
          bestDistance = distance;
          bestCentroid = k;
        }
      }

      if (centroidAssignment[j] != bestCentroid)
      {
        centroidAssignment[j] = bestCentroid;
        anyChanged = true;
      }
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = _mm_setzero_ps();
    }

    for (int j = 0; j < quadNormals.size(); ++j)
    {
      int k = centroidAssignment[j];

      centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]);
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = normalize(centroids[k]);
    }
  }

  std::vector<__m128> orderedVertices;
  for (int k = 0; k < centroids.size(); ++k)
  {
    for (int j = 0; j < vertices.size() / 4; ++j)
    {
      if (centroidAssignment[j] == k)
      {
        orderedVertices.push_back(vertices[4 * j + 0]);
        orderedVertices.push_back(vertices[4 * j + 1]);
        orderedVertices.push_back(vertices[4 * j + 2]);
        orderedVertices.push_back(vertices[4 * j + 3]);
      }
    }
  }

  auto occluder = std::make_unique<Occluder>();

  __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin));

  __m128 scalingX = _mm_set1_ps(2047.0f);
  __m128 scalingY = _mm_set1_ps(2047.0f);
  __m128 scalingZ = _mm_set1_ps(1023.0f);

  __m128 half = _mm_set1_ps(0.5f);

  for (size_t i = 0; i < orderedVertices.size(); i += 16)
  {
    for (auto j = 0; j < 4; ++j)
    {
      // Transform into [0,1] space relative to bounding box
      __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
      __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
      __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
      __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);

      // Transpose into [xxxx][yyyy][zzzz][wwww]
      _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

      // Scale and truncate to int
      v0 = _mm_fmadd_ps(v0, scalingX, half);
      v1 = _mm_fmadd_ps(v1, scalingY, half);
      v2 = _mm_fmadd_ps(v2, scalingZ, half);

      __m128i X = _mm_cvttps_epi32(v0);
      __m128i Y = _mm_cvttps_epi32(v1);
      __m128i Z = _mm_cvttps_epi32(v2);

      // Pack to 11/11/10 format
      __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));

      occluder->m_vertexData.push_back(XYZ);
    }
  }

  occluder->m_refMin = refMin;
  occluder->m_refMax = refMax;

  __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity());
  __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity());

  for (size_t i = 0; i < orderedVertices.size(); ++i)
  {
    min = _mm_min_ps(vertices[i], min);
    max = _mm_max_ps(vertices[i], max);
  }

  // Set W = 1 - this is expected by frustum culling code
  min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000);
  max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000);

  occluder->m_boundsMin = min;
  occluder->m_boundsMax = max;

  occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f));

  return occluder;
}
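The packing step above places X in bits 31..21, Y in bits 20..10 and Z in bits 9..0 of each 32-bit lane. A sketch of the corresponding unpack, assuming a consumer simply wants the three integer fields back (the helper name is hypothetical; the rasterizer that actually consumes m_vertexData may decode differently):

#include <emmintrin.h>

// Hypothetical decode of the 11/11/10 layout produced by Occluder::bake.
static inline void unpack_11_11_10(__m128i xyz, __m128i* x, __m128i* y, __m128i* z)
{
	*x = _mm_srli_epi32(xyz, 21);                                       // top 11 bits
	*y = _mm_and_si128(_mm_srli_epi32(xyz, 10), _mm_set1_epi32(0x7FF)); // middle 11 bits
	*z = _mm_and_si128(xyz, _mm_set1_epi32(0x3FF));                     // low 10 bits
}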
Example #27
			out.row_[2].x_ = row_[2].x_ * row_[0].x_ + row_[2].y_ * row_[1].x_ + row_[2].z_ * row_[2].x_ + row_[2].w_ * row_[3].x_;
			out.row_[2].y_ = row_[2].x_ * row_[0].y_ + row_[2].y_ * row_[1].y_ + row_[2].z_ * row_[2].y_ + row_[2].w_ * row_[3].y_;
			out.row_[2].z_ = row_[2].x_ * row_[0].z_ + row_[2].y_ * row_[1].z_ + row_[2].z_ * row_[2].z_ + row_[2].w_ * row_[3].z_;
			out.row_[2].w_ = row_[2].x_ * row_[0].w_ + row_[2].y_ * row_[1].w_ + row_[2].z_ * row_[2].w_ + row_[2].w_ * row_[3].w_;
			out.row_[3].x_ = row_[3].x_ * row_[0].x_ + row_[3].y_ * row_[1].x_ + row_[3].z_ * row_[2].x_ + row_[3].w_ * row_[3].x_;
			out.row_[3].y_ = row_[3].x_ * row_[0].y_ + row_[3].y_ * row_[1].y_ + row_[3].z_ * row_[2].y_ + row_[3].w_ * row_[3].y_;
			out.row_[3].z_ = row_[3].x_ * row_[0].z_ + row_[3].y_ * row_[1].z_ + row_[3].z_ * row_[2].z_ + row_[3].w_ * row_[3].z_;
			out.row_[3].w_ = row_[3].x_ * row_[0].w_ + row_[3].y_ * row_[1].w_ + row_[3].z_ * row_[2].w_ + row_[3].w_ * row_[3].w_;
			*this = out;
			return *this;
#		endif
		}

		inline void Matrix4::Transpose()
		{
			_MM_TRANSPOSE4_PS(row_[0], row_[1], row_[2], row_[3]);
		}

		Matrix4 MatrixOrthographicLH(const float & ViewWidth, const float & ViewHeight, const float & NearZ, const float & FarZ)
		{
			Matrix4 M;
			float fRange = 1.0f / (FarZ - NearZ);
			// Note: This is recorded on the stack
			Vector4 rMem = { 2.0f / ViewWidth,	2.0f / ViewHeight, fRange, -fRange * NearZ };
			// Copy from memory to SSE register
			Vector4 vValues = rMem;
			Vector4 vTemp = _mm_setzero_ps();
			// Copy x only
			vTemp = _mm_move_ss(vTemp, vValues);
			// 2.0f / ViewWidth,0,0,0
			M.row_[0] = vTemp;