inline matrix_t transpose(const matrix_t& matrix)
{
    matrix_t result = matrix;
    _MM_TRANSPOSE4_PS(result[0], result[1], result[2], result[3]);
    return result;
}
void dct4x4_1d_llm_fwd_sse_and_transpose(float* s, float* d) // 8 add, 4 mul
{
    const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
    const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

    __m128 s0 = _mm_load_ps(s); s += 4;
    __m128 s1 = _mm_load_ps(s); s += 4;
    __m128 s2 = _mm_load_ps(s); s += 4;
    __m128 s3 = _mm_load_ps(s);

    __m128 p03 = _mm_add_ps(s0, s3);
    __m128 p12 = _mm_add_ps(s1, s2);
    __m128 m03 = _mm_sub_ps(s0, s3);
    __m128 m12 = _mm_sub_ps(s1, s2);

    s0 = _mm_add_ps(p03, p12);
    s1 = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
    s2 = _mm_sub_ps(p03, p12);
    s3 = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));

    _MM_TRANSPOSE4_PS(s0, s1, s2, s3);

    _mm_store_ps(d,      s0);
    _mm_store_ps(d + 4,  s1);
    _mm_store_ps(d + 8,  s2);
    _mm_store_ps(d + 12, s3);
}
void dct4x4_1d_llm_inv_sse_and_transpose(float* s, float* d)
{
    const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
    const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

    __m128 s0 = _mm_load_ps(s); s += 4;
    __m128 s1 = _mm_load_ps(s); s += 4;
    __m128 s2 = _mm_load_ps(s); s += 4;
    __m128 s3 = _mm_load_ps(s);

    __m128 t10 = _mm_add_ps(s0, s2);
    __m128 t12 = _mm_sub_ps(s0, s2);
    __m128 t0 = _mm_add_ps(_mm_mul_ps(c2, s1), _mm_mul_ps(c6, s3));
    __m128 t2 = _mm_sub_ps(_mm_mul_ps(c6, s1), _mm_mul_ps(c2, s3));

    s0 = _mm_add_ps(t10, t0);
    s1 = _mm_add_ps(t12, t2);
    s2 = _mm_sub_ps(t12, t2);
    s3 = _mm_sub_ps(t10, t0);

    _MM_TRANSPOSE4_PS(s0, s1, s2, s3);

    _mm_store_ps(d,      s0);
    _mm_store_ps(d + 4,  s1);
    _mm_store_ps(d + 8,  s2);
    _mm_store_ps(d + 12, s3);
}
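// For readability, a minimal scalar sketch (not part of the original sources) of the
// same forward 1-D LLM butterfly that the SSE routine above applies to four columns
// at once; the function name is illustrative only.
static void dct4_1d_llm_fwd_scalar(const float s[4], float d[4])
{
    const float c2 = 1.30656f;  // cos(CV_PI*2/16.0)*sqrt(2)
    const float c6 = 0.541196f; // cos(CV_PI*6/16.0)*sqrt(2)
    float p03 = s[0] + s[3], p12 = s[1] + s[2];
    float m03 = s[0] - s[3], m12 = s[1] - s[2];
    d[0] = p03 + p12;
    d[1] = c2 * m03 + c6 * m12;
    d[2] = p03 - p12;
    d[3] = c6 * m03 - c2 * m12;
}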
/** compose homogeneous transforms */
inline RigidTransform<float>& operator *= (RigidTransform<float>& lhs, const RigidTransform<float>& rhs)
{
#ifdef SIMPLE_GL_USE_SSE4
    // load transposed matrix
    __m128 matrix0 = _mm_load_ps(&rhs[0].x);
    __m128 matrix1 = _mm_load_ps(&rhs[1].x);
    __m128 matrix2 = _mm_load_ps(&rhs[2].x);
    __m128 matrix3 = _mm_load_ps(&rhs[3].x);
    _MM_TRANSPOSE4_PS(matrix0, matrix1, matrix2, matrix3);

    __m128 row, dotProd;
#define __CALC_ROW(i)\
    row = lhs[i].m128;\
    lhs[i].m128 = _mm_dp_ps(row, matrix0, 0xEE);\
    dotProd = _mm_dp_ps(row, matrix1, 0xEE);\
    lhs[i].m128 = _mm_blend_ps( lhs[i].m128, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );\
    dotProd = _mm_dp_ps(row, matrix2, 0xEE);\
    lhs[i].m128 = _mm_blend_ps( lhs[i].m128, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );\
    dotProd = _mm_dp_ps(row, matrix3, 0xEE);\
    lhs[i].m128 = _mm_blend_ps( lhs[i].m128, dotProd, _MM_SHUFFLE(0, 0, 0, 1) );

    // calculate
    __CALC_ROW(0)
    __CALC_ROW(1)
    __CALC_ROW(2)
    // __CALC_ROW(3) - 3rd row should be (0, 0, 0, 1)
#undef __CALC_ROW

    return lhs;
#else // SSE2
    __m128 row0;
    __m128 row1;
    __m128 row2;
    __m128 row3;
#define __CALC_ROW(i)\
    row0 = _mm_shuffle_ps( lhs[i].m128, lhs[i].m128, _MM_SHUFFLE(0, 0, 0, 0) );\
    row1 = _mm_shuffle_ps( lhs[i].m128, lhs[i].m128, _MM_SHUFFLE(1, 1, 1, 1) );\
    row2 = _mm_shuffle_ps( lhs[i].m128, lhs[i].m128, _MM_SHUFFLE(2, 2, 2, 2) );\
    row3 = _mm_shuffle_ps( lhs[i].m128, lhs[i].m128, _MM_SHUFFLE(3, 3, 3, 3) );\
    \
    row0 = _mm_mul_ps(row0, rhs[0].m128);\
    row1 = _mm_mul_ps(row1, rhs[1].m128);\
    row2 = _mm_mul_ps(row2, rhs[2].m128);\
    row3 = _mm_mul_ps(row3, rhs[3].m128);\
    \
    lhs[i].m128 = _mm_add_ps(row0, row1);\
    lhs[i].m128 = _mm_add_ps(lhs[i].m128, row2);\
    lhs[i].m128 = _mm_add_ps(lhs[i].m128, row3);

    // calculate
    __CALC_ROW(0)
    __CALC_ROW(1)
    __CALC_ROW(2)
    //__CALC_ROW(3) - 3rd row should be (0, 0, 0, 1)
#undef __CALC_ROW

    return lhs;
#endif
}
vec4 mat4::operator*(const vec4& b) const
{
    __m128 r0 = _mm_mul_ps(m[0], b.v);
    __m128 r1 = _mm_mul_ps(m[1], b.v);
    __m128 r2 = _mm_mul_ps(m[2], b.v);
    __m128 r3 = _mm_mul_ps(m[3], b.v);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    return _mm_add_ps(r0, _mm_add_ps(r1, _mm_add_ps(r2, r3)));
}
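// Note (added explanation): multiplying each row element-wise by b and then
// transposing turns the four partial-product vectors into column vectors, so the
// final adds leave dot(row_i, b) in lane i - i.e. a row-major matrix times
// column-vector product without any horizontal-add instructions.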
static void transpose_block(float *in_data, float *out_data)
{
    int i;
    __m128 row1, row2, row3, row4;

    for (i = 0; i < 8; i += 4)
    {
        /* Transpose one 4x8 matrix at a time by using _MM_TRANSPOSE4_PS
         * on two 4x4 matrixes
         * First iteration: upper left and lower left
         * Second iteration: upper right and lower right
         */

        // Transpose the upper 4x4 matrix
        row1 = _mm_load_ps(in_data + i);
        row2 = _mm_load_ps(in_data + 8 + i);
        row3 = _mm_load_ps(in_data + 16 + i);
        row4 = _mm_load_ps(in_data + 24 + i);
        _MM_TRANSPOSE4_PS(row1, row2, row3, row4);

        // Store the first four elements of each row of the transposed 8x8 matrix
        _mm_store_ps(out_data + i * 8, row1);
        _mm_store_ps(out_data + (i + 1) * 8, row2);
        _mm_store_ps(out_data + (i + 2) * 8, row3);
        _mm_store_ps(out_data + (i + 3) * 8, row4);

        // Transpose the lower 4x4 matrix
        row1 = _mm_load_ps(in_data + 32 + i);
        row2 = _mm_load_ps(in_data + 40 + i);
        row3 = _mm_load_ps(in_data + 48 + i);
        row4 = _mm_load_ps(in_data + 56 + i);
        _MM_TRANSPOSE4_PS(row1, row2, row3, row4);

        // Store the last four elements of each row of the transposed 8x8 matrix
        _mm_store_ps(out_data + i * 8 + 4, row1);
        _mm_store_ps(out_data + (i + 1) * 8 + 4, row2);
        _mm_store_ps(out_data + (i + 2) * 8 + 4, row3);
        _mm_store_ps(out_data + (i + 3) * 8 + 4, row4);
    }
}
Mat44 Mat44::Transpose() const
{
    __m128 mat1rows[] = {
        _mm_load_ps(mat),
        _mm_load_ps(mat + 4),
        _mm_load_ps(mat + 8),
        _mm_load_ps(mat + 12)
    };
    _MM_TRANSPOSE4_PS(mat1rows[0], mat1rows[1], mat1rows[2], mat1rows[3]);
    Mat44 res;
    _mm_store_ps(res.mat,      mat1rows[0]);
    _mm_store_ps(res.mat + 4,  mat1rows[1]);
    _mm_store_ps(res.mat + 8,  mat1rows[2]);
    _mm_store_ps(res.mat + 12, mat1rows[3]);
    return res;
}
template <>
void transpose4x4<float, float>(const float* src, size_t lda, float* dst, size_t ldb)
{
    __m128 row0, row1, row2, row3;
    row0 = _mm_loadu_ps(src);
    row1 = _mm_loadu_ps(src + lda);
    row2 = _mm_loadu_ps(src + 2*lda);
    row3 = _mm_loadu_ps(src + 3*lda);
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
    _mm_storeu_ps(dst, row0);
    _mm_storeu_ps(dst + ldb, row1);
    _mm_storeu_ps(dst + 2*ldb, row2);
    _mm_storeu_ps(dst + 3*ldb, row3);
}
inline void transpose_4x4block_SSE_32(float* A, float* B, const size_t lda, const size_t ldb)
{
    __m128 row1 = _mm_load_ps(&A[0*ldb]);
    __m128 row2 = _mm_load_ps(&A[1*ldb]);
    __m128 row3 = _mm_load_ps(&A[2*ldb]);
    __m128 row4 = _mm_load_ps(&A[3*ldb]);
    _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
    _mm_store_ps(&B[0*lda], row1);
    _mm_store_ps(&B[1*lda], row2);
    _mm_store_ps(&B[2*lda], row3);
    _mm_store_ps(&B[3*lda], row4);
}
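// A hedged sketch (not part of the original source) of how such a 4x4 kernel is
// typically driven over a whole matrix: transpose an n x m float matrix A (row
// stride m) into an m x n matrix B (row stride n), with n and m multiples of 4 and
// both buffers 16-byte aligned, since the kernel uses aligned load/store. Following
// the kernel's convention above, ldb is the source stride and lda the destination
// stride; the driver name is illustrative.
static void transpose_blocked_SSE_32(float* A, float* B, const size_t n, const size_t m)
{
    for (size_t i = 0; i < n; i += 4)
        for (size_t j = 0; j < m; j += 4)
            transpose_4x4block_SSE_32(&A[i*m + j], &B[j*n + i], /*lda=*/n, /*ldb=*/m);
}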
template <typename T>
void Transpose(void)
{
#ifdef __SSE_AVAIL__
    _MM_TRANSPOSE4_PS(ssem1, ssem2, ssem3, ssem4);
#else
    Matrix4x4<T> m = *this;
    Mat[0]  = m.Mat[0];  Mat[1]  = m.Mat[4];  Mat[2]  = m.Mat[8];   Mat[3]  = m.Mat[12];
    Mat[4]  = m.Mat[1];  Mat[5]  = m.Mat[5];  Mat[6]  = m.Mat[9];   Mat[7]  = m.Mat[13];
    Mat[8]  = m.Mat[2];  Mat[9]  = m.Mat[6];  Mat[10] = m.Mat[10];  Mat[11] = m.Mat[14];
    Mat[12] = m.Mat[3];  Mat[13] = m.Mat[7];  Mat[14] = m.Mat[11];  Mat[15] = m.Mat[15];
#endif
};
static inline void transpose_sse (gfloat *src, gfloat *dst, const int width, const int height)
{
    __m128 row1 = _mm_loadu_ps (src);
    __m128 row2 = _mm_loadu_ps (src + height);
    __m128 row3 = _mm_loadu_ps (src + 2 * height);
    __m128 row4 = _mm_loadu_ps (src + 3 * height);
    _MM_TRANSPOSE4_PS (row1, row2, row3, row4);
    _mm_storeu_ps (dst, row1);
    _mm_storeu_ps (dst + width, row2);
    _mm_storeu_ps (dst + 2 * width, row3);
    _mm_storeu_ps (dst + 3 * width, row4);
}
void Matrix4x4<float>::transpose()
{
    __m128 row0 = m_rows[0].asSSE();
    __m128 row1 = m_rows[1].asSSE();
    __m128 row2 = m_rows[2].asSSE();
    __m128 row3 = m_rows[3].asSSE();
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
    m_rows[0] = row0;
    m_rows[1] = row1;
    m_rows[2] = row2;
    m_rows[3] = row3;
}
//Binning
static void gather4Simd(VecF32Soa dest[3], VecF32 vertices[12])
{
    for(uint32 i = 0; i < 3; ++i)
    {
        __m128 v0 = vertices[i].simd;     //x0, y0, z0, w0
        __m128 v1 = vertices[3 + i].simd; //x1, y1, z1, w1
        __m128 v2 = vertices[6 + i].simd; //x2, y2, z2, w2
        __m128 v3 = vertices[9 + i].simd; //x3, y3, z3, w3
        _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
        dest[i].x = VecF32(v0);
        dest[i].y = VecF32(v1);
        dest[i].z = VecF32(v2);
        dest[i].w = VecF32(v3);
    }
}
// It's 20% faster but wasn't the 2x speed up I was hoping for.
// Will have to investigate later.
mat4 mat4::operator*(const mat4& b) const
{
    mat4 temp = b;
    temp.makeTranspose();
    __m128 r[4];
    for(int i = 0; i < 4; ++i)
    {
        __m128 c0 = _mm_mul_ps(m[i], temp.m[0]);
        __m128 c1 = _mm_mul_ps(m[i], temp.m[1]);
        __m128 c2 = _mm_mul_ps(m[i], temp.m[2]);
        __m128 c3 = _mm_mul_ps(m[i], temp.m[3]);
        _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
        r[i] = _mm_add_ps(c0, _mm_add_ps(c1, _mm_add_ps(c2, c3)));
    }
    return r;
}
inline __m128 convolute_loop(__m128 bspline, __m128 m[4])
{
#if defined(SSE4)
    return _mm_set_ps(
        dot_product(bspline, m[0]),
        dot_product(bspline, m[1]),
        dot_product(bspline, m[2]),
        dot_product(bspline, m[3]));
#else
    _MM_TRANSPOSE4_PS(m[3], m[2], m[1], m[0]);
    return _mm_add_ps(
        _mm_add_ps(
            _mm_add_ps(
                _mm_mul_ps(m[0], _mm_shuffle_ps(bspline, bspline, _MM_SHUFFLE(3,3,3,3))),
                _mm_mul_ps(m[1], _mm_shuffle_ps(bspline, bspline, _MM_SHUFFLE(2,2,2,2)))),
            _mm_mul_ps(m[2], _mm_shuffle_ps(bspline, bspline, _MM_SHUFFLE(1,1,1,1)))),
        _mm_mul_ps(m[3], _mm_shuffle_ps(bspline, bspline, _MM_SHUFFLE(0,0,0,0))));
#endif
}
void AABBTransformAsAABB_SIMD(AABB &aabb, const float4x4 &m)
{
    simd4f minPt = aabb.MinPoint_SSE();
    simd4f maxPt = aabb.MaxPoint_SSE();
    simd4f centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
    simd4f halfSize = _mm_sub_ps(centerPoint, minPt);
    simd4f newCenter = mat4x4_mul_vec4(m.row, centerPoint);
    simd4f x = abs_ps(_mm_mul_ps(m.row[0], halfSize));
    simd4f y = abs_ps(_mm_mul_ps(m.row[1], halfSize));
    simd4f z = abs_ps(_mm_mul_ps(m.row[2], halfSize));
    simd4f w = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(x, y, z, w); // Contains 2x unpacklo's, 2x unpackhi's, 2x movelh's and 2x movehl's. (or 8 shuffles, depending on the compiler)
    simd4f newDir = _mm_add_ps(_mm_add_ps(x, y), _mm_add_ps(z, w));
    aabb.MinPoint_SSE() = _mm_sub_ps(newCenter, newDir);
    aabb.MaxPoint_SSE() = _mm_add_ps(newCenter, newDir);
}
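// For reference, a sketch of the unpack/movelh/movehl expansion that the comment
// above refers to; it closely mirrors the GCC xmmintrin.h definition of
// _MM_TRANSPOSE4_PS and is shown only to illustrate the shuffle count (the macro
// name is illustrative, not part of the original source).
#define TRANSPOSE4_PS_SKETCH(row0, row1, row2, row3)                    \
    do {                                                                \
        __m128 _t0 = _mm_unpacklo_ps((row0), (row1)); /* a0 b0 a1 b1 */ \
        __m128 _t1 = _mm_unpacklo_ps((row2), (row3)); /* c0 d0 c1 d1 */ \
        __m128 _t2 = _mm_unpackhi_ps((row0), (row1)); /* a2 b2 a3 b3 */ \
        __m128 _t3 = _mm_unpackhi_ps((row2), (row3)); /* c2 d2 c3 d3 */ \
        (row0) = _mm_movelh_ps(_t0, _t1);             /* a0 b0 c0 d0 */ \
        (row1) = _mm_movehl_ps(_t1, _t0);             /* a1 b1 c1 d1 */ \
        (row2) = _mm_movelh_ps(_t2, _t3);             /* a2 b2 c2 d2 */ \
        (row3) = _mm_movehl_ps(_t3, _t2);             /* a3 b3 c3 d3 */ \
    } while (0)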
void TransformedAABBoxSSE::Gather(vFloat4 pOut[3], UINT triId, const __m128 xformedPos[], UINT idx)
{
    for(int i = 0; i < 3; i++)
    {
        UINT ind0 = sBBIndexList[triId*3 + i + 0];
        UINT ind1 = sBBIndexList[triId*3 + i + 3];
        UINT ind2 = sBBIndexList[triId*3 + i + 6];
        UINT ind3 = sBBIndexList[triId*3 + i + 9];

        __m128 v0 = xformedPos[ind0]; // x0 y0 z0 w0
        __m128 v1 = xformedPos[ind1]; // x1 y1 z1 w1
        __m128 v2 = xformedPos[ind2]; // x2 y2 z2 w2
        __m128 v3 = xformedPos[ind3]; // x3 y3 z3 w3
        _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
        pOut[i].X = v0;
        pOut[i].Y = v1;
        pOut[i].Z = v2;
        pOut[i].W = v3;
    }
}
inline wg_v4sf Recognizer::local_distance4(float *s, float *t0, float *t1, float *t2, float *t3)
{
    wg_v4sf v_, v0, v1, v2, v3;
    v_.v = _mm_set_ps1(-0.0);
    v0.v = _mm_sub_ps(((wg_v4sf *)t0)->v, ((wg_v4sf *)s)->v);
    v0.v = _mm_andnot_ps(v_.v, v0.v); // absolute value
    v1.v = _mm_sub_ps(((wg_v4sf *)t1)->v, ((wg_v4sf *)s)->v);
    v1.v = _mm_andnot_ps(v_.v, v1.v); // absolute value
    v2.v = _mm_sub_ps(((wg_v4sf *)t2)->v, ((wg_v4sf *)s)->v);
    v2.v = _mm_andnot_ps(v_.v, v2.v); // absolute value
    v3.v = _mm_sub_ps(((wg_v4sf *)t3)->v, ((wg_v4sf *)s)->v);
    v3.v = _mm_andnot_ps(v_.v, v3.v); // absolute value
    // convert row vectors to column vectors
    _MM_TRANSPOSE4_PS(v0.v, v1.v, v2.v, v3.v);
    v3.v = _mm_add_ps(v3.v, v2.v);
    v3.v = _mm_add_ps(v3.v, v1.v);
    v3.v = _mm_add_ps(v3.v, v0.v);
    return v3;
}
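// Illustrative scalar equivalent (not from the original source): lane k of the
// vector returned above holds the L1 (city-block) distance between the 4-float
// sample s and template tk. Requires <math.h> / <cmath> for fabsf.
static inline float local_distance1(const float *s, const float *t)
{
    float d = 0.0f;
    for (int i = 0; i < 4; ++i)
        d += fabsf(t[i] - s[i]);
    return d;
}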
void mult_matrix_vecf(const float* ina, const float* const inv, float* dst)
{
#ifdef ARCAN_MATH_ALIGNED_SIMD
    assert((uintptr_t)dst % 16 == 0);
    assert((uintptr_t)ina % 16 == 0);
    assert((uintptr_t)inv % 16 == 0);
    __m128 r0 = _mm_load_ps(&ina[0]);
    __m128 r1 = _mm_load_ps(&ina[4]);
    __m128 r2 = _mm_load_ps(&ina[8]);
    __m128 r3 = _mm_load_ps(&ina[12]);
    const __m128 ir = _mm_load_ps(inv);
#else
    __m128 r0 = _mm_loadu_ps(&ina[0]);
    __m128 r1 = _mm_loadu_ps(&ina[4]);
    __m128 r2 = _mm_loadu_ps(&ina[8]);
    __m128 r3 = _mm_loadu_ps(&ina[12]);
    const __m128 ir = _mm_loadu_ps(inv);
#endif

    /* column major curses .. */
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);

    __m128 m0 = _mm_mul_ps(r0, ir);
    __m128 m1 = _mm_mul_ps(r1, ir);
    __m128 m2 = _mm_mul_ps(r2, ir);
    __m128 m3 = _mm_mul_ps(r3, ir);

    __m128 a1 = _mm_hadd_ps(m0, m1);
    __m128 a2 = _mm_hadd_ps(m2, m3);
    __m128 rs = _mm_hadd_ps(a1, a2);

#ifdef ARCAN_MATH_ALIGNED_SIMD
    _mm_store_ps(dst, rs);
#else
    _mm_storeu_ps(dst, rs);
#endif
}
void BrushToolEdit::drawSmoothen(const QPoint &pt, float amount) { Terrain *tip = tool->tip(pt).data(); // compute affected rectangle QRect dirtyRect(pt, tip->size()); dirtyRect = dirtyRect.intersected(QRect(QPoint(0, 0), terrain->size())); if (!dirtyRect.isValid()) { return; } edit->beginEdit(dirtyRect, terrain); QSize tSize = terrain->size(); QSize tipSize = tip->size(); QSize blurBufferSize(dirtyRect.width() + 6, dirtyRect.height() + 6); TemporaryBuffer<float> blurBuffer1(blurBufferSize.width() * blurBufferSize.height(), 4); TemporaryBuffer<float> blurBuffer2(blurBufferSize.width() * blurBufferSize.height(), 4); TemporaryBuffer<float> tipBuffer(blurBufferSize.width() * blurBufferSize.height(), 4); for (int y = 0; y < blurBufferSize.height(); ++y) { int cy = y + dirtyRect.top() - 3; cy = std::max(std::min(cy, tSize.height() - 1), 0); for (int x = 0; x < blurBufferSize.width(); ++x) { int cx = x + dirtyRect.left() - 3; cx = std::max(std::min(cx, tSize.width() - 1), 0); blurBuffer1[x + y * blurBufferSize.width()] = terrain->landform(cx, cy); } } for (int y = 0; y < blurBufferSize.height(); ++y) { int cy = y + dirtyRect.top() - 3; int ty = cy - pt.y(); if (ty >= 0 && ty < tipSize.height()) { for (int x = 0; x < blurBufferSize.width(); ++x) { int cx = x + dirtyRect.left() - 3; int tx = cx - pt.x(); tipBuffer[x + y * blurBufferSize.width()] = tx >= 0 && tx < tipSize.width() ? tip->landform(tx, ty) * amount : 0.f; } } else { std::fill(&tipBuffer[y * blurBufferSize.width()], &tipBuffer[(y + 1) * blurBufferSize.width()], 0.f); } } // apply horizontal blur for (int y = 0; y < blurBufferSize.height(); ++y) { float *inBuf = blurBuffer1 + y * blurBufferSize.width(); float *outBuf = blurBuffer2 + y * blurBufferSize.width(); float *varBuf = tipBuffer + y * blurBufferSize.width(); for (int x = 3; x < blurBufferSize.width() - 3; ++x) { float variance = varBuf[x]; __m128 kernel = globalGaussianKernelTable.fetch(variance); // sample input __m128 p1 = _mm_loadu_ps(inBuf + x - 3); __m128 p2 = _mm_loadu_ps(inBuf + x); p1 = _mm_shuffle_ps(p1, p1, _MM_SHUFFLE(0, 1, 2, 3)); auto p = _mm_add_ps(p1, p2); // apply kernel p = _mm_mul_ps(p, kernel); p = _mm_hadd_ps(p, p); p = _mm_hadd_ps(p, p); // write _mm_store_ss(outBuf + x, p); } } // apply vertical blur for (int y = 3; y < blurBufferSize.height() - 3; ++y) { float *inBuf = blurBuffer2 + y * blurBufferSize.width(); float *outBuf = blurBuffer1 + y * blurBufferSize.width(); float *varBuf = tipBuffer + y * blurBufferSize.width(); for (int x = 0; x < blurBufferSize.width() - 3; x += 4) { // fetch kernel __m128 kernel1 = globalGaussianKernelTable.fetch(varBuf[x]); __m128 kernel2 = globalGaussianKernelTable.fetch(varBuf[x + 1]); __m128 kernel3 = globalGaussianKernelTable.fetch(varBuf[x + 2]); __m128 kernel4 = globalGaussianKernelTable.fetch(varBuf[x + 3]); _MM_TRANSPOSE4_PS(kernel1, kernel2, kernel3, kernel4); // load input __m128 p1 = _mm_loadu_ps(inBuf + x); p1 = _mm_add_ps(p1, p1); __m128 p2 = _mm_loadu_ps(inBuf + x - blurBufferSize.width()); p2 = _mm_add_ps(p2, _mm_loadu_ps(inBuf + x + blurBufferSize.width())); __m128 p3 = _mm_loadu_ps(inBuf + x - blurBufferSize.width() * 2); p3 = _mm_add_ps(p3, _mm_loadu_ps(inBuf + x + blurBufferSize.width() * 2)); __m128 p4 = _mm_loadu_ps(inBuf + x - blurBufferSize.width() * 3); p4 = _mm_add_ps(p4, _mm_loadu_ps(inBuf + x + blurBufferSize.width() * 3)); // apply kernel p1 = _mm_mul_ps(p1, kernel1); p2 = _mm_mul_ps(p2, kernel2); p3 = _mm_mul_ps(p3, kernel3); p4 = _mm_mul_ps(p4, kernel4); p1 = _mm_add_ps(p1, p2); p3 = 
_mm_add_ps(p3, p4); auto p = _mm_add_ps(p1, p3); // store _mm_storeu_ps(outBuf + x, p); } } for (int y = 0; y < dirtyRect.height(); ++y) { float *inBuf = blurBuffer1 + (y + 3) * blurBufferSize.width() + 3; for (int x = 0; x < dirtyRect.width(); ++x) { int cx = x + dirtyRect.left(); int cy = y + dirtyRect.top(); terrain->landform(cx, cy) = inBuf[x]; } } edit->endEdit(terrain); }
void sINLINE RNMarchingCubesBase<T>::func(const sVector31 &v,typename T::FieldType &pot,const funcinfo &fi) { __m128 vx = _mm_load_ps1(&v.x); __m128 vy = _mm_load_ps1(&v.y); __m128 vz = _mm_load_ps1(&v.z); __m128 po = _mm_setzero_ps(); // p __m128 nx = _mm_setzero_ps(); __m128 ny = _mm_setzero_ps(); __m128 nz = _mm_setzero_ps(); __m128 akkur = _mm_setzero_ps(); __m128 akkug = _mm_setzero_ps(); __m128 akkub = _mm_setzero_ps(); __m128 akkua = _mm_setzero_ps(); __m128 s255 = _mm_set_ps1(255.0f); sBool good = 0; for(sInt i=0;i<fi.pn4;i++) { const T::SimdType *part = fi.parts4 + i; __m128 dx = _mm_sub_ps(vx,part->x); __m128 dy = _mm_sub_ps(vy,part->y); __m128 dz = _mm_sub_ps(vz,part->z); __m128 ddx = _mm_mul_ps(dx,dx); __m128 ddy = _mm_mul_ps(dy,dy); __m128 ddz = _mm_mul_ps(dz,dz); __m128 pp = _mm_add_ps(_mm_add_ps(ddx,ddy),ddz); if(_mm_movemask_ps(_mm_cmple_ps(pp,fi.treshf4))!=0) { __m128 pp2 = _mm_sub_ps(_mm_div_ps(fi.one,pp),fi.tresh4); __m128 pp3 = _mm_max_ps(pp2,_mm_setzero_ps()); po = _mm_add_ps(po,pp3); // p = p+pp; __m128 pp4 = _mm_mul_ps(pp3,pp3); // pp*pp nx = _mm_add_ps(nx,_mm_mul_ps(pp4,dx)); // n += d*(pp*pp) ny = _mm_add_ps(ny,_mm_mul_ps(pp4,dy)); nz = _mm_add_ps(nz,_mm_mul_ps(pp4,dz)); if(T::Color==1) { akkur = _mm_add_ps(akkur,_mm_mul_ps(pp3,part->cr)); akkug = _mm_add_ps(akkug,_mm_mul_ps(pp3,part->cg)); akkub = _mm_add_ps(akkub,_mm_mul_ps(pp3,part->cb)); good = 1; } } } sF32 p = 0; sVector30 n; _MM_TRANSPOSE4_PS(po,nx,ny,nz); __m128 r = _mm_add_ps(_mm_add_ps(_mm_add_ps(nx,ny),nz),po); n.x = r.m128_f32[1]; n.y = r.m128_f32[2]; n.z = r.m128_f32[3]; p = r.m128_f32[0]; if(p==0) n.Init(0,0,0); else n.UnitFast(); pot.x = n.x; pot.y = n.y; pot.z = n.z; pot.w = p-fi.iso; if(T::Color) { if(good) { r = _mm_mul_ss(s255,_mm_rcp_ss(r)); // r = _mm_rcp_ss(r); _MM_TRANSPOSE4_PS(akkub,akkug,akkur,akkua); __m128 r2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(akkur,akkug),akkub),akkua); r2 = _mm_mul_ps(r2,_mm_shuffle_ps(r,r,0x00)); __m128i r3 = _mm_cvtps_epi32(r2); r3 = _mm_packs_epi32(r3,r3); __m128i r4 = _mm_packus_epi16(r3,r3); pot.c = r4.m128i_u32[0]|0xff000000; } else { pot.c = 0; } } }
void mat4::makeTranspose()
{
    _MM_TRANSPOSE4_PS(m[0], m[1], m[2], m[3]);
}
kw_mat4 kw_transpose(kw_mat4 m)
{
    kw_mat4 result = m;
    _MM_TRANSPOSE4_PS(result.simd[0], result.simd[1], result.simd[2], result.simd[3]);
    return result;
}
int main() { float *arr = get_arr(); // [4, 3, 2, 1] float *uarr = get_uarr(); // [5, 4, 3, 2] float *arr2 = get_arr2(); // [4, 3, 2, 1] float *uarr2 = get_uarr2(); // [5, 4, 3, 2] __m128 a = get_a(); // [8, 6, 4, 2] __m128 b = get_b(); // [1, 2, 3, 4] // Check that test data is like expected. Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned. Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned. // Test that aeq itself works and does not trivially return true on everything. Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false); #ifdef TEST_M64 Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false); #endif // SSE1 Load instructions: aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address. aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide. aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest. aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1 aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest. aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest. aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order. aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address. // SSE1 Set instructions: aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands. aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded. aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher. aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1 aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order. aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register. // SSE1 Move instructions: aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b. aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output. aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output. // SSE1 Store instructions: #ifdef TEST_M64 /*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value. /*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL; _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64. #endif _mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address. 
_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory. _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1 _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory. _mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output. _mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address. #ifdef TEST_M64 /*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint. #endif _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint. // SSE1 Arithmetic instructions: aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add. aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a. aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div. aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a. aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul. aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a. #ifdef TEST_M64 __m64 m1 = get_m1(); /*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts. /*M64*/aeq64( _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16. __m64 m2 = get_m2(); /*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar. /*M64*/aeq64( _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8. #endif aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub. aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a. // SSE1 Elementary Math functions: #ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass. aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x. aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged. aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x). aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged. #endif aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x). 
aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged. __m128 i1 = get_i1(); __m128 i2 = get_i2(); // SSE1 Logical instructions: #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2 aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR #endif // SSE1 Compare instructions: // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp == aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged. aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >= aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged. aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp > aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged. aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <= aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged. aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp < aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged. aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp != aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged. aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >= aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged. aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not > aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged. aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <= aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged. aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not < aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged. __m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN] __m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0] aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan. 
aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged. // Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan. #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged. #endif Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int. Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int. Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int. Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int. Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int. Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int. // The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP // exception when one of the input operands is either a QNaN or a SNaN. #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1); #endif Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0); Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0); Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1); Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1); #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0); #endif // SSE1 Convert instructions: __m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 e = get_e(); // [INF, -INF, 2.5, 3.5] __m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808] #ifdef TEST_M64 /*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128. /*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64. #endif aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128. aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss. #ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions. Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int. Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32. 
#endif #ifdef TEST_M64 /*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged. /*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float. /*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128. /*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64. /*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64. /*M64*/aeq64(_mm_cvtps_pi8(c), 0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64. /*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128. #endif aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged. Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float. Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64. #endif Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32. Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64. #endif Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64. #ifndef __EMSCRIPTEN__ // TODO: Not implemented. // SSE1 General support: unsigned int mask = _MM_GET_EXCEPTION_MASK(); _MM_SET_EXCEPTION_MASK(mask); unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE(); _MM_SET_FLUSH_ZERO_MODE(flushZeroMode); unsigned int roundingMode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(roundingMode); unsigned int csr = _mm_getcsr(); _mm_setcsr(csr); unsigned char dummyData[4096]; _mm_prefetch(dummyData, _MM_HINT_T0); _mm_prefetch(dummyData, _MM_HINT_T1); _mm_prefetch(dummyData, _MM_HINT_T2); _mm_prefetch(dummyData, _MM_HINT_NTA); _mm_sfence(); #endif // SSE1 Misc instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64. /*M64*/Assert( _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8. #endif Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels. // SSE1 Probability/Statistics instructions: #ifdef TEST_M64 /*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s. 
/*M64*/aeq64( _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16. /*M64*/aeq64(_mm_avg_pu8(m1, m2), 0x7FEE9D4D43A23548ULL); // 8-way average uint8s. /*M64*/aeq64( _m_pavgb(m1, m2), 0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8. // SSE1 Special Math instructions: /*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way average uint16s. /*M64*/aeq64( _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16. /*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 4-way average uint16s. /*M64*/aeq64( _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8. /*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way average uint16s. /*M64*/aeq64( _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // is an alias to _mm_min_pi16. /*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 4-way average uint16s. /*M64*/aeq64( _m_pminub(m1, m2), 0xDC800110543210ULL); // is an alias to _mm_min_pu8. #endif // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max. aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged. aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min. aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged. // SSE1 Swizzle instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64. /*M64*/Assert( _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16. /*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64. /*M64*/aeq64( _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16. /*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64. /*M64*/aeq64( _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16. #endif aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f); aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f); // Transposing a matrix via the xmmintrin.h-provided intrinsic. __m128 c0 = a; // [8, 6, 4, 2] __m128 c1 = b; // [1, 2, 3, 4] __m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5] _MM_TRANSPOSE4_PS(c0, c1, c2, c3); aeq(c0, 2.5f, 4.5f, 4.f, 2.f); aeq(c1, 4.5f, 3.5f, 3.f, 4.f); aeq(c2, 6.5f, 2.5f, 2.f, 6.f); aeq(c3, 8.5f, 1.5f, 1.f, 8.f); // All done! if (numFailures == 0) printf("Success!\n"); else printf("%d tests failed!\n", numFailures); }
//------------------------------------------------------------------------------- // For each tile go through all the bins and process all the triangles in it. // Rasterize each triangle to the CPU depth buffer. //------------------------------------------------------------------------------- void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) _mm_setcsr( _mm_getcsr() | 0x8040 ); __m128i colOffset = _mm_setr_epi32(0, 1, 0, 1); __m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1); __m128i fxptZero = _mm_setzero_si128(); float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; // Based on TaskId determine which tile to process UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS; UINT tileX = tileId % screenWidthInTiles; UINT tileY = tileId / screenWidthInTiles; int tileStartX = tileX * TILE_WIDTH_IN_PIXELS; int tileEndX = tileStartX + TILE_WIDTH_IN_PIXELS - 1; int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS; int tileEndY = tileStartY + TILE_HEIGHT_IN_PIXELS - 1; ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx); UINT bin = 0; UINT binIndex = 0; UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX; UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX; UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; __m128 gatherBuf[4][3]; bool done = false; bool allBinsEmpty = true; mNumRasterizedTris[idx][tileId] = numTrisInBin; while(!done) { // Loop through all the bins and process 4 binned traingles at a time UINT ii; int numSimdTris = 0; for(ii = 0; ii < SSE; ii++) { while(numTrisInBin <= 0) { // This bin is empty. Move to next bin. if(++bin >= 1) { break; } numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; mNumRasterizedTris[idx][tileId] += numTrisInBin; binIndex = 0; } if(!numTrisInBin) { break; // No more tris in the bins } USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx); allBinsEmpty = false; numSimdTris++; ++binIndex; --numTrisInBin; } done = bin >= NUM_XFORMVERTS_TASKS; if(allBinsEmpty) { return; } // use fixed-point only for X and Y. Avoid work for Z and W. 
__m128i fxPtX[3], fxPtY[3]; __m128 Z[3]; for(int i = 0; i < 3; i++) { __m128 v0 = gatherBuf[0][i]; __m128 v1 = gatherBuf[1][i]; __m128 v2 = gatherBuf[2][i]; __m128 v3 = gatherBuf[3][i]; // transpose into SoA layout _MM_TRANSPOSE4_PS(v0, v1, v2, v3); fxPtX[i] = _mm_cvtps_epi32(v0); fxPtY[i] = _mm_cvtps_epi32(v1); Z[i] = v2; } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = _mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea)); Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize __m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endX = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX)); __m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endY = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY)); // Now we have 4 triangles set up. Rasterize them each individually. 
for(int lane=0; lane < numSimdTris; lane++) { // Extract this triangle's properties from the SIMD versions __m128 zz[3]; for(int vv = 0; vv < 3; vv++) { zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]); __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]); __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]); __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]); __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]); __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]); __m128i aa0Inc = _mm_slli_epi32(aa0, 1); __m128i aa1Inc = _mm_slli_epi32(aa1, 1); __m128i aa2Inc = _mm_slli_epi32(aa2, 1); __m128i row, col; // Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X // This method provides better perfromance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx)); __m128i aa0Col = _mm_mullo_epi32(aa0, col); __m128i aa1Col = _mm_mullo_epi32(aa1, col); __m128i aa2Col = _mm_mullo_epi32(aa2, col); row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy)); __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane])); __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane])); __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane])); __m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row); __m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row); __m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row); __m128i bb0Inc = _mm_slli_epi32(bb0, 1); __m128i bb1Inc = _mm_slli_epi32(bb1, 1); __m128i bb2Inc = _mm_slli_epi32(bb2, 1); __m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for(int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm_add_epi32(sum0Row, bb0Inc), sum1Row = _mm_add_epi32(sum1Row, bb1Inc), sum2Row = _mm_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m128i alpha = sum0Row; __m128i beta = sum1Row; __m128i gama = sum2Row; //Compute barycentric-interpolated depth __m128 depth = zz[0]; depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1])); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2])); for(int c = startXx; c < endXx; c += 2, index += 4, alpha = _mm_add_epi32(alpha, aa0Inc), beta = _mm_add_epi32(beta, aa1Inc), gama = _mm_add_epi32(gama, aa2Inc), depth = _mm_add_ps(depth, zx)) { //Test Pixel inside triangle __m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama); __m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]); __m128 mergedDepth = _mm_max_ps(depth, previousDepthValue); __m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask)); _mm_store_ps(&pDepthBuffer[index], finalDepth); }//for each column }// for each row }// for each triangle }// for each set of SIMD# triangles }
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax) { assert(vertices.size() % 16 == 0); // Simple k-means clustering by normal direction to improve backface culling efficiency std::vector<__m128> quadNormals; for (auto i = 0; i < vertices.size(); i += 4) { auto v0 = vertices[i + 0]; auto v1 = vertices[i + 1]; auto v2 = vertices[i + 2]; auto v3 = vertices[i + 3]; quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3)))); } std::vector<__m128> centroids; std::vector<uint32_t> centroidAssignment; centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f)); centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f)); centroidAssignment.resize(vertices.size() / 4); bool anyChanged = true; for (int iter = 0; iter < 10 && anyChanged; ++iter) { anyChanged = false; for (auto j = 0; j < quadNormals.size(); ++j) { __m128 normal = quadNormals[j]; __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity()); int bestCentroid = -1; for (int k = 0; k < centroids.size(); ++k) { __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F); if (_mm_comige_ss(distance, bestDistance)) { bestDistance = distance; bestCentroid = k; } } if (centroidAssignment[j] != bestCentroid) { centroidAssignment[j] = bestCentroid; anyChanged = true; } } for (int k = 0; k < centroids.size(); ++k) { centroids[k] = _mm_setzero_ps(); } for (int j = 0; j < quadNormals.size(); ++j) { int k = centroidAssignment[j]; centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]); } for (int k = 0; k < centroids.size(); ++k) { centroids[k] = normalize(centroids[k]); } } std::vector<__m128> orderedVertices; for (int k = 0; k < centroids.size(); ++k) { for (int j = 0; j < vertices.size() / 4; ++j) { if (centroidAssignment[j] == k) { orderedVertices.push_back(vertices[4 * j + 0]); orderedVertices.push_back(vertices[4 * j + 1]); orderedVertices.push_back(vertices[4 * j + 2]); orderedVertices.push_back(vertices[4 * j + 3]); } } } auto occluder = std::make_unique<Occluder>(); __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin)); __m128 scalingX = _mm_set1_ps(2047.0f); __m128 scalingY = _mm_set1_ps(2047.0f); __m128 scalingZ = _mm_set1_ps(1023.0f); __m128 half = _mm_set1_ps(0.5f); for (size_t i = 0; i < orderedVertices.size(); i += 16) { for (auto j = 0; j < 4; ++j) { // Transform into [0,1] space relative to bounding box __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents); __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents); __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents); __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents); // Transpose into [xxxx][yyyy][zzzz][wwww] _MM_TRANSPOSE4_PS(v0, v1, v2, v3); // Scale and truncate to int v0 = _mm_fmadd_ps(v0, scalingX, half); v1 = _mm_fmadd_ps(v1, scalingY, half); v2 = _mm_fmadd_ps(v2, scalingZ, half); __m128i X = _mm_cvttps_epi32(v0); __m128i Y = _mm_cvttps_epi32(v1); __m128i Z = _mm_cvttps_epi32(v2); // Pack to 11/11/10 format __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z)); occluder->m_vertexData.push_back(XYZ); } } occluder->m_refMin = refMin; occluder->m_refMax = refMax; __m128 min = 
_mm_set1_ps(+std::numeric_limits<float>::infinity()); __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity()); for (size_t i = 0; i < orderedVertices.size(); ++i) { min = _mm_min_ps(vertices[i], min); max = _mm_max_ps(vertices[i], max); } // Set W = 1 - this is expected by frustum culling code min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000); max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000); occluder->m_boundsMin = min; occluder->m_boundsMax = max; occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f)); return occluder; }
out.row_[2].x_ = row_[2].x_ * row_[0].x_ + row_[2].y_ * row_[1].x_ + row_[2].z_ * row_[2].x_ + row_[2].w_ * row_[3].x_; out.row_[2].y_ = row_[2].x_ * row_[0].y_ + row_[2].y_ * row_[1].y_ + row_[2].z_ * row_[2].y_ + row_[2].w_ * row_[3].y_; out.row_[2].z_ = row_[2].x_ * row_[0].z_ + row_[2].y_ * row_[1].z_ + row_[2].z_ * row_[2].z_ + row_[2].w_ * row_[3].z_; out.row_[2].w_ = row_[2].x_ * row_[0].w_ + row_[2].y_ * row_[1].w_ + row_[2].z_ * row_[2].w_ + row_[2].w_ * row_[3].w_; out.row_[3].x_ = row_[3].x_ * row_[0].x_ + row_[3].y_ * row_[1].x_ + row_[3].z_ * row_[2].x_ + row_[3].w_ * row_[3].x_; out.row_[3].y_ = row_[3].x_ * row_[0].y_ + row_[3].y_ * row_[1].y_ + row_[3].z_ * row_[2].y_ + row_[3].w_ * row_[3].y_; out.row_[3].z_ = row_[3].x_ * row_[0].z_ + row_[3].y_ * row_[1].z_ + row_[3].z_ * row_[2].z_ + row_[3].w_ * row_[3].z_; out.row_[3].w_ = row_[3].x_ * row_[0].w_ + row_[3].y_ * row_[1].w_ + row_[3].z_ * row_[2].w_ + row_[3].w_ * row_[3].w_; *this = out; return *this; # endif } inline void Matrix4::Transpose() { _MM_TRANSPOSE4_PS(row_[0], row_[1], row_[2], row_[3]) } Matrix4 MatrixOrthographicLH(const float & ViewWidth, const float & ViewHeight, const float & NearZ, const float & FarZ) { Matrix4 M; float fRange = 1.0f / (FarZ - NearZ); // Note: This is recorded on the stack Vector4 rMem = { 2.0f / ViewWidth, 2.0f / ViewHeight, fRange, -fRange * NearZ }; // Copy from memory to SSE register Vector4 vValues = rMem; Vector4 vTemp = _mm_setzero_ps(); // Copy x only vTemp = _mm_move_ss(vTemp, vValues); // 2.0f / ViewWidth,0,0,0 M.row_[0] = vTemp;