/** transform vector by rigid transform */ inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec) { #ifdef SIMPLE_GL_USE_SSE4 __m128 res; __m128 dotProd; res = _mm_dp_ps(mat[0].m128, vec.m128, 0xEE);\ dotProd = _mm_dp_ps(mat[1].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );\ dotProd = _mm_dp_ps(mat[2].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );\ dotProd = _mm_dp_ps(mat[3].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 0, 1) ); return Matrix<float, 4, 1>(res); #elif defined(SIMPLE_GL_USE_SSE3) __m128 res; __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128); dotProd0 = _mm_hadd_ps(dotProd0, dotProd0); dotProd0 = _mm_hadd_ps(dotProd0, dotProd0); __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128); dotProd1 = _mm_hadd_ps(dotProd1, dotProd1); dotProd1 = _mm_hadd_ps(dotProd1, dotProd1); __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128); dotProd2 = _mm_hadd_ps(dotProd2, dotProd2); dotProd2 = _mm_hadd_ps(dotProd2, dotProd2); __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128); dotProd3 = _mm_hadd_ps(dotProd3, dotProd3); dotProd3 = _mm_hadd_ps(dotProd3, dotProd3); __m128 vec01 = _mm_unpacklo_ps(dotProd0, dotProd1); __m128 vec23 = _mm_unpackhi_ps(dotProd2, dotProd3); res = _mm_movelh_ps(vec01, vec23); return Matrix<float, 4, 1>(res); #else // SSE2 // TODO: Think about good sse optimization Matrix<float, 4, 1> res; res[0] = mat[0][0] * res[0] + mat[0][1] * res[1] + mat[0][2] * res[2] + mat[0][3] * res[3]; res[1] = mat[1][0] * res[0] + mat[1][1] * res[1] + mat[1][2] * res[2] + mat[1][3] * res[3]; res[2] = mat[2][0] * res[0] + mat[2][1] * res[1] + mat[2][2] * res[2] + mat[2][3] * res[3]; res[3] = mat[3][0] * res[0] + mat[3][1] * res[1] + mat[3][2] * res[2] + mat[3][3] * res[3]; return res; #endif }
/* Compile-time diagnostic test (DejaGnu): every intrinsic below requires its
   last argument to be a 4-bit compile-time immediate, so passing the
   variable k4 must produce exactly the error quoted in the dg-error
   annotation on the same line.  The operands (d*, a*, e*, b*, k4) are
   declared elsewhere in this test file. */
void
test4bit (void)
{
  d1 = _mm_round_pd (d2, k4);          /* { dg-error "the last argument must be a 4-bit immediate" } */
  d1 = _mm_round_sd (d2, d3, k4);      /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ps (a2, k4);          /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ss (a2, a2, k4);      /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_blend_ps (a2, a3, k4);      /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_blend_pd (e2, e3, k4);   /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_round_pd (e2, k4);       /* { dg-error "the last argument must be a 4-bit immediate" } */
  b1 = _mm256_round_ps (b2, k4);       /* { dg-error "the last argument must be a 4-bit immediate" } */
}
/* Runtime test for blendps (SSE4.1): fills inputs via init_blendps, applies
   _mm_blend_ps with the compile-time constant MASK, and validates every
   result against the scalar reference check_blendps.  NUM, MASK,
   init_blendps, check_blendps and random are provided elsewhere in this
   test file. */
static void
TEST (void)
{
  __m128 x, y;
  union
    {
      __m128 x[NUM];
      float f[NUM * 4];
    } dst, src1, src2;
  union
    {
      __m128 x;
      float f[4];
    } src3;
  int i;

  init_blendps (src1.f, src2.f);

  /* Integer-valued floats: (int) truncation keeps comparisons exact. */
  for (i = 0; i < 4; i++)
    src3.f[i] = (int) random ();

  /* Check blendps imm8, m128, xmm */
  for (i = 0; i < NUM; i++)
    {
      dst.x[i] = _mm_blend_ps (src1.x[i], src2.x[i], MASK);
      if (check_blendps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4]))
        abort ();
    }

  /* Check blendps imm8, xmm, xmm — both operand orders. */
  x = _mm_blend_ps (dst.x[2], src3.x, MASK);
  y = _mm_blend_ps (src3.x, dst.x[2], MASK);

  if (check_blendps (&x, &dst.f[8], &src3.f[0]))
    abort ();

  if (check_blendps (&y, &src3.f[0], &dst.f[8]))
    abort ();
}
// Fused pipeline over 8 floats (two 2x2 blocks packed side by side):
// forward 2x2 DCT -> hard-threshold the coefficients -> inverse 2x2 DCT,
// all kept in SSE registers.  src and dest must be 16-byte aligned
// (aligned loads/stores).  NOTE: __declspec(align(16)) is MSVC-specific.
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
	__m128 ms0 = _mm_load_ps(src);
	__m128 ms1 = _mm_load_ps(src + 4);
	const __m128 mm = _mm_set1_ps(0.5f);
	// Forward transform: sum/difference butterfly, transpose, butterfly, x0.5.
	__m128 a = _mm_add_ps(ms0, ms1);
	__m128 b = _mm_sub_ps(ms0, ms1);
	__m128 t1 = _mm_unpacklo_ps(a, b);
	__m128 t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));
	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));
	// Hard threshold: zero coefficients with |coef| <= thresh.
	// (Reading the int array through an __m128* is the usual abs-mask trick,
	// though it is a strict-aliasing violation in standard C++.)
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
	ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
	// Always preserve the DC coefficient (lane 0) regardless of threshold.
	ms0 = _mm_blend_ps(ms0, a, 1);
#endif
	msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
	ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);
	// Inverse transform: identical butterfly network (the 2x2 DCT is its own
	// inverse up to the 0.5 scaling applied in each pass).
	a = _mm_add_ps(ms0, ms1);
	b = _mm_sub_ps(ms0, ms1);
	t1 = _mm_unpacklo_ps(a, b);
	t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));
	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));
	_mm_store_ps(dest, a);
	_mm_store_ps(dest + 4, b);
}
// One 1-D pass of a 4-point DCT applied to four columns at once (a 4x4 tile
// of floats), followed by hard thresholding of each output row; the DC row
// keeps its lane-0 coefficient unconditionally.  s and d must be 16-byte
// aligned.  NOTE: __declspec(align(16)) is MSVC-specific.
void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh)
{
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	const __m128 zeros = _mm_setzero_ps();
	const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2);
	const __m128 c6 = _mm_set1_ps(0.541196);//cos(CV_PI*6/16.0)*sqrt(2);
	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	// Even/odd butterflies of the length-4 DCT (per column).
	__m128 p03 = _mm_add_ps(s0, s3);
	__m128 p12 = _mm_add_ps(s1, s2);
	__m128 m03 = _mm_sub_ps(s0, s3);
	__m128 m12 = _mm_sub_ps(s1, s2);

	// Row 0 (DC): threshold, then restore lane 0 so the (0,0) coefficient
	// is always kept.
	__m128 v = _mm_add_ps(p03, p12);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	// keep 00 coef.
	__m128 v2 = _mm_blendv_ps(zeros, v, msk);
	v2 = _mm_blend_ps(v2, v, 1);
	_mm_store_ps(d, v2);

	// Row 1: odd-part rotation with twiddles c2/c6.
	v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 4, v);

	// Row 2: even-part difference.
	v = _mm_sub_ps(p03, p12);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 8, v);

	// Row 3: odd-part rotation, other combination.
	v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 12, v);
}
// Blend together two vector's components template <Bool X, Bool Y, Bool Z, Bool W> inline Vector VFunction Blend(const Vector& vectorA, const Vector& vectorB) { return _mm_blend_ps(vectorA, vectorB, (X << 3) | (Y << 2) | (Z << 1) | W); }
// Bake a quad soup (groups of 4 vertices = one quad; vertex count must be a
// multiple of 16 so quads pack 4-at-a-time) into a quantized Occluder.
// Quads are clustered by normal direction with a small k-means (6 seed
// centroids on the principal axes, <=10 iterations) so similarly-facing
// quads end up contiguous; vertices are then quantized to an 11/11/10-bit
// fixed-point format relative to the [refMin, refMax] bounding box.
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax)
{
  assert(vertices.size() % 16 == 0);

  // Simple k-means clustering by normal direction to improve backface culling efficiency
  std::vector<__m128> quadNormals;
  for (auto i = 0; i < vertices.size(); i += 4)
  {
    auto v0 = vertices[i + 0];
    auto v1 = vertices[i + 1];
    auto v2 = vertices[i + 2];
    auto v3 = vertices[i + 3];

    // Quad normal = normalized sum of its two triangle normals.
    quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3))));
  }

  std::vector<__m128> centroids;
  std::vector<uint32_t> centroidAssignment;
  // Seed centroids: the six principal axis directions.
  centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f));

  centroidAssignment.resize(vertices.size() / 4);

  bool anyChanged = true;
  for (int iter = 0; iter < 10 && anyChanged; ++iter)
  {
    anyChanged = false;

    // Assignment step: best centroid by highest dot product (cosine
    // similarity of unit normals); dp mask 0x7F uses xyz only.
    for (auto j = 0; j < quadNormals.size(); ++j)
    {
      __m128 normal = quadNormals[j];

      __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
      int bestCentroid = -1;
      for (int k = 0; k < centroids.size(); ++k)
      {
        __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
        if (_mm_comige_ss(distance, bestDistance))
        {
          bestDistance = distance;
          bestCentroid = k;
        }
      }

      if (centroidAssignment[j] != bestCentroid)
      {
        centroidAssignment[j] = bestCentroid;
        anyChanged = true;
      }
    }

    // Update step: each centroid becomes the normalized mean of the normals
    // assigned to it.  NOTE(review): a centroid with no assigned quads gets
    // normalize(zero) here — presumably benign downstream, worth confirming.
    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = _mm_setzero_ps();
    }

    for (int j = 0; j < quadNormals.size(); ++j)
    {
      int k = centroidAssignment[j];
      centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]);
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = normalize(centroids[k]);
    }
  }

  // Re-emit the quads grouped by cluster so each cluster's geometry is
  // contiguous in the output stream.
  std::vector<__m128> orderedVertices;
  for (int k = 0; k < centroids.size(); ++k)
  {
    for (int j = 0; j < vertices.size() / 4; ++j)
    {
      if (centroidAssignment[j] == k)
      {
        orderedVertices.push_back(vertices[4 * j + 0]);
        orderedVertices.push_back(vertices[4 * j + 1]);
        orderedVertices.push_back(vertices[4 * j + 2]);
        orderedVertices.push_back(vertices[4 * j + 3]);
      }
    }
  }

  auto occluder = std::make_unique<Occluder>();

  __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin));

  // Quantization ranges for the 11/11/10-bit packed format below.
  __m128 scalingX = _mm_set1_ps(2047.0f);
  __m128 scalingY = _mm_set1_ps(2047.0f);
  __m128 scalingZ = _mm_set1_ps(1023.0f);

  __m128 half = _mm_set1_ps(0.5f);

  // Process 16 vertices (4 quads) per iteration; j walks the 4 corners with
  // stride 4 so each transposed register holds one corner of all 4 quads.
  for (size_t i = 0; i < orderedVertices.size(); i += 16)
  {
    for (auto j = 0; j < 4; ++j)
    {
      // Transform into [0,1] space relative to bounding box
      __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
      __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
      __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
      __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);

      // Transpose into [xxxx][yyyy][zzzz][wwww]
      _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

      // Scale and truncate to int
      v0 = _mm_fmadd_ps(v0, scalingX, half);
      v1 = _mm_fmadd_ps(v1, scalingY, half);
      v2 = _mm_fmadd_ps(v2, scalingZ, half);

      __m128i X = _mm_cvttps_epi32(v0);
      __m128i Y = _mm_cvttps_epi32(v1);
      __m128i Z = _mm_cvttps_epi32(v2);

      // Pack to 11/11/10 format
      __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));

      occluder->m_vertexData.push_back(XYZ);
    }
  }

  occluder->m_refMin = refMin;
  occluder->m_refMax = refMax;

  __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity());
  __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity());

  // NOTE(review): bound comes from orderedVertices but the index reads
  // `vertices` — the two containers hold the same vertex set (same size,
  // reordered), so the bounds are identical either way, but indexing the
  // same container would be clearer.
  for (size_t i = 0; i < orderedVertices.size(); ++i)
  {
    min = _mm_min_ps(vertices[i], min);
    max = _mm_max_ps(vertices[i], max);
  }

  // Set W = 1 - this is expected by frustum culling code
  min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000);
  max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000);

  occluder->m_boundsMin = min;
  occluder->m_boundsMax = max;

  occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f));

  return occluder;
}
/* SSE4.1 implementation of NORMAL-mode blending over a run of RGBA float
 * pixels (one __v4sf per pixel, alpha in lane 3).  Requires in/aux/out to
 * be 16-byte aligned; otherwise defers to the scalar core implementation.
 * The math composites premultiplied terms and divides by the result alpha,
 * so the buffers appear to hold straight (non-premultiplied) RGBA —
 * NOTE(review): confirm against the scalar core's convention.
 */
gboolean
gimp_operation_normal_mode_process_pixels_sse4 (gfloat              *in,
                                                gfloat              *aux,
                                                gfloat              *mask,
                                                gfloat              *out,
                                                gfloat               opacity,
                                                glong                samples,
                                                const GeglRectangle *roi,
                                                gint                 level)
{
  /* check alignment */
  if ((((uintptr_t)in) | ((uintptr_t)aux) | ((uintptr_t)out)) & 0x0F)
    {
      /* Unaligned buffers: fall back to the scalar path. */
      return gimp_operation_normal_mode_process_pixels_core (in, aux, mask, out,
                                                             opacity, samples,
                                                             roi, level);
    }
  else
    {
      const __v4sf *v_in      = (const __v4sf*) in;
      const __v4sf *v_aux     = (const __v4sf*) aux;
      __v4sf       *v_out     = (      __v4sf*) out;
      const __v4sf  one       = _mm_set1_ps (1.0f);
      const __v4sf  v_opacity = _mm_set1_ps (opacity);

      while (samples--)
        {
          __v4sf rgba_in, rgba_aux, alpha;

          rgba_in  = *v_in++;
          rgba_aux = *v_aux++;

          /* expand alpha: broadcast aux's lane-3 alpha to all lanes */
          alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_aux,
                                             _MM_SHUFFLE (3, 3, 3, 3));

          if (mask)
            {
              __v4sf mask_alpha;

              /* multiply aux's alpha by the mask */
              mask_alpha = _mm_set1_ps (*mask++);
              alpha = alpha * mask_alpha;
            }

          alpha = alpha * v_opacity;

          /* Composite only when effective source alpha > 0; otherwise the
             destination pixel passes through unchanged. */
          if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
            {
              __v4sf dst_alpha, a_term, out_pixel, out_alpha;

              /* expand alpha: broadcast destination's lane-3 alpha */
              dst_alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_in,
                                                     _MM_SHUFFLE (3, 3, 3, 3));

              /* a_term = dst_a * (1.0 - src_a) */
              a_term = dst_alpha * (one - alpha);

              /* out(color) = src * src_a + dst * a_term */
              out_pixel = rgba_aux * alpha + rgba_in * a_term;

              /* out(alpha) = 1.0 * src_a + 1.0 * a_term */
              out_alpha = alpha + a_term;

              /* un-premultiply (out_alpha > 0 is guaranteed by the branch) */
              out_pixel = out_pixel / out_alpha;

              /* swap in the real alpha: blend mask 0x08 replaces lane 3 only */
              out_pixel = _mm_blend_ps (out_pixel, out_alpha, 0x08);

              *v_out++ = out_pixel;
            }
          else
            {
              *v_out++ = rgba_in;
            }
        }
    }

  return TRUE;
}
/* Clang codegen test: imm8 = 6 (0b0110) takes lanes 1 and 2 from V2 and
   lanes 0 and 3 from V1, which lowers to the shufflevector / blendps
   checked below. */
__m128 test_blend_ps(__m128 V1, __m128 V2) {
  // CHECK-LABEL: test_blend_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  // CHECK-ASM: blendps $6, %xmm{{.*}}, %xmm{{.*}}
  return _mm_blend_ps(V1, V2, 6);
}
// One 1-D pass of an 8-point DCT applied to four columns at once (rows are
// 8 floats apart in memory; only the first 4 floats of each row are used),
// with hard thresholding of every output row.  The DC row keeps its lane-0
// coefficient unconditionally.  x and y must be 16-byte aligned.
// NOTE: __declspec(align(16)) is MSVC-specific.
void fDCT2D8x4_and_threshold_keep00_32f(const float* x, float* y, float thresh)
{
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	const __m128 zeros = _mm_setzero_ps();

	// Stage-1 butterflies: t[i] = row[i] +/- row[7-i] (per column).
	__m128 c0 = _mm_load_ps(x);
	__m128 c1 = _mm_load_ps(x + 56);
	__m128 t0 = _mm_add_ps(c0, c1);
	__m128 t7 = _mm_sub_ps(c0, c1);

	c1 = _mm_load_ps(x + 48);
	c0 = _mm_load_ps(x + 8);
	__m128 t1 = _mm_add_ps(c0, c1);
	__m128 t6 = _mm_sub_ps(c0, c1);

	c1 = _mm_load_ps(x + 40);
	c0 = _mm_load_ps(x + 16);
	__m128 t2 = _mm_add_ps(c0, c1);
	__m128 t5 = _mm_sub_ps(c0, c1);

	c0 = _mm_load_ps(x + 24);
	c1 = _mm_load_ps(x + 32);
	__m128 t3 = _mm_add_ps(c0, c1);
	__m128 t4 = _mm_sub_ps(c0, c1);

	/*
	c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
	c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
	c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
	c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
	*/

	// Stage-2 butterflies on the even part.
	c0 = _mm_add_ps(t0, t3);
	__m128 c3 = _mm_sub_ps(t0, t3);
	c1 = _mm_add_ps(t1, t2);
	__m128 c2 = _mm_sub_ps(t1, t2);

	/* c0 = t0 + t3; c3 = t0 - t3; c1 = t1 + t2; c2 = t1 - t2; */

	const __m128 invsqrt2h = _mm_set_ps1(0.353554f);

	// Row 0 (DC): threshold, then restore lane 0 so (0,0) is always kept.
	__m128 v = _mm_mul_ps(_mm_add_ps(c0, c1), invsqrt2h);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	// keep 00 coef.
	__m128 v2 = _mm_blendv_ps(zeros, v, msk);
	v2 = _mm_blend_ps(v2, v, 1);
	_mm_store_ps(y, v2);

	// Row 4.
	v = _mm_mul_ps(_mm_sub_ps(c0, c1), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 32, v);

	/*y[0] = c0 + c1; y[4] = c0 - c1;*/

	// Rows 2 and 6: even-part rotation with twiddles.
	__m128 w0 = _mm_set_ps1(0.541196f);
	__m128 w1 = _mm_set_ps1(1.306563f);

	v = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(w0, c2), _mm_mul_ps(w1, c3)), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 16, v);

	v = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(w0, c3), _mm_mul_ps(w1, c2)), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 48, v);

	/*
	y[2] = c2 * r[6] + c3 * r[2];
	y[6] = c3 * r[6] - c2 * r[2];
	*/

	// Odd part: two rotations with the remaining twiddle factors.
	w0 = _mm_set_ps1(1.175876f);
	w1 = _mm_set_ps1(0.785695f);
	c3 = _mm_add_ps(_mm_mul_ps(w0, t4), _mm_mul_ps(w1, t7));
	c0 = _mm_sub_ps(_mm_mul_ps(w0, t7), _mm_mul_ps(w1, t4));

	/*
	c3 = t4 * r[3] + t7 * r[5];
	c0 = t7 * r[3] - t4 * r[5];
	*/

	w0 = _mm_set_ps1(1.387040f);
	w1 = _mm_set_ps1(0.275899f);
	c2 = _mm_add_ps(_mm_mul_ps(w0, t5), _mm_mul_ps(w1, t6));
	c1 = _mm_sub_ps(_mm_mul_ps(w0, t6), _mm_mul_ps(w1, t5));

	/*
	c2 = t5 * r[1] + t6 * r[7];
	c1 = t6 * r[1] - t5 * r[7];
	*/

	// Rows 3 and 5.
	v = _mm_mul_ps(_mm_sub_ps(c0, c2), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 24, v);

	v = _mm_mul_ps(_mm_sub_ps(c3, c1), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 40, v);

	//y[5] = c3 - c1; y[3] = c0 - c2;

	const __m128 invsqrt2 = _mm_set_ps1(0.707107f);
	c0 = _mm_mul_ps(_mm_add_ps(c0, c2), invsqrt2);
	c3 = _mm_mul_ps(_mm_add_ps(c3, c1), invsqrt2);

	//c0 = (c0 + c2) * invsqrt2;
	//c3 = (c3 + c1) * invsqrt2;

	// Rows 1 and 7.
	v = _mm_mul_ps(_mm_add_ps(c0, c3), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 8, v);

	v = _mm_mul_ps(_mm_sub_ps(c0, c3), invsqrt2h);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(y + 56, v);

	//y[1] = c0 + c3; y[7] = c0 - c3;

	/*for(i = 0;i < 8;i++)
	{
		y[i] *= invsqrt2h;
	}*/
}
/* Clang codegen test: imm8 = 5 (0b0101) takes lanes 0 and 2 from V2 and
   lanes 1 and 3 from V1, which lowers to the shufflevector checked below. */
__m128 test_blend_ps(__m128 V1, __m128 V2) {
  // CHECK-LABEL: @test_blend_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  return _mm_blend_ps(V1, V2, 5);
}
// In-place branch-free sort of 8 floats using a bitonic sorting network
// built from SSE min/max stages.  The trailing position comments on each
// line track which logical elements (by sorted rank) occupy each lane.
// `inout` must be 16-byte aligned (aligned loads/stores).
static void bar( // bitonic_simd_sort
	float (& inout)[8]) {

	const __m128 r0_in0 = _mm_load_ps(inout + 0); // 0, 1, 2, 3
	const __m128 r0_in1 = _mm_load_ps(inout + 4); // 4, 5, 6, 7

	// stage 0: pair up elements (0,1) (2,3) (4,5) (6,7) via shuffles and
	// take per-lane min/max.
	const __m128 r0_A = _mm_shuffle_ps(r0_in0, r0_in1, 0xcc); // 0, 3, 4, 7
	const __m128 r0_B = _mm_shuffle_ps(r0_in0, r0_in1, 0x99); // 1, 2, 5, 6
	const __m128 r0_min = _mm_min_ps(r0_A, r0_B); // 0, 3, 4, 7
	const __m128 r0_max = _mm_max_ps(r0_A, r0_B); // 1, 2, 5, 6

	// stage 1: _mm_blend_ps merges min/max lanes into the next comparison pairs.
	__m128 r1_A = _mm_shuffle_ps(r0_max, r0_max, 0xf0); // 1, 1, 6, 6
	__m128 r1_B = _mm_shuffle_ps(r0_max, r0_max, 0xa5); // 2, 2, 5, 5
	r1_A = _mm_blend_ps(r1_A, r0_min, 0x9); // 0, 1, 6, 7
	r1_B = _mm_blend_ps(r1_B, r0_min, 0x6); // 2, 3, 4, 5
	const __m128 r1_min = _mm_min_ps(r1_A, r1_B); // 0, 1, 6, 7
	const __m128 r1_max = _mm_max_ps(r1_A, r1_B); // 2, 3, 4, 5

	// stage 2
	__m128 r2_A = _mm_shuffle_ps(r1_max, r1_max, 0xf0); // 2, 2, 5, 5
	__m128 r2_B = _mm_shuffle_ps(r1_min, r1_min, 0xa5); // 1, 1, 6, 6
	r2_A = _mm_blend_ps(r2_A, r1_min, 0x9); // 0, 2, 5, 7
	r2_B = _mm_blend_ps(r2_B, r1_max, 0x6); // 1, 3, 4, 6
	const __m128 r2_min = _mm_min_ps(r2_A, r2_B); // 0, 2, 5, 7
	const __m128 r2_max = _mm_max_ps(r2_A, r2_B); // 1, 3, 4, 6

	// stage 3: both halves are now bitonic; begin the merge.
	const __m128 r3_A = _mm_unpacklo_ps(r2_min, r2_max); // 0, 1, 2, 3
	const __m128 r3_B = _mm_unpackhi_ps(r2_max, r2_min); // 4, 5, 6, 7
	const __m128 r3_min = _mm_min_ps(r3_A, r3_B); // 0, 1, 2, 3
	const __m128 r3_max = _mm_max_ps(r3_A, r3_B); // 4, 5, 6, 7

	// stage 4
	const __m128 r4_A = _mm_movelh_ps(r3_min, r3_max); // 0, 1, 4, 5
	const __m128 r4_B = _mm_movehl_ps(r3_max, r3_min); // 2, 3, 6, 7
	const __m128 r4_min = _mm_min_ps(r4_A, r4_B); // 0, 1, 4, 5
	const __m128 r4_max = _mm_max_ps(r4_A, r4_B); // 2, 3, 6, 7

	// stage 5: final compare of adjacent ranks.
	const __m128 r5_a = _mm_unpacklo_ps(r4_min, r4_max); // 0, 2, 1, 3
	const __m128 r5_b = _mm_unpackhi_ps(r4_min, r4_max); // 4, 6, 5, 7
	const __m128 r5_A = _mm_movelh_ps(r5_a, r5_b); // 0, 2, 4, 6
	const __m128 r5_B = _mm_movehl_ps(r5_b, r5_a); // 1, 3, 5, 7
	const __m128 r5_min = _mm_min_ps(r5_A, r5_B); // 0, 2, 4, 6
	const __m128 r5_max = _mm_max_ps(r5_A, r5_B); // 1, 3, 5, 7

	// output: interleave back into ascending order.
	const __m128 out0 = _mm_unpacklo_ps(r5_min, r5_max); // 0, 1, 2, 3
	const __m128 out1 = _mm_unpackhi_ps(r5_min, r5_max); // 4, 5, 6, 7
	_mm_store_ps(inout + 0, out0);
	_mm_store_ps(inout + 4, out1);
}