inline vector4f select(const vector4fb& cond, const vector4f& a, const vector4f& b)
{
#if SSE_INSTR_SET >= 5 // SSE 4.1
    return _mm_blendv_ps(b, a, cond);
#else
    return _mm_or_ps(_mm_and_ps(cond, a), _mm_andnot_ps(cond, b));
#endif
}
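// A minimal usage sketch (illustrative, not from the original source): the same
// select pattern on raw __m128 values. _mm_cmplt_ps produces an all-ones/all-zeros
// lane mask, which _mm_blendv_ps consumes via each lane's sign bit (SSE4.1).
static inline __m128 clamp_negative_to_zero(__m128 v)
{
    __m128 is_negative = _mm_cmplt_ps(v, _mm_setzero_ps()); // lane mask: v < 0
    return _mm_blendv_ps(v, _mm_setzero_ps(), is_negative); // zero where negative, v otherwise
}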
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
    __m128 ms0 = _mm_load_ps(src);
    __m128 ms1 = _mm_load_ps(src + 4);
    const __m128 mm = _mm_set1_ps(0.5f);

    // forward 2x2 DCT (two 2x2 blocks packed per register)
    __m128 a = _mm_add_ps(ms0, ms1);
    __m128 b = _mm_sub_ps(ms0, ms1);

    __m128 t1 = _mm_unpacklo_ps(a, b);
    __m128 t2 = _mm_unpackhi_ps(a, b);

    ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
    ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

    a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
    b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

    // threshold: zero out coefficients whose magnitude is <= thresh
    const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const __m128 mth = _mm_set1_ps(thresh);

    __m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
    ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
    // always keep the DC (0,0) coefficient
    ms0 = _mm_blend_ps(ms0, a, 1);
#endif
    msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
    ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);

    // inverse 2x2 DCT
    a = _mm_add_ps(ms0, ms1);
    b = _mm_sub_ps(ms0, ms1);

    t1 = _mm_unpacklo_ps(a, b);
    t2 = _mm_unpackhi_ps(a, b);

    ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
    ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

    a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
    b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

    _mm_store_ps(dest, a);
    _mm_store_ps(dest + 4, b);
}
void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh)
{
    const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const __m128 mth = _mm_set1_ps(thresh);
    const __m128 zeros = _mm_setzero_ps();
    const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
    const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

    __m128 s0 = _mm_load_ps(s); s += 4;
    __m128 s1 = _mm_load_ps(s); s += 4;
    __m128 s2 = _mm_load_ps(s); s += 4;
    __m128 s3 = _mm_load_ps(s);

    __m128 p03 = _mm_add_ps(s0, s3);
    __m128 p12 = _mm_add_ps(s1, s2);
    __m128 m03 = _mm_sub_ps(s0, s3);
    __m128 m12 = _mm_sub_ps(s1, s2);

    __m128 v = _mm_add_ps(p03, p12);
    __m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    // keep the 00 (DC) coefficient regardless of the threshold
    __m128 v2 = _mm_blendv_ps(zeros, v, msk);
    v2 = _mm_blend_ps(v2, v, 1);
    _mm_store_ps(d, v2);

    v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 4, v);

    v = _mm_sub_ps(p03, p12);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 8, v);

    v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 12, v);
}
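// The thresholding idiom used throughout these DCT functions, isolated as a
// minimal sketch: AND-ing against 0x7fffffff clears the sign bit and yields |v|,
// the compare builds a lane mask, and _mm_blendv_ps keeps v only where
// |v| > thresh (SSE4.1).
static inline __m128 hard_threshold_ps(__m128 v, float thresh)
{
    const __m128 absmask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
    __m128 keep = _mm_cmpgt_ps(_mm_and_ps(v, absmask), _mm_set1_ps(thresh));
    return _mm_blendv_ps(_mm_setzero_ps(), v, keep);
}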
template <class T>
void blurRemoveMinMax_(const Mat& src, Mat& dest, const int r)
{
    const Size ksize = Size(2 * r + 1, 2 * r + 1);
    if (src.data != dest.data) src.copyTo(dest);

    Mat xv;
    Mat nv;
    Mat element = Mat::ones(2 * r + 1, 2 * r + 1, CV_8U);
    dilate(src, xv, element); // local maximum
    erode(src, nv, element);  // local minimum

    Mat mind;
    Mat maxd;
    Mat mask;
    absdiff(src, nv, mind); // can move to loop
    absdiff(src, xv, maxd);
    min(mind, maxd, mask);  // mask = distance to the nearer extremum

    T* n = nv.ptr<T>(0);
    T* x = xv.ptr<T>(0);
    T* d = dest.ptr<T>(0);
    T* nd = mind.ptr<T>(0);
    T* mk = mask.ptr<T>(0);
    int remsize = src.size().area();

#if CV_SSE4_1
    if (src.depth() == CV_8U)
    {
        const int ssesize = src.size().area() / 16;
        remsize = src.size().area() - ssesize * 16;
        for (int i = 0; i < ssesize; i++)
        {
            __m128i mmk = _mm_load_si128((__m128i*)mk);
            __m128i mnd = _mm_load_si128((__m128i*)nd);
            __m128i mmn = _mm_load_si128((__m128i*)n);
            __m128i mmx = _mm_load_si128((__m128i*)x);
            __m128i msk = _mm_cmpeq_epi8(mnd, mmk);
            _mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));
            nd += 16;
            mk += 16;
            d += 16;
            n += 16;
            x += 16;
        }
    }
    else if (src.depth() == CV_16S || src.depth() == CV_16U)
    {
        const int ssesize = src.size().area() / 8;
        remsize = src.size().area() - ssesize * 8;
        for (int i = 0; i < ssesize; i++)
        {
            __m128i mmk = _mm_load_si128((__m128i*)mk);
            __m128i mnd = _mm_load_si128((__m128i*)nd);
            __m128i mmn = _mm_load_si128((__m128i*)n);
            __m128i mmx = _mm_load_si128((__m128i*)x);
            __m128i msk = _mm_cmpeq_epi16(mnd, mmk);
            _mm_stream_si128((__m128i*)d, _mm_blendv_epi8(mmx, mmn, msk));
            nd += 8;
            mk += 8;
            d += 8;
            n += 8;
            x += 8;
        }
    }
    else if (src.depth() == CV_32F)
    {
        const int ssesize = src.size().area() / 4;
        remsize = src.size().area() - ssesize * 4;
        for (int i = 0; i < ssesize; i++)
        {
            __m128 mmk = _mm_load_ps((float*)mk);
            __m128 mnd = _mm_load_ps((float*)nd);
            __m128 mmn = _mm_load_ps((float*)n);
            __m128 mmx = _mm_load_ps((float*)x);
            __m128 msk = _mm_cmpeq_ps(mnd, mmk);
            _mm_stream_ps((float*)d, _mm_blendv_ps(mmx, mmn, msk));
            nd += 4;
            mk += 4;
            d += 4;
            n += 4;
            x += 4;
        }
    }
    else if (src.depth() == CV_64F)
    {
        const int ssesize = src.size().area() / 2;
        remsize = src.size().area() - ssesize * 2;
        for (int i = 0; i < ssesize; i++)
        {
            __m128d mmk = _mm_load_pd((double*)mk);
            __m128d mnd = _mm_load_pd((double*)nd);
            __m128d mmn = _mm_load_pd((double*)n);
            __m128d mmx = _mm_load_pd((double*)x);
            __m128d msk = _mm_cmpeq_pd(mnd, mmk);
            _mm_stream_pd((double*)d, _mm_blendv_pd(mmx, mmn, msk));
            nd += 2;
            mk += 2;
            d += 2;
            n += 2;
            x += 2;
        }
    }
#endif
    // scalar tail: pick the local min where it is the nearer extremum, else the local max
    for (int i = 0; i < remsize; i++)
    {
        if (nd[i] == mk[i])
        {
            d[i] = n[i];
        }
        else
        {
            d[i] = x[i];
        }
    }
}
// --------------------------------------------------------------
vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter)
// --------------------------------------------------------------
{
    // version with the escape test done in float
    vuint32 iter = _mm_set1_epi32(0);
    vfloat32 fiter = _mm_set_ps(0, 0, 0, 0);

    vfloat32 x, y, t, t2, zero, un, deux, quatre;

    // COMPLETE HERE
    int i = 0;

    // initialize the variables
    x = _mm_set_ps(0, 0, 0, 0);
    y = _mm_set_ps(0, 0, 0, 0);
    deux = _mm_set_ps(2, 2, 2, 2);   // two
    quatre = _mm_set_ps(4, 4, 4, 4); // four
    un = _mm_set_ps(1, 1, 1, 1);     // one
    zero = _mm_set_ps(0, 0, 0, 0);

    // iteration zero
    t = _mm_mul_ps(x, x);
    t2 = _mm_mul_ps(y, y);
    y = _mm_mul_ps(x, y);
    y = _mm_mul_ps(y, deux);
    y = _mm_add_ps(y, b);
    x = _mm_sub_ps(t, t2);
    x = _mm_add_ps(x, a);

    // main loop: stop when all four lanes have escaped (|z|^2 > 4)
    while (i < max_iter && _mm_movemask_ps(t) != 15)
    {
        t = _mm_mul_ps(x, x);
        t2 = _mm_mul_ps(y, y);
        y = _mm_mul_ps(_mm_mul_ps(x, y), deux);
        y = _mm_add_ps(y, b);
        x = _mm_sub_ps(t, t2);
        x = _mm_add_ps(x, a);
        t2 = _mm_add_ps(t, t2);
        t2 = _mm_cmple_ps(t2, quatre);   // lanes still inside
        t = _mm_blendv_ps(zero, un, t2); // 1.0 for active lanes, 0.0 otherwise
        fiter = _mm_add_ps(fiter, t);    // per-lane iteration count
        t = _mm_cmpeq_ps(t, zero);       // mask of escaped lanes
        //display_vfloat32(t, "%f\t", "T :: ");
        //printf(" MASK::%d \n", _mm_movemask_ps(t));
        i += 1;
    }
    iter = _mm_cvtps_epi32(fiter);
    return iter;
}
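// For reference, a scalar sketch of what each SIMD lane above computes
// (illustrative, not part of the original source): iterate z = z^2 + c with
// c = a + i*b and count iterations while |z|^2 <= 4, up to max_iter.
static inline int mandelbrot_scalar_f32(float a, float b, int max_iter)
{
    float x = 0.0f, y = 0.0f;
    int iter = 0;
    while (iter < max_iter && x * x + y * y <= 4.0f)
    {
        float xx = x * x - y * y + a; // Re(z^2 + c)
        y = 2.0f * x * y + b;         // Im(z^2 + c)
        x = xx;
        iter++;
    }
    return iter;
}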
void minmax_vec(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
    // We suppose that pointers are aligned on a 16-byte boundary

    // Initialise SSE registers
    __m128i sse_idx_min = _mm_setzero_si128();
    __m128i sse_idx_max = _mm_setzero_si128();
    __m128 sse_min = _mm_set1_ps(FLT_MAX);
    __m128 sse_max = _mm_set1_ps(-FLT_MAX); // not FLT_MIN, which is a tiny *positive* value

    // We unroll the loop by four, thus doing (n/4) iterations.
    const uint32_t n_sse = n & ~3U;

    __m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
    const __m128i sse_4 = _mm_set1_epi32(4);
    for (uint32_t i = 0; i < n_sse; i += 4)
    {
        const __m128 sse_v = _mm_load_ps(&buf[i]);
        const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
        const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);
        sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min);
        sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max);
        sse_idx_min = (__m128i)_mm_blendv_ps((__m128)sse_idx_min, (__m128)sse_idx, sse_cmp_min);
        sse_idx_max = (__m128i)_mm_blendv_ps((__m128)sse_idx_max, (__m128)sse_idx, sse_cmp_max);
        sse_idx = _mm_add_epi32(sse_idx, sse_4);
    }

    // SSE reduction
    float __attribute__((aligned(16))) mins[4];
    float __attribute__((aligned(16))) maxs[4];
    _mm_store_ps(mins, sse_min);
    _mm_store_ps(maxs, sse_max);

    float min = mins[0];
    float max = maxs[0];
    uint32_t idx_min = _mm_extract_epi32(sse_idx_min, 0);
    uint32_t idx_max = _mm_extract_epi32(sse_idx_max, 0);

    // Unrolled by GCC, so the extract index stays a compile-time constant
    for (int i = 1; i < 4; i++)
    {
        float v = mins[i];
        if (v < min)
        {
            min = v;
            idx_min = _mm_extract_epi32(sse_idx_min, i);
        }
        v = maxs[i];
        if (v > max)
        {
            max = v;
            idx_max = _mm_extract_epi32(sse_idx_max, i);
        }
    }

    // Epilogue for the remaining (n mod 4) elements
    for (uint32_t i = n_sse; i < n; i++)
    {
        const float v = buf[i];
        if (v < min)
        {
            min = v;
            idx_min = i;
        }
        if (v > max)
        {
            max = v;
            idx_max = i;
        }
    }

    *idx_min_ = idx_min;
    *min_ = min;
    *idx_max_ = idx_max;
    *max_ = max;
}
void minmax_vec2(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
    // We suppose that pointers are aligned on a 16-byte boundary

    // Initialise SSE registers
    __m128i sse_idx_min = _mm_setzero_si128();
    __m128i sse_idx_max = _mm_setzero_si128();
    __m128 sse_min = _mm_set1_ps(FLT_MAX);
    __m128 sse_max = _mm_set1_ps(-FLT_MAX); // not FLT_MIN, which is a tiny *positive* value

    // We unroll the loop by four, thus doing (n/4) iterations.
    const uint32_t n_sse = n & ~3U;

    __m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
    const __m128i sse_4 = _mm_set1_epi32(4);
    for (uint32_t i = 0; i < n_sse; i += 4)
    {
        const __m128 sse_v = _mm_load_ps(&buf[i]);
        const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
        const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);
        sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min);
        sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max);
        sse_idx_min = (__m128i)_mm_blendv_ps((__m128)sse_idx_min, (__m128)sse_idx, sse_cmp_min);
        sse_idx_max = (__m128i)_mm_blendv_ps((__m128)sse_idx_max, (__m128)sse_idx, sse_cmp_max);
        sse_idx = _mm_add_epi32(sse_idx, sse_4);
    }

    // SSE reduction, done fully in registers: first fold lanes {2,3} onto {0,1},
    // then lane 1 onto lane 0. _mm_shuffle_epi32 works on integer vectors, hence the casts.
    __m128 sse_min_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_min), 2 | (3 << 2)));
    __m128 sse_max_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_max), 2 | (3 << 2)));
    __m128i sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 2 | (3 << 2));
    __m128i sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 2 | (3 << 2));
    __m128 sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
    __m128 sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
    sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min);
    sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max);
    sse_idx_min = (__m128i)_mm_blendv_ps((__m128)sse_idx_min, (__m128)sse_idx_min_permute, sse_cmp_min);
    sse_idx_max = (__m128i)_mm_blendv_ps((__m128)sse_idx_max, (__m128)sse_idx_max_permute, sse_cmp_max);

    sse_min_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_min), 1));
    sse_max_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_max), 1));
    sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 1);
    sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 1);
    sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
    sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
    sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min);
    sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max);
    sse_idx_min = (__m128i)_mm_blendv_ps((__m128)sse_idx_min, (__m128)sse_idx_min_permute, sse_cmp_min);
    sse_idx_max = (__m128i)_mm_blendv_ps((__m128)sse_idx_max, (__m128)sse_idx_max_permute, sse_cmp_max);

    // Epilogue
    float min, max;
    uint32_t idx_min, idx_max;
    _mm_store_ss(&min, sse_min);
    _mm_store_ss(&max, sse_max);
    idx_min = _mm_extract_epi32(sse_idx_min, 0);
    idx_max = _mm_extract_epi32(sse_idx_max, 0);

    for (uint32_t i = n_sse; i < n; i++)
    {
        const float v = buf[i];
        if (v < min)
        {
            min = v;
            idx_min = i;
        }
        if (v > max)
        {
            max = v;
            idx_max = i;
        }
    }

    *idx_min_ = idx_min;
    *min_ = min;
    *idx_max_ = idx_max;
    *max_ = max;
}
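// A minimal usage sketch (hypothetical driver, not from the original source):
// the input buffer must be 16-byte aligned, as both minmax functions assume.
#include <cstdio>
int main()
{
    float __attribute__((aligned(16))) data[8] = { 3.f, -1.f, 7.f, 0.f, 5.f, -4.f, 2.f, 6.f };
    uint32_t imin, imax;
    float vmin, vmax;
    minmax_vec(8, data, &imin, &imax, &vmin, &vmax);
    printf("min %f at %u, max %f at %u\n", vmin, imin, vmax, imax); // min -4 at 5, max 7 at 2
    return 0;
}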
__m128 test_blendv_ps(__m128 V1, __m128 V2, __m128 V3)
{
    // CHECK-LABEL: test_blendv_ps
    // CHECK: call <4 x float> @llvm.x86.sse41.blendvps
    // CHECK-ASM: blendvps %xmm{{.*}}, %xmm{{.*}}
    return _mm_blendv_ps(V1, V2, V3);
}
void fDCT2D8x4_and_threshold_keep00_32f(const float* x, float* y, float thresh)
{
    const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const __m128 mth = _mm_set1_ps(thresh);
    const __m128 zeros = _mm_setzero_ps();

    __m128 c0 = _mm_load_ps(x);
    __m128 c1 = _mm_load_ps(x + 56);
    __m128 t0 = _mm_add_ps(c0, c1);
    __m128 t7 = _mm_sub_ps(c0, c1);

    c1 = _mm_load_ps(x + 48);
    c0 = _mm_load_ps(x + 8);
    __m128 t1 = _mm_add_ps(c0, c1);
    __m128 t6 = _mm_sub_ps(c0, c1);

    c1 = _mm_load_ps(x + 40);
    c0 = _mm_load_ps(x + 16);
    __m128 t2 = _mm_add_ps(c0, c1);
    __m128 t5 = _mm_sub_ps(c0, c1);

    c0 = _mm_load_ps(x + 24);
    c1 = _mm_load_ps(x + 32);
    __m128 t3 = _mm_add_ps(c0, c1);
    __m128 t4 = _mm_sub_ps(c0, c1);

    /*
    c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
    c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
    c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
    c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
    */

    c0 = _mm_add_ps(t0, t3);
    __m128 c3 = _mm_sub_ps(t0, t3);
    c1 = _mm_add_ps(t1, t2);
    __m128 c2 = _mm_sub_ps(t1, t2);

    /*
    c0 = t0 + t3; c3 = t0 - t3;
    c1 = t1 + t2; c2 = t1 - t2;
    */

    const __m128 invsqrt2h = _mm_set_ps1(0.353554f);

    __m128 v = _mm_mul_ps(_mm_add_ps(c0, c1), invsqrt2h);
    __m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    // keep the 00 (DC) coefficient regardless of the threshold
    __m128 v2 = _mm_blendv_ps(zeros, v, msk);
    v2 = _mm_blend_ps(v2, v, 1);
    _mm_store_ps(y, v2);

    v = _mm_mul_ps(_mm_sub_ps(c0, c1), invsqrt2h);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(y + 32, v);
    /* y[0] = c0 + c1; y[4] = c0 - c1; */

    __m128 w0 = _mm_set_ps1(0.541196f);
    __m128 w1 = _mm_set_ps1(1.306563f);

    v = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(w0, c2), _mm_mul_ps(w1, c3)), invsqrt2h);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(y + 16, v);

    v = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(w0, c3), _mm_mul_ps(w1, c2)), invsqrt2h);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(y + 48, v);
    /*
    y[2] = c2 * r[6] + c3 * r[2];
    y[6] = c3 * r[6] - c2 * r[2];
    */

    w0 = _mm_set_ps1(1.175876f);
    w1 = _mm_set_ps1(0.785695f);

    c3 = _mm_add_ps(_mm_mul_ps(w0, t4), _mm_mul_ps(w1, t7));
    c0 = _mm_sub_ps(_mm_mul_ps(w0, t7), _mm_mul_ps(w1, t4));
    /*
    c3 = t4 * r[3] + t7 * r[5];
    c0 = t7 * r[3] - t4 * r[5];
    */

    w0 = _mm_set_ps1(1.387040f);
    w1 = _mm_set_ps1(0.275899f);

    c2 = _mm_add_ps(_mm_mul_ps(w0, t5), _mm_mul_ps(w1, t6));
    c1 = _mm_sub_ps(_mm_mul_ps(w0, t6), _mm_mul_ps(w1, t5));
    /*
    c2 = t5 * r[1] + t6 * r[7];
    c1 = t6 * r[1] - t5 * r[7];
    */

    v = _mm_mul_ps(_mm_sub_ps(c0, c2), invsqrt2h);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(y + 24, v);

    v = _mm_mul_ps(_mm_sub_ps(c3, c1), invsqrt2h);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(y + 40, v);
    // y[5] = c3 - c1; y[3] = c0 - c2;

    const __m128 invsqrt2 = _mm_set_ps1(0.707107f);
    c0 = _mm_mul_ps(_mm_add_ps(c0, c2), invsqrt2);
    c3 = _mm_mul_ps(_mm_add_ps(c3, c1), invsqrt2);
    // c0 = (c0 + c2) * invsqrt2;
    // c3 = (c3 + c1) * invsqrt2;

    v = _mm_mul_ps(_mm_add_ps(c0, c3), invsqrt2h);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(y + 8, v);

    v = _mm_mul_ps(_mm_sub_ps(c0, c3), invsqrt2h);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(y + 56, v);
    // y[1] = c0 + c3; y[7] = c0 - c3;

    /*
    for (i = 0; i < 8; i++)
    {
        y[i] *= invsqrt2h;
    }
    */
}
double CChiSquaredKernel<float>::Evaluate(float* x, float* y)
{
#ifndef __SSE4_1__
    /* only cast at the end to guarantee that we get the same
     * result as when using SSE4 registers */
    float result, num;
    result = 0;
    for (size_t i = 0; i < m_n; i++)
    {
        num = x[i] * y[i];
        if (num > 0) // this implies that x+y != 0 for non-negative inputs
            result += num / (x[i] + y[i]);
    }
    return static_cast<double>(result);
#else
    __m128* px = (__m128*)x;
    __m128* py = (__m128*)y;
    __m128 sum = _mm_set1_ps(0.0f);
    __m128 mzero = _mm_set1_ps(0.0f);
    for (size_t i = 0; i < m_offset / 4; i++)
    {
        __m128 num = _mm_mul_ps(px[i], py[i]);
        __m128 denom = _mm_add_ps(px[i], py[i]);
        __m128 invdenom = _mm_rcp_ps(denom); // approximate reciprocal
        // find zeros in the numerator (whose reciprocals would be inf/NaN)
        __m128 nans = _mm_cmpeq_ps(num, mzero);
        __m128 factors = _mm_blendv_ps(invdenom, mzero, nans);
        // compute product
        __m128 temp = _mm_mul_ps(num, factors);
        // accumulate
        sum = _mm_add_ps(sum, temp);
    }
    float result[4] = { 0, 0, 0, 0 };
    _mm_storeu_ps(result, sum);
    float fresult = result[0] + result[1] + result[2] + result[3];

    // handle the tail elements past the vectorised part
    float num;
    for (size_t i = m_offset; i < m_n; i++)
    {
        num = x[i] * y[i];
        if (num > 0) // this implies that x+y != 0 for non-negative inputs
            fresult += num / (x[i] + y[i]);
    }
    return static_cast<double>(fresult);
#endif
}
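// Note: _mm_rcp_ps above is only accurate to roughly 12 bits, so the SSE branch
// can differ slightly from the scalar branch. A common refinement is one
// Newton-Raphson step, shown here as an optional sketch (not part of the
// original source), which roughly doubles the precision:
static inline __m128 rcp_nr_ps(__m128 x)
{
    __m128 r = _mm_rcp_ps(x);
    // r' = r * (2 - x * r)
    return _mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(x, r)));
}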
__m128 test_mm_blendv_ps(__m128 V1, __m128 V2, __m128 V3)
{
    // CHECK-LABEL: test_mm_blendv_ps
    // CHECK: call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
    return _mm_blendv_ps(V1, V2, V3);
}
//-------------------------------------------------------------------------------
// For each tile go through all the bins and process all the triangles in it.
// Rasterize each triangle to the CPU depth buffer.
//-------------------------------------------------------------------------------
void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx)
{
    // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
    _mm_setcsr(_mm_getcsr() | 0x8040);

    __m128i colOffset = _mm_setr_epi32(0, 1, 0, 1);
    __m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1);
    __m128i fxptZero = _mm_setzero_si128();
    float* pDepthBuffer = (float*)mpRenderTargetPixels[idx];

    // Based on TaskId determine which tile to process
    UINT screenWidthInTiles = SCREENW / TILE_WIDTH_IN_PIXELS;
    UINT tileX = tileId % screenWidthInTiles;
    UINT tileY = tileId / screenWidthInTiles;

    int tileStartX = tileX * TILE_WIDTH_IN_PIXELS;
    int tileEndX = tileStartX + TILE_WIDTH_IN_PIXELS - 1;

    int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS;
    int tileEndY = tileStartY + TILE_HEIGHT_IN_PIXELS - 1;

    ClearDepthTile(tileStartX, tileStartY, tileEndX + 1, tileEndY + 1, idx);

    UINT bin = 0;
    UINT binIndex = 0;
    UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX;
    UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX;
    UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];

    __m128 gatherBuf[4][3];
    bool done = false;
    bool allBinsEmpty = true;
    mNumRasterizedTris[idx][tileId] = numTrisInBin;
    while (!done)
    {
        // Loop through all the bins and process 4 binned triangles at a time
        UINT ii;
        int numSimdTris = 0;
        for (ii = 0; ii < SSE; ii++)
        {
            while (numTrisInBin <= 0)
            {
                // This bin is empty. Move to next bin.
                if (++bin >= 1)
                {
                    break;
                }
                numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];
                mNumRasterizedTris[idx][tileId] += numTrisInBin;
                binIndex = 0;
            }
            if (!numTrisInBin)
            {
                break; // No more tris in the bins
            }
            USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
            USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
            UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
            mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx);
            allBinsEmpty = false;
            numSimdTris++;

            ++binIndex;
            --numTrisInBin;
        }
        done = bin >= NUM_XFORMVERTS_TASKS;

        if (allBinsEmpty)
        {
            return;
        }

        // use fixed-point only for X and Y. Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
        __m128 Z[3];
        for (int i = 0; i < 3; i++)
        {
            __m128 v0 = gatherBuf[0][i];
            __m128 v1 = gatherBuf[1][i];
            __m128 v2 = gatherBuf[2][i];
            __m128 v3 = gatherBuf[3][i];

            // transpose into SoA layout
            _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
            fxPtX[i] = _mm_cvtps_epi32(v0);
            fxPtY[i] = _mm_cvtps_epi32(v1);
            Z[i] = v2;
        }

        // Fab(x, y) = Ax + By + C = 0
        // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
        // Compute A = (ya - yb) for the 3 line segments that make up each triangle
        __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
        __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
        __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

        // Compute B = (xb - xa) for the 3 line segments that make up each triangle
        __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
        __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
        __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

        // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
        __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
        __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
        __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

        // Compute triangle area
        __m128i triArea = _mm_mullo_epi32(B2, A1);
        triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
        __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

        Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea);
        Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea);

        // Use bounding box traversal strategy to determine which pixels to rasterize
        __m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE));
        __m128i endX = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX));

        __m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE));
        __m128i endY = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY));

        // Now we have 4 triangles set up. Rasterize them each individually.
        for (int lane = 0; lane < numSimdTris; lane++)
        {
            // Extract this triangle's properties from the SIMD versions
            __m128 zz[3];
            for (int vv = 0; vv < 3; vv++)
            {
                zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]);
            }

            int startXx = startX.m128i_i32[lane];
            int endXx = endX.m128i_i32[lane];
            int startYy = startY.m128i_i32[lane];
            int endYy = endY.m128i_i32[lane];

            __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
            __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
            __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

            __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
            __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
            __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

            __m128i aa0Inc = _mm_slli_epi32(aa0, 1);
            __m128i aa1Inc = _mm_slli_epi32(aa1, 1);
            __m128i aa2Inc = _mm_slli_epi32(aa2, 1);

            __m128i row, col;

            // Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
            // This method provides better performance
            int rowIdx = (startYy * SCREENW + 2 * startXx);

            col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
            __m128i aa0Col = _mm_mullo_epi32(aa0, col);
            __m128i aa1Col = _mm_mullo_epi32(aa1, col);
            __m128i aa2Col = _mm_mullo_epi32(aa2, col);

            row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
            __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane]));
            __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane]));
            __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane]));

            __m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row);
            __m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row);
            __m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row);

            __m128i bb0Inc = _mm_slli_epi32(bb0, 1);
            __m128i bb1Inc = _mm_slli_epi32(bb1, 1);
            __m128i bb2Inc = _mm_slli_epi32(bb2, 1);

            __m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]);
            zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2]));

            // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
            for (int r = startYy; r < endYy;
                 r += 2,
                 rowIdx += 2 * SCREENW,
                 sum0Row = _mm_add_epi32(sum0Row, bb0Inc),
                 sum1Row = _mm_add_epi32(sum1Row, bb1Inc),
                 sum2Row = _mm_add_epi32(sum2Row, bb2Inc))
            {
                // Compute barycentric coordinates
                int index = rowIdx;
                __m128i alpha = sum0Row;
                __m128i beta = sum1Row;
                __m128i gama = sum2Row;

                // Compute barycentric-interpolated depth
                __m128 depth = zz[0];
                depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
                depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

                for (int c = startXx; c < endXx;
                     c += 2,
                     index += 4,
                     alpha = _mm_add_epi32(alpha, aa0Inc),
                     beta = _mm_add_epi32(beta, aa1Inc),
                     gama = _mm_add_epi32(gama, aa2Inc),
                     depth = _mm_add_ps(depth, zx))
                {
                    // Test pixel inside triangle: the sign bit of any negative
                    // edge value marks the pixel as outside
                    __m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama);

                    __m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]);
                    __m128 mergedDepth = _mm_max_ps(depth, previousDepthValue);
                    __m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask));
                    _mm_store_ps(&pDepthBuffer[index], finalDepth);
                } // for each column
            } // for each row
        } // for each triangle
    } // for each set of SIMD# triangles
}
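// A minimal sketch of the selection rule the depth test above relies on
// (illustrative only, not part of the original source): _mm_blendv_ps looks
// only at each mask lane's sign bit, so OR-ing the three edge values yields a
// mask whose negative lanes (pixel outside the triangle) keep the old depth.
static inline __m128 depth_test_blend(__m128 newDepth, __m128 oldDepth, __m128i edgeOr)
{
    __m128 merged = _mm_max_ps(newDepth, oldDepth); // depth compare (greater wins here)
    // lanes with a negative combined edge value fall back to the old depth
    return _mm_blendv_ps(merged, oldDepth, _mm_castsi128_ps(edgeOr));
}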