/* Function: esl_sse_expf() * Synopsis: <r[z] = exp x[z]> * Incept: SRE, Fri Dec 14 14:46:27 2007 [Janelia] * * Purpose: Given a vector <x> containing four floats, returns a * vector <r> in which each element <r[z] = expf(x[z])>. * * Valid for all IEEE754 floats $x_z$. * * Xref: J2/71 * J10/62: bugfix, minlogf/maxlogf range was too wide; * (k+127) must be >=0 and <=255, so (k+127)<<23 * is a valid IEEE754 float, without touching * the sign bit. Pommier had this right in the * first place, and I didn't understand. * * Note: Derived from an SSE1 implementation by Julian * Pommier. Converted to SSE2. * * Note on maxlogf/minlogf, which are close to but not * exactly 127.5 log(2) [J10/63]. We need -127<=k<=128, so * k+127 is 0..255, a valid IEEE754 8-bit exponent * (0..255), so the bit pattern (k+127)<<23 is IEEE754 * single-precision for 2^k. If k=-127, we get IEEE754 0. * If k=128, we get IEEE754 +inf. If k<-127, k+127 is * negative and we get screwed up. If k>128, k+127 * overflows the 8-bit exponent and sets the sign bit. So * for x' (base 2) < -127.5 we must definitely return e^x ~ * 0; for x' < -126.5 we're going to calculate 0 anyway * (because k=floor(-126.5-epsilon+0.5) = -127). So any * minlogf between -126.5 log2 ... -127.5 log2 will suffice * as the cutoff. Ditto for 126.5 log2 .. 127.5 log2. * That's 87.68312 .. 88.3762655. I think Pommier's * thinking is, you don't want to get too close to the * edges, lest fp roundoff error screw you (he may have * considered 1 ulp carefully, I can't tell), but otherwise * you may as well put your bounds close to the outer edge; * so * maxlogf = 127.5 log(2) - epsilon * minlogf = -127.5 log(2) + epsilon * for an epsilon that happens to be ~ 3e-6. */ __m128 esl_sse_expf(__m128 x) { static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f, 4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f }; static float cephes_c[2] = { 0.693359375f, -2.12194440e-4f }; static float maxlogf = 88.3762626647949f; /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */ static float minlogf = -88.3762626647949f; /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */ __m128i k; __m128 mask, tmp, fx, z, y, minmask, maxmask; /* handle out-of-range and special conditions */ maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf)); minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf)); /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */ fx = _mm_mul_ps(x, _mm_set1_ps(eslCONST_LOG2R)); fx = _mm_add_ps(fx, _mm_set1_ps(0.5f)); /* floorf() with SSE: */ k = _mm_cvttps_epi32(fx); /* cast to int with truncation */ tmp = _mm_cvtepi32_ps(k); /* cast back to float */ mask = _mm_cmpgt_ps(tmp, fx); /* if it increased (i.e. if it was negative...) */ mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch... */ fx = _mm_sub_ps(tmp, mask); /* then subtract one. */ k = _mm_cvttps_epi32(fx); /* k is now ready for the 2^k part.
*/ /* polynomial approx for e^f for f in range [-0.5, 0.5] */ tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0])); z = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1])); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x, x); y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(1.0f)); /* build 2^k by hand, by creating an IEEE754 float */ k = _mm_add_epi32(k, _mm_set1_epi32(127)); k = _mm_slli_epi32(k, 23); fx = _mm_castsi128_ps(k); /* put 2^k e^f together (fx = 2^k, y = e^f) and we're done */ y = _mm_mul_ps(y, fx); /* special/range cleanup */ y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128) */ y = esl_sse_select_ps(y, _mm_set1_ps(0.0f), minmask); /* exp(x) = 0 for x < log(2^-149) */ return y; }
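A scalar sketch of the 2^k construction the note above describes (my illustration, not part of Easel): for -127 <= k <= 128, (k+127) fits the 8-bit exponent field, so (k+127)<<23 is the bit pattern of 2^k, with k=-127 flushing to +0 and k=128 producing +inf.

#include <stdint.h>
#include <string.h>

/* Illustrative only: build 2^k by writing the IEEE754 exponent field. */
static float pow2_bits(int k)                   /* requires -127 <= k <= 128 */
{
    uint32_t bits = (uint32_t)(k + 127) << 23;  /* biased exponent, zero mantissa */
    float f;
    memcpy(&f, &bits, sizeof f);                /* reinterpret, like _mm_castsi128_ps */
    return f;
}
/* pow2_bits(3) == 8.0f, pow2_bits(-127) == 0.0f, pow2_bits(128) == +inf */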
__m128 exp_128( const __m128& x) { //! Clip the value __m128 y = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(88.3762626647949f)), _mm_set1_ps(-88.3762626647949f)); //! Express exp(x) as exp(g + n * log(2)) __m128 fx = y * _mm_set1_ps(1.44269504088896341f) + _mm_set1_ps(0.5f); //! Truncate toward zero const __m128 tmp = _mm_round_ps(fx, _MM_FROUND_TO_ZERO); //! If greater, subtract 1 (completes the floor) const __m128 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), _mm_set1_ps(1.f)); fx = tmp - mask; y -= fx * _mm_set1_ps(0.693359375f - 2.12194440e-4f); const __m128 z = y * y; const __m128 t = (((((_mm_set1_ps(1.9875691500E-4f) * y + _mm_set1_ps(1.3981999507E-3f)) * y + _mm_set1_ps(8.3334519073E-3f)) * y + _mm_set1_ps(4.1665795894E-2f)) * y + _mm_set1_ps(1.6666665459E-1f)) * y + _mm_set1_ps(5.0000001201E-1f)) * z + y + _mm_set1_ps(1.f); //! Build 2^n const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(fx), _mm_set1_epi32(0x7f)); //! Return the result return t * _mm_castsi128_ps(_mm_slli_epi32(emm0, 23)); }
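The truncate-then-fix-up idiom above (and in the exp_ps variants further down) is a branchless floorf; a scalar sketch of what the compare-and-subtract computes:

/* Sketch: truncation toward zero (_mm_cvttps_epi32 / _MM_FROUND_TO_ZERO)
 * lands one too high for negative non-integers; comparing against the
 * input and subtracting 1 in that case yields floor. */
static float floor_by_truncate(float x)
{
    float t = (float)(int)x;          /* truncate, like _mm_cvttps_epi32  */
    return (t > x) ? t - 1.0f : t;    /* the cmpgt/and/sub in the SSE code */
}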
void ColorFragment(MSR_FShaderParameters *params) { MSR_SSEColor3 &out = params->output; const MSR_ShaderGlobals *globals = params->globals; // Get the texture MSR_SSEColor3 tex = MSR_Tex2D_Wrap(params->globals->tex0, params->varyings[0], params->varyings[1]); // Depth params->varyings[10] = _mm_rcp_ps( *params->varyings[10] ); params->varyings[8] *= params->varyings[10]; params->varyings[9] = SSE_ONE - (params->varyings[9] * params->varyings[10]); MSR_SSEFloat shadow = MSR_Tex2D_F32(shadow_map_depth, params->varyings[8], params->varyings[9]); // Assemble the light direction const MSR_Vec4 &ld = params->globals->lights[0].direction; MSR_SSEVec3 LightDir( MSR_SSEFloat(ld.x), MSR_SSEFloat(ld.y), MSR_SSEFloat(ld.z) ); params->varyings[10] += bias_amt; float4 cmp = _mm_cmplt_ps(*shadow, *params->varyings[10]); // Get the eye vector MSR_SSEVec3 *Eye = (MSR_SSEVec3*)&params->varyings[2]; Eye->Normalize(); // Get the normal vector MSR_SSEVec3 *Normal = (MSR_SSEVec3*)&params->varyings[5]; Normal->Normalize(); // Diffuse MSR_SSEFloat diff = MSR_Clamp(Normal->Dot(LightDir), *SSE_ZERO, *SSE_ONE); // Specular MSR_SSEVec3 Reflect = ((diff * MSR_SSEFloat(2.0f)) * *Normal) - LightDir; Reflect.Normalize(); MSR_SSEFloat spec = MSR_Clamp(Reflect.Dot(*Eye), *SSE_ZERO, *SSE_ONE); spec = spec * spec * spec * spec * spec * spec * spec; float4 cmp2 = _mm_cmpgt_ps( *diff, *SSE_ZERO ); cmp = _mm_and_ps( cmp2, cmp ); MSR_SSEFloat clamped_ao = MSR_Clamp( params->varyings[11], *SSE_ZERO, *SSE_ONE ); out.r = tex.r * clamped_ao; out.g = tex.g * clamped_ao; out.b = tex.b * clamped_ao; MSR_SSEColor3 opt1, opt2; opt1.r = globals->ml_ambient[0].r; opt1.g = globals->ml_ambient[0].g; opt1.b = globals->ml_ambient[0].b; opt2.r = (diff * globals->ml_diffuse[0].r) + (spec * globals->ml_specular[0].r) + opt1.r; opt2.g = (diff * globals->ml_diffuse[0].g) + (spec * globals->ml_specular[0].g) + opt1.g; opt2.b = (diff * globals->ml_diffuse[0].b) + (spec * globals->ml_specular[0].b) + opt1.b; out.r *= _mm_or_ps( _mm_and_ps(*opt2.r,cmp), _mm_andnot_ps(cmp, *opt1.r) ); out.g *= _mm_or_ps( _mm_and_ps(*opt2.g,cmp), _mm_andnot_ps(cmp, *opt1.g) ); out.b *= _mm_or_ps( _mm_and_ps(*opt2.b,cmp), _mm_andnot_ps(cmp, *opt1.b) ); }
static inline __v4sf gamma_2_2_to_linear_sse2 (__v4sf x) { __v4sf curve = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f)); __v4sf line = x * splat4f (1/12.92f); __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.04045f)); return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line)); }
static inline __v4sf linear_to_gamma_2_2_sse2 (__v4sf x) { __v4sf curve = sse_pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f); __v4sf line = x * splat4f (12.92f); __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f)); return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line)); }
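Both conversions above, and several snippets below, use the same and/andnot/or idiom as a branchless per-lane select; a helper (hypothetical name, my sketch) makes the intent explicit:

#include <xmmintrin.h>

/* Per-lane (mask ? a : b) for all-ones/all-zeros comparison masks;
 * on SSE4.1 targets, _mm_blendv_ps(b, a, mask) is the single-instruction form. */
static inline __m128 select_ps(__m128 mask, __m128 a, __m128 b)
{
    return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}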
SIMDValue SIMDFloat32x4Operation::OpGreaterThan(const SIMDValue& aValue, const SIMDValue& bValue) { X86SIMDValue x86Result; X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue); X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue); x86Result.m128_value = _mm_cmpgt_ps(tmpaValue.m128_value, tmpbValue.m128_value); // a > b? return X86SIMDValue::ToSIMDValue(x86Result); }
// Normalization ---------------------------------------------------------------------- static Float3 VFunction Normalize(const Float3& vector) { Vector vLengthSquared = Dot(vector, vector); Vector vEpsilon = _mm_cmpgt_ps(vLengthSquared, Constant::Epsilon); Vector vLengthReciprocal = _mm_rsqrt_ps(vLengthSquared); Vector vMultiplier = _mm_and_ps(vLengthReciprocal, vEpsilon); Vector vResult = _mm_mul_ps(vector, vMultiplier); return vResult; }
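A note on the epsilon mask in Normalize above: without it, _mm_rsqrt_ps(0) is +inf and 0 * inf is NaN. A minimal standalone sketch of the guard:

#include <xmmintrin.h>

/* Zero the reciprocal length where length^2 <= eps, so degenerate
 * vectors normalize to the zero vector instead of NaN. */
static inline __m128 guarded_rsqrt_ps(__m128 len_sq, __m128 eps)
{
    __m128 rcp = _mm_rsqrt_ps(len_sq);       /* approximate 1/sqrt       */
    __m128 ok  = _mm_cmpgt_ps(len_sq, eps);  /* lanes with usable length */
    return _mm_and_ps(rcp, ok);              /* 0.0f in degenerate lanes */
}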
void Detect32f(const HidHaarCascade & hid, size_t offset, const __m128 & norm, __m128i & result) { typedef HidHaarCascade Hid; const float * leaves = hid.leaves.data(); const Hid::Node * node = hid.nodes.data(); const Hid::Stage * stages = hid.stages.data(); for (int i = 0, n = (int)hid.stages.size(); i < n; ++i) { const Hid::Stage & stage = stages[i]; if (stage.canSkip) continue; const Hid::Node * end = node + stage.ntrees; __m128 stageSum = _mm_setzero_ps(); if (stage.hasThree) { for (; node < end; ++node, leaves += 2) { const Hid::Feature & feature = hid.features[node->featureIdx]; __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset)); if (feature.rect[2].p0) sum = _mm_add_ps(sum, WeightedSum32f(feature.rect[2], offset)); StageSum32f(leaves, node->threshold, sum, norm, stageSum); } } else { for (; node < end; ++node, leaves += 2) { const Hid::Feature & feature = hid.features[node->featureIdx]; __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset)); StageSum32f(leaves, node->threshold, sum, norm, stageSum); } } result = _mm_andnot_si128(_mm_castps_si128(_mm_cmpgt_ps(_mm_set1_ps(stage.threshold), stageSum)), result); int resultCount = ResultCount(result); if (resultCount == 0) { return; } else if (resultCount == 1) { uint32_t SIMD_ALIGNED(16) _result[4]; float SIMD_ALIGNED(16) _norm[4]; _mm_store_si128((__m128i*)_result, result); _mm_store_ps(_norm, norm); for (int j = 0; j < 4; ++j) { if (_result[j]) { _result[j] = Base::Detect32f(hid, offset + j, i + 1, _norm[j]) > 0 ? 1 : 0; break; } } result = _mm_load_si128((__m128i*)_result); return; } } }
template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128 & dx, const __m128 & dy, Buffer & buffer, size_t col) { __m128 bestDot = _mm_setzero_ps(); __m128i bestIndex = _mm_setzero_si128(); for(int i = 0; i < buffer.size; ++i) { __m128 dot = _mm_add_ps(_mm_mul_ps(dx, buffer.cos[i]), _mm_mul_ps(dy, buffer.sin[i])); __m128 mask = _mm_cmpgt_ps(dot, bestDot); bestDot = _mm_max_ps(dot, bestDot); bestIndex = Combine(_mm_castps_si128(mask), buffer.pos[i], bestIndex); dot = _mm_sub_ps(_mm_setzero_ps(), dot); mask = _mm_cmpgt_ps(dot, bestDot); bestDot = _mm_max_ps(dot, bestDot); bestIndex = Combine(_mm_castps_si128(mask), buffer.neg[i], bestIndex); } Store<align>((__m128i*)(buffer.index + col), bestIndex); Sse::Store<align>(buffer.value + col, _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy)))); }
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1]) { const __m128 k1e_10f = _mm_set1_ps(1e-10f); const __m128 kThresh = _mm_set1_ps(aec->errThresh); const __m128 kMu = _mm_set1_ps(aec->mu); int i; // vectorized code (four at once) for (i = 0; i + 3 < PART_LEN1; i += 4) { const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]); const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]); const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]); const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f); __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus); __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus); const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re); const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im); const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2); const __m128 absEf = _mm_sqrt_ps(ef_sum2); const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh); __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f); const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus); __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv); __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv); ef_re_if = _mm_and_ps(bigger, ef_re_if); ef_im_if = _mm_and_ps(bigger, ef_im_if); ef_re = _mm_andnot_ps(bigger, ef_re); ef_im = _mm_andnot_ps(bigger, ef_im); ef_re = _mm_or_ps(ef_re, ef_re_if); ef_im = _mm_or_ps(ef_im, ef_im_if); ef_re = _mm_mul_ps(ef_re, kMu); ef_im = _mm_mul_ps(ef_im, kMu); _mm_storeu_ps(&ef[0][i], ef_re); _mm_storeu_ps(&ef[1][i], ef_im); } // scalar code for the remaining items. for (; i < (PART_LEN1); i++) { float absEf; ef[0][i] /= (aec->xPow[i] + 1e-10f); ef[1][i] /= (aec->xPow[i] + 1e-10f); absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); if (absEf > aec->errThresh) { absEf = aec->errThresh / (absEf + 1e-10f); ef[0][i] *= absEf; ef[1][i] *= absEf; } // Stepsize factor ef[0][i] *= aec->mu; ef[1][i] *= aec->mu; } }
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh) { __m128 ms0 = _mm_load_ps(src); __m128 ms1 = _mm_load_ps(src + 4); const __m128 mm = _mm_set1_ps(0.5f); __m128 a = _mm_add_ps(ms0, ms1); __m128 b = _mm_sub_ps(ms0, ms1); __m128 t1 = _mm_unpacklo_ps(a, b); __m128 t2 = _mm_unpackhi_ps(a, b); ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0)); ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2)); a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1)); b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1)); const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; const __m128 mth = _mm_set1_ps(thresh); __m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth); ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk); #ifdef _KEEP_00_COEF_ ms0 = _mm_blend_ps(ms0, a, 1); #endif msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth); ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk); a = _mm_add_ps(ms0, ms1); b = _mm_sub_ps(ms0, ms1); t1 = _mm_unpacklo_ps(a, b); t2 = _mm_unpackhi_ps(a, b); ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0)); ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2)); a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1)); b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1)); _mm_store_ps(dest, a); _mm_store_ps(dest + 4, b); }
static inline __m128 curve_vec4( const __m128 x, const __m128 g, const __m128 sigma, const __m128 shadows, const __m128 highlights, const __m128 clarity) { // TODO: pull these non-data-dependent constants out of the loop to see // whether the compiler fails to do so const __m128 const0 = _mm_set_ps1(0x3f800000u); const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 const __m128 one = _mm_set1_ps(1.0f); const __m128 two = _mm_set1_ps(2.0f); const __m128 twothirds = _mm_set1_ps(2.0f/3.0f); const __m128 twosig = _mm_mul_ps(two, sigma); const __m128 sigma2 = _mm_mul_ps(sigma, sigma); const __m128 s22 = _mm_mul_ps(twothirds, sigma2); const __m128 c = _mm_sub_ps(x, g); const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps()); // select shadows or highlights as multiplier for linear part, based on c < 0 const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights)); // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma) const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask)); // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma)))); const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(), _mm_div_ps(c, _mm_mul_ps(two, ssigma)))); const __m128 t2 = _mm_mul_ps(t, t); const __m128 mt = _mm_sub_ps(one, t); // midtone value fading over to linear part, without local contrast: const __m128 vmid = _mm_add_ps(g, _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)), _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi))))); // c > 2*sigma? const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig); const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid)); // midtone local contrast // dt_fast_expf in sse: const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22)); const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0))); const __m128 k = _mm_max_ps(k0, _mm_setzero_ps()); const __m128i ki = _mm_cvtps_epi32(k); const __m128 gauss = _mm_load_ps((float*)&ki); const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss)); return _mm_add_ps(val, vcon); }
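The gaussian above is computed with a bit-trick exp: const0 is the bit pattern of 1.0f and const1 that of e, and lerping between them in bit space approximates e^x. A scalar sketch of the k0/k/ki steps (illustrative only, not darktable's exact dt_fast_expf):

#include <stdint.h>
#include <string.h>

/* Approximate e^x by linear interpolation between the bit patterns of
 * 1.0f (0x3f800000) and e (0x402DF854); clamping at 0 handles underflow. */
static float fast_expf_sketch(float x)
{
    const float i1 = 0x3f800000u;       /* bits of 1.0f, as a float value */
    const float i2 = 0x402DF854u;       /* bits of e,    as a float value */
    float k = i1 + x * (i2 - i1);       /* the k0 lerp step above         */
    if (k < 0.0f) k = 0.0f;             /* the _mm_max_ps clamp           */
    int32_t ki = (int32_t)(k + 0.5f);   /* _mm_cvtps_epi32 rounds         */
    float f;
    memcpy(&f, &ki, sizeof f);          /* reinterpret bits as float      */
    return f;
}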
// a > b void _SIMD_cmpgt_ps(__SIMD a, __SIMD b, void** resultPtr) { __SIMD* result = (__SIMD*)malloc(sizeof(__SIMD)); *resultPtr = result; #ifdef USE_SSE *result = _mm_cmpgt_ps(a,b); #elif defined USE_AVX *result = _mm256_cmp_ps(a,b,30); #elif defined USE_IBM *result = vec_cmpgt(a,b); #endif }
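In the AVX path above, the literal predicate 30 is _CMP_GT_OQ (greater-than, ordered, non-signaling); the named constant is equivalent and self-documenting:

#include <immintrin.h>

/* AVX replaces the _mm_cmpXX_ps family with one predicated compare. */
static inline __m256 avx_cmpgt_ps(__m256 a, __m256 b)
{
    return _mm256_cmp_ps(a, b, _CMP_GT_OQ);   /* _CMP_GT_OQ == 30 */
}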
__m128 exp_ps(__m128 x) { typedef __m128 v4sf; typedef __m128i v4si; v4sf tmp = _mm_setzero_ps(), fx; v4si emm0; v4sf one = constants::ps_1.ps; x = _mm_min_ps(x, constants::exp_hi.ps); x = _mm_max_ps(x, constants::exp_lo.ps); /* express exp(x) as exp(g + n*log(2)) */ fx = _mm_mul_ps(x, constants::cephes_LOG2EF.ps); fx = _mm_add_ps(fx, constants::ps_0p5.ps); /* how to perform a floorf with SSE: just below */ emm0 = _mm_cvttps_epi32(fx); tmp = _mm_cvtepi32_ps(emm0); /* if greater, subtract 1 */ v4sf mask = _mm_cmpgt_ps(tmp, fx); mask = _mm_and_ps(mask, one); fx = _mm_sub_ps(tmp, mask); tmp = _mm_mul_ps(fx, constants::cephes_exp_C1.ps); v4sf z = _mm_mul_ps(fx, constants::cephes_exp_C2.ps); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x,x); v4sf y = constants::cephes_exp_p0.ps; y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p1.ps); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p2.ps); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p3.ps); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p4.ps); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p5.ps); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, one); /* build 2^n */ emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, constants::pi32_0x7f.pi); emm0 = _mm_slli_epi32(emm0, 23); v4sf pow2n = _mm_castsi128_ps(emm0); y = _mm_mul_ps(y, pow2n); return y; }
void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh) { const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; const __m128 mth = _mm_set1_ps(thresh); const __m128 zeros = _mm_setzero_ps(); const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2); const __m128 c6 = _mm_set1_ps(0.541196f);//cos(CV_PI*6/16.0)*sqrt(2); __m128 s0 = _mm_load_ps(s); s += 4; __m128 s1 = _mm_load_ps(s); s += 4; __m128 s2 = _mm_load_ps(s); s += 4; __m128 s3 = _mm_load_ps(s); __m128 p03 = _mm_add_ps(s0, s3); __m128 p12 = _mm_add_ps(s1, s2); __m128 m03 = _mm_sub_ps(s0, s3); __m128 m12 = _mm_sub_ps(s1, s2); __m128 v = _mm_add_ps(p03, p12); __m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth); // keep 00 coef. __m128 v2 = _mm_blendv_ps(zeros, v, msk); v2 = _mm_blend_ps(v2, v, 1); _mm_store_ps(d, v2); v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12)); msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth); v = _mm_blendv_ps(zeros, v, msk); _mm_store_ps(d + 4, v); v = _mm_sub_ps(p03, p12); msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth); v = _mm_blendv_ps(zeros, v, msk); _mm_store_ps(d + 8, v); v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12)); msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth); v = _mm_blendv_ps(zeros, v, msk); _mm_store_ps(d + 12, v); }
static inline __m128 lab_f_inv_m(const __m128 x) { const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f); const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f); const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f); // x > epsilon const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x); // x <= epsilon const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16); // blend results according to whether each component is > epsilon or not const __m128 mask = _mm_cmpgt_ps(x, epsilon); return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small)); }
// Operators INLINE bool SVec4::operator==(const SVec4 &rhs) const { #ifdef USE_SSE SIMDvec dif = _mm_sub_ps( m_128, rhs.m_128 ); SIMDvec ep = _mm_set1_ps( math::Epsilon ); SIMDvec neg_ep = _mm_set1_ps( -math::Epsilon ); return ( 0xf == _mm_movemask_ps( _mm_and_ps( _mm_cmpgt_ps( ep, dif ), _mm_cmplt_ps( neg_ep, dif ) ) ) ); #else return math::IsEqual(m_x, rhs.X()) && math::IsEqual(m_y, rhs.Y()) && math::IsEqual(m_z, rhs.Z()) && math::IsEqual(m_w, rhs.W()); #endif }
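The two compares above implement -epsilon < dif < epsilon, i.e. |dif| < epsilon, per lane; an equivalent sketch that clears the sign bit and uses a single compare:

#include <xmmintrin.h>

/* -0.0f is just the sign bit, so andnot(-0.0f, x) computes |x| per lane. */
static inline bool nearly_equal_ps(__m128 a, __m128 b, float eps)
{
    __m128 dif  = _mm_sub_ps(a, b);
    __m128 absd = _mm_andnot_ps(_mm_set1_ps(-0.0f), dif);
    return _mm_movemask_ps(_mm_cmplt_ps(absd, _mm_set1_ps(eps))) == 0xf;
}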
v4sf exp_ps(v4sf x) { v4sf tmp = _mm_setzero_ps(), fx; v4si emm0; v4sf one = *(v4sf*)_ps_1; x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi); x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo); fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF); fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5); emm0 = _mm_cvttps_epi32(fx); tmp = _mm_cvtepi32_ps(emm0); v4sf mask = _mm_cmpgt_ps(tmp, fx); mask = _mm_and_ps(mask, one); fx = _mm_sub_ps(tmp, mask); tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1); v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x,x); v4sf y = *(v4sf*)_ps_cephes_exp_p0; y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, one); emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f); emm0 = _mm_slli_epi32(emm0, 23); v4sf pow2n = _mm_castsi128_ps(emm0); y = _mm_mul_ps(y, pow2n); return y; }
static inline __m128 bicubic_sse(__m128 width, __m128 t) { static const __m128 half = { .5f, .5f, .5f, .5f}; static const __m128 one = { 1.f, 1.f, 1.f, 1.f}; static const __m128 two = { 2.f, 2.f, 2.f, 2.f}; static const __m128 three = { 3.f, 3.f, 3.f, 3.f}; static const __m128 four = { 4.f, 4.f, 4.f, 4.f}; static const __m128 five = { 5.f, 5.f, 5.f, 5.f}; static const __m128 eight = { 8.f, 8.f, 8.f, 8.f}; t = _mm_abs_ps(t); __m128 t2 = _mm_mul_ps(t, t); /* Compute 1 < t < 2 case: * 0.5f*(t*(-t2 + 5.f*t - 8.f) + 4.f) * half*(t*(mt2 + t5 - eight) + four) * half*(t*(mt2 + t5_sub_8) + four) * half*(t*(mt2_add_t5_sub_8) + four) */ __m128 t5 = _mm_mul_ps(five, t); __m128 t5_sub_8 = _mm_sub_ps(t5, eight); __m128 zero = _mm_setzero_ps(); __m128 mt2 = _mm_sub_ps(zero, t2); __m128 mt2_add_t5_sub_8 = _mm_add_ps(mt2, t5_sub_8); __m128 a = _mm_mul_ps(t, mt2_add_t5_sub_8); __m128 b = _mm_add_ps(a, four); __m128 r12 = _mm_mul_ps(b, half); /* Compute case < 1 * 0.5f*(t*(3.f*t2 - 5.f*t) + 2.f) */ __m128 t23 = _mm_mul_ps(three, t2); __m128 c = _mm_sub_ps(t23, t5); __m128 d = _mm_mul_ps(t, c); __m128 e = _mm_add_ps(d, two); __m128 r01 = _mm_mul_ps(half, e); // Compute masks for keeping correct components __m128 mask01 = _mm_cmple_ps(t, one); __m128 mask12 = _mm_cmpgt_ps(t, one); r01 = _mm_and_ps(mask01, r01); r12 = _mm_and_ps(mask12, r12); return _mm_or_ps(r01, r12); }
void zlimit (int simd, float *src , float *dst , size_t size) { if (simd) { __m128 zero4 = _mm_set1_ps(0.f); while ( size >= 4) { __m128 srcv = _mm_loadu_ps(src); __m128 cmpv = _mm_cmpgt_ps(srcv, zero4); __m128 dstv = _mm_and_ps(cmpv, srcv); _mm_storeu_ps(dst, dstv); src += 4; dst += 4; size -= 4; } } else { while (size) { *dst = *src > 0.f ? *src : 0.f ; src++; dst++; size--; } } }
static void thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) { int i, j; Size roi = _src.size(); roi.width *= _src.channels(); const float* src = (const float*)_src.data; float* dst = (float*)_dst.data; size_t src_step = _src.step/sizeof(src[0]); size_t dst_step = _dst.step/sizeof(dst[0]); #if CV_SSE2 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); #endif if( _src.isContinuous() && _dst.isContinuous() ) { roi.width *= roi.height; roi.height = 1; } #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type)) return; #endif #if defined(HAVE_IPP) IppiSize sz = { roi.width, roi.height }; switch( type ) { case THRESH_TRUNC: if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh)) return; setIppErrorStatus(); break; case THRESH_TOZERO: if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0)) return; setIppErrorStatus(); break; case THRESH_TOZERO_INV: if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0)) return; setIppErrorStatus(); break; } #endif switch( type ) { case THRESH_BINARY: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval); for( ; j <= roi.width - 8; j += 8 ) { __m128 v0, v1; v0 = _mm_loadu_ps( src + j ); v1 = _mm_loadu_ps( src + j + 4 ); v0 = _mm_cmpgt_ps( v0, thresh4 ); v1 = _mm_cmpgt_ps( v1, thresh4 ); v0 = _mm_and_ps( v0, maxval4 ); v1 = _mm_and_ps( v1, maxval4 ); _mm_storeu_ps( dst + j, v0 ); _mm_storeu_ps( dst + j + 4, v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] > thresh ? maxval : 0; } break; case THRESH_BINARY_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval); for( ; j <= roi.width - 8; j += 8 ) { __m128 v0, v1; v0 = _mm_loadu_ps( src + j ); v1 = _mm_loadu_ps( src + j + 4 ); v0 = _mm_cmple_ps( v0, thresh4 ); v1 = _mm_cmple_ps( v1, thresh4 ); v0 = _mm_and_ps( v0, maxval4 ); v1 = _mm_and_ps( v1, maxval4 ); _mm_storeu_ps( dst + j, v0 ); _mm_storeu_ps( dst + j + 4, v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] <= thresh ? maxval : 0; } break; case THRESH_TRUNC: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh); for( ; j <= roi.width - 8; j += 8 ) { __m128 v0, v1; v0 = _mm_loadu_ps( src + j ); v1 = _mm_loadu_ps( src + j + 4 ); v0 = _mm_min_ps( v0, thresh4 ); v1 = _mm_min_ps( v1, thresh4 ); _mm_storeu_ps( dst + j, v0 ); _mm_storeu_ps( dst + j + 4, v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = std::min(src[j], thresh); } break; case THRESH_TOZERO: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh); for( ; j <= roi.width - 8; j += 8 ) { __m128 v0, v1; v0 = _mm_loadu_ps( src + j ); v1 = _mm_loadu_ps( src + j + 4 ); v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4)); v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4)); _mm_storeu_ps( dst + j, v0 ); _mm_storeu_ps( dst + j + 4, v1 ); } } #endif for( ; j < roi.width; j++ ) { float v = src[j]; dst[j] = v > thresh ? 
v : 0; } } break; case THRESH_TOZERO_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128 thresh4 = _mm_set1_ps(thresh); for( ; j <= roi.width - 8; j += 8 ) { __m128 v0, v1; v0 = _mm_loadu_ps( src + j ); v1 = _mm_loadu_ps( src + j + 4 ); v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4)); v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4)); _mm_storeu_ps( dst + j, v0 ); _mm_storeu_ps( dst + j + 4, v1 ); } } #endif for( ; j < roi.width; j++ ) { float v = src[j]; dst[j] = v <= thresh ? v : 0; } } break; default: return CV_Error( CV_StsBadArg, "" ); } }
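The THRESH_BINARY kernels above lean on the compare producing all-ones lanes: ANDing the mask with maxval yields maxval where the predicate holds and 0.0f elsewhere. The core per-vector step, isolated (a sketch, not OpenCV API):

#include <xmmintrin.h>

/* dst[z] = src[z] > thresh ? maxval : 0.0f, four lanes at a time. */
static inline __m128 thresh_binary_ps(__m128 v, __m128 thresh4, __m128 maxval4)
{
    return _mm_and_ps(_mm_cmpgt_ps(v, thresh4), maxval4);
}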
void BM3D_Basic_Process::CollaborativeFilter(int plane, FLType *ResNum, FLType *ResDen, const FLType *src, const FLType *ref, const PosPairCode &code) const { PCType GroupSize = static_cast<PCType>(code.size()); // When para.GroupSize > 0, limit GroupSize up to para.GroupSize if (d.para.GroupSize > 0 && GroupSize > d.para.GroupSize) { GroupSize = d.para.GroupSize; } // Construct source group guided by matched pos code block_group srcGroup(src, src_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize); // Initialize retained coefficient count for hard-threshold filtering int retainedCoefs = 0; // Apply forward 3D transform to the source group d.f[plane].fp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data()); // Apply hard-thresholding to the source group auto srcp = srcGroup.data(); auto thrp = d.f[plane].thrTable[GroupSize - 1].get(); const auto upper = srcp + srcGroup.size(); #if defined(__SSE2__) static const ptrdiff_t simd_step = 4; const ptrdiff_t simd_residue = srcGroup.size() % simd_step; const ptrdiff_t simd_width = srcGroup.size() - simd_residue; static const __m128 zero_ps = _mm_setzero_ps(); __m128i cmp_sum = _mm_setzero_si128(); for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step) { const __m128 s1 = _mm_load_ps(srcp); const __m128 t1p = _mm_load_ps(thrp); const __m128 t1n = _mm_sub_ps(zero_ps, t1p); const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p); const __m128 cmp2 = _mm_cmplt_ps(s1, t1n); const __m128 cmp = _mm_or_ps(cmp1, cmp2); const __m128 d1 = _mm_and_ps(cmp, s1); _mm_store_ps(srcp, d1); cmp_sum = _mm_sub_epi32(cmp_sum, _mm_castps_si128(cmp)); } alignas(16) int32_t cmp_sum_i32[4]; _mm_store_si128(reinterpret_cast<__m128i *>(cmp_sum_i32), cmp_sum); retainedCoefs += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3]; #endif for (; srcp < upper; ++srcp, ++thrp) { if (*srcp > *thrp || *srcp < -*thrp) { ++retainedCoefs; } else { *srcp = 0; } } // Apply backward 3D transform to the filtered group d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data()); // Calculate weight for the filtered group // Also include the normalization factor to compensate for the amplification introduced in 3D transform FLType denWeight = retainedCoefs < 1 ? 1 : FLType(1) / static_cast<FLType>(retainedCoefs); FLType numWeight = static_cast<FLType>(denWeight / d.f[plane].finalAMP[GroupSize - 1]); // Store the weighted filtered group to the numerator part of the basic estimation // Store the weight to the denominator part of the basic estimation srcGroup.AddTo(ResNum, dst_stride[plane], numWeight); srcGroup.CountTo(ResDen, dst_stride[plane], denWeight); }
_declspec(dllexport) DiffResult __stdcall diff_img(Image left, Image right, DiffOptions options) { if (options.ignoreColor) { makeGreyscale(left); makeGreyscale(right); } float* imgMem = (float*)_aligned_malloc(left.width * left.height * sizeof(float) * 4, 16); int colorOffset = left.width * left.height; Image diff = { left.width, left.height, left.stride, imgMem, imgMem + colorOffset, imgMem + colorOffset * 2, imgMem + colorOffset * 3 }; float* drp = diff.r; float* dgp = diff.g; float* dbp = diff.b; float* dap = diff.a; float* lrp = left.r; float* lgp = left.g; float* lbp = left.b; float* lap = left.a; float* rrp = right.r; float* rgp = right.g; float* rbp = right.b; float* rap = right.a; Color error = ConvertToFloat(options.errorColor); auto er = _mm_set_ps1(error.r); auto eg = _mm_set_ps1(error.g); auto eb = _mm_set_ps1(error.b); auto ea = _mm_set_ps1(error.a); auto tolerance = _mm_set_ps1(options.tolerance); auto overlayTransparency = _mm_set_ps1(options.overlayTransparency); OverlayType overlayType = options.overlayType; byte weightByDiffPercentage = options.weightByDiffPercentage; auto diffPixelCount = _mm_set_epi32(0, 0, 0, 0); auto onei = _mm_set1_epi32(1); auto one = _mm_set1_ps(1); auto zero = _mm_set1_ps(0); for (int y = 0; y < left.height; y++) { for (int x = 0; x < left.width; x+=4) { auto lr = _mm_load_ps(lrp); auto lg = _mm_load_ps(lgp); auto lb = _mm_load_ps(lbp); auto la = _mm_load_ps(lap); auto rr = _mm_load_ps(rrp); auto rg = _mm_load_ps(rgp); auto rb = _mm_load_ps(rbp); auto ra = _mm_load_ps(rap); auto rdiff = _mm_sub_ps(rr, lr); auto gdiff = _mm_sub_ps(rg, lg); auto bdiff = _mm_sub_ps(rb, lb); auto adiff = _mm_sub_ps(ra, la); auto distance = _mm_mul_ps(rdiff, rdiff); distance = _mm_add_ps(distance, _mm_mul_ps(gdiff, gdiff)); distance = _mm_add_ps(distance, _mm_mul_ps(bdiff, bdiff)); distance = _mm_add_ps(distance, _mm_mul_ps(adiff, adiff)); distance = _mm_sqrt_ps(distance); auto t = overlayTransparency; if (weightByDiffPercentage) { t = _mm_mul_ps(t, distance); } auto isdiff = _mm_cmpgt_ps(distance, tolerance); t = _mm_min_ps(one, _mm_max_ps(zero, t)); auto mlr = rr; auto mlg = rg; auto mlb = rb; auto mla = ra; if (overlayType == OverlayType::Movement) { mlr = _mm_mul_ps(mlr, er); mlg = _mm_mul_ps(mlg, eg); mlb = _mm_mul_ps(mlb, eb); mla = _mm_mul_ps(mla, ea); } auto oneMinusT = _mm_sub_ps(one, t); auto mixedR = _mm_add_ps(_mm_mul_ps(mlr, oneMinusT), _mm_mul_ps(er, t)); auto mixedG = _mm_add_ps(_mm_mul_ps(mlg, oneMinusT), _mm_mul_ps(eg, t)); auto mixedB = _mm_add_ps(_mm_mul_ps(mlb, oneMinusT), _mm_mul_ps(eb, t)); auto mixedA = one; if (overlayType != OverlayType::Movement) { mixedA = _mm_add_ps(_mm_mul_ps(mla, oneMinusT), _mm_mul_ps(ea, t)); } // (((b ^ a) & mask)^a) auto dr = _mm_xor_ps(lr, _mm_and_ps(isdiff, _mm_xor_ps(mixedR, lr))); auto dg = _mm_xor_ps(lg, _mm_and_ps(isdiff, _mm_xor_ps(mixedG, lg))); auto db = _mm_xor_ps(lb, _mm_and_ps(isdiff, _mm_xor_ps(mixedB, lb))); auto da = _mm_xor_ps(la, _mm_and_ps(isdiff, _mm_xor_ps(mixedA, la))); diffPixelCount = _mm_xor_si128(diffPixelCount, _mm_and_si128(_mm_castps_si128(isdiff), _mm_xor_si128(_mm_add_epi32(diffPixelCount, onei), diffPixelCount))); _mm_store_ps(drp, dr); _mm_store_ps(dgp, dg); _mm_store_ps(dbp, db); _mm_store_ps(dap, da); drp+=4; dgp+=4; dbp+=4; dap+=4; lrp+=4; lgp+=4; lbp+=4; lap+=4; rrp+=4; rgp+=4; rbp+=4; rap+=4; } } int* pixelCounts = (int*)_aligned_malloc(4 * sizeof(int), 16); _mm_store_si128((__m128i*)pixelCounts, diffPixelCount); int totalCount = pixelCounts[0] + pixelCounts[1] + pixelCounts[2] + 
pixelCounts[3]; _aligned_free(pixelCounts); return{ diff, 1.0f - float(totalCount) / (left.height * left.width - left.height * left.stride) }; }
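diff_img's xor/and/xor dance increments diffPixelCount only in lanes where isdiff is set. Because a true comparison lane is all ones, i.e. integer -1, subtracting the mask is a simpler equivalent (the form CollaborativeFilter above uses for cmp_sum):

#include <emmintrin.h>

/* counts[z] += 1 where mask[z] is true: a true lane is 0xFFFFFFFF == -1,
 * so subtracting the mask increments exactly the matching lanes. */
static inline __m128i count_true_lanes(__m128i counts, __m128 mask)
{
    return _mm_sub_epi32(counts, _mm_castps_si128(mask));
}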
/* ============ R_DecalPointCullStatic ============ */ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, const idDrawVert * verts, const int numVerts ) { assert_16_byte_aligned( cullBits ); assert_16_byte_aligned( verts ); idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts ); const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f }; const __m128i vector_int_mask0 = _mm_set1_epi32( 1 << 0 ); const __m128i vector_int_mask1 = _mm_set1_epi32( 1 << 1 ); const __m128i vector_int_mask2 = _mm_set1_epi32( 1 << 2 ); const __m128i vector_int_mask3 = _mm_set1_epi32( 1 << 3 ); const __m128i vector_int_mask4 = _mm_set1_epi32( 1 << 4 ); const __m128i vector_int_mask5 = _mm_set1_epi32( 1 << 5 ); const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() ); const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() ); const __m128 p2 = _mm_loadu_ps( planes[2].ToFloatPtr() ); const __m128 p3 = _mm_loadu_ps( planes[3].ToFloatPtr() ); const __m128 p4 = _mm_loadu_ps( planes[4].ToFloatPtr() ); const __m128 p5 = _mm_loadu_ps( planes[5].ToFloatPtr() ); const __m128 p0X = _mm_splat_ps( p0, 0 ); const __m128 p0Y = _mm_splat_ps( p0, 1 ); const __m128 p0Z = _mm_splat_ps( p0, 2 ); const __m128 p0W = _mm_splat_ps( p0, 3 ); const __m128 p1X = _mm_splat_ps( p1, 0 ); const __m128 p1Y = _mm_splat_ps( p1, 1 ); const __m128 p1Z = _mm_splat_ps( p1, 2 ); const __m128 p1W = _mm_splat_ps( p1, 3 ); const __m128 p2X = _mm_splat_ps( p2, 0 ); const __m128 p2Y = _mm_splat_ps( p2, 1 ); const __m128 p2Z = _mm_splat_ps( p2, 2 ); const __m128 p2W = _mm_splat_ps( p2, 3 ); const __m128 p3X = _mm_splat_ps( p3, 0 ); const __m128 p3Y = _mm_splat_ps( p3, 1 ); const __m128 p3Z = _mm_splat_ps( p3, 2 ); const __m128 p3W = _mm_splat_ps( p3, 3 ); const __m128 p4X = _mm_splat_ps( p4, 0 ); const __m128 p4Y = _mm_splat_ps( p4, 1 ); const __m128 p4Z = _mm_splat_ps( p4, 2 ); const __m128 p4W = _mm_splat_ps( p4, 3 ); const __m128 p5X = _mm_splat_ps( p5, 0 ); const __m128 p5Y = _mm_splat_ps( p5, 1 ); const __m128 p5Z = _mm_splat_ps( p5, 2 ); const __m128 p5W = _mm_splat_ps( p5, 3 ); for ( int i = 0; i < numVerts; ) { const int nextNumVerts = vertsODS.FetchNextBatch() - 4; for ( ; i <= nextNumVerts; i += 4 ) { const __m128 v0 = _mm_load_ps( vertsODS[i + 0].xyz.ToFloatPtr() ); const __m128 v1 = _mm_load_ps( vertsODS[i + 1].xyz.ToFloatPtr() ); const __m128 v2 = _mm_load_ps( vertsODS[i + 2].xyz.ToFloatPtr() ); const __m128 v3 = _mm_load_ps( vertsODS[i + 3].xyz.ToFloatPtr() ); const __m128 r0 = _mm_unpacklo_ps( v0, v2 ); // v0.x, v2.x, v0.z, v2.z const __m128 r1 = _mm_unpackhi_ps( v0, v2 ); // v0.y, v2.y, v0.w, v2.w const __m128 r2 = _mm_unpacklo_ps( v1, v3 ); // v1.x, v3.x, v1.z, v3.z const __m128 r3 = _mm_unpackhi_ps( v1, v3 ); // v1.y, v3.y, v1.w, v3.w const __m128 vX = _mm_unpacklo_ps( r0, r2 ); // v0.x, v1.x, v2.x, v3.x const __m128 vY = _mm_unpackhi_ps( r0, r2 ); // v0.y, v1.y, v2.y, v3.y const __m128 vZ = _mm_unpacklo_ps( r1, r3 ); // v0.z, v1.z, v2.z, v3.z const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) ); const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) ); const __m128 d2 = _mm_madd_ps( vX, p2X, _mm_madd_ps( vY, p2Y, _mm_madd_ps( vZ, p2Z, p2W ) ) ); const __m128 d3 = _mm_madd_ps( vX, p3X, _mm_madd_ps( vY, p3Y, _mm_madd_ps( vZ, p3Z, p3W ) ) ); const __m128 d4 = _mm_madd_ps( vX, p4X, _mm_madd_ps( vY, p4Y, _mm_madd_ps( vZ, p4Z, p4W ) ) ); const __m128 d5 = _mm_madd_ps( vX, p5X, _mm_madd_ps( vY, p5Y, _mm_madd_ps( vZ, p5Z, 
p5W ) ) ); __m128i c0 = __m128c( _mm_cmpgt_ps( d0, vector_float_zero ) ); __m128i c1 = __m128c( _mm_cmpgt_ps( d1, vector_float_zero ) ); __m128i c2 = __m128c( _mm_cmpgt_ps( d2, vector_float_zero ) ); __m128i c3 = __m128c( _mm_cmpgt_ps( d3, vector_float_zero ) ); __m128i c4 = __m128c( _mm_cmpgt_ps( d4, vector_float_zero ) ); __m128i c5 = __m128c( _mm_cmpgt_ps( d5, vector_float_zero ) ); c0 = _mm_and_si128( c0, vector_int_mask0 ); c1 = _mm_and_si128( c1, vector_int_mask1 ); c2 = _mm_and_si128( c2, vector_int_mask2 ); c3 = _mm_and_si128( c3, vector_int_mask3 ); c4 = _mm_and_si128( c4, vector_int_mask4 ); c5 = _mm_and_si128( c5, vector_int_mask5 ); c0 = _mm_or_si128( c0, c1 ); c2 = _mm_or_si128( c2, c3 ); c4 = _mm_or_si128( c4, c5 ); c0 = _mm_or_si128( c0, c2 ); c0 = _mm_or_si128( c0, c4 ); __m128i s0 = _mm_packs_epi32( c0, c0 ); __m128i b0 = _mm_packus_epi16( s0, s0 ); *(unsigned int *)&cullBits[i] = _mm_cvtsi128_si32( b0 ); } } }
void LightDesc_t::ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal, FourVectors &color, bool DoHalfLambert ) const { FourVectors delta; Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL)); switch (m_Type) { case MATERIAL_LIGHT_POINT: case MATERIAL_LIGHT_SPOT: delta.DuplicateVector(m_Position); delta-=pos; break; case MATERIAL_LIGHT_DIRECTIONAL: delta.DuplicateVector(m_Direction); delta*=-1.0; break; default: delta.x = Four_Zeros; delta.y = Four_Zeros; delta.z = Four_Zeros; break; } __m128 dist2 = delta*delta; __m128 falloff; if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 ) { falloff = MMReplicate(m_Attenuation0); } else falloff= Four_Epsilons; if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 ) { falloff=_mm_add_ps(falloff,_mm_mul_ps(MMReplicate(m_Attenuation1),_mm_sqrt_ps(dist2))); } if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 ) { falloff=_mm_add_ps(falloff,_mm_mul_ps(MMReplicate(m_Attenuation2),dist2)); } falloff=_mm_rcp_ps(falloff); // Cull out light beyond this radius // now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format if (m_Range != 0.f) { __m128 RangeSquared=MMReplicate(m_RangeSquared); // !!speed!! falloff=_mm_and_ps(falloff,_mm_cmplt_ps(dist2,RangeSquared)); } delta.VectorNormalizeFast(); __m128 strength=delta*normal; if (DoHalfLambert) { strength=_mm_add_ps(_mm_mul_ps(strength,Four_PointFives),Four_PointFives); } else strength=_mm_max_ps(Four_Zeros,delta*normal); switch(m_Type) { case MATERIAL_LIGHT_POINT: // half-lambert break; case MATERIAL_LIGHT_SPOT: { __m128 dot2=_mm_sub_ps(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff __m128 cone_falloff_scale=_mm_mul_ps(MMReplicate(OneOver_ThetaDot_Minus_PhiDot), _mm_sub_ps(dot2,MMReplicate(m_PhiDot))); cone_falloff_scale=_mm_min_ps(cone_falloff_scale,Four_Ones); if ((m_Falloff!=0.0) && (m_Falloff!=1.0)) { // !!speed!! could compute integer exponent needed by powsse and store in light cone_falloff_scale=PowSSE(cone_falloff_scale,m_Falloff); } strength=_mm_mul_ps(cone_falloff_scale,strength); // now, zero out lighting where dot2<phidot. This will mask out any invalid results // from pow function, etc __m128 OutsideMask=_mm_cmpgt_ps(dot2,MMReplicate(m_PhiDot)); // outside light cone? strength=_mm_and_ps(OutsideMask,strength); } break; case MATERIAL_LIGHT_DIRECTIONAL: break; default: break; } strength=_mm_mul_ps(strength,falloff); color.x=_mm_add_ps(color.x,_mm_mul_ps(strength,MMReplicate(m_Color.x))); color.y=_mm_add_ps(color.y,_mm_mul_ps(strength,MMReplicate(m_Color.y))); color.z=_mm_add_ps(color.z,_mm_mul_ps(strength,MMReplicate(m_Color.z))); }
static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]) { int i; const __m128 vec_hNlFb = _mm_set1_ps(hNlFb); const __m128 vec_one = _mm_set1_ps(1.0f); const __m128 vec_minus_one = _mm_set1_ps(-1.0f); const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm); // vectorized code (four at once) for (i = 0; i + 3 < PART_LEN1; i+=4) { // Weight subbands __m128 vec_hNl = _mm_loadu_ps(&hNl[i]); const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]); const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb); const __m128 vec_weightCurve_hNlFb = _mm_mul_ps( vec_weightCurve, vec_hNlFb); const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve); const __m128 vec_one_weightCurve_hNl = _mm_mul_ps( vec_one_weightCurve, vec_hNl); const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl); const __m128 vec_if1 = _mm_and_ps( bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl)); vec_hNl = _mm_or_ps(vec_if0, vec_if1); { const __m128 vec_overDriveCurve = _mm_loadu_ps( &WebRtcAec_overDriveCurve[i]); const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps( vec_overDriveSm, vec_overDriveCurve); vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve); _mm_storeu_ps(&hNl[i], vec_hNl); } // Suppress error signal { __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]); __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]); vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl); vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl); // Ooura fft returns incorrect sign on imaginary component. It matters // here because we are making an additive change with comfort noise. vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one); _mm_storeu_ps(&efw[0][i], vec_efw_re); _mm_storeu_ps(&efw[1][i], vec_efw_im); } } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { // Weight subbands if (hNl[i] > hNlFb) { hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + (1 - WebRtcAec_weightCurve[i]) * hNl[i]; } hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]); // Suppress error signal efw[0][i] *= hNl[i]; efw[1][i] *= hNl[i]; // Ooura fft returns incorrect sign on imaginary component. It matters // here because we are making an additive change with comfort noise. efw[1][i] *= -1; } }
// comparison operators RETf CMPGT(const __m128 x, const __m128 y) { return _mm_cmpgt_ps(x, y); }
static int forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc) { register __m128 mpv, dpv, ipv; /* previous row values */ register __m128 sv; /* temp storage of 1 curr row value in progress */ register __m128 dcv; /* delayed storage of D(i,q+1) */ register __m128 xEv; /* E state: keeps max for Mk->E as we go */ register __m128 xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ __m128 zerov; /* splatted 0.0's in a vector */ float xN, xE, xB, xC, xJ; /* special states' scores */ int i; /* counter over sequence positions 1..L */ int q; /* counter over quads 0..nq-1 */ int j; /* counter over DD iterations (4 is full serialization) */ int Q = p7O_NQF(om->M); /* segment length: # of vectors */ __m128 *dpc = ox->dpf[0]; /* current row, for use in {MDI}MO(dpp,q) access macro */ __m128 *dpp; /* previous row, for use in {MDI}MO(dpp,q) access macro */ __m128 *rp; /* will point at om->rfv[x] for residue x[i] */ __m128 *tp; /* will point into (and step thru) om->tfv */ /* Initialization. */ ox->M = om->M; ox->L = L; ox->has_own_scales = TRUE; /* all forward matrices control their own scalefactors */ zerov = _mm_setzero_ps(); for (q = 0; q < Q; q++) MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov; xE = ox->xmx[p7X_E] = 0.; xN = ox->xmx[p7X_N] = 1.; xJ = ox->xmx[p7X_J] = 0.; xB = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE]; xC = ox->xmx[p7X_C] = 0.; ox->xmx[p7X_SCALE] = 1.0; ox->totscale = 0.0; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=0, width=8, precision=5*/ #endif for (i = 1; i <= L; i++) { dpp = dpc; dpc = ox->dpf[do_full * i]; /* avoid conditional, use do_full as kronecker delta */ rp = om->rfv[dsq[i]]; tp = om->tfv; dcv = _mm_setzero_ps(); xEv = _mm_setzero_ps(); xBv = _mm_set1_ps(xB); /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. Shift zeros on. */ mpv = esl_sse_rightshift_ps(MMO(dpp,Q-1), zerov); dpv = esl_sse_rightshift_ps(DMO(dpp,Q-1), zerov); ipv = esl_sse_rightshift_ps(IMO(dpp,Q-1), zerov); for (q = 0; q < Q; q++) { /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */ sv = _mm_mul_ps(xBv, *tp); tp++; sv = _mm_add_ps(sv, _mm_mul_ps(mpv, *tp)); tp++; sv = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++; sv = _mm_add_ps(sv, _mm_mul_ps(dpv, *tp)); tp++; sv = _mm_mul_ps(sv, *rp); rp++; xEv = _mm_add_ps(xEv, sv); /* Load {MDI}(i-1,q) into mpv, dpv, ipv; * {MDI}MX(q) is then the current, not the prev row */ mpv = MMO(dpp,q); dpv = DMO(dpp,q); ipv = IMO(dpp,q); /* Do the delayed stores of {MD}(i,q) now that memory is usable */ MMO(dpc,q) = sv; DMO(dpc,q) = dcv; /* Calculate the next D(i,q+1) partially: M->D only; * delay storage, holding it in dcv */ dcv = _mm_mul_ps(sv, *tp); tp++; /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */ sv = _mm_mul_ps(mpv, *tp); tp++; IMO(dpc,q) = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++; } /* Now the DD paths. We would rather not serialize them but * in an accurate Forward calculation, we have few options. 
*/ /* dcv has carried through from end of q loop above; store it * in first pass, we add M->D and D->D path into DMX */ /* We're almost certainly obligated to do at least one complete * DD path to be sure: */ dcv = esl_sse_rightshift_ps(dcv, zerov); DMO(dpc,0) = zerov; tp = om->tfv + 7*Q; /* set tp to start of the DD's */ for (q = 0; q < Q; q++) { DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q)); dcv = _mm_mul_ps(DMO(dpc,q), *tp); tp++; /* extend DMO(q), so we include M->D and D->D paths */ } /* now. on small models, it seems best (empirically) to just go * ahead and serialize. on large models, we can do a bit better, * by testing for when dcv (DD path) accrued to DMO(q) is below * machine epsilon for all q, in which case we know DMO(q) are all * at their final values. The tradeoff point is (empirically) somewhere around M=100, * at least on my desktop. We don't worry about the conditional here; * it's outside any inner loops. */ if (om->M < 100) { /* Fully serialized version */ for (j = 1; j < 4; j++) { dcv = esl_sse_rightshift_ps(dcv, zerov); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ for (q = 0; q < Q; q++) { /* note, extend dcv, not DMO(q); only adding DD paths now */ DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q)); dcv = _mm_mul_ps(dcv, *tp); tp++; } } } else { /* Slightly parallelized version, but which incurs some overhead */ for (j = 1; j < 4; j++) { register __m128 cv; /* keeps track of whether any DD's change DMO(q) */ dcv = esl_sse_rightshift_ps(dcv, zerov); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ cv = zerov; for (q = 0; q < Q; q++) { /* using cmpgt below tests if DD changed any DMO(q) *without* conditional branch */ sv = _mm_add_ps(dcv, DMO(dpc,q)); cv = _mm_or_ps(cv, _mm_cmpgt_ps(sv, DMO(dpc,q))); DMO(dpc,q) = sv; /* store new DMO(q) */ dcv = _mm_mul_ps(dcv, *tp); tp++; /* note, extend dcv, not DMO(q) */ } if (! _mm_movemask_ps(cv)) break; /* DD's didn't change any DMO(q)? Then done, break out. */ } } /* Add D's to xEv */ for (q = 0; q < Q; q++) xEv = _mm_add_ps(DMO(dpc,q), xEv); /* Finally the "special" states, which start from Mk->E (->C, ->J->B) */ /* The following incantation is a horizontal sum of xEv's elements */ /* These must follow DD calculations, because D's contribute to E in Forward * (as opposed to Viterbi) */ xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(0, 3, 2, 1))); xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(1, 0, 3, 2))); _mm_store_ss(&xE, xEv); xN = xN * om->xf[p7O_N][p7O_LOOP]; xC = (xC * om->xf[p7O_C][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_MOVE]); xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_LOOP]); xB = (xJ * om->xf[p7O_J][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_MOVE]); /* and now xB will carry over into next i, and xC carries over after i=L */ /* Sparse rescaling. xE above threshold? trigger a rescaling event. */ if (xE > 1.0e4) /* that's a little less than e^10, ~10% of our dynamic range */ { xN = xN / xE; xC = xC / xE; xJ = xJ / xE; xB = xB / xE; xEv = _mm_set1_ps(1.0 / xE); for (q = 0; q < Q; q++) { MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv); DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv); IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv); } ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE; ox->totscale += log(xE); xE = 1.0; } else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0; /* Storage of the specials. We could've stored these already * but using xE, etc. variables makes it easy to convert this * code to O(M) memory versions just by deleting storage steps.
*/ ox->xmx[i*p7X_NXCELLS+p7X_E] = xE; ox->xmx[i*p7X_NXCELLS+p7X_N] = xN; ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ; ox->xmx[i*p7X_NXCELLS+p7X_B] = xB; ox->xmx[i*p7X_NXCELLS+p7X_C] = xC; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=i, width=8, precision=5*/ #endif } /* end loop over sequence residues 1..L */ /* finally C->T, and flip total score back to log space (nats) */ /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */ /* On an underflow (which shouldn't happen), we counterintuitively return infinity: * the effect of this is to force the caller to rescore us with full range. */ if (isnan(xC)) ESL_EXCEPTION(eslERANGE, "forward score is NaN"); else if (L>0 && xC == 0.0) ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)"); /* if L==0, xC *should* be 0.0; J5/118 */ else if (isinf(xC) == 1) ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)"); if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]); return eslOK; }
void BrushToolEdit::drawInner(const QPoint &pt, float strength) { float fixedStrength = params.strength; strength *= fixedStrength; auto color = params.color; std::array<int, 3> colorParts = Terrain::expandColor(color); __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0); SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST); (void) roundingModeScope; switch (tool->type()) { case BrushType::Blur: drawBlur(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Smoothen: drawSmoothen(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Raise: case BrushType::Lower: if (tool->type() == BrushType::Lower) { fixedStrength = -fixedStrength; strength = -strength; } switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength *= 3.f; drawRaiseLower(pt, [=](float &current, float before, float tip) { (void) before; current -= tip * strength; }); break; case BrushPressureMode::Constant: if (tool->type() == BrushType::Lower) { drawRaiseLower(pt, [=](float &current, float before, float tip) { current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength)); }); } else { drawRaiseLower(pt, [=](float &current, float before, float tip) { current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength)); }); } break; case BrushPressureMode::Adjustable: drawRaiseLower(pt, [=](float &current, float before, float tip) { current = Terrain::quantizeOne(before - tip * strength); }); break; } break; case BrushType::Paint: switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength = 1.f - std::exp2(-strength); drawColor(pt, [=](quint32 &current, quint32 before, float tip) { (void) before; // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); auto factor = _mm_set1_ps(tip * strength); // blend auto diff = _mm_sub_ps(colorMM, currentMF); diff = _mm_mul_ps(diff, factor); currentMF = _mm_add_ps(currentMF, diff); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM)); }); break; case BrushPressureMode::Constant: fixedStrength *= 0.01f; drawColor(pt, [=](quint32 &current, quint32 before, float tip) { // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128()); // use the "before" image to determine which way the color can change, and // compute the possible range of the result color auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * fixedStrength); auto adddiff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, adddiff); auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps()); // compute output image auto out1 = _mm_max_ps(currentMF, beforeMF); auto
out2 = _mm_min_ps(currentMF, beforeMF); currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2)); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM)); }); break; case BrushPressureMode::Adjustable: strength *= 0.01f; drawColor(pt, [=](quint32 &current, quint32 before, float tip) { // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // blend auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * strength); diff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, diff); // convert to RGB32 beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128()); beforeMM = _mm_cvttps_epi32(beforeMF); beforeMM = _mm_packs_epi32(beforeMM, beforeMM); beforeMM = _mm_packus_epi16(beforeMM, beforeMM); _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(beforeMM)); }); break; } break; } }
inline vec4 operator>(vec4 a, vec4 b) { return _mm_cmpgt_ps(a, b); }