SIMDValue SIMDFloat32x4Operation::OpMaxNum(const SIMDValue& aValue, const SIMDValue& bValue)
{
    // Lane-wise maximum of a and b with two fix-ups applied on top of the raw
    // _mm_max_ps result:
    //   * lanes where a is exactly +0.0 get their sign bit cleared, and
    //   * lanes where b is NaN take the value of a.
    X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);
    X86SIMDValue bIsNaN, aPosZeroSign, fromA, fromMax, x86Result;

    // All-ones in every lane where b is NaN (b compares unordered with itself).
    bIsNaN.m128_value = _mm_cmpunord_ps(rhs.m128_value, rhs.m128_value);

    // Integer equality against all-zero bits matches only +0.0 in a (-0.0 has
    // its sign bit set and therefore does not match).
    aPosZeroSign.m128i_value = _mm_cmpeq_epi32(lhs.m128i_value, X86_ALL_ZEROS.m128i_value);
    // Reduce that mask with X86_TWO_31_I4; per the original comment this leaves
    // the -0.0 bit pattern (sign bit only) in the lanes where a is +0.0.
    aPosZeroSign.m128_value = _mm_and_ps(aPosZeroSign.m128_value, X86_TWO_31_I4.m128_value);

    // Raw maximum: already correct, or equal to b when either operand is NaN or
    // both operands are +/-0.0.
    x86Result.m128_value = _mm_max_ps(lhs.m128_value, rhs.m128_value);

    // Where a is +0.0 the raw result is either non-negative already or it is b,
    // which may be -0.0. Clearing the sign bit there turns -0.0 into +0.0.
    x86Result.m128_value = _mm_andnot_ps(aPosZeroSign.m128_value, x86Result.m128_value);

    // Blend: take a where b is NaN, otherwise keep the adjusted maximum.
    fromA.m128_value = _mm_and_ps(lhs.m128_value, bIsNaN.m128_value);
    fromMax.m128_value = _mm_andnot_ps(bIsNaN.m128_value, x86Result.m128_value);
    x86Result.m128_value = _mm_or_ps(fromA.m128_value, fromMax.m128_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c) { #ifdef USE_SIMD __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse); __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit); __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit); __m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm))); __m128 deltaVel1Dotn = _mm_add_ps(_vmathVfDot3(c.m_contactNormal.mVec128,body1.m_deltaLinearVelocity.mVec128), _vmathVfDot3(c.m_relpos1CrossNormal.mVec128,body1.m_deltaAngularVelocity.mVec128)); __m128 deltaVel2Dotn = _mm_sub_ps(_vmathVfDot3(c.m_relpos2CrossNormal.mVec128,body2.m_deltaAngularVelocity.mVec128),_vmathVfDot3((c.m_contactNormal).mVec128,body2.m_deltaLinearVelocity.mVec128)); deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv))); deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv))); btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse); btSimdScalar resultLowerLess,resultUpperLess; resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1); resultUpperLess = _mm_cmplt_ps(sum,upperLimit1); __m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp); deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) ); c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) ); __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.m_invMass.mVec128); __m128 linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.m_invMass.mVec128); __m128 impulseMagnitude = deltaImpulse; body1.m_deltaLinearVelocity.mVec128 = _mm_add_ps(body1.m_deltaLinearVelocity.mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude)); body1.m_deltaAngularVelocity.mVec128 = _mm_add_ps(body1.m_deltaAngularVelocity.mVec128 
,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude)); body2.m_deltaLinearVelocity.mVec128 = _mm_sub_ps(body2.m_deltaLinearVelocity.mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude)); body2.m_deltaAngularVelocity.mVec128 = _mm_add_ps(body2.m_deltaAngularVelocity.mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude)); #else resolveSingleConstraintRowLowerLimit(body1,body2,c); #endif }
void btSequentialImpulseConstraintSolver::resolveSplitPenetrationSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c) { #ifdef USE_SIMD if (!c.m_rhsPenetration) return; gNumSplitImpulseRecoveries++; __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedPushImpulse); __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit); __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit); __m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhsPenetration), _mm_mul_ps(_mm_set1_ps(c.m_appliedPushImpulse),_mm_set1_ps(c.m_cfm))); __m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal.mVec128,body1.internalGetPushVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetTurnVelocity().mVec128)); __m128 deltaVel2Dotn = _mm_sub_ps(btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetTurnVelocity().mVec128),btSimdDot3((c.m_contactNormal).mVec128,body2.internalGetPushVelocity().mVec128)); deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv))); deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv))); btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse); btSimdScalar resultLowerLess,resultUpperLess; resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1); resultUpperLess = _mm_cmplt_ps(sum,upperLimit1); __m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp); deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) ); c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) ); __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.internalGetInvMass().mVec128); __m128 linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.internalGetInvMass().mVec128); __m128 impulseMagnitude = deltaImpulse; body1.internalGetPushVelocity().mVec128 = _mm_add_ps(body1.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude)); 
body1.internalGetTurnVelocity().mVec128 = _mm_add_ps(body1.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude)); body2.internalGetPushVelocity().mVec128 = _mm_sub_ps(body2.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude)); body2.internalGetTurnVelocity().mVec128 = _mm_add_ps(body2.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude)); #else resolveSplitPenetrationImpulseCacheFriendly(body1,body2,c); #endif }
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
{
  /* In place, for every bin i in [0, PART_LEN1):
   *   1. divide (ef[0][i], ef[1][i]) by xPow[i] + 1e-10,
   *   2. if the magnitude of the pair exceeds errThresh, rescale the pair by
   *      errThresh / (magnitude + 1e-10),
   *   3. multiply by the step size mu.
   * Bins are processed four at a time; leftovers go through a scalar tail. */
  const __m128 tiny = _mm_set1_ps(1e-10f);
  const __m128 limit = _mm_set1_ps(aec->errThresh);
  const __m128 stepSize = _mm_set1_ps(aec->mu);
  int i;

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 power = _mm_add_ps(_mm_loadu_ps(&aec->xPow[i]), tiny);
    __m128 re = _mm_div_ps(_mm_loadu_ps(&ef[0][i]), power);
    __m128 im = _mm_div_ps(_mm_loadu_ps(&ef[1][i]), power);

    const __m128 magnitude =
        _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)));
    const __m128 overLimit = _mm_cmpgt_ps(magnitude, limit);
    const __m128 shrink = _mm_div_ps(limit, _mm_add_ps(magnitude, tiny));

    /* Lane select: rescaled value where over the limit, original elsewhere. */
    const __m128 reLimited = _mm_and_ps(overLimit, _mm_mul_ps(re, shrink));
    const __m128 imLimited = _mm_and_ps(overLimit, _mm_mul_ps(im, shrink));
    re = _mm_or_ps(_mm_andnot_ps(overLimit, re), reLimited);
    im = _mm_or_ps(_mm_andnot_ps(overLimit, im), imLimited);

    _mm_storeu_ps(&ef[0][i], _mm_mul_ps(re, stepSize));
    _mm_storeu_ps(&ef[1][i], _mm_mul_ps(im, stepSize));
  }

  /* Scalar tail for the bins the vector loop did not cover. */
  while (i < (PART_LEN1)) {
    float magnitude;

    ef[0][i] /= (aec->xPow[i] + 1e-10f);
    ef[1][i] /= (aec->xPow[i] + 1e-10f);
    magnitude = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (magnitude > aec->errThresh) {
      const float shrink = aec->errThresh / (magnitude + 1e-10f);
      ef[0][i] *= shrink;
      ef[1][i] *= shrink;
    }

    /* Step size factor. */
    ef[0][i] *= aec->mu;
    ef[1][i] *= aec->mu;
    i++;
  }
}
F32 Aabb::testPlane(const Plane& p) const
{
	// Classifies this box against plane p using the two box corners picked
	// per axis from the sign of the plane normal.
	// Returns p.test() of the "min" corner when that is > 0, 0.0 when the two
	// corners lie on different sides (or touch the plane), and p.test() of the
	// "max" corner when that is < 0.
	const Aabb& aabb = *this;

#if ANKI_SIMD == ANKI_SIMD_SSE
	// All-ones in each lane where the normal component is >= 0.
	__m128 gezero = _mm_cmpge_ps(p.getNormal().getSimd(), _mm_setzero_ps());

	Vec4 diagMin;
	diagMin.getSimd() = _mm_or_ps(
		_mm_and_ps(gezero, aabb.getMin().getSimd()),
		_mm_andnot_ps(gezero, aabb.getMax().getSimd()));
#else
	Vec4 diagMin(0.0), diagMax(0.0);
	// Pick the corner components axis by axis (x, y, z).
	for(U i = 0; i < 3; i++)
	{
		const bool nonNegative = p.getNormal()[i] >= 0.0;
		diagMin[i] = nonNegative ? aabb.getMin()[i] : aabb.getMax()[i];
		diagMax[i] = nonNegative ? aabb.getMax()[i] : aabb.getMin()[i];
	}
#endif

	// "Min" corner strictly on the positive side: whole box is on that side.
	ANKI_ASSERT(diagMin.w() == 0.0);
	F32 test = p.test(diagMin);
	if(test > 0.0)
	{
		return test;
	}

#if ANKI_SIMD == ANKI_SIMD_SSE
	// Only build the opposite corner once the early-out above has failed.
	Vec4 diagMax;
	diagMax.getSimd() = _mm_or_ps(
		_mm_and_ps(gezero, aabb.getMax().getSimd()),
		_mm_andnot_ps(gezero, aabb.getMin().getSimd()));
#endif

	ANKI_ASSERT(diagMax.w() == 0.0);
	test = p.test(diagMax);
	if(test < 0.0)
	{
		// "Max" corner on the negative side: whole box is on the negative side.
		return test;
	}

	// "Min" corner not positive, "max" corner not negative: intersection.
	return 0.0;
}
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  /* Copies the input image to the output while replacing flagged pixels:
   *  - a pixel with ANY of its first three channels >= upper threshold is
   *    replaced by the scheme's first colour,
   *  - a pixel with ALL of its first three channels <= lower threshold is
   *    replaced by the scheme's second colour (this check wins if both hold).
   * Thresholds come from dev->overexposed in percent. The fourth lane of both
   * threshold vectors is FLT_MAX.
   * The inner loop steps 4 floats per pixel and uses aligned load/stream-store. */
  dt_develop_t *dev = self->dev;
  const int ch = piece->colors;

  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int row=0; row<roi_out->height; row++)
  {
    const float *src = ((float *)ivoid) + (size_t)ch*row*roi_out->width;
    float *dst = ((float *)ovoid) + (size_t)ch*row*roi_out->width;

    for (int col=0; col<roi_out->width; col++)
    {
      const __m128 pixel = _mm_load_ps(src);

      /* Two unpack+OR rounds OR all four lane masks together and broadcast
       * the result to every lane. */
      __m128 over = _mm_cmpge_ps(pixel, upper);
      over = _mm_or_ps(_mm_unpacklo_ps(over, over), _mm_unpackhi_ps(over, over));
      over = _mm_or_ps(_mm_unpacklo_ps(over, over), _mm_unpackhi_ps(over, over));

      /* Same reduction with AND: set only when every lane passed. */
      __m128 under = _mm_cmple_ps(pixel, lower);
      under = _mm_and_ps(_mm_unpacklo_ps(under, under), _mm_unpackhi_ps(under, under));
      under = _mm_and_ps(_mm_unpacklo_ps(under, under), _mm_unpackhi_ps(under, under));

      /* over ? upper_color : pixel, then under ? lower_color : that. */
      const __m128 marked_over = _mm_or_ps(_mm_andnot_ps(over, pixel), _mm_and_ps(over, upper_color));
      const __m128 marked = _mm_or_ps(_mm_andnot_ps(under, marked_over), _mm_and_ps(under, lower_color));

      _mm_stream_ps(dst, marked);

      src += 4;
      dst += 4;
    }
  }
  /* Fence after the streaming stores issued above. */
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
static inline __m128 curve_vec4(
    const __m128 x,
    const __m128 g,
    const __m128 sigma,
    const __m128 shadows,
    const __m128 highlights,
    const __m128 clarity)
{
  // Evaluates the remapping curve for four values at once. With c = x - g:
  //  - |c| > 2*sigma : linear part  g + s + m*(c - s)
  //  - otherwise     : quadratic blend (t = clamp(c / (2*s), 0, 1)) towards it
  // where s = sigma carrying the sign of c, and m is `highlights` for c < 0 and
  // `shadows` otherwise. A term clarity * c * gauss(c) is added on top, gauss
  // being a fast approximation of exp(-c^2 / (2/3 * sigma^2)).
  //
  // TODO: pull these non-data-dependent constants out of the loop to see
  // whether the compiler fails to do so.
  //
  // const0/const1 deliberately hold the *numeric values* of the integers
  // 0x3f800000 and 0x402DF854 as floats: the fast exp builds an integer bit
  // pattern with float arithmetic and reinterprets it afterwards.
  const __m128 const0 = _mm_set_ps1(0x3f800000u);
  const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x
  const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 two = _mm_set1_ps(2.0f);
  const __m128 twothirds = _mm_set1_ps(2.0f/3.0f);
  const __m128 twosig = _mm_mul_ps(two, sigma);
  const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
  const __m128 s22 = _mm_mul_ps(twothirds, sigma2);

  const __m128 c = _mm_sub_ps(x, g);
  const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
  // multiplier for the linear part: highlights where c < 0, shadows elsewhere
  const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
  // flip sign bit of sigma based on c < 0 (c < 0 ? -sigma : sigma)
  const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
  // this contains the linear parts valid for c > 2*sigma or c < -2*sigma
  const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));

  const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(), _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
  const __m128 t2 = _mm_mul_ps(t, t);
  const __m128 mt = _mm_sub_ps(one, t);

  // midtone value fading over to linear part, without local contrast:
  const __m128 vmid = _mm_add_ps(g,
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
                 _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));

  // |c| > 2*sigma?
  const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
  const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));

  // midtone local contrast
  // dt_fast_expf in sse: arg = -(c*c / s22)
  const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
  const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
  const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
  const __m128i ki = _mm_cvtps_epi32(k);
  // Reinterpret the integer lanes as floats in-register instead of reloading
  // them through a pointer cast.
  const __m128 gauss = _mm_castsi128_ps(ki);
  const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
  return _mm_add_ps(val, vcon);
}
// Tests an axis-aligned box against six planes.
//
// frustum : 6 planes, 4 floats each (24 floats), read with aligned loads.
// box_min / box_max : 4 floats each, read with aligned loads. All four lanes
//   take part in the dot product, so the fourth lane is multiplied with the
//   plane's fourth coefficient.
//
// For every plane the corner is assembled lane by lane (box_max where the plane
// coefficient is negative, box_min otherwise) and dotted with the plane.
// Returns 0 as soon as one such dot product is > 0, otherwise 1.
int intr_frustum_box(const float* frustum, const float* box_min, const float* box_max)
{
    const __m128 zero = _mm_setzero_ps();
    const __m128 min = _mm_load_ps(box_min);
    const __m128 max = _mm_load_ps(box_max);

    for (int i = 0; i < 6; i++) {
        const __m128 plane = _mm_load_ps(frustum + 4 * i);
        const __m128 mask = _mm_cmplt_ps(plane, zero);
        const __m128 n = _mm_or_ps(_mm_and_ps(mask, max), _mm_andnot_ps(mask, min));
        const __m128 d = _mm_mul_ps(n, plane);

        // Rotate the products so that lane 0 of d, d0, d1, d2 holds all four
        // terms, then add them up in lane 0.
        const __m128 d0 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(2, 1, 0, 3));
        const __m128 d1 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(1, 0, 3, 2));
        const __m128 d2 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(0, 3, 2, 1));
        const __m128 dot = _mm_add_ss(_mm_add_ss(d0, d), _mm_add_ss(d1, d2));

        // Compare straight to an int. Reading a compare mask back as a float
        // and testing "!= 0" depends on an all-ones (NaN) pattern comparing
        // unequal to zero, which finite-math builds may optimise away.
        if (_mm_comigt_ss(dot, zero))
            return 0;
    }
    return 1;
}
/* Takes the lane-wise square root of both inputs and returns
 * (~sqrt(a)) & sqrt(b), computed bitwise on the float lanes. */
__m128 t2(__m128 a, __m128 b)
{
  const __m128 root_a = _mm_sqrt_ps(a);
  const __m128 root_b = _mm_sqrt_ps(b);
  return _mm_andnot_ps(root_a, root_b);
}
// Adds one of two leaf values to stageSum, lane by lane:
// leaves[0] where sum < threshold * norm, leaves[1] otherwise.
SIMD_INLINE void StageSum32f(const float * leaves, float threshold, const __m128 & sum, const __m128 & norm, __m128 & stageSum)
{
    const __m128 scaledThreshold = _mm_mul_ps(_mm_set1_ps(threshold), norm);
    const __m128 below = _mm_cmplt_ps(sum, scaledThreshold);
    const __m128 chosen = _mm_or_ps(
        _mm_and_ps(below, _mm_set1_ps(leaves[0])),
        _mm_andnot_ps(below, _mm_set1_ps(leaves[1])));
    stageSum = _mm_add_ps(stageSum, chosen);
}
inline void GDALCopyWordSSE(const float fValueIn, Tout &tValueOut) { float fMaxVal, fMinVal; GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal); __m128 xmm = _mm_set_ss(fValueIn); __m128 xmm_min = _mm_set_ss(fMinVal); __m128 xmm_max = _mm_set_ss(fMaxVal); xmm = _mm_min_ss(_mm_max_ss(xmm, xmm_min), xmm_max); #ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE __m128 p0d5 = _mm_set_ss(0.5f); if (std::numeric_limits<Tout>::is_signed) { __m128 mask = _mm_cmpge_ss(xmm, _mm_set_ss(0.f)); __m128 m0d5 = _mm_set_ss(-0.5f); xmm = _mm_add_ss(xmm, _mm_or_ps(_mm_and_ps(mask, p0d5), _mm_andnot_ps(mask, m0d5))); } else { xmm = _mm_add_ss(xmm, p0d5); } #endif #ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE tValueOut = (Tout)_mm_cvttss_si32(xmm); #else tValueOut = (Tout)_mm_cvtss_si32(xmm); #endif }
/* Scans n vectors of four signed 32-bit ints (read with aligned loads from
 * pixels[0..n-1]) and prints, for each of the four lanes, the maximum value
 * seen in that lane and the index of the first vector that holds it.
 * With n <= 0 every lane reports INT_MIN and index 0. */
void vFindMax(__m128i *pixels, int n)
{
  __m128i vIdx, vMax;
  int indices[4];
  int values[4];
  int i;

  vIdx = _mm_setzero_si128();
  vMax = _mm_set1_epi32(INT_MIN);

  for(i = 0; i < n; i++)
  {
    const __m128i v = _mm_load_si128(pixels + i);
    /* all-ones in the lanes where v beats the running maximum */
    const __m128i vCmp = _mm_cmpgt_epi32(v, vMax);

    /* max value: select with the mask just computed, so only SSE2 is needed */
    vMax = _mm_or_si128(_mm_and_si128(vCmp, v), _mm_andnot_si128(vCmp, vMax));

    /* max index: integer-domain select, no float reinterpretation */
    vIdx = _mm_or_si128(_mm_and_si128(vCmp, _mm_set1_epi32(i)),
                        _mm_andnot_si128(vCmp, vIdx));
  }

  /* Plain int arrays carry no 16-byte alignment guarantee, so store unaligned. */
  _mm_storeu_si128((__m128i*)indices, vIdx);
  _mm_storeu_si128((__m128i*)values, vMax);

  printf("SSE:\n");
  for(i=0;i<4;i++)
  {
    /* lane i of vector indices[i] is element 4*indices[i] + i of the flat array */
    printf("%d:max=%d,idx=%d\n",i,values[i],indices[i]);
  }
}
/* the fast arctan function adopted from OpenCV */
/* For each i in [0, len): writes an approximate angle of (x[i], y[i]) scaled by
 * 180 / CCV_PI to angle[i], and sqrt(x^2 + y^2) to mag[i].
 * The approximation is x*y / (max(x^2, y^2) + 0.28 * min(x^2, y^2) + 1e-6),
 * added to or subtracted from a per-case base angle.
 * When HAVE_SSE2 is defined and _WIN32 is not, four elements are handled per
 * iteration with unaligned loads/stores; the scalar loop finishes the rest. */
static void _ccv_atan2(float* x, float* y, float* angle, float* mag, int len)
{
	int i = 0;
	float scale = (float)(180.0 / CCV_PI);
#ifdef HAVE_SSE2
#ifndef _WIN32
	/* 0x7fffffff as a float bit pattern: every bit except the sign bit */
	union { int i; float fl; } iabsmask; iabsmask.i = 0x7fffffff;
	__m128 eps = _mm_set1_ps((float)1e-6), absmask = _mm_set1_ps(iabsmask.fl);
	/* base angles, held in radians despite the names; scale4 is applied at the end */
	__m128 _90 = _mm_set1_ps((float)(3.141592654 * 0.5)), _180 = _mm_set1_ps((float)3.141592654), _360 = _mm_set1_ps((float)(3.141592654 * 2));
	__m128 zero = _mm_setzero_ps(), _0_28 = _mm_set1_ps(0.28f), scale4 = _mm_set1_ps(scale);
	for(; i <= len - 4; i += 4)
	{
		__m128 x4 = _mm_loadu_ps(x + i), y4 = _mm_loadu_ps(y + i);
		__m128 xq4 = _mm_mul_ps(x4, x4), yq4 = _mm_mul_ps(y4, y4);
		/* the comparison is on the squares, i.e. x*x < y*y ("x < y" in the notes below) */
		__m128 xly = _mm_cmplt_ps(xq4, yq4);
		__m128 z4 = _mm_div_ps(_mm_mul_ps(x4, y4), _mm_add_ps(_mm_add_ps(_mm_max_ps(xq4, yq4), _mm_mul_ps(_mm_min_ps(xq4, yq4), _0_28)), eps));
		// a4 <- x < y ? 90 : 0;
		__m128 a4 = _mm_and_ps(xly, _90);
		// a4 <- (y < 0 ? 360 - a4 : a4) == ((x < y ? y < 0 ? 270 : 90) : (y < 0 ? 360 : 0))
		__m128 mask = _mm_cmplt_ps(y4, zero);
		a4 = _mm_or_ps(_mm_and_ps(_mm_sub_ps(_360, a4), mask), _mm_andnot_ps(mask, a4));
		// a4 <- (x < 0 && !(x < y) ? 180 : a4)
		mask = _mm_andnot_ps(xly, _mm_cmplt_ps(x4, zero));
		a4 = _mm_or_ps(_mm_and_ps(_180, mask), _mm_andnot_ps(mask, a4));
		// a4 <- (x < y ? a4 - z4 : a4 + z4)
		/* ~absmask & xly is the sign bit in the lanes where xly holds; the XOR negates z4 there */
		a4 = _mm_mul_ps(_mm_add_ps(_mm_xor_ps(z4, _mm_andnot_ps(absmask, xly)), a4), scale4);
		__m128 m4 = _mm_sqrt_ps(_mm_add_ps(xq4, yq4));
		_mm_storeu_ps(angle + i, a4);
		_mm_storeu_ps(mag + i, m4);
	}
#endif
#endif
	/* scalar path: every element without SSE2, or the leftover tail with it */
	for(; i < len; i++)
	{
		float xf = x[i], yf = y[i];
		float a, x2 = xf * xf, y2 = yf * yf;
		if(y2 <= x2)
			/* base angle: pi for x < 0, else 0 for y >= 0, else 2*pi */
			a = xf * yf / (x2 + 0.28f * y2 + (float)1e-6) + (float)(xf < 0 ? CCV_PI : yf >= 0 ? 0 : CCV_PI * 2);
		else
			/* base angle: pi/2 for y >= 0, else 3*pi/2 */
			a = (float)(yf >= 0 ? CCV_PI * 0.5 : CCV_PI * 1.5) - xf * yf / (y2 + 0.28f * x2 + (float)1e-6);
		angle[i] = a * scale;
		mag[i] = sqrtf(x2 + y2);
	}
}
// Lane-wise select: returns a where cond is set and b elsewhere.
// The SSE 4.1 path uses a blend instruction; the fallback combines the two
// inputs with and / and-not / or.
inline vector4f select(const vector4fb& cond, const vector4f& a, const vector4f& b)
{
#if SSE_INSTR_SET >= 5 // SSE 4.1
    return _mm_blendv_ps(b, a, cond);
#else
    const __m128 from_a = _mm_and_ps(cond, a);
    const __m128 from_b = _mm_andnot_ps(cond, b);
    return _mm_or_ps(from_a, from_b);
#endif
}
static inline __m128 sigmoid_ps( __m128 xin ) { __m128 mask = _mm_cmplt_ps( xin, _mm_setzero_ps() ); __m128 c; xin = _mm_and_ps (xin , abs_mask.ps ); /* Abs. value by clearing signbit */ c = sigmoid_positive_ps(xin); return _mm_or_ps( _mm_and_ps( mask, c ) , _mm_andnot_ps ( mask , _mm_sub_ps( ones.ps, c ))); }
/* Piecewise transfer function applied to four values at once:
 *   x >  0.04045 : sse_pow_24 ((x + 0.055) / 1.055)
 *   otherwise    : x / 12.92
 * Both pieces are evaluated for every lane and merged with a compare mask. */
static inline __v4sf
gamma_2_2_to_linear_sse2 (__v4sf x)
{
  const __v4sf power_part = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
  const __v4sf linear_part = x * splat4f (1/12.92f);
  const __v4sf above_knee = _mm_cmpgt_ps (x, splat4f (0.04045f));

  return _mm_or_ps (_mm_and_ps (above_knee, power_part),
                    _mm_andnot_ps (above_knee, linear_part));
}
/* Piecewise transfer function applied to four values at once:
 *   x >  0.003130804954 : sse_pow_1_24 (x) * 1.055 - 0.055
 *   otherwise           : x * 12.92
 * Both pieces are evaluated for every lane and merged with a compare mask. */
static inline __v4sf
linear_to_gamma_2_2_sse2 (__v4sf x)
{
  const __v4sf power_part = sse_pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f);
  const __v4sf linear_part = x * splat4f (12.92f);
  const __v4sf above_knee = _mm_cmpgt_ps (x, splat4f (0.003130804954f));

  return _mm_or_ps (_mm_and_ps (above_knee, power_part),
                    _mm_andnot_ps (above_knee, linear_part));
}
// Returns |x| by clearing the sign bit of the float.
float Math::Abs(float x)
{
	// -0.0f has only the sign bit set, so and-not with it clears that bit.
	const __m128 signMask = _mm_set1_ps(-0.0f);

	// _mm_set_ss / _mm_cvtss_f32 move the scalar in and out of lane 0 on any
	// compiler. The .m128_f32 member is MSVC-specific, and assigning only one
	// element leaves the other three lanes uninitialised.
	return _mm_cvtss_f32(_mm_andnot_ps(signMask, _mm_set_ss(x)));
}
// SIMD abs __SIMD _SIMD_abs_ps(__SIMD a) { #ifdef USE_SSE return _mm_andnot_ps(_mm_set1_ps(-0.0f), a); #elif defined USE_AVX return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a); #elif defined USE_IBM return vec_abs(a); #endif }
/* SIMD selection */
/* Bitwise select between a and b, driven by a mask reached through resultPtr:
 * the SSE and AVX paths return (~mask & a) | (mask & b), i.e. b where the
 * mask bits are set and a elsewhere. */
__SIMD _SIMD_sel_ps(__SIMD a, __SIMD b, void** resultPtr)
{
#ifdef USE_SSE
    /* here the mask is read from the object that *resultPtr points to */
    __SIMD* result = (__SIMD*) (*resultPtr);
    return _mm_or_ps(_mm_andnot_ps(*result,a),_mm_and_ps(*result,b));
#elif defined USE_AVX
    /* NOTE(review): unlike the SSE path, resultPtr itself is cast here and is
     * NOT dereferenced first, so the mask is read from the pointer slot rather
     * than from its target. One of the two paths looks wrong - confirm against
     * the code that fills resultPtr. */
    __SIMD* result = (__SIMD*) resultPtr;
    return _mm256_or_ps(_mm256_andnot_ps(*result,a),_mm256_and_ps(*result,b));
#elif defined USE_IBM
    /* NOTE(review): `c` is not declared anywhere in this function, so this
     * path cannot compile as written; presumably the mask reached through
     * resultPtr was intended - TODO confirm. */
    return vec_sel(a,b,c);
#endif
}
void HighPassFilter::setFlaggedValuesToZeroAndMakeWeightsSSE(const Image2DCPtr &inputImage, const Image2DPtr &outputImage, const Mask2DCPtr &inputMask, const Image2DPtr &weightsOutput) { const size_t width = inputImage->Width(); const __m128i zero4i = _mm_set_epi32(0, 0, 0, 0); const __m128 zero4 = _mm_set_ps(0.0, 0.0, 0.0, 0.0); const __m128 one4 = _mm_set_ps(1.0, 1.0, 1.0, 1.0); for(size_t y=0;y<inputImage->Height();++y) { const bool *rowPtr = inputMask->ValuePtr(0, y); const float *inputPtr = inputImage->ValuePtr(0, y); float *outputPtr = outputImage->ValuePtr(0, y); float *weightsPtr = weightsOutput->ValuePtr(0, y); const float *end = inputPtr + width; while(inputPtr < end) { // Assign each integer to one bool in the mask // Convert false to 0xFFFFFFFF and true to 0 __m128 conditionMask = _mm_castsi128_ps( _mm_cmpeq_epi32(_mm_set_epi32(rowPtr[3] || !isfinite(inputPtr[3]), rowPtr[2] || !isfinite(inputPtr[2]), rowPtr[1] || !isfinite(inputPtr[1]), rowPtr[0] || !isfinite(inputPtr[0])), zero4i)); _mm_store_ps(weightsPtr, _mm_or_ps( _mm_and_ps(conditionMask, one4), _mm_andnot_ps(conditionMask, zero4) )); _mm_store_ps(outputPtr, _mm_or_ps( _mm_and_ps(conditionMask, _mm_load_ps(inputPtr)), _mm_andnot_ps(conditionMask, zero4) )); rowPtr += 4; outputPtr += 4; inputPtr += 4; weightsPtr += 4; } } }
inline wg_v4sf Recognizer::local_distance4(float *s, float *t0, float *t1, float *t2, float *t3)
{
    // Computes four sums of absolute differences at once: lane k of the result
    // is sum over the four components of |tk[j] - s[j]|.
    // Every pointer is read as one wg_v4sf (four floats).
    const __m128 signBit = _mm_set_ps1(-0.0);
    const __m128 ref = ((wg_v4sf *)s)->v;

    // and-not with the sign bit gives the absolute value of each difference
    __m128 d0 = _mm_andnot_ps(signBit, _mm_sub_ps(((wg_v4sf *)t0)->v, ref));
    __m128 d1 = _mm_andnot_ps(signBit, _mm_sub_ps(((wg_v4sf *)t1)->v, ref));
    __m128 d2 = _mm_andnot_ps(signBit, _mm_sub_ps(((wg_v4sf *)t2)->v, ref));
    __m128 d3 = _mm_andnot_ps(signBit, _mm_sub_ps(((wg_v4sf *)t3)->v, ref));

    // convert row vectors to column vectors, so a vertical add sums one
    // candidate's components per lane
    _MM_TRANSPOSE4_PS(d0, d1, d2, d3);

    wg_v4sf total;
    total.v = _mm_add_ps(_mm_add_ps(_mm_add_ps(d3, d2), d1), d0);
    return total;
}
SIMDValue SIMDFloat32x4Operation::OpSelect(const SIMDValue& mV, const SIMDValue& tV, const SIMDValue& fV)
{
    // Bitwise select: result = (mask & trueValue) | (~mask & falseValue).
    X86SIMDValue mask = X86SIMDValue::ToX86SIMDValue(mV);
    X86SIMDValue whenSet = X86SIMDValue::ToX86SIMDValue(tV);
    X86SIMDValue whenClear = X86SIMDValue::ToX86SIMDValue(fV);

    X86SIMDValue x86Result;
    x86Result.m128_value = _mm_or_ps(
        _mm_and_ps(mask.m128_value, whenSet.m128_value),        // mask & True
        _mm_andnot_ps(mask.m128_value, whenClear.m128_value));  // !mask & False

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static inline bool equals_sse(const float3& f1, const float3& f2) { // same as equals_new() just with SSE __m128 eq; __m128 m1 = _mm_set_ps(f1[0], f1[1], f1[2], 0.f); __m128 m2 = _mm_set_ps(f2[0], f2[1], f2[2], 0.f); eq = _mm_cmpeq_ps(m1, m2); if ((eq[0] != 0) && (eq[1] != 0) && (eq[2] != 0)) return true; static const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 static const __m128 eps = _mm_set1_ps(float3::cmp_eps()); static const __m128 ones = _mm_set1_ps(1.f); __m128 am1 = _mm_andnot_ps(sign_mask, m1); __m128 am2 = _mm_andnot_ps(sign_mask, m2); __m128 right = _mm_add_ps(am1, am2); right = _mm_add_ps(right, ones); right = _mm_mul_ps(right, eps); __m128 left = _mm_sub_ps(m1, m2); left = _mm_andnot_ps(sign_mask, left); eq = _mm_cmple_ps(left, right); return ((eq[0] != 0) && (eq[1] != 0) && (eq[2] != 0)); }
// Piecewise function evaluated on four values at once:
//   x >  epsilon : x^3
//   x <= epsilon : (116 * 27 / 24389) * x - (16 * 27 / 24389)
// with epsilon = cbrt(216 / 24389). Both pieces are computed for every lane
// and merged with a compare mask.
static inline __m128 lab_f_inv_m(const __m128 x)
{
  const __m128 threshold = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f);
  const __m128 offset = _mm_set1_ps(16.0f * 27.0f / 24389.0f);
  const __m128 slope = _mm_set1_ps(116.0f * 27.0f / 24389.0f);

  // cubic piece, used above the threshold
  const __m128 squared = _mm_mul_ps(x, x);
  const __m128 cubic = _mm_mul_ps(squared, x);

  // linear piece, used at or below the threshold
  const __m128 linear = _mm_sub_ps(_mm_mul_ps(slope, x), offset);

  const __m128 above = _mm_cmpgt_ps(x, threshold);
  const __m128 picked_cubic = _mm_and_ps(above, cubic);
  const __m128 picked_linear = _mm_andnot_ps(above, linear);
  return _mm_or_ps(picked_cubic, picked_linear);
}
inline float Recognizer::local_distance(float *v1, float *v2) { float sum = 0; #ifdef __SSE__ wg_v4sf res, v_; v_.v = _mm_set_ps1(-0.0); res.v = _mm_sub_ps(((wg_v4sf *)v2)->v, ((wg_v4sf *)v1)->v); res.v = _mm_andnot_ps(v_.v,res.v); // absolute value for (unsigned int i=0; i < dimension; i++) sum += res.s[i]; #else for (unsigned int i=0; i < dimension; i++) sum += fabs(v2[i] - v1[i]); #endif return sum; }
inline void GDALCopy4WordsSSE(const float* pValueIn, Tout* const &pValueOut)
{
    // Converts four floats (unaligned load) to Tout: clamp to the limits
    // reported by GDALGetDataLimits<float, Tout>, convert to 32-bit ints and
    // write the low 16 bits of each lane to pValueOut[0..3].
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);

    __m128 values = _mm_loadu_ps(pValueIn);
    const __m128 lowest = _mm_set1_ps(fMinVal);
    const __m128 highest = _mm_set1_ps(fMaxVal);
    values = _mm_min_ps(_mm_max_ps(values, lowest), highest);

#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    const __m128 plusHalf = _mm_set1_ps(0.5f);
    if (std::numeric_limits<Tout>::is_signed)
    {
        // f >= 0.5f ? f + 0.5f : f - 0.5f
        // The threshold is 0.5 rather than 0: values in [0, 0.5) become
        // (-0.5, 0) and still truncate to 0 below.
        const __m128 minusHalf = _mm_set1_ps(-0.5f);
        const __m128 atLeastHalf = _mm_cmpge_ps(values, plusHalf);
        const __m128 bias = _mm_or_ps(_mm_and_ps(atLeastHalf, plusHalf),
                                      _mm_andnot_ps(atLeastHalf, minusHalf));
        values = _mm_add_ps(values, bias);
    }
    else
    {
        values = _mm_add_ps(values, plusHalf);
    }
    // bias already applied, so truncate
    const __m128i asInts = _mm_cvttps_epi32(values);
#else
    const __m128i asInts = _mm_cvtps_epi32(values);
#endif

    // 16-bit words 0, 2, 4, 6 are the low halves of the four 32-bit lanes.
    pValueOut[0] = (Tout)_mm_extract_epi16(asInts, 0);
    pValueOut[1] = (Tout)_mm_extract_epi16(asInts, 2);
    pValueOut[2] = (Tout)_mm_extract_epi16(asInts, 4);
    pValueOut[3] = (Tout)_mm_extract_epi16(asInts, 6);
}
// Returns { f, g, f, g }, where f = bump0 (t), g = bump1 (t). v4f bumps_t::operator () (float t) const { // Compute all four polynomials by Estrin's method, and mask and combine the // values according to the region of the graph to which t belongs. v4f s = _mm_set1_ps (t); v4f S = load4f (S0); v4f T = load4f (T0); v4f U = load4f (U0); v4f V = load4f (V0); v4f f01 = load4f (c [0]) + load4f (c [1]) * s; v4f f12 = load4f (c [2]) + load4f (c [3]) * s; v4f f = f01 + f12 * s * s; v4f ltS = _mm_cmplt_ps (s, S); v4f geT = _mm_cmpge_ps (s, T); v4f x1 = _mm_andnot_ps (_mm_or_ps (ltS, geT), f); v4f x2 = _mm_and_ps (ltS, U); v4f x3 = _mm_and_ps (geT, V); v4f val = _mm_or_ps (_mm_or_ps (x1, x2), x3); return _mm_hadd_ps (val, val); }
void HighPassFilter::elementWiseDivideSSE(const Image2DPtr &leftHand, const Image2DCPtr &rightHand)
{
	// In place: leftHand = leftHand / rightHand element by element, except
	// that elements whose divisor equals zero are set to 0.
	// Rows are processed four samples at a time with aligned loads/stores.
	const __m128 zero4 = _mm_setzero_ps();

	for(unsigned y=0;y<leftHand->Height();++y)
	{
		float *numerator = leftHand->ValuePtr(0, y);
		const float *denominator = rightHand->ValuePtr(0, y);
		float * const rowEnd = numerator + leftHand->Width();

		for(; numerator < rowEnd; numerator += 4, denominator += 4)
		{
			const __m128 l = _mm_load_ps(numerator);
			const __m128 r = _mm_load_ps(denominator);
			const __m128 divisorIsZero = _mm_cmpeq_ps(r, zero4);

			// and-not clears the quotient (to 0.0f) where the divisor is zero
			_mm_store_ps(numerator, _mm_andnot_ps(divisorIsZero, _mm_div_ps(l, r)));
		}
	}
}
void BrushToolEdit::drawInner(const QPoint &pt, float strength) { float fixedStrength = params.strength; strength *= fixedStrength; auto color = params.color; std::array<int, 3> colorParts = Terrain::expandColor(color); __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0); SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST); (void) roundingModeScope; switch (tool->type()) { case BrushType::Blur: drawBlur(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Smoothen: drawSmoothen(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Raise: case BrushType::Lower: if (tool->type() == BrushType::Lower) { fixedStrength = -fixedStrength; strength = -strength; } switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength *= 3.f; drawRaiseLower(pt, [=](float ¤t, float before, float tip) { (void) before; current -= tip * strength; }); break; case BrushPressureMode::Constant: if (tool->type() == BrushType::Lower) { drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength)); }); } else { drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength)); }); } break; case BrushPressureMode::Adjustable: drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(before - tip * strength); }); break; } break; case BrushType::Paint: switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength = 1.f - std::exp2(-strength); drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { (void) before; // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(¤t))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); auto factor = _mm_set1_ps(tip * strength); // blend auto 
diff = _mm_sub_ps(colorMM, currentMF); diff = _mm_mul_ps(diff, factor); currentMF = _mm_add_ps(currentMF, diff); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(currentMM)); }); break; case BrushPressureMode::Constant: fixedStrength *= 0.01f; drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(¤t))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128()); // use "before" image to which way of color change is possible, and // compute possible range of result color auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * fixedStrength); auto adddiff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, adddiff); auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps()); // compute output image auto out1 = _mm_max_ps(currentMF, beforeMF); auto out2 = _mm_min_ps(currentMF, beforeMF); currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2)); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(currentMM)); }); break; case 
BrushPressureMode::Adjustable: strength *= 0.01f; drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // blend auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * strength); diff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, diff); // convert to RGB32 beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128()); beforeMM = _mm_cvttps_epi32(beforeMF); beforeMM = _mm_packs_epi32(beforeMM, beforeMM); beforeMM = _mm_packus_epi16(beforeMM, beforeMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(beforeMM)); }); break; } break; } }