Esempi in C++ (Cpp) per _mm_andnot_ps

Esempio n. 1

0

Mostra file

File: SimdFloat32x4OperationX86X64.cpp Progetto: Astolfoho/ChakraCore

    SIMDValue SIMDFloat32x4Operation::OpMaxNum(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        X86SIMDValue mask, mask2, t1, t2;

        // This is the correct result or b if either is NaN or both are +/-0.0
        x86Result.m128_value = _mm_max_ps(tmpaValue.m128_value, tmpbValue.m128_value);
        // Find NaNs in b
        mask.m128_value = _mm_cmpunord_ps(tmpbValue.m128_value, tmpbValue.m128_value);
        // Find +0.0 in a
        mask2.m128i_value = _mm_cmpeq_epi32(tmpaValue.m128i_value, X86_ALL_ZEROS.m128i_value);
        // mask2 is -0.0 where a is +0.0
        mask2.m128_value = _mm_and_ps(mask2.m128_value, X86_TWO_31_I4.m128_value);
        // For lanes where a is +0.0, the result is either correct (positive), or b which is possibly -0.0
        // Safe to force sign to positive for those lanes, +0.0 becomes -0.0.
        x86Result.m128_value = _mm_andnot_ps(mask2.m128_value, x86Result.m128_value);
        // For NaNs in b, choose a, else keep result.
        t1.m128_value = _mm_and_ps(tmpaValue.m128_value, mask.m128_value);
        t2.m128_value = _mm_andnot_ps(mask.m128_value, x86Result.m128_value);
        x86Result.m128_value = _mm_or_ps(t1.m128_value, t2.m128_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }

Esempio n. 2

0

Mostra file

File: btSequentialImpulseConstraintSolver.cpp Progetto: jinjoh/NOOR

 void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c)
{
#ifdef USE_SIMD
	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
	__m128	lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
	__m128	upperLimit1 = _mm_set1_ps(c.m_upperLimit);
	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm)));
	__m128 deltaVel1Dotn	=	_mm_add_ps(_vmathVfDot3(c.m_contactNormal.mVec128,body1.m_deltaLinearVelocity.mVec128), _vmathVfDot3(c.m_relpos1CrossNormal.mVec128,body1.m_deltaAngularVelocity.mVec128));
	__m128 deltaVel2Dotn	=	_mm_sub_ps(_vmathVfDot3(c.m_relpos2CrossNormal.mVec128,body2.m_deltaAngularVelocity.mVec128),_vmathVfDot3((c.m_contactNormal).mVec128,body2.m_deltaLinearVelocity.mVec128));
	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
	btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
	btSimdScalar resultLowerLess,resultUpperLess;
	resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
	resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
	deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
	c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.m_invMass.mVec128);
	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.m_invMass.mVec128);
	__m128 impulseMagnitude = deltaImpulse;
	body1.m_deltaLinearVelocity.mVec128 = _mm_add_ps(body1.m_deltaLinearVelocity.mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
	body1.m_deltaAngularVelocity.mVec128 = _mm_add_ps(body1.m_deltaAngularVelocity.mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
	body2.m_deltaLinearVelocity.mVec128 = _mm_sub_ps(body2.m_deltaLinearVelocity.mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
	body2.m_deltaAngularVelocity.mVec128 = _mm_add_ps(body2.m_deltaAngularVelocity.mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
#else
	resolveSingleConstraintRowLowerLimit(body1,body2,c);
#endif
}

Esempio n. 3

0

Mostra file

File: btSequentialImpulseConstraintSolver.cpp Progetto: 3DGEOM/Blender

 void btSequentialImpulseConstraintSolver::resolveSplitPenetrationSIMD(btRigidBody& body1,btRigidBody& body2,const btSolverConstraint& c)
{
#ifdef USE_SIMD
	if (!c.m_rhsPenetration)
		return;

	gNumSplitImpulseRecoveries++;

	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedPushImpulse);
	__m128	lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
	__m128	upperLimit1 = _mm_set1_ps(c.m_upperLimit);
	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhsPenetration), _mm_mul_ps(_mm_set1_ps(c.m_appliedPushImpulse),_mm_set1_ps(c.m_cfm)));
	__m128 deltaVel1Dotn	=	_mm_add_ps(btSimdDot3(c.m_contactNormal.mVec128,body1.internalGetPushVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetTurnVelocity().mVec128));
	__m128 deltaVel2Dotn	=	_mm_sub_ps(btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetTurnVelocity().mVec128),btSimdDot3((c.m_contactNormal).mVec128,body2.internalGetPushVelocity().mVec128));
	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
	deltaImpulse	=	_mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
	btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
	btSimdScalar resultLowerLess,resultUpperLess;
	resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
	resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
	deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
	c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
	__m128	linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128,body1.internalGetInvMass().mVec128);
	__m128	linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128,body2.internalGetInvMass().mVec128);
	__m128 impulseMagnitude = deltaImpulse;
	body1.internalGetPushVelocity().mVec128 = _mm_add_ps(body1.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
	body1.internalGetTurnVelocity().mVec128 = _mm_add_ps(body1.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
	body2.internalGetPushVelocity().mVec128 = _mm_sub_ps(body2.internalGetPushVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
	body2.internalGetTurnVelocity().mVec128 = _mm_add_ps(body2.internalGetTurnVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
#else
	resolveSplitPenetrationImpulseCacheFriendly(body1,body2,c);
#endif
}

Esempio n. 4

0

Mostra file

File: aec_core_sse2.c Progetto: AGProjects/python-sipsimple

static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
{
  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
  const __m128 kThresh = _mm_set1_ps(aec->errThresh);
  const __m128 kMu = _mm_set1_ps(aec->mu);

  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);

    const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
    ef_re_if = _mm_and_ps(bigger, ef_re_if);
    ef_im_if = _mm_and_ps(bigger, ef_im_if);
    ef_re = _mm_andnot_ps(bigger, ef_re);
    ef_im = _mm_andnot_ps(bigger, ef_im);
    ef_re = _mm_or_ps(ef_re, ef_re_if);
    ef_im = _mm_or_ps(ef_im, ef_im_if);
    ef_re = _mm_mul_ps(ef_re, kMu);
    ef_im = _mm_mul_ps(ef_im, kMu);

    _mm_storeu_ps(&ef[0][i], ef_re);
    _mm_storeu_ps(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < (PART_LEN1); i++) {
    float absEf;
    ef[0][i] /= (aec->xPow[i] + 1e-10f);
    ef[1][i] /= (aec->xPow[i] + 1e-10f);
    absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (absEf > aec->errThresh) {
      absEf = aec->errThresh / (absEf + 1e-10f);
      ef[0][i] *= absEf;
      ef[1][i] *= absEf;
    }

    // Stepsize factor
    ef[0][i] *= aec->mu;
    ef[1][i] *= aec->mu;
  }
}

Esempio n. 5

0

Mostra file

F32 Aabb::testPlane(const Plane& p) const
{
	const Aabb& aabb = *this;

#if ANKI_SIMD == ANKI_SIMD_SSE
	__m128 gezero = _mm_cmpge_ps(p.getNormal().getSimd(), _mm_setzero_ps());

	Vec4 diagMin;
	diagMin.getSimd() =
		_mm_or_ps(_mm_and_ps(gezero, aabb.getMin().getSimd()), _mm_andnot_ps(gezero, aabb.getMax().getSimd()));
#else
	Vec4 diagMin(0.0), diagMax(0.0);
	// set min/max values for x,y,z direction
	for(U i = 0; i < 3; i++)
	{
		if(p.getNormal()[i] >= 0.0)
		{
			diagMin[i] = aabb.getMin()[i];
			diagMax[i] = aabb.getMax()[i];
		}
		else
		{
			diagMin[i] = aabb.getMax()[i];
			diagMax[i] = aabb.getMin()[i];
		}
	}
#endif

	// minimum on positive side of plane, box on positive side
	ANKI_ASSERT(diagMin.w() == 0.0);
	F32 test = p.test(diagMin);
	if(test > 0.0)
	{
		return test;
	}

#if ANKI_SIMD == ANKI_SIMD_SSE
	Vec4 diagMax;
	diagMax.getSimd() =
		_mm_or_ps(_mm_and_ps(gezero, aabb.getMax().getSimd()), _mm_andnot_ps(gezero, aabb.getMin().getSimd()));
#endif

	ANKI_ASSERT(diagMax.w() == 0.0);
	test = p.test(diagMax);
	if(test >= 0.0)
	{
		// min on non-positive side, max on non-negative side, intersection
		return 0.0;
	}
	else
	{
		// max on negative side, box on negative side
		return test;
	}
}

Esempio n. 6

0

Mostra file

File: overexposed.c Progetto: Coshibu/darktable

void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
    {
      const __m128 pixel = _mm_load_ps(in);

      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
                                _mm_and_ps(isoe, upper_color));

      result = _mm_or_ps(_mm_andnot_ps(isue, result),
                         _mm_and_ps(isue, lower_color));

      _mm_stream_ps(out, result);
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}

Esempio n. 7

0

Mostra file

File: locallaplacian.c Progetto: rgo/darktable

static inline __m128 curve_vec4(
    const __m128 x,
    const __m128 g,
    const __m128 sigma,
    const __m128 shadows,
    const __m128 highlights,
    const __m128 clarity)
{
  // TODO: pull these non-data depedent constants out of the loop to see
  // whether the compiler fail to do so
  const __m128 const0 = _mm_set_ps1(0x3f800000u);
  const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x
  const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 two = _mm_set1_ps(2.0f);
  const __m128 twothirds = _mm_set1_ps(2.0f/3.0f);
  const __m128 twosig = _mm_mul_ps(two, sigma);
  const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
  const __m128 s22 = _mm_mul_ps(twothirds, sigma2);

  const __m128 c = _mm_sub_ps(x, g);
  const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
  // select shadows or highlights as multiplier for linear part, based on c < 0
  const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
  // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma)
  const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
  // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma
  const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));

  const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(),
        _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
  const __m128 t2 = _mm_mul_ps(t, t);
  const __m128 mt = _mm_sub_ps(one, t);

  // midtone value fading over to linear part, without local contrast:
  const __m128 vmid = _mm_add_ps(g,
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
        _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));

  // c > 2*sigma?
  const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
  const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));

  // midtone local contrast
  // dt_fast_expf in sse:
  const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
  const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
  const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
  const __m128i ki = _mm_cvtps_epi32(k);
  const __m128 gauss = _mm_load_ps((float*)&ki);
  const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
  return _mm_add_ps(val, vcon);
}

Esempio n. 8

0

Mostra file

File: intr_frustum_opt.c Progetto: matovitch/The-Dream-Machine

int intr_frustum_box(const float* frustum, const float* box_min, const float* box_max)
{
    const __m128 min = _mm_load_ps(box_min);
    const __m128 max = _mm_load_ps(box_max);

    for (int i = 0; i < 6; i++) {
        const __m128 plane = _mm_load_ps(frustum + 4 * i);

        const __m128 mask = _mm_cmplt_ps(plane, _mm_setzero_ps());
        const __m128 n = _mm_or_ps(_mm_and_ps(mask, max),
                                   _mm_andnot_ps(mask, min));

        const __m128 d = _mm_mul_ps(n, plane);
        const __m128 d0 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(2, 1, 0, 3));
        const __m128 d1 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(1, 0, 3, 2));
        const __m128 d2 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(0, 3, 2, 1));
        const __m128 dot = _mm_add_ss(_mm_add_ss(d0, d), _mm_add_ss(d1, d2));

        const __m128 ret = _mm_cmpgt_ss(dot, _mm_setzero_ps());
        
        float reti;
        _mm_store_ss(&reti, ret);
        if (reti != 0)
            return 0;
    }
    
    return 1;
}

Esempio n. 9

0

Mostra file

File: i386-ssetype-4.c Progetto: VargMon/dd-wrt

__m128
t2(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_andnot_ps (a,b);
}

Esempio n. 10

0

Mostra file

File: SimdSse41Detection.cpp Progetto: ashh87/Simd

 SIMD_INLINE void StageSum32f(const float * leaves, float threshold, const __m128 & sum, const __m128 & norm, __m128 & stageSum)
 {
     __m128 mask = _mm_cmplt_ps(sum, _mm_mul_ps(_mm_set1_ps(threshold), norm));
     __m128 leaf0 = _mm_and_ps(mask, _mm_set1_ps(leaves[0]));
     __m128 leaf1 = _mm_andnot_ps(mask, _mm_set1_ps(leaves[1]));
     stageSum = _mm_add_ps(stageSum, _mm_or_ps(leaf0, leaf1));
 }

Esempio n. 11

0

Mostra file

File: gdal_priv_templates.hpp Progetto: bbradbury/lib_gdal

inline void GDALCopyWordSSE(const float fValueIn, Tout &tValueOut)
{
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
    __m128 xmm = _mm_set_ss(fValueIn);
    __m128 xmm_min = _mm_set_ss(fMinVal);
    __m128 xmm_max = _mm_set_ss(fMaxVal);
    xmm = _mm_min_ss(_mm_max_ss(xmm, xmm_min), xmm_max);
#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128 p0d5 = _mm_set_ss(0.5f);
    if (std::numeric_limits<Tout>::is_signed)
    {
        __m128 mask = _mm_cmpge_ss(xmm, _mm_set_ss(0.f));
        __m128 m0d5 = _mm_set_ss(-0.5f);
        xmm = _mm_add_ss(xmm, _mm_or_ps(_mm_and_ps(mask, p0d5), _mm_andnot_ps(mask, m0d5)));
    }
    else
    {
        xmm = _mm_add_ss(xmm, p0d5);
    }
#endif

#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    tValueOut = (Tout)_mm_cvttss_si32(xmm);
#else
    tValueOut = (Tout)_mm_cvtss_si32(xmm);
#endif
}

Esempio n. 12

0

Mostra file

File: davids_algo.c Progetto: caomw/piggyHOG

void vFindMax(__m128i *pixels, int n)
{
  __m128i vIdx,vMax;
  int i;
  vIdx = _mm_setzero_si128();
  vMax = _mm_set_epi32(INT_MIN,INT_MIN,INT_MIN,INT_MIN);
  for(i = 0; i < n; i++)
    {
      __m128i v = _mm_load_si128(pixels+i);
      __m128i vCmp = _mm_cmpgt_epi32(v, vMax);
      /* max value */
      vMax = _mm_max_epi32(vMax,v);
      
      __m128i vBdxIdx = _mm_set_epi32(i,i,i,i); 
      
      __m128 t0 = _mm_and_ps((__m128)vBdxIdx,(__m128)vCmp);
      __m128 t1 = _mm_andnot_ps((__m128)vCmp, (__m128)vIdx);
      /* max index */
      vIdx = (__m128i)_mm_or_ps(t0,t1);
    }
  int indices[4];
  int values[4];
  _mm_store_si128((__m128i*)indices, vIdx);
  _mm_store_si128((__m128i*)values, vMax);
  printf("SSE:\n");
  for(i=0;i<4;i++)
    {
      printf("%d:max=%d,idx=%d\n",i,values[i],indices[i]);
      //int idx = 4*indices[i] + i;
      //int *sArr = (int*)pixels;
      //printf("sArr[%d]=%d\n",idx,sArr[idx]);
    }
}

Esempio n. 13

0

Mostra file

File: ccv_basic.c Progetto: Daiver/tagit

/* the fast arctan function adopted from OpenCV */
static void _ccv_atan2(float* x, float* y, float* angle, float* mag, int len)
{
	int i = 0;
	float scale = (float)(180.0 / CCV_PI);
#ifdef HAVE_SSE2
#ifndef _WIN32
	union { int i; float fl; } iabsmask; iabsmask.i = 0x7fffffff;
	__m128 eps = _mm_set1_ps((float)1e-6), absmask = _mm_set1_ps(iabsmask.fl);
	__m128 _90 = _mm_set1_ps((float)(3.141592654 * 0.5)), _180 = _mm_set1_ps((float)3.141592654), _360 = _mm_set1_ps((float)(3.141592654 * 2));
	__m128 zero = _mm_setzero_ps(), _0_28 = _mm_set1_ps(0.28f), scale4 = _mm_set1_ps(scale);
	
	for(; i <= len - 4; i += 4)
	{
		__m128 x4 = _mm_loadu_ps(x + i), y4 = _mm_loadu_ps(y + i);
		__m128 xq4 = _mm_mul_ps(x4, x4), yq4 = _mm_mul_ps(y4, y4);
		__m128 xly = _mm_cmplt_ps(xq4, yq4);
		__m128 z4 = _mm_div_ps(_mm_mul_ps(x4, y4), _mm_add_ps(_mm_add_ps(_mm_max_ps(xq4, yq4), _mm_mul_ps(_mm_min_ps(xq4, yq4), _0_28)), eps));

		// a4 <- x < y ? 90 : 0;
		__m128 a4 = _mm_and_ps(xly, _90);
		// a4 <- (y < 0 ? 360 - a4 : a4) == ((x < y ? y < 0 ? 270 : 90) : (y < 0 ? 360 : 0))
		__m128 mask = _mm_cmplt_ps(y4, zero);
		a4 = _mm_or_ps(_mm_and_ps(_mm_sub_ps(_360, a4), mask), _mm_andnot_ps(mask, a4));
		// a4 <- (x < 0 && !(x < y) ? 180 : a4)
		mask = _mm_andnot_ps(xly, _mm_cmplt_ps(x4, zero));
		a4 = _mm_or_ps(_mm_and_ps(_180, mask), _mm_andnot_ps(mask, a4));
		
		// a4 <- (x < y ? a4 - z4 : a4 + z4)
		a4 = _mm_mul_ps(_mm_add_ps(_mm_xor_ps(z4, _mm_andnot_ps(absmask, xly)), a4), scale4);
		__m128 m4 = _mm_sqrt_ps(_mm_add_ps(xq4, yq4));
		_mm_storeu_ps(angle + i, a4);
		_mm_storeu_ps(mag + i, m4);
	}
#endif
#endif
	for(; i < len; i++)
	{
		float xf = x[i], yf = y[i];
		float a, x2 = xf * xf, y2 = yf * yf;
		if(y2 <= x2)
			a = xf * yf / (x2 + 0.28f * y2 + (float)1e-6) + (float)(xf < 0 ? CCV_PI : yf >= 0 ? 0 : CCV_PI * 2);
		else
			a = (float)(yf >= 0 ? CCV_PI * 0.5 : CCV_PI * 1.5) - xf * yf / (y2 + 0.28f * x2 + (float)1e-6);
		angle[i] = a * scale;
		mag[i] = sqrtf(x2 + y2);
	}
}

Esempio n. 14

0

Mostra file

File: nx_sse_float.hpp Progetto: mywoodstock/nxsimd

    inline vector4f select(const vector4fb& cond, const vector4f& a, const vector4f& b)
    {
#if SSE_INSTR_SET >= 5  // SSE 4.1
        return _mm_blendv_ps(b, a, cond);
#else
        return _mm_or_ps(_mm_and_ps(cond, a), _mm_andnot_ps(cond, b));
#endif
    }

Esempio n. 15

0

Mostra file

File: neuralnetsse.c Progetto: bgnori/gnubg

static inline __m128 sigmoid_ps( __m128 xin )
{	
	__m128 mask = _mm_cmplt_ps( xin, _mm_setzero_ps() );
	__m128 c; 
	xin = _mm_and_ps (xin , abs_mask.ps ); /* Abs. value by clearing signbit */
	c = sigmoid_positive_ps(xin);
	return _mm_or_ps( _mm_and_ps(  mask, c ) , _mm_andnot_ps ( mask , _mm_sub_ps( ones.ps, c )));
}

Esempio n. 16

0

Mostra file

File: sse2-float.c Progetto: LebedevRI/babl

static inline __v4sf
gamma_2_2_to_linear_sse2 (__v4sf x)
{
  __v4sf curve = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
  __v4sf line = x * splat4f (1/12.92f);
  __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.04045f));
  return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
}

Esempio n. 17

0

Mostra file

File: sse2-float.c Progetto: LebedevRI/babl

static inline __v4sf
linear_to_gamma_2_2_sse2 (__v4sf x)
{
  __v4sf curve = sse_pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f);
  __v4sf line = x * splat4f (12.92f);
  __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f));
  return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
}

Esempio n. 18

0

Mostra file

File: Math.cpp Progetto: janivanecky/Bark

float Math::Abs(float x)
{
    static const __m128 SIGNMASK =
        _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
    __m128 val;
    val.m128_f32[0] = x;
    __m128 absval = _mm_andnot_ps(SIGNMASK, val);
    return absval.m128_f32[0];
}

Esempio n. 19

0

Mostra file

File: rosesimd.c Progetto: peihunglin/TSVC_benchmark

// SIMD abs
__SIMD _SIMD_abs_ps(__SIMD a)
{
#ifdef  USE_SSE
  return _mm_andnot_ps(_mm_set1_ps(-0.0f), a); 
#elif defined USE_AVX
  return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a); 
#elif defined USE_IBM
  return vec_abs(a);
#endif
}

Esempio n. 20

0

Mostra file

File: rosesimd.c Progetto: peihunglin/TSVC_benchmark

/*
  SIMD selection
*/
__SIMD _SIMD_sel_ps(__SIMD a, __SIMD b, void** resultPtr)
{
#ifdef  USE_SSE
  __SIMD* result = (__SIMD*) (*resultPtr);
  return _mm_or_ps(_mm_andnot_ps(*result,a),_mm_and_ps(*result,b));
#elif defined USE_AVX
  __SIMD* result = (__SIMD*) resultPtr;
  return _mm256_or_ps(_mm256_andnot_ps(*result,a),_mm256_and_ps(*result,b));
#elif defined USE_IBM
  return vec_sel(a,b,c);
#endif
}

Esempio n. 21

0

Mostra file

File: highpassfilter.cpp Progetto: saiyanprince/pyimager

void HighPassFilter::setFlaggedValuesToZeroAndMakeWeightsSSE(const Image2DCPtr &inputImage, const Image2DPtr &outputImage, const Mask2DCPtr &inputMask, const Image2DPtr &weightsOutput)
{
	const size_t width = inputImage->Width();
	const __m128i zero4i = _mm_set_epi32(0, 0, 0, 0);
	const __m128 zero4 = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
	const __m128 one4 = _mm_set_ps(1.0, 1.0, 1.0, 1.0);
	for(size_t y=0;y<inputImage->Height();++y)
	{
		const bool *rowPtr = inputMask->ValuePtr(0, y);
		const float *inputPtr = inputImage->ValuePtr(0, y);
		float *outputPtr = outputImage->ValuePtr(0, y);
		float *weightsPtr = weightsOutput->ValuePtr(0, y);
		const float *end = inputPtr + width;
		while(inputPtr < end)
		{
			
			// Assign each integer to one bool in the mask
			// Convert false to 0xFFFFFFFF and true to 0
			__m128 conditionMask = _mm_castsi128_ps(
				_mm_cmpeq_epi32(_mm_set_epi32(rowPtr[3] || !isfinite(inputPtr[3]), rowPtr[2] || !isfinite(inputPtr[2]),
																			rowPtr[1] || !isfinite(inputPtr[1]), rowPtr[0] || !isfinite(inputPtr[0])),
												zero4i));
			
			_mm_store_ps(weightsPtr, _mm_or_ps(
				_mm_and_ps(conditionMask, one4),
				_mm_andnot_ps(conditionMask, zero4)
			));
			_mm_store_ps(outputPtr, _mm_or_ps(
				_mm_and_ps(conditionMask, _mm_load_ps(inputPtr)),
				_mm_andnot_ps(conditionMask, zero4)
			));
			
			rowPtr += 4;
			outputPtr += 4;
			inputPtr += 4;
			weightsPtr += 4;
		}
	}
}

Esempio n. 22

0

Mostra file

File: wagomu.cpp Progetto: ZhangXinNan/tegaki

inline wg_v4sf Recognizer::local_distance4(float *s,
                                           float *t0,
                                           float *t1,
                                           float *t2,
                                           float *t3) {
    wg_v4sf v_, v0, v1, v2, v3;
    v_.v = _mm_set_ps1(-0.0);
    v0.v = _mm_sub_ps(((wg_v4sf *)t0)->v, ((wg_v4sf *)s)->v);
    v0.v = _mm_andnot_ps(v_.v,v0.v); // absolute value
    v1.v = _mm_sub_ps(((wg_v4sf *)t1)->v, ((wg_v4sf *)s)->v);
    v1.v = _mm_andnot_ps(v_.v,v1.v); // absolute value
    v2.v = _mm_sub_ps(((wg_v4sf *)t2)->v, ((wg_v4sf *)s)->v);
    v2.v = _mm_andnot_ps(v_.v,v2.v); // absolute value
    v3.v = _mm_sub_ps(((wg_v4sf *)t3)->v, ((wg_v4sf *)s)->v);
    v3.v = _mm_andnot_ps(v_.v,v3.v); // absolute value
    // convert row vectors to column vectors
    _MM_TRANSPOSE4_PS(v0.v, v1.v, v2.v, v3.v);
    v3.v = _mm_add_ps(v3.v, v2.v);
    v3.v = _mm_add_ps(v3.v, v1.v);
    v3.v = _mm_add_ps(v3.v, v0.v);
    return v3;
}

Esempio n. 23

0

Mostra file

File: SimdFloat32x4OperationX86X64.cpp Progetto: Astolfoho/ChakraCore

    SIMDValue SIMDFloat32x4Operation::OpSelect(const SIMDValue& mV, const SIMDValue& tV, const SIMDValue& fV)
    {
        X86SIMDValue x86Result;
        X86SIMDValue maskValue  = X86SIMDValue::ToX86SIMDValue(mV);
        X86SIMDValue trueValue  = X86SIMDValue::ToX86SIMDValue(tV);
        X86SIMDValue falseValue = X86SIMDValue::ToX86SIMDValue(fV);

        X86SIMDValue tempTrue, tempFalse;
        tempTrue.m128_value = _mm_and_ps(maskValue.m128_value, trueValue.m128_value);      // mask & True
        tempFalse.m128_value = _mm_andnot_ps(maskValue.m128_value, falseValue.m128_value); // !mask & False
        x86Result.m128_value = _mm_or_ps(tempTrue.m128_value, tempFalse.m128_value); // tempTrue | tempFalse

        return X86SIMDValue::ToSIMDValue(x86Result);
    }

Esempio n. 24

0

Mostra file

File: testFloat3.cpp Progetto: Liuyangbiao/spring

static inline bool equals_sse(const float3& f1, const float3& f2)
{
	// same as equals_new() just with SSE
	__m128 eq;
	__m128 m1 = _mm_set_ps(f1[0], f1[1], f1[2], 0.f);
	__m128 m2 = _mm_set_ps(f2[0], f2[1], f2[2], 0.f);
	eq = _mm_cmpeq_ps(m1, m2);
	if ((eq[0] != 0) && (eq[1] != 0) && (eq[2] != 0))
		return true;

	static const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
	static const __m128 eps = _mm_set1_ps(float3::cmp_eps());
	static const __m128 ones = _mm_set1_ps(1.f);
	__m128 am1 = _mm_andnot_ps(sign_mask, m1);
	__m128 am2 = _mm_andnot_ps(sign_mask, m2);
	__m128 right = _mm_add_ps(am1, am2);
	right = _mm_add_ps(right, ones);
	right = _mm_mul_ps(right, eps);
	__m128 left = _mm_sub_ps(m1, m2);
	left = _mm_andnot_ps(sign_mask, left);

	eq = _mm_cmple_ps(left, right);
	return ((eq[0] != 0) && (eq[1] != 0) && (eq[2] != 0));
}

Esempio n. 25

0

Mostra file

File: colorout.c Progetto: dirkbr/darktable

static inline __m128 lab_f_inv_m(const __m128 x)
{
  const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f);
  const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f);
  const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f);

  // x > epsilon
  const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x);
  // x <= epsilon
  const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16);

  // blend results according to whether each component is > epsilon or not
  const __m128 mask = _mm_cmpgt_ps(x, epsilon);
  return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}

Esempio n. 26

0

Mostra file

File: wagomu.cpp Progetto: ZhangXinNan/tegaki

inline float Recognizer::local_distance(float *v1, float *v2) {
    float sum = 0;
#ifdef __SSE__
    wg_v4sf res, v_;
    v_.v = _mm_set_ps1(-0.0);
    
    res.v = _mm_sub_ps(((wg_v4sf *)v2)->v, ((wg_v4sf *)v1)->v);
    res.v = _mm_andnot_ps(v_.v,res.v); // absolute value

    for (unsigned int i=0; i < dimension; i++)
        sum += res.s[i];
#else
    for (unsigned int i=0; i < dimension; i++) 
        sum += fabs(v2[i] - v1[i]);
#endif
    return sum;
}

Esempio n. 27

0

Mostra file

File: gdal_priv_templates.hpp Progetto: bbradbury/lib_gdal

inline void GDALCopy4WordsSSE(const float* pValueIn, Tout* const &pValueOut)
{
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
    __m128 xmm = _mm_loadu_ps(pValueIn);

    __m128 xmm_min = _mm_set1_ps(fMinVal);
    __m128 xmm_max = _mm_set1_ps(fMaxVal);
    xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);

#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128 p0d5 = _mm_set1_ps(0.5f);
     if (std::numeric_limits<Tout>::is_signed)
     {
        __m128 m0d5 = _mm_set1_ps(-0.5f);
        //__m128 mask = _mm_cmpge_ps(xmm, _mm_set1_ps(0.f));
        __m128 mask = _mm_cmpge_ps(xmm, p0d5);
        xmm = _mm_add_ps(xmm, _mm_or_ps(_mm_and_ps(mask, p0d5), _mm_andnot_ps(mask, m0d5))); /* f >= 0.5f ? f + 0.5f : f - 0.5f */
     }
     else
     {
         xmm = _mm_add_ps(xmm, p0d5);
     }
#endif

#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128i xmm_i = _mm_cvttps_epi32 (xmm);
#else
    __m128i xmm_i = _mm_cvtps_epi32(xmm);
#endif
#if 0
    int aTemp[4];
    _mm_storeu_si128 ( (__m128i *)aTemp, xmm_i);
    pValueOut[0] = (Tout)aTemp[0];
    pValueOut[1] = (Tout)aTemp[1];
    pValueOut[2] = (Tout)aTemp[2];
    pValueOut[3] = (Tout)aTemp[3];
#else
    pValueOut[0] = (Tout)_mm_extract_epi16(xmm_i, 0);
    pValueOut[1] = (Tout)_mm_extract_epi16(xmm_i, 2);
    pValueOut[2] = (Tout)_mm_extract_epi16(xmm_i, 4);
    pValueOut[3] = (Tout)_mm_extract_epi16(xmm_i, 6);
#endif
}

Esempio n. 28

0

Mostra file

File: bump.cpp Progetto: bustercopley/polymorph

// Returns { f, g, f, g }, where f = bump0 (t), g = bump1 (t).
v4f bumps_t::operator () (float t) const
{
  // Compute all four polynomials by Estrin's method, and mask and combine the
  // values according to the region of the graph to which t belongs.
  v4f s = _mm_set1_ps (t);
  v4f S = load4f (S0);
  v4f T = load4f (T0);
  v4f U = load4f (U0);
  v4f V = load4f (V0);
  v4f f01 = load4f (c [0]) + load4f (c [1]) * s;
  v4f f12 = load4f (c [2]) + load4f (c [3]) * s;
  v4f f = f01 + f12 * s * s;
  v4f ltS = _mm_cmplt_ps (s, S);
  v4f geT = _mm_cmpge_ps (s, T);
  v4f x1 = _mm_andnot_ps (_mm_or_ps (ltS, geT), f);
  v4f x2 = _mm_and_ps (ltS, U);
  v4f x3 = _mm_and_ps (geT, V);
  v4f val = _mm_or_ps (_mm_or_ps (x1, x2), x3);
  return _mm_hadd_ps (val, val);
}

Esempio n. 29

0

Mostra file

File: highpassfilter.cpp Progetto: saiyanprince/pyimager

void HighPassFilter::elementWiseDivideSSE(const Image2DPtr &leftHand, const Image2DCPtr &rightHand)
{
	const __m128 zero4 = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
	
	for(unsigned y=0;y<leftHand->Height();++y) {
		float *leftHandPtr = leftHand->ValuePtr(0, y);
		const float *rightHandPtr = rightHand->ValuePtr(0, y);
		float *end = leftHandPtr + leftHand->Width();
		while(leftHandPtr < end)
		{
			__m128
				l = _mm_load_ps(leftHandPtr),
				r = _mm_load_ps(rightHandPtr);
			__m128 conditionMask = _mm_cmpeq_ps(r, zero4);
			_mm_store_ps(leftHandPtr, _mm_or_ps(
				_mm_and_ps(conditionMask, zero4),
				_mm_andnot_ps(conditionMask, _mm_div_ps(l, r))
			));
			leftHandPtr += 4;
			rightHandPtr += 4;
		}
	}
}

Esempio n. 30

0

Mostra file

File: brushtool.cpp Progetto: BuildandShoot/terravox

void BrushToolEdit::drawInner(const QPoint &pt, float strength)
{
    float fixedStrength = params.strength;
    strength *= fixedStrength;

    auto color = params.color;
    std::array<int, 3> colorParts = Terrain::expandColor(color);
    __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0);

    SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST);
    (void) roundingModeScope;

    switch (tool->type()) {
    case BrushType::Blur:
        drawBlur(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Smoothen:
        drawSmoothen(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Raise:
    case BrushType::Lower:
        if (tool->type() == BrushType::Lower) {
            fixedStrength = -fixedStrength;
            strength = -strength;
        }
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength *= 3.f;
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                (void) before;
                current -= tip * strength;
            });
            break;
        case BrushPressureMode::Constant:
            if (tool->type() == BrushType::Lower) {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength));
                });
            } else {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength));
                });
            }
            break;
        case BrushPressureMode::Adjustable:
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                current = Terrain::quantizeOne(before - tip * strength);
            });
            break;
        }
        break;
    case BrushType::Paint:
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength = 1.f - std::exp2(-strength);

            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                (void) before;

                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                auto factor = _mm_set1_ps(tip * strength);

                // blend
                auto diff = _mm_sub_ps(colorMM, currentMF);
                diff = _mm_mul_ps(diff, factor);
                currentMF = _mm_add_ps(currentMF, diff);

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Constant:
            fixedStrength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);
                // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128());

                // use "before" image to which way of color change is possible, and
                // compute possible range of result color
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * fixedStrength);
                auto adddiff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, adddiff);
                auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps());

                // compute output image
                auto out1 = _mm_max_ps(currentMF, beforeMF);
                auto out2 = _mm_min_ps(currentMF, beforeMF);
                currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2));

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Adjustable:
            strength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);

                // blend
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * strength);
                diff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, diff);

                // convert to RGB32
                beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128());
                beforeMM = _mm_cvttps_epi32(beforeMF);
                beforeMM = _mm_packs_epi32(beforeMM, beforeMM);
                beforeMM = _mm_packus_epi16(beforeMM, beforeMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(beforeMM));
            });
            break;
        }
        break;
    }

}