Example #1
/* Function:  esl_sse_expf()
 * Synopsis:  <r[z] = exp x[z]>
 * Incept:    SRE, Fri Dec 14 14:46:27 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = expf(x[z])>.
 *            
 *            Valid for all IEEE754 floats $x_z$.
 *            
 * Xref:      J2/71
 *            J10/62: bugfix, minlogf/maxlogf range was too wide; 
 *                    (k+127) must be >=0 and <=255, so (k+127)<<23
 *                    is a valid IEEE754 float, without touching 
 *                    the sign bit. Pommier had this right in the
 *                    first place, and I didn't understand.
 * 
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2.
 *            
 *            Note on maxlogf/minlogf, which are close to but not
 *            exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so
 *            k+127 is 0..255, a valid IEEE754 8-bit exponent
 *            (0..255), so the bit pattern (k+127)<<23 is IEEE754
 *            single-precision for 2^k.  If k=-127, we get IEEE754 0.
 *            If k=128, we get IEEE754 +inf.  If k<-127, k+127 is
 *            negative and we get screwed up.  If k>128, k+127
 *            overflows the 8-bit exponent and sets the sign bit.  So
 *            for x' (base 2) < -127.5 we must definitely return e^x ~
 *            0; for x' < -126.5 we're going to calculate 0 anyway
 *            (because k=floor(-126.5-epsilon+0.5) = -127).  So any
 *            minlogf between -126.5 log2 ... -127.5 log2 will suffice
 *            as the cutoff. Ditto for 126.5 log2 .. 127.5 log2.
 *            That's 87.68312 .. 88.3762655.  I think Pommier's
 *            thinking is, you don't want to get too close to the
 *            edges, lest fp roundoff error screw you (he may have
 *            considered 1 ulp carefully, I can't tell), but otherwise
 *            you may as well put your bounds close to the outer edge;
 *            so 
 *              maxlogf =  127.5 log(2) - epsilon 
 *              minlogf = -127.5 log(2) + epsilon 
 *            for an epsilon that happens to be ~ 3e-6.
 */
__m128 
esl_sse_expf(__m128 x) 
{
  static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f, 
			       4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f };
  static float cephes_c[2] = { 0.693359375f,    -2.12194440e-4f };
  static float maxlogf     =  88.3762626647949f;  /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
  static float minlogf     = -88.3762626647949f;  /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
  __m128i k;
  __m128  mask, tmp, fx, z, y, minmask, maxmask;
  
  /* handle out-of-range and special conditions */
  maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
  minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf));

  /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
  fx = _mm_mul_ps(x,  _mm_set1_ps(eslCONST_LOG2R));
  fx = _mm_add_ps(fx, _mm_set1_ps(0.5f));

  /* floorf() with SSE:  */
  k    = _mm_cvttps_epi32(fx);	              /* cast to int with truncation                  */
  tmp  = _mm_cvtepi32_ps(k);	              /* cast back to float                           */
  mask = _mm_cmpgt_ps(tmp, fx);               /* if it increased (i.e. if it was negative...) */
  mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch...           */
  fx   = _mm_sub_ps(tmp, mask);	              /* then subtract one.                           */
  k    = _mm_cvttps_epi32(fx);	              /* k is now ready for the 2^k part.             */
  
  /* polynomial approx for e^f for f in range [-0.5, 0.5] */
  tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0]));
  z   = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1]));
  x   = _mm_sub_ps(x, tmp);
  x   = _mm_sub_ps(x, z);
  z   = _mm_mul_ps(x, x);
  
  y =               _mm_set1_ps(cephes_p[0]);    y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));   y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));   y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));   y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));   y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));   y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(1.0f));

  /* build 2^k by hand, by creating a IEEE754 float */
  k  = _mm_add_epi32(k, _mm_set1_epi32(127));
  k  = _mm_slli_epi32(k, 23);
  fx = _mm_castsi128_ps(k);
  
  /* put 2^k e^f together (fx = 2^k,  y = e^f) and we're done */
  y = _mm_mul_ps(y, fx);	

  /* special/range cleanup */
  y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128)  */
  y = esl_sse_select_ps(y, _mm_set1_ps(0.0f),        minmask); /* exp(x) = 0   for x < log(2^-149) */
  return y;
}
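
The "build 2^k by hand" step above is easy to check in scalar form. A minimal sketch (the helper name is ours, not Easel's), assuming IEEE754 single precision:

#include <stdint.h>
#include <string.h>

/* Scalar model of the (k+127)<<23 trick: place k+127 in the 8-bit
 * exponent field, leaving sign and mantissa zero. Valid only for
 * -127 <= k <= 128, which is exactly what minlogf/maxlogf guarantee:
 * k = -127 yields 0.0f, k = 128 yields +inf. */
static float pow2f_bits(int k)
{
    uint32_t bits = (uint32_t)(k + 127) << 23;
    float f;
    memcpy(&f, &bits, sizeof f);   /* bit reinterpretation, like _mm_castsi128_ps */
    return f;                      /* e.g. k=0 -> 1.0f, k=1 -> 2.0f */
}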
Example #2
__m128 exp_128(
  const __m128& x) {

  //! Clip the value
  __m128 y = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(88.3762626647949f)),
                                      _mm_set1_ps(-88.3762626647949f));

  //! Express exp(x) as exp(g + n * log(2))
  __m128 fx = y * _mm_set1_ps(1.44269504088896341f) + _mm_set1_ps(0.5f);

  //! Truncate toward zero (fixed up to a proper floor just below)
  const __m128 tmp = _mm_round_ps(fx, _MM_FROUND_TO_ZERO);

  //! If greater, subtract 1
  const __m128 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), _mm_set1_ps(1.f));
  fx = tmp - mask;

  y -= fx * _mm_set1_ps(0.693359375 - 2.12194440e-4);
  const __m128 z = y * y;


  const __m128 t = (((((_mm_set1_ps(1.9875691500E-4)  * y +
                        _mm_set1_ps(1.3981999507E-3)) * y +
                        _mm_set1_ps(8.3334519073E-3)) * y +
                        _mm_set1_ps(4.1665795894E-2)) * y +
                        _mm_set1_ps(1.6666665459E-1)) * y +
                        _mm_set1_ps(5.0000001201E-1)) * z + y + _mm_set1_ps(1.f);

  //! Build 2^n
  const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(fx), _mm_set1_epi32(0x7f));

  //! Return the result
  return t * _mm_castsi128_ps(_mm_slli_epi32(emm0, 23));
}
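
Since this version already requires SSE4.1 for _mm_round_ps, the truncate-then-fix-up sequence could be collapsed into a single rounding toward negative infinity. A sketch (not the original author's code):

static inline __m128 floor_ps(__m128 v) {
  //! Equivalent to _mm_floor_ps(v), up to exception-flag behavior
  return _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}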
Example #3
void ColorFragment(MSR_FShaderParameters *params)
{
    MSR_SSEColor3 &out = params->output;
    const MSR_ShaderGlobals *globals = params->globals;

    // Get the texture
    MSR_SSEColor3 tex = MSR_Tex2D_Wrap(params->globals->tex0, params->varyings[0], params->varyings[1]);

    // Depth
    params->varyings[10] = _mm_rcp_ps( *params->varyings[10] );
    params->varyings[8] *= params->varyings[10];
    params->varyings[9] = SSE_ONE - (params->varyings[9] * params->varyings[10]);
    MSR_SSEFloat shadow = MSR_Tex2D_F32(shadow_map_depth, params->varyings[8], params->varyings[9]);

    // Assemble the light direction
    const MSR_Vec4 &ld = params->globals->lights[0].direction;
    MSR_SSEVec3 LightDir( MSR_SSEFloat(ld.x), MSR_SSEFloat(ld.y), MSR_SSEFloat(ld.z) );

    params->varyings[10] += bias_amt;
    float4 cmp = _mm_cmplt_ps(*shadow, *params->varyings[10]);

    // Get the eye vector
    MSR_SSEVec3 *Eye = (MSR_SSEVec3*)&params->varyings[2];
    Eye->Normalize();

    // Get the normal vector
    MSR_SSEVec3 *Normal = (MSR_SSEVec3*)&params->varyings[5];
    Normal->Normalize();

    // Diffuse
    MSR_SSEFloat diff = MSR_Clamp(Normal->Dot(LightDir), *SSE_ZERO, *SSE_ONE);

    // Specular
    MSR_SSEVec3 Reflect = ((diff * MSR_SSEFloat(2.0f)) * *Normal) - LightDir;
    Reflect.Normalize();

    MSR_SSEFloat spec = MSR_Clamp(Reflect.Dot(*Eye), *SSE_ZERO, *SSE_ONE);
    spec = spec * spec * spec * spec * spec * spec * spec;

    float4 cmp2 = _mm_cmpgt_ps( *diff, *SSE_ZERO );
    cmp = _mm_and_ps( cmp2, cmp );

    MSR_SSEFloat clamped_ao = MSR_Clamp( params->varyings[11], *SSE_ZERO, *SSE_ONE );
    out.r = tex.r * clamped_ao;
    out.g = tex.g * clamped_ao;
    out.b = tex.b * clamped_ao;

    MSR_SSEColor3 opt1, opt2;
    opt1.r = globals->ml_ambient[0].r;
    opt1.g = globals->ml_ambient[0].g;
    opt1.b = globals->ml_ambient[0].b;

    opt2.r = (diff * globals->ml_diffuse[0].r) + (spec * globals->ml_specular[0].r) + opt1.r;
    opt2.g = (diff * globals->ml_diffuse[0].g) + (spec * globals->ml_specular[0].g) + opt1.g;
    opt2.b = (diff * globals->ml_diffuse[0].b) + (spec * globals->ml_specular[0].b) + opt1.b;

    out.r *= _mm_or_ps( _mm_and_ps(*opt2.r,cmp), _mm_andnot_ps(cmp, *opt1.r) );
    out.g *= _mm_or_ps( _mm_and_ps(*opt2.g,cmp), _mm_andnot_ps(cmp, *opt1.g) );
    out.b *= _mm_or_ps( _mm_and_ps(*opt2.b,cmp), _mm_andnot_ps(cmp, *opt1.b) );
}
Example #4
static inline __v4sf
gamma_2_2_to_linear_sse2 (__v4sf x)
{
  __v4sf curve = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
  __v4sf line = x * splat4f (1/12.92f);
  __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.04045f));
  return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
}
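
The final or/and/andnot line is the classic SSE select idiom, result = (mask & a) | (~mask & b). A reusable sketch (hypothetical helper; on SSE4.1 targets, _mm_blendv_ps(b, a, mask) is the single-instruction equivalent):

static inline __v4sf
select_v4sf (__v4sf mask, __v4sf a, __v4sf b)
{
  return _mm_or_ps (_mm_and_ps (mask, a), _mm_andnot_ps (mask, b));
}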
Example #5
static inline __v4sf
linear_to_gamma_2_2_sse2 (__v4sf x)
{
  __v4sf curve = sse_pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f);
  __v4sf line = x * splat4f (12.92f);
  __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f));
  return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
}
Example #6
    SIMDValue SIMDFloat32x4Operation::OpGreaterThan(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        x86Result.m128_value = _mm_cmpgt_ps(tmpaValue.m128_value, tmpbValue.m128_value); // a > b?

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example #7
File: SSE.hpp Project: Eynx/R3D
 // Normalization ----------------------------------------------------------------------
 static Float3 VFunction Normalize(const Float3& vector)
 {
     Vector vLengthSquared = Dot(vector, vector);
     Vector vEpsilon = _mm_cmpgt_ps(vLengthSquared, Constant::Epsilon);
     Vector vLengthReciprocal = _mm_rsqrt_ps(vLengthSquared);
     Vector vMultiplier = _mm_and_ps(vLengthReciprocal, vEpsilon);
     Vector vResult = _mm_mul_ps(vector, vMultiplier);
     return vResult;
 }
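
The epsilon comparison guards the zero vector: _mm_rsqrt_ps(0) yields +inf, and the _mm_and_ps with the comparison mask forces the multiplier to zero in that case. A scalar model of the same guard (illustrative only):

#include <math.h>

static float SafeInvLength(float lengthSquared, float epsilon)
{
    float r = 1.0f / sqrtf(lengthSquared);        /* +inf when lengthSquared == 0 */
    return (lengthSquared > epsilon) ? r : 0.0f;  /* the mask-and, in scalar form */
}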
Example #8
 void Detect32f(const HidHaarCascade & hid, size_t offset, const __m128 & norm, __m128i & result)
 {
     typedef HidHaarCascade Hid;
     const float * leaves = hid.leaves.data();
     const Hid::Node * node = hid.nodes.data();
     const Hid::Stage * stages = hid.stages.data();
     for (int i = 0, n = (int)hid.stages.size(); i < n; ++i)
     {
         const Hid::Stage & stage = stages[i];
         if (stage.canSkip)
             continue;
         const Hid::Node * end = node + stage.ntrees;
         __m128 stageSum = _mm_setzero_ps();
         if (stage.hasThree)
         {
             for (; node < end; ++node, leaves += 2)
             {
                 const Hid::Feature & feature = hid.features[node->featureIdx];
                 __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset));
                 if (feature.rect[2].p0)
                     sum = _mm_add_ps(sum, WeightedSum32f(feature.rect[2], offset));
                 StageSum32f(leaves, node->threshold, sum, norm, stageSum);
             }
         }
         else
         {
             for (; node < end; ++node, leaves += 2)
             {
                 const Hid::Feature & feature = hid.features[node->featureIdx];
                 __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset));
                 StageSum32f(leaves, node->threshold, sum, norm, stageSum);
             }
         }
         result = _mm_andnot_si128(_mm_castps_si128(_mm_cmpgt_ps(_mm_set1_ps(stage.threshold), stageSum)), result);
         int resultCount = ResultCount(result);
         if (resultCount == 0)
         {
             return;
         }
         else if (resultCount == 1)
         {
             uint32_t SIMD_ALIGNED(16) _result[4];
             float SIMD_ALIGNED(16) _norm[4];
             _mm_store_si128((__m128i*)_result, result);
             _mm_store_ps(_norm, norm);
             for (int j = 0; j < 4; ++j)
             {
                 if (_result[j])
                 {
                     _result[j] = Base::Detect32f(hid, offset + j, i + 1, _norm[j]) > 0 ? 1 : 0;
                     break;
                 }
             }
             result = _mm_load_si128((__m128i*)_result);
             return;
         }
     }
 }
Example #9
        template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128 & dx, const __m128 & dy, Buffer & buffer, size_t col)
        {
            __m128 bestDot = _mm_setzero_ps();
            __m128i bestIndex = _mm_setzero_si128();
            for(int i = 0; i < buffer.size; ++i)
            {
                __m128 dot = _mm_add_ps(_mm_mul_ps(dx, buffer.cos[i]), _mm_mul_ps(dy, buffer.sin[i]));
                __m128 mask = _mm_cmpgt_ps(dot, bestDot);
                bestDot = _mm_max_ps(dot, bestDot);
                bestIndex = Combine(_mm_castps_si128(mask), buffer.pos[i], bestIndex);

                dot = _mm_sub_ps(_mm_setzero_ps(), dot);
                mask = _mm_cmpgt_ps(dot, bestDot);
                bestDot = _mm_max_ps(dot, bestDot);
                bestIndex = Combine(_mm_castps_si128(mask), buffer.neg[i], bestIndex);
            }
            Store<align>((__m128i*)(buffer.index + col), bestIndex);
            Sse::Store<align>(buffer.value + col, _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy))));
        }
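
Combine is not shown here; in the Simd library it is presumably the integer form of the select idiom, merging the candidate histogram bin index into bestIndex wherever the new dot product won the comparison:

SIMD_INLINE __m128i Combine(__m128i mask, __m128i positive, __m128i negative)
{
    return _mm_or_si128(_mm_and_si128(mask, positive), _mm_andnot_si128(mask, negative));
}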
Example #10
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
{
  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
  const __m128 kThresh = _mm_set1_ps(aec->errThresh);
  const __m128 kMu = _mm_set1_ps(aec->mu);

  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);

    const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
    ef_re_if = _mm_and_ps(bigger, ef_re_if);
    ef_im_if = _mm_and_ps(bigger, ef_im_if);
    ef_re = _mm_andnot_ps(bigger, ef_re);
    ef_im = _mm_andnot_ps(bigger, ef_im);
    ef_re = _mm_or_ps(ef_re, ef_re_if);
    ef_im = _mm_or_ps(ef_im, ef_im_if);
    ef_re = _mm_mul_ps(ef_re, kMu);
    ef_im = _mm_mul_ps(ef_im, kMu);

    _mm_storeu_ps(&ef[0][i], ef_re);
    _mm_storeu_ps(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < (PART_LEN1); i++) {
    float absEf;
    ef[0][i] /= (aec->xPow[i] + 1e-10f);
    ef[1][i] /= (aec->xPow[i] + 1e-10f);
    absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (absEf > aec->errThresh) {
      absEf = aec->errThresh / (absEf + 1e-10f);
      ef[0][i] *= absEf;
      ef[1][i] *= absEf;
    }

    // Stepsize factor
    ef[0][i] *= aec->mu;
    ef[1][i] *= aec->mu;
  }
}
Example #11
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
	__m128 ms0 = _mm_load_ps(src);
	__m128 ms1 = _mm_load_ps(src + 4);
	const __m128 mm = _mm_set1_ps(0.5f);
	__m128 a = _mm_add_ps(ms0, ms1);
	__m128 b = _mm_sub_ps(ms0, ms1);

	__m128 t1 = _mm_unpacklo_ps(a, b);
	__m128 t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);

	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
	ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
	ms0 = _mm_blend_ps(ms0, a, 1);
#endif
	msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
	ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);

	a = _mm_add_ps(ms0, ms1);
	b = _mm_sub_ps(ms0, ms1);

	t1 = _mm_unpacklo_ps(a, b);
	t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	_mm_store_ps(dest, a);
	_mm_store_ps(dest + 4, b);
}
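
The _mm_and_ps with v32f_absmask is the standard SSE fabs: clearing each lane's sign bit before the magnitude-vs-threshold comparison. The same mask as a self-contained helper:

static inline __m128 abs_ps(__m128 v)
{
	return _mm_and_ps(v, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); // clear sign bits
}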
Example #12
static inline __m128 curve_vec4(
    const __m128 x,
    const __m128 g,
    const __m128 sigma,
    const __m128 shadows,
    const __m128 highlights,
    const __m128 clarity)
{
  // TODO: pull these non-data-dependent constants out of the loop to see
  // whether the compiler fails to do so
  const __m128 const0 = _mm_set_ps1(0x3f800000u);
  const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x
  const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 two = _mm_set1_ps(2.0f);
  const __m128 twothirds = _mm_set1_ps(2.0f/3.0f);
  const __m128 twosig = _mm_mul_ps(two, sigma);
  const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
  const __m128 s22 = _mm_mul_ps(twothirds, sigma2);

  const __m128 c = _mm_sub_ps(x, g);
  const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
  // select shadows or highlights as multiplier for linear part, based on c < 0
  const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
  // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma)
  const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
  // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma
  const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));

  const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(),
        _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
  const __m128 t2 = _mm_mul_ps(t, t);
  const __m128 mt = _mm_sub_ps(one, t);

  // midtone value fading over to linear part, without local contrast:
  const __m128 vmid = _mm_add_ps(g,
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
        _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));

  // c > 2*sigma?
  const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
  const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));

  // midtone local contrast
  // dt_fast_expf in sse:
  const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
  const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
  const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
  const __m128i ki = _mm_cvtps_epi32(k);
  const __m128 gauss = _mm_load_ps((float*)&ki);
  const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
  return _mm_add_ps(val, vcon);
}
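
The const0/const1 pair encodes the fast-exp trick used by dt_fast_expf: the IEEE754 bit patterns of 1.0f (0x3f800000) and of e (0x402DF854) are interpolated linearly in integer space, so rounding the result to an int and reloading those bits as a float approximates e^x. A scalar model (a sketch, not darktable's code):

#include <math.h>
#include <stdint.h>
#include <string.h>

static float fast_expf_model(float x)
{
  const float i1 = (float)0x3f800000u;  // bits of 1.0f
  const float ie = (float)0x402DF854u;  // bits of expf(1.0f)
  float k = i1 + x * (ie - i1);         // interpolate between the bit patterns
  if (k < 0.0f) k = 0.0f;               // clamp, as the _mm_max_ps above does
  uint32_t bits = (uint32_t)lrintf(k);  // _mm_cvtps_epi32 also rounds to nearest
  float r;
  memcpy(&r, &bits, sizeof r);          // reload the integer bits as a float
  return r;
}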
Example #13
// a > b
void _SIMD_cmpgt_ps(__SIMD a, __SIMD b, void** resultPtr)
{
  __SIMD* result = (__SIMD*)malloc(sizeof(__SIMD));
  *resultPtr = result;
#ifdef  USE_SSE
  *result = _mm_cmpgt_ps(a,b);
#elif defined USE_AVX
  *result = _mm256_cmp_ps(a,b,30);
#elif defined USE_IBM
  *result = vec_cmpgt(a,b);
#endif
}
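
Note the wrapper heap-allocates its result, so the caller owns the buffer. A usage sketch under the USE_SSE configuration (where __SIMD is presumably __m128):

#include <stdlib.h>

static __m128 cmpgt_and_free(__m128 a, __m128 b)
{
  void *p = NULL;
  _SIMD_cmpgt_ps(a, b, &p);   /* allocates and fills the result */
  __m128 r = *(__m128 *)p;
  free(p);                    /* caller must release it */
  return r;
}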
Example #14
__m128 exp_ps(__m128 x) {
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4sf tmp = _mm_setzero_ps(), fx;
    v4si emm0;
    v4sf one = constants::ps_1.ps;

    x = _mm_min_ps(x, constants::exp_hi.ps);
    x = _mm_max_ps(x, constants::exp_lo.ps);

    /* express exp(x) as exp(g + n*log(2)) */
    fx = _mm_mul_ps(x,  constants::cephes_LOG2EF.ps);
    fx = _mm_add_ps(fx, constants::ps_0p5.ps);

    /* how to perform a floorf with SSE: truncate, then fix up just below */
    emm0 = _mm_cvttps_epi32(fx);
    tmp  = _mm_cvtepi32_ps(emm0);
    /* if greater, subtract 1 */
    v4sf mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = _mm_sub_ps(tmp, mask);

    tmp = _mm_mul_ps(fx, constants::cephes_exp_C1.ps);
    v4sf z = _mm_mul_ps(fx, constants::cephes_exp_C2.ps);
    x = _mm_sub_ps(x, tmp);
    x = _mm_sub_ps(x, z);

    z = _mm_mul_ps(x,x);

    v4sf y = constants::cephes_exp_p0.ps;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p1.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p2.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p3.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p4.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p5.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, one);

    /* build 2^n */
    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, constants::pi32_0x7f.pi);
    emm0 = _mm_slli_epi32(emm0, 23);
    v4sf pow2n = _mm_castsi128_ps(emm0);
    y = _mm_mul_ps(y, pow2n);
    return y;
}
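
After the range reduction exp(x) = 2^n * e^f with |f| <= 0.5*log(2), the multiply/add chain above is a Horner evaluation of a degree-5 polynomial in f. The same chain in scalar form (a model, not library code):

static float expf_poly(float f)
{
    const float p0 = 1.9875691500E-4f, p1 = 1.3981999507E-3f,
                p2 = 8.3334519073E-3f, p3 = 4.1665795894E-2f,
                p4 = 1.6666665459E-1f, p5 = 5.0000001201E-1f;
    float y = p0;
    y = y * f + p1;
    y = y * f + p2;
    y = y * f + p3;
    y = y * f + p4;
    y = y * f + p5;                  /* y ~= (e^f - 1 - f) / f^2 */
    return y * (f * f) + f + 1.0f;   /* e^f ~= 1 + f + f^2 * y   */
}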
Example #15
void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh)
{
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	const __m128 zeros = _mm_setzero_ps();
	const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
	const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	__m128 p03 = _mm_add_ps(s0, s3);
	__m128 p12 = _mm_add_ps(s1, s2);
	__m128 m03 = _mm_sub_ps(s0, s3);
	__m128 m12 = _mm_sub_ps(s1, s2);

	__m128 v = _mm_add_ps(p03, p12);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	// keep 00 coef.
	__m128 v2 = _mm_blendv_ps(zeros, v, msk);
	v2 = _mm_blend_ps(v2, v, 1);
	_mm_store_ps(d, v2);

	v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 4, v);

	v = _mm_sub_ps(p03, p12);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 8, v);

	v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 12, v);
}
Example #16
static inline __m128 lab_f_inv_m(const __m128 x)
{
  const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f);
  const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f);
  const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f);

  // x > epsilon
  const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x);
  // x <= epsilon
  const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16);

  // blend results according to whether each component is > epsilon or not
  const __m128 mask = _mm_cmpgt_ps(x, epsilon);
  return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}
Example #17
	// Operators
	INLINE bool SVec4::operator==(const SVec4 &rhs) const
	{
#ifdef USE_SSE
		SIMDvec dif = _mm_sub_ps( m_128, rhs.m_128 );
		SIMDvec ep = _mm_set1_ps( math::Epsilon );
		SIMDvec neg_ep = _mm_set1_ps( -math::Epsilon );

		return ( 0xf == _mm_movemask_ps( _mm_and_ps( _mm_cmpgt_ps( ep, dif ), _mm_cmplt_ps( neg_ep, dif ) ) ) );
#else
        return math::IsEqual(m_x, rhs.X()) &&
               math::IsEqual(m_y, rhs.Y()) &&
               math::IsEqual(m_z, rhs.Z()) &&
               math::IsEqual(m_w, rhs.W());
#endif
	}
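
The movemask test packs each lane's sign bit into the low four bits of an int, so 0xf means the comparison held in all four lanes. The idiom in isolation:

static inline bool AllLanesTrue(__m128 mask)
{
	return _mm_movemask_ps(mask) == 0xf; // sign bit set in every lane
}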
Example #18
v4sf exp_ps(v4sf x) {
    v4sf tmp = _mm_setzero_ps(), fx;
    v4si emm0;
    v4sf one = *(v4sf*)_ps_1;

    x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
    x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

    fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
    fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

    emm0 = _mm_cvttps_epi32(fx);
    tmp  = _mm_cvtepi32_ps(emm0);

    v4sf mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = _mm_sub_ps(tmp, mask);

    tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
    v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
    x = _mm_sub_ps(x, tmp);
    x = _mm_sub_ps(x, z);

    z = _mm_mul_ps(x,x);

    v4sf y = *(v4sf*)_ps_cephes_exp_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, one);

    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
    emm0 = _mm_slli_epi32(emm0, 23);
    v4sf pow2n = _mm_castsi128_ps(emm0);
    y = _mm_mul_ps(y, pow2n);
    return y;
}
Example #19
static inline __m128
bicubic_sse(__m128 width, __m128 t)
{
    static const __m128 half  = { .5f, .5f, .5f, .5f};
    static const __m128 one   = { 1.f, 1.f, 1.f, 1.f};
    static const __m128 two   = { 2.f, 2.f, 2.f, 2.f};
    static const __m128 three = { 3.f, 3.f, 3.f, 3.f};
    static const __m128 four  = { 4.f, 4.f, 4.f, 4.f};
    static const __m128 five  = { 5.f, 5.f, 5.f, 5.f};
    static const __m128 eight = { 8.f, 8.f, 8.f, 8.f};

    t = _mm_abs_ps(t);
    __m128 t2 = _mm_mul_ps(t, t);

    /* Compute 1 < t < 2 case:
     * 0.5f*(t*(-t2 + 5.f*t - 8.f) + 4.f)
     * half*(t*(mt2 + t5 - eight) + four)
     * half*(t*(mt2 + t5_sub_8) + four)
     * half*(t*(mt2_add_t5_sub_8) + four) */
    __m128 t5 = _mm_mul_ps(five, t);
    __m128 t5_sub_8 = _mm_sub_ps(t5, eight);
    __m128 zero = _mm_setzero_ps();
    __m128 mt2 = _mm_sub_ps(zero, t2);
    __m128 mt2_add_t5_sub_8 = _mm_add_ps(mt2, t5_sub_8);
    __m128 a = _mm_mul_ps(t, mt2_add_t5_sub_8);
    __m128 b = _mm_add_ps(a, four);
    __m128 r12 = _mm_mul_ps(b, half);

    /* Compute case < 1
     * 0.5f*(t*(3.f*t2 - 5.f*t) + 2.f) */
    __m128 t23 = _mm_mul_ps(three, t2);
    __m128 c = _mm_sub_ps(t23, t5);
    __m128 d = _mm_mul_ps(t, c);
    __m128 e = _mm_add_ps(d, two);
    __m128 r01 = _mm_mul_ps(half, e);

    // Compute masks for keeping the correct components
    __m128 mask01 = _mm_cmple_ps(t, one);
    __m128 mask12 = _mm_cmpgt_ps(t, one);
    r01 = _mm_and_ps(mask01, r01);
    r12 = _mm_and_ps(mask12, r12);


    return _mm_or_ps(r01, r12);
}
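
_mm_abs_ps is not a standard SSE intrinsic; the surrounding project presumably defines it as the usual sign-bit clear, along these lines:

static inline __m128
_mm_abs_ps(__m128 v)
{
    return _mm_andnot_ps(_mm_set1_ps(-0.f), v); /* -0.f is just the sign bit */
}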
Example #20
void zlimit (int simd, float *src , float *dst , size_t size) {
  if (simd) {
    __m128 zero4 = _mm_set1_ps(0.f);
    while ( size >= 4) {
      __m128 srcv = _mm_loadu_ps(src);
      __m128 cmpv = _mm_cmpgt_ps(srcv, zero4);
      __m128 dstv = _mm_and_ps(cmpv, srcv);
      _mm_storeu_ps(dst, dstv);
      src += 4; dst += 4; size -= 4;
    }
  }
  // scalar path; also handles the tail left over when size % 4 != 0
  while (size) {
    *dst = *src > 0.f ? *src : 0.f ;
    src++; dst++; size--;
  }
}
Example #21
static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = (const float*)_src.data;
    float* dst = (float*)_dst.data;
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    IppiSize sz = { roi.width, roi.height };
    switch( type )
    {
    case THRESH_TRUNC:
        if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO:
        if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO_INV:
        if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            return;
        setIppErrorStatus();
        break;
    }
#endif

    switch( type )
    {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;

        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;

        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);
            }
            break;

        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;

        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif
                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
        default:
            return CV_Error( CV_StsBadArg, "" );
    }
}
Example #22
void BM3D_Basic_Process::CollaborativeFilter(int plane,
    FLType *ResNum, FLType *ResDen,
    const FLType *src, const FLType *ref,
    const PosPairCode &code) const
{
    PCType GroupSize = static_cast<PCType>(code.size());
    // When para.GroupSize > 0, limit GroupSize up to para.GroupSize
    if (d.para.GroupSize > 0 && GroupSize > d.para.GroupSize)
    {
        GroupSize = d.para.GroupSize;
    }

    // Construct source group guided by matched pos code
    block_group srcGroup(src, src_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize);

    // Initialize retained coefficients of hard threshold filtering
    int retainedCoefs = 0;

    // Apply forward 3D transform to the source group
    d.f[plane].fp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());

    // Apply hard-thresholding to the source group
    auto srcp = srcGroup.data();
    auto thrp = d.f[plane].thrTable[GroupSize - 1].get();
    const auto upper = srcp + srcGroup.size();

#if defined(__SSE2__)
    static const ptrdiff_t simd_step = 4;
    const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
    const ptrdiff_t simd_width = srcGroup.size() - simd_residue;

    static const __m128 zero_ps = _mm_setzero_ps();
    __m128i cmp_sum = _mm_setzero_si128();

    for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step)
    {
        const __m128 s1 = _mm_load_ps(srcp);
        const __m128 t1p = _mm_load_ps(thrp);
        const __m128 t1n = _mm_sub_ps(zero_ps, t1p);

        const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p);
        const __m128 cmp2 = _mm_cmplt_ps(s1, t1n);
        const __m128 cmp = _mm_or_ps(cmp1, cmp2);

        const __m128 d1 = _mm_and_ps(cmp, s1);
        _mm_store_ps(srcp, d1);
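        // each lane of cmp is all-ones (-1 as int32) when the coefficient survives,
        // so subtracting the mask counts retained coefficients per lane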
        cmp_sum = _mm_sub_epi32(cmp_sum, _mm_castps_si128(cmp));
    }

    alignas(16) int32_t cmp_sum_i32[4];
    _mm_store_si128(reinterpret_cast<__m128i *>(cmp_sum_i32), cmp_sum);
    retainedCoefs += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3];
#endif

    for (; srcp < upper; ++srcp, ++thrp)
    {
        if (*srcp > *thrp || *srcp < -*thrp)
        {
            ++retainedCoefs;
        }
        else
        {
            *srcp = 0;
        }
    }

    // Apply backward 3D transform to the filtered group
    d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());

    // Calculate weight for the filtered group
    // Also include the normalization factor to compensate for the amplification introduced in 3D transform
    FLType denWeight = retainedCoefs < 1 ? 1 : FLType(1) / static_cast<FLType>(retainedCoefs);
    FLType numWeight = static_cast<FLType>(denWeight / d.f[plane].finalAMP[GroupSize - 1]);

    // Store the weighted filtered group to the numerator part of the basic estimation
    // Store the weight to the denominator part of the basic estimation
    srcGroup.AddTo(ResNum, dst_stride[plane], numWeight);
    srcGroup.CountTo(ResDen, dst_stride[plane], denWeight);
}
Example #23
  __declspec(dllexport) DiffResult __stdcall diff_img(Image left, Image right, DiffOptions options) {
    if (options.ignoreColor) {
      makeGreyscale(left);
      makeGreyscale(right);
    }

    float* imgMem = (float*)_aligned_malloc(left.width * left.height * sizeof(float) * 4, 16);
    int colorOffset = left.width * left.height;
    Image diff = { left.width, left.height, left.stride, imgMem, imgMem + colorOffset, imgMem + colorOffset * 2, imgMem + colorOffset * 3 };

    float* drp = diff.r;
    float* dgp = diff.g;
    float* dbp = diff.b;
    float* dap = diff.a;

    float* lrp = left.r;
    float* lgp = left.g;
    float* lbp = left.b;
    float* lap = left.a;

    float* rrp = right.r;
    float* rgp = right.g;
    float* rbp = right.b;
    float* rap = right.a;

    Color error = ConvertToFloat(options.errorColor);

    auto er = _mm_set_ps1(error.r);
    auto eg = _mm_set_ps1(error.g);
    auto eb = _mm_set_ps1(error.b);
    auto ea = _mm_set_ps1(error.a);

    auto tolerance = _mm_set_ps1(options.tolerance);
    auto overlayTransparency = _mm_set_ps1(options.overlayTransparency);

    OverlayType overlayType = options.overlayType;
    byte weightByDiffPercentage = options.weightByDiffPercentage;

    auto diffPixelCount = _mm_set_epi32(0, 0, 0, 0);
    auto onei = _mm_set1_epi32(1);
    auto one = _mm_set1_ps(1);
    auto zero = _mm_set1_ps(0);

    for (int y = 0; y < left.height; y++) {
      for (int x = 0; x < left.width; x+=4) {
        auto lr = _mm_load_ps(lrp);
        auto lg = _mm_load_ps(lgp);
        auto lb = _mm_load_ps(lbp);
        auto la = _mm_load_ps(lap);

        auto rr = _mm_load_ps(rrp);
        auto rg = _mm_load_ps(rgp);
        auto rb = _mm_load_ps(rbp);
        auto ra = _mm_load_ps(rap);

        auto rdiff = _mm_sub_ps(rr, lr);
        auto gdiff = _mm_sub_ps(rg, lg);
        auto bdiff = _mm_sub_ps(rb, lb);
        auto adiff = _mm_sub_ps(ra, la);

        auto distance = _mm_mul_ps(rdiff, rdiff);
        distance = _mm_add_ps(distance, _mm_mul_ps(gdiff, gdiff));
        distance = _mm_add_ps(distance, _mm_mul_ps(bdiff, bdiff));
        distance = _mm_add_ps(distance, _mm_mul_ps(adiff, adiff));
        distance = _mm_sqrt_ps(distance);

        auto t = overlayTransparency;
        if (weightByDiffPercentage) {
          t = _mm_mul_ps(t, distance);
        }

        auto isdiff = _mm_cmpgt_ps(distance, tolerance);

        t = _mm_min_ps(one, _mm_max_ps(zero, t));
        auto mlr = rr;
        auto mlg = rg;
        auto mlb = rb;
        auto mla = ra;

        if (overlayType == OverlayType::Movement) {
          mlr = _mm_mul_ps(mlr, er);
          mlg = _mm_mul_ps(mlg, eg);
          mlb = _mm_mul_ps(mlb, eb);
          mla = _mm_mul_ps(mla, ea);
        }

        auto oneMinusT = _mm_sub_ps(one, t);

        auto mixedR = _mm_add_ps(_mm_mul_ps(mlr, oneMinusT), _mm_mul_ps(er, t));
        auto mixedG = _mm_add_ps(_mm_mul_ps(mlg, oneMinusT), _mm_mul_ps(eg, t));
        auto mixedB = _mm_add_ps(_mm_mul_ps(mlb, oneMinusT), _mm_mul_ps(eb, t));
        auto mixedA = one;

        if (overlayType != OverlayType::Movement) {
          mixedA = _mm_add_ps(_mm_mul_ps(mla, oneMinusT), _mm_mul_ps(ea, t));
        }

        // (((b ^ a) & mask)^a)
        auto dr = _mm_xor_ps(lr, _mm_and_ps(isdiff, _mm_xor_ps(mixedR, lr)));
        auto dg = _mm_xor_ps(lg, _mm_and_ps(isdiff, _mm_xor_ps(mixedG, lg)));
        auto db = _mm_xor_ps(lb, _mm_and_ps(isdiff, _mm_xor_ps(mixedB, lb)));
        auto da = _mm_xor_ps(la, _mm_and_ps(isdiff, _mm_xor_ps(mixedA, la)));

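        // masked increment: lanes where isdiff is all-ones take diffPixelCount+1
        // via the same xor-select trick; other lanes keep their old count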
        diffPixelCount = _mm_xor_si128(diffPixelCount,
          _mm_and_si128(_mm_castps_si128(isdiff),
            _mm_xor_si128(_mm_add_epi32(diffPixelCount, onei),
              diffPixelCount)));

        _mm_store_ps(drp, dr);
        _mm_store_ps(dgp, dg);
        _mm_store_ps(dbp, db);
        _mm_store_ps(dap, da);

        drp+=4;
        dgp+=4;
        dbp+=4;
        dap+=4;

        lrp+=4;
        lgp+=4;
        lbp+=4;
        lap+=4;

        rrp+=4;
        rgp+=4;
        rbp+=4;
        rap+=4;
      }
    }

    int* pixelCounts = (int*)_aligned_malloc(4 * sizeof(int), 16);
    _mm_store_si128((__m128i*)pixelCounts, diffPixelCount);

    int totalCount = pixelCounts[0] + pixelCounts[1] + pixelCounts[2] + pixelCounts[3];
    _aligned_free(pixelCounts);

    return{ diff, 1.0f - float(totalCount) / (left.height * left.width - left.height * left.stride) };
  }
Example #24
/*
============
R_DecalPointCullStatic
============
*/
static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, const idDrawVert * verts, const int numVerts ) {
	assert_16_byte_aligned( cullBits );
	assert_16_byte_aligned( verts );


	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );

	const __m128 vector_float_zero	= { 0.0f, 0.0f, 0.0f, 0.0f };
	const __m128i vector_int_mask0	= _mm_set1_epi32( 1 << 0 );
	const __m128i vector_int_mask1	= _mm_set1_epi32( 1 << 1 );
	const __m128i vector_int_mask2	= _mm_set1_epi32( 1 << 2 );
	const __m128i vector_int_mask3	= _mm_set1_epi32( 1 << 3 );
	const __m128i vector_int_mask4	= _mm_set1_epi32( 1 << 4 );
	const __m128i vector_int_mask5	= _mm_set1_epi32( 1 << 5 );

	const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() );
	const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() );
	const __m128 p2 = _mm_loadu_ps( planes[2].ToFloatPtr() );
	const __m128 p3 = _mm_loadu_ps( planes[3].ToFloatPtr() );
	const __m128 p4 = _mm_loadu_ps( planes[4].ToFloatPtr() );
	const __m128 p5 = _mm_loadu_ps( planes[5].ToFloatPtr() );

	const __m128 p0X = _mm_splat_ps( p0, 0 );
	const __m128 p0Y = _mm_splat_ps( p0, 1 );
	const __m128 p0Z = _mm_splat_ps( p0, 2 );
	const __m128 p0W = _mm_splat_ps( p0, 3 );

	const __m128 p1X = _mm_splat_ps( p1, 0 );
	const __m128 p1Y = _mm_splat_ps( p1, 1 );
	const __m128 p1Z = _mm_splat_ps( p1, 2 );
	const __m128 p1W = _mm_splat_ps( p1, 3 );

	const __m128 p2X = _mm_splat_ps( p2, 0 );
	const __m128 p2Y = _mm_splat_ps( p2, 1 );
	const __m128 p2Z = _mm_splat_ps( p2, 2 );
	const __m128 p2W = _mm_splat_ps( p2, 3 );

	const __m128 p3X = _mm_splat_ps( p3, 0 );
	const __m128 p3Y = _mm_splat_ps( p3, 1 );
	const __m128 p3Z = _mm_splat_ps( p3, 2 );
	const __m128 p3W = _mm_splat_ps( p3, 3 );

	const __m128 p4X = _mm_splat_ps( p4, 0 );
	const __m128 p4Y = _mm_splat_ps( p4, 1 );
	const __m128 p4Z = _mm_splat_ps( p4, 2 );
	const __m128 p4W = _mm_splat_ps( p4, 3 );

	const __m128 p5X = _mm_splat_ps( p5, 0 );
	const __m128 p5Y = _mm_splat_ps( p5, 1 );
	const __m128 p5Z = _mm_splat_ps( p5, 2 );
	const __m128 p5W = _mm_splat_ps( p5, 3 );

	for ( int i = 0; i < numVerts; ) {

		const int nextNumVerts = vertsODS.FetchNextBatch() - 4;

		for ( ; i <= nextNumVerts; i += 4 ) {
			const __m128 v0 = _mm_load_ps( vertsODS[i + 0].xyz.ToFloatPtr() );
			const __m128 v1 = _mm_load_ps( vertsODS[i + 1].xyz.ToFloatPtr() );
			const __m128 v2 = _mm_load_ps( vertsODS[i + 2].xyz.ToFloatPtr() );
			const __m128 v3 = _mm_load_ps( vertsODS[i + 3].xyz.ToFloatPtr() );

			const __m128 r0 = _mm_unpacklo_ps( v0, v2 );	// v0.x, v2.x, v0.z, v2.z
			const __m128 r1 = _mm_unpackhi_ps( v0, v2 );	// v0.y, v2.y, v0.w, v2.w
			const __m128 r2 = _mm_unpacklo_ps( v1, v3 );	// v1.x, v3.x, v1.z, v3.z
			const __m128 r3 = _mm_unpackhi_ps( v1, v3 );	// v1.y, v3.y, v1.w, v3.w

			const __m128 vX = _mm_unpacklo_ps( r0, r2 );	// v0.x, v1.x, v2.x, v3.x
			const __m128 vY = _mm_unpackhi_ps( r0, r2 );	// v0.y, v1.y, v2.y, v3.y
			const __m128 vZ = _mm_unpacklo_ps( r1, r3 );	// v0.z, v1.z, v2.z, v3.z

			const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) );
			const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) );
			const __m128 d2 = _mm_madd_ps( vX, p2X, _mm_madd_ps( vY, p2Y, _mm_madd_ps( vZ, p2Z, p2W ) ) );
			const __m128 d3 = _mm_madd_ps( vX, p3X, _mm_madd_ps( vY, p3Y, _mm_madd_ps( vZ, p3Z, p3W ) ) );
			const __m128 d4 = _mm_madd_ps( vX, p4X, _mm_madd_ps( vY, p4Y, _mm_madd_ps( vZ, p4Z, p4W ) ) );
			const __m128 d5 = _mm_madd_ps( vX, p5X, _mm_madd_ps( vY, p5Y, _mm_madd_ps( vZ, p5Z, p5W ) ) );

			__m128i c0 = __m128c( _mm_cmpgt_ps( d0, vector_float_zero ) );
			__m128i c1 = __m128c( _mm_cmpgt_ps( d1, vector_float_zero ) );
			__m128i c2 = __m128c( _mm_cmpgt_ps( d2, vector_float_zero ) );
			__m128i c3 = __m128c( _mm_cmpgt_ps( d3, vector_float_zero ) );
			__m128i c4 = __m128c( _mm_cmpgt_ps( d4, vector_float_zero ) );
			__m128i c5 = __m128c( _mm_cmpgt_ps( d5, vector_float_zero ) );

			c0 = _mm_and_si128( c0, vector_int_mask0 );
			c1 = _mm_and_si128( c1, vector_int_mask1 );
			c2 = _mm_and_si128( c2, vector_int_mask2 );
			c3 = _mm_and_si128( c3, vector_int_mask3 );
			c4 = _mm_and_si128( c4, vector_int_mask4 );
			c5 = _mm_and_si128( c5, vector_int_mask5 );

			c0 = _mm_or_si128( c0, c1 );
			c2 = _mm_or_si128( c2, c3 );
			c4 = _mm_or_si128( c4, c5 );

			c0 = _mm_or_si128( c0, c2 );
			c0 = _mm_or_si128( c0, c4 );

			__m128i s0 = _mm_packs_epi32( c0, c0 );
			__m128i b0 = _mm_packus_epi16( s0, s0 );

			*(unsigned int *)&cullBits[i] = _mm_cvtsi128_si32( b0 );
		}
	}

}
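
_mm_splat_ps and _mm_madd_ps are idTech helper macros rather than hardware intrinsics; plausible definitions (a sketch of what the engine presumably provides):

#define _mm_splat_ps( x, i )   _mm_shuffle_ps( (x), (x), _MM_SHUFFLE( i, i, i, i ) )
#define _mm_madd_ps( a, b, c ) _mm_add_ps( _mm_mul_ps( (a), (b) ), (c) )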
Example #25
void LightDesc_t::ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal,
										FourVectors &color, bool DoHalfLambert ) const
{
	FourVectors delta;
	
	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
	switch (m_Type)
	{
		case MATERIAL_LIGHT_POINT:
		case MATERIAL_LIGHT_SPOT:
			delta.DuplicateVector(m_Position);
			delta-=pos;
			break;
				
		case MATERIAL_LIGHT_DIRECTIONAL:
			delta.DuplicateVector(m_Direction);
			delta*=-1.0;
			break;
			
		default:
			delta.x = Four_Zeros;
			delta.y = Four_Zeros;
			delta.z = Four_Zeros;
			break;
	}

	__m128 dist2 = delta*delta;

	__m128 falloff;

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
	{
		falloff = MMReplicate(m_Attenuation0);
	}
	else
		falloff= Four_Epsilons;

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
	{
		falloff=_mm_add_ps(falloff,_mm_mul_ps(MMReplicate(m_Attenuation1),_mm_sqrt_ps(dist2)));
	}

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
	{
		falloff=_mm_add_ps(falloff,_mm_mul_ps(MMReplicate(m_Attenuation2),dist2));
	}

	falloff=_mm_rcp_ps(falloff);
	// Cull out light beyond this radius
	// now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format
	if (m_Range != 0.f)
	{
		__m128 RangeSquared=MMReplicate(m_RangeSquared); // !!speed!!
		falloff=_mm_and_ps(falloff,_mm_cmplt_ps(dist2,RangeSquared));
	}

	delta.VectorNormalizeFast();
	__m128 strength=delta*normal;
	if (DoHalfLambert)
	{
		strength=_mm_add_ps(_mm_mul_ps(strength,Four_PointFives),Four_PointFives);
	}
	else
		strength=_mm_max_ps(Four_Zeros,delta*normal);
		
	switch(m_Type)
	{
		case MATERIAL_LIGHT_POINT:
			// half-lambert
			break;
				
		case MATERIAL_LIGHT_SPOT:
		{
			__m128 dot2=_mm_sub_ps(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff


			__m128 cone_falloff_scale=_mm_mul_ps(MMReplicate(OneOver_ThetaDot_Minus_PhiDot),
												 _mm_sub_ps(dot2,MMReplicate(m_PhiDot)));
			cone_falloff_scale=_mm_min_ps(cone_falloff_scale,Four_Ones);
			
			if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
			{
				// !!speed!! could compute integer exponent needed by powsse and store in light
				cone_falloff_scale=PowSSE(cone_falloff_scale,m_Falloff);
			}
			strength=_mm_mul_ps(cone_falloff_scale,strength);

			// now, zero out lighting where dot2<phidot. This will mask out any invalid results
			// from pow function, etc
			__m128 OutsideMask=_mm_cmpgt_ps(dot2,MMReplicate(m_PhiDot)); // outside light cone?
			strength=_mm_and_ps(OutsideMask,strength);
		}
		break;
			
		case MATERIAL_LIGHT_DIRECTIONAL:
			break;
			
		default:
			break;

	}
	strength=_mm_mul_ps(strength,falloff);
	color.x=_mm_add_ps(color.x,_mm_mul_ps(strength,MMReplicate(m_Color.x)));
	color.y=_mm_add_ps(color.y,_mm_mul_ps(strength,MMReplicate(m_Color.y)));
	color.z=_mm_add_ps(color.z,_mm_mul_ps(strength,MMReplicate(m_Color.z)));
}
Example #26
static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
  const __m128 vec_one = _mm_set1_ps(1.0f);
  const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
  const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i+=4) {
    // Weight subbands
    __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
    const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
    const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
    const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(
        vec_weightCurve, vec_hNlFb);
    const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
    const __m128 vec_one_weightCurve_hNl = _mm_mul_ps(
        vec_one_weightCurve, vec_hNl);
    const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
    const __m128 vec_if1 = _mm_and_ps(
        bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
    vec_hNl = _mm_or_ps(vec_if0, vec_if1);

    {
      const __m128 vec_overDriveCurve = _mm_loadu_ps(
          &WebRtcAec_overDriveCurve[i]);
      const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps(
          vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
      _mm_storeu_ps(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
      __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
      vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
      _mm_storeu_ps(&efw[0][i], vec_efw_re);
      _mm_storeu_ps(&efw[1][i], vec_efw_im);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
          (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }
    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
Example #27
// comparison operators
RETf CMPGT(const __m128 x, const __m128 y) { return _mm_cmpgt_ps(x, y); }
Example #28
static int
forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc)
{
  register __m128 mpv, dpv, ipv;   /* previous row values                                       */
  register __m128 sv;		   /* temp storage of 1 curr row value in progress              */
  register __m128 dcv;		   /* delayed storage of D(i,q+1)                               */
  register __m128 xEv;		   /* E state: keeps max for Mk->E as we go                     */
  register __m128 xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  __m128   zerov;		   /* splatted 0.0's in a vector                                */
  float    xN, xE, xB, xC, xJ;	   /* special states' scores                                    */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over quads 0..nq-1                                */
  int j;			   /* counter over DD iterations (4 is full serialization)      */
  int Q       = p7O_NQF(om->M);	   /* segment length: # of vectors                              */
  __m128 *dpc = ox->dpf[0];        /* current row, for use in {MDI}MO(dpp,q) access macro       */
  __m128 *dpp;                     /* previous row, for use in {MDI}MO(dpp,q) access macro      */
  __m128 *rp;			   /* will point at om->rfv[x] for residue x[i]                 */
  __m128 *tp;			   /* will point into (and step thru) om->tfv                   */

  /* Initialization. */
  ox->M  = om->M;
  ox->L  = L;
  ox->has_own_scales = TRUE; 	/* all forward matrices control their own scalefactors */
  zerov  = _mm_setzero_ps();
  for (q = 0; q < Q; q++)
    MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov;
  xE    = ox->xmx[p7X_E] = 0.;
  xN    = ox->xmx[p7X_N] = 1.;
  xJ    = ox->xmx[p7X_J] = 0.;
  xB    = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE];
  xC    = ox->xmx[p7X_C] = 0.;

  ox->xmx[p7X_SCALE] = 1.0;
  ox->totscale       = 0.0;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=0, width=8, precision=5*/
#endif

  for (i = 1; i <= L; i++)
    {
      dpp   = dpc;                      
      dpc   = ox->dpf[do_full * i];     /* avoid conditional, use do_full as kronecker delta */
      rp    = om->rfv[dsq[i]];
      tp    = om->tfv;
      dcv   = _mm_setzero_ps();
      xEv   = _mm_setzero_ps();
      xBv   = _mm_set1_ps(xB);

      /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12.  Shift zeros on. */
      mpv   = esl_sse_rightshift_ps(MMO(dpp,Q-1), zerov);
      dpv   = esl_sse_rightshift_ps(DMO(dpp,Q-1), zerov);
      ipv   = esl_sse_rightshift_ps(IMO(dpp,Q-1), zerov);
      
      for (q = 0; q < Q; q++)
	{
	  /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */
	  sv   =                _mm_mul_ps(xBv, *tp);  tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(mpv, *tp)); tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(dpv, *tp)); tp++;
	  sv   = _mm_mul_ps(sv, *rp);                  rp++;
	  xEv  = _mm_add_ps(xEv, sv);
	  
	  /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
	   * {MDI}MX(q) is then the current, not the prev row
	   */
	  mpv = MMO(dpp,q);
	  dpv = DMO(dpp,q);
	  ipv = IMO(dpp,q);

	  /* Do the delayed stores of {MD}(i,q) now that memory is usable */
	  MMO(dpc,q) = sv;
	  DMO(dpc,q) = dcv;

	  /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv
	   */
	  dcv   = _mm_mul_ps(sv, *tp); tp++;

	  /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */
	  sv         =                _mm_mul_ps(mpv, *tp);  tp++;
	  IMO(dpc,q) = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
	}	  

      /* Now the DD paths. We would rather not serialize them but 
       * in an accurate Forward calculation, we have few options.
       */
      /* dcv has carried through from end of q loop above; store it 
       * in first pass, we add M->D and D->D path into DMX
       */
      /* We're almost certainly obligated to do at least one complete
       * DD path to be sure: 
       */
      dcv        = esl_sse_rightshift_ps(dcv, zerov);
      DMO(dpc,0) = zerov;
      tp         = om->tfv + 7*Q;	/* set tp to start of the DD's */
      for (q = 0; q < Q; q++) 
	{
	  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));	
	  dcv        = _mm_mul_ps(DMO(dpc,q), *tp); tp++; /* extend DMO(q), so we include M->D and D->D paths */
	}

      /* Now, on small models it seems best (empirically) to just go
       * ahead and serialize. On large models we can do a bit better,
       * by testing whether the dcv (DD path) accrued to DMO(q) is below
       * machine epsilon for all q, in which case we know the DMO(q) are
       * all at their final values. The tradeoff point is empirically
       * somewhere around M=100, at least on my desktop. We don't worry
       * about the conditional here; it's outside any inner loops.
       */
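      /* (A note on the j < 4 bound used below: it matches the four float
       * lanes of the striped layout. Within one pass, dcv's running product
       * extends a D->D chain across the q's; each rightshift between passes
       * lets the chain cross one more lane boundary, and a chain can cross
       * at most three, so four total passes -- the one above plus j = 1..3 --
       * cover any chain.)
       */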
      if (om->M < 100)
	{			/* Fully serialized version */
	  for (j = 1; j < 4; j++)
	    {
	      dcv = esl_sse_rightshift_ps(dcv, zerov);
	      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
	      for (q = 0; q < Q; q++) 
		{ /* note, extend dcv, not DMO(q); only adding DD paths now */
		  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));	
		  dcv        = _mm_mul_ps(dcv, *tp);   tp++; 
		}	    
	    }
	} 
      else
	{			/* Slightly parallelized version, but which incurs some overhead */
	  for (j = 1; j < 4; j++)
	    {
	      register __m128 cv;	/* keeps track of whether any DD's change DMO(q) */

	      dcv = esl_sse_rightshift_ps(dcv, zerov);
	      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
	      cv  = zerov;
	      for (q = 0; q < Q; q++) 
		{ /* using cmpgt below tests if DD changed any DMO(q) *without* conditional branch */
		  sv         = _mm_add_ps(dcv, DMO(dpc,q));	
		  cv         = _mm_or_ps(cv, _mm_cmpgt_ps(sv, DMO(dpc,q))); 
		  DMO(dpc,q) = sv;	                                    /* store new DMO(q) */
		  dcv        = _mm_mul_ps(dcv, *tp);   tp++;            /* note, extend dcv, not DMO(q) */
		}	    
	      if (! _mm_movemask_ps(cv)) break; /* DD's didn't change any DMO(q)? Then done, break out. */
	    }
	}

      /* Add D's to xEv */
      for (q = 0; q < Q; q++) xEv = _mm_add_ps(DMO(dpc,q), xEv);

      /* Finally the "special" states, which start from Mk->E (->C, ->J->B) */
      /* The following incantation is a horizontal sum of xEv's elements  */
      /* These must follow DD calculations, because D's contribute to E in Forward
       * (as opposed to Viterbi)
       */
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(0, 3, 2, 1)));
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(1, 0, 3, 2)));
      _mm_store_ss(&xE, xEv);
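      /* xE now holds the scalar sum xEv[0]+xEv[1]+xEv[2]+xEv[3]: the first
       * shuffle/add rotates the vector by one lane, the second by two, so
       * every lane ends up holding the full sum and _mm_store_ss() takes
       * lane 0.
       */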

      xN =  xN * om->xf[p7O_N][p7O_LOOP];
      xC = (xC * om->xf[p7O_C][p7O_LOOP]) +  (xE * om->xf[p7O_E][p7O_MOVE]);
      xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) +  (xE * om->xf[p7O_E][p7O_LOOP]);
      xB = (xJ * om->xf[p7O_J][p7O_MOVE]) +  (xN * om->xf[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Sparse rescaling. xE above threshold? trigger a rescaling event.            */
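      /* Dividing the whole row (and the specials) through by xE keeps the
       * probability-space values in float range; log(xE) is banked in
       * totscale, and the true score is recovered at the end as
       * totscale + log(xC * t(C->T)).
       */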
      if (xE > 1.0e4)	/* that's a little less than e^10, ~10% of our dynamic range */
	{
	  xN  = xN / xE;
	  xC  = xC / xE;
	  xJ  = xJ / xE;
	  xB  = xB / xE;
	  xEv = _mm_set1_ps(1.0 / xE);
	  for (q = 0; q < Q; q++)
	    {
	      MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv);
	      DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv);
	      IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv);
	    }
	  ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE;
	  ox->totscale += log(xE);
	  xE = 1.0;		
	}
      else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0;

      /* Storage of the specials.  We could've stored these already
       * but using xE, etc. variables makes it easy to convert this
       * code to O(M) memory versions just by deleting storage steps.
       */
      ox->xmx[i*p7X_NXCELLS+p7X_E] = xE;
      ox->xmx[i*p7X_NXCELLS+p7X_N] = xN;
      ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ;
      ox->xmx[i*p7X_NXCELLS+p7X_B] = xB;
      ox->xmx[i*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=i, width=9, precision=5 */
#endif
    } /* end loop over sequence residues 1..L */

  /* Finally C->T, and flip the total score back to log space (nats). */
  /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */
  /* On an underflow (which shouldn't happen), we raise an eslERANGE
   * exception too; the effect is to force the caller to rescore us
   * with full range.
   */
  if       (isnan(xC))        ESL_EXCEPTION(eslERANGE, "forward score is NaN");
  else if  (L>0 && xC == 0.0) ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)");     /* if L==0, xC *should* be 0.0; J5/118 */
  else if  (isinf(xC) == 1)   ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)");

  if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]);
  return eslOK;
}
Example #29
void BrushToolEdit::drawInner(const QPoint &pt, float strength)
{
    float fixedStrength = params.strength;
    strength *= fixedStrength;

    auto color = params.color;
    std::array<int, 3> colorParts = Terrain::expandColor(color);
    __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0);

    SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST);
    (void) roundingModeScope;

    switch (tool->type()) {
    case BrushType::Blur:
        drawBlur(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Smoothen:
        drawSmoothen(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Raise:
    case BrushType::Lower:
        if (tool->type() == BrushType::Lower) {
            fixedStrength = -fixedStrength;
            strength = -strength;
        }
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength *= 3.f;
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                (void) before;
                current -= tip * strength;
            });
            break;
        case BrushPressureMode::Constant:
            if (tool->type() == BrushType::Lower) {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength));
                });
            } else {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength));
                });
            }
            break;
        case BrushPressureMode::Adjustable:
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                current = Terrain::quantizeOne(before - tip * strength);
            });
            break;
        }
        break;
    case BrushType::Paint:
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength = 1.f - std::exp2(-strength);
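            // 1 - 2^(-s) maps unbounded airbrush strength s > 0 onto a blend
            // factor in (0,1): repeated strokes approach the target color
            // asymptotically instead of overshooting it.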

            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                (void) before;

                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);
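                // (the two zero-extending unpacks widen each 8-bit channel to
                // a 32-bit lane, so cvtepi32_ps yields one float per channel)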

                auto factor = _mm_set1_ps(tip * strength);

                // blend
                auto diff = _mm_sub_ps(colorMM, currentMF);
                diff = _mm_mul_ps(diff, factor);
                currentMF = _mm_add_ps(currentMF, diff);
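                // (i.e. a per-channel lerp: current += (color - current) * tip * strength)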

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);
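                // packs_epi32 / packus_epi16 saturate, so out-of-range channels
                // clamp to [0, 255] on the way back to 8 bits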

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Constant:
            fixedStrength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);
                // beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128());

                // use "before" image to which way of color change is possible, and
                // compute possible range of result color
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * fixedStrength);
                auto adddiff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, adddiff);
                auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps());

                // compute output image
                auto out1 = _mm_max_ps(currentMF, beforeMF);
                auto out2 = _mm_min_ps(currentMF, beforeMF);
                currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2));
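                // branchless select, lane-wise (diffDir ? out1 : out2): where the
                // target lies above "before", take the larger of current and the
                // stroke-limited value, else the smaller -- so a constant-pressure
                // stroke never pushes a pixel past its per-stroke limit or backwards.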

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Adjustable:
            strength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);

                // blend
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * strength);
                diff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, diff);

                // convert to RGB32
                beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128());
                beforeMM = _mm_cvttps_epi32(beforeMF);
                beforeMM = _mm_packs_epi32(beforeMM, beforeMM);
                beforeMM = _mm_packus_epi16(beforeMM, beforeMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(beforeMM));
            });
            break;
        }
        break;
    }

}
Example #30
inline vec4 operator>(vec4 a, vec4 b) { return _mm_cmpgt_ps(a, b); }
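// Lane-wise comparison: each 32-bit lane of the result is all-ones (0xFFFFFFFF)
// where a > b, and all-zeros otherwise. This presumably assumes vec4 wraps
// __m128 with implicit conversions in both directions, so the mask can be fed
// straight into and/andnot-style blends.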