SIMDValue SIMDFloat32x4Operation::OpLessThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        x86Result.m128_value = _mm_cmple_ps(tmpaValue.m128_value, tmpbValue.m128_value); // a <= b?

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
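Every example on this page uses _mm_cmple_ps, which compares four packed single-precision floats and returns a per-lane mask: all ones (0xFFFFFFFF) where a[i] <= b[i], all zeros otherwise. A minimal, self-contained sketch (names and values are illustrative only, not taken from the snippet above) showing the two usual ways that mask is consumed:

#include <xmmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 a = _mm_setr_ps(1.0f, 5.0f, 3.0f, 7.0f);
    __m128 b = _mm_setr_ps(2.0f, 4.0f, 3.0f, 6.0f);

    /* Per-lane mask: all-ones where a[i] <= b[i], all-zeros elsewhere. */
    __m128 mask = _mm_cmple_ps(a, b);

    /* 1) Collapse the mask to a 4-bit integer (one sign bit per lane). */
    int bits = _mm_movemask_ps(mask);          /* here: lanes 0 and 2 -> 0x5 */

    /* 2) Use the mask to select a[i] <= b[i] ? a[i] : b[i], i.e. a per-lane min(a, b). */
    __m128 sel = _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));

    float out[4];
    _mm_storeu_ps(out, sel);
    printf("movemask = 0x%x, select = %g %g %g %g\n",
           bits, out[0], out[1], out[2], out[3]);
    return 0;
}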
Example #2
/* Function:  esl_sse_expf()
 * Synopsis:  <r[z] = exp x[z]>
 * Incept:    SRE, Fri Dec 14 14:46:27 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = expf(x[z])>.
 *            
 *            Valid for all IEEE754 floats $x_z$.
 *            
 * Xref:      J2/71
 *            J10/62: bugfix, minlogf/maxlogf range was too wide; 
 *                    (k+127) must be >=0 and <=255, so (k+127)<<23
 *                    is a valid IEEE754 float, without touching 
 *                    the sign bit. Pommier had this right in the
 *                    first place, and I didn't understand.
 * 
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2.
 *            
 *            Note on maxlogf/minlogf, which are close to but not
 *            exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so
 *            k+127 is 0..255, a valid IEEE754 8-bit exponent
 *            (0..255), so the bit pattern (k+127)<<23 is IEEE754
 *            single-precision for 2^k.  If k=-127, we get IEEE754 0.
 *            If k=128, we get IEEE754 +inf.  If k<-127, k+127 is
 *            negative and we get screwed up.  If k>128, k+127
 *            overflows the 8-bit exponent and sets the sign bit.  So
 *            for x' (base 2) < -127.5 we must definitely return e^x ~
 *            0; for x' < -126.5 we're going to calculate 0 anyway
 *            (because k=floor(-126.5-epsilon+0.5) = -127).  So any
 *            minlogf between -126.5 log2 ... -127.5 log2 will suffice
 *            as the cutoff. Ditto for 126.5 log2 .. 127.5 log2.
 *            That's 87.68312 .. 88.3762655.  I think Pommier's
 *            thinking is, you don't want to get too close to the
 *            edges, lest fp roundoff error screw you (he may have
 *            considered 1 ulp carefully, I can't tell), but otherwise
 *            you may as well put your bounds close to the outer edge;
 *            so 
 *              maxlogf =  127.5 log(2) - epsilon 
 *              minlogf = -127.5 log(2) + epsilon 
 *            for an epsilon that happens to be ~ 3e-6.
 */
__m128 
esl_sse_expf(__m128 x) 
{
  static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f, 
			       4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f };
  static float cephes_c[2] = { 0.693359375f,    -2.12194440e-4f };
  static float maxlogf     =  88.3762626647949f;  /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
  static float minlogf     = -88.3762626647949f;  /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
  __m128i k;
  __m128  mask, tmp, fx, z, y, minmask, maxmask;
  
  /* handle out-of-range and special conditions */
  maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
  minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf));

  /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
  fx = _mm_mul_ps(x,  _mm_set1_ps(eslCONST_LOG2R));
  fx = _mm_add_ps(fx, _mm_set1_ps(0.5f));

  /* floorf() with SSE:  */
  k    = _mm_cvttps_epi32(fx);	              /* cast to int with truncation                  */
  tmp  = _mm_cvtepi32_ps(k);	              /* cast back to float                           */
  mask = _mm_cmpgt_ps(tmp, fx);               /* if it increased (i.e. if it was negative...) */
  mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch...           */
  fx   = _mm_sub_ps(tmp, mask);	              /* then subtract one.                           */
  k    = _mm_cvttps_epi32(fx);	              /* k is now ready for the 2^k part.             */
  
  /* polynomial approx for e^f for f in range [-0.5, 0.5] */
  tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0]));
  z   = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1]));
  x   = _mm_sub_ps(x, tmp);
  x   = _mm_sub_ps(x, z);
  z   = _mm_mul_ps(x, x);
  
  y =               _mm_set1_ps(cephes_p[0]);    y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));   y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));   y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));   y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));   y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));   y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(1.0f));

  /* build 2^k by hand, by creating an IEEE754 float */
  k  = _mm_add_epi32(k, _mm_set1_epi32(127));
  k  = _mm_slli_epi32(k, 23);
  fx = _mm_castsi128_ps(k);
  
  /* put 2^k e^f together (fx = 2^k,  y = e^f) and we're done */
  y = _mm_mul_ps(y, fx);	

  /* special/range cleanup */
  y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128)  */
  y = esl_sse_select_ps(y, _mm_set1_ps(0.0f),        minmask); /* exp(x) = 0   for x < log(2^-149) */
  return y;
}
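The Note above explains how the code manufactures 2^k by writing (k+127)<<23 directly into a float's bit pattern. A tiny scalar sketch of that same bit trick (not part of the Easel source, just an illustration):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Build 2^k by placing k+127 into the 8-bit exponent field (sign = 0, mantissa = 0).
 * Normal results need -126 <= k <= 127; k = -127 gives 0.0f and k = 128 gives +inf,
 * exactly the edge cases discussed in the note above. */
static float pow2_from_bits(int k)
{
    uint32_t bits = (uint32_t)(k + 127) << 23;
    float f;
    memcpy(&f, &bits, sizeof f);   /* reinterpret the bits, no numeric conversion */
    return f;
}

int main(void)
{
    printf("%g %g %g\n", pow2_from_bits(0), pow2_from_bits(10), pow2_from_bits(-10));
    /* prints: 1 1024 0.000976562 */
    return 0;
}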
Example #3
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
    {
      const __m128 pixel = _mm_load_ps(in);

      // over-exposed if *any* colour channel is >= upper: OR-reduce the per-channel
      // mask and broadcast the result to all four lanes
      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      // under-exposed only if *all* channels are <= lower: AND-reduce and broadcast
      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
                                _mm_and_ps(isoe, upper_color));

      result = _mm_or_ps(_mm_andnot_ps(isue, result),
                         _mm_and_ps(isue, lower_color));

      _mm_stream_ps(out, result);
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
Example #4
// a <= b
void _SIMD_cmple_ps(__SIMD a, __SIMD b, void** resultPtr)
{
  __SIMD* result = (__SIMD*)malloc(sizeof(__SIMD));
  *resultPtr = result;
#ifdef  USE_SSE
  *result = _mm_cmple_ps(a,b);
#elif defined USE_AVX
  *result = _mm256_cmp_ps(a,b,18);   /* 18 == _CMP_LE_OQ: less-than-or-equal, ordered, quiet */
#elif defined USE_IBM
  *result = vec_cmple(a,b);
#endif
}
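Note that this wrapper heap-allocates its result and returns ownership through resultPtr, so the caller has to free it. A hedged usage sketch, assuming the USE_SSE configuration (where __SIMD is presumably __m128); cmple_example is an illustrative name only:

#include <stdlib.h>

void cmple_example(__SIMD a, __SIMD b)
{
  void *res = NULL;
  _SIMD_cmple_ps(a, b, &res);     /* wrapper mallocs the mask and stores it via resultPtr */
  __SIMD mask = *(__SIMD *)res;   /* per-lane a <= b mask */
  (void)mask;                     /* ... use the mask ... */
  free(res);                      /* caller owns the allocation */
}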
Example #5
__m128 log_ps(__m128 x) {
  __m128i emm0;
  __m128 one = *_ps_1;
  __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
  x = _mm_max_ps(x, *reinterpret_cast<const __m128*>(_pi_min_norm_pos));
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
  x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_mant_mask));
  x = _mm_or_ps(x, *_ps_0p5);
  emm0 = _mm_sub_epi32(emm0, *_pi_0x7f);
  __m128 e = _mm_cvtepi32_ps(emm0);
  e = _mm_add_ps(e, one);
  __m128 mask = _mm_cmplt_ps(x, *_ps_cephes_SQRTHF);
  __m128 tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);
  __m128 z = _mm_mul_ps(x, x);
  __m128 y = *_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);
  y = _mm_mul_ps(y, z);
  tmp = _mm_mul_ps(e, *_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);
  tmp = _mm_mul_ps(z, *_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  tmp = _mm_mul_ps(e, *_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask);  // negative arg will be NAN
  return x;
}
Example #6
static inline __m128
bicubic_sse(__m128 width, __m128 t)
{
    static const __m128 half  = { .5f, .5f, .5f, .5f};
    static const __m128 one   = { 1.f, 1.f, 1.f, 1.f};
    static const __m128 two   = { 2.f, 2.f, 2.f, 2.f};
    static const __m128 three = { 3.f, 3.f, 3.f, 3.f};
    static const __m128 four  = { 4.f, 4.f, 4.f, 4.f};
    static const __m128 five  = { 5.f, 5.f, 5.f, 5.f};
    static const __m128 eight = { 8.f, 8.f, 8.f, 8.f};

    t = _mm_abs_ps(t);
    __m128 t2 = _mm_mul_ps(t, t);

    /* Compute 1 < t < 2 case:
     * 0.5f*(t*(-t2 + 5.f*t - 8.f) + 4.f)
     * half*(t*(mt2 + t5 - eight) + four)
     * half*(t*(mt2 + t5_sub_8) + four)
     * half*(t*(mt2_add_t5_sub_8) + four) */
    __m128 t5 = _mm_mul_ps(five, t);
    __m128 t5_sub_8 = _mm_sub_ps(t5, eight);
    __m128 zero = _mm_setzero_ps();
    __m128 mt2 = _mm_sub_ps(zero, t2);
    __m128 mt2_add_t5_sub_8 = _mm_add_ps(mt2, t5_sub_8);
    __m128 a = _mm_mul_ps(t, mt2_add_t5_sub_8);
    __m128 b = _mm_add_ps(a, four);
    __m128 r12 = _mm_mul_ps(b, half);

    /* Compute case < 1
     * 0.5f*(t*(3.f*t2 - 5.f*t) + 2.f) */
    __m128 t23 = _mm_mul_ps(three, t2);
    __m128 c = _mm_sub_ps(t23, t5);
    __m128 d = _mm_mul_ps(t, c);
    __m128 e = _mm_add_ps(d, two);
    __m128 r01 = _mm_mul_ps(half, e);

    // Compute masks for keeping the correct components
    __m128 mask01 = _mm_cmple_ps(t, one);
    __m128 mask12 = _mm_cmpgt_ps(t, one);
    r01 = _mm_and_ps(mask01, r01);
    r12 = _mm_and_ps(mask12, r12);


    return _mm_or_ps(r01, r12);
}
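The final _mm_and_ps/_mm_or_ps merge above is the standard branchless select idiom; several other snippets on this page wrap something similar in a helper (esl_sse_select_ps in Example #2 and cmov_ps in Example #14 presumably do the same job, possibly with a different argument order). A minimal, self-contained version for reference:

#include <xmmintrin.h>

/* Per-lane select: where mask is all-ones take a, where it is all-zeros take b.
 * mask is expected to come from a comparison such as _mm_cmple_ps. */
static inline __m128 select_ps(__m128 a, __m128 b, __m128 mask)
{
    return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}

On SSE4.1 the same effect is available as _mm_blendv_ps(b, a, mask), which is what the second Mandelbrot example (Example #13) below uses.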
Example #7
void NBodyAlgorithm::calculateAccelerationWithColor(const float3(&posI)[4], const float massJ, const float3 posJ, float3(&accI)[4], unsigned int(&isClose)[4]) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 cmpDistance = _mm_set_ps1(float(mp_properties->positionScale));
    __m128 close = _mm_cmple_ps(rabs, cmpDistance);

    // _mm_set_ps placed posI[0..3] into lanes 3..0, hence the reversed 3 - i index.
    for (int i = 0; i < 4; i++) {
        if (close.m128_f32[i] == 0) {
            isClose[3 - i] = 0;
        }
    }

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    for (int i = 0; i < 4; i++) {
        accI[3 - i].x = aix.m128_f32[i];
        accI[3 - i].y = aiy.m128_f32[i];
        accI[3 - i].z = aiz.m128_f32[i];
    }

}
Example #8
static inline bool equals_sse(const float3& f1, const float3& f2)
{
	// same as equals_new() just with SSE
	__m128 eq;
	__m128 m1 = _mm_set_ps(f1[0], f1[1], f1[2], 0.f);
	__m128 m2 = _mm_set_ps(f2[0], f2[1], f2[2], 0.f);
	eq = _mm_cmpeq_ps(m1, m2);
	if ((eq[0] != 0) && (eq[1] != 0) && (eq[2] != 0))
		return true;

	static const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
	static const __m128 eps = _mm_set1_ps(float3::cmp_eps());
	static const __m128 ones = _mm_set1_ps(1.f);
	__m128 am1 = _mm_andnot_ps(sign_mask, m1);
	__m128 am2 = _mm_andnot_ps(sign_mask, m2);
	__m128 right = _mm_add_ps(am1, am2);
	right = _mm_add_ps(right, ones);
	right = _mm_mul_ps(right, eps);
	__m128 left = _mm_sub_ps(m1, m2);
	left = _mm_andnot_ps(sign_mask, left);

	eq = _mm_cmple_ps(left, right);
	return ((eq[0] != 0) && (eq[1] != 0) && (eq[2] != 0));
}
Example #9
static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = (const float*)_src.data;
    float* dst = (float*)_dst.data;
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    IppiSize sz = { roi.width, roi.height };
    switch( type )
    {
    case THRESH_TRUNC:
        if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO:
        if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO_INV:
        if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            return;
        setIppErrorStatus();
        break;
    }
#endif

    switch( type )
    {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;

        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;

        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);
            }
            break;

        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;

        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif
                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
        default:
            return CV_Error( CV_StsBadArg, "" );
    }
}
Example #10
inline vec4 operator<=(vec4 a, vec4 b) { return _mm_cmple_ps(a, b); }
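Assuming the vec4 type above converts implicitly to and from __m128 (which the one-liner requires), a hedged usage sketch of the overloaded operator; all_le is an illustrative name, not part of the original source:

// True iff every lane of a is <= the corresponding lane of b.
inline bool all_le(vec4 a, vec4 b)
{
    return _mm_movemask_ps(a <= b) == 0xF;  // one sign bit per lane, all four set
}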
Example #11
static inline void   sacEvaluateModelSPRT(PROSAC_HEST* p){
	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (    p->epsilon  ));
	float    distSq = p->maxD*p->maxD;
	float*   src = (float*)p->src;
	float*   dst = (float*)p->dst;
	float*   H   = p->H;
	
	
	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;
	
	
	/* VECTOR */
	const __m128 distSqV=_mm_set1_ps(distSq);
	
	const __m128 H00=_mm_set1_ps(H[0]);
	const __m128 H01=_mm_set1_ps(H[1]);
	const __m128 H02=_mm_set1_ps(H[2]);
	const __m128 H10=_mm_set1_ps(H[4]);
	const __m128 H11=_mm_set1_ps(H[5]);
	const __m128 H12=_mm_set1_ps(H[6]);
	const __m128 H20=_mm_set1_ps(H[8]);
	const __m128 H21=_mm_set1_ps(H[9]);
	const __m128 H22=_mm_set1_ps(H[10]);
	
	for(i=0;i<(p->N-3) && p->good;i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;
		x=_mm_load_ps(src+2*i);
		y=_mm_load_ps(src+2*i+4);
		X=_mm_load_ps(dst+2*i);
		Y=_mm_load_ps(dst+2*i+4);
		
		inter0=_mm_unpacklo_ps(x,y);// y1 y0 x1 x0
		inter1=_mm_unpackhi_ps(x,y);// y3 y2 x3 x2
		inter2=_mm_unpacklo_ps(X,Y);// Y1 Y0 X1 X0
		inter3=_mm_unpackhi_ps(X,Y);// Y3 Y2 X3 X2
		
		x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		
		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);
		
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);
		//reprojX = _mm_div_ps(reprojX, reprojZ);
		//reprojY = _mm_div_ps(reprojY, reprojZ);
		
		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);
		
		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);
		
		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);
		
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int msk = _mm_movemask_ps(cmp);
		
		/* ... */
		/* 4-bit population count: bitCnt[msk] = number of set mask bits,
		   i.e. the number of inliers in this batch of 4 correspondences.
		                     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15*/
		unsigned bitCnt[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
		p->inl     += bitCnt[msk];
		
		
		/* SPRT */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];
		
		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)
		
		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;
		
		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;
		
		/* ... */
		isInlier    = reprojDist <= distSq;
		p->inl     += isInlier;
		
		
		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	
	p->N_tested = i;
}
Example #12
// --------------------------------------------------------------
vuint32 mandelbrot_SIMD_I32(vfloat32 a, vfloat32 b, int max_iter)
// --------------------------------------------------------------
{
    // version with the exit test done on integers
    
    vuint32 iter = _mm_set1_epi32(0);
    vuint32 temp = _mm_set1_epi32(0); 
	vuint32 un = _mm_set1_epi32(1);

	vfloat32 x,y,t,t2,zero,deux,quatre; 
    // COMPLETE HERE
    int test = 0,i = 0;
    // initialize the variables
    x      = _mm_set_ps(0,0,0,0);    
    y      = _mm_set_ps(0,0,0,0);
    deux   = _mm_set_ps(2,2,2,2);
    quatre = _mm_set_ps(4,4,4,4);
       

    // iteration zero
	t  = _mm_mul_ps(x, x);
	t2 = _mm_mul_ps(y, y);
	         
	y  = _mm_mul_ps(x,y);
	y  = _mm_mul_ps(y,deux);
	y  = _mm_add_ps(y,b);

	x = _mm_sub_ps(t,t2);
	x = _mm_add_ps(x,a);
    
    // main computation loop
    while(i<max_iter && test ==0 ){   
	    

	    t  = _mm_mul_ps(x, x);
        t2 = _mm_mul_ps(y, y);
              
	    y  = _mm_mul_ps(_mm_mul_ps(x,y),deux);        
	    y  = _mm_add_ps(y,b);

	    x = _mm_sub_ps(t,t2);
	    x = _mm_add_ps(x,a);	

	    t2 = _mm_add_ps(t,t2);
	    
	    t2 = _mm_cmple_ps(t2,quatre); 
 			
	    temp = _mm_and_si128(un,_mm_castps_si128(t2));
			
        iter = _mm_add_epi32(iter,temp);		
			
	    test = _mm_test_all_zeros(temp, un);
	    //display_vuint32(temp,"%d\t","T :: ");
	    //printf(" MASK::%d \n",_mm_movemask_ps(temp));
	    
	    i+=1;
  	}
 	
	
    return iter;
}
Example #13
// --------------------------------------------------------------
vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter)
// --------------------------------------------------------------
{
    // version with the exit test done on floats
    
    vuint32   iter  = _mm_set1_epi32(0);
    vfloat32  fiter = _mm_set_ps(0,0,0,0);

    vfloat32 x,y,t,t2,zero,un,deux,quatre; 
    // COMPLETE HERE
    int test,i = 0;
    // initialize the variables
    x      = _mm_set_ps(0,0,0,0);    
    y      = _mm_set_ps(0,0,0,0);
    deux   = _mm_set_ps(2,2,2,2);
    quatre = _mm_set_ps(4,4,4,4);
    un     = _mm_set_ps(1,1,1,1);
    zero   = _mm_set_ps(0,0,0,0);
    
    // iteration zero
	t  = _mm_mul_ps(x, x);
	t2 = _mm_mul_ps(y, y);
	         
	y  = _mm_mul_ps(x,y);
	y  = _mm_mul_ps(y,deux);
	y  = _mm_add_ps(y,b);

	x = _mm_sub_ps(t,t2);
	x = _mm_add_ps(x,a);
    
    // main computation loop
    while(i<max_iter && _mm_movemask_ps(t) != 15){   
	    

	    t  = _mm_mul_ps(x, x);
        t2 = _mm_mul_ps(y, y);
              
	    y  = _mm_mul_ps(_mm_mul_ps(x,y),deux);        
	    y  = _mm_add_ps(y,b);

	    x = _mm_sub_ps(t,t2);
	    x = _mm_add_ps(x,a);	

	    t2 = _mm_add_ps(t,t2);
	    
	    t2 = _mm_cmple_ps(t2,quatre); 
 			
	    t = _mm_blendv_ps(zero,un,t2);
			
	    fiter = _mm_add_ps(fiter,t);		
			
	    t = _mm_cmpeq_ps(t, zero);
	    //display_vfloat32(t,"%f\t","T :: ");
	    //printf(" MASK::%d \n",_mm_movemask_ps(t));
	    
	    i+=1;
  	}
 	
	
	iter = _mm_cvtps_epi32(fiter);

    return iter;
}
Example #14
bool AABB::IntersectLineAABB_SSE(const float4 &rayPos, const float4 &rayDir, float tNear, float tFar) const
{
	assume(rayDir.IsNormalized4());
	assume(tNear <= tFar && "AABB::IntersectLineAABB: User gave a degenerate line as input for the intersection test!");
	/* For reference, this is the C++ form of the vectorized SSE code below.

	float4 recipDir = rayDir.RecipFast4();
	float4 t1 = (aabbMinPoint - rayPos).Mul(recipDir);
	float4 t2 = (aabbMaxPoint - rayPos).Mul(recipDir);
	float4 near = t1.Min(t2);
	float4 far = t1.Max(t2);
	float4 rayDirAbs = rayDir.Abs();

	if (rayDirAbs.x > 1e-4f) // ray is parallel to plane in question
	{
		tNear = Max(near.x, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.x, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.x < aabbMinPoint.x || rayPos.x > aabbMaxPoint.x) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.y > 1e-4f) // ray is parallel to plane in question
	{
		tNear = Max(near.y, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.y, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.y < aabbMinPoint.y || rayPos.y > aabbMaxPoint.y) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.z > 1e-4f) // ray is parallel to plane in question
	{
		tNear = Max(near.z, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.z, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.z < aabbMinPoint.z || rayPos.z > aabbMaxPoint.z) // early-out if the ray can't possibly enter the box.
		return false;

	return tNear < tFar;
	*/

	__m128 recipDir = _mm_rcp_ps(rayDir.v);
	// Note: The above performs an approximate reciprocal (11 bits of precision).
	// For a full precision reciprocal, perform a div:
//	__m128 recipDir = _mm_div_ps(_mm_set1_ps(1.f), rayDir.v);

	__m128 t1 = _mm_mul_ps(_mm_sub_ps(MinPoint_SSE(), rayPos.v), recipDir);
	__m128 t2 = _mm_mul_ps(_mm_sub_ps(MaxPoint_SSE(), rayPos.v), recipDir);

	__m128 nearD = _mm_min_ps(t1, t2); // [0 n3 n2 n1]
	__m128 farD = _mm_max_ps(t1, t2);  // [0 f3 f2 f1]

	// Check if the ray direction is parallel to any of the cardinal axes, and if so,
	// mask those [near, far] ranges away from the hit test computations.
	__m128 rayDirAbs = abs_ps(rayDir.v);

	const __m128 epsilon = _mm_set1_ps(1e-4f);
	// zeroDirections[i] will be nonzero for each axis i the ray is parallel to.
	__m128 zeroDirections = _mm_cmple_ps(rayDirAbs, epsilon);

	const __m128 floatInf = _mm_set1_ps(FLOAT_INF);
	const __m128 floatNegInf = _mm_set1_ps(-FLOAT_INF);

	// If the ray is parallel to one of the axes, replace the slab range for that axis
	// with [-inf, inf] range instead. (which is a no-op in the comparisons below)
	nearD = cmov_ps(nearD, floatNegInf, zeroDirections);
	farD = cmov_ps(farD , floatInf, zeroDirections);

	// Next, we need to compute horizontally max(nearD[0], nearD[1], nearD[2]) and min(farD[0], farD[1], farD[2])
	// to see if there is an overlap in the hit ranges.
	__m128 v1 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(0, 0, 0, 0)); // [f1 f1 n1 n1]
	__m128 v2 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(1, 1, 1, 1)); // [f2 f2 n2 n2]
	__m128 v3 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(2, 2, 2, 2)); // [f3 f3 n3 n3]
	nearD = _mm_max_ps(v1, _mm_max_ps(v2, v3));
	farD = _mm_min_ps(v1, _mm_min_ps(v2, v3));
	farD = _mm_shuffle_ps(farD, farD, _MM_SHUFFLE(3, 3, 3, 3)); // Unpack the result from high offset in the register.
	nearD = _mm_max_ps(nearD, _mm_set_ss(tNear));
	farD = _mm_min_ps(farD, _mm_set_ss(tFar));

	// Finally, test if the ranges overlap.
	__m128 rangeIntersects = _mm_cmple_ss(nearD, farD);

	// To store out the interval of intersection, uncomment the following:
	// These are disabled, since without these, the whole function runs without a single memory store,
	// which has been profiled to be very fast! Uncommenting these causes an order-of-magnitude slowdown.
	// For now, using the SSE version only where the tNear and tFar ranges are not interesting.
//	_mm_store_ss(&tNear, nearD);
//	_mm_store_ss(&tFar, farD);

	// To avoid false positives, need to have an additional rejection test for each cardinal axis the ray direction
	// is parallel to.
	__m128 out2 = _mm_cmplt_ps(rayPos.v, MinPoint_SSE());
	__m128 out3 = _mm_cmpgt_ps(rayPos.v, MaxPoint_SSE());
	out2 = _mm_or_ps(out2, out3);
	zeroDirections = _mm_and_ps(zeroDirections, out2);

	__m128 yOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(1,1,1,1));
	__m128 zOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(2,2,2,2));

	zeroDirections = _mm_or_ps(_mm_or_ps(zeroDirections, yOut), zOut);
	// Intersection occurs if the slab ranges had positive overlap and if the test was not rejected by the ray being
	// parallel to some cardinal axis.
	__m128 intersects = _mm_andnot_ps(zeroDirections, rangeIntersects);
	__m128 epsilonMasked = _mm_and_ps(epsilon, intersects);
	return _mm_comieq_ss(epsilon, epsilonMasked) != 0;
}
Example #15
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that the ucomi versions signal an FP exception only if one of the input operands is an SNaN, whereas the comi versions
	// signal an FP exception when one of the input operands is either a QNaN or an SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
//----------------------------------------------------------------
// Transforms the AABB vertices to screen space once every frame
// Also performs a coarse depth pre-test
//----------------------------------------------------------------
PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary)
{
	// w ends up being garbage, but it doesn't matter - we ignore it anyway.
	__m128 vCenter = _mm_loadu_ps(&mBBCenter.x);
	__m128 vHalf   = _mm_loadu_ps(&mBBHalf.x);

	__m128 vMin    = _mm_sub_ps(vCenter, vHalf);
	__m128 vMax    = _mm_add_ps(vCenter, vHalf);

	// transforms
	__m128 xRow[2], yRow[2], zRow[2];
	xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0];
	xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0];
	yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1];
	yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1];
	zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2];
	zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2];

	__m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0));
	__m128 screenMin = _mm_set1_ps(FLT_MAX);
	__m128 screenMax = _mm_set1_ps(-FLT_MAX);

	for(UINT i = 0; i < AABB_VERTICES; i++)
	{
		// Transform the vertex
		__m128 vert = cumulativeMatrix[3];
		vert += xRow[sBBxInd[i]];
		vert += yRow[sBByInd[i]];
		vert += zRow[sBBzInd[i]];

		// We have inverted z; z is in front of near plane iff z <= w.
		__m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz
		__m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww
		__m128 zIn = _mm_cmple_ps(vertZ, vertW);
		zAllIn = _mm_and_ps(zAllIn, zIn);

		// project
		xformedPos[i] = _mm_div_ps(vert, vertW);
		
	    // update bounds
	    screenMin = _mm_min_ps(screenMin, xformedPos[i]);
	    screenMax = _mm_max_ps(screenMax, xformedPos[i]);
	}

	// if any of the verts are z-clipped, we (conservatively) say the box is in
	if(_mm_movemask_ps(zAllIn) != 0xf)
		return ePT_VISIBLE;

	// Clip against screen bounds
	screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX));
	screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX));

	// Quick rejection test
	if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin)))
		return ePT_INVISIBLE;

	// Prepare integer bounds
	__m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY
	__m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY);
	__m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3);

	__m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa);

	// Traverse all 8x8 blocks covered by 2d screen-space BBox;
	// if we know for sure that this box is behind the geometry we know is there,
	// we can stop.
	int rX0 = minMaxXYis.m128i_i32[0];
	int rY0 = minMaxXYis.m128i_i32[1];
	int rX1 = minMaxXYis.m128i_i32[2];
	int rY1 = minMaxXYis.m128i_i32[3];

	__m128 anyCloser = _mm_setzero_ps();
	for(int by = rY0; by <= rY1; by++)
	{
		const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE);

		// If for any 8x8 block, maxZ is not less than (=behind) summarized
		// min Z, box might be visible.
		for(int bx = rX0; bx <= rX1; bx++)
		{
			anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx])));
		}

		if(_mm_movemask_ps(anyCloser))
		{
			return ePT_UNSURE; // okay, box might be in
		}
	}

	// If we get here, we know for sure that the box is fully behind the stuff in the
	// depth buffer.
	return ePT_INVISIBLE;
}
Example #17
void sINLINE RNMarchingCubesBase<T>::func(const sVector31 &v,typename T::FieldType &pot,const funcinfo &fi)
{
  __m128 vx = _mm_load_ps1(&v.x);
  __m128 vy = _mm_load_ps1(&v.y);
  __m128 vz = _mm_load_ps1(&v.z);
  __m128 po = _mm_setzero_ps();           // p
  __m128 nx = _mm_setzero_ps();
  __m128 ny = _mm_setzero_ps();
  __m128 nz = _mm_setzero_ps();
  __m128 akkur = _mm_setzero_ps();
  __m128 akkug = _mm_setzero_ps();
  __m128 akkub = _mm_setzero_ps();
  __m128 akkua = _mm_setzero_ps();
  __m128 s255 = _mm_set_ps1(255.0f);
  
  sBool good = 0;

  for(sInt i=0;i<fi.pn4;i++)
  {
    const typename T::SimdType *part = fi.parts4 + i;

    __m128 dx = _mm_sub_ps(vx,part->x);
    __m128 dy = _mm_sub_ps(vy,part->y);
    __m128 dz = _mm_sub_ps(vz,part->z);
    __m128 ddx = _mm_mul_ps(dx,dx);
    __m128 ddy = _mm_mul_ps(dy,dy);
    __m128 ddz = _mm_mul_ps(dz,dz);
    __m128 pp = _mm_add_ps(_mm_add_ps(ddx,ddy),ddz);

    if(_mm_movemask_ps(_mm_cmple_ps(pp,fi.treshf4))!=0)
    {
      __m128 pp2 = _mm_sub_ps(_mm_div_ps(fi.one,pp),fi.tresh4);
      __m128 pp3 = _mm_max_ps(pp2,_mm_setzero_ps());
      po = _mm_add_ps(po,pp3);                  // p = p+pp;
      __m128 pp4 = _mm_mul_ps(pp3,pp3);         // pp*pp
      nx = _mm_add_ps(nx,_mm_mul_ps(pp4,dx));   // n += d*(pp*pp)
      ny = _mm_add_ps(ny,_mm_mul_ps(pp4,dy));
      nz = _mm_add_ps(nz,_mm_mul_ps(pp4,dz));
      if(T::Color==1)
      {
        akkur = _mm_add_ps(akkur,_mm_mul_ps(pp3,part->cr));
        akkug = _mm_add_ps(akkug,_mm_mul_ps(pp3,part->cg));
        akkub = _mm_add_ps(akkub,_mm_mul_ps(pp3,part->cb));
        good = 1;
      }
    }
  }

  sF32 p = 0;
  sVector30 n;
  
  _MM_TRANSPOSE4_PS(po,nx,ny,nz);
  __m128 r = _mm_add_ps(_mm_add_ps(_mm_add_ps(nx,ny),nz),po);
  n.x = r.m128_f32[1];
  n.y = r.m128_f32[2];
  n.z = r.m128_f32[3];
  p = r.m128_f32[0];

  if(p==0)
    n.Init(0,0,0);
  else
    n.UnitFast();
  pot.x = n.x;
  pot.y = n.y;
  pot.z = n.z;
  pot.w = p-fi.iso;
  if(T::Color)
  {
    if(good)
    {
      r = _mm_mul_ss(s255,_mm_rcp_ss(r));
  //    r = _mm_rcp_ss(r);
      _MM_TRANSPOSE4_PS(akkub,akkug,akkur,akkua);
      __m128 r2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(akkur,akkug),akkub),akkua);

      r2 = _mm_mul_ps(r2,_mm_shuffle_ps(r,r,0x00));
      __m128i r3 = _mm_cvtps_epi32(r2);
      r3 = _mm_packs_epi32(r3,r3);
      __m128i r4 = _mm_packus_epi16(r3,r3);
      pot.c = r4.m128i_u32[0]|0xff000000;
    }
    else
    {
      pot.c = 0;
    }
  }
}
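The horizontal reduction at the end of the function relies on _MM_TRANSPOSE4_PS turning four per-register horizontal sums into one vertical add. A minimal, self-contained sketch of that trick follows; the helper name is illustrative only.

#include <xmmintrin.h>

// After transposing, lane i of every register comes from input register i,
// so a single vector add yields all four horizontal sums at once:
// lane 0 = sum of a, lane 1 = sum of b, lane 2 = sum of c, lane 3 = sum of d.
static __m128 HorizontalSum4(__m128 a, __m128 b, __m128 c, __m128 d)
{
  _MM_TRANSPOSE4_PS(a, b, c, d);
  return _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d));
}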
Exemplo n.º 18
0
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
__m128 log_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
#ifdef USE_SSE2
   __m128i emm0;
#else
   __m64 mm0, mm1;
#endif
   __m128 one = *(__m128*)_ps_1;

   __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

   x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
   /* part 1: x = frexpf(x, &e); */
   COPY_XMM_TO_MM(x, mm0, mm1);
   mm0 = _mm_srli_pi32(mm0, 23);
   mm1 = _mm_srli_pi32(mm1, 23);
#else
   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
   /* keep only the fractional part */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
   x = _mm_or_ps(x, *(__m128*)_ps_0p5);

#ifndef USE_SSE2
   /* now e=mm0:mm1 contain the real base-2 exponent */
   mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
   __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
   _mm_empty(); /* bye bye mmx */
#else
   emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f);
   __m128 e = _mm_cvtepi32_ps(emm0);
#endif

   e = _mm_add_ps(e, one);

   /* part2: 
   if( x < SQRTHF ) {
   e -= 1;
   x = x + x - 1.0;
   } else { x = x - 1.0; }
   */
   __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
   __m128 tmp = _mm_and_ps(x, mask);
   x = _mm_sub_ps(x, one);
   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
   x = _mm_add_ps(x, tmp);


   __m128 z = _mm_mul_ps(x,x);

   __m128 y = *(__m128*)_ps_cephes_log_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
   y = _mm_mul_ps(y, x);

   y = _mm_mul_ps(y, z);


   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
   y = _mm_add_ps(y, tmp);


   tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);

   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
   x = _mm_add_ps(x, y);
   x = _mm_add_ps(x, tmp);
   x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
   return x;
}
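Part 1 above is a bit-level frexpf: the exponent is read by shifting the float's bits right by 23, and the mantissa is rebuilt with a forced exponent of 0.5. The scalar sketch below shows the same decomposition; it is only an illustration and assumes positive, normalized input, which the vector code guarantees by clamping to _ps_min_norm_pos.

#include <cstdint>
#include <cstring>

// Returns m with x == m * 2^(*e) and 0.5 <= m < 1, mirroring the vector code
// (which also adds 1 to the raw unbiased exponent and keeps the 0.5-scaled mantissa).
static float frexp_like(float x, int *e)
{
   std::uint32_t bits;
   std::memcpy(&bits, &x, sizeof bits);
   *e = (int)(bits >> 23) - 0x7f + 1;            // unbiased exponent, +1 as in the code above
   bits = (bits & 0x007fffffu) | 0x3f000000u;    // keep the mantissa, force the exponent of 0.5
   float m;
   std::memcpy(&m, &bits, sizeof m);
   return m;
}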
Exemplo n.º 19
0
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
__m128 log_ps(__m128 x) {
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4si emm0;
    v4sf one = constants::ps_1.ps;

    v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

    x = _mm_max_ps(x, constants::min_norm_pos.ps);  // cut off denormalized stuff

    emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
    // keep only the fractional part
    x = _mm_and_ps(x, constants::inv_mant_mask.ps);
    x = _mm_or_ps(x,  constants::ps_0p5.ps);

    emm0 = _mm_sub_epi32(emm0, constants::pi32_0x7f.pi);
    v4sf e = _mm_cvtepi32_ps(emm0);

    e = _mm_add_ps(e, one);

    /* part2:
       if( x < SQRTHF ) {
         e -= 1;
         x = x + x - 1.0;
       } else { x = x - 1.0; }
    */
    v4sf mask = _mm_cmplt_ps(x, constants::cephes_SQRTHF.ps);
    v4sf tmp = _mm_and_ps(x, mask);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, mask));
    x = _mm_add_ps(x, tmp);

    v4sf z = _mm_mul_ps(x,x);

    v4sf y = constants::cephes_log_p0.ps;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p1.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p2.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p3.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p4.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p5.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p6.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p7.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p8.ps);
    y = _mm_mul_ps(y, x);

    y = _mm_mul_ps(y, z);

    tmp = _mm_mul_ps(e, constants::cephes_log_q1.ps);
    y = _mm_add_ps(y, tmp);

    tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);

    tmp = _mm_mul_ps(e, constants::cephes_log_q2.ps);
    x = _mm_add_ps(x, y);
    x = _mm_add_ps(x, tmp);
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
    return x;
}
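A hedged usage sketch for this __m128 overload: it assumes SSE2 is available and that the constants:: tables used above are linked in; the demo function and the expected values in the comment are illustrative only.

#include <xmmintrin.h>
#include <cstdio>

static void log_ps_demo()
{
    __m128 x = _mm_setr_ps(0.5f, 1.0f, 2.7182818f, 10.0f);
    __m128 r = log_ps(x);                 // four logarithms in one call
    float out[4];
    _mm_storeu_ps(out, r);
    // Expected roughly: -0.6931, 0.0, 1.0, 2.3026
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
}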
Exemplo n.º 20
0
inline float4 lte(const float4& a, const float4& b)
{
	return float4(_mm_cmple_ps(a.data, b.data));
}
Exemplo n.º 21
0
RETf CMPLE(const __m128 x, const __m128 y) { return _mm_cmple_ps(x, y); }
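Because _mm_cmple_ps returns an all-ones or all-zeros mask per lane, a common way to consume the result of a wrapper like CMPLE is a branchless select. A minimal sketch, with illustrative names not taken from the original source:

#include <xmmintrin.h>

// Per lane: returns ifLE where a <= b, otherwise ifGT.
static __m128 select_le(__m128 a, __m128 b, __m128 ifLE, __m128 ifGT)
{
	__m128 m = _mm_cmple_ps(a, b);              // a <= b ? all-ones : all-zeros, per lane
	return _mm_or_ps(_mm_and_ps(m, ifLE),       // take ifLE where the mask is set
	                 _mm_andnot_ps(m, ifGT));   // take ifGT elsewhere
}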
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled
//-----------------------------------------------------------------------------------------
void TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels)
{
	// Set the DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster).
	// Denormals-are-zero (DAZ) is bit 6 and Flush-to-zero (FZ) is bit 15,
	// so to enable both we have to set bits 6 and 15: 1000 0000 0100 0000 = 0x8040.
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_set_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_set_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)pRenderTargetPixels; 
	
	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        vFxPt4 xFormedFxPtPos[3];
		for(int m = 0; m < 3; m++)
		{
			xFormedFxPtPos[m].X = _mm_cvtps_epi32(xformedPos[m].X);
			xFormedFxPtPos[m].Y = _mm_cvtps_epi32(xformedPos[m].Y);
			xFormedFxPtPos[m].Z = _mm_cvtps_epi32(xformedPos[m].Z);
			xFormedFxPtPos[m].W = _mm_cvtps_epi32(xformedPos[m].W);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(xFormedFxPtPos[1].Y, xFormedFxPtPos[2].Y);
		__m128i A1 = _mm_sub_epi32(xFormedFxPtPos[2].Y, xFormedFxPtPos[0].Y);
		__m128i A2 = _mm_sub_epi32(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].X);
		__m128i B1 = _mm_sub_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].X);
		__m128i B2 = _mm_sub_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].X);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[2].Y), _mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].Y));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[0].Y), _mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].Y));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[1].Y), _mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].Y));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(A0, xFormedFxPtPos[0].X);
		triArea = _mm_add_epi32(triArea, _mm_mullo_epi32(B0, xFormedFxPtPos[0].Y));
		triArea = _mm_add_epi32(triArea, C0);

		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		// Use bounding box traversal strategy to determine which pixels to rasterize 
		__m128i startX = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENW));

		__m128i startY = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENH));

		for(int vv = 0; vv < 3; vv++) 
		{
            // If W (holding 1/w in our case) is not between 0 and 1,
            // then the vertex is behind the near clip plane (at 1.0 in our case).
            // If W < 1, then 1/W > 1 (for W > 0) or 1/W < 0 (for W < 0).
		    __m128 nearClipMask0 = _mm_cmple_ps(xformedPos[vv].W, _mm_set1_ps(0.0f));
		    __m128 nearClipMask1 = _mm_cmpge_ps(xformedPos[vv].W, _mm_set1_ps(1.0f));
            __m128 nearClipMask  = _mm_or_ps(nearClipMask0, nearClipMask1);

			if(!_mm_test_all_zeros(*(__m128i*)&nearClipMask, *(__m128i*)&nearClipMask))
			{
                // At least one vertex is behind the near plane (we're processing four
                // triangles at a time w/ SSE), so conservatively mark the occludee as visible.
                *mVisible = true;
                return;
			}
		}

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < SSE; lane++)
        {
			// Skip triangle if its area is zero or negative (back-facing)
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
            __m128 zz[3], oneOverW[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(xformedPos[vv].Z.m128_f32[lane]);
				oneOverW[vv] = _mm_set1_ps(xformedPos[vv].W.m128_f32[lane]);
			}

			__m128 oneOverTotalArea = _mm_set1_ps(oneOverTriArea.m128_f32[lane]);
			zz[0] *= oneOverTotalArea;
			zz[1] *= oneOverTotalArea;
			zz[2] *= oneOverTotalArea;
			
			int startXx = startX.m128i_i32[lane];
			int endXx	= endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy	= endY.m128i_i32[lane];
		
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]);
			__m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]);
			__m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			int rowIdx;
			// To avoid this branching, choose one method to traverse and store the pixel depth
			if(gVisualizeDepthBuffer)
			{
				// Sequentially traverse and store pixel depths contiguously
				rowIdx = (startYy * SCREENW + startXx);
			}
			else
			{
				// Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
				// This method provides better performance
				rowIdx = (startYy * SCREENW + 2 * startXx);
			}

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), cc0);
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), cc1);
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), cc2);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											row  = _mm_add_epi32(row, _mm_set1_epi32(2)),
											rowIdx = rowIdx + 2 * SCREENW,
											bb0Row = _mm_add_epi32(bb0Row, bb0Inc),
											bb1Row = _mm_add_epi32(bb1Row, bb1Inc),
											bb2Row = _mm_add_epi32(bb2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int idx = rowIdx;
				__m128i alpha = _mm_add_epi32(aa0Col, bb0Row);
				__m128i beta = _mm_add_epi32(aa1Col, bb1Row);
				__m128i gama = _mm_add_epi32(aa2Col, bb2Row);

				int idxIncr;
				if(gVisualizeDepthBuffer)
				{ 
					idxIncr = 2;
				}
				else
				{
					idxIncr = 4;
				}

				for(int c = startXx; c < endXx; c += 2,
												idx = idx + idxIncr,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc))
				{
					// Test whether the pixel is inside the triangle
					__m128i mask = _mm_cmplt_epi32(fxptZero, _mm_or_si128(_mm_or_si128(alpha, beta), gama));
					
					// Early out if all of this quad's pixels are outside the triangle.
					if(_mm_test_all_zeros(mask, mask))
					{
						continue;
					}

					// Compute barycentric-interpolated depth
			        __m128 depth = _mm_mul_ps(_mm_cvtepi32_ps(alpha), zz[0]);
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

					__m128 previousDepthValue;
					if(gVisualizeDepthBuffer)
					{
						previousDepthValue = _mm_set_ps(pDepthBuffer[idx], pDepthBuffer[idx + 1], pDepthBuffer[idx + SCREENW], pDepthBuffer[idx + SCREENW + 1]);
					}
					else
					{
						previousDepthValue = *(__m128*)&pDepthBuffer[idx];
					}

					__m128 depthMask  = _mm_cmpge_ps( depth, previousDepthValue);
					__m128i finalMask = _mm_and_si128( mask, _mm_castps_si128(depthMask));
					if(!_mm_test_all_zeros(finalMask, finalMask))
					{
						*mVisible = true;
						return; //early exit
					}
				}//for each column											
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}
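The per-pixel test above ORs the three edge functions and compares the result against zero. The scalar sketch below shows the same edge-function setup and inside test for a single pixel; it ignores winding and fill-rule details, and the names are illustrative only.

// F(x, y) = A*x + B*y + C, with A = ya - yb, B = xb - xa, C = xa*yb - xb*ya.
struct Edge { int A, B, C; };

static Edge MakeEdge(int xa, int ya, int xb, int yb)
{
	Edge e = { ya - yb, xb - xa, xa * yb - xb * ya };
	return e;
}

static bool InsideTriangle(const Edge &e0, const Edge &e1, const Edge &e2, int x, int y)
{
	int f0 = e0.A * x + e0.B * y + e0.C;
	int f1 = e1.A * x + e1.B * y + e1.C;
	int f2 = e2.A * x + e2.B * y + e2.C;
	// Same OR trick as the SSE code: 0 < (f0 | f1 | f2) holds exactly when
	// no edge function is negative and at least one of them is nonzero.
	return (f0 | f1 | f2) > 0;
}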
Exemplo n.º 23
0
 inline vector4fb operator<=(const vector4f& lhs, const vector4f& rhs)
 {
     return _mm_cmple_ps(lhs, rhs);
 }
Exemplo n.º 24
0
__m128 test_mm_cmple_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: @test_mm_cmple_ps
  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
  return _mm_cmple_ps(__a, __b);
}