SIMDValue SIMDUint32x4Operation::OpFromFloat32x4(const SIMDValue& value, bool& throws)
    {
        X86SIMDValue x86Result = { 0 };
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
        X86SIMDValue temp, temp2;
        X86SIMDValue two_31_f4, two_31_i4;
        int mask = 0;

        // any lanes < 0 ?
        temp.m128_value = _mm_cmplt_ps(v.m128_value, X86_ALL_ZEROS.m128_value);
        mask = _mm_movemask_ps(temp.m128_value);
        // negative value are out of range, caller should throw Range Error
        if (mask)
        {
            throws = true;
            return X86SIMDValue::ToSIMDValue(x86Result);
        }
        // CVTTPS2DQ does a range check over signed range [-2^31, 2^31-1], so will fail to convert values >= 2^31.
        // To fix this, subtract 2^31 from values >= 2^31, do CVTTPS2DQ, then add 2^31 back.
        _mm_store_ps(two_31_f4.simdValue.f32, X86_TWO_31_F4.m128_value);
        // any lanes >= 2^31 ?
        temp.m128_value = _mm_cmpge_ps(v.m128_value, two_31_f4.m128_value);
        // two_31_f4 has f32(2^31) for lanes >= 2^31, 0 otherwise
        two_31_f4.m128_value = _mm_and_ps(two_31_f4.m128_value, temp.m128_value);
        // subtract 2^31 from lanes >= 2^31, unchanged otherwise.
        v.m128_value = _mm_sub_ps(v.m128_value, two_31_f4.m128_value);

        // CVTTPS2DQ
        x86Result.m128i_value = _mm_cvttps_epi32(v.m128_value);

        // check if any value is out of range (i.e. >= 2^31, meaning originally >= 2^32 before value adjustment)
        temp2.m128i_value = _mm_cmpeq_epi32(x86Result.m128i_value, X86_NEG_MASK_F4.m128i_value); // any value == 0x80000000 ?
        mask = _mm_movemask_ps(temp2.m128_value);
        if (mask)
        {
            throws = true;
            return X86SIMDValue::ToSIMDValue(x86Result);
        }
        // we pass range check

        // add 2^31 values back to adjusted values.
        // Use first bit from the 2^31 float mask (0x4f000...0 << 1)
        // and result with 2^31 int mask (0x8000..0) setting first bit to zero if lane hasn't been adjusted
        _mm_store_ps(two_31_i4.simdValue.f32, X86_TWO_31_I4.m128_value);
        two_31_f4.m128i_value = _mm_slli_epi32(two_31_f4.m128i_value, 1);
        two_31_i4.m128i_value = _mm_and_si128(two_31_i4.m128i_value, two_31_f4.m128i_value);
        // add 2^31 back to adjusted values
        // Note at this point all values are in [0, 2^31-1]. Adding 2^31 is guaranteed not to overflow.
        x86Result.m128i_value = _mm_add_epi32(x86Result.m128i_value, two_31_i4.m128i_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Beispiel #2
0
/// Test whether this frustum intersects a given sphere in world space.
///
/// @param[in] rSphere  Sphere to test.
///
/// @return  True if the sphere intersects this frustum, false if not.
bool Helium::Simd::Frustum::Intersects( const Sphere& rSphere ) const
{
    Helium::Simd::Register sphereVec = rSphere.GetSimdVector();

    Vector3Soa center(
        _mm_shuffle_ps( sphereVec, sphereVec, _MM_SHUFFLE( 0, 0, 0, 0 ) ),
        _mm_shuffle_ps( sphereVec, sphereVec, _MM_SHUFFLE( 1, 1, 1, 1 ) ),
        _mm_shuffle_ps( sphereVec, sphereVec, _MM_SHUFFLE( 2, 2, 2, 2 ) ) );

    Helium::Simd::Register radius = _mm_shuffle_ps( sphereVec, sphereVec, _MM_SHUFFLE( 3, 3, 3, 3 ) );

    Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros();
    PlaneSoa planes;
    for( size_t basePlaneIndex = 0; basePlaneIndex < PLANE_ARRAY_SIZE; basePlaneIndex += 4 )
    {
        planes.Load(
            m_planeA + basePlaneIndex,
            m_planeB + basePlaneIndex,
            m_planeC + basePlaneIndex,
            m_planeD + basePlaneIndex );

        Helium::Simd::Register distances = Helium::Simd::AddF32( planes.GetDistance( center ), radius );
        int resultMask = _mm_movemask_ps( Helium::Simd::GreaterEqualsF32( distances, zeroVec ) );
        if( resultMask != 0xf )
        {
            return false;
        }
    }

    return true;
}
Beispiel #3
0
/*Rewritten member function using the SSE Instructons
 */
__m128 member_speed(__m128 cx_m , __m128 cy_m)
{
	__m128 x = _mm_set1_ps(0.0f);
	__m128 y = _mm_set1_ps(0.0f);
	__m128 four_iter = _mm_set1_ps(0.0);
	__m128 temp_mask = _mm_set1_ps(0.0);
	__m128 mask = _mm_set1_ps(1.0);
	__m128 two_squared = _mm_set1_ps(4.0f);
	__m128 two = _mm_set1_ps(2.0f);
	__m128 x_sqr, y_sqr;
	x_sqr = _mm_mul_ps(x, x);
	y_sqr = _mm_mul_ps(y, y);
	// little bit of a hack to deal with individual iterations
	int iterations = 0;
	while ( (_mm_movemask_ps( temp_mask = _mm_cmplt_ps(_mm_add_ps(x_sqr, y_sqr),two_squared)) != 0)  && (iterations < MAX_ITS) ){
		__m128 xtemp = _mm_add_ps(_mm_sub_ps(x_sqr, y_sqr), cx_m);
		y = _mm_add_ps(_mm_mul_ps(two, _mm_mul_ps(x, y)), cy_m);
		x = xtemp;
		x_sqr = _mm_mul_ps(x, x);
		y_sqr = _mm_mul_ps(y, y);
		iterations ++;
		four_iter = _mm_add_ps(four_iter, _mm_and_ps(temp_mask, mask));
	}
	//This returns a m128 with the four iterations!
	return four_iter;
}
Beispiel #4
0
void 
bp_ray_trace_packet (const ray4_t *ray, vector_t *colors, simd4i_t srcprim_id, int depth, simd4_t fdepth)
{
	unsigned int activeMask;
	intersect4_t isect4;

	ASSIGN (colors [0], background);
	ASSIGN (colors [1], background);
	ASSIGN (colors [2], background);
	ASSIGN (colors [3], background);

 	if ((depth > curr_scene->settings.max_trace_level) 
	    | (simd4_extract_sign (simd4_float_lt (fdepth, simd4_from_float (curr_scene->settings.adc_bailout))) == 0xf)) {
		return;
	}

 	isect4.prim_id = simd4i_minus_ones;

 	bp_kd_tree_packet_find_nearest (curr_scene->kd_tree_root, ray, &isect4);

	activeMask = _mm_movemask_ps (simd4_float_eq (_mm_cvtepi32_ps (isect4.prim_id), simd4f_minus_ones));

 	/* If there was no intersection terminate early */
	if (activeMask == 0xF)
		return;

 	bp_shade_packet (curr_scene, &isect4, ray, colors, depth, fdepth, srcprim_id);
}
Beispiel #5
0
/// Test whether this frustum fully contains a given point in world space.
///
/// @param[in] rPoint  Point to test.
///
/// @return  True if the point is within this frustum, false if not.
bool Helium::Simd::Frustum::Contains( const Vector3& rPoint ) const
{
    // Test the point against each plane set.
    Vector3Soa pointSplat( rPoint );

    Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros();
    PlaneSoa planes;
    for( size_t basePlaneIndex = 0; basePlaneIndex < PLANE_ARRAY_SIZE; basePlaneIndex += 4 )
    {
        planes.Load(
            m_planeA + basePlaneIndex,
            m_planeB + basePlaneIndex,
            m_planeC + basePlaneIndex,
            m_planeD + basePlaneIndex );

        Helium::Simd::Register distances = planes.GetDistance( pointSplat );
        int resultMask = _mm_movemask_ps( Helium::Simd::GreaterEqualsF32( distances, zeroVec ) );
        if( resultMask != 0xf )
        {
            return false;
        }
    }

    return true;
}
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value) {
    int iLevel = 0;
    int lOffset = 0;
    int pOffset = 0;
    int32_t cmpmask = 0;
    int32_t eqmask = 0;

     __m128i key = _mm_cvtsi32_si128(value);
    key = _mm_shuffle_epi32(key, _MM_SHUFFLE(0,0,0,0));

    while (iLevel < levels) {
        int f = fanout[iLevel];
        pOffset = lOffset;
        lOffset *= f - 1;
        int iter = 0;
        int position = 0;
        while (iter < f/4) {
            __m128i delimiters = _mm_load_si128((__m128i const*)&tree[iLevel][lOffset + iter*4]);
            __m128i compare = _mm_cmpgt_epi32(key, delimiters);
            cmpmask = _mm_movemask_ps(_mm_castsi128_ps(compare));
            cmpmask ^= 0x0F;
            if (cmpmask) {
                position = _bit_scan_forward(cmpmask);
                break;
            }
            iter++;
        }
        int offset = lOffset + iter*4 + position;
        lOffset = offset + pOffset;
        iLevel++;
    }
    return lOffset;
}
Beispiel #7
0
void
mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);
    __m128 one = _mm_set_ps1(1);
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);

                /* Early bailout? */
                if (_mm_movemask_ps(mask) == 0)
                    break;
            }
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
Beispiel #8
0
int64_t
nv_float_nonzero_index(const float *v, int64_t s, int64_t e)
{
#if NV_ENABLE_SSE2
	{
		__m128 zero, xmm;
		int64_t i = s, e2;
		int eq;
		
		if (s & 0x3) {
			e2 = (s & 0xfffffffffffffffcLL) + 4;
			for (; i < e2 && i < e; ++i) {
				if (v[i] != 0.0f) {
					return i;
				}
			}
		}
		zero = _mm_setzero_ps();
		e2 = (e & 0xfffffffffffffffcLL);
		for (; i < e2; i += 4) {
			xmm = _mm_load_ps(&v[i]);
			xmm = _mm_cmpneq_ps(zero, xmm);
			eq = _mm_movemask_ps(xmm);
			if (eq != 0) {
				if (eq & 0x03) {
					if (eq & 0x1) {
						return i;
					} else {
						return i + 1;
					}
				} else {
					if (eq & 0x4) {
						return i + 2;
					} else {
						return i + 3;
					}
				}
			}
		}
		for (;i < e; ++i) {
			if (v[i] != 0.0f) {
				return i;
			}
		}
		return -1;
	}
#else
	{
		int64_t i;
		for (i = s; i < e; ++i) {
			if (v[i] != 0.0f) {
				return i;
			}
		}
		return -1;
	}
#endif
}
Beispiel #9
0
int64_t
nv_float_find_index(const float *v, int64_t s, int64_t e, float key)
{
#if NV_ENABLE_SSE2
	{
		__m128 xkey, xmm;
		int64_t i = s, e2;
		int eq;
		
		if (s & 0x3) {
			e2 = (s & 0xfffffffffffffffcLL) + 4;
			for (; i < e2 && i < e; ++i) {
				if (v[i] == key) {
					return i;
				}
			}
		}
		xkey = _mm_set1_ps(key);
		e2 = (e & 0xfffffffffffffffcLL);
		for (; i < e2; i += 4) {
			xmm = _mm_load_ps(&v[i]);
			xmm = _mm_cmpeq_ps(xkey, xmm);
			eq = _mm_movemask_ps(xmm);
			if (eq != 0) {
				if (eq & 0x03) {
					if (eq & 0x1) {
						return i;
					} else {
						return i + 1;
					}
				} else {
					if (eq & 0x4) {
						return i + 2;
					} else {
						return i + 3;
					}
				}
			}
		}
		for (;i < e; ++i) {
			if (v[i] == key) {
				return i;
			}
		}
		return -1;
	}
#else
	{
		int64_t i;
		for (i = s; i < e; ++i) {
			if (v[i] == key) {
				return i;
			}
		}
		return -1;
	}
#endif
}
    // Get SignMask
    int SIMDInt32x4Operation::OpGetSignMask(const SIMDValue& value)
    {
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

        // Creates a 4-bit mask from the most significant bits of
        // the 4 single-precision, floating-point values
        // SIMD review: no suitable integer intrinsics, the float version seems working fine
        return _mm_movemask_ps(v.m128_value);
    }
Beispiel #11
0
		void Tree<ElementContainer>::TraversePacket(Context<size,flags> &c,const Selector<size> &selector) const
		{
			bool split=1;

			enum { reflected=!(flags&(isct::fPrimary|isct::fShadow)) };

			bool selectorsFiltered=size<=(reflected?4:isComplex?64:16);
			if(!Selector<size>::full)
				for(int n=0;n<size/4;n++)
					if(selector.Mask4(n)!=0x0f0f0f0f) {
						selectorsFiltered=0;
						break;
					}

			if((Selector<size>::full||selectorsFiltered)
					&& size <= (reflected?4 : isComplex? 64 : 16)) {
				const Vec3q &dir=c.Dir(0);
				bool signsFiltered=1;
				int msk=_mm_movemask_ps(_mm_shuffle_ps(_mm_shuffle_ps(dir.x.m,dir.y.m,0),dir.z.m,0+(2<<2)))&7;

				if(filterSigns) {					
					for(int n=0;n<size;n++) if(GetVecSign(c.Dir(n))!=msk) {
						signsFiltered=0;
						break;
					}
				}

				if(signsFiltered) {
					bool primary = (flags & (isct::fPrimary|isct::fShadow)) && gVals[1];

					if((flags & isct::fShadow) &&!isComplex) {
						floatq dot=1.0f;
						for(int q=1;q<size;q++) dot=Min(dot,c.Dir(0)|c.Dir(q));
						if(ForAny(dot<0.9998f)) primary=0;
					}
					if(separateFirstElement) elements[0].Collide(c,0);
					if(primary) TraversePrimary(c);
					else TraversePacket0(c);

				//	if(primary && (flags & isct::fShadow)) c.stats.Skip();
					split=0;
				}
			}

			if(split) {
				for(int q=0;q<4;q++) {
					Context<size/4,flags> subC(c.Split(q));
					if(flags & isct::fShadow) subC.shadowCache=c.shadowCache;
					TraversePacket(subC,selector.SubSelector(q));
					if(flags & isct::fShadow) c.shadowCache=subC.shadowCache;
				}
			}
		}
Beispiel #12
0
/** @overload */
CV_INLINE int cvCeil( float value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
    __m128 t = _mm_set_ss( value );
    int i = _mm_cvtss_si32(t);
    return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
#elif defined __GNUC__
    int i = (int)value;
    return i + (i < value);
#else
    int i = cvRound(value);
    float diff = (float)(i - value);
    return i + (diff < 0);
#endif
}
Beispiel #13
0
	// Operators
	INLINE bool SVec4::operator==(const SVec4 &rhs) const
	{
#ifdef USE_SSE
		SIMDvec dif = _mm_sub_ps( m_128, rhs.m_128 );
		SIMDvec ep = _mm_set1_ps( math::Epsilon );
		SIMDvec neg_ep = _mm_set1_ps( -math::Epsilon );

		return ( 0xf == _mm_movemask_ps( _mm_and_ps( _mm_cmpgt_ps( ep, dif ), _mm_cmplt_ps( neg_ep, dif ) ) ) );
#else
        return math::IsEqual(m_x, rhs.X()) &&
               math::IsEqual(m_y, rhs.Y()) &&
               math::IsEqual(m_z, rhs.Z()) &&
               math::IsEqual(m_w, rhs.W());
#endif
	}
Beispiel #14
0
__forceinline int _aabbOverlapsFroxel(simdvec3 aabb_center, simdvec3 aabb_extent, const simdvec3* frustum_planes_xyz, const simdfloat* frustum_planes_w, int num_planes)
{
   simdbool test = true;
   for (int i_plane = 0; i_plane < num_planes; ++i_plane)
   {
      simdvec3 plane_normal = frustum_planes_xyz[i_plane];

      simdfloat d = dot(aabb_center, plane_normal);
      simdfloat r = dot(aabb_extent, abs(plane_normal));

      simdbool is_inside = simdbool((d + r) >= -frustum_planes_w[i_plane]);
      test &= is_inside;
   }

   return _mm_movemask_ps(test.val);
}
Beispiel #15
0
__forceinline int _overlap(__m128 sse_plane_normal, __m128 sse_dot_plane, __m128* corner_a, __m128* corner_b, __m128* corner_c, __m128* corner_d)
{
   __m128 dota = _mm_dp_ps(*corner_a, sse_plane_normal, 0x70 | 0x1);
   __m128 dotb = _mm_dp_ps(*corner_b, sse_plane_normal, 0x70 | 0x2);
   __m128 dotc = _mm_dp_ps(*corner_c, sse_plane_normal, 0x70 | 0x4);
   __m128 dotd = _mm_dp_ps(*corner_d, sse_plane_normal, 0x70 | 0x8);

   __m128 all_dots = _mm_add_ps(dota, dotb);
   all_dots = _mm_add_ps(all_dots, dotc);
   all_dots = _mm_add_ps(all_dots, dotd);

   __m128 intersection_test = _mm_sub_ps(all_dots, sse_dot_plane);

   __m128 zero = _mm_setzero_ps();
   intersection_test = _mm_cmplt_ps(intersection_test, zero);

   return _mm_movemask_ps(intersection_test);
}
Beispiel #16
0
/// Test whether this frustum intersects a given axis-aligned bounding box in world space.
///
/// @param[in] rBox  Box to test.
///
/// @return  True if the box intersects this frustum, false if not.
bool Helium::Simd::Frustum::Intersects( const AaBox& rBox ) const
{
    Helium::Simd::Register boxMinVec = rBox.GetMinimum().GetSimdVector();
    Helium::Simd::Register boxMaxVec = rBox.GetMaximum().GetSimdVector();

    Helium::Simd::Register boxX0 = _mm_shuffle_ps( boxMinVec, boxMinVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    Helium::Simd::Register boxX1 = _mm_shuffle_ps( boxMaxVec, boxMaxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    Helium::Simd::Register boxY = _mm_shuffle_ps( boxMinVec, boxMaxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    Helium::Simd::Register boxZ = _mm_unpackhi_ps( boxMinVec, boxMaxVec );
    boxZ = _mm_movelh_ps( boxZ, boxZ );

    PlaneSoa plane;
    Vector3Soa points( boxX0, boxY, boxZ );
    Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros();

    size_t planeCount = ( m_bInfiniteFarClip ? PLANE_FAR : PLANE_MAX );
    for( size_t planeIndex = 0; planeIndex < planeCount; ++planeIndex )
    {
        plane.Load1Splat(
            m_planeA + planeIndex,
            m_planeB + planeIndex,
            m_planeC + planeIndex,
            m_planeD + planeIndex );

        points.m_x = boxX0;
        Helium::Simd::Mask containsPoints0 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec );

        points.m_x = boxX1;
        Helium::Simd::Mask containsPoints1 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec );

        int resultMask = _mm_movemask_ps( Helium::Simd::Or( containsPoints0, containsPoints1 ) );
        if( resultMask == 0 )
        {
            return false;
        }
    }

    return true;
}
Beispiel #17
0
int
nv_vector_eq(const nv_matrix_t *vec1, int j1, const nv_matrix_t *vec2, int j2)
{
	NV_ASSERT(vec1->n == vec2->n);
	
#if NV_ENABLE_SSE2
	{
		__m128 xmm;
		int i = 0;
		int eq;
		int pk_lp = (vec1->n & 0xfffffffc);
		
		for (i = 0; i < pk_lp; i += 4) {
			xmm = _mm_load_ps(&NV_MAT_V(vec2, j2, i));
			xmm = _mm_cmpneq_ps(xmm, *(const __m128 *)&NV_MAT_V(vec1, j1, i));
			eq = _mm_movemask_ps(xmm);
			if (eq != 0) {
				return 0;
			}
		}
		for (i = pk_lp; i < vec1->n; ++i) {
			if (NV_MAT_V(vec1, j1, i) != NV_MAT_V(vec2, j2, i)) {			
				return 0;
			}
		}
		return 1;
	}
#else
	{
		int i;
		for (i = 0; i < vec1->n; ++i) {
			if (NV_MAT_V(vec1, j1, i) != NV_MAT_V(vec2, j2, i)) {
				return 0;
			}
		}
		return 1;
	}
#endif
}
Beispiel #18
0
__m128 member_speed(__m128 cx_m , __m128 cy_m)
{
	__m128 x = _mm_set1_ps(0.0f);
	__m128 y = _mm_set1_ps(0.0f);
	__m128 four_iter = _mm_set1_ps(0.0);
	__m128 temp_mask = _mm_set1_ps(0.0);
	__m128 mask = _mm_set1_ps(1.0);
	__m128 two_squared = _mm_set1_ps(4.0f);
	__m128 two = _mm_set1_ps(2.0f);
	__m128 x_sqr, y_sqr;
	x_sqr = _mm_mul_ps(x, x);
	y_sqr = _mm_mul_ps(y, y);
	int iterations = 0;
	while ( (_mm_movemask_ps( temp_mask = _mm_cmplt_ps(_mm_add_ps(x_sqr, y_sqr),two_squared)) != 0)  && (iterations < MAX_ITS) ){
		__m128 xtemp = _mm_add_ps(_mm_sub_ps(x_sqr, y_sqr), cx_m);
		y = _mm_add_ps(_mm_mul_ps(two, _mm_mul_ps(x, y)), cy_m);
		x = xtemp;
		x_sqr = _mm_mul_ps(x, x);
		y_sqr = _mm_mul_ps(y, y);
		iterations ++;
		four_iter = _mm_add_ps(four_iter, _mm_and_ps(temp_mask, mask));
	}
	return four_iter;
}
Beispiel #19
0
static inline void   sacEvaluateModelSPRT(PROSAC_HEST* p){
	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (    p->epsilon  ));
	float    distSq = p->maxD*p->maxD;
	float*   src = (float*)p->src;
	float*   dst = (float*)p->dst;
	float*   H   = p->H;
	
	
	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;
	
	
	/* VECTOR */
	const __m128 distSqV=_mm_set1_ps(distSq);
	
	const __m128 H00=_mm_set1_ps(H[0]);
	const __m128 H01=_mm_set1_ps(H[1]);
	const __m128 H02=_mm_set1_ps(H[2]);
	const __m128 H10=_mm_set1_ps(H[4]);
	const __m128 H11=_mm_set1_ps(H[5]);
	const __m128 H12=_mm_set1_ps(H[6]);
	const __m128 H20=_mm_set1_ps(H[8]);
	const __m128 H21=_mm_set1_ps(H[9]);
	const __m128 H22=_mm_set1_ps(H[10]);
	
	for(i=0;i<(p->N-3) && p->good;i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;
		x=_mm_load_ps(src+2*i);
		y=_mm_load_ps(src+2*i+4);
		X=_mm_load_ps(dst+2*i);
		Y=_mm_load_ps(dst+2*i+4);
		
		inter0=_mm_unpacklo_ps(x,y);// y1 y0 x1 x0
		inter1=_mm_unpackhi_ps(x,y);// y3 y2 x3 x2
		inter2=_mm_unpacklo_ps(X,Y);// Y1 Y0 X1 X0
		inter3=_mm_unpackhi_ps(X,Y);// Y3 Y2 X3 X2
		
		x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		
		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);
		
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);
		//reprojX = _mm_div_ps(reprojX, reprojZ);
		//reprojY = _mm_div_ps(reprojY, reprojZ);
		
		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);
		
		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);
		
		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);
		
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int msk = _mm_movemask_ps(cmp);
		
		/* ... */
		/*                   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15*/
		unsigned bitCnt[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
		p->inl     += bitCnt[msk];
		
		
		/* SPRT */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];
		
		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)
		
		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;
		
		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;
		
		/* ... */
		isInlier    = reprojDist <= distSq;
		p->inl     += isInlier;
		
		
		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	
	p->N_tested = i;
}
Beispiel #20
0
static inline int    sacIsSampleDegenerate(PROSAC_HEST* p){
	unsigned i0 = p->smpl[0], i1 = p->smpl[1], i2 = p->smpl[2], i3 = p->smpl[3];
	
	/**
	 * Pack the matches selected by the SAC algorithm.
	 * Must be packed  points[0:7]  = {srcx0, srcy0, srcx1, srcy1, srcx2, srcy2, srcx3, srcy3}
	 *                 points[8:15] = {dstx0, dsty0, dstx1, dsty1, dstx2, dsty2, dstx3, dsty3}
	 * Gather 4 points into the vector
	 */
	
	__m128 src10 = _mm_loadl_pi(src10, (__m64*)&p->src[i0]);
	src10        = _mm_loadh_pi(src10, (__m64*)&p->src[i1]);
	__m128 src32 = _mm_loadl_pi(src32, (__m64*)&p->src[i2]);
	src32        = _mm_loadh_pi(src32, (__m64*)&p->src[i3]);
	__m128 dst10 = _mm_loadl_pi(dst10, (__m64*)&p->dst[i0]);
	dst10        = _mm_loadh_pi(dst10, (__m64*)&p->dst[i1]);
	__m128 dst32 = _mm_loadl_pi(dst32, (__m64*)&p->dst[i2]);
	dst32        = _mm_loadh_pi(dst32, (__m64*)&p->dst[i3]);
	
	
	/**
	 * If the matches' source points have common x and y coordinates, abort.
	 */
	
	/**
	 * Check:
	 * packedPoints[0].x == packedPoints[2].x
	 * packedPoints[0].y == packedPoints[2].y
	 * packedPoints[1].x == packedPoints[3].x
	 * packedPoints[1].y == packedPoints[3].y
	 */
	
	__m128 chkEq0 = _mm_cmpeq_ps(src10, src32);
	
	/**
	 * Check:
	 * packedPoints[1].x == packedPoints[2].x
	 * packedPoints[1].y == packedPoints[2].y
	 * packedPoints[0].x == packedPoints[3].x
	 * packedPoints[0].y == packedPoints[3].y
	 */
	
	__m128 chkEq1 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src10, _MM_SHUFFLE(1, 0, 3, 2)), src32);
	
	/**
	 * Check:
	 * packedPoints[0].x == packedPoints[1].x
	 * packedPoints[0].y == packedPoints[1].y
	 * packedPoints[2].x == packedPoints[3].x
	 * packedPoints[2].y == packedPoints[3].y
	 */
	
	__m128 chkEq2 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src32, _MM_SHUFFLE(1, 0, 1, 0)),
	                             _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 2, 3, 2)));
	
	/* Verify */
	if(_mm_movemask_ps(_mm_or_ps(chkEq0, _mm_or_ps(chkEq1, chkEq2)))){
		return 1;
	}
	
	/* If the matches do not satisfy the strong geometric constraint, abort. */
	
	/**
	 * p6420x   = (p6.x, p4.x, p2.x, p0.x)
	 * p6420y   = (p6.y, p4.y, p2.y, p0.y)
	 * p7531x   = (p7.x, p5.x, p3.x, p1.x)
	 * p7531y   = (p7.y, p5.y, p3.y, p1.y)
	 * crosssd0 = p6420y - p7531y                     = (cross2d0, cross0d0, cross2s0, cross0s0)
	 * crosssd1 = p7531x - p6420x                     = (cross2d1, cross0d1, cross2s1, cross0s1)
	 * crosssd2 = p6420x * p7531y  -  p6420y * p7531x = (cross2d2, cross0d2, cross2s2, cross0s2)
	 * 
	 * shufcrosssd0 = (cross0d0, cross2d0, cross0s0, cross2s0)
	 * shufcrosssd1 = (cross0d1, cross2d1, cross0s1, cross2s1)
	 * shufcrosssd2 = (cross0d2, cross2d2, cross0s2, cross2s2)
	 * 
	 * dotsd0   = shufcrosssd0 * p6420x +
	 *            shufcrosssd1 * p6420y + 
	 *            shufcrosssd2
	 *          = (dotd0, dotd2, dots0, dots2)
	 * dotsd1   = shufcrosssd0 * p7531x +
	 *            shufcrosssd1 * p7531y + 
	 *            shufcrosssd2
	 *          = (dotd1, dotd3, dots1, dots3)
	 * 
	 * dots     = shufps(dotsd0, dotsd1, _MM_SHUFFLE(1, 0, 1, 0))
	 * dotd     = shufps(dotsd0, dotsd1, _MM_SHUFFLE(3, 2, 3, 2))
	 *            movmaskps(dots ^ dotd)
	 */
	
	__m128 p3210x       = _mm_shuffle_ps(src10,  src32,  _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p3210y       = _mm_shuffle_ps(src10,  src32,  _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p7654x       = _mm_shuffle_ps(dst10,  dst32,  _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p7654y       = _mm_shuffle_ps(dst10,  dst32,  _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p6420x       = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p6420y       = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p7531x       = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p7531y       = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(3, 1, 3, 1));
	
	__m128 crosssd0     = _mm_sub_ps(p6420y, p7531y);
	__m128 crosssd1     = _mm_sub_ps(p7531x, p6420x);
	__m128 crosssd2     = _mm_sub_ps(_mm_mul_ps(p6420x, p7531y), _mm_mul_ps(p6420y, p7531x));
	
	__m128 shufcrosssd0 = _mm_shuffle_ps(crosssd0, crosssd0, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 shufcrosssd1 = _mm_shuffle_ps(crosssd1, crosssd1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 shufcrosssd2 = _mm_shuffle_ps(crosssd2, crosssd2, _MM_SHUFFLE(2, 3, 0, 1));
	
	__m128 dotsd0       = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p6420x),
	                                            _mm_mul_ps(shufcrosssd1, p6420y)),
	                                 shufcrosssd2);
	__m128 dotsd1       = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p7531x),
	                                            _mm_mul_ps(shufcrosssd1, p7531y)),
	                                 shufcrosssd2);
	
	__m128 dots         = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(0, 1, 0, 1));
	__m128 dotd         = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(2, 3, 2, 3));
	
	//if(_mm_movemask_ps(_mm_cmpge_ps(_mm_setzero_ps(), _mm_mul_ps(dots, dotd)))){
	if(_mm_movemask_epi8(_mm_cmplt_epi32(_mm_xor_si128(_mm_cvtps_epi32(dots), _mm_cvtps_epi32(dotd)), _mm_setzero_si128()))){
		return 1;
	}
	
	
	/* Otherwise, proceed with evaluation */
	_mm_store_ps((float*)&p->pkdPts[0], src10);
	_mm_store_ps((float*)&p->pkdPts[2], src32);
	_mm_store_ps((float*)&p->pkdPts[4], dst10);
	_mm_store_ps((float*)&p->pkdPts[6], dst32);
	
	return 0;
}
Beispiel #21
0
static int
forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc)
{
  register __m128 mpv, dpv, ipv;   /* previous row values                                       */
  register __m128 sv;		   /* temp storage of 1 curr row value in progress              */
  register __m128 dcv;		   /* delayed storage of D(i,q+1)                               */
  register __m128 xEv;		   /* E state: keeps max for Mk->E as we go                     */
  register __m128 xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  __m128   zerov;		   /* splatted 0.0's in a vector                                */
  float    xN, xE, xB, xC, xJ;	   /* special states' scores                                    */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over quads 0..nq-1                                */
  int j;			   /* counter over DD iterations (4 is full serialization)      */
  int Q       = p7O_NQF(om->M);	   /* segment length: # of vectors                              */
  __m128 *dpc = ox->dpf[0];        /* current row, for use in {MDI}MO(dpp,q) access macro       */
  __m128 *dpp;                     /* previous row, for use in {MDI}MO(dpp,q) access macro      */
  __m128 *rp;			   /* will point at om->rfv[x] for residue x[i]                 */
  __m128 *tp;			   /* will point into (and step thru) om->tfv                   */

  /* Initialization. */
  ox->M  = om->M;
  ox->L  = L;
  ox->has_own_scales = TRUE; 	/* all forward matrices control their own scalefactors */
  zerov  = _mm_setzero_ps();
  for (q = 0; q < Q; q++)
    MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov;
  xE    = ox->xmx[p7X_E] = 0.;
  xN    = ox->xmx[p7X_N] = 1.;
  xJ    = ox->xmx[p7X_J] = 0.;
  xB    = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE];
  xC    = ox->xmx[p7X_C] = 0.;

  ox->xmx[p7X_SCALE] = 1.0;
  ox->totscale       = 0.0;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=0, width=8, precision=5*/
#endif

  for (i = 1; i <= L; i++)
    {
      dpp   = dpc;                      
      dpc   = ox->dpf[do_full * i];     /* avoid conditional, use do_full as kronecker delta */
      rp    = om->rfv[dsq[i]];
      tp    = om->tfv;
      dcv   = _mm_setzero_ps();
      xEv   = _mm_setzero_ps();
      xBv   = _mm_set1_ps(xB);

      /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12.  Shift zeros on. */
      mpv   = esl_sse_rightshift_ps(MMO(dpp,Q-1), zerov);
      dpv   = esl_sse_rightshift_ps(DMO(dpp,Q-1), zerov);
      ipv   = esl_sse_rightshift_ps(IMO(dpp,Q-1), zerov);
      
      for (q = 0; q < Q; q++)
	{
	  /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */
	  sv   =                _mm_mul_ps(xBv, *tp);  tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(mpv, *tp)); tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(dpv, *tp)); tp++;
	  sv   = _mm_mul_ps(sv, *rp);                  rp++;
	  xEv  = _mm_add_ps(xEv, sv);
	  
	  /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
	   * {MDI}MX(q) is then the current, not the prev row
	   */
	  mpv = MMO(dpp,q);
	  dpv = DMO(dpp,q);
	  ipv = IMO(dpp,q);

	  /* Do the delayed stores of {MD}(i,q) now that memory is usable */
	  MMO(dpc,q) = sv;
	  DMO(dpc,q) = dcv;

	  /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv
	   */
	  dcv   = _mm_mul_ps(sv, *tp); tp++;

	  /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */
	  sv         =                _mm_mul_ps(mpv, *tp);  tp++;
	  IMO(dpc,q) = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
	}	  

      /* Now the DD paths. We would rather not serialize them but 
       * in an accurate Forward calculation, we have few options.
       */
      /* dcv has carried through from end of q loop above; store it 
       * in first pass, we add M->D and D->D path into DMX
       */
      /* We're almost certainly're obligated to do at least one complete 
       * DD path to be sure: 
       */
      dcv        = esl_sse_rightshift_ps(dcv, zerov);
      DMO(dpc,0) = zerov;
      tp         = om->tfv + 7*Q;	/* set tp to start of the DD's */
      for (q = 0; q < Q; q++) 
	{
	  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));	
	  dcv        = _mm_mul_ps(DMO(dpc,q), *tp); tp++; /* extend DMO(q), so we include M->D and D->D paths */
	}

      /* now. on small models, it seems best (empirically) to just go
       * ahead and serialize. on large models, we can do a bit better,
       * by testing for when dcv (DD path) accrued to DMO(q) is below
       * machine epsilon for all q, in which case we know DMO(q) are all
       * at their final values. The tradeoff point is (empirically) somewhere around M=100,
       * at least on my desktop. We don't worry about the conditional here;
       * it's outside any inner loops.
       */
      if (om->M < 100)
	{			/* Fully serialized version */
	  for (j = 1; j < 4; j++)
	    {
	      dcv = esl_sse_rightshift_ps(dcv, zerov);
	      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
	      for (q = 0; q < Q; q++) 
		{ /* note, extend dcv, not DMO(q); only adding DD paths now */
		  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));	
		  dcv        = _mm_mul_ps(dcv, *tp);   tp++; 
		}	    
	    }
	} 
      else
	{			/* Slightly parallelized version, but which incurs some overhead */
	  for (j = 1; j < 4; j++)
	    {
	      register __m128 cv;	/* keeps track of whether any DD's change DMO(q) */

	      dcv = esl_sse_rightshift_ps(dcv, zerov);
	      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
	      cv  = zerov;
	      for (q = 0; q < Q; q++) 
		{ /* using cmpgt below tests if DD changed any DMO(q) *without* conditional branch */
		  sv         = _mm_add_ps(dcv, DMO(dpc,q));	
		  cv         = _mm_or_ps(cv, _mm_cmpgt_ps(sv, DMO(dpc,q))); 
		  DMO(dpc,q) = sv;	                                    /* store new DMO(q) */
		  dcv        = _mm_mul_ps(dcv, *tp);   tp++;            /* note, extend dcv, not DMO(q) */
		}	    
	      if (! _mm_movemask_ps(cv)) break; /* DD's didn't change any DMO(q)? Then done, break out. */
	    }
	}

      /* Add D's to xEv */
      for (q = 0; q < Q; q++) xEv = _mm_add_ps(DMO(dpc,q), xEv);

      /* Finally the "special" states, which start from Mk->E (->C, ->J->B) */
      /* The following incantation is a horizontal sum of xEv's elements  */
      /* These must follow DD calculations, because D's contribute to E in Forward
       * (as opposed to Viterbi)
       */
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(0, 3, 2, 1)));
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(1, 0, 3, 2)));
      _mm_store_ss(&xE, xEv);

      xN =  xN * om->xf[p7O_N][p7O_LOOP];
      xC = (xC * om->xf[p7O_C][p7O_LOOP]) +  (xE * om->xf[p7O_E][p7O_MOVE]);
      xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) +  (xE * om->xf[p7O_E][p7O_LOOP]);
      xB = (xJ * om->xf[p7O_J][p7O_MOVE]) +  (xN * om->xf[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Sparse rescaling. xE above threshold? trigger a rescaling event.            */
      if (xE > 1.0e4)	/* that's a little less than e^10, ~10% of our dynamic range */
	{
	  xN  = xN / xE;
	  xC  = xC / xE;
	  xJ  = xJ / xE;
	  xB  = xB / xE;
	  xEv = _mm_set1_ps(1.0 / xE);
	  for (q = 0; q < Q; q++)
	    {
	      MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv);
	      DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv);
	      IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv);
	    }
	  ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE;
	  ox->totscale += log(xE);
	  xE = 1.0;		
	}
      else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0;

      /* Storage of the specials.  We could've stored these already
       * but using xE, etc. variables makes it easy to convert this
       * code to O(M) memory versions just by deleting storage steps.
       */
      ox->xmx[i*p7X_NXCELLS+p7X_E] = xE;
      ox->xmx[i*p7X_NXCELLS+p7X_N] = xN;
      ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ;
      ox->xmx[i*p7X_NXCELLS+p7X_B] = xB;
      ox->xmx[i*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=i, width=8, precision=5*/
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T, and flip total score back to log space (nats) */
  /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */
  /* On an underflow (which shouldn't happen), we counterintuitively return infinity:
   * the effect of this is to force the caller to rescore us with full range.
   */
  if       (isnan(xC))        ESL_EXCEPTION(eslERANGE, "forward score is NaN");
  else if  (L>0 && xC == 0.0) ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)");     /* if L==0, xC *should* be 0.0; J5/118 */
  else if  (isinf(xC) == 1)   ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)");

  if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]);
  return eslOK;
}
Beispiel #22
0
// --------------------------------------------------------------
vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter)
// --------------------------------------------------------------
{
    // version avec test de sortie en float
    
    vuint32   iter  = _mm_set1_epi32(0);
    vfloat32  fiter = _mm_set_ps(0,0,0,0);

    vfloat32 x,y,t,t2,zero,un,deux,quatre; 
    // COMPLETER ICI
    int test,i = 0;
    // initialisation des variables
    x      = _mm_set_ps(0,0,0,0);    
    y      = _mm_set_ps(0,0,0,0);
    deux   = _mm_set_ps(2,2,2,2);
    quatre = _mm_set_ps(4,4,4,4);
    un     = _mm_set_ps(1,1,1,1);
    zero   = _mm_set_ps(0,0,0,0);
    
    // iteration zero
	t  = _mm_mul_ps(x, x);
	t2 = _mm_mul_ps(y, y);
	         
	y  = _mm_mul_ps(x,y);
	y  = _mm_mul_ps(y,deux);
	y  = _mm_add_ps(y,b);

	x = _mm_sub_ps(t,t2);
	x = _mm_add_ps(x,a);
    
    // calcul
    while(i<max_iter && _mm_movemask_ps(t) != 15){   
	    

	    t  = _mm_mul_ps(x, x);
        t2 = _mm_mul_ps(y, y);
              
	    y  = _mm_mul_ps(_mm_mul_ps(x,y),deux);        
	    y  = _mm_add_ps(y,b);

	    x = _mm_sub_ps(t,t2);
	    x = _mm_add_ps(x,a);	

	    t2 = _mm_add_ps(t,t2);
	    
	    t2 = _mm_cmple_ps(t2,quatre); 
 			
	    t = _mm_blendv_ps(zero,un,t2);
			
	    fiter = _mm_add_ps(fiter,t);		
			
	    t = _mm_cmpeq_ps(t, zero);
	    //display_vfloat32(t,"%f\t","T :: ");
	    //printf(" MASK::%d \n",_mm_movemask_ps(t));
	    
	    i+=1;
  	}
 	
	
	iter = _mm_cvtps_epi32(fiter);

    return iter;
}
Beispiel #23
0
//Returns true if the triangle is visible
bool DepthBuffer::testTriangle2x2(const vec4f& v0,const vec4f& v1,const vec4f& v2){
	VecS32 colOffset(0, 1, 0, 1);
	VecS32 rowOffset(0, 0, 1, 1);

	vec2i vertex[3];
	vertex[0] = vec2i(int32(v0.x),int32(v0.y));
	vertex[1] = vec2i(int32(v1.x),int32(v1.y));
	vertex[2] = vec2i(int32(v2.x),int32(v2.y));

	// Reject the triangle if any of its verts is behind the nearclip plane
	if(v0.w == 0.0f || v1.w == 0.0f || v2.w == 0.0f) return true;

	float minZ = std::min(v0.z,std::min(v1.z,v2.z));
	VecF32 fixedDepth(minZ);

	// Fab(x, y) =     Ax       +       By     +      C              = 0
	// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
	// Compute A = (ya - yb) for the 3 line segments that make up each triangle
	auto A0 = vertex[1].y - vertex[2].y;
	auto A1 = vertex[2].y - vertex[0].y;
	auto A2 = vertex[0].y - vertex[1].y;

	// Compute B = (xb - xa) for the 3 line segments that make up each triangle
	auto B0 = vertex[2].x - vertex[1].x;
	auto B1 = vertex[0].x - vertex[2].x;
	auto B2 = vertex[1].x - vertex[0].x;

	// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
	auto C0 = vertex[1].x * vertex[2].y - vertex[2].x * vertex[1].y;
	auto C1 = vertex[2].x * vertex[0].y - vertex[0].x * vertex[2].y;
	auto C2 = vertex[0].x * vertex[1].y - vertex[1].x * vertex[0].y;

	// Use bounding box traversal strategy to determine which pixels to rasterize 
	auto minx = std::max(std::min(std::min(vertex[0].x,vertex[1].x),vertex[2].x),0) & (~1);
	auto maxx = std::min(std::max(std::max(vertex[0].x,vertex[1].x),vertex[2].x),size_.x-2);
	auto miny = std::max(std::min(std::min(vertex[0].y,vertex[1].y),vertex[2].y),0) & (~1);
	auto maxy = std::min(std::max(std::max(vertex[0].y,vertex[1].y),vertex[2].y),size_.y-2);

	VecS32 a0(A0);
	VecS32 a1(A1);
	VecS32 a2(A2);
	VecS32 b0(B0);
	VecS32 b1(B1);
	VecS32 b2(B2);

	VecS32 col = VecS32(minx) + colOffset;
	VecS32 row = VecS32(miny) + rowOffset;
	auto rowIdx = miny*size_.x + 2 * minx;
	VecS32 w0_row  = a0 * col + b0 * row + VecS32(C0);
	VecS32 w1_row  = a1 * col + b1 * row + VecS32(C1);
	VecS32 w2_row  = a2 * col + b2 * row + VecS32(C2);

	//Multiply each weight by two(rasterize 2x2 quad at once).
	a0 = shiftl<1>(a0);
	a1 = shiftl<1>(a1);
	a2 = shiftl<1>(a2);
	b0 = shiftl<1>(b0);
	b1 = shiftl<1>(b1);
	b2 = shiftl<1>(b2);

	for(int32 y = miny;y<=maxy;y+=2,rowIdx += 2 * size_.x){
		auto w0 = w0_row;
		auto w1 = w1_row;
		auto w2 = w2_row;

		auto idx = rowIdx;
				
		for(int32 x = minx;x<=maxx;x+=2,idx+=4){
			auto mask = w0|w1|w2;
			auto masks = _mm_movemask_ps(bits2float(mask).simd);
			if(masks != 0xF){
				VecF32 previousDepth = VecF32::load(data_+idx);
				auto cmpMask = ((~masks)&0xF)& _mm_movemask_ps(cmple(fixedDepth,previousDepth).simd);
				if(cmpMask){
					return true;
				}
			}
			
			w0+=a0;
			w1+=a1;
			w2+=a2;
		}
		w0_row += b0;
		w1_row += b1;
		w2_row += b2;
	}
	return false;
}
inline bool compare_byFloatSSE(const char * p1, const char * p2)
{
	return !_mm_movemask_ps(_mm_cmpneq_ps(					/// Кажется, некорректно при сравнении субнормальных float-ов.
		_mm_loadu_ps(reinterpret_cast<const float *>(p1)),
		_mm_loadu_ps(reinterpret_cast<const float *>(p2))));
}
Beispiel #25
0
CPLErr
GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
                                        const void *poOptions,
                                        GUInt32 nPoints,
                                        CPL_UNUSED const double *unused_padfX,
                                        CPL_UNUSED const double *unused_padfY,
                                        CPL_UNUSED const double *unused_padfZ,
                                        double dfXPoint, double dfYPoint,
                                        double *pdfValue,
                                        void* hExtraParamsIn )
{
    size_t i = 0;
    GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn;
    const float* pafX = psExtraParams->pafX;
    const float* pafY = psExtraParams->pafY;
    const float* pafZ = psExtraParams->pafZ;

    const float fEpsilon = 0.0000000000001f;
    const float fXPoint = (float)dfXPoint;
    const float fYPoint = (float)dfYPoint;
    const __m128 xmm_small = _mm_load1_ps((float*)&fEpsilon);
    const __m128 xmm_x = _mm_load1_ps((float*)&fXPoint);
    const __m128 xmm_y = _mm_load1_ps((float*)&fYPoint);
    __m128 xmm_nominator = _mm_setzero_ps();
    __m128 xmm_denominator = _mm_setzero_ps();
    int mask = 0;

#if defined(__x86_64) || defined(_M_X64)
    /* This would also work in 32bit mode, but there are only 8 XMM registers */
    /* whereas we have 16 for 64bit */
#define LOOP_SIZE   8
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x);            /* rx = pafX[i] - fXPoint */
        __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
        __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y);            /* ry = pafY[i] - fYPoint */
        __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
        __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx),               /* r2 = rx * rx + ry * ry */
                                   _mm_mul_ps(xmm_ry, xmm_ry));
        __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
                                     _mm_mul_ps(xmm_ry_4, xmm_ry_4));
        __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);                               /* invr2 = 1.0f / r2 */
        __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
        xmm_nominator = _mm_add_ps(xmm_nominator,                            /* nominator += invr2 * pafZ[i] */
                            _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
        xmm_nominator = _mm_add_ps(xmm_nominator,
                            _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
        xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);           /* denominator += invr2 */
        xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
        mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) |           /* if( r2 < fEpsilon) */
              (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
        if( mask )
            break;
    }
#else
#define LOOP_SIZE   4
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m128 xmm_rx = _mm_sub_ps(_mm_load_ps((float*)pafX + i), xmm_x);           /* rx = pafX[i] - fXPoint */
        __m128 xmm_ry = _mm_sub_ps(_mm_load_ps((float*)pafY + i), xmm_y);           /* ry = pafY[i] - fYPoint */
        __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx),              /* r2 = rx * rx + ry * ry */
                                   _mm_mul_ps(xmm_ry, xmm_ry));
        __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);                              /* invr2 = 1.0f / r2 */
        xmm_nominator = _mm_add_ps(xmm_nominator,                           /* nominator += invr2 * pafZ[i] */
                            _mm_mul_ps(xmm_invr2, _mm_load_ps((float*)pafZ + i)));
        xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);           /* denominator += invr2 */
        mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small));            /* if( r2 < fEpsilon) */
        if( mask )
            break;
    }
#endif

    /* Find which i triggered r2 < fEpsilon */
    if( mask )
    {
        for(int j = 0; j < LOOP_SIZE; j++ )
        {
            if( mask & (1 << j) )
            {
                (*pdfValue) = (pafZ)[i + j];
                return CE_None;
            }
        }
    }

    /* Get back nominator and denominator values for XMM registers */
    float afNominator[4], afDenominator[4];
    _mm_storeu_ps(afNominator, xmm_nominator);
    _mm_storeu_ps(afDenominator, xmm_denominator);

    float fNominator = afNominator[0] + afNominator[1] +
                       afNominator[2] + afNominator[3];
    float fDenominator = afDenominator[0] + afDenominator[1] +
                         afDenominator[2] + afDenominator[3];

    /* Do the few remaining loop iterations */
    for ( ; i < nPoints; i++ )
    {
        const float fRX = pafX[i] - fXPoint;
        const float fRY = pafY[i] - fYPoint;
        const float fR2 =
            fRX * fRX + fRY * fRY;

        // If the test point is close to the grid node, use the point
        // value directly as a node value to avoid singularity.
        if ( fR2 < 0.0000000000001 )
        {
            break;
        }
        else
        {
            const float fInvR2 = 1.0f / fR2;
            fNominator += fInvR2 * pafZ[i];
            fDenominator += fInvR2;
        }
    }

    if( i != nPoints )
    {
        (*pdfValue) = pafZ[i];
    }
    else
    if ( fDenominator == 0.0 )
    {
        (*pdfValue) =
            ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue;
    }
    else
        (*pdfValue) = fNominator / fDenominator;

    return CE_None;
}
Beispiel #26
0
bool DepthBuffer::testAABB(vec3f min,vec3f max){
	vec4f vertices[8];
	gatherBoxVertices(vertices,min,max);
	transformBoxVertices(vertices,viewProjection_);
	boxVerticesToScreenVertices(vertices,clipSpaceToScreenSpaceMultiplier,center_);
	ScreenSpaceQuad faces[6];
	auto faceCount = extractBoxQuads(faces,vertices,aabbFrontFaceMask);
	for(uint32 i = 0;i<faceCount;++i){
		if(testTriangle2x2(faces[i].v[0],faces[i].v[1],faces[i].v[2])) return true;
		if(testTriangle2x2(faces[i].v[2],faces[i].v[3],faces[i].v[0])) return true;
	}
	return false;

	//Transform the AABB vertices to Homogenous clip space
	//vec4f vertices[8];
	vertices[0] = vec4f(min.x,min.y,min.z,1);
	vertices[1] = vec4f(max.x,min.y,min.z,1);
	vertices[2] = vec4f(max.x,max.y,min.z,1);
	vertices[3] = vec4f(min.x,max.y,min.z,1);
	vertices[4] = vec4f(min.x,min.y,max.z,1);
	vertices[5] = vec4f(max.x,min.y,max.z,1);
	vertices[6] = vec4f(max.x,max.y,max.z,1);
	vertices[7] = vec4f(min.x,max.y,max.z,1);
	vec4f clipMin,clipMax;
	for(uint32 i =0;i<8;++i){
		vertices[i] = viewProjection_ * vertices[i]; 
		vertices[i] = vertices[i] * (1.0f/vertices[i].w);
		//Homogenous coordinates => non-homogenous coordinates
		//Determine the min/max for the screen aabb
		clipMin = vec4f::min(vertices[i],clipMin);
		clipMax = vec4f::max(vertices[i],clipMax);
	}
	//Determine the screen aabb which covers the box
	//Clip space coordinates => screen coordinates [-1,1] -> [-320,320] -> [0,640]
	clipMin = vec4f::max(clipMin,vec4f(-1.0f,-1.0f,-1.0f,-1.0f))*clipSpaceToScreenSpaceMultiplier;
	clipMax = vec4f::min(clipMax,vec4f(1.0f,1.0f,1.0f,1.0f))*clipSpaceToScreenSpaceMultiplier;
	vec2i screenMin = vec2i(int32(clipMin.x),int32(clipMin.y))+center_;
	screenMin.x &= (~1);screenMin.y &= (~1);
	vec2i screenMax = vec2i(int32(clipMax.x),int32(clipMax.y))+center_;
	
	auto rowIdx = screenMin.y*size_.x + 2 * screenMin.x;

	float minZ = vertices[0].z;
	for(uint32 i = 1;i< 8;i++){
		minZ = std::min(minZ,vertices[i].z);
	}
	VecF32 flatDepth(minZ);

	//Iterate over the pixels
	for(;screenMin.y < screenMax.y;screenMin.y+=2,rowIdx += 2 * size_.x){
		auto idx = rowIdx;
	for(int32 x = screenMin.x;x<screenMax.x;x+=2,idx+=4){
		//Fetch the distance value for the current pixel.
		auto depth = VecF32::load(data_ + idx);
		vec3f rayo,rayd;
		getRay(rayo,rayd,x,screenMin.y);
		float dist;
		if(!rayAABBIntersect(min,max,rayo,rayd,dist)) continue;
		dist = ((dist-znear_)/zfar_ ) * 2.0f - 1.0f;
		VecF32 flatDepth(dist);
		flatDepth.store(data_+idx);
		auto mask = _mm_movemask_ps(cmple(flatDepth,depth).simd);
		//if(mask != 0)//{
			//return true; //Visible
		//}
			
		//
		//

		//Compute the distance to the aabb (raytrace)
		//vec3f rayo,rayd;
		//getRay(rayo,rayd,x,screenMin.y);
		//float dist;
		
		//Convert the distance from view space to depth space [-1,1]
		//dist = ((dist-znear_)/zfar_ ) * 2.0f - 1.0f;
		//Compare the values.
		//if(dist <= depth){
			//return true;
		//	data_[idx] = dist;
		//}
	} }
	return false;
}
Beispiel #27
0
inline unsigned int to_bitmask(const float4& a)
{
	return _mm_movemask_ps(a.data);
}
Beispiel #28
0
KFR_SINTRIN bool bittestany(const f32sse& x) { return _mm_movemask_ps(*x); }
Beispiel #29
0
KFR_SINTRIN bool bittestall(const f32sse& x) { return !_mm_movemask_ps(*~x); }
void PreviewWorker::processCoherent(const WorkUnit *workUnit, WorkResult *workResult, 
	const bool &stop) {
#if defined(MTS_HAS_COHERENT_RT)
	const RectangularWorkUnit *rect = static_cast<const RectangularWorkUnit *>(workUnit);
	ImageBlock *block = static_cast<ImageBlock *>(workResult);

	block->setOffset(rect->getOffset());
	block->setSize(rect->getSize());

	/* Some constants */
	const int sx = rect->getOffset().x, sy = block->getOffset().y;
	const int ex = sx + rect->getSize().x, ey = sy + rect->getSize().y;
	const int width = rect->getSize().x;
	const SSEVector MM_ALIGN16 xOffset(0.0f, 1.0f, 0.0f, 1.0f);
	const SSEVector MM_ALIGN16 yOffset(0.0f, 0.0f, 1.0f, 1.0f);
	const int pixelOffset[] = {0, 1, width, width+1};
	const __m128 clamping = _mm_set1_ps(1/(m_minDist*m_minDist));
	uint8_t temp[MTS_KD_INTERSECTION_TEMP*4];

	const __m128 camTL[3] = {
		 _mm_set1_ps(m_cameraTL.x),
		 _mm_set1_ps(m_cameraTL.y),
		 _mm_set1_ps(m_cameraTL.z)
	}; 
	const __m128 camDx[3] = {
		 _mm_set1_ps(m_cameraDx.x),
		 _mm_set1_ps(m_cameraDx.y),
		 _mm_set1_ps(m_cameraDx.z)
	}; 
	const __m128 camDy[3] = {
		 _mm_set1_ps(m_cameraDy.x),
		 _mm_set1_ps(m_cameraDy.y),
		 _mm_set1_ps(m_cameraDy.z)
	}; 
	const __m128 lumPos[3] = {
		_mm_set1_ps(m_vpl.its.p.x),
		_mm_set1_ps(m_vpl.its.p.y),
		_mm_set1_ps(m_vpl.its.p.z)
	};
	const __m128 lumDir[3] = {
		_mm_set1_ps(m_vpl.its.shFrame.n.x),
		_mm_set1_ps(m_vpl.its.shFrame.n.y),
		_mm_set1_ps(m_vpl.its.shFrame.n.z)
	};

	/* Some local variables */
	int pos = 0;
	int numRays = 0;
	RayPacket4 MM_ALIGN16 primRay4, secRay4;
	Intersection4 MM_ALIGN16 its4, secIts4;
	RayInterval4 MM_ALIGN16 itv4, secItv4;
	SSEVector MM_ALIGN16 nSecD[3], cosThetaLight, invLengthSquared;
	Spectrum emitted[4], direct[4];
	Intersection its;
	Vector wo, wi;
	its.hasUVPartials = false;

	bool diffuseVPL = false, vplOnSurface = false;
	Spectrum vplWeight;

	if (m_vpl.type == ESurfaceVPL && (m_diffuseSources || m_vpl.its.shape->getBSDF()->getType() == BSDF::EDiffuseReflection)) {
		diffuseVPL = true;
		vplOnSurface = true;
		vplWeight = m_vpl.its.shape->getBSDF()->getDiffuseReflectance(m_vpl.its) * m_vpl.P / M_PI;
	} else if (m_vpl.type == ELuminaireVPL) {
		vplOnSurface = m_vpl.luminaire->getType() & Luminaire::EOnSurface;
		diffuseVPL = m_vpl.luminaire->getType() & Luminaire::EDiffuseDirection;
		EmissionRecord eRec(m_vpl.luminaire, 
			ShapeSamplingRecord(m_vpl.its.p, m_vpl.its.shFrame.n), m_vpl.its.shFrame.n);
		vplWeight = m_vpl.P * m_vpl.luminaire->evalDirection(eRec);
	}

	primRay4.o[0].ps = _mm_set1_ps(m_cameraO.x);
	primRay4.o[1].ps = _mm_set1_ps(m_cameraO.y);
	primRay4.o[2].ps = _mm_set1_ps(m_cameraO.z);
	secItv4.mint.ps = _mm_set1_ps(ShadowEpsilon);

	/* Work on 2x2 sub-blocks */
	for (int y=sy; y<ey; y += 2, pos += width) {
		for (int x=sx; x<ex; x += 2, pos += 2) {
			/* Generate camera rays without normalization */
			const __m128
				xPixel = _mm_add_ps(xOffset.ps, _mm_set1_ps((float) x)),
				yPixel = _mm_add_ps(yOffset.ps, _mm_set1_ps((float) y));

			primRay4.d[0].ps = _mm_add_ps(camTL[0], _mm_add_ps(
				_mm_mul_ps(xPixel, camDx[0]), _mm_mul_ps(yPixel, camDy[0])));
			primRay4.d[1].ps = _mm_add_ps(camTL[1], _mm_add_ps(
				_mm_mul_ps(xPixel, camDx[1]), _mm_mul_ps(yPixel, camDy[1])));
			primRay4.d[2].ps = _mm_add_ps(camTL[2], _mm_add_ps(
				_mm_mul_ps(xPixel, camDx[2]), _mm_mul_ps(yPixel, camDy[2])));

			primRay4.dRcp[0].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[0].ps);
			primRay4.dRcp[1].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[1].ps);
			primRay4.dRcp[2].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[2].ps);

			/* Ray coherence test */
			const int primSignsX = _mm_movemask_ps(primRay4.d[0].ps);
			const int primSignsY = _mm_movemask_ps(primRay4.d[1].ps);
			const int primSignsZ = _mm_movemask_ps(primRay4.d[2].ps);

			const bool primCoherent =
				   (primSignsX == 0 || primSignsX == 0xF)
				&& (primSignsY == 0 || primSignsY == 0xF)
				&& (primSignsZ == 0 || primSignsZ == 0xF);

			/* Trace the primary rays */
			its4.t = SSEConstants::p_inf;
			if (EXPECT_TAKEN(primCoherent)) {
				primRay4.signs[0][0] = primSignsX ? 1 : 0;
				primRay4.signs[1][0] = primSignsY ? 1 : 0;
				primRay4.signs[2][0] = primSignsZ ? 1 : 0;
				m_kdtree->rayIntersectPacket(primRay4, itv4, its4, temp);
			} else {
				m_kdtree->rayIntersectPacketIncoherent(primRay4, itv4, its4, temp);
			}
			numRays += 4;

			/* Generate secondary rays */
			secRay4.o[0].ps = _mm_add_ps(primRay4.o[0].ps, _mm_mul_ps(its4.t.ps, primRay4.d[0].ps));
			secRay4.o[1].ps = _mm_add_ps(primRay4.o[1].ps, _mm_mul_ps(its4.t.ps, primRay4.d[1].ps));
			secRay4.o[2].ps = _mm_add_ps(primRay4.o[2].ps, _mm_mul_ps(its4.t.ps, primRay4.d[2].ps));
			secRay4.d[0].ps = _mm_sub_ps(lumPos[0], secRay4.o[0].ps);
			secRay4.d[1].ps = _mm_sub_ps(lumPos[1], secRay4.o[1].ps);
			secRay4.d[2].ps = _mm_sub_ps(lumPos[2], secRay4.o[2].ps);

			/* Normalization */
			const __m128 
				lengthSquared = _mm_add_ps(_mm_add_ps(
					_mm_mul_ps(secRay4.d[0].ps, secRay4.d[0].ps),
					_mm_mul_ps(secRay4.d[1].ps, secRay4.d[1].ps)),
					_mm_mul_ps(secRay4.d[2].ps, secRay4.d[2].ps)),
				invLength = _mm_rsqrt_ps(lengthSquared);
	
			invLengthSquared.ps = _mm_min_ps(_mm_rcp_ps(lengthSquared), clamping);

			nSecD[0].ps = _mm_mul_ps(secRay4.d[0].ps, invLength);
			nSecD[1].ps = _mm_mul_ps(secRay4.d[1].ps, invLength);
			nSecD[2].ps = _mm_mul_ps(secRay4.d[2].ps, invLength);

			secRay4.dRcp[0].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[0].ps);
			secRay4.dRcp[1].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[1].ps);
			secRay4.dRcp[2].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[2].ps);

			cosThetaLight.ps = _mm_sub_ps(_mm_setzero_ps(),
				_mm_add_ps(_mm_add_ps(
					_mm_mul_ps(nSecD[0].ps, lumDir[0]),
					_mm_mul_ps(nSecD[1].ps, lumDir[1])),
					_mm_mul_ps(nSecD[2].ps, lumDir[2])));
			secItv4.maxt.ps = _mm_set1_ps(1-ShadowEpsilon);

			/* Shading (scalar) --- this is way too much work and should be 
			   rewritten to be smarter in special cases */
			for (int idx=0; idx<4; ++idx) {
				if (EXPECT_NOT_TAKEN(its4.t.f[idx] == std::numeric_limits<float>::infinity())) {
					/* Don't trace a secondary ray */
					secItv4.maxt.f[idx] = 0;
					emitted[idx] = m_scene->LeBackground(Ray(
						Point(primRay4.o[0].f[idx], primRay4.o[1].f[idx], primRay4.o[2].f[idx]),
						Vector(primRay4.d[0].f[idx], primRay4.d[1].f[idx], primRay4.d[2].f[idx]),
						0.0f
					)) * m_backgroundScale;
					memset(&direct[idx], 0, sizeof(Spectrum));
					continue;
				}
				const unsigned int primIndex = its4.primIndex.i[idx];
				const Shape *shape = (*m_shapes)[its4.shapeIndex.i[idx]];
				const BSDF *bsdf = shape->getBSDF();

				if (EXPECT_NOT_TAKEN(!bsdf)) {
					memset(&emitted[idx], 0, sizeof(Spectrum));
					memset(&direct[idx], 0, sizeof(Spectrum));
					continue;
				}

				if (EXPECT_TAKEN(primIndex != KNoTriangleFlag)) {
					const TriMesh *mesh = static_cast<const TriMesh *>(shape);
					const Triangle &t = mesh->getTriangles()[primIndex];
					const Normal *normals = mesh->getVertexNormals();
					const Point2 *texcoords = mesh->getVertexTexcoords();
					const Spectrum *colors = mesh->getVertexColors();
					const TangentSpace * tangents = mesh->getVertexTangents();
					const Float beta  = its4.u.f[idx],
								gamma = its4.v.f[idx],
								alpha = 1.0f - beta - gamma;
					const uint32_t idx0 = t.idx[0], idx1 = t.idx[1], idx2 = t.idx[2];

					if (EXPECT_TAKEN(normals)) {
						const Normal &n0 = normals[idx0],
							  		 &n1 = normals[idx1],
									 &n2 = normals[idx2];
						its.shFrame.n = normalize(n0 * alpha + n1 * beta + n2 * gamma);
					} else {
						const Point *positions = mesh->getVertexPositions();
						const Point &p0 = positions[idx0],
									&p1 = positions[idx1],
									&p2 = positions[idx2];
						Vector sideA = p1 - p0, sideB = p2 - p0;
						Vector n = cross(sideA, sideB);
						Float nLengthSqr = n.lengthSquared();
						if (nLengthSqr != 0)
							n /= std::sqrt(nLengthSqr);
						its.shFrame.n = Normal(n);
					}

					if (EXPECT_TAKEN(texcoords)) {
						const Point2 &t0 = texcoords[idx0],
							  		 &t1 = texcoords[idx1],
									 &t2 = texcoords[idx2];
						its.uv = t0 * alpha + t1 * beta + t2 * gamma;
					} else {
						its.uv = Point2(0.0f);
					}

					if (EXPECT_NOT_TAKEN(colors)) {
						const Spectrum &c0 = colors[idx0],
							  		   &c1 = colors[idx1],
									   &c2 = colors[idx2];
						its.color = c0 * alpha + c1 * beta + c2 * gamma;
					}

					if (EXPECT_NOT_TAKEN(tangents)) {
						const TangentSpace &t0 = tangents[idx0],
							  			   &t1 = tangents[idx1],
										   &t2 = tangents[idx2];
						its.dpdu = t0.dpdu * alpha + t1.dpdu * beta + t2.dpdu * gamma;
						its.dpdv = t0.dpdv * alpha + t1.dpdv * beta + t2.dpdv * gamma;
					}
				} else {
					Ray ray(
						Point(primRay4.o[0].f[idx], primRay4.o[1].f[idx], primRay4.o[2].f[idx]),
						Vector(primRay4.d[0].f[idx], primRay4.d[1].f[idx], primRay4.d[2].f[idx]),
						0.0f
					);
					its.t = its4.t.f[idx];
					shape->fillIntersectionRecord(ray, temp + idx * MTS_KD_INTERSECTION_TEMP + 8, its);
					bsdf = its.shape->getBSDF();
				}

				wo.x = nSecD[0].f[idx]; wo.y = nSecD[1].f[idx]; wo.z = nSecD[2].f[idx];

				if (EXPECT_TAKEN(!shape->isLuminaire())) {
					memset(&emitted[idx], 0, sizeof(Spectrum));
				} else {
					Vector d(-primRay4.d[0].f[idx], -primRay4.d[1].f[idx], -primRay4.d[2].f[idx]);
					emitted[idx] = shape->getLuminaire()->Le(ShapeSamplingRecord(its.p, its.shFrame.n), d);
				}

				if (EXPECT_TAKEN(bsdf->getType() == BSDF::EDiffuseReflection && diffuseVPL)) {
					/* Fast path */
					direct[idx] = (bsdf->getDiffuseReflectance(its) * vplWeight)
						* (std::max((Float) 0.0f, dot(wo, its.shFrame.n))
						* (vplOnSurface ? (std::max(cosThetaLight.f[idx], (Float) 0.0f) * INV_PI) : INV_PI)
						* invLengthSquared.f[idx]);
				} else {
					wi.x = -primRay4.d[0].f[idx];
					wi.y = -primRay4.d[1].f[idx];
					wi.z = -primRay4.d[2].f[idx];
					its.p.x = secRay4.o[0].f[idx];
					its.p.y = secRay4.o[1].f[idx];
					its.p.z = secRay4.o[2].f[idx];
					if (EXPECT_NOT_TAKEN(bsdf->getType() & BSDF::EAnisotropic)) {
						its.shFrame.s = normalize(its.dpdu - its.shFrame.n
							* dot(its.shFrame.n, its.dpdu));
						its.shFrame.t = cross(its.shFrame.n, its.shFrame.s);
					} else {
						coordinateSystem(its.shFrame.n, its.shFrame.s, its.shFrame.t);
					}
					const Float ctLight = cosThetaLight.f[idx];
					wi = normalize(wi);

					its.wi = its.toLocal(wi);
					wo = its.toLocal(wo);

					if (!diffuseVPL) {
						if (m_vpl.type == ESurfaceVPL) {
							BSDFQueryRecord bRec(m_vpl.its, m_vpl.its.toLocal(wi));
							bRec.quantity = EImportance;
							vplWeight = m_vpl.its.shape->getBSDF()->eval(bRec) * m_vpl.P;
						} else {
							EmissionRecord eRec(m_vpl.luminaire, 
								ShapeSamplingRecord(m_vpl.its.p, m_vpl.its.shFrame.n), wi);
							eRec.type = EmissionRecord::EPreview;
							vplWeight = m_vpl.luminaire->evalDirection(eRec) * m_vpl.P;
						}
					}

					if (EXPECT_TAKEN(ctLight >= 0)) {
						direct[idx] = (bsdf->eval(BSDFQueryRecord(its, wo)) * vplWeight
							* ((vplOnSurface ? std::max(ctLight, (Float) 0.0f) : 1.0f) * invLengthSquared.f[idx]));
					} else {
						memset(&direct[idx], 0, sizeof(Spectrum));
					}
				}
				++numRays;
			}

			/* Shoot the secondary rays */
			const int secSignsX = _mm_movemask_ps(secRay4.d[0].ps);
			const int secSignsY = _mm_movemask_ps(secRay4.d[1].ps);
			const int secSignsZ = _mm_movemask_ps(secRay4.d[2].ps);

			const bool secCoherent =
				   (secSignsX == 0 || secSignsX == 0xF)
				&& (secSignsY == 0 || secSignsY == 0xF)
				&& (secSignsZ == 0 || secSignsZ == 0xF);

			/* Shoot the secondary rays */
			secIts4.t = SSEConstants::p_inf;
			if (EXPECT_TAKEN(secCoherent)) {
				secRay4.signs[0][0] = secSignsX ? 1 : 0;
				secRay4.signs[1][0] = secSignsY ? 1 : 0;
				secRay4.signs[2][0] = secSignsZ ? 1 : 0;
				m_kdtree->rayIntersectPacket(secRay4, secItv4, secIts4, temp);
			} else {
				m_kdtree->rayIntersectPacketIncoherent(secRay4, secItv4, secIts4, temp);
			}

			for (int idx=0; idx<4; ++idx) {
				if (EXPECT_TAKEN(secIts4.t.f[idx] == std::numeric_limits<float>::infinity()))
					block->setPixel(pos+pixelOffset[idx], direct[idx]+emitted[idx]);
				else
					block->setPixel(pos+pixelOffset[idx], emitted[idx]);
			}
		}
	}
	block->setExtra(numRays);
#else
	Log(EError, "Coherent raytracing support was not compiled into this binary!");
#endif
}