Exemplo n.º 1
0
// Builds the quaternion that rotates sourceDirection onto targetDirection.
// @param sourceDirection Direction to rotate from; its w component must be zero.
// @param targetDirection Direction to rotate to; its w component must be zero.
// @return The rotation quaternion.
// NOTE(review): the SSE path treats dot(source, target) as cos(angle), so both inputs are presumably
// expected to be unit length - that is not checked in this function.
Quat MUST_USE_RESULT Quat::RotateFromTo(const float4 &sourceDirection, const float4 &targetDirection)
{
	// Validate the w == 0 precondition in every build configuration. The SSE path below feeds the
	// w components into a 4-wide dot product, so a nonzero w would silently skew the computed cosine.
	assume(EqualAbs(sourceDirection.w, 0.f));
	assume(EqualAbs(targetDirection.w, 0.f));

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 12.289 nsecs / 33.144 ticks, Avg: 12.489 nsecs, Worst: 14.210 nsecs
	// (Register layouts in the comments below are listed in [w z y x] order.)
	simd4f cosAngle = dot4_ps(sourceDirection.v, targetDirection.v);
	cosAngle = negate3_ps(cosAngle); // [+ - - -]
	// XYZ channels use the trigonometric formula sin(x/2) = +/-sqrt(0.5-0.5*cosx))
	// The W channel uses the trigonometric formula cos(x/2) = +/-sqrt(0.5+0.5*cosx))
	simd4f half = set1_ps(0.5f);
	simd4f cosSinHalfAngle = sqrt_ps(add_ps(half, mul_ps(half, cosAngle))); // [cos(x/2), sin(x/2), sin(x/2), sin(x/2)]
	// The rotation axis is the normalized cross product of the two directions.
	// NOTE(review): for parallel inputs the cross product is zero and rsqrt_ps receives 0 - confirm callers avoid that case.
	simd4f axis = cross_ps(sourceDirection.v, targetDirection.v);
	simd4f recipLen = rsqrt_ps(dot4_ps(axis, axis));
	axis = mul_ps(axis, recipLen); // [0 z y x]
	// Set the w component to one, so that the multiply below leaves cos(x/2) in the w channel.
	simd4f one = add_ps(half, half); // [1 1 1 1]
	simd4f highPart = _mm_unpackhi_ps(axis, one); // [_ _ 1 z]
	axis = _mm_movelh_ps(axis, highPart); // [1 z y x]
	Quat q;
	q.q = mul_ps(axis, cosSinHalfAngle); // [cos(x/2), z*sin(x/2), y*sin(x/2), x*sin(x/2)]
	return q;
#else
	// Best: 19.970 nsecs / 53.632 ticks, Avg: 20.197 nsecs, Worst: 21.122 nsecs
	// Delegate to the three-component overload.
	return Quat::RotateFromTo(sourceDirection.xyz(), targetDirection.xyz());
#endif
}
Exemplo n.º 2
0
// Multiplies every element of this matrix by the given scalar, in place.
// @param scalar The factor applied to each element.
// @return A reference to this matrix.
float3x4 &float3x4::operator *=(float scalar)
{
#ifdef MATH_SIMD
	// Broadcast the factor to all lanes once, then scale the three row registers.
	const simd4f factor = set1_ps(scalar);
	for(int i = 0; i < 3; ++i)
		row[i] = mul_ps(row[i], factor);
#else
	// Scalar fallback: visit each element individually.
	for(int r = 0; r < Rows; ++r)
	{
		for(int c = 0; c < Cols; ++c)
		{
			v[r][c] *= scalar;
		}
	}
#endif

	return *this;
}
Exemplo n.º 3
0
// Returns a copy of this matrix with every element multiplied by the given scalar.
// @param scalar The factor applied to each element.
// @return The scaled matrix; this matrix is left unmodified.
float3x4 float3x4::operator *(float scalar) const
{
#ifdef MATH_SIMD
	// Write the scaled rows straight into the result, one register per row.
	float3x4 scaled;
	const simd4f factor = set1_ps(scalar);
	for(int i = 0; i < 3; ++i)
		scaled.row[i] = mul_ps(row[i], factor);
#else
	// Scalar fallback: copy, then reuse the in-place operator.
	float3x4 scaled = *this;
	scaled *= scalar;
#endif

	return scaled;
}
Exemplo n.º 4
0
// Transforms the given AABB by the matrix m and overwrites it with an axis-aligned box around the result.
// The box center is transformed by m, and the new half-extent is accumulated from the absolute values
// of the first three rows of m multiplied component-wise by the old half-extent.
// @param aabb [in, out] The box to transform; its minPoint and maxPoint are replaced.
// @param m The transformation matrix to apply.
void AABBTransformAsAABB_SIMD(AABB &aabb, const float4x4 &m)
{
	simd4f boxMin = aabb.minPoint;
	simd4f boxMax = aabb.maxPoint;

	// Center and half-extent of the box before the transform.
	simd4f oldCenter = muls_ps(add_ps(boxMin, boxMax), 0.5f);
	simd4f oldExtent = sub_ps(oldCenter, boxMin);

	// Per-row absolute contributions to the new half-extent; the fourth input is all zeros.
	simd4f extentX = abs_ps(mul_ps(m.row[0], oldExtent));
	simd4f extentY = abs_ps(mul_ps(m.row[1], oldExtent));
	simd4f extentZ = abs_ps(mul_ps(m.row[2], oldExtent));
	simd4f extentW = zero_ps();
	simd4f movedExtent = hadd4_ps(extentX, extentY, extentZ, extentW);

	// The center goes through the full matrix transform.
	simd4f movedCenter = mat4x4_mul_vec4(m.row, oldCenter);

	aabb.minPoint = sub_ps(movedCenter, movedExtent);
	aabb.maxPoint = add_ps(movedCenter, movedExtent);
}
Exemplo n.º 5
0
// Decomposes this quaternion into a rotation axis and a rotation angle.
// @param axis [out] Receives the rotation axis; its w component is set to zero.
// @param angle [out] Receives the rotation angle, computed as 2 * Acos(w).
void Quat::ToAxisAngle(float4 &axis, float &angle) const
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 35.332 nsecs / 94.328 ticks, Avg: 35.870 nsecs, Worst: 57.607 nsecs
	assume2(this->IsNormalized(), *this, this->Length());
	// Broadcast w, the cosine of the half angle, to every lane.
	simd4f cosHalf = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
	// 1 / sin(half angle), derived through sin^2 = 1 - cos^2.
	simd4f invSinHalf = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosHalf, cosHalf)));
	angle = Acos(s4f_x(cosHalf)) * 2.f;
	simd4f scaled = mul_ps(q, invSinHalf);

	// Keep x, y and z of the scaled vector and force the w lane to zero.
	simd4f upper = _mm_unpackhi_ps(scaled, zero_ps()); // [_ _ 0 z]
	axis.v = _mm_movelh_ps(scaled, upper); // [0 z y x]
#else
	// Best: 85.258 nsecs / 227.656 ticks, Avg: 85.492 nsecs, Worst: 86.410 nsecs
	// Let the three-component overload fill in xyz, then clear w explicitly.
	ToAxisAngle(reinterpret_cast<float3&>(axis), angle);
	axis.w = 0.f;
#endif
}
Exemplo n.º 6
0
// Returns a copy of this matrix with every element divided by the given scalar.
// @param scalar The divisor; asserted to be nonzero.
// @return The divided matrix; this matrix is left unmodified.
float3x4 float3x4::operator /(float scalar) const
{
	assume(!EqualAbs(scalar, 0));

#ifdef MATH_SIMD
	// Perform a single division to get the reciprocal, then multiply each row by it.
	float3x4 quotient;
	const simd4f reciprocal = div_ps(set1_ps(1.f), set1_ps(scalar));
	for(int i = 0; i < 3; ++i)
		quotient.row[i] = mul_ps(row[i], reciprocal);
#else
	// Scalar fallback: copy, then reuse the in-place operator.
	float3x4 quotient = *this;
	quotient /= scalar;
#endif

	return quotient;
}
Exemplo n.º 7
0
// Computes the product of a scale operation and a 4x4 matrix.
// Rows 0, 1 and 2 of rhs are multiplied by the x, y and z scale factors respectively;
// row 3 is copied through unchanged.
// @param lhs The scale operation on the left-hand side.
// @param rhs The matrix on the right-hand side.
// @return The resulting matrix; checked against lhs.ToFloat4x4() * rhs by mathassert.
float4x4 operator *(const ScaleOp &lhs, const float4x4 &rhs)
{
	float4x4 ret;
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SIMD)
	// Each row is scaled by the matching scale component, broadcast across all lanes.
	ret.row[0] = mul_ps(rhs.row[0], xxxx_ps(lhs.scale.v));
	ret.row[1] = mul_ps(rhs.row[1], yyyy_ps(lhs.scale.v));
	ret.row[2] = mul_ps(rhs.row[2], zzzz_ps(lhs.scale.v));
	ret.row[3] = rhs.row[3];
#else
	// Scalar fallback: process the matrix one column at a time.
	for(int col = 0; col < 4; ++col)
	{
		ret[0][col] = rhs[0][col] * lhs.scale.x;
		ret[1][col] = rhs[1][col] * lhs.scale.y;
		ret[2][col] = rhs[2][col] * lhs.scale.z;
		ret[3][col] = rhs[3][col];
	}
#endif
	mathassert(ret.Equals(lhs.ToFloat4x4() * rhs));
	return ret;
}
Exemplo n.º 8
0
// Divides every element of this matrix by the given scalar, in place.
// @param scalar The divisor; asserted to be nonzero.
// @return A reference to this matrix.
float3x4 &float3x4::operator /=(float scalar)
{
	assume(!EqualAbs(scalar, 0));

#ifdef MATH_SIMD
	// Perform a single division to get the reciprocal, then multiply each row by it.
	const simd4f reciprocal = div_ps(set1_ps(1.f), set1_ps(scalar));
	for(int i = 0; i < 3; ++i)
		row[i] = mul_ps(row[i], reciprocal);
#else
	// Scalar fallback: compute the reciprocal once and multiply each element by it.
	const float reciprocal = 1.f / scalar;
	for(int r = 0; r < Rows; ++r)
	{
		for(int c = 0; c < Cols; ++c)
		{
			v[r][c] *= reciprocal;
		}
	}
#endif

	return *this;
}
Exemplo n.º 9
0
// Sets this quaternion from a rotation axis and a rotation angle.
// @param axis The rotation axis; asserted to be normalized and to have w == 0.
// @param angle The rotation angle; asserted to be finite.
void Quat::SetFromAxisAngle(const float4 &axis, float angle)
{
	assume1(EqualAbs(axis.w, 0.f), axis);
	assume2(axis.IsNormalized(1e-4f), axis, axis.Length4());
	assume1(MATH_NS::IsFinite(angle), angle);

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 26.499 nsecs / 71.024 ticks, Avg: 26.856 nsecs, Worst: 27.651 nsecs
	// Evaluate the sine and cosine of the half angle in every lane.
	simd4f halfAngle = mul_ps(set1_ps(angle), set1_ps(0.5f));
	simd4f sinHalf, cosHalf;
	sincos_ps(halfAngle, &sinHalf, &cosHalf);

	// xyz = axis * sin(angle/2).
	simd4f scaledAxis = mul_ps(axis, sinHalf);

	// Keep x, y and z of the scaled axis and place cos(angle/2) into the w lane.
	simd4f upper = _mm_unpackhi_ps(scaledAxis, cosHalf); // [_ _ cos z]
	q = _mm_movelh_ps(scaledAxis, upper); // [cos z y x]
#else
	// Best: 36.868 nsecs / 98.312 ticks, Avg: 36.980 nsecs, Worst: 41.477 nsecs
	// Delegate to the three-component overload.
	SetFromAxisAngle(axis.xyz(), angle);
#endif
}
Exemplo n.º 10
0
// Returns the rotation axis of this quaternion.
// The quaternion is asserted to be normalized. The axis is obtained by dividing the xyz part
// by sin(angle/2), which is derived from w = cos(angle/2) through sin^2 + cos^2 = 1.
// NOTE(review): when |w| == 1 the reciprocal square root receives 0, so the result is presumably
// not finite - confirm that callers do not query the axis of an identity rotation.
// @return The rotation axis as a direction vector.
vec Quat::Axis() const
{
	// Checked once here for both code paths.
	assume2(this->IsNormalized(), *this, this->Length());
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 6.145 nsecs / 16.88 ticks, Avg: 6.367 nsecs, Worst: 6.529 nsecs
	// Broadcast w to all lanes and compute 1/sqrt(1 - w*w) for each of them.
	simd4f cosAngle = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
	simd4f rcpSinAngle = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosAngle, cosAngle)));
	simd4f a = mul_ps(q, rcpSinAngle);

	// Set the w component to zero.
	simd4f highPart = _mm_unpackhi_ps(a, zero_ps()); // [_ _ 0 z]
	a = _mm_movelh_ps(a, highPart); // [0 z y x]
	return FLOAT4_TO_DIR(a);
#else
	// Best: 6.529 nsecs / 18.152 ticks, Avg: 6.851 nsecs, Worst: 8.065 nsecs

	// Convert cos to sin via the identity sin^2 + cos^2 = 1, and fuse reciprocal and square root to the same instruction,
	// since we are about to divide by it.
	float rcpSinAngle = RSqrt(1.f - w*w);
	return DIR_VEC(x, y, z) * rcpSinAngle;
#endif
}
Exemplo n.º 11
0
// Normalizes this quaternion in place.
// @return The length the quaternion had before normalization, or 0 if it was too short to normalize.
// NOTE(review): the two code paths still differ for degenerate input. The SSE path replaces the
// quaternion with float4::unitX and tests the squared length against simd4fEpsilon, whereas the
// scalar path leaves the quaternion untouched and tests the length against 1e-4f. Confirm which
// behavior callers rely on before unifying them.
float Quat::Normalize()
{
#ifdef MATH_AUTOMATIC_SSE
	simd4f lenSq = vec4_length_sq_ps(q);
	simd4f rcpLen = vec4_rsqrt(lenSq); // 1 / length, used only for scaling the components.
	simd4f isZero = cmplt_ps(lenSq, simd4fEpsilon); // Was the length zero?
	simd4f normalized = mul_ps(q, rcpLen); // Normalize.
	q = cmov_ps(normalized, float4::unitX.v, isZero); // If length == 0, output the vector (1,0,0,0).
	// Report the length itself rather than its reciprocal, and report 0 for the degenerate case,
	// so that the return value agrees with the scalar path below.
	simd4f len = cmov_ps(sqrt_ps(lenSq), zero_ps(), isZero);
	return s4f_x(len);
#else
	float length = Length();
	if (length < 1e-4f)
		return 0.f; // Too short to normalize; the quaternion is left unchanged.
	float rcpLength = 1.f / length;
	x *= rcpLength;
	y *= rcpLength;
	z *= rcpLength;
	w *= rcpLength;
	return length;
#endif
}
Exemplo n.º 12
0
// Tests whether the parametric range [tNear, tFar] of a line intersects this AABB, using a
// branch-free SIMD slab test.
// @param rayPos The origin point of the line.
// @param rayDir The direction of the line; asserted to be normalized.
// @param tNear The start of the parametric range to test; asserted to be <= tFar.
// @param tFar The end of the parametric range to test.
// @return True if the line intersects the box within the given range.
// Note: tNear and tFar are taken by value, so the clipped interval of intersection is not reported
// back to the caller (the stores that would do so are commented out below).
bool AABB::IntersectLineAABB_SSE(const float4 &rayPos, const float4 &rayDir, float tNear, float tFar) const
{
	assume(rayDir.IsNormalized4());
	assume(tNear <= tFar && "AABB::IntersectLineAABB: User gave a degenerate line as input for the intersection test!");
	/* For reference, this is the C++ form of the vectorized SSE code below.
	   (Note that the SSE code performs its final range comparison with <= rather than <.)

	float4 recipDir = rayDir.RecipFast4();
	float4 t1 = (aabbMinPoint - rayPos).Mul(recipDir);
	float4 t2 = (aabbMaxPoint - rayPos).Mul(recipDir);
	float4 near = t1.Min(t2);
	float4 far = t1.Max(t2);
	float4 rayDirAbs = rayDir.Abs();

	if (rayDirAbs.x > 1e-4f) // ray is not parallel to the slab planes of this axis
	{
		tNear = Max(near.x, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.x, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.x < aabbMinPoint.x || rayPos.x > aabbMaxPoint.x) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.y > 1e-4f) // ray is not parallel to the slab planes of this axis
	{
		tNear = Max(near.y, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.y, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.y < aabbMinPoint.y || rayPos.y > aabbMaxPoint.y) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.z > 1e-4f) // ray is not parallel to the slab planes of this axis
	{
		tNear = Max(near.z, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.z, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.z < aabbMinPoint.z || rayPos.z > aabbMaxPoint.z) // early-out if the ray can't possibly enter the box.
		return false;

	return tNear < tFar;
	*/

	simd4f recipDir = rcp_ps(rayDir.v);
	// Note: The above performs an approximate reciprocal (11 bits of precision).
	// For a full precision reciprocal, perform a div:
//	simd4f recipDir = div_ps(set1_ps(1.f), rayDir.v);

	// Parametric distances along the line to the min and max planes of each axis.
	simd4f t1 = mul_ps(sub_ps(minPoint, rayPos.v), recipDir);
	simd4f t2 = mul_ps(sub_ps(maxPoint, rayPos.v), recipDir);

	// Per axis, the smaller distance is where the line enters the slab and the larger is where it exits.
	simd4f nearD = min_ps(t1, t2); // [0 n3 n2 n1]
	simd4f farD = max_ps(t1, t2);  // [0 f3 f2 f1]

	// Check if the ray direction is parallel to any of the cardinal axes, and if so,
	// mask those [near, far] ranges away from the hit test computations.
	simd4f rayDirAbs = abs_ps(rayDir.v);

	const simd4f epsilon = set1_ps(1e-4f);
	// zeroDirections[i] will be nonzero for each axis i the ray is parallel to.
	simd4f zeroDirections = cmple_ps(rayDirAbs, epsilon);

	const simd4f floatInf = set1_ps(FLOAT_INF);
	const simd4f floatNegInf = set1_ps(-FLOAT_INF);

	// If the ray is parallel to one of the axes, replace the slab range for that axis
	// with [-inf, inf] range instead. (which is a no-op in the comparisons below)
	nearD = cmov_ps(nearD, floatNegInf, zeroDirections);
	farD = cmov_ps(farD, floatInf, zeroDirections);

	// Next, we need to compute horizontally max(nearD[0], nearD[1], nearD[2]) and min(farD[0], farD[1], farD[2])
	// to see if there is an overlap in the hit ranges.
	// Each of v1..v3 carries one axis: its near value in the low lanes and its far value in the high lanes.
	simd4f v1 = axx_bxx_ps(nearD, farD); // [f1 f1 n1 n1]
	simd4f v2 = ayy_byy_ps(nearD, farD); // [f2 f2 n2 n2]
	simd4f v3 = azz_bzz_ps(nearD, farD); // [f3 f3 n3 n3]
	nearD = max_ps(v1, max_ps(v2, v3)); // Low lanes: the largest of the three near values.
	farD = min_ps(v1, min_ps(v2, v3)); // High lanes: the smallest of the three far values.
	farD = wwww_ps(farD); // Unpack the result from high offset in the register.
	// Clip against the parametric range supplied by the caller.
	nearD = max_ps(nearD, setx_ps(tNear));
	farD = min_ps(farD, setx_ps(tFar));

	// Finally, test if the ranges overlap.
	simd4f rangeIntersects = cmple_ps(nearD, farD); // Only x channel used, higher ones ignored.

	// To store out the interval of intersection, uncomment the following:
	// These are disabled, since without these, the whole function runs without a single memory store,
	// which has been profiled to be very fast! Uncommenting these causes an order-of-magnitude slowdown.
	// For now, using the SSE version only where the tNear and tFar ranges are not interesting.
//	_mm_store_ss(&tNear, nearD);
//	_mm_store_ss(&tFar, farD);

	// To avoid false positives, need to have an additional rejection test for each cardinal axis the ray direction
	// is parallel to.
	// out2[i] becomes nonzero if the ray origin lies outside the [min, max] range of the box on axis i.
	simd4f out2 = cmplt_ps(rayPos.v, minPoint);
	simd4f out3 = cmpgt_ps(rayPos.v, maxPoint);
	out2 = or_ps(out2, out3);
	// Keep only the axes that are both parallel to the ray and have the origin outside the slab.
	zeroDirections = and_ps(zeroDirections, out2);

	// Fold the y and z rejection flags into the x channel, which is the only channel tested below.
	simd4f yOut = yyyy_ps(zeroDirections);
	simd4f zOut = zzzz_ps(zeroDirections);

	zeroDirections = or_ps(or_ps(zeroDirections, yOut), zOut);
	// Intersection occurs if the slab ranges had positive overlap and if the test was not rejected by the ray being
	// parallel to some cardinal axis.
	simd4f intersects = andnot_ps(zeroDirections, rangeIntersects);
	// Turn the x channel of the mask into a boolean: masking epsilon with it leaves epsilon intact only
	// when the mask has all bits set, so the equality comparison below succeeds exactly in that case.
	simd4f epsilonMasked = and_ps(epsilon, intersects);
	return comieq_ss(epsilon, epsilonMasked) != 0;
}
Exemplo n.º 13
0
// Spherically interpolates between this quaternion and q2 along the shorter arc, using fast
// polynomial approximations for arccos and sin instead of exact trigonometric functions.
// @param q2 The target quaternion; asserted to be normalized.
// @param t The interpolation weight; asserted to lie in the range [0, 1].
// @return The interpolated quaternion, renormalized. This quaternion is asserted to be normalized.
Quat MUST_USE_RESULT Quat::Slerp(const Quat &q2, float t) const
{
	assume(0.f <= t && t <= 1.f);
	assume(IsNormalized());
	assume(q2.IsNormalized());

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// NOTE(review): unlike the scalar path below, this path has no linear-interpolation fallback for
	// nearly identical inputs. When the approximated angle comes out as ~0, both weights a and b are ~0
	// and the final rsqrt normalization operates on a near-zero vector - confirm this is acceptable.
	simd4f angle = dot4_ps(q, q2.q); // <q, q2.q>
	simd4f neg = cmplt_ps(angle, zero_ps()); // angle < 0?
	neg = and_ps(neg, set1_ps_hex(0x80000000)); // Convert 0/0xFFFFFFFF mask to a 0/0x80000000 (sign bit) mask.
//	neg = s4i_to_s4f(_mm_slli_epi32(s4f_to_s4i(neg), 31)); // A SSE2-esque way to achieve the above would be this, but this seems to clock slower (12.04 clocks vs 11.97 clocks)
	angle = xor_ps(angle, neg); // if angle was negative, make it positive.
	simd4f one = set1_ps(1.f);
	angle = min_ps(angle, one); // Clamp the (now non-negative) dot product so that it is at most 1.

	// Compute a fast polynomial approximation to arccos(angle).
	// arccos(x): (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f;
	angle = madd_ps(msub_ps(mul_ps(set1_ps(-0.69813170079773212f), angle), angle, set1_ps(0.87266462599716477f)), angle, set1_ps(1.5707963267948966f));

	// Shuffle an appropriate vector from 't' and 'angle' for computing two sines in one go.
	// After the shuffle, the x lane holds t and the z lane holds 1-t.
	simd4f T = _mm_set_ss(t); // (.., t)
	simd4f oneSubT = sub_ps(one, T); // (.., 1-t)
	T = _mm_movelh_ps(T, oneSubT); // (.., 1-t, .., t)
	angle = mul_ps(angle, T); // (.., (1-t)*angle, .., t*angle)

	// Compute a fast polynomial approximation to sin(t*angle) and sin((1-t)*angle).
	// Here could use "angle = sin_ps(angle);" for precision, but favor speed instead with the following polynomial expansion:
	// sin(x): ((5.64311797634681035370e-03 * x * x - 1.55271410633428644799e-01) * x * x + 9.87862135574673806965e-01) * x
	simd4f angle2 = mul_ps(angle, angle);
	angle = mul_ps(angle, madd_ps(madd_ps(angle2, set1_ps(5.64311797634681035370e-03f), set1_ps(-1.55271410633428644799e-01f)), angle2, set1_ps(9.87862135574673806965e-01f)));

	// Compute the final lerp factors a and b to scale q and q2.
	simd4f a = zzzz_ps(angle); // sin((1-t)*angle), the weight for this quaternion.
	simd4f b = xxxx_ps(angle); // sin(t*angle), the weight for q2.
	a = xor_ps(a, neg); // Flip the sign of a if the dot product was negative, to rotate along the shorter arc.
	a = mul_ps(q, a);
	a = madd_ps(q2, b, a);

	// The lerp above generates an unnormalized quaternion which needs to be renormalized.
	return mul_ps(a, rsqrt_ps(dot4_ps(a, a)));
#else
	float angle = this->Dot(q2);
	float sign = 1.f; // Multiply by a sign of +/-1 to guarantee we rotate the shorter arc.
	if (angle < 0.f)
	{
		angle = -angle;
		sign = -1.f;
	}

	float a; // Weight applied to this quaternion.
	float b; // Weight applied to q2.
	if (angle < 0.999) // perform spherical linear interpolation.
	{
		// angle = Acos(angle); // After this, angle is in the range pi/2 -> 0 as the original angle variable ranged from 0 -> 1.
		// The exact Acos above is replaced with the following polynomial approximation.
		angle = (-0.69813170079773212f * angle * angle - 0.87266462599716477f) * angle + 1.5707963267948966f;

		float ta = t*angle;
#ifdef MATH_USE_SINCOS_LOOKUPTABLE
		// If Sin() is based on a lookup table, prefer that over polynomial approximation.
		a = Sin(angle - ta);
		b = Sin(ta);
#else
		// Not using a lookup table, manually compute the two sines by using a very rough approximation.
		float ta2 = ta*ta;
		b = ((5.64311797634681035370e-03f * ta2 - 1.55271410633428644799e-01f) * ta2 + 9.87862135574673806965e-01f) * ta;
		a = angle - ta;
		float a2 = a*a;
		a = ((5.64311797634681035370e-03f * a2 - 1.55271410633428644799e-01f) * a2 + 9.87862135574673806965e-01f) * a;
#endif
	}
	else // If angle is close to taking the denominator to zero, resort to linear interpolation (and normalization).
	{
		a = 1.f - t;
		b = t;
	}
	// Lerp and renormalize.
	return (*this * (a * sign) + q2 * b).Normalized();
#endif
}