// Builds a quaternion from the cross product (axis) and dot product (angle cosine)
// of sourceDirection and targetDirection.
// The scalar fallback asserts that both inputs have w == 0 and forwards their xyz parts
// to the overload taking xyz() values.
Quat MUST_USE_RESULT Quat::RotateFromTo(const float4 &sourceDirection, const float4 &targetDirection)
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 12.289 nsecs / 33.144 ticks, Avg: 12.489 nsecs, Worst: 14.210 nsecs
	const simd4f src = sourceDirection.v;
	const simd4f dst = targetDirection.v;

	// Dot product of the two directions, with the sign flipped in the xyz lanes: [+ - - -]
	const simd4f signedCos = negate3_ps(dot4_ps(src, dst));

	// One square root yields all four half-angle terms:
	//   xyz lanes: sin(x/2) = +/-sqrt(0.5 - 0.5*cos x)
	//   w lane:    cos(x/2) = +/-sqrt(0.5 + 0.5*cos x)
	const simd4f half = set1_ps(0.5f);
	const simd4f halfAngleTerms = sqrt_ps(add_ps(half, mul_ps(half, signedCos))); // [cos(x/2), sin(x/2), sin(x/2), sin(x/2)]

	// Rotation axis = normalized cross product of the two directions.
	// NOTE(review): if the directions are parallel the cross product is zero and the
	// reciprocal square root below is taken of zero - confirm callers avoid that input.
	simd4f rotationAxis = cross_ps(src, dst);
	rotationAxis = mul_ps(rotationAxis, rsqrt_ps(dot4_ps(rotationAxis, rotationAxis))); // [0 z y x]

	// Replace the w lane with one so that the final multiply leaves cos(x/2) in w.
	const simd4f ones = add_ps(half, half); // [1 1 1 1]
	const simd4f upperLanes = _mm_unpackhi_ps(rotationAxis, ones); // [_ _ 1 z]
	rotationAxis = _mm_movelh_ps(rotationAxis, upperLanes); // [1 z y x]

	Quat result;
	result.q = mul_ps(rotationAxis, halfAngleTerms);
	return result;
#else
	// Best: 19.970 nsecs / 53.632 ticks, Avg: 20.197 nsecs, Worst: 21.122 nsecs
	// Both inputs are expected to be direction vectors, i.e. to have w == 0.
	assume(EqualAbs(sourceDirection.w, 0.f));
	assume(EqualAbs(targetDirection.w, 0.f));
	return Quat::RotateFromTo(sourceDirection.xyz(), targetDirection.xyz());
#endif
}
// Decomposes this quaternion into a rotation axis and an angle.
// @param axis [out] Receives the axis; its w component is set to zero.
// @param angle [out] Receives 2 * Acos(w).
// This quaternion is expected to be normalized (checked with assume2 on the SIMD path).
void Quat::ToAxisAngle(float4 &axis, float &angle) const
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 35.332 nsecs / 94.328 ticks, Avg: 35.870 nsecs, Worst: 57.607 nsecs
	assume2(this->IsNormalized(), *this, this->Length());

	// Broadcast w to all four lanes.
	const simd4f wLanes = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));

	// 1 / sqrt(1 - w*w): the scale that turns the xyz part into the axis.
	const simd4f oneMinusWSq = sub_ps(set1_ps(1.f), mul_ps(wLanes, wLanes));
	const simd4f invSin = rsqrt_ps(oneMinusWSq);

	angle = Acos(s4f_x(wLanes)) * 2.f;

	const simd4f scaled = mul_ps(q, invSin);

	// Zero out the w lane of the scaled vector before storing it as the axis.
	const simd4f upperLanes = _mm_unpackhi_ps(scaled, zero_ps()); // [_ _ 0 z]
	axis.v = _mm_movelh_ps(scaled, upperLanes); // [0 z y x]
#else
	// Best: 85.258 nsecs / 227.656 ticks, Avg: 85.492 nsecs, Worst: 86.410 nsecs
	// Reuse the float3 overload by viewing the first three components of the float4 in place.
	ToAxisAngle(reinterpret_cast<float3&>(axis), angle);
	axis.w = 0.f;
#endif
}
float Quat::Normalize() { #ifdef MATH_AUTOMATIC_SSE simd4f lenSq = vec4_length_sq_ps(q); simd4f len = rsqrt_ps(lenSq); simd4f isZero = cmplt_ps(lenSq, simd4fEpsilon); // Was the length zero? simd4f normalized = mul_ps(q, len); // Normalize. q = cmov_ps(normalized, float4::unitX.v, isZero); // If length == 0, output the vector (1,0,0,0). len = cmov_ps(len, zero_ps(), isZero); // If length == 0, output zero as length. return s4f_x(len); #else float length = Length(); if (length < 1e-4f) return 0.f; float rcpLength = 1.f / length; x *= rcpLength; y *= rcpLength; z *= rcpLength; w *= rcpLength; return length; #endif }
// Returns the rotation axis of this quaternion: the xyz part scaled by 1 / sqrt(1 - w*w).
// This quaternion is expected to be normalized.
// NOTE(review): for w == +/-1 the term 1 - w*w is zero, so the reciprocal square root is
// taken of zero - confirm callers do not ask for the axis of an identity rotation.
vec Quat::Axis() const
{
	// Checked once here for both code paths (the SIMD path used to repeat this check).
	assume2(this->IsNormalized(), *this, this->Length());
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 6.145 nsecs / 16.88 ticks, Avg: 6.367 nsecs, Worst: 6.529 nsecs
	simd4f cosAngle = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3)); // Broadcast w to all lanes.
	simd4f rcpSinAngle = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosAngle, cosAngle)));
	simd4f a = mul_ps(q, rcpSinAngle);

	// Set the w component to zero.
	simd4f highPart = _mm_unpackhi_ps(a, zero_ps()); // [_ _ 0 z]
	a = _mm_movelh_ps(a, highPart); // [0 z y x]
	return FLOAT4_TO_DIR(a);
#else
	// Best: 6.529 nsecs / 18.152 ticks, Avg: 6.851 nsecs, Worst: 8.065 nsecs
	// Convert cos to sin via the identity sin^2 + cos^2 = 1, and fuse reciprocal and square root to the same instruction,
	// since we are about to divide by it.
	float rcpSinAngle = RSqrt(1.f - w*w);
	return DIR_VEC(x, y, z) * rcpSinAngle;
#endif
}
// Spherically interpolates between this quaternion and q2, along the shorter arc.
// @param q2 The target quaternion; expected to be normalized, as is this quaternion.
// @param t Interpolation factor, expected to be in [0, 1]. The result is weighted
//          sin((1-t)*angle) towards this quaternion and sin(t*angle) towards q2.
// @return The interpolated quaternion, renormalized.
// Both paths use polynomial approximations for arccos and sin, so the result is approximate.
Quat MUST_USE_RESULT Quat::Slerp(const Quat &q2, float t) const
{
	assume(0.f <= t && t <= 1.f);
	assume(IsNormalized());
	assume(q2.IsNormalized());

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	simd4f angle = dot4_ps(q, q2.q); // <q, q2.q>
	simd4f neg = cmplt_ps(angle, zero_ps()); // angle < 0?
	neg = and_ps(neg, set1_ps_hex(0x80000000)); // Convert 0/0xFFFFFFFF mask to a 0/0x80000000 mask.
//	neg = s4i_to_s4f(_mm_slli_epi32(s4f_to_s4i(neg), 31)); // A SSE2-esque way to achieve the above would be this, but this seems to clock slower (12.04 clocks vs 11.97 clocks)
	angle = xor_ps(angle, neg); // if angle was negative, make it positive.
	simd4f one = set1_ps(1.f);
	angle = min_ps(angle, one); // Clamp the (now non-negative) dot product to at most 1 before the arccos approximation.

	// Compute a fast polynomial approximation to arccos(angle).
	// arccos(x): (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f;
	angle = madd_ps(msub_ps(mul_ps(set1_ps(-0.69813170079773212f), angle), angle, set1_ps(0.87266462599716477f)), angle, set1_ps(1.5707963267948966f));

	// Shuffle an appropriate vector from 't' and 'angle' for computing two sines in one go.
	simd4f T = _mm_set_ss(t); // (.., t)
	simd4f oneSubT = sub_ps(one, T); // (.., 1-t)
	T = _mm_movelh_ps(T, oneSubT); // (.., 1-t, .., t)
	angle = mul_ps(angle, T); // (.., (1-t)*angle, .., t*angle)

	// Compute a fast polynomial approximation to sin(t*angle) and sin((1-t)*angle).
	// Here could use "angle = sin_ps(angle);" for precision, but favor speed instead with the following polynomial expansion:
	// sin(x): ((5.64311797634681035370e-03 * x * x - 1.55271410633428644799e-01) * x * x + 9.87862135574673806965e-01) * x
	simd4f angle2 = mul_ps(angle, angle);
	angle = mul_ps(angle, madd_ps(madd_ps(angle2, set1_ps(5.64311797634681035370e-03f), set1_ps(-1.55271410633428644799e-01f)), angle2, set1_ps(9.87862135574673806965e-01f)));

	// Compute the final lerp factors a and b to scale q and q2.
	simd4f a = zzzz_ps(angle); // sin((1-t)*angle) in all lanes.
	simd4f b = xxxx_ps(angle); // sin(t*angle) in all lanes.
	a = xor_ps(a, neg); // Flip the sign of a if the dot product was negative, to rotate along the shorter arc.
	a = mul_ps(q, a);
	a = madd_ps(q2, b, a);

	// The lerp above generates an unnormalized quaternion which needs to be renormalized.
	return mul_ps(a, rsqrt_ps(dot4_ps(a, a)));
#else
	float angle = this->Dot(q2);
	float sign = 1.f; // Multiply by a sign of +/-1 to guarantee we rotate the shorter arc.
	if (angle < 0.f)
	{
		angle = -angle;
		sign = -1.f;
	}

	float a;
	float b;
	// Single-precision literal: avoids promoting the comparison to double. The set of float
	// values that pass the test is the same as with the double literal 0.999.
	if (angle < 0.999f) // perform spherical linear interpolation.
	{
		// angle = Acos(angle); // After this, angle is in the range pi/2 -> 0 as the original angle variable ranged from 0 -> 1.
		angle = (-0.69813170079773212f * angle * angle - 0.87266462599716477f) * angle + 1.5707963267948966f;
		float ta = t*angle;
#ifdef MATH_USE_SINCOS_LOOKUPTABLE
		// If Sin() is based on a lookup table, prefer that over polynomial approximation.
		a = Sin(angle - ta);
		b = Sin(ta);
#else
		// Not using a lookup table, manually compute the two sines by using a very rough approximation.
		float ta2 = ta*ta;
		b = ((5.64311797634681035370e-03f * ta2 - 1.55271410633428644799e-01f) * ta2 + 9.87862135574673806965e-01f) * ta;
		a = angle - ta;
		float a2 = a*a;
		a = ((5.64311797634681035370e-03f * a2 - 1.55271410633428644799e-01f) * a2 + 9.87862135574673806965e-01f) * a;
#endif
	}
	else // If angle is close to taking the denominator to zero, resort to linear interpolation (and normalization).
	{
		a = 1.f - t;
		b = t;
	}
	// Lerp and renormalize.
	return (*this * (a * sign) + q2 * b).Normalized();
#endif
}