bool AABB::Contains(const vec &point) const { // Benchmarking this code is very difficult, since branch prediction makes the scalar version // look very good. In isolation the scalar version might be better, however when joined with // other SSE computation, the SIMD variants are probably more efficient because the data is // already "hot" in the registers. Therefore favoring the SSE version over the scalar version // when possible. #if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SIMD) // Benchmark 'AABBContains_positive': AABB::Contains(point) positive // Best: 2.048 nsecs / 3.5128 ticks, Avg: 2.241 nsecs, Worst: 4.277 nsecs // Benchmark 'AABBContains_negative': AABB::Contains(point) negative // Best: 2.048 nsecs / 3.467 ticks, Avg: 2.115 nsecs, Worst: 4.156 nsecs // Benchmark 'AABBContains_unpredictable': AABB::Contains(point) unpredictable // Best: 2.590 nsecs / 4.4106 ticks, Avg: 2.978 nsecs, Worst: 6.084 nsecs simd4f a = cmplt_ps(point, minPoint); simd4f b = cmpgt_ps(point, maxPoint); a = or_ps(a, b); return allzero_ps(a) != 0; #else // Benchmark 'AABBContains_positive': AABB::Contains(point) positive // Best: 2.108 nsecs / 3.6022 ticks, Avg: 2.232 nsecs, Worst: 4.638 nsecs // Benchmark 'AABBContains_negative': AABB::Contains(point) negative // Best: 1.988 nsecs / 3.361 ticks, Avg: 2.148 nsecs, Worst: 4.457 nsecs // Benchmark 'AABBContains_unpredictable': AABB::Contains(point) unpredictable // Best: 3.554 nsecs / 6.0764 ticks, Avg: 3.803 nsecs, Worst: 6.264 nsecs return minPoint.x <= point.x && point.x <= maxPoint.x && minPoint.y <= point.y && point.y <= maxPoint.y && minPoint.z <= point.z && point.z <= maxPoint.z; #endif }
bool AABB::Contains(const vec &aabbMinPoint, const vec &aabbMaxPoint) const { #if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SIMD) simd4f a = cmplt_ps(aabbMinPoint, minPoint); simd4f b = cmpgt_ps(aabbMaxPoint, maxPoint); a = or_ps(a, b); return allzero_ps(a) != 0; #else return minPoint.x <= aabbMinPoint.x && maxPoint.x >= aabbMaxPoint.x && minPoint.y <= aabbMinPoint.y && maxPoint.y >= aabbMaxPoint.y && minPoint.z <= aabbMinPoint.z && maxPoint.z >= aabbMaxPoint.z; #endif }
float Quat::Normalize() { #ifdef MATH_AUTOMATIC_SSE simd4f lenSq = vec4_length_sq_ps(q); simd4f len = vec4_rsqrt(lenSq); simd4f isZero = cmplt_ps(lenSq, simd4fEpsilon); // Was the length zero? simd4f normalized = mul_ps(q, len); // Normalize. q = cmov_ps(normalized, float4::unitX.v, isZero); // If length == 0, output the vector (1,0,0,0). return s4f_x(len); #else float length = Length(); if (length < 1e-4f) return 0.f; float rcpLength = 1.f / length; x *= rcpLength; y *= rcpLength; z *= rcpLength; w *= rcpLength; return length; #endif }
bool AABB::IntersectLineAABB_SSE(const float4 &rayPos, const float4 &rayDir, float tNear, float tFar) const { assume(rayDir.IsNormalized4()); assume(tNear <= tFar && "AABB::IntersectLineAABB: User gave a degenerate line as input for the intersection test!"); /* For reference, this is the C++ form of the vectorized SSE code below. float4 recipDir = rayDir.RecipFast4(); float4 t1 = (aabbMinPoint - rayPos).Mul(recipDir); float4 t2 = (aabbMaxPoint - rayPos).Mul(recipDir); float4 near = t1.Min(t2); float4 far = t1.Max(t2); float4 rayDirAbs = rayDir.Abs(); if (rayDirAbs.x > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.x, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.x, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.x < aabbMinPoint.x || rayPos.x > aabbMaxPoint.x) // early-out if the ray can't possibly enter the box. return false; if (rayDirAbs.y > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.y, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.y, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.y < aabbMinPoint.y || rayPos.y > aabbMaxPoint.y) // early-out if the ray can't possibly enter the box. return false; if (rayDirAbs.z > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.z, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.z, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.z < aabbMinPoint.z || rayPos.z > aabbMaxPoint.z) // early-out if the ray can't possibly enter the box. return false; return tNear < tFar; */ simd4f recipDir = rcp_ps(rayDir.v); // Note: The above performs an approximate reciprocal (11 bits of precision). 
// For a full precision reciprocal, perform a div: // simd4f recipDir = div_ps(set1_ps(1.f), rayDir.v); simd4f t1 = mul_ps(sub_ps(minPoint, rayPos.v), recipDir); simd4f t2 = mul_ps(sub_ps(maxPoint, rayPos.v), recipDir); simd4f nearD = min_ps(t1, t2); // [0 n3 n2 n1] simd4f farD = max_ps(t1, t2); // [0 f3 f2 f1] // Check if the ray direction is parallel to any of the cardinal axes, and if so, // mask those [near, far] ranges away from the hit test computations. simd4f rayDirAbs = abs_ps(rayDir.v); const simd4f epsilon = set1_ps(1e-4f); // zeroDirections[i] will be nonzero for each axis i the ray is parallel to. simd4f zeroDirections = cmple_ps(rayDirAbs, epsilon); const simd4f floatInf = set1_ps(FLOAT_INF); const simd4f floatNegInf = set1_ps(-FLOAT_INF); // If the ray is parallel to one of the axes, replace the slab range for that axis // with [-inf, inf] range instead. (which is a no-op in the comparisons below) nearD = cmov_ps(nearD, floatNegInf, zeroDirections); farD = cmov_ps(farD, floatInf, zeroDirections); // Next, we need to compute horizontally max(nearD[0], nearD[1], nearD[2]) and min(farD[0], farD[1], farD[2]) // to see if there is an overlap in the hit ranges. simd4f v1 = axx_bxx_ps(nearD, farD); // [f1 f1 n1 n1] simd4f v2 = ayy_byy_ps(nearD, farD); // [f2 f2 n2 n2] simd4f v3 = azz_bzz_ps(nearD, farD); // [f3 f3 n3 n3] nearD = max_ps(v1, max_ps(v2, v3)); farD = min_ps(v1, min_ps(v2, v3)); farD = wwww_ps(farD); // Unpack the result from high offset in the register. nearD = max_ps(nearD, setx_ps(tNear)); farD = min_ps(farD, setx_ps(tFar)); // Finally, test if the ranges overlap. simd4f rangeIntersects = cmple_ps(nearD, farD); // Only x channel used, higher ones ignored. // To store out out the interval of intersection, uncomment the following: // These are disabled, since without these, the whole function runs without a single memory store, // which has been profiled to be very fast! Uncommenting these causes an order-of-magnitude slowdown. 
// For now, using the SSE version only where the tNear and tFar ranges are not interesting. // _mm_store_ss(&tNear, nearD); // _mm_store_ss(&tFar, farD); // To avoid false positives, need to have an additional rejection test for each cardinal axis the ray direction // is parallel to. simd4f out2 = cmplt_ps(rayPos.v, minPoint); simd4f out3 = cmpgt_ps(rayPos.v, maxPoint); out2 = or_ps(out2, out3); zeroDirections = and_ps(zeroDirections, out2); simd4f yOut = yyyy_ps(zeroDirections); simd4f zOut = zzzz_ps(zeroDirections); zeroDirections = or_ps(or_ps(zeroDirections, yOut), zOut); // Intersection occurs if the slab ranges had positive overlap and if the test was not rejected by the ray being // parallel to some cardinal axis. simd4f intersects = andnot_ps(zeroDirections, rangeIntersects); simd4f epsilonMasked = and_ps(epsilon, intersects); return comieq_ss(epsilon, epsilonMasked) != 0; }
/// Spherically interpolates between this quaternion and q2 along the shorter arc, using fast
/// polynomial approximations for arccos and sin.
/// @param q2 The quaternion to interpolate towards. Expected to be normalized, as is this quaternion.
/// @param t The interpolation weight, expected to be in the range [0, 1].
/// @return The interpolated quaternion, renormalized.
Quat MUST_USE_RESULT Quat::Slerp(const Quat &q2, float t) const
{
	assume(0.f <= t && t <= 1.f);
	assume(IsNormalized());
	assume(q2.IsNormalized());

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	simd4f cosAngle = dot4_ps(q, q2.q); // <q, q2.q>

	// Build a sign-bit mask that is set exactly when the dot product is negative, i.e. convert
	// the 0/0xFFFFFFFF comparison result to a 0/0x80000000 mask.
	// (The original author measured an SSE2 shift-based variant,
	//  s4i_to_s4f(_mm_slli_epi32(s4f_to_s4i(mask), 31)), as slightly slower: 12.04 vs 11.97 clocks.)
	simd4f signFlip = cmplt_ps(cosAngle, zero_ps());
	signFlip = and_ps(signFlip, set1_ps_hex(0x80000000));
	cosAngle = xor_ps(cosAngle, signFlip); // Take the absolute value of the dot product.

	const simd4f one = set1_ps(1.f);
	cosAngle = min_ps(cosAngle, one); // Clamp the dot product to at most 1.

	// Fast polynomial approximation to arccos:
	// arccos(x): (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f;
	const simd4f acosInner = msub_ps(mul_ps(set1_ps(-0.69813170079773212f), cosAngle), cosAngle, set1_ps(0.87266462599716477f));
	const simd4f theta = madd_ps(acosInner, cosAngle, set1_ps(1.5707963267948966f));

	// Arrange t into the x lane and 1-t into the z lane, so both sines are computed in one go.
	const simd4f tLow = _mm_set_ss(t);                       // (.., t)
	const simd4f oneMinusT = sub_ps(one, tLow);              // (.., 1-t)
	const simd4f factors = _mm_movelh_ps(tLow, oneMinusT);   // (.., 1-t, .., t)
	const simd4f scaled = mul_ps(theta, factors);            // (.., (1-t)*angle, .., t*angle)

	// Fast polynomial approximation to sin(t*angle) and sin((1-t)*angle). A call to sin_ps()
	// would be more precise; the polynomial is used for speed:
	// sin(x): ((5.64311797634681035370e-03 * x * x - 1.55271410633428644799e-01) * x * x + 9.87862135574673806965e-01) * x
	const simd4f scaledSq = mul_ps(scaled, scaled);
	const simd4f sinInner = madd_ps(madd_ps(scaledSq, set1_ps(5.64311797634681035370e-03f), set1_ps(-1.55271410633428644799e-01f)), scaledSq, set1_ps(9.87862135574673806965e-01f));
	const simd4f sines = mul_ps(scaled, sinInner);

	// Broadcast the two blend weights: sin((1-t)*angle) scales this quaternion, sin(t*angle) scales q2.
	simd4f weightThis = zzzz_ps(sines);
	const simd4f weightOther = xxxx_ps(sines);
	weightThis = xor_ps(weightThis, signFlip); // Negate this side if the dot product was negative (shorter arc).

	simd4f blended = mul_ps(q, weightThis);
	blended = madd_ps(q2, weightOther, blended);

	// The weighted sum is not unit length, so renormalize it.
	return mul_ps(blended, rsqrt_ps(dot4_ps(blended, blended)));
#else
	float cosAngle = this->Dot(q2);

	// A factor of +/-1 that guarantees the rotation takes the shorter arc.
	float sign = 1.f;
	if (cosAngle < 0.f)
	{
		sign = -1.f;
		cosAngle = -cosAngle;
	}

	// Start from plain linear interpolation weights. They are used as-is when the quaternions
	// are nearly parallel, where the slerp weights would degenerate.
	float weightThis = 1.f - t;
	float weightOther = t;

	if (cosAngle < 0.999) // Far enough apart: perform spherical linear interpolation.
	{
		// Polynomial stand-in for Acos(cosAngle). The result ranges over pi/2 -> 0 as cosAngle
		// ranges over 0 -> 1.
		const float theta = (-0.69813170079773212f * cosAngle * cosAngle - 0.87266462599716477f) * cosAngle + 1.5707963267948966f;
		const float thetaT = t*theta;
#ifdef MATH_USE_SINCOS_LOOKUPTABLE
		// If Sin() is based on a lookup table, prefer that over the polynomial approximation.
		weightThis = Sin(theta - thetaT);
		weightOther = Sin(thetaT);
#else
		// No lookup table available: evaluate both sines with a very rough polynomial approximation.
		const float thetaT2 = thetaT*thetaT;
		weightOther = ((5.64311797634681035370e-03f * thetaT2 - 1.55271410633428644799e-01f) * thetaT2 + 9.87862135574673806965e-01f) * thetaT;
		const float thetaRest = theta - thetaT;
		const float thetaRest2 = thetaRest*thetaRest;
		weightThis = ((5.64311797634681035370e-03f * thetaRest2 - 1.55271410633428644799e-01f) * thetaRest2 + 9.87862135574673806965e-01f) * thetaRest;
#endif
	}

	// Blend and renormalize.
	return (*this * (weightThis * sign) + q2 * weightOther).Normalized();
#endif
}