/// Multiplies a scale operation with a 4x4 matrix (scale on the left-hand side).
/** Rows 0, 1 and 2 of rhs are multiplied by lhs.scale.x, lhs.scale.y and lhs.scale.z
	respectively; row 3 of rhs is carried over unchanged.
	@param lhs The scale operation applied from the left.
	@param rhs The matrix whose rows are scaled.
	@return The resulting matrix. When mathassert is active, the result is checked against
		lhs.ToFloat4x4() * rhs. */
float4x4 operator *(const ScaleOp &lhs, const float4x4 &rhs)
{
	float4x4 ret;
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SIMD)
	// Broadcast each scale component across a full register and scale one row per multiply.
	ret.row[0] = mul_ps(rhs.row[0], xxxx_ps(lhs.scale.v));
	ret.row[1] = mul_ps(rhs.row[1], yyyy_ps(lhs.scale.v));
	ret.row[2] = mul_ps(rhs.row[2], zzzz_ps(lhs.scale.v));
	ret.row[3] = rhs.row[3];
#else
	// Scalar path: walk the four columns, scaling the first three rows and copying the last.
	for (int col = 0; col < 4; ++col)
	{
		ret[0][col] = rhs[0][col] * lhs.scale.x;
		ret[1][col] = rhs[1][col] * lhs.scale.y;
		ret[2][col] = rhs[2][col] * lhs.scale.z;
		ret[3][col] = rhs[3][col];
	}
#endif
	mathassert(ret.Equals(lhs.ToFloat4x4() * rhs));
	return ret;
}
bool AABB::IntersectLineAABB_SSE(const float4 &rayPos, const float4 &rayDir, float tNear, float tFar) const { assume(rayDir.IsNormalized4()); assume(tNear <= tFar && "AABB::IntersectLineAABB: User gave a degenerate line as input for the intersection test!"); /* For reference, this is the C++ form of the vectorized SSE code below. float4 recipDir = rayDir.RecipFast4(); float4 t1 = (aabbMinPoint - rayPos).Mul(recipDir); float4 t2 = (aabbMaxPoint - rayPos).Mul(recipDir); float4 near = t1.Min(t2); float4 far = t1.Max(t2); float4 rayDirAbs = rayDir.Abs(); if (rayDirAbs.x > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.x, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.x, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.x < aabbMinPoint.x || rayPos.x > aabbMaxPoint.x) // early-out if the ray can't possibly enter the box. return false; if (rayDirAbs.y > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.y, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.y, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.y < aabbMinPoint.y || rayPos.y > aabbMaxPoint.y) // early-out if the ray can't possibly enter the box. return false; if (rayDirAbs.z > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.z, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.z, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.z < aabbMinPoint.z || rayPos.z > aabbMaxPoint.z) // early-out if the ray can't possibly enter the box. return false; return tNear < tFar; */ simd4f recipDir = rcp_ps(rayDir.v); // Note: The above performs an approximate reciprocal (11 bits of precision). 
// For a full precision reciprocal, perform a div: // simd4f recipDir = div_ps(set1_ps(1.f), rayDir.v); simd4f t1 = mul_ps(sub_ps(minPoint, rayPos.v), recipDir); simd4f t2 = mul_ps(sub_ps(maxPoint, rayPos.v), recipDir); simd4f nearD = min_ps(t1, t2); // [0 n3 n2 n1] simd4f farD = max_ps(t1, t2); // [0 f3 f2 f1] // Check if the ray direction is parallel to any of the cardinal axes, and if so, // mask those [near, far] ranges away from the hit test computations. simd4f rayDirAbs = abs_ps(rayDir.v); const simd4f epsilon = set1_ps(1e-4f); // zeroDirections[i] will be nonzero for each axis i the ray is parallel to. simd4f zeroDirections = cmple_ps(rayDirAbs, epsilon); const simd4f floatInf = set1_ps(FLOAT_INF); const simd4f floatNegInf = set1_ps(-FLOAT_INF); // If the ray is parallel to one of the axes, replace the slab range for that axis // with [-inf, inf] range instead. (which is a no-op in the comparisons below) nearD = cmov_ps(nearD, floatNegInf, zeroDirections); farD = cmov_ps(farD, floatInf, zeroDirections); // Next, we need to compute horizontally max(nearD[0], nearD[1], nearD[2]) and min(farD[0], farD[1], farD[2]) // to see if there is an overlap in the hit ranges. simd4f v1 = axx_bxx_ps(nearD, farD); // [f1 f1 n1 n1] simd4f v2 = ayy_byy_ps(nearD, farD); // [f2 f2 n2 n2] simd4f v3 = azz_bzz_ps(nearD, farD); // [f3 f3 n3 n3] nearD = max_ps(v1, max_ps(v2, v3)); farD = min_ps(v1, min_ps(v2, v3)); farD = wwww_ps(farD); // Unpack the result from high offset in the register. nearD = max_ps(nearD, setx_ps(tNear)); farD = min_ps(farD, setx_ps(tFar)); // Finally, test if the ranges overlap. simd4f rangeIntersects = cmple_ps(nearD, farD); // Only x channel used, higher ones ignored. // To store out out the interval of intersection, uncomment the following: // These are disabled, since without these, the whole function runs without a single memory store, // which has been profiled to be very fast! Uncommenting these causes an order-of-magnitude slowdown. 
// For now, using the SSE version only where the tNear and tFar ranges are not interesting. // _mm_store_ss(&tNear, nearD); // _mm_store_ss(&tFar, farD); // To avoid false positives, need to have an additional rejection test for each cardinal axis the ray direction // is parallel to. simd4f out2 = cmplt_ps(rayPos.v, minPoint); simd4f out3 = cmpgt_ps(rayPos.v, maxPoint); out2 = or_ps(out2, out3); zeroDirections = and_ps(zeroDirections, out2); simd4f yOut = yyyy_ps(zeroDirections); simd4f zOut = zzzz_ps(zeroDirections); zeroDirections = or_ps(or_ps(zeroDirections, yOut), zOut); // Intersection occurs if the slab ranges had positive overlap and if the test was not rejected by the ray being // parallel to some cardinal axis. simd4f intersects = andnot_ps(zeroDirections, rangeIntersects); simd4f epsilonMasked = and_ps(epsilon, intersects); return comieq_ss(epsilon, epsilonMasked) != 0; }