static bool fullContactsGenerationCapsuleConvex(const CapsuleV& capsule, const ConvexHullV& convexHull,  const PsMatTransformV& aToB, const PsTransformV& transf0,const PsTransformV& transf1,
								PersistentContact* manifoldContacts, ContactBuffer& contactBuffer, const bool idtScale, PersistentContactManifold& manifold, Vec3VArg normal, 
								const Vec3VArg closest, const FloatVArg tolerance, const FloatVArg contactDist, const bool doOverlapTest, Cm::RenderOutput* renderOutput, const FloatVArg toleranceScale)
{

	PX_UNUSED(renderOutput);
	Gu::PolygonalData polyData;
	getPCMConvexData(convexHull,idtScale, polyData);

	PxU8 buff[sizeof(SupportLocalImpl<ConvexHullV>)];
	SupportLocal* map = (idtScale ? static_cast<SupportLocal*>(PX_PLACEMENT_NEW(buff, SupportLocalImpl<ConvexHullNoScaleV>)(static_cast<const ConvexHullNoScaleV&>(convexHull), transf1, convexHull.vertex2Shape, convexHull.shape2Vertex, idtScale)) : 
	static_cast<SupportLocal*>(PX_PLACEMENT_NEW(buff, SupportLocalImpl<ConvexHullV>)(convexHull, transf1, convexHull.vertex2Shape, convexHull.shape2Vertex, idtScale)));

	PxU32 numContacts = 0;
	if (generateFullContactManifold(capsule, polyData, map, aToB, manifoldContacts, numContacts, contactDist, normal, closest, tolerance, doOverlapTest, toleranceScale))
	{

		if (numContacts > 0)
		{
			manifold.addBatchManifoldContacts2(manifoldContacts, numContacts);
			//transform normal into the world space
			normal = transf1.rotate(normal);
			manifold.addManifoldContactsToContactBuffer(contactBuffer, normal, transf0, capsule.radius, contactDist);
		}
		else
		{
			if (!doOverlapTest)
			{
				normal = transf1.rotate(normal);
				manifold.addManifoldContactsToContactBuffer(contactBuffer, normal, transf0, capsule.radius, contactDist);
			}
		}

#if	PCM_LOW_LEVEL_DEBUG
		manifold.drawManifold(*renderOutput, transf0, transf1);
#endif
		return true;
		
	}
	return false;

}
bool sweepConvex_ConvexGeom(const PxGeometry& geom, const PxTransform& pose, const PxConvexMeshGeometry& convexGeom, const PxTransform& convexPose,
							const PxVec3& unitDir, const PxReal distance, PxSweepHit& sweepHit, PxHitFlags hintFlags, const PxReal inflation)
{
	using namespace Ps::aos;
	PX_ASSERT(geom.getType() == PxGeometryType::eCONVEXMESH);
	const PxConvexMeshGeometry& otherConvexGeom = static_cast<const PxConvexMeshGeometry&>(geom);
	ConvexMesh& otherConvexMesh = *static_cast<ConvexMesh*>(otherConvexGeom.convexMesh);

	FETCH_CONVEX_HULL_DATA(convexGeom)

// PT: TODO: find a way to use the FETCH_CONVEX_HULL_DATA macro for the second hull as well
#ifdef __SPU__
	PX_COMPILE_TIME_ASSERT(&((ConvexMesh*)NULL)->getHull()==NULL);
	
	PX_ALIGN_PREFIX(16)  PxU8 otherconvexMeshBuffer[sizeof(ConvexMesh)+32] PX_ALIGN_SUFFIX(16);
	ConvexMesh* otherMesh = memFetchAsync<ConvexMesh>(otherconvexMeshBuffer, (uintptr_t)(&otherConvexMesh), sizeof(ConvexMesh),1);
	memFetchWait(1); // convexMesh	

	PxU32 otherNPolys = otherMesh->getNbPolygonsFast();
	const HullPolygonData* PX_RESTRICT otherPolysEA = otherMesh->getPolygons();
	const PxU32 otherPolysSize = sizeof(HullPolygonData)*otherNPolys + sizeof(PxVec3)*otherMesh->getNbVerts();
	
 	//TODO: Need optimization with dma cache --jiayang
	void* otherHullBuffer = PxAlloca(CELL_ALIGN_SIZE_16(otherPolysSize+32));
	HullPolygonData* otherPolys = memFetchAsync<HullPolygonData>(otherHullBuffer, (uintptr_t)(otherPolysEA), otherPolysSize, 1);

	ConvexHullData* otherHullData = &otherMesh->getHull();
	otherHullData->mPolygons = otherPolys;

	memFetchWait(1); // convexMesh
#else
	ConvexHullData* otherHullData = &otherConvexMesh.getHull();	
#endif
	
	const Vec3V zeroV = V3Zero();
	const FloatV zero = FZero();

	const Vec3V otherVScale = V3LoadU(otherConvexGeom.scale.scale);
	const QuatV otherVQuat = QuatVLoadU(&otherConvexGeom.scale.rotation.x);

	const Vec3V vScale = Vec3V_From_Vec4V(V4LoadU(&convexGeom.scale.scale.x));
	const QuatV vQuat = QuatVLoadU(&convexGeom.scale.rotation.x);

	const PsTransformV otherTransf = loadTransformU(pose);
	const PsTransformV convexTransf = loadTransformU(convexPose);

	const Vec3V worldDir = V3LoadU(unitDir);
	const FloatV dist = FLoad(distance);
	const Vec3V dir = convexTransf.rotateInv(V3Scale(worldDir, dist));

	const PsMatTransformV aToB(convexTransf.transformInv(otherTransf));
	
	ConvexHullV otherConvexHull(otherHullData, zeroV, otherVScale, otherVQuat);
	ConvexHullV convexHull(hullData, zeroV, vScale, vQuat);

	bool isMtd = hintFlags & PxHitFlag::eMTD;
	
	FloatV toi;
	Vec3V closestA, normal;
	bool hit = GJKRelativeRayCast(otherConvexHull, convexHull, aToB, zero, zeroV, dir, toi, normal, closestA,
		inflation, isMtd);

	if(hit)
	{
		sweepHit.flags = PxHitFlag::eDISTANCE | PxHitFlag::eNORMAL;

		if(FAllGrtrOrEq(zero, toi))
		{
			//initial overlap
			if(!(PX_IS_SPU) && isMtd)
			{
				sweepHit.flags |= PxHitFlag::ePOSITION;
				const Vec3V worldPointA = convexTransf.transform(closestA);
				const Vec3V destNormal = V3Neg(V3Normalize(convexTransf.rotate(normal)));
				const FloatV length = toi;
				V3StoreU(destNormal, sweepHit.normal);
				V3StoreU(worldPointA, sweepHit.position);
				FStore(length, &sweepHit.distance);
			}
			else
			{
				sweepHit.distance	= 0.0f;
				sweepHit.normal		= -unitDir;
			}
		}
		else
		{
			sweepHit.flags |= PxHitFlag::ePOSITION;
			const Vec3V worldPointA = convexTransf.transform(closestA);
			const Vec3V destNormal = V3Neg(V3Normalize(convexTransf.rotate(normal)));
			const FloatV length = FMul(dist, toi);
			V3StoreU(destNormal, sweepHit.normal);
			V3StoreU(worldPointA, sweepHit.position);
			FStore(length, &sweepHit.distance);
		}

		// PT: compute closest polygon using the same tweak as in swept-capsule-vs-mesh
		sweepHit.faceIndex = computeSweepConvexPlane(convexGeom,hullData,nbPolys,pose,sweepHit.position,unitDir);
		return true;
	}
	return false;
}
bool sweepConvex_SphereGeom(const PxGeometry& geom, const PxTransform& pose, const PxConvexMeshGeometry& convexGeom, const PxTransform& convexPose,
							const PxVec3& unitDir, const PxReal distance, PxSweepHit& sweepHit, PxHitFlags hintFlags, const PxReal inflation)
{
	PX_ASSERT(geom.getType() == PxGeometryType::eSPHERE);
	const PxSphereGeometry& sphereGeom = static_cast<const PxSphereGeometry&>(geom);

	FETCH_CONVEX_HULL_DATA(convexGeom)

	const Vec3V zeroV = V3Zero();
	const FloatV zero= FZero();

	const Vec3V vScale =  Vec3V_From_Vec4V(V4LoadU(&convexGeom.scale.scale.x));
	const QuatV vQuat = QuatVLoadU(&convexGeom.scale.rotation.x);

	const FloatV sphereRadius = FLoad(sphereGeom.radius);

	const PsTransformV sphereTransf = loadTransformU(pose);
	const PsTransformV convexTransf = loadTransformU(convexPose);

	const PsMatTransformV aToB(convexTransf.transformInv(sphereTransf));

	const Vec3V worldDir = V3LoadU(unitDir);
	const FloatV dist = FLoad(distance);
	const Vec3V dir = convexTransf.rotateInv(V3Scale(worldDir, dist));

	ConvexHullV convexHull(hullData, zeroV, vScale, vQuat);
	//CapsuleV capsule(zeroV, sphereRadius);
	CapsuleV capsule(aToB.p, sphereRadius);

	const bool isMtd = hintFlags & PxHitFlag::eMTD;

	
	FloatV toi;
	Vec3V closestA, normal;
	//bool hit = GJKRelativeRayCast(capsule, convexHull, aToB, zero, zeroV, dir, toi, normal, closestA, inflation);
	bool hit = GJKLocalRayCast(capsule, convexHull, zero, zeroV, dir, toi, normal, closestA,
		sphereGeom.radius+inflation, isMtd);


	if(hit)
	{
		sweepHit.faceIndex = 0xffffffff;

		//closestA = V3NegScaleSub(normal, sphereRadius, closestA);
		const Vec3V destWorldPointA = convexTransf.transform(closestA);
		sweepHit.flags = PxHitFlag::eDISTANCE | PxHitFlag::eNORMAL;

		if(FAllGrtrOrEq(zero, toi))
		{
			//ML: initial overlap
			if(!(PX_IS_SPU) && isMtd)
			{
				sweepHit.flags |= PxHitFlag::ePOSITION;
				const Vec3V destNormal = V3Neg(V3Normalize(convexTransf.rotate(normal)));
				const FloatV length = toi;
				V3StoreU(destNormal, sweepHit.normal);
				V3StoreU(destWorldPointA, sweepHit.position);
				FStore(length, &sweepHit.distance);
			}
			else
			{
				sweepHit.distance	= 0.0f;
				sweepHit.normal		= -unitDir;
			}
		}
		else
		{
			sweepHit.flags |= PxHitFlag::ePOSITION;
			const Vec3V destNormal = V3Neg(V3Normalize(convexTransf.rotate(normal)));
			const FloatV length = FMul(dist, toi);
			V3StoreU(destNormal, sweepHit.normal);
			V3StoreU(destWorldPointA, sweepHit.position);
			FStore(length, &sweepHit.distance);
		}
		return true;
	}
	return false;
}
// Box-vs-convex sweep query: 'box' is swept along 'unitDir' against the convex mesh 'geom' at 'pose'.
//
// On a hit, fills sweepHit (flags, distance, normal, position when available, faceIndex) and
// returns true. Only PxHitFlag::eMTD is inspected in 'hintFlags'; 'inflation' is forwarded to
// GJKRelativeRayCast.
bool sweepBox_ConvexGeom(GU_BOX_SWEEP_FUNC_PARAMS)
{
	using namespace Ps::aos;
	PX_ASSERT(geom.getType() == PxGeometryType::eCONVEXMESH);
	const PxConvexMeshGeometry& convexGeom = static_cast<const PxConvexMeshGeometry&>(geom);

	// Aligned copy of the box transform so it can be read with the aligned loader
	PX_ALIGN_PREFIX(16) PxTransform boxTransform PX_ALIGN_SUFFIX(16);
	boxTransform = box.getTransform();

	FETCH_CONVEX_HULL_DATA(convexGeom)

	const FloatV zero = FZero();
	const Vec3V zeroV = V3Zero();

	const PsTransformV convexPose = loadTransformU(pose);
	const PsTransformV boxPose = loadTransformA(boxTransform);

	// Maps the box (A) into the local frame of the convex (B)
	const PsMatTransformV aToB(convexPose.transformInv(boxPose));

	const Vec3V boxExtents = V3LoadU(box.extents);
	BoxV boxV(zeroV, boxExtents);

	const QuatV vQuat = QuatVLoadU(&convexGeom.scale.rotation.x);
	const Vec3V vScale = V3LoadU(convexGeom.scale.scale);
	ConvexHullV convexHull(hullData, zeroV, vScale, vQuat);

	// Negated full sweep vector, rotated into the convex frame
	const FloatV dist = FLoad(distance);
	const Vec3V worldDir = V3LoadU(unitDir);
	const Vec3V dir = convexPose.rotateInv(V3Neg(V3Scale(worldDir, dist)));

	const bool isMtd = hintFlags & PxHitFlag::eMTD;

	FloatV toi;
	Vec3V closestA, normal;
	if(!GJKRelativeRayCast(boxV, convexHull, aToB, zero, zeroV, dir, toi, normal, closestA,
		inflation, isMtd))
	{
		return false;
	}

	sweepHit.flags = PxHitFlag::eDISTANCE | PxHitFlag::eNORMAL;

	const bool initialOverlap = FAllGrtrOrEq(zero, toi) != 0;
	const bool reportMtd = !(PX_IS_SPU) && isMtd;

	if(initialOverlap && !reportMtd)
	{
		//ML: initial overlap without MTD
		sweepHit.distance	= 0.0f;
		sweepHit.normal		= -unitDir;
	}
	else
	{
		sweepHit.flags |= PxHitFlag::ePOSITION;

		const Vec3V worldPointA = convexPose.transform(closestA);
		const Vec3V destNormal = V3Normalize(convexPose.rotate(normal));

		// Overlap (MTD): distance is 'toi' and the point is offset along the normal.
		// Regular hit: distance is 'toi' scaled by the sweep length and the point is advanced along the sweep.
		const FloatV length = initialOverlap ? toi : FMul(dist, toi);
		const Vec3V destWorldPointA = initialOverlap	? V3NegScaleSub(destNormal, length, worldPointA)
														: V3ScaleAdd(worldDir, length, worldPointA);

		V3StoreU(destNormal, sweepHit.normal);
		V3StoreU(destWorldPointA, sweepHit.position);
		FStore(length, &sweepHit.distance);
	}

	// PT: compute closest polygon using the same tweak as in swept-capsule-vs-mesh
	// NOTE(review): sweepHit.position is not written above in the non-MTD overlap case - confirm the caller initializes it.
	sweepHit.faceIndex = computeSweepConvexPlane(convexGeom,hullData,nbPolys,pose,sweepHit.position,unitDir);
	return true;
}
// Capsule-vs-convex sweep query: the capsule 'lss' is swept along 'unitDir' against the convex
// mesh 'geom' at 'pose'.
//
// On a hit, fills sweepHit (flags, distance, normal, position when available, faceIndex) and
// returns true. Only PxHitFlag::eMTD is inspected in 'hintFlags'; 'inflation' is added to the
// capsule radius and forwarded to GJKLocalRayCast.
bool sweepCapsule_ConvexGeom(GU_CAPSULE_SWEEP_FUNC_PARAMS)
{
	using namespace Ps::aos;

	PX_ASSERT(geom.getType() == PxGeometryType::eCONVEXMESH);
	const PxConvexMeshGeometry& convexGeom = static_cast<const PxConvexMeshGeometry&>(geom);

	FETCH_CONVEX_HULL_DATA(convexGeom)

	const FloatV zero = FZero();
	const Vec3V zeroV = V3Zero();

	// Capsule pose and half height derived from the segment
	PxReal _capsuleHalfHeight = 0.0f;
	const PxTransform capTransform = getCapsuleTransform(lss, _capsuleHalfHeight);
	const FloatV capsuleHalfHeight = FLoad(_capsuleHalfHeight);
	const FloatV capsuleRadius = FLoad(lss.radius);

	const PsTransformV convexPose = loadTransformU(pose);
	const PsTransformV capPose = loadTransformU(capTransform);

	// Maps the capsule (A) into the local frame of the convex (B)
	const PsMatTransformV aToB(convexPose.transformInv(capPose));

	// Capsule expressed in the convex frame; its axis is the rotated local X axis
	CapsuleV capsule(aToB.p, aToB.rotate( V3Scale(V3UnitX(), capsuleHalfHeight)), capsuleRadius);

	const QuatV vQuat = QuatVLoadU(&convexGeom.scale.rotation.x);
	const Vec3V vScale = Vec3V_From_Vec4V(V4LoadU(&convexGeom.scale.scale.x));
	ConvexHullV convexHull(hullData, zeroV, vScale, vQuat);

	// Negated full sweep vector, rotated into the convex frame
	const FloatV dist = FLoad(distance);
	const Vec3V worldDir = V3LoadU(unitDir);
	const Vec3V dir = convexPose.rotateInv(V3Neg(V3Scale(worldDir, dist)));

	const bool isMtd = hintFlags & PxHitFlag::eMTD;

	FloatV toi;
	Vec3V closestA, normal;//closestA and normal is in the local space of convex hull
	if(!GJKLocalRayCast(capsule, convexHull, zero, zeroV, dir, toi, normal, closestA,
		lss.radius + inflation, isMtd))
	{
		return false;
	}

	sweepHit.flags = PxHitFlag::eDISTANCE | PxHitFlag::eNORMAL;

	const bool initialOverlap = FAllGrtrOrEq(zero, toi) != 0;
	const bool reportMtd = !(PX_IS_SPU) && isMtd;

	if(initialOverlap && !reportMtd)
	{
		// initial overlap without MTD
		sweepHit.distance = 0.f;
		sweepHit.normal = -unitDir;
	}
	else
	{
		sweepHit.flags |= PxHitFlag::ePOSITION;

		const Vec3V destNormal = V3Normalize(convexPose.rotate(normal));
		const Vec3V worldPointA = convexPose.transform(closestA);

		// Overlap (MTD): distance is 'toi' and the point is offset along the normal.
		// Regular hit: distance is 'toi' scaled by the sweep length and the point is advanced along the sweep.
		const FloatV length = initialOverlap ? toi : FMul(dist, toi);
		const Vec3V destWorldPointA = initialOverlap	? V3NegScaleSub(destNormal, length, worldPointA)
														: V3ScaleAdd(worldDir, length, worldPointA);

		V3StoreU(destNormal, sweepHit.normal);
		V3StoreU(destWorldPointA, sweepHit.position);
		FStore(length, &sweepHit.distance);
	}

	// PT: compute closest polygon using the same tweak as in swept-capsule-vs-mesh
	// NOTE(review): sweepHit.position is not written above in the non-MTD overlap case - confirm the caller initializes it.
	sweepHit.faceIndex = computeSweepConvexPlane(convexGeom,hullData,nbPolys,pose,sweepHit.position,unitDir);
	return true;
}
// Persistent-contact-manifold contact generation for a plane (shape0) against a box (shape1).
//
// The cached manifold is refreshed first. If contacts were lost during the refresh, or the
// manifold reports itself invalid for the current relative transform, all eight box corners are
// re-tested against the plane and the manifold is rebuilt from those that are close enough.
// In every case the manifold is then written to the contact buffer.
//
// Returns true when the manifold holds at least one contact.
bool pcmContactPlaneBox(GU_CONTACT_METHOD_ARGS)
{
	PX_UNUSED(shape0);
	using namespace Ps::aos;

	Gu::PersistentContactManifold& manifold = cache.getManifold();
	Ps::prefetchLine(&manifold, 256);

	// Get actual shape data
	const PxBoxGeometry& shapeBox = shape1.get<const PxBoxGeometry>();

	// Note the swap: transf0 is the box (transform1), transf1 is the plane (transform0)
	const PsTransformV transf0 = loadTransformA(transform1);
	const PsTransformV transf1 = loadTransformA(transform0);

	//box to plane
	const PsTransformV curTransf(transf1.transformInv(transf0));

	//in world space
	const Vec3V negPlaneNormal = V3Normalize(V3Neg(QuatGetBasisVector0(transf1.q)));

	const FloatV contactDist = FLoad(contactDistance);
	const Vec3V boxExtents = V3LoadU(shapeBox.halfExtents);
	const FloatV boxMargin = CalculatePCMBoxMargin(boxExtents);
	const FloatV projectBreakingThreshold = FMul(boxMargin, FLoad(0.2f));

	// Detect contacts dropped by the refresh by comparing the count before and after
	const PxU32 contactsBeforeRefresh = manifold.mNumContacts;
	manifold.refreshContactPoints(curTransf, projectBreakingThreshold, contactDist);
	const bool bLostContacts = (manifold.mNumContacts != contactsBeforeRefresh);

	if(bLostContacts || manifold.invalidate_PrimitivesPlane(curTransf, boxMargin, FLoad(0.2f)))
	{
		//ML: the contact normal is expressed in the plane's local space (its X axis). Because shape1 is the box and
		//shape0 is the plane, the reverse of the contact normal (i.e. the plane normal) is used so that
		//refreshContactPoints works out the correct penetration for the points
		const Vec3V localNormal = V3UnitX();

		manifold.mNumContacts = 0;
		manifold.setRelativeTransform(curTransf);

		const PsMatTransformV aToB(curTransf);

		const FloatV bx = V3GetX(boxExtents);
		const FloatV by = V3GetY(boxExtents);
		const FloatV bz = V3GetZ(boxExtents);
		const FloatV nbx = FNeg(bx);
		const FloatV nby = FNeg(by);
		const FloatV nbz = FNeg(bz);

		// Box axes in plane space, scaled by the matching half extent
		const Vec3V axisX = V3Scale(aToB.getCol0(), bx);
		const Vec3V axisY = V3Scale(aToB.getCol1(), by);
		const Vec3V axisZ = V3Scale(aToB.getCol2(), bz);
		const Vec3V negAxisZ = V3Neg(axisZ);

		const Vec3V sumXY = V3Add(axisX, axisY);//(x, y)
		const Vec3V diffXY = V3Sub(axisX, axisY);//(x, -y)

		// X coordinate of the box centre in plane space
		const FloatV px = V3GetX(aToB.p);

		// Box-local corner positions, listed in the order in which contacts are emitted
		const Vec3V corners[8] =
		{
			boxExtents,					//( x,  y,  z)
			V3Merge(bx, by, nbz),		//( x,  y, -z)
			V3Merge(bx, nby, bz),		//( x, -y,  z)
			V3Merge(bx, nby, nbz),		//( x, -y, -z)
			V3Merge(nbx, by, bz),		//(-x,  y,  z)
			V3Merge(nbx, by, nbz),		//(-x,  y, -z)
			V3Merge(nbx, nby, bz),		//(-x, -y,  z)
			V3Merge(nbx, nby, nbz)		//(-x, -y, -z)
		};

		// X offset of each corner from the box centre, in plane space (same order as 'corners')
		const FloatV cornerOffsets[8] =
		{
			V3GetX(V3Add(axisZ,		sumXY)),
			V3GetX(V3Add(negAxisZ,	sumXY)),
			V3GetX(V3Add(axisZ,		diffXY)),
			V3GetX(V3Add(negAxisZ,	diffXY)),
			V3GetX(V3Sub(axisZ,		diffXY)),
			V3GetX(V3Sub(negAxisZ,	diffXY)),
			V3GetX(V3Sub(axisZ,		sumXY)),
			V3GetX(V3Sub(negAxisZ,	sumXY))
		};

		// A corner is accepted when offset + px < contactDist
		const FloatV acceptanceDist = FSub(contactDist, px);

		Gu::PersistentContact* manifoldContacts = PX_CP_TO_PCP(contactBuffer.contacts);
		PxU32 numContacts = 0;

		for(PxU32 i=0; i<8; i++)
		{
			if(!FAllGrtr(acceptanceDist, cornerOffsets[i]))
				continue;

			const FloatV pen = FAdd(cornerOffsets[i], px);

			//add to contact stream
			Gu::PersistentContact& contact = manifoldContacts[numContacts++];
			contact.mLocalPointA = corners[i];
			contact.mLocalPointB = V3NegScaleSub(localNormal, pen, aToB.transform(corners[i]));
			contact.mLocalNormalPen = V4SetW(Vec4V_From_Vec3V(localNormal), pen);
		}

		//reduce contacts
		manifold.addBatchManifoldContactsCluster(manifoldContacts, numContacts);
	}

	// Both the rebuilt and the still-valid manifold are reported the same way
	manifold.addManifoldContactsToContactBuffer(contactBuffer, negPlaneNormal, transf1);
	return manifold.getNumContacts() > 0;
}
// Persistent-contact-manifold contact generation for a capsule (shape0) against a convex mesh (shape1).
//
// The cached manifold is refreshed first. When contacts were lost or the manifold reports itself
// invalid for the current relative transform, GJK (and EPA if needed) is run in the convex's local
// space and the manifold is updated, falling back to full contact generation where required.
// Otherwise the cached manifold is written out unchanged.
//
// Returns true when contacts were written to the contact buffer, or the result of
// fullContactsGenerationCapsuleConvex() when that path is taken.
bool pcmContactCapsuleConvex(GU_CONTACT_METHOD_ARGS)
{
	PX_UNUSED(renderOutput);

	const PxConvexMeshGeometryLL& shapeConvex = shape1.get<const PxConvexMeshGeometryLL>();
	const PxCapsuleGeometry& shapeCapsule = shape0.get<const PxCapsuleGeometry>();

	PersistentContactManifold& manifold = cache.getManifold();

	Ps::prefetchLine(shapeConvex.hullData);

	PX_ASSERT(transform1.q.isSane());
	PX_ASSERT(transform0.q.isSane());

	const FloatV zero = FZero();
	const Vec3V zeroV = V3Zero();

	const Vec3V vScale = V3LoadU_SafeReadW(shapeConvex.scale.scale);	// PT: safe because 'rotation' follows 'scale' in PxMeshScale
	const ConvexHullData* hullData = shapeConvex.hullData;

	const FloatV contactDist = FLoad(params.mContactDistance);
	const FloatV capsuleHalfHeight = FLoad(shapeCapsule.halfHeight);
	const FloatV capsuleRadius = FLoad(shapeCapsule.radius);

	//Transfer A into the local space of B
	const PsTransformV transf0 = loadTransformA(transform0);
	const PsTransformV transf1 = loadTransformA(transform1);
	const PsTransformV curRTrans(transf1.transformInv(transf0));
	const PsMatTransformV aToB(curRTrans);

	const FloatV convexMargin = Gu::CalculatePCMConvexMargin(hullData, vScale);
	const FloatV capsuleMinMargin = Gu::CalculateCapsuleMinMargin(capsuleRadius);
	const FloatV minMargin = FMin(convexMargin, capsuleMinMargin);

	const FloatV projectBreakingThreshold = FMul(minMargin, FLoad(1.25f));
	const FloatV refreshDist = FAdd(contactDist, capsuleRadius);

	//ML: after refreshContactPoints, we might lose some contacts
	const PxU32 initialContacts = manifold.mNumContacts;
	manifold.refreshContactPoints(aToB,  projectBreakingThreshold, refreshDist);
	const bool bLostContacts = (manifold.mNumContacts != initialContacts);

	GjkStatus status = manifold.mNumContacts > 0 ? GJK_UNDEFINED : GJK_NON_INTERSECT;

	Vec3V closestA(zeroV), closestB(zeroV), normal(zeroV); // from a to b
	FloatV penDep = zero;

	const bool regenerate = bLostContacts || manifold.invalidate_SphereCapsule(curRTrans, minMargin);
	if(!regenerate)
	{
		// The cached manifold is still usable: report it as-is, if it has anything to report
		if(manifold.getNumContacts() > 0)
		{
			normal = manifold.getWorldNormal(transf1);
			manifold.addManifoldContactsToContactBuffer(contactBuffer, normal, transf0, capsuleRadius, contactDist);
#if	PCM_LOW_LEVEL_DEBUG
			manifold.drawManifold(*renderOutput, transf0, transf1);
#endif
			return true;
		}
		return false;
	}

	const bool idtScale = shapeConvex.scale.isIdentity();

	manifold.setRelativeTransform(curRTrans);
	const QuatV vQuat = QuatVLoadU(&shapeConvex.scale.rotation.x);
	ConvexHullV convexHull(hullData, zeroV, vScale, vQuat, idtScale);
	convexHull.setMargin(zero);

	//transform capsule(a) into the local space of convexHull(b)
	CapsuleV capsule(aToB.p, aToB.rotate(V3Scale(V3UnitX(), capsuleHalfHeight)), capsuleRadius);

	LocalConvex<CapsuleV> convexA(capsule);
	const Vec3V initialSearchDir = V3Sub(capsule.getCenter(), convexHull.getCenter());

	// GJK pass; the no-scale hull wrapper is used when the mesh scale is identity
	if(idtScale)
	{
		LocalConvex<ConvexHullNoScaleV> convexB(*PX_CONVEX_TO_NOSCALECONVEX(&convexHull));
		status = gjkPenetration<LocalConvex<CapsuleV>, LocalConvex<ConvexHullNoScaleV> >(convexA, convexB, initialSearchDir, contactDist, closestA, closestB, normal, penDep,
			manifold.mAIndice, manifold.mBIndice, manifold.mNumWarmStartPoints, true);
	}
	else
	{
		LocalConvex<ConvexHullV> convexB(convexHull);
		status = gjkPenetration<LocalConvex<CapsuleV>, LocalConvex<ConvexHullV> >(convexA, convexB, initialSearchDir, contactDist, closestA, closestB, normal, penDep,
			manifold.mAIndice, manifold.mBIndice, manifold.mNumWarmStartPoints, true);
	}

	if(status == GJK_NON_INTERSECT)
		return false;

	Gu::PersistentContact* manifoldContacts = PX_CP_TO_PCP(contactBuffer.contacts);

	if(status == GJK_DEGENERATE)
	{
		// Degenerate GJK result: go straight to full contact generation with the overlap test enabled
		return fullContactsGenerationCapsuleConvex(capsule, convexHull, aToB, transf0, transf1, manifoldContacts, contactBuffer, idtScale, manifold, normal,
			closestB, convexHull.getMargin(), contactDist, true, renderOutput, FLoad(params.mToleranceLength));
	}

	const FloatV replaceBreakingThreshold = FMul(minMargin, FLoad(0.05f));

	// Anything other than GJK_CONTACT is expected to be EPA_CONTACT here and needs the EPA pass.
	// If EPA does not come back with EPA_CONTACT, the overlap test is requested instead.
	bool doOverlapTest = false;
	if(status != GJK_CONTACT)
	{
		PX_ASSERT(status == EPA_CONTACT);

		if(idtScale)
		{
			LocalConvex<ConvexHullNoScaleV> convexB(*PX_CONVEX_TO_NOSCALECONVEX(&convexHull));
			status = Gu::epaPenetration(convexA, convexB, manifold.mAIndice, manifold.mBIndice, manifold.mNumWarmStartPoints,
				closestA, closestB, normal, penDep, true);
		}
		else
		{
			LocalConvex<ConvexHullV> convexB(convexHull);
			status = Gu::epaPenetration(convexA, convexB,  manifold.mAIndice, manifold.mBIndice, manifold.mNumWarmStartPoints,
				closestA, closestB, normal, penDep, true);
		}

		doOverlapTest = (status != EPA_CONTACT);
	}

	if(!doOverlapTest)
	{
		// A single contact point came out of GJK or EPA
		const Vec3V localPointA = aToB.transformInv(closestA);
		const Vec4V localNormalPen = V4SetW(Vec4V_From_Vec3V(normal), penDep);

		//Add contact to contact stream
		manifoldContacts[0].mLocalPointA = localPointA;
		manifoldContacts[0].mLocalPointB = closestB;
		manifoldContacts[0].mLocalNormalPen = localNormalPen;

		//Add contact to manifold
		manifold.addManifoldPoint2(localPointA, closestB, localNormalPen, replaceBreakingThreshold);
	}

	if(initialContacts == 0 || bLostContacts || doOverlapTest)
	{
		return fullContactsGenerationCapsuleConvex(capsule, convexHull, aToB, transf0, transf1, manifoldContacts, contactBuffer, idtScale, manifold, normal,
			closestB, convexHull.getMargin(), contactDist, doOverlapTest, renderOutput, FLoad(params.mToleranceLength));
	}

	//This contact is either come from GJK or EPA
	normal = transf1.rotate(normal);
	manifold.addManifoldContactsToContactBuffer(contactBuffer, normal, transf0, capsuleRadius, contactDist);
#if	PCM_LOW_LEVEL_DEBUG
	manifold.drawManifold(*renderOutput, transf0, transf1);
#endif
	return true;
}
// Sweeps an oriented box along 'dir' for up to 'length' against an array of triangles and reports
// the earliest hit found.
//
// nbTris / triangles : triangles to test; returns false immediately when nbTris is zero.
// isDoubleSided      : together with PxHitFlag::eMESH_BOTH_SIDES, disables backface culling.
// boxGeom / boxPose  : the swept box.
// _hit, _normal, _d  : outputs - impact point, normal and distance. On an initial overlap only
//                      _d (0), _index and _normal (-dir) are written; _hit is left untouched.
// _index             : output - index of the triangle that was hit.
// cachedIndex        : optional; when non-null its value is passed to getTriangleIndex() to
//                      choose the visiting order.
// inflation          : forwarded to GJKLocalRayCast and added to the culling radius.
//
// Returns true when at least one triangle was hit.
bool Gu::SweepBoxTriangles(	PxU32 nbTris, const PxTriangle* triangles, bool isDoubleSided,
							const PxBoxGeometry& boxGeom, const PxTransform& boxPose, const PxVec3& dir, const PxReal length, PxVec3& _hit,
							PxVec3& _normal, float& _d, PxU32& _index, const PxU32* cachedIndex, const PxReal inflation, PxHitFlags hintFlags)
{
	PX_UNUSED(hintFlags);

	if(nbTris == 0)
		return false;

	const bool meshBothSides = hintFlags & PxHitFlag::eMESH_BOTH_SIDES;
	const bool doBackfaceCulling = !(isDoubleSided || meshBothSides);

	Box box;
	buildFrom1(box, boxPose.p, boxGeom.halfExtents, boxPose.q);

	// Move to AABB space
	Matrix34 worldToBox;
	computeWorldToBoxMatrix(worldToBox, box);

	const PxVec3 localDir = worldToBox.rotate(dir);
	const PxVec3 localMotion = localDir * length;

	// SIMD copy of the world-to-box transform
	const Vec3V col0 = V3LoadU(worldToBox.base0);
	const Vec3V col1 = V3LoadU(worldToBox.base1);
	const Vec3V col2 = V3LoadU(worldToBox.base2);
	const Mat33V rotV(col0, col1, col2);
	const Vec3V transV = V3LoadU(worldToBox.base3);
	const PsMatTransformV worldToBoxV(transV, rotV);

	const FloatV zero = FZero();
	const Vec3V zeroV = V3Zero();
	const BoolV bTrue = BTTTT();

	const Vec3V boxExtents = V3LoadU(box.extents);
	const Vec3V boxDir = V3LoadU(localDir);
	BoxV boxV(zeroV, boxExtents);

	// Extent of the (inflated) box along the sweep direction, used by the early rejection test
	const FloatV boxRadiusV = FAdd(V3Dot(V3Abs(boxDir), boxExtents), FLoad(inflation));

#ifdef PX_DEBUG
	PxU32 totalTestsExpected = nbTris;
	PxU32 totalTestsReal = 0;
	PX_UNUSED(totalTestsExpected);
	PX_UNUSED(totalTestsReal);
#endif

	// Best hit so far; 'dist' and 'boxLocalMotion' shrink every time a closer hit is found
	FloatV dist = FLoad(length);
	Vec3V boxLocalMotion = V3LoadU(localMotion);
	Vec3V bestClosestA = zeroV;
	Vec3V bestNormal = zeroV;
	PxVec3 bestTriNormal(0.0f);
	PxU32 bestTriangleIndex = 0;
	bool foundHit = false;

	const PsTransformV boxPos = loadTransformU(boxPose);

	const PxU32 firstIndex = cachedIndex ? *cachedIndex : 0;

	for(PxU32 i=0; i<nbTris; i++)
	{
		const PxU32 triangleIndex = getTriangleIndex(i, firstIndex);
		const PxTriangle& tri = triangles[triangleIndex];

		// Triangle vertices in box space
		const Vec3V triV0 = worldToBoxV.transform(V3LoadU(tri.verts[0]));
		const Vec3V triV1 = worldToBoxV.transform(V3LoadU(tri.verts[1]));
		const Vec3V triV2 = worldToBoxV.transform(V3LoadU(tri.verts[2]));

		const Vec3V triNormal = V3Cross(V3Sub(triV2, triV1), V3Sub(triV0, triV1));

		// backface culling
		if(doBackfaceCulling && FAllGrtrOrEq(V3Dot(triNormal, boxLocalMotion), zero))
			continue;

		// Vertex projections onto the sweep direction
		const FloatV dp0 = V3Dot(triV0, boxDir);
		const FloatV dp1 = V3Dot(triV1, boxDir);
		const FloatV dp2 = V3Dot(triV2, boxDir);
		const FloatV minDp = FMin(dp0, FMin(dp1, dp2));
		const Vec3V dpV = V3Merge(dp0, dp1, dp2);

		// Early rejection: skip the triangle when the combined mask below is all-true
		const BoolV beyondReach = FIsGrtr(minDp, FAdd(boxRadiusV, dist));
		const BoolV negativeDp = V3IsGrtr(zeroV, dpV);
		if(BAllEq(BOr(beyondReach, negativeDp), bTrue))
			continue;

#ifdef PX_DEBUG
		totalTestsReal++;
#endif

		TriangleV triangleV(triV0, triV1, triV2);

		FloatV lambda;
		Vec3V closestA, normal;//closestA and normal is in the local space of the box
		if(!GJKLocalRayCast(triangleV, boxV, zero, zeroV, boxLocalMotion, lambda, normal, closestA, inflation, false))
			continue;

		// Initial overlap: report a zero-distance hit right away
		if(FAllGrtrOrEq(zero, lambda))
		{
			_d		= 0.0f;
			_index	= triangleIndex;
			_normal	= -dir;
			return true;
		}

		// Closer hit: shorten the remaining sweep so later triangles must beat it
		dist = FMul(dist, lambda);
		boxLocalMotion = V3Scale(boxDir, dist);
		bestClosestA = closestA;
		bestNormal = normal;
		bestTriangleIndex = triangleIndex;
		V3StoreU(triNormal, bestTriNormal);
		foundHit = true;
	}

	if(!foundHit)
		return false;

	// Convert the best hit from box space back to world space
	_index	= bestTriangleIndex;
	const Vec3V destNormal = V3Neg(V3Normalize(boxPos.rotate(bestNormal)));
	const Vec3V destWorldPointA = boxPos.transform(bestClosestA);
	V3StoreU(destNormal, _normal);
	V3StoreU(destWorldPointA, _hit);
	FStore(dist, &_d);

	// PT: by design, returned normal is opposed to the sweep direction.
	if(shouldFlipNormal(_normal, meshBothSides, isDoubleSided, bestTriNormal, dir))
		_normal = -_normal;

	return true;
}
// Sweeps the capsule 'lss' along 'unitDir' (up to 'distance') against the box described by
// 'geom' / 'pose', using a GJK raycast evaluated in the box's local frame.
// Returns true and fills 'sweepHit' when the raycast reports a hit, false otherwise.
bool sweepCapsule_BoxGeom(GU_CAPSULE_SWEEP_FUNC_PARAMS)
{
	// NOTE(review): hintFlags is actually read below for eMTD; the marker is harmless and kept as-is.
	PX_UNUSED(hintFlags);

	using namespace Ps::aos;
	PX_ASSERT(geom.getType() == PxGeometryType::eBOX);
	const PxBoxGeometry& targetBox = static_cast<const PxBoxGeometry&>(geom);

	const FloatV fZero = FZero();
	const Vec3V vZero = V3Zero();
	const FloatV sweepLength = FLoad(distance);
	const Vec3V sweepDirWorld = V3LoadU(unitDir);

	// getCapsuleTransform() yields the capsule pose and writes its half height.
	PxReal halfHeightScalar = 0.0f;
	const PxTransform capsulePose = getCapsuleTransform(lss, halfHeightScalar);

	const PsTransformV capsuleToWorld = loadTransformU(capsulePose);
	const PsTransformV boxToWorld = loadTransformU(pose);

	// Capsule frame expressed relative to the box frame.
	const PsMatTransformV capsuleToBox(boxToWorld.transformInv(capsuleToWorld));

	const Vec3V boxHalfExtents = V3LoadU(targetBox.halfExtents);
	const FloatV halfHeight = FLoad(halfHeightScalar);
	const FloatV radius = FLoad(lss.radius);

	BoxV boxShape(vZero, boxHalfExtents);
	CapsuleV capsuleShape(capsuleToBox.p, capsuleToBox.rotate(V3Scale(V3UnitX(), halfHeight)), radius);

	// Full sweep vector, negated and rotated into the box's local frame.
	const Vec3V localSweep = boxToWorld.rotateInv(V3Neg(V3Scale(sweepDirWorld, sweepLength)));

	const bool wantMtd = hintFlags & PxHitFlag::eMTD;
	FloatV toi = FMax();
	Vec3V closestA, normal;// both are expressed in the local space of the box
	const bool hasHit = GJKLocalRayCast(capsuleShape, boxShape, fZero, vZero, localSweep, toi, normal, closestA, lss.radius + inflation, wantMtd);

	if(!hasHit)
		return false;

	sweepHit.flags = PxHitFlag::eDISTANCE | PxHitFlag::eNORMAL;

	if(!FAllGrtrOrEq(fZero, toi))
	{
		// toi > 0: regular hit, distance is the swept length scaled by the time of impact.
		sweepHit.flags |= PxHitFlag::ePOSITION;
		const FloatV travelled = FMul(sweepLength, toi);
		const Vec3V closestWorld = boxToWorld.transform(closestA);
		const Vec3V hitNormalWorld = boxToWorld.rotate(normal);
		const Vec3V hitPointWorld = V3ScaleAdd(sweepDirWorld, travelled, closestWorld);
		V3StoreU(hitNormalWorld, sweepHit.normal);
		V3StoreU(hitPointWorld, sweepHit.position);
		FStore(travelled, &sweepHit.distance);
		return true;
	}

	// toi <= 0: the shapes overlap at the start of the sweep.
	if((!PX_IS_SPU) && wantMtd)
	{
		// MTD requested: toi itself is stored as the distance.
		sweepHit.flags |= PxHitFlag::ePOSITION;
		const Vec3V closestWorld = boxToWorld.transform(closestA);
		const Vec3V hitNormalWorld = boxToWorld.rotate(normal);
		const Vec3V hitPointWorld = V3NegScaleSub(hitNormalWorld, toi, closestWorld);
		V3StoreU(hitPointWorld, sweepHit.position);
		V3StoreU(hitNormalWorld, sweepHit.normal);
		FStore(toi, &sweepHit.distance);
	}
	else
	{
		// No MTD: zero distance, normal opposed to the sweep direction.
		sweepHit.distance	= 0.0f;
		sweepHit.normal		= -unitDir;
	}
	return true;
}
// Sweeps the oriented box 'box' along 'unitDir' (up to 'distance') against the box geometry
// 'geom' placed at 'pose', using a relative GJK raycast evaluated in the swept box's frame.
// Returns true and fills 'sweepHit' when the raycast reports a hit, false otherwise.
bool sweepBox_BoxGeom(GU_BOX_SWEEP_FUNC_PARAMS)
{
	PX_ASSERT(geom.getType() == PxGeometryType::eBOX);
	const PxBoxGeometry& targetBox = static_cast<const PxBoxGeometry&>(geom);

	const FloatV fZero = FZero();
	const Vec3V vZero = V3Zero();
	const FloatV sweepLength = FLoad(distance);
	const Vec3V sweepDirWorld = V3LoadU(unitDir);
	const Vec3V geomHalfExtents = V3LoadU(targetBox.halfExtents);
	const Vec3V sweptHalfExtents = V3LoadU(box.extents);

	const PxTransform sweptBoxPose = box.getTransform();

	const PsTransformV geomToWorld = loadTransformU(pose);
	const PsTransformV sweptToWorld = loadTransformU(sweptBoxPose);

	// Geometry box frame expressed relative to the swept box frame.
	const PsMatTransformV geomToSwept(sweptToWorld.transformInv(geomToWorld));

	BoxV geomBoxV(vZero, geomHalfExtents);
	BoxV sweptBoxV(vZero, sweptHalfExtents);

	// Full sweep vector rotated into the swept box's local frame.
	const Vec3V localSweep = sweptToWorld.rotateInv(V3Scale(sweepDirWorld, sweepLength));
	const bool wantMtd = hintFlags & PxHitFlag::eMTD;
	FloatV toi;
	Vec3V closestA, normal;// both are expressed in the local space of the swept box
	const bool hasHit = GJKRelativeRayCast(geomBoxV, sweptBoxV, geomToSwept, fZero, vZero, localSweep, toi, normal, closestA, inflation, wantMtd);

	if(!hasHit)
		return false;

	sweepHit.flags = PxHitFlag::eDISTANCE | PxHitFlag::eNORMAL;

	// toi <= 0 means the boxes overlap at the start of the sweep.
	const bool initialOverlap = FAllGrtrOrEq(fZero, toi) != 0;
	if(initialOverlap && !((!PX_IS_SPU) && wantMtd))
	{
		// Overlap without MTD: zero distance, normal opposed to the sweep direction.
		sweepHit.distance	= 0.0f;
		sweepHit.normal		= -unitDir;
		return true;
	}

	// Either a regular hit or an MTD query on overlapping boxes: position and normal are
	// produced the same way, only the stored distance differs (toi for MTD, scaled otherwise).
	sweepHit.flags |= PxHitFlag::ePOSITION;
	const FloatV reportedDistance = initialOverlap ? toi : FMul(sweepLength, toi);
	const Vec3V hitPointWorld = sweptToWorld.transform(closestA);
	const Vec3V hitNormalWorld = V3Normalize(sweptToWorld.rotate(normal));
	V3StoreU(V3Neg(hitNormalWorld), sweepHit.normal);
	V3StoreU(hitPointWorld, sweepHit.position);
	FStore(reportedDistance, &sweepHit.distance);
	return true;
}
// Sweeps the oriented box 'box' along 'unitDir' (up to 'distance') against the capsule
// geometry 'geom' placed at 'pose', using a GJK raycast evaluated in the box's local frame.
// Returns true and fills 'sweepHit' when the raycast reports a hit, false otherwise.
bool sweepBox_CapsuleGeom(GU_BOX_SWEEP_FUNC_PARAMS)
{
	using namespace Ps::aos;
	PX_ASSERT(geom.getType() == PxGeometryType::eCAPSULE);
	// NOTE(review): hintFlags is actually read below for eMTD; the marker is harmless and kept as-is.
	PX_UNUSED(hintFlags);
	const PxCapsuleGeometry& targetCapsule = static_cast<const PxCapsuleGeometry&>(geom);

	const FloatV fZero = FZero();
	const Vec3V vZero = V3Zero();
	const FloatV halfHeight = FLoad(targetCapsule.halfHeight);
	const FloatV radius = FLoad(targetCapsule.radius);
	const Vec3V boxHalfExtents = V3LoadU(box.extents);
	const FloatV sweepLength = FLoad(distance);
	const Vec3V sweepDirWorld = V3LoadU(unitDir);

	const PxTransform sweptBoxPose = box.getTransform();

	const PsTransformV capsuleToWorld = loadTransformU(pose);
	const PsTransformV boxToWorld = loadTransformU(sweptBoxPose);

	// Capsule frame expressed relative to the box frame.
	const PsMatTransformV capsuleToBox(boxToWorld.transformInv(capsuleToWorld));

	BoxV boxShape(vZero, boxHalfExtents);
	CapsuleV capsuleShape(capsuleToBox.p, capsuleToBox.rotate(V3Scale(V3UnitX(), halfHeight)), radius);

	// Full sweep vector rotated into the box's local frame.
	const Vec3V localSweep = boxToWorld.rotateInv(V3Scale(sweepDirWorld, sweepLength));

	const bool wantMtd = hintFlags & PxHitFlag::eMTD;
	FloatV toi;
	Vec3V closestA, normal;// both are expressed in the local space of the box
	const bool hasHit = GJKLocalRayCast(capsuleShape, boxShape, fZero, vZero, localSweep, toi, normal, closestA, targetCapsule.radius+inflation, wantMtd);

	if(!hasHit)
		return false;

	sweepHit.flags = PxHitFlag::eDISTANCE | PxHitFlag::eNORMAL;

	// toi <= 0 means the shapes overlap at the start of the sweep.
	const bool initialOverlap = FAllGrtrOrEq(fZero, toi) != 0;
	if(initialOverlap && !((!PX_IS_SPU) && wantMtd))
	{
		// Overlap without MTD: zero distance, normal opposed to the sweep direction.
		sweepHit.distance	= 0.0f;
		sweepHit.normal		= -unitDir;
		return true;
	}

	// Regular hit, or MTD query on overlapping shapes: identical position/normal handling,
	// only the stored distance differs (toi itself for MTD, swept length * toi otherwise).
	sweepHit.flags |= PxHitFlag::ePOSITION;
	const FloatV reportedDistance = initialOverlap ? toi : FMul(sweepLength, toi);
	const Vec3V hitPointWorld = boxToWorld.transform(closestA);
	const Vec3V hitNormalWorld = boxToWorld.rotate(normal);
	V3StoreU(V3Neg(hitNormalWorld), sweepHit.normal);
	V3StoreU(hitPointWorld, sweepHit.position);
	FStore(reportedDistance, &sweepHit.distance);
	return true;
}
// Sweeps the oriented box 'box' along 'unitDir' (up to 'distance') against the sphere
// geometry 'geom' placed at 'pose'. The sphere is handed to the GJK raycast as a CapsuleV
// built from a centre and a radius; the raycast is evaluated in the box's local frame.
// Returns true and fills 'sweepHit' when the raycast reports a hit, false otherwise.
bool sweepBox_SphereGeom(GU_BOX_SWEEP_FUNC_PARAMS)
{
	PX_ASSERT(geom.getType() == PxGeometryType::eSPHERE);
	// NOTE(review): hintFlags is actually read below for eMTD; the marker is harmless and kept as-is.
	PX_UNUSED(hintFlags);
	const PxSphereGeometry& targetSphere = static_cast<const PxSphereGeometry&>(geom);

	const FloatV fZero = FZero();
	const Vec3V vZero = V3Zero();
	const Vec3V boxHalfExtents = V3LoadU(box.extents);
	const FloatV sweepLength = FLoad(distance);
	const Vec3V sweepDirWorld = V3LoadU(unitDir);
	const FloatV radius = FLoad(targetSphere.radius);

	/*
		DE10168
		The original code was:
			const PxTransform boxWorldPose = box.getTransform();
		Starting with SDK 1.700 it breaks various PS4 non-debug tests
		(SqTestSweep and RecoveryModuleTest).

		For some unknown reason the vector elements end up in the wrong location
		(seemingly in y where they should be in x).

		While the real cause is being investigated, the equivalent code is used directly.
	*/
	const PxTransform sweptBoxPose = PxTransform(box.center, PxQuat(box.rot));

	const PsTransformV sphereToWorld = loadTransformU(pose);
	const PsTransformV boxToWorld = loadTransformU(sweptBoxPose);

	// Sphere frame expressed relative to the box frame.
	const PsMatTransformV sphereToBox(boxToWorld.transformInv(sphereToWorld));

	BoxV boxShape(vZero, boxHalfExtents);
	CapsuleV sphereShape(sphereToBox.p, radius);

	// Full sweep vector rotated into the box's local frame.
	const Vec3V localSweep = boxToWorld.rotateInv(V3Scale(sweepDirWorld, sweepLength));

	const bool wantMtd = hintFlags & PxHitFlag::eMTD;
	FloatV toi;
	Vec3V closestA, normal;// both are expressed in the local space of the box
	const bool hasHit = GJKLocalRayCast(sphereShape, boxShape, fZero, vZero, localSweep, toi, normal, closestA, targetSphere.radius+inflation, wantMtd);

	if(!hasHit)
		return false;

	sweepHit.flags = PxHitFlag::eDISTANCE | PxHitFlag::eNORMAL;

	// toi <= 0 means the shapes overlap at the start of the sweep.
	const bool initialOverlap = FAllGrtrOrEq(fZero, toi) != 0;
	if(initialOverlap && !((!PX_IS_SPU) && wantMtd))
	{
		// Overlap without MTD: zero distance, normal opposed to the sweep direction.
		sweepHit.distance	= 0.0f;
		sweepHit.normal		= -unitDir;
		return true;
	}

	// Regular hit, or MTD query on overlapping shapes: identical position/normal handling,
	// only the stored distance differs (toi itself for MTD, swept length * toi otherwise).
	sweepHit.flags |= PxHitFlag::ePOSITION;
	const FloatV reportedDistance = initialOverlap ? toi : FMul(sweepLength, toi);
	const Vec3V hitPointWorld = boxToWorld.transform(closestA);
	const Vec3V hitNormalWorld = V3Neg(boxToWorld.rotate(normal));
	V3StoreU(hitNormalWorld, sweepHit.normal);
	V3StoreU(hitPointWorld, sweepHit.position);
	FStore(reportedDistance, &sweepHit.distance);
	return true;
}