// Records one scene query plus its hits for later transmission.
//
// query       - entry to finalise; receives the filter flags and a reference to its hits
// accumulated - list the finalised query is appended to
// arrayName   - name stored in the hit reference (identifies the array `dst` belongs to)
// dst         - shared hit array; the `nb` converted hits are appended at its end
// src, nb     - source hits and their count
// fd          - filter data whose `flags` are copied into the query
static void accumulate(PvdHitType& query, Ps::Array<PvdHitType>& accumulated, const char* arrayName, Ps::Array<PvdSqHit>& dst, const SDKHitType* src, PxU32 nb, const PxQueryFilterData& fd)
{
	// The reference must be built from dst's size *before* any hit is appended,
	// so that it designates the range [first, first + nb).
	const PxU32 firstHitSlot = dst.size();

	query.mFilterFlags = fd.flags;
	query.mHits = PvdReference(arrayName, firstHitSlot, nb);

	PX_ASSERT(PxU32(-1) != nb);

	PxU32 hitIdx = 0;
	while(hitIdx < nb)
	{
		dst.pushBack(PvdSqHit(src[hitIdx]));
		++hitIdx;
	}

	// Store the query only once it is fully filled in: pushBack takes a copy.
	accumulated.pushBack(query);
}
		// Merges the collected CUDA occurrences into groups and reports every group to `callback`.
		//
		// Occurrences are sorted with SortCUDAProfileOccurences (by event id and start time, per
		// the original note). Consecutive occurrences with the same event id whose time ranges
		// touch or overlap are fused into one [startTime, endTime] range. The groups are then
		// reported from the last one to the first.
		void ProfileEventHandler::reportCudaCollection(PxBufferedProfilerCallback& callback)
		{
			const PxU32 occurenceCount = mCUDAEventOccurences.size();
			if(occurenceCount == 0)
				return;

			Ps::sort(mCUDAEventOccurences.begin(), occurenceCount, SortCUDAProfileOccurences());

			Ps::Array<CUDAProfileEventOccurence> groups;

			// `pending` is the group currently being grown; it is flushed whenever the next
			// occurrence cannot be merged into it.
			CUDAProfileEventOccurence pending = mCUDAEventOccurences[0];

			for (PxU32 idx = 1; idx < occurenceCount; ++idx)
			{
				const CUDAProfileEventOccurence& next = mCUDAEventOccurences[idx];

				const bool sameEvent = !(pending.eventId != next.eventId);
				if(sameEvent && pending.endTime >= next.startTime)
				{
					// Same event and overlapping in time: extend the pending group.
					pending.endTime = PxMax(pending.endTime, next.endTime);
					continue;
				}

				// Different event id, or a gap in time: close the group and start a new one.
				groups.pushBack(pending);
				pending = next;
			}
			groups.pushBack(pending);

			// Report the groups, walking the array backwards.
			PxU32 remaining = groups.size();
			while(remaining != 0)
			{
				--remaining;
				const CUDAProfileEventOccurence& group = groups[remaining];

				const char* eventName = mProfileZoneInterface->getProfileEventName(group.eventId);
				const char* zoneName = mProfileZoneInterface->getName();

				PxBufferedProfilerEvent bufferedEvent = { group.startTime, group.endTime, eventName, zoneName, group.eventId, 0, 0, 0, 0 };

				callback.onEvent(bufferedEvent);
			}
		}
// Builds one constraint per unique edge of the triangle mesh and appends it to mConstraints.
//
// Every triangle contributes its three sides as edge records (mapped to particle ids through
// mVertexToParticleMap). The records are sorted so that equal edges end up next to each other,
// which the second loop relies on. Each run of equal edges yields a single constraint:
//  - particleId[0..1]: the edge end points, stretchingRestLength: their rest distance
//  - particleId[2]:    the opposite vertex of the first triangle using the edge
//  - particleId[3]:    the opposite vertex of the second triangle, or -1 for a border edge;
//                      bendingRestLength is the rest distance between the two opposite vertices
//  - particleId[4..5]: -1 (only used when torn)
void DeformableMesh::generateConstraintsFromTriangles()
{
	const PxU32 numTriangles = mPrimitives.size() / 3;

	DeformableTriEdge e;
	Ps::Array<DeformableTriEdge> edges PX_DEBUG_EXP("defoMeshEdges");
	edges.reserve(numTriangles * 3);
	const PxU32 *i0 = mPrimitives.begin();

	// Emit the three sides of each triangle; the third argument is the vertex opposite the edge.
	for (PxU32 i = 0; i < numTriangles; i++, i0 += 3) 
	{
		const PxU32 *i1 = i0+1;
		const PxU32 *i2 = i0+2;
		e.set(mVertexToParticleMap[*i0], mVertexToParticleMap[*i1], mVertexToParticleMap[*i2], i); 
		edges.pushBack(e);
		e.set(mVertexToParticleMap[*i1], mVertexToParticleMap[*i2], mVertexToParticleMap[*i0], i); 
		edges.pushBack(e);
		e.set(mVertexToParticleMap[*i2], mVertexToParticleMap[*i0], mVertexToParticleMap[*i1], i); 
		edges.pushBack(e);
	}

	// Only sort when there is something to sort: on an empty mesh the unsigned expression
	// edges.size()-1 would wrap around to 0xFFFFFFFF instead of denoting an empty range.
	if (edges.size() > 0)
		quickSortEdges(edges, 0, edges.size()-1);

	DeformableConstraint constraint;
	constraint.particleId[4] = -1;	// only used when torn
	constraint.particleId[5] = -1;	// only used when torn

	// Walk the sorted list; `i` is advanced inside the body past the whole run of edges equal to e0.
	for(PxU32 i=0; i<edges.size(); )
	{
		const DeformableTriEdge &e0 = edges[i];
		PxU32 p0 = constraint.particleId[0] = e0.particleId[0];
		PxU32 p1 = constraint.particleId[1] = e0.particleId[1];
		PxVec3 d0 = mWeldedVertices[p1] - mWeldedVertices[p0]; 

		constraint.stretchingRestLength = d0.magnitude();
		constraint.particleId[2] = e0.third;
		constraint.particleId[3] = -1;			// for border edges -> no bending possible
		constraint.bendingRestLength = 0.0f;
		constraint.flags = 0;
		if (++i < edges.size()) {
			const DeformableTriEdge &e1 = edges[i];
			if (e0 == e1) {
				// Edge shared by a second triangle: its opposite vertex enables bending.
				// particleId[2] already holds e0.third from above.
				constraint.particleId[3] = e1.third;
				PxVec3 d1 = mWeldedVertices[e1.third] - mWeldedVertices[e0.third]; 
				constraint.bendingRestLength = d1.magnitude();
			}
			// Skip any further duplicates of this edge (more than two triangles per edge).
			while (i < edges.size() && edges[i] == e0)
				++i;
		}
		mConstraints.pushBack(constraint);
	}
}
// Returns `num` indices, read from `indexBuffer`, to the pool's free list.
//
// The call is rejected with a debug warning (and nothing is freed) when `num` exceeds the
// number of indices currently handed out, i.e. mIndexCount minus the free-list size.
// When PX_CHECK is defined, it is also rejected if any supplied index is not below mIndexCount.
void InternalIndexPool::freeIndices(PxU32 num, const PxStrideIterator<const PxU32>& indexBuffer)
{
	const PxU32 indicesInUse = mIndexCount - mFreeList.size();
	if (indicesInUse < num)
	{
		Ps::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, 
			"PxParticleExt::IndexPool::freeIndices: Provided number of indices exceeds number of actually allocated indices. Call faild.");
		return;
	}

#ifdef PX_CHECK
	// Validate the whole buffer up front so that a bad index leaves the free list untouched.
	for (PxU32 k = 0; k != num; ++k)
	{
		if (indexBuffer[k] >= mIndexCount)
		{
			Ps::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, 
				"PxParticleExt::IndexPool::freeIndices: Provided indices which where not actually allocated before. Call failed.");
			return;
		}
	}
#endif

	PxU32 k = 0;
	while (k < num)
	{
		mFreeList.pushBack(indexBuffer[k]);
		++k;
	}
}
// Finds intersecting pairs within a single set of boxes by sorting and sweeping along one axis.
//
// bounds, nb - the boxes and their count
// pairs      - output; cleared first, then filled with consecutive index couples
//              (pairs[2k], pairs[2k+1]) of boxes for which PxBounds3::intersects() is true
// axes       - only axes.mAxis0 is used (as the sweep axis); mAxis1/mAxis2 are read but unused
//
// Returns false when nb is zero (pairs is left empty), true otherwise.
bool Cm::CompleteBoxPruning(const PxBounds3* bounds, PxU32 nb, Ps::Array<PxU32>& pairs, const Axes& axes)
{
	pairs.clear();

	// Nothing to do for an empty set
	if(!nb)
		return false;

	// Catch axes
	const PxU32 Axis0 = axes.mAxis0;
	const PxU32 Axis1 = axes.mAxis1;
	const PxU32 Axis2 = axes.mAxis2;

	PX_UNUSED(Axis1);
	PX_UNUSED(Axis2);

	// Temporary array holding each box's minimum along the sweep axis; released at the end
	float* PosList = reinterpret_cast<float*>(PX_ALLOC_TEMP(sizeof(float)*nb, "Cm::CompleteBoxPruning"));

	// 1) Build main list using the primary axis
	for(PxU32 i=0;i<nb;i++)	PosList[i] = bounds[i].minimum[Axis0];

	// 2) Sort the list. `Sorted` is used below as the sequence of box indices ordered by PosList.
	/*static*/ RadixSortBuffered RS;	// Local instance; the "static for coherence" variant is disabled
	const PxU32* Sorted = RS.Sort(PosList, nb).GetRanks();

	// 3) Prune the list
	// `Sorted` walks the boxes in order; `RunningAddress` is a second cursor into the same
	// sorted sequence that only ever moves forward.
	const PxU32* const LastSorted = &Sorted[nb];
	const PxU32* RunningAddress = Sorted;
	PxU32 Index0, Index1;
	while(RunningAddress<LastSorted && Sorted<LastSorted)
	{
		Index0 = *Sorted++;

		// Skip entries that start before the current box. Note the post-increment runs on every
		// evaluated comparison, including the one that fails, so the cursor ends up one entry
		// past the first element that is not smaller than PosList[Index0].
		while(RunningAddress<LastSorted && PosList[*RunningAddress++]<PosList[Index0]);

		const PxU32* RunningAddress2 = RunningAddress;

		// Visit the following entries whose minimum does not exceed the current box's maximum on
		// the sweep axis; those are the only candidates that can overlap along that axis.
		while(RunningAddress2<LastSorted && PosList[Index1 = *RunningAddress2++]<=bounds[Index0].maximum[Axis0])
		{
			// Never pair a box with itself
			if(Index0!=Index1)
			{
				// Full box test decides whether the candidate is reported
				if(bounds[Index0].intersects(bounds[Index1]))
				{
					pairs.pushBack(Index0);
					pairs.pushBack(Index1);
				}
			}
		}
	}

	PX_FREE(PosList);

	return true;
}
		// Decodes a raw CUDA profile buffer and appends one occurrence per record to
		// mCUDAEventOccurences.
		//
		// Only bufferVersion 1 is handled; any other version is ignored. The first two
		// (unnamed) parameters are unused. `cudaData` is interpreted as a packed array of
		// fixed-size records; trailing bytes that do not fill a whole record are dropped.
		void ProfileEventHandler::onCUDAProfileBuffer( PxU64 , PxF32  , const PxU8* cudaData, PxU32 bufLenInBytes, PxU32 bufferVersion )
		{
			if(bufferVersion != 1)
				return;

			// Layout of one record in a version-1 buffer.
			struct WarpProfileRecord
			{
				PxU16 block;
				PxU8  warp;
				PxU8  mpId;
				PxU8  hwWarpId;
				PxU8  userDataCfg;
				PxU16 eventId;
				PxU32 startTime;
				PxU32 endTime;
			};

			// One overflow tracker per mpId seen in this buffer. The 32-bit timers can run
			// through their maximum value and restart at zero; the tracker is what corrects
			// for that (per the original note on rollover).
			Ps::Array<Local::OverflowRecordPair> trackers;

			const WarpProfileRecord* records = reinterpret_cast<const WarpProfileRecord*>(cudaData);
			const PxU32 recordCount = PxU32(bufLenInBytes/sizeof(WarpProfileRecord));

			for (PxU32 recIdx = 0; recIdx < recordCount; ++recIdx)
			{
				const WarpProfileRecord& rec = records[recIdx];

				// Look up the tracker that belongs to this record's mpId.
				Local::OverflowRecord* tracker = NULL;
				const PxU32 trackerCount = trackers.size();
				for (PxU32 t = 0; t < trackerCount && tracker == NULL; ++t)
				{
					if(trackers[t].mpId == rec.mpId)
						tracker = &trackers[t].mOverflowRecord;
				}

				// First record for this mpId: create its tracker.
				if(tracker == NULL)
				{
					Local::OverflowRecordPair fresh;
					fresh.mpId = rec.mpId;
					trackers.pushBack(fresh);
					tracker = &trackers.back().mOverflowRecord;
				}

				// Feed start before end: the tracker sees the timer values in this order.
				const PxU64 startTime = tracker->NextValue(rec.startTime);
				const PxU64 endTime = tracker->NextValue(rec.endTime);

				CUDAProfileEventOccurence occurence = { rec.eventId, startTime, endTime };
				mCUDAEventOccurences.pushBack(occurence);
			}
		}
static void collectBatchedHits(const QueryResultT* results, Ps::Array<PvdHitType>& accumulated, Ps::Array<PvdSqHit>& pvdSqHits, PxU32 nb, PxU32 startIdx, const char* arrayName)
{
	for(PxU32 i=0; i<nb; i++)
	{
		const QueryResultT& result = results[i];
		if(result.queryStatus != PxBatchQueryStatus::eSUCCESS)
			continue;

		PvdHitType& query = accumulated[startIdx + i];
		const PxU32 nbAnyHits = result.getNbAnyHits();
		if(query.mHits.mCount != nbAnyHits)
		{
			query.mHits = PvdReference(arrayName, pvdSqHits.size(), nbAnyHits);

			for(PxU32 j=0; j<nbAnyHits; j++)
				pvdSqHits.pushBack(PvdSqHit(result.getAnyHit(j)));
		}
	}
}
// Appends `item` to `array` and stores in `ref` a one-element reference to the slot it occupies.
//
// `ref` is written before the append on purpose: it is derived from the array size prior to
// insertion, and if `ref` happens to live inside `item` the stored copy then already carries it.
template<class Type> static void pushBackT(Ps::Array<Type>& array, const Type& item, PvdReference& ref, const char* arrayName)
{
	const PxU32 slot = array.size();
	ref = PvdReference(arrayName, slot, 1);

	array.pushBack(item);
}
// Finds intersecting pairs between two sets of boxes by sorting both sets and sweeping along one axis.
//
// bounds0, nb0 - first set of boxes and its count
// bounds1, nb1 - second set of boxes and its count
// pairs        - output; cleared first, then filled with consecutive index couples where
//                pairs[2k] indexes bounds0 and pairs[2k+1] indexes bounds1
// axes         - only axes.mAxis0 is used (as the sweep axis); mAxis1/mAxis2 are read but unused
//
// Returns false when either set is empty (pairs is left empty), true otherwise.
bool Cm::BipartiteBoxPruning(const PxBounds3* bounds0, PxU32 nb0, const PxBounds3* bounds1, PxU32 nb1, Ps::Array<PxU32>& pairs, const Axes& axes)
{
	pairs.clear();
	// Nothing to do when one of the sets is empty
	if(nb0 == 0 || nb1 == 0)
		return false;

	// Catch axes
	PxU32 Axis0 = axes.mAxis0;
	PxU32 Axis1 = axes.mAxis1;
	PxU32 Axis2 = axes.mAxis2;

	PX_UNUSED(Axis1);
	PX_UNUSED(Axis2);

	// Temporary arrays holding each box's minimum along the sweep axis; released at the end
	float* MinPosBounds0 = reinterpret_cast<float*>(PX_ALLOC_TEMP(sizeof(float)*nb0, "Gu::BipartiteBoxPruning"));
	float* MinPosBounds1 = reinterpret_cast<float*>(PX_ALLOC_TEMP(sizeof(float)*nb1, "Gu::BipartiteBoxPruning"));

	// 1) Build main lists using the primary axis
	for(PxU32 i=0;i<nb0;i++)	MinPosBounds0[i] = bounds0[i].minimum[Axis0];
	for(PxU32 i=0;i<nb1;i++)	MinPosBounds1[i] = bounds1[i].minimum[Axis0];

	// 2) Sort the lists
	//static RadixSort RS0, RS1;	// Static for coherence. Crashes on exit
	RadixSortBuffered RS0, RS1;	// Non-static instances are used instead (see the note above)

	// Sorted0/Sorted1 are used below as the sequences of box indices ordered by minimum
	const PxU32* Sorted0 = RS0.Sort(MinPosBounds0, nb0).GetRanks();
	const PxU32* Sorted1 = RS1.Sort(MinPosBounds1, nb1).GetRanks();

	// 3) Prune the lists
	PxU32 Index0, Index1;

	const PxU32* const LastSorted0 = &Sorted0[nb0];
	const PxU32* const LastSorted1 = &Sorted1[nb1];
	const PxU32* RunningAddress0 = Sorted0;
	const PxU32* RunningAddress1 = Sorted1;

	// First pass: for every box of set 0 (Index0), look at the boxes of set 1 (Index1) whose
	// minimum lies in [min(Index0), max(Index0)] on the sweep axis.
	while(RunningAddress1<LastSorted1 && Sorted0<LastSorted0)
	{
		Index0 = *Sorted0++;

		// Skip set-1 boxes that start strictly before the current set-0 box
		while(RunningAddress1<LastSorted1 && MinPosBounds1[*RunningAddress1]<MinPosBounds0[Index0])	RunningAddress1++;

		const PxU32* RunningAddress2_1 = RunningAddress1;

		while(RunningAddress2_1<LastSorted1 && MinPosBounds1[Index1 = *RunningAddress2_1++]<=bounds0[Index0].maximum[Axis0])
		{
			if(bounds0[Index0].intersects(bounds1[Index1]))
			{
				pairs.pushBack(Index0);
				pairs.pushBack(Index1);
			}
		}
	}

	// Second pass with the roles swapped: here Index0 is a set-1 box and Index1 a set-0 box.
	// The skip below uses <= (not <), so couples whose minima are equal are left to the first
	// pass and are not reported a second time.

	while(RunningAddress0<LastSorted0 && Sorted1<LastSorted1)
	{
		Index0 = *Sorted1++;

		// Skip set-0 boxes that start at or before the current set-1 box
		while(RunningAddress0<LastSorted0 && MinPosBounds0[*RunningAddress0]<=MinPosBounds1[Index0])	RunningAddress0++;

		const PxU32* RunningAddress2_0 = RunningAddress0;

		while(RunningAddress2_0<LastSorted0 && MinPosBounds0[Index1 = *RunningAddress2_0++]<=bounds1[Index0].maximum[Axis0])
		{
			if(bounds0[Index1].intersects(bounds1[Index0]))
			{
				// Keep the output order (set-0 index, set-1 index)
				pairs.pushBack(Index1);
				pairs.pushBack(Index0);
			}
		}
	}

	PX_FREE(MinPosBounds1);
	PX_FREE(MinPosBounds0);

	return true;
}
void DeformableMesh::generateConstraintsFromTetrahedra()
{
	PxU32 edgeIndices[6][2] = { {0,1}, {0,2}, {0,3}, {1,2}, {1,3}, {2,3} };
	PxU32 *tetIndices;

	// - tetrahedra are assumed to be unique (thus no parent code here)

	PxU32 numTetrahedra = mPrimitives.size() / 4;

	DeformableTetraEdge e;
	Ps::Array<DeformableTetraEdge> edges PX_DEBUG_EXP("defoMeshEdges2");
	edges.reserve(numTetrahedra * 6);
	tetIndices = mPrimitives.begin();
	PxU32 i, j;
	for (i = 0; i < numTetrahedra; i++, tetIndices += 4) 
	{
		for(j = 0; j < 6; j++) 
		{
			PxU32 e0 = mVertexToParticleMap[tetIndices[edgeIndices[j][0]]];
			PxU32 e1 = mVertexToParticleMap[tetIndices[edgeIndices[j][1]]];
			e.set(e0, e1, i); edges.pushBack(e);	
		}
	}
	
	quickSortTetraEdges(edges, 0, edges.size()-1);

	mConstraints.resize(numTetrahedra);

	DeformableConstraint constraint;
	DeformableTetraEdge *tetEdges[6];
	tetIndices = mPrimitives.begin();
	
	bool warningIssued = false;
	for (i = 0; i < numTetrahedra; i++, tetIndices += 4) 
	{
		for (j = 0; j < 6; j++) 
		{
			PxU32 e0 = mVertexToParticleMap[tetIndices[edgeIndices[j][0]]];
			PxU32 e1 = mVertexToParticleMap[tetIndices[edgeIndices[j][1]]];
			DeformableTetraEdge goalEdge;
			goalEdge.set(e0, e1, i);
			tetEdges[j] = binarySearchTetraEdge(edges, goalEdge);
		}

		for (j = 0; j < 4; j++)
			constraint.particleId[j] = mVertexToParticleMap[tetIndices[j]];
		
		PxVec3 groundArea = (mWeldedVertices[tetIndices[1]] - mWeldedVertices[tetIndices[0]]).cross(mWeldedVertices[tetIndices[2]] - mWeldedVertices[tetIndices[0]]);
		constraint.restVolume = groundArea.dot(mWeldedVertices[tetIndices[3]] - mWeldedVertices[tetIndices[0]]);
		constraint.flags = 0;

		if (!warningIssued && constraint.restVolume < 0.0f)
		{
			Ps::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, "Soft body mesh tetrahedron %d has illegal winding order.", i);
			warningIssued = true;
		}

		for (j = 0; j < 6; j++) 
		{
			PxU32 e0 = mVertexToParticleMap[tetIndices[edgeIndices[j][0]]];
			PxU32 e1 = mVertexToParticleMap[tetIndices[edgeIndices[j][1]]];
			PxVec3 edgeVec = mWeldedVertices[e1] - mWeldedVertices[e0];
			if(tetEdges[j]) 
			{
				PX_ASSERT(tetEdges[j]->tetrahedron == i);
				constraint.restEdgeLengths[j] = edgeVec.magnitude();
			} 
			else 
				constraint.restEdgeLengths[j] = -edgeVec.magnitude();
		}

		mConstraints[i] = constraint;
	}
}