Пример #1
0
// Probes whether writing to the surface at Video is slow compared with
// writing to ordinary heap memory, by timing memcpy() into each for the same
// time window and comparing how many passes completed.
//
// Video    - pointer into the surface; rebased below when a stride is negative
// BPP      - bits per pixel; the benchmark below only runs when this is 16
// Width    - surface width, in pixels
// Height   - surface height, in pixels
// StrideX  - signed byte step between horizontally adjacent pixels
// StrideY  - signed byte step between vertically adjacent pixels
// Result   - receives 0 (fast) or 1 (slow). In the code below it is written
//            only when the benchmark actually runs; it is left untouched
//            when the allocation fails or the row is rejected by the
//            Len / pointer checks.
void SlowVideo(char* Video,int BPP,int Width,int Height,int StrideX,int StrideY,int32_t* Result)
{
	if (BPP == 16)
	{
		int64_t t0,t1,t;
		int nv,ns;		// completed passes: nv = into Video, ns = into System
		int Mul,Len;

		// Reference buffer in ordinary heap memory.
		int SystemLength = 512*1024;
		char* System = (char*) malloc( SystemLength ); //hopefully cpu cache will be smaller

		if (System)
		{
			// TRY_BEGIN / TRY_END are project macros; presumably they guard the
			// raw accesses to Video against faults - confirm against their definition.
			TRY_BEGIN

			// Normalise negative strides: make the stride positive and move
			// Video back to the lowest-addressed pixel along that axis.
			if (StrideX < 0)
			{
				StrideX = -StrideX;
				Video -= StrideX * (Width - 1);
			}

			if (StrideY < 0)
			{
				StrideY = -StrideY;
				Video -= StrideY * (Height - 1);
			}

			// Make X the axis with the smaller stride, so that Width counts
			// pixels along it.
			if (StrideX > StrideY)
			{
				Swap(&Width,&Height);
				Swap(&StrideX,&StrideY);
			}		

			// Bytes in one run of Width pixels.
			Len = (Width*BPP) >> 3;

			// Skip very short rows and rows the Win32 pointer probes reject.
			// NOTE(review): Microsoft documents IsBadWritePtr/IsBadReadPtr as obsolete.
			if (Len>32 && !IsBadWritePtr(Video,Len) && !IsBadReadPtr(Video,Len))
			{
				int i;
				// Number of whole Len-byte rows that fit in the System buffer.
				int Rows = SystemLength / Len;

				// Zero the used part of System, then keep a copy of the current
				// first Video row in System[0..Len). The video loop below writes
				// that same data back, so the surface contents do not change.
				memset(System,0,Rows*Len);
				memcpy(System,Video,Len);

				// Presumably BeginCounter stores the counter frequency (ticks per
				// second) in t; halving it gives the window length noted below.
				BeginCounter(&t);

				t >>= 1; // 0.5 sec

				SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_HIGHEST);

				// Pass 1: count how many batches of Rows copies into Video
				// complete before the deadline t0.
				nv=0;
				GetCounter(&t0);
				t0 += t;				
				do
				{
					for (i=0;i<Rows;++i)
						memcpy(Video,System,Len);
					nv++;
					GetCounter(&t1);
				} while (t1 < t0);

				// Pass 2: same amount of copying per batch, but each row goes to
				// a different Len-byte slice of System.
				// NOTE(review): for i == 0 source and destination are the same
				// address; the C standard leaves memcpy on overlapping objects
				// undefined.
				ns=0;
				GetCounter(&t0);
				t0 += t;				
				do
				{
					for (i=0;i<Rows;i++)
						memcpy(System+i*Len,System,Len);
					ns++;
					GetCounter(&t1);
				} while (t1 < t0);

				// Video counts as fast when it achieved at least 1/Mul of the
				// system-memory batch count.
				Mul = 6;

				if (nv*Mul >= ns)
					*Result = 0; // fast
				else
					*Result = 1; // slow (video memory or in general)

				// NOTE(review): restores NORMAL unconditionally, not the priority
				// the thread had on entry.
				SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_NORMAL);
				EndCounter();

#ifdef BENCH
				// Debug build aid: show both batch counts and the verdict.
				{
					tchar_t Msg[256];
					stprintf_s(Msg,TSIZEOF(Msg),T("Video %d\nSystem %d\nResult %d"),nv,ns,*Result);
					MessageBox(NULL,Msg,T(""),MB_OK|MB_SETFOREGROUND);
				}
#endif
			}

			TRY_END

			free(System);
		}
Пример #2
0
// Advances the simulation by one frame.
//
// While g_endgame is set, the whole frame is delegated to EndgameUpdate().
// Otherwise three timed phases run in order:
//   1. Binning   - rebuild the bin grid and drop every node that has no child
//                  into the first free slot of its bin.
//   2. Targeting - call FindNearestNeighbor() for every node.
//   3. Movement  - step every node toward its target and call Chomp() when a
//                  parentless node started the frame within g_tailDist of it.
//
// deltaTime: frame time; scaled by g_speed to get the step of parentless nodes.
// Returns:   EndgameInit()'s result when exactly one active node is left on
//            exit, otherwise hr (S_OK unless a callee reported otherwise).
HRESULT Update(double deltaTime)
{
	if (g_endgame)
		return EndgameUpdate(deltaTime);

	HRESULT hr = S_OK;

	// TODO: Optimize for cache coherency
	//		 Chains of nodes could be stored linearly in memory, which would make the update loop over a chain
	//		 very fast (most chains would probably fit in one cache line). It would cost a lot of memory moves and
	//		 extra complexity, and the average frame is already under 1ms, so it has not been done.

	// ---- Phase 1: sort nodes into bins ----
	BeginCounter(&binningCounter);
	{
		// Screen area available per active node; the square root is used as a
		// conservative bin edge length in pixels.
		// NOTE(review): if these globals are integers the quotient is truncated
		// before the float conversion, and g_numActiveNodes == 0 would divide
		// by zero - confirm.
		float areaPerNode = float((g_width * g_height) / g_numActiveNodes);
		float binEdgePixels = sqrt(areaPerNode);

		// Bin extent as a fraction of the screen along each axis.
		g_binNHeight = binEdgePixels / g_height;
		g_binNWidth  = binEdgePixels / g_width;

		// Bins per axis; the +2 adds a boundary around the outside.
		g_binCountX  = uint(ceilf(1.0f / g_binNWidth) )+2;
		g_binCountY  = uint(ceilf(1.0f / g_binNHeight))+2;

		// g_binUpdateIter selects which of the g_numBinSplits x g_numBinSplits
		// sub-rectangles of the grid is binned this frame. Each range is widened
		// by one bin on both ends: the widened layer overlaps the neighbouring
		// sub-rectangles, and without it nodes could only target nodes inside
		// their own sub-rectangle.
		uint splitCol = g_binUpdateIter % g_numBinSplits;
		uint splitRow = g_binUpdateIter / g_numBinSplits;
		g_binRangeX[0] = (g_binCountX * splitCol/g_numBinSplits)		  - 1;
		g_binRangeX[1] = (g_binCountX * (splitCol+1)/g_numBinSplits - 1) + 1;
		g_binRangeY[0] = (g_binCountY * splitRow/g_numBinSplits)		  - 1;
		g_binRangeY[1] = (g_binCountY * (splitRow+1)/g_numBinSplits - 1) + 1;

		// Slots available to each bin of the selected range.
		g_binStride  = g_numSlots / ((g_binRangeX[1] - g_binRangeX[0] + 1) * (g_binRangeY[1] - g_binRangeY[0] + 1));

		int binIndex;
		memset(g_slots, EMPTY_SLOT, sizeof(g_slots));
		for (uint node = 0; node < g_numNodes; ++node)
		{
			// Only the chompable tails (nodes without a child) are binned.
			if (g_nodes[node].attribs.hasChild == true)
				continue;

			hr = Bin(g_nodes[node].position.getX(), g_nodes[node].position.getY(), &binIndex);
			if (!FAILED(hr))
			{
				// Scan this bin for its first empty slot.
				uint slot = 0;
				while (slot < g_binStride && g_slots[binIndex*g_binStride + slot] != EMPTY_SLOT)
					++slot;

				// A full bin means this node simply cannot be targeted this frame.
				if (slot < g_binStride)
					g_slots[binIndex*g_binStride + slot] = node;
			}
			// else: the bin is not backed by memory, so the node cannot be a target this frame
		}

		// Move on to the next sub-rectangle for the following frame.
		g_binUpdateIter = (g_binUpdateIter+1) % (g_numBinSplits*g_numBinSplits);
	}
	EndCounter(&binningCounter);

	// ---- Phase 2: pick a target for every node ----
	BeginCounter(&nearestNeighborCounter);
	for (uint node = 0; node < g_numNodes; ++node)
	{
		// IFC presumably stores the result in hr and jumps to Cleanup on failure.
		IFC( FindNearestNeighbor(node) );
	}
	EndCounter(&nearestNeighborCounter);

	// ---- Phase 3: move every node toward its target ----
	BeginCounter(&positionUpdate);
	for (uint node = 0; node < g_numNodes; ++node)
	{
		// Grab both nodes up front to keep the memory accesses together.
		Node& self = g_nodes[node];
		Node& goal = g_nodes[self.attribs.targetID];

		// Vector from this node to its target, computed in float.
		float2 toGoal;
		toGoal.x = goal.position.getX() - self.position.getX(); 
		toGoal.y = goal.position.getY() - self.position.getY();

		// Unit direction; left as the raw (zero) vector when the nodes coincide.
		float distance = toGoal.getLength();
		float2 heading = toGoal;
		if (distance != 0)
			heading = heading / distance;

		// Displacement for this frame.
		float2 step;
		if (!self.attribs.hasParent)
		{
			// Parentless node: advance by g_speed * deltaTime along the heading,
			// combined with the full vector through min().
			// NOTE(review): min() on float2 is defined elsewhere; if it is
			// component-wise, negative components would pick the longer move - confirm.
			step = min(toGoal, heading * float(g_speed * deltaTime));
		}
		else
		{
			// Node with a parent: stop g_tailDist short of the target. This gap
			// controls how wiggly the tail looks; it could be made a function of velocity.
			float tailGap = g_tailDist;
			step = toGoal - heading * tailGap;
		}

		// Only now convert back to the position's own storage format.
		self.position.setX(self.position.getX() + step.x);
		self.position.setY(self.position.getY() + step.y);

		// Chomp when a parentless node was within g_tailDist of its target
		// (distance measured before this frame's move).
		if (!self.attribs.hasParent && distance <= g_tailDist)
			Chomp(node);
	}
	EndCounter(&positionUpdate);

Cleanup:
	// Reached on normal completion and, presumably, from IFC on failure.
	if (g_numActiveNodes != 1)
		return hr;

	return EndgameInit();
}