void SlowVideo(char* Video,int BPP,int Width,int Height,int StrideX,int StrideY,int32_t* Result) { if (BPP == 16) { int64_t t0,t1,t; int nv,ns; int Mul,Len; int SystemLength = 512*1024; char* System = (char*) malloc( SystemLength ); //hopefully cpu cache will be smaller if (System) { TRY_BEGIN if (StrideX < 0) { StrideX = -StrideX; Video -= StrideX * (Width - 1); } if (StrideY < 0) { StrideY = -StrideY; Video -= StrideY * (Height - 1); } if (StrideX > StrideY) { Swap(&Width,&Height); Swap(&StrideX,&StrideY); } Len = (Width*BPP) >> 3; if (Len>32 && !IsBadWritePtr(Video,Len) && !IsBadReadPtr(Video,Len)) { int i; int Rows = SystemLength / Len; memset(System,0,Rows*Len); memcpy(System,Video,Len); BeginCounter(&t); t >>= 1; // 0.5 sec SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_HIGHEST); nv=0; GetCounter(&t0); t0 += t; do { for (i=0;i<Rows;++i) memcpy(Video,System,Len); nv++; GetCounter(&t1); } while (t1 < t0); ns=0; GetCounter(&t0); t0 += t; do { for (i=0;i<Rows;i++) memcpy(System+i*Len,System,Len); ns++; GetCounter(&t1); } while (t1 < t0); Mul = 6; if (nv*Mul >= ns) *Result = 0; // fast else *Result = 1; // slow (video memory or in general) SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_NORMAL); EndCounter(); #ifdef BENCH { tchar_t Msg[256]; stprintf_s(Msg,TSIZEOF(Msg),T("Video %d\nSystem %d\nResult %d"),nv,ns,*Result); MessageBox(NULL,Msg,T(""),MB_OK|MB_SETFOREGROUND); } #endif } TRY_END free(System); }
// Advances the simulation by one frame.
//
// While g_endgame is set the whole frame is delegated to EndgameUpdate().
// Otherwise three timed phases run in order:
//   1. binning           - childless nodes are recorded in g_slots
//   2. nearest neighbour - FindNearestNeighbor() is invoked for every node
//   3. position update   - every node steps toward its target; a parentless node
//                          within g_tailDist of its target triggers Chomp()
//
// deltaTime scales g_speed for parentless nodes.
// Returns hr as left by the Bin()/IFC calls (S_OK initially), or the result of
// EndgameInit() when g_numActiveNodes is exactly 1 at the end of the frame.
HRESULT Update(double deltaTime)
{
    HRESULT hr = S_OK;

    if (g_endgame)
        return EndgameUpdate(deltaTime);

    // TODO: Optimize for cache coherency.
    // Chains of nodes could be stored linearly in memory, which would make the update
    // loop over a chain very fast (most chains would probably fit in one cache line).
    // It would cost a lot of memory moves and extra complexity, and the frame is
    // already under 1ms on average, so it has not been done.

    // ---- Phase 1: sort into buckets ----
    BeginCounter(&binningCounter);
    {
        float areaPerNode   = static_cast<float>((g_width * g_height) / g_numActiveNodes);
        float binSidePixels = sqrt(areaPerNode); // conservative

        g_binNHeight = binSidePixels / g_height;
        g_binNWidth  = binSidePixels / g_width;

        // The +2 adds a boundary of bins around the outside.
        g_binCountX = static_cast<uint>(ceilf(1.0f / g_binNWidth))  + 2;
        g_binCountY = static_cast<uint>(ceilf(1.0f / g_binNHeight)) + 2;

        // Which split of the bin grid is refreshed this frame.
        uint splitCol = g_binUpdateIter % g_numBinSplits;
        uint splitRow = g_binUpdateIter / g_numBinSplits;

        // Each range is widened by one bin on both ends. The widened layer overlaps
        // the neighbouring quadrants; without it verts could only target verts
        // inside their own quadrant.
        g_binRangeX[0] = (g_binCountX * splitCol/g_numBinSplits) - 1;
        g_binRangeX[1] = (g_binCountX * (splitCol+1)/g_numBinSplits - 1) + 1;
        g_binRangeY[0] = (g_binCountY * splitRow/g_numBinSplits) - 1;
        g_binRangeY[1] = (g_binCountY * (splitRow+1)/g_numBinSplits - 1) + 1;

        // Slots available per bin for the covered range.
        g_binStride = g_numSlots / ((g_binRangeX[1] - g_binRangeX[0] + 1) * (g_binRangeY[1] - g_binRangeY[0] + 1));

        int binIndex;
        memset(g_slots, EMPTY_SLOT, sizeof(g_slots));

        for (uint node = 0; node < g_numNodes; ++node)
        {
            // Only bin the chompable tails.
            if (g_nodes[node].attribs.hasChild == true)
                continue;

            hr = Bin(g_nodes[node].position.getX(), g_nodes[node].position.getY(), &binIndex);
            if (FAILED(hr))
                continue; // bin is not backed by memory: this node cannot be a target this frame

            // Scan for the first empty slot of the bin.
            uint slot = 0;
            while (slot < g_binStride && g_slots[binIndex*g_binStride + slot] != EMPTY_SLOT)
                ++slot;

            // When the bin is already full the node is simply not targetable.
            if (slot < g_binStride)
                g_slots[binIndex*g_binStride + slot] = node;
        }

        g_binUpdateIter = (g_binUpdateIter+1) % (g_numBinSplits*g_numBinSplits);
    }
    EndCounter(&binningCounter);

    // ---- Phase 2: determine nearest neighbours ----
    BeginCounter(&nearestNeighborCounter);
    for (uint node = 0; node < g_numNodes; ++node)
    {
        IFC( FindNearestNeighbor(node) );
    }
    EndCounter(&nearestNeighborCounter);

    // ---- Phase 3: move every node toward its target ----
    BeginCounter(&positionUpdate);
    for (uint node = 0; node < g_numNodes; ++node)
    {
        // Memory reads are grouped up front to keep the access pattern tidy.
        Node& self = g_nodes[node];
        Node& goal = g_nodes[self.attribs.targetID];

        // Vector to the target. Positions are read through getX()/getY() and all
        // arithmetic is done in float for precision.
        float2 toTarget;
        toTarget.x = goal.position.getX() - self.position.getX();
        toTarget.y = goal.position.getY() - self.position.getY();

        float distance = toTarget.getLength();

        float2 heading = toTarget;
        if (distance != 0)
            heading = heading / distance;

        // Displacement for this frame.
        float2 step;
        if (!self.attribs.hasParent)
        {
            step = min(toTarget, heading * static_cast<float>(g_speed * deltaTime));
        }
        else
        {
            // Gap kept to the target; this controls wigglyness. Perhaps it should
            // be a function of velocity? (static is more wiggly)
            float tailGap = g_tailDist;
            step = toTarget - heading * tailGap;
        }

        // Only at the very end are the float results written back through the setters.
        self.position.setX(self.position.getX() + step.x);
        self.position.setY(self.position.getY() + step.y);

        // Check for chomps (uses the distance measured before the move).
        if (self.attribs.hasParent == false && distance <= g_tailDist)
            Chomp(node);
    }
    EndCounter(&positionUpdate);

Cleanup:
    if (g_numActiveNodes == 1)
        return EndgameInit();

    return hr;
}