//////////////////////////////////////////////////////////////////////////
/// @brief Publish the current draw for execution by the worker pool.
///
/// Bumps DrawEnqueued so workers can observe the new draw. When the
/// single-threaded knob is set, the calling thread performs the FE and BE
/// work inline (with denormals flushed to zero around the work, matching
/// the worker-thread float environment); otherwise all workers are woken.
/// Finally the current draw context is retired so that the next state
/// call forces a new draw context to be created and populated.
/// @param pContext - SWR context owning the draw-context ring.
void QueueDraw(SWR_CONTEXT *pContext)
{
    // Make sure all prior state writes are visible before the draw is published.
    _ReadWriteBarrier();

    pContext->DrawEnqueued++;

    if (!KNOB_SINGLE_THREADED)
    {
        // Multi-threaded: kick every worker to pick up the new draw.
        RDTSC_START(APIDrawWakeAllThreads);
        WakeAllThreads(pContext);
        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
    }
    else
    {
        // Single-threaded: execute the FE and BE work on this thread.
        // Save MXCSR, then force flush-to-zero / denormals-are-zero.
        uint32_t savedCsr = _mm_getcsr();
        _mm_setcsr(savedCsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

        std::unordered_set<uint32_t> lockedTiles;
        WorkOnFifoFE(pContext, 0, pContext->WorkerFE[0], 0);
        WorkOnFifoBE(pContext, 0, pContext->WorkerBE[0], lockedTiles);

        // Restore the caller's MXCSR state.
        _mm_setcsr(savedCsr);
    }

    // Retire the current draw context; the next state call forces a new
    // draw context to be created and populated.
    pContext->pPrevDrawContext = pContext->pCurDrawContext;
    pContext->pCurDrawContext = nullptr;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Worker thread entry point for the SWR thread pool.
///
/// Binds the thread to its assigned processor, names it for debugging,
/// then loops pulling front-end (FE) and back-end (BE) work off the draw
/// context ring until shutdown has been signaled and all remaining BE
/// work is drained. IsFEThread / IsBEThread select which kinds of work
/// this thread services; they are declared outside this view --
/// presumably compile-time template parameters, confirm in the header.
/// @param pData - THREAD_DATA describing this worker's ids and affinity.
/// @return 0 on clean shutdown.
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    // Pin this thread to its assigned core / processor group.
    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    {
        // Give the thread a topology-describing name for debuggers/profilers.
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
            "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
            // linux pthread name limited to 16 chars (including \0)
            "w%03d-n%d-c%03d-t%d",
#endif
            workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(threadId);

    // Only need offset numa index from base for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    TileSet lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    //    1- try to work on the FE any draw that is queued. For now there are no dependencies
    //       on the FE work, so any worker can grab any FE and process in parallel.  Eventually
    //       we'll need dependency tracking to force serialization on FEs.  The worker will try
    //       to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
    //       trying until he reaches the tail.
    //    2- BE work must be done in strict order. we accomplish this today by pulling work off
    //       the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //       any work left by comparing the total # of binned work items and the total # of completed
    //       work items. If they are equal, then there is no more work to do for this draw, and
    //       the worker can safely increment its oldestDraw counter and move on to the next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    // True while this worker's draw cursor is behind the ring head, i.e.
    // a draw it has not yet finished with is still queued.
    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        // Exit only after shutdown has been observed AND the remaining BE
        // work has been drained.
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

        // Spin briefly before sleeping to avoid condition-variable
        // overhead when new work arrives quickly.
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            // Sleep until the API thread signals that work was enqueued.
            // A spurious wakeup is harmless: the outer loop re-checks.
            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();
        }

        if (IsBEThread)
        {
            RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
            // WorkOnFifoBE returns true once the shutdown draw is processed.
            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            if (!IsBEThread)
            {
                // FE-only threads advance the BE cursor with the FE cursor so
                // the idle/shutdown checks above still track forward progress.
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Worker thread entry point (counter-based variant).
///
/// Binds the thread, then loops servicing BE, compute, and FE work for
/// this worker's per-thread draw counters (WorkerBE / WorkerFE) until the
/// pool's inThreadShutdown flag is raised. Idle detection compares this
/// worker's BE counter with the global DrawEnqueued count.
/// @param pData - THREAD_DATA describing this worker's ids and affinity.
/// @return 0 on shutdown.
DWORD workerThread(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    // Pin this thread to its assigned core / processor group.
    bindThread(threadId, pThreadData->procGroupId);

    RDTSC_INIT(threadId);

    int numaNode = (int)pThreadData->numaId;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    std::unordered_set<uint32_t> lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    //    1- try to work on the FE any draw that is queued. For now there are no dependencies
    //       on the FE work, so any worker can grab any FE and process in parallel.  Eventually
    //       we'll need dependency tracking to force serialization on FEs.  The worker will try
    //       to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
    //       trying until he reaches the tail.
    //    2- BE work must be done in strict order. we accomplish this today by pulling work off
    //       the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //       any work left by comparing the total # of binned work items and the total # of completed
    //       work items. If they are equal, then there is no more work to do for this draw, and
    //       the worker can safely increment its oldestDraw counter and move on to the next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    while (pContext->threadPool.inThreadShutdown == false)
    {
        // Spin briefly before sleeping: idle means this worker's BE counter
        // has caught up with the number of draws enqueued so far.
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && pContext->WorkerBE[workerId] == pContext->DrawEnqueued)
        {
            _mm_pause();
        }

        if (pContext->WorkerBE[workerId] == pContext->DrawEnqueued)
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (pContext->WorkerBE[workerId] != pContext->DrawEnqueued)
            {
                lock.unlock();
                continue;
            }

            // Don't go to sleep if shutdown was requested while we were
            // spinning; checked under the lock to pair with the notifier.
            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            // Sleep until the API thread signals new work (or shutdown).
            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            // Re-check shutdown after waking: the wakeup may have been the
            // shutdown broadcast rather than new work.
            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        RDTSC_START(WorkerWorkOnFifoBE);
        WorkOnFifoBE(pContext, workerId, pContext->WorkerBE[workerId], lockedTiles);
        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

        WorkOnCompute(pContext, workerId, pContext->WorkerBE[workerId]);

        WorkOnFifoFE(pContext, workerId, pContext->WorkerFE[workerId], numaNode);
    }

    return 0;
}