////////////////////////////////////////////////////////////////////////// /// @brief Create SWR Context. /// @param pCreateInfo - pointer to creation info. HANDLE SwrCreateContext( const SWR_CREATECONTEXT_INFO* pCreateInfo) { RDTSC_RESET(); RDTSC_INIT(0); void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); memset(pContextMem, 0, sizeof(SWR_CONTEXT)); SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); pContext->driverType = pCreateInfo->driver; pContext->privateStateSize = pCreateInfo->privateStateSize; pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { pContext->dcRing[dc].arena.Init(); pContext->dcRing[dc].inUse = false; pContext->dcRing[dc].pTileMgr = new MacroTileMgr(); pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. pContext->dsRing[dc].arena.Init(); } if (!KNOB_SINGLE_THREADED) { memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); new (&pContext->WaitLock) std::mutex(); new (&pContext->FifosNotEmpty) std::condition_variable(); CreateThreadPool(pContext, &pContext->threadPool); } // Calling createThreadPool() above can set SINGLE_THREADED if (KNOB_SINGLE_THREADED) { pContext->NumWorkerThreads = 1; } // Allocate scratch space for workers. ///@note We could lazily allocate this but its rather small amount of memory. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { ///@todo Use numa API for allocations using numa information from thread data (if exists). 
pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); } pContext->LastRetiredId = 0; pContext->nextDrawId = 1; // workers start at draw 1 for (uint32_t i = 0; i < KNOB_MAX_NUM_THREADS; ++i) { pContext->WorkerFE[i] = 1; pContext->WorkerBE[i] = 1; } pContext->DrawEnqueued = 1; // State setup AFTER context is fully initialized SetupDefaultState(pContext); // initialize hot tile manager pContext->pHotTileMgr = new HotTileMgr(); // initialize function pointer tables InitClearTilesTable(); // initialize store tiles function pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; pContext->pfnClearTile = pCreateInfo->pfnClearTile; return (HANDLE)pContext; }
//////////////////////////////////////////////////////////////////////////
/// @brief Worker thread entry point. Binds and names the thread, then loops
///        over queued draws doing back-end, compute and front-end work
///        until WorkOnFifoBE has reported shutdown and the thread has
///        caught up with the head of the draw ring.
/// @param pData - pointer to this worker's THREAD_DATA.
/// @return Always 0.
/// @note IsFEThread / IsBEThread are not declared inside this function;
///       presumably they are compile-time flags provided by a declaration
///       outside this view (e.g. a template header) - TODO confirm.
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA* pThreadData = static_cast<THREAD_DATA*>(pData);
    SWR_CONTEXT* pContext    = pThreadData->pContext;
    uint32_t     threadId    = pThreadData->threadId;
    uint32_t     workerId    = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
                  workerId,
                  pThreadData->numaId,
                  pThreadData->coreId,
                  pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(threadId);

    // Only need offset numa index from base for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    TileSet lockedTiles;

    // Each worker may work on any of the queued draws as long as certain conditions are met.
    // The data associated with a draw is guaranteed to stay alive until the worker signals that
    // it has moved on to the next draw, which it does once it determines there is no more work
    // to do. The api thread will not increment the head of the dc ring until all workers have
    // moved past the current head.
    //
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel. Eventually
    //    we'll need dependency tracking to force serialization on FEs. The worker tries to
    //    pick an FE by atomically incrementing a counter in the swr context, and keeps
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. The worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of
    //    completed work items. If they are equal, then there is no more work to do for this
    //    draw, and the worker can safely increment its oldestDraw counter and move on to the
    //    next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    // True while the given draw index has not yet reached the head of the draw ring.
    auto hasPendingDraw = [&](uint32_t drawIndex) {
        return drawIndex != pContext->dcRing.GetHead();
    };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    for (;;)
    {
        // Leave only once shutdown was requested AND nothing is left for this thread.
        if (bShutdown && !hasPendingDraw(curDrawBE))
        {
            break;
        }

        // Spin briefly before falling back to the condition variable.
        for (uint32_t spin = 0;
             spin < KNOB_WORKER_SPIN_LOOP_COUNT && !hasPendingDraw(curDrawBE);
             ++spin)
        {
            _mm_pause();
        }

        if (!hasPendingDraw(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (hasPendingDraw(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();
        }

        if (IsBEThread)
        {
            RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            // An FE-only thread has no BE progress of its own, so its BE cursor
            // simply follows the FE cursor.
            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Worker thread entry point. Binds the thread, then repeatedly
///        services back-end, compute and front-end work for this worker
///        until the thread pool's inThreadShutdown flag is observed.
/// @param pData - pointer to this worker's THREAD_DATA.
/// @return Always 0.
DWORD workerThread(LPVOID pData)
{
    THREAD_DATA* pThreadData = static_cast<THREAD_DATA*>(pData);
    SWR_CONTEXT* pContext    = pThreadData->pContext;
    uint32_t     threadId    = pThreadData->threadId;
    uint32_t     workerId    = pThreadData->workerId;

    bindThread(threadId, pThreadData->procGroupId);

    RDTSC_INIT(threadId);

    int numaNode = (int)pThreadData->numaId;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    std::unordered_set<uint32_t> lockedTiles;

    // Each worker may work on any of the queued draws as long as certain conditions are met.
    // The data associated with a draw is guaranteed to stay alive until the worker signals that
    // it has moved on to the next draw, which it does once it determines there is no more work
    // to do. The api thread will not increment the head of the dc ring until all workers have
    // moved past the current head.
    //
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel. Eventually
    //    we'll need dependency tracking to force serialization on FEs. The worker tries to
    //    pick an FE by atomically incrementing a counter in the swr context, and keeps
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. The worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of
    //    completed work items. If they are equal, then there is no more work to do for this
    //    draw, and the worker can safely increment its oldestDraw counter and move on to the
    //    next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    // Aliases for this worker's slots; bound by reference so every use below
    // still reads (and passes on) the live context storage.
    auto& workerDrawBE = pContext->WorkerBE[workerId];
    auto& workerDrawFE = pContext->WorkerFE[workerId];

    // The worker is idle when its BE cursor has caught up with DrawEnqueued.
    auto caughtUp = [&]() {
        return workerDrawBE == pContext->DrawEnqueued;
    };

    while (!pContext->threadPool.inThreadShutdown)
    {
        // Spin briefly before falling back to the condition variable.
        for (uint32_t spin = 0;
             spin < KNOB_WORKER_SPIN_LOOP_COUNT && caughtUp();
             ++spin)
        {
            _mm_pause();
        }

        if (caughtUp())
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (!caughtUp())
            {
                lock.unlock();
                continue;
            }

            // Shutdown is checked once before blocking...
            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            // ...and once more after waking up.
            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        RDTSC_START(WorkerWorkOnFifoBE);
        WorkOnFifoBE(pContext, workerId, workerDrawBE, lockedTiles);
        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

        WorkOnCompute(pContext, workerId, workerDrawBE);

        WorkOnFifoFE(pContext, workerId, workerDrawFE, numaNode);
    }

    return 0;
}