void bindThread(SWR_CONTEXT* pContext,
                uint32_t     threadId,
                uint32_t     procGroupId   = 0,
                bool         bindProcGroup = false)
{
    // Only bind threads when MAX_WORKER_THREADS isn't set.
    if (pContext->threadInfo.SINGLE_THREADED ||
        (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
    {
        return;
    }

#if defined(_WIN32)

    GROUP_AFFINITY affinity = {};
    affinity.Group          = procGroupId;

#if !defined(_WIN64)
    if (threadId >= 32)
    {
        // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
        SWR_INVALID("Shouldn't get here");

        // In a 32-bit process on Windows it is impossible to bind
        // to logical processors 32-63 within a processor group.
        // In this case set the mask to 0 and let the system assign
        // the processor.  Hopefully it will make smart choices.
        affinity.Mask = 0;
    }
    else
#endif
    {
        // If MAX_WORKER_THREADS is set, only bind to the proc group,
        // not the individual HW thread.
        if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
        {
            affinity.Mask = KAFFINITY(1) << threadId;
        }
        else
        {
            affinity.Mask = KAFFINITY(0);
        }
    }

    if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
    {
        SWR_INVALID("Failed to set Thread Affinity");
    }

#elif defined(__linux__) || defined(__gnu_linux__)

    cpu_set_t cpuset;
    pthread_t thread = pthread_self();
    CPU_ZERO(&cpuset);
    CPU_SET(threadId, &cpuset);

    int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (err != 0)
    {
        fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
    }

#endif
}
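// Hypothetical usage sketch (illustration only, not part of the driver): a minimal worker entry
// point that pins itself before entering its work loop.  workerMainSketch and its parameter list
// are assumptions; only bindThread and SWR_CONTEXT come from the code above.
static void workerMainSketch(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId)
{
    // Pin this worker to its HW thread (or only to its processor group when
    // MAX_WORKER_THREADS is in use and bindProcGroup is passed as true).
    bindThread(pContext, threadId, procGroupId, false);

    // ... per-thread initialization and the work loop would follow here ...
}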
void ProcessStoreTileBE(DRAW_CONTEXT*               pDC,
                        uint32_t                    workerId,
                        uint32_t                    macroTile,
                        STORE_TILES_DESC*           pDesc,
                        SWR_RENDERTARGET_ATTACHMENT attachment)
{
    SWR_CONTEXT* pContext = pDC->pContext;

    AR_BEGIN(BEStoreTiles, pDC->drawId);

    SWR_FORMAT srcFormat;
    switch (attachment)
    {
    case SWR_ATTACHMENT_COLOR0:
    case SWR_ATTACHMENT_COLOR1:
    case SWR_ATTACHMENT_COLOR2:
    case SWR_ATTACHMENT_COLOR3:
    case SWR_ATTACHMENT_COLOR4:
    case SWR_ATTACHMENT_COLOR5:
    case SWR_ATTACHMENT_COLOR6:
    case SWR_ATTACHMENT_COLOR7:
        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_DEPTH:
        srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_STENCIL:
        srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
        break;
    default:
        SWR_INVALID("Unknown attachment: %d", attachment);
        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
        break;
    }

    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroTile, x, y);

    // Only need to store the hottile if it's been rendered to...
    HOTTILE* pHotTile =
        pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
    if (pHotTile)
    {
        // Clear if a clear is pending (i.e., not rendered to), then mark as dirty for store.
        if (pHotTile->state == HOTTILE_CLEAR)
        {
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          attachment,
                          macroTile,
                          pHotTile->renderTargetArrayIndex,
                          pHotTile->clearData,
                          pDesc->rect);
        }

        if (pHotTile->state == HOTTILE_DIRTY ||
            pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
        {
            int32_t destX = KNOB_MACROTILE_X_DIM * x;
            int32_t destY = KNOB_MACROTILE_Y_DIM * y;

            pContext->pfnStoreTile(GetPrivateState(pDC),
                                   srcFormat,
                                   attachment,
                                   destX,
                                   destY,
                                   pHotTile->renderTargetArrayIndex,
                                   pHotTile->pBuffer);
        }

        if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
        {
            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
                  pHotTile->state == HOTTILE_RESOLVED))
            {
                pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
            }
        }
    }

    AR_END(BEStoreTiles, 1);
}
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex           m;
    std::lock_guard<std::mutex> l(m);

    DWORD bufSize = 0;

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t                                  count   = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX  pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto&    gmask     = pBuffer->Processor.GroupMask[g];
            uint32_t threadId  = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // Clear mask
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;

                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask.  This means that we are in 32-bit mode and
                    // have seen more than 32 HW threads for this procGroup.
                    // Don't use it.
#if defined(_WIN64)
                    SWR_INVALID("Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);

                // Find Numa Node
                uint32_t         numaId  = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group            = WORD(procGroup);
                procNum.Number           = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId)
                {
                    out_nodes.resize(numaId + 1);
                }
                auto& numaNode  = out_nodes[numaId];
                numaNode.numaId = numaId;

                uint32_t coreId = 0;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore            = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);

#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string   line;
    char*         c;
    uint32_t      procId = uint32_t(-1);
    uint32_t      coreId = uint32_t(-1);
    uint32_t      physId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            procId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            physId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.length() == 0)
        {
            if (physId + 1 > out_nodes.size())
                out_nodes.resize(physId + 1);
            auto& numaNode  = out_nodes[physId];
            numaNode.numaId = physId;

            if (coreId + 1 > numaNode.cores.size())
                numaNode.cores.resize(coreId + 1);
            auto& core     = numaNode.cores[coreId];
            core.procGroup = coreId;
            core.threadIds.push_back(procId);
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#elif defined(__APPLE__)

    auto numProcessors  = 0;
    auto numCores       = 0;
    auto numPhysicalIds = 0;

    int    value;
    size_t size = sizeof(value);

    int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numPhysicalIds = value;

    result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numProcessors = value;

    result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numCores = value;

    out_nodes.resize(numPhysicalIds);

    for (auto physId = 0; physId < numPhysicalIds; ++physId)
    {
        auto& numaNode = out_nodes[physId];
        auto  procId   = 0;

        numaNode.cores.resize(numCores);

        while (procId < numProcessors)
        {
            for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
            {
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(procId);
            }
        }
    }

    out_numThreadsPerProcGroup = 0;

    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#else

#error Unsupported platform

#endif

    // Prune empty cores and numa nodes
    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
    {
        // Erase empty cores (first)
        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
        {
            if (core_it->threadIds.size() == 0)
            {
                core_it = node_it->cores.erase(core_it);
            }
            else
            {
                ++core_it;
            }
        }

        // Erase empty numa nodes (second)
        if (node_it->cores.size() == 0)
        {
            node_it = out_nodes.erase(node_it);
        }
        else
        {
            ++node_it;
        }
    }
}
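// Hypothetical usage sketch (illustration only, not part of the driver): walking the topology
// produced by CalculateProcessorTopology to report cores and HW threads per NUMA node.  Only the
// CPUNumaNodes / Core fields referenced above (numaId, cores, procGroup, threadIds) are assumed.
static void dumpTopologySketch()
{
    CPUNumaNodes nodes;
    uint32_t     numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);

    for (auto& node : nodes)
    {
        uint32_t hwThreads = 0;
        for (auto& core : node.cores)
        {
            hwThreads += (uint32_t)core.threadIds.size();
        }
        fprintf(stderr,
                "NUMA node %u: %u cores, %u HW threads\n",
                node.numaId,
                (uint32_t)node.cores.size(),
                hwThreads);
    }
}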
HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT*                pContext,
                                DRAW_CONTEXT*               pDC,
                                HANDLE                      hWorkerPrivateData,
                                uint32_t                    macroID,
                                SWR_RENDERTARGET_ATTACHMENT attachment,
                                bool                        create,
                                uint32_t                    numSamples,
                                uint32_t                    renderTargetArrayIndex)
{
    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroID, x, y);

    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);

    HotTileSet& tile    = mHotTiles[x][y];
    HOTTILE&    hotTile = tile.Attachment[attachment];
    if (hotTile.pBuffer == NULL)
    {
        if (create)
        {
            uint32_t size     = numSamples * mHotTileSize[attachment];
            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
            hotTile.pBuffer   = (uint8_t*)AllocHotTileMem(
                size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
            hotTile.state                  = HOTTILE_INVALID;
            hotTile.numSamples             = numSamples;
            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
        }
        else
        {
            return NULL;
        }
    }
    else
    {
        // Free the old tile and create a new one with enough space to hold all samples
        if (numSamples > hotTile.numSamples)
        {
            // Tile should be either uninitialized or resolved if we're deleting and switching to
            // a new sample count
            SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) ||
                       (hotTile.state == HOTTILE_CLEAR));
            FreeHotTileMem(hotTile.pBuffer);

            uint32_t size     = numSamples * mHotTileSize[attachment];
            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
            hotTile.pBuffer   = (uint8_t*)AllocHotTileMem(
                size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
            hotTile.state      = HOTTILE_INVALID;
            hotTile.numSamples = numSamples;
        }

        // If the requested render target array index isn't currently loaded, need to store out
        // the current hottile and load the requested array slice
        if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
        {
            SWR_FORMAT format;
            switch (attachment)
            {
            case SWR_ATTACHMENT_COLOR0:
            case SWR_ATTACHMENT_COLOR1:
            case SWR_ATTACHMENT_COLOR2:
            case SWR_ATTACHMENT_COLOR3:
            case SWR_ATTACHMENT_COLOR4:
            case SWR_ATTACHMENT_COLOR5:
            case SWR_ATTACHMENT_COLOR6:
            case SWR_ATTACHMENT_COLOR7:
                format = KNOB_COLOR_HOT_TILE_FORMAT;
                break;
            case SWR_ATTACHMENT_DEPTH:
                format = KNOB_DEPTH_HOT_TILE_FORMAT;
                break;
            case SWR_ATTACHMENT_STENCIL:
                format = KNOB_STENCIL_HOT_TILE_FORMAT;
                break;
            default:
                SWR_INVALID("Unknown attachment: %d", attachment);
                format = KNOB_COLOR_HOT_TILE_FORMAT;
                break;
            }

            if (hotTile.state == HOTTILE_CLEAR)
            {
                if (attachment == SWR_ATTACHMENT_STENCIL)
                    ClearStencilHotTile(&hotTile);
                else if (attachment == SWR_ATTACHMENT_DEPTH)
                    ClearDepthHotTile(&hotTile);
                else
                    ClearColorHotTile(&hotTile);

                hotTile.state = HOTTILE_DIRTY;
            }

            if (hotTile.state == HOTTILE_DIRTY)
            {
                pContext->pfnStoreTile(GetPrivateState(pDC),
                                       hWorkerPrivateData,
                                       format,
                                       attachment,
                                       x * KNOB_MACROTILE_X_DIM,
                                       y * KNOB_MACROTILE_Y_DIM,
                                       hotTile.renderTargetArrayIndex,
                                       hotTile.pBuffer);
            }

            pContext->pfnLoadTile(GetPrivateState(pDC),
                                  hWorkerPrivateData,
                                  format,
                                  attachment,
                                  x * KNOB_MACROTILE_X_DIM,
                                  y * KNOB_MACROTILE_Y_DIM,
                                  renderTargetArrayIndex,
                                  hotTile.pBuffer);

            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
            hotTile.state                  = HOTTILE_DIRTY;
        }
    }
    return &tile.Attachment[attachment];
}
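// Hypothetical usage sketch (illustration only, not part of the driver): fetching the COLOR0 hot
// tile for a macrotile before rendering into it.  getColorHotTileSketch is an assumed helper;
// pDC, hWorkerPrivateData, macroTile and numSamples would come from the surrounding backend code.
static HOTTILE* getColorHotTileSketch(SWR_CONTEXT*  pContext,
                                      DRAW_CONTEXT* pDC,
                                      HANDLE        hWorkerPrivateData,
                                      uint32_t      macroTile,
                                      uint32_t      numSamples)
{
    // Request (and create, if necessary) the COLOR0 hot tile for array slice 0.
    return pContext->pHotTileMgr->GetHotTile(pContext,
                                             pDC,
                                             hWorkerPrivateData,
                                             macroTile,
                                             SWR_ATTACHMENT_COLOR0,
                                             true, // create if it doesn't exist yet
                                             numSamples,
                                             0);   // renderTargetArrayIndex
}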