void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex m;
    std::lock_guard<std::mutex> l(m);

    // Query the required buffer size first, then fetch the full topology.
    DWORD bufSize = 0;

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // Clear this thread's bit from the mask
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;

                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask.  This means that we are in 32-bit mode
                    // and have seen more than 32 HW threads for this procGroup.
                    // Don't use it.
#if defined(_WIN64)
                    SWR_INVALID("Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= threadMask;

                // Find NUMA node
                uint32_t numaId = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId)
                {
                    out_nodes.resize(numaId + 1);
                }
                auto& numaNode = out_nodes[numaId];
                numaNode.numaId = numaId;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);

#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t procId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t physId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            procId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            physId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }

        // A blank line ends one processor record; commit the parsed ids.
        if (line.length() == 0)
        {
            if (physId + 1 > out_nodes.size())
                out_nodes.resize(physId + 1);
            auto& numaNode = out_nodes[physId];
            numaNode.numaId = physId;

            if (coreId + 1 > numaNode.cores.size())
                numaNode.cores.resize(coreId + 1);
            auto& core = numaNode.cores[coreId];
            core.procGroup = coreId;
            core.threadIds.push_back(procId);
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#elif defined(__APPLE__)

    // macOS doesn't expose a full thread->core->package mapping, so distribute
    // the logical CPUs evenly across the reported packages and physical cores.
    auto numProcessors = 0;
    auto numCores = 0;
    auto numPhysicalIds = 0;

    int value;
    size_t size = sizeof(value);

    int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numPhysicalIds = value;

    result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numProcessors = value;

    result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numCores = value;

    out_nodes.resize(numPhysicalIds);

    for (auto physId = 0; physId < numPhysicalIds; ++physId)
    {
        auto& numaNode = out_nodes[physId];
        auto procId = 0;

        numaNode.cores.resize(numCores);

        while (procId < numProcessors)
        {
            for (auto coreId = 0; coreId < numCores; ++coreId, ++procId)
            {
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(procId);
            }
        }
    }

    out_numThreadsPerProcGroup = 0;

    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#else

#error Unsupported platform

#endif

    // Prune empty cores and numa nodes
    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
    {
        // Erase empty cores (first)
        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
        {
            if (core_it->threadIds.size() == 0)
            {
                core_it = node_it->cores.erase(core_it);
            }
            else
            {
                ++core_it;
            }
        }

        // Erase empty numa nodes (second)
        if (node_it->cores.size() == 0)
        {
            node_it = out_nodes.erase(node_it);
        }
        else
        {
            ++node_it;
        }
    }
}
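// Illustrative sketch (not part of the original source): a debug helper that
// walks the CPUNumaNodes structure built above and prints it, which is handy
// when validating the per-platform paths on a new machine. It assumes only the
// numaId/procGroup/threadIds fields used in this file, and is compiled out by
// default.
#if 0
static void DumpProcessorTopology(const CPUNumaNodes& nodes)
{
    for (auto& node : nodes)
    {
        printf("NUMA node %u: %u core(s)\n", node.numaId, (uint32_t)node.cores.size());
        for (auto& core : node.cores)
        {
            printf("  core (procGroup %u) HW threads:", core.procGroup);
            for (uint32_t tid : core.threadIds)
            {
                printf(" %u", tid);
            }
            printf("\n");
        }
    }
}
#endif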
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    // Bind application thread to HW thread 0
    bindThread(0);

    CPUNumaNodes nodes;
    uint32_t numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);

    uint32_t numHWNodes        = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();

    uint32_t numNodes        = numHWNodes;
    uint32_t numCoresPerNode = numHWCoresPerNode;
    uint32_t numHyperThreads = numHWHyperThreads;

    if (KNOB_MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
    }

    if (KNOB_MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
    }

    if (KNOB_MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
    }

    // Calculate numThreads
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;

    if (numThreads > KNOB_MAX_NUM_THREADS)
    {
        printf("WARNING: system thread count %u exceeds max %u, "
               "performance will be degraded\n",
               numThreads, KNOB_MAX_NUM_THREADS);
    }

    if (numThreads == 1)
    {
        // If only 1 worker thread, try to move it to an available
        // HW thread.  If that fails, use the API thread.
        if (numCoresPerNode < numHWCoresPerNode)
        {
            numCoresPerNode++;
        }
        else if (numHyperThreads < numHWHyperThreads)
        {
            numHyperThreads++;
        }
        else if (numNodes < numHWNodes)
        {
            numNodes++;
        }
        else
        {
            pPool->numThreads = 0;
            SET_KNOB(SINGLE_THREADED, true);
            return;
        }
    }
    else
    {
        // Save a HW thread for the API thread.
        numThreads--;
    }

    pPool->numThreads = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->inThreadShutdown = false;
    pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));

    uint32_t workerId = 0;
    for (uint32_t n = 0; n < numNodes; ++n)
    {
        auto& node = nodes[n];

        uint32_t numCores = numCoresPerNode;
        for (uint32_t c = 0; c < numCores; ++c)
        {
            auto& core = node.cores[c];
            for (uint32_t t = 0; t < numHyperThreads; ++t)
            {
                if (c == 0 && n == 0 && t == 0)
                {
                    // Skip core 0, thread 0 on node 0 to reserve for API thread
                    continue;
                }

                pPool->pThreadData[workerId].workerId = workerId;
                pPool->pThreadData[workerId].procGroupId = core.procGroup;
                pPool->pThreadData[workerId].threadId = core.threadIds[t];
                pPool->pThreadData[workerId].numaId = n;
                pPool->pThreadData[workerId].pContext = pContext;
                pPool->threads[workerId] = new std::thread(workerThread, &pPool->pThreadData[workerId]);

                ++workerId;
            }
        }
    }
}
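// Worked example (numbers assumed purely for illustration): on a 2-node system
// with 8 cores per node and 2 HW threads per core, and no KNOB_MAX_* limits in
// effect, the pool creates
//     numThreads = 2 * 8 * 2 - 1 = 31
// worker threads; the one subtracted HW thread (node 0, core 0, thread 0) is
// the slot reserved for the API thread bound by bindThread(0) above.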