void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    static std::mutex m;
    std::lock_guard<std::mutex> l(m);

    static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
    DWORD bufSize = sizeof(buffer);

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / buffer->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // clear mask
                gmask.Mask &= ~(KAFFINITY(1) << threadId);

                // Find Numa Node
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                uint32_t numaId = 0;
                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
                auto& numaNode = out_nodes[numaId];

                uint32_t coreId = 0;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
#if !defined(_WIN64)
                    coreId = (uint32_t)numaNode.cores.size();
                    if ((coreId * numThreads) >= 32)
                    {
                        // Windows doesn't return threadIds >= 32 for a processor group correctly
                        // when running a 32-bit application.
                        // Just save -1 as the threadId
                        threadId = uint32_t(-1);
                    }
#endif
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t threadId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t numaId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            if (threadId != uint32_t(-1))
            {
                // Save information.
                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
                auto& numaNode = out_nodes[numaId];
                if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(threadId);

                out_numThreadsPerProcGroup++;
            }

            auto data_start = line.find(": ") + 2;
            threadId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            numaId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
    }

    if (threadId != uint32_t(-1))
    {
        // Save information for the last processor record in the file.
        if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
        auto& numaNode = out_nodes[numaId];
        if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
        auto& core = numaNode.cores[coreId];

        core.procGroup = coreId;
        core.threadIds.push_back(threadId);

        out_numThreadsPerProcGroup++;
    }

    // Prune cores that ended up with no threads.
    for (uint32_t node = 0; node < out_nodes.size(); node++)
    {
        auto& numaNode = out_nodes[node];
        auto it = numaNode.cores.begin();
        for ( ; it != numaNode.cores.end(); )
        {
            if (it->threadIds.size() == 0)
            {
                it = numaNode.cores.erase(it);
            }
            else
            {
                ++it;
            }
        }
    }

#else

#error Unsupported platform

#endif
}
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex m;
    std::lock_guard<std::mutex> l(m);

    DWORD bufSize = 0;

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // clear mask
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;

                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask.  This means that we are in 32-bit mode and
                    // have seen more than 32 HW threads for this procGroup
                    // Don't use it
#if defined(_WIN64)
                    SWR_INVALID("Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);

                // Find Numa Node
                uint32_t numaId = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId)
                {
                    out_nodes.resize(numaId + 1);
                }
                auto& numaNode = out_nodes[numaId];
                numaNode.numaId = numaId;

                uint32_t coreId = 0;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);

#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t procId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t physId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            procId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            physId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.length() == 0)
        {
            // A blank line ends one processor record; commit the parsed IDs.
            if (physId + 1 > out_nodes.size())
            {
                out_nodes.resize(physId + 1);
            }
            auto& numaNode = out_nodes[physId];
            numaNode.numaId = physId;

            if (coreId + 1 > numaNode.cores.size())
            {
                numaNode.cores.resize(coreId + 1);
            }
            auto& core = numaNode.cores[coreId];
            core.procGroup = coreId;
            core.threadIds.push_back(procId);
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#elif defined(__APPLE__)

    auto numProcessors  = 0;
    auto numCores       = 0;
    auto numPhysicalIds = 0;

    int value;
    size_t size = sizeof(value);

    int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numPhysicalIds = value;

    result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numProcessors = value;

    result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numCores = value;

    out_nodes.resize(numPhysicalIds);

    for (auto physId = 0; physId < numPhysicalIds; ++physId)
    {
        // Distribute the logical processors round-robin across the reported cores.
        auto& numaNode = out_nodes[physId];
        auto procId = 0;

        numaNode.cores.resize(numCores);

        while (procId < numProcessors)
        {
            for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
            {
                auto& core = numaNode.cores[coreId];
                core.procGroup = coreId;
                core.threadIds.push_back(procId);
            }
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#else

#error Unsupported platform

#endif

    // Prune empty cores and numa nodes
    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
    {
        // Erase empty cores (first)
        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
        {
            if (core_it->threadIds.size() == 0)
            {
                core_it = node_it->cores.erase(core_it);
            }
            else
            {
                ++core_it;
            }
        }

        // Erase empty numa nodes (second)
        if (node_it->cores.size() == 0)
        {
            node_it = out_nodes.erase(node_it);
        }
        else
        {
            ++node_it;
        }
    }
}
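A minimal usage sketch follows (an addition, not part of the original sources): it assumes CPUNumaNodes is the std::vector-like container populated above, with the numaId, cores, procGroup and threadIds members the function writes, and that printf from <cstdio> is available; the helper name PrintTopology is hypothetical.

// Hypothetical helper, for illustration only: walks the topology produced by
// CalculateProcessorTopology and prints one line per NUMA node and core.
static void PrintTopology()
{
    CPUNumaNodes nodes;
    uint32_t     numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);

    for (auto& node : nodes)
    {
        printf("numa %u\n", (unsigned)node.numaId);
        for (auto& core : node.cores)
        {
            printf("  core (procGroup %u):", (unsigned)core.procGroup);
            for (auto tid : core.threadIds)
            {
                // Each entry is the logical-processor (HW thread) id on that core.
                printf(" %u", (unsigned)tid);
            }
            printf("\n");
        }
    }
    printf("threads per proc group: %u\n", (unsigned)numThreadsPerProcGroup);
}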
HANDLE tMPI_Thread_create_NUMA(LPSECURITY_ATTRIBUTES lpThreadAttributes,
                               SIZE_T dwStackSize,
                               LPTHREAD_START_ROUTINE lpStartAddress,
                               LPVOID lpParameter,
                               DWORD dwCreationFlags,
                               LPDWORD lpThreadId)
{
    LPPROC_THREAD_ATTRIBUTE_LIST pAttributeList = NULL;
    HANDLE hThread = NULL;
    SIZE_T cbAttributeList = 0;
    GROUP_AFFINITY GroupAffinity;
    PROCESSOR_NUMBER IdealProcessorNumber;
    ULONG CurrentProcessorIndex;

    /* for each thread created, round-robin through the set of valid
       processors and affinity masks.  the assumption is that callers of
       tMPI_Thread_create_NUMA are creating threads that saturate a given
       processor.  for cases where threads are being created that rarely do
       work, standard thread creation (eg: CreateThread) should be invoked
       instead. */
    CurrentProcessorIndex = (ULONG)InterlockedIncrement((volatile LONG *)&g_ulThreadIndex);
    CurrentProcessorIndex = CurrentProcessorIndex % g_ulTotalProcessors;

    /* group, mask. */
    memcpy(&GroupAffinity,
           &(g_MPI_ProcessorInfo[CurrentProcessorIndex].GroupAffinity),
           sizeof(GROUP_AFFINITY));

    /* group, processor number */
    memcpy(&IdealProcessorNumber,
           &(g_MPI_ProcessorInfo[CurrentProcessorIndex].ProcessorNumber),
           sizeof(PROCESSOR_NUMBER));

    /* determine size of allocation for AttributeList */
    if (!func_InitializeProcThreadAttributeList(pAttributeList, 2, 0, &cbAttributeList))
    {
        DWORD dwLastError = GetLastError();
        if (dwLastError != ERROR_INSUFFICIENT_BUFFER)
        {
            tMPI_Fatal_error(TMPI_FARGS,
                             "InitializeProcThreadAttributeList, error code=%d",
                             dwLastError);
            goto cleanup;
        }
    }

    pAttributeList = (LPPROC_THREAD_ATTRIBUTE_LIST)tMPI_Malloc(cbAttributeList);
    if (pAttributeList == NULL)
    {
        tMPI_Fatal_error(TMPI_FARGS, "Failed to allocate pAttributeList");
        goto cleanup;
    }

    memset(pAttributeList, 0, cbAttributeList);

    if (!func_InitializeProcThreadAttributeList(pAttributeList, 2, 0, &cbAttributeList))
    {
        tMPI_Fatal_error(TMPI_FARGS,
                         "InitializeProcThreadAttributeList, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    if (!func_UpdateProcThreadAttribute(pAttributeList,
                                        0,
                                        PROC_THREAD_ATTRIBUTE_GROUP_AFFINITY,
                                        &GroupAffinity,
                                        sizeof(GroupAffinity),
                                        NULL,
                                        NULL))
    {
        tMPI_Fatal_error(TMPI_FARGS, "UpdateProcThreadAttribute, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    if (!func_UpdateProcThreadAttribute(pAttributeList,
                                        0,
                                        PROC_THREAD_ATTRIBUTE_IDEAL_PROCESSOR,
                                        &IdealProcessorNumber,
                                        sizeof(IdealProcessorNumber),
                                        NULL,
                                        NULL))
    {
        tMPI_Fatal_error(TMPI_FARGS, "UpdateProcThreadAttribute, error code=%d",
                         GetLastError());
        goto cleanup;
    }

    hThread = func_CreateRemoteThreadEx(GetCurrentProcess(),
                                        lpThreadAttributes,
                                        dwStackSize,
                                        lpStartAddress,
                                        lpParameter,
                                        dwCreationFlags,
                                        pAttributeList,
                                        lpThreadId);

    func_DeleteProcThreadAttributeList(pAttributeList);

#if 0 /* TODO: debug only or DISCARD */
    if (hThread)
    {
        PROCESSOR_NUMBER ProcNumber;
        USHORT           NodeNumber;

        GetThreadIdealProcessorEx(hThread, &ProcNumber);
        GetNumaProcessorNodeEx(&ProcNumber, &NodeNumber);

        printf("started thread tid=%lu group=%lu mask=0x%I64x number=%lu numanode=%lu\n",
               *lpThreadId,
               GroupAffinity.Group,
               (ULONGLONG)GroupAffinity.Mask,
               ProcNumber.Number,
               NodeNumber);
    }
#endif

cleanup:

    if (pAttributeList)
    {
        tMPI_Free(pAttributeList);
    }

    return hThread;
}
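A minimal caller sketch follows (an addition, not part of the original source): tMPI_Thread_create_NUMA takes the same parameters as CreateThread, so a compute-bound worker can be started with it directly once the NUMA bookkeeping (g_MPI_ProcessorInfo, g_ulTotalProcessors, the func_* pointers) has been initialized; worker_main and start_numa_worker are hypothetical names.

/* Hypothetical caller, for illustration only; the worker does no real work. */
static DWORD WINAPI worker_main(LPVOID lpParameter)
{
    /* ... compute-bound work that should stay near its assigned NUMA node ... */
    (void)lpParameter;
    return 0;
}

static void start_numa_worker(void)
{
    DWORD  dwThreadId = 0;
    HANDLE hThread    = tMPI_Thread_create_NUMA(NULL,        /* default security      */
                                                0,           /* default stack size    */
                                                worker_main,
                                                NULL,        /* no thread argument    */
                                                0,           /* run immediately       */
                                                &dwThreadId);
    if (hThread != NULL)
    {
        WaitForSingleObject(hThread, INFINITE);
        CloseHandle(hThread);
    }
}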