void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex m;
    std::lock_guard<std::mutex> l(m);

    // Query the required buffer size first, then fetch the full topology.
    DWORD bufSize = 0;

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // Clear this thread's bit from the mask
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;

                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask.  This means that we are in 32-bit mode
                    // and have seen more than 32 HW threads for this procGroup.
                    // Don't use it.
#if defined(_WIN64)
                    SWR_INVALID("Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= threadMask;

                // Find NUMA node
                uint32_t numaId = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId)
                {
                    out_nodes.resize(numaId + 1);
                }
                auto& numaNode = out_nodes[numaId];
                numaNode.numaId = numaId;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);

#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t procId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t physId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            procId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            physId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }

        // A blank line ends one processor record; commit the parsed ids.
        if (line.length() == 0)
        {
            if (physId + 1 > out_nodes.size())
                out_nodes.resize(physId + 1);
            auto& numaNode = out_nodes[physId];
            numaNode.numaId = physId;

            if (coreId + 1 > numaNode.cores.size())
                numaNode.cores.resize(coreId + 1);
            auto& core = numaNode.cores[coreId];
            core.procGroup = coreId;
            core.threadIds.push_back(procId);
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#elif defined(__APPLE__)

    // macOS doesn't expose a full thread->core->package mapping, so distribute
    // the logical CPUs evenly across the reported packages and physical cores.
    auto numProcessors = 0;
    auto numCores = 0;
    auto numPhysicalIds = 0;

    int value;
    size_t size = sizeof(value);

    int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numPhysicalIds = value;

    result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numProcessors = value;

    result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numCores = value;

    out_nodes.resize(numPhysicalIds);

    for (auto physId = 0; physId < numPhysicalIds; ++physId)
    {
        auto& numaNode = out_nodes[physId];
        auto procId = 0;

        numaNode.cores.resize(numCores);

        while (procId < numProcessors)
        {
            for (auto coreId = 0; coreId < numCores; ++coreId, ++procId)
            {
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(procId);
            }
        }
    }

    out_numThreadsPerProcGroup = 0;

    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#else

#error Unsupported platform

#endif

    // Prune empty cores and numa nodes
    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
    {
        // Erase empty cores (first)
        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
        {
            if (core_it->threadIds.size() == 0)
            {
                core_it = node_it->cores.erase(core_it);
            }
            else
            {
                ++core_it;
            }
        }

        // Erase empty numa nodes (second)
        if (node_it->cores.size() == 0)
        {
            node_it = out_nodes.erase(node_it);
        }
        else
        {
            ++node_it;
        }
    }
}
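// Illustrative sketch (not part of the original source): a debug helper that
// walks the CPUNumaNodes structure built above and prints it, which is handy
// when validating the per-platform paths on a new machine. It assumes only the
// numaId/procGroup/threadIds fields used in this file, and is compiled out by
// default.
#if 0
static void DumpProcessorTopology(const CPUNumaNodes& nodes)
{
    for (auto& node : nodes)
    {
        printf("NUMA node %u: %u core(s)\n", node.numaId, (uint32_t)node.cores.size());
        for (auto& core : node.cores)
        {
            printf("  core (procGroup %u) HW threads:", core.procGroup);
            for (uint32_t tid : core.threadIds)
            {
                printf(" %u", tid);
            }
            printf("\n");
        }
    }
}
#endif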
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    // Bind application thread to HW thread 0
    bindThread(0);

    CPUNumaNodes nodes;
    uint32_t numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);

    uint32_t numHWNodes        = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();

    uint32_t numNodes        = numHWNodes;
    uint32_t numCoresPerNode = numHWCoresPerNode;
    uint32_t numHyperThreads = numHWHyperThreads;

    if (KNOB_MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
    }

    if (KNOB_MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
    }

    if (KNOB_MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
    }

    // Calculate numThreads
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;

    if (numThreads > KNOB_MAX_NUM_THREADS)
    {
        printf("WARNING: system thread count %u exceeds max %u, "
               "performance will be degraded\n",
               numThreads, KNOB_MAX_NUM_THREADS);
    }

    if (numThreads == 1)
    {
        // If only 1 worker thread, try to move it to an available
        // HW thread.  If that fails, use the API thread.
        if (numCoresPerNode < numHWCoresPerNode)
        {
            numCoresPerNode++;
        }
        else if (numHyperThreads < numHWHyperThreads)
        {
            numHyperThreads++;
        }
        else if (numNodes < numHWNodes)
        {
            numNodes++;
        }
        else
        {
            pPool->numThreads = 0;
            SET_KNOB(SINGLE_THREADED, true);
            return;
        }
    }
    else
    {
        // Save a HW thread for the API thread.
        numThreads--;
    }

    pPool->numThreads = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->inThreadShutdown = false;
    pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));

    uint32_t workerId = 0;
    for (uint32_t n = 0; n < numNodes; ++n)
    {
        auto& node = nodes[n];

        uint32_t numCores = numCoresPerNode;
        for (uint32_t c = 0; c < numCores; ++c)
        {
            auto& core = node.cores[c];
            for (uint32_t t = 0; t < numHyperThreads; ++t)
            {
                if (c == 0 && n == 0 && t == 0)
                {
                    // Skip core 0, thread 0 on node 0 to reserve for API thread
                    continue;
                }

                pPool->pThreadData[workerId].workerId = workerId;
                pPool->pThreadData[workerId].procGroupId = core.procGroup;
                pPool->pThreadData[workerId].threadId = core.threadIds[t];
                pPool->pThreadData[workerId].numaId = n;
                pPool->pThreadData[workerId].pContext = pContext;
                pPool->threads[workerId] = new std::thread(workerThread, &pPool->pThreadData[workerId]);

                ++workerId;
            }
        }
    }
}
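// Worked example (numbers assumed purely for illustration): on a 2-node system
// with 8 cores per node and 2 HW threads per core, and no KNOB_MAX_* limits in
// effect, the pool creates
//     numThreads = 2 * 8 * 2 - 1 = 31
// worker threads; the one subtracted HW thread (node 0, core 0, thread 0) is
// the slot reserved for the API thread bound by bindThread(0) above.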