std::size_t init_numa_node_number( std::size_t num_thread ) { // {{{ if (std::size_t(-1) == num_thread) return std::size_t(-1); UCHAR node_number = 0; if (GetNumaProcessorNode(UCHAR(num_thread), &node_number)) return node_number; std::size_t num_of_cores = hardware_concurrency(); if (0 == num_of_cores) num_of_cores = 1; // assume one core std::size_t num_of_numa_cores = num_of_cores; ULONG numa_nodes = 0; if (GetNumaHighestNodeNumber(&numa_nodes) && 0 != numa_nodes) num_of_numa_cores = num_of_cores / (numa_nodes + 1); return num_thread / num_of_numa_cores; } // }}}
ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools) { enum { MAX_NODE_NUM = 127 }; int cpusPerNode[MAX_NODE_NUM + 1]; memset(cpusPerNode, 0, sizeof(cpusPerNode)); int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); int cpuCount = getCpuCount(); bool bNumaSupport = false; #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 bNumaSupport = true; #elif HAVE_LIBNUMA bNumaSupport = numa_available() >= 0; #endif for (int i = 0; i < cpuCount; i++) { #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 UCHAR node; if (GetNumaProcessorNode((UCHAR)i, &node)) cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++; else #elif HAVE_LIBNUMA if (bNumaSupport >= 0) cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++; else #endif cpusPerNode[0]++; } if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG) for (int i = 0; i < numNumaNodes; i++) x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]); /* limit nodes based on param->numaPools */ if (p->numaPools && *p->numaPools) { const char *nodeStr = p->numaPools; for (int i = 0; i < numNumaNodes; i++) { if (!*nodeStr) { cpusPerNode[i] = 0; continue; } else if (*nodeStr == '-') cpusPerNode[i] = 0; else if (*nodeStr == '*') break; else if (*nodeStr == '+') ; else { int count = atoi(nodeStr); cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]); } /* consume current node string, comma, and white-space */ while (*nodeStr && *nodeStr != ',') ++nodeStr; if (*nodeStr == ',' || *nodeStr == ' ') ++nodeStr; } } // In the case that numa is disabled and we have more CPUs than 64, // spawn the last pool only if the # threads in that pool is > 1/2 max (heuristic) if ((numNumaNodes == 1) && (cpusPerNode[0] % MAX_POOL_THREADS < (MAX_POOL_THREADS / 2))) { cpusPerNode[0] -= (cpusPerNode[0] % MAX_POOL_THREADS); x265_log(p, X265_LOG_DEBUG, "Creating only %d worker threads to prevent asymmetry in pools; may not use all HW contexts\n", cpusPerNode[0]); } numPools = 0; for (int i = 0; i < numNumaNodes; i++) { if (bNumaSupport) x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]); if (cpusPerNode[i]) numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS; } if (!numPools) return NULL; if (numPools > p->frameNumThreads) { x265_log(p, X265_LOG_DEBUG, "Reducing number of thread pools for frame thread count\n"); numPools = X265_MAX(p->frameNumThreads / 2, 1); } ThreadPool *pools = new ThreadPool[numPools]; if (pools) { int maxProviders = (p->frameNumThreads + numPools - 1) / numPools + 1; /* +1 is Lookahead, always assigned to threadpool 0 */ int node = 0; for (int i = 0; i < numPools; i++) { while (!cpusPerNode[node]) node++; int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]); if (!pools[i].create(cores, maxProviders, node)) { X265_FREE(pools); numPools = 0; return NULL; } if (numNumaNodes > 1) x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node); else x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores); cpusPerNode[node] -= cores; } } else numPools = 0; return pools; }