void* mmap_1g(void* addr /* = nullptr */, int node /* = -1 */) { #ifdef __linux__ if (s_num1GPages >= kMaxNum1GPages) return nullptr; if (get_huge1g_info(node).free_hugepages <= 0) return nullptr; if (node >= 0 && !numa_node_allowed(node)) return nullptr; #ifdef HAVE_NUMA bitmask* memMask = nullptr; bitmask* interleaveMask = nullptr; if (node >= 0 && numa_num_nodes > 1) { memMask = numa_get_membind(); interleaveMask = numa_get_interleave_mask(); bitmask* mask = numa_allocate_nodemask(); numa_bitmask_setbit(mask, node); numa_set_membind(mask); numa_bitmask_free(mask); } #endif void* ret = mmap_1g_impl(addr); if (ret != nullptr) { s_1GPages[s_num1GPages++] = ret; } #ifdef HAVE_NUMA if (memMask) { assert(interleaveMask); numa_set_membind(memMask); numa_set_interleave_mask(interleaveMask); numa_bitmask_free(memMask); numa_bitmask_free(interleaveMask); } #endif return ret; #else return nullptr; #endif }
void* mmap_2m(void* addr, int prot, int node /* = -1 */, bool map_shared /* = false */, bool map_fixed /* = false */) { #ifdef __linux__ if (get_huge2m_info(node).free_hugepages <= 0) return nullptr; #ifdef HAVE_NUMA bitmask* memMask = nullptr; bitmask* interleaveMask = nullptr; if (node >= 0 && numa_num_nodes > 1) { assert(numa_node_set != 0); if ((numa_node_set & (1u << node)) == 0) { // Numa policy forbids allocation on the node. return nullptr; } memMask = numa_get_membind(); interleaveMask = numa_get_interleave_mask(); bitmask* mask = numa_allocate_nodemask(); numa_bitmask_setbit(mask, node); numa_set_membind(mask); numa_bitmask_free(mask); } #endif void* ret = mmap_2m_impl(addr, prot, map_shared, map_fixed); s_num2MPages += !!ret; #ifdef HAVE_NUMA if (memMask) { numa_set_membind(memMask); numa_set_interleave_mask(interleaveMask); numa_bitmask_free(memMask); numa_bitmask_free(interleaveMask); } #endif return ret; #else // not linux return nullptr; #endif }
/*
 * Apply a NUMA memory policy (@mode) restricted to @nodeset.
 *
 * Returns 0 on success (or when @nodeset is NULL — nothing to do),
 * -1 on error.
 */
int virNumaSetupMemoryPolicy(virDomainNumatuneMemMode mode,
                             virBitmapPtr nodeset)
{
    nodemask_t mask;
    int node = -1;
    int ret = -1;
    int bit = 0;
    size_t i;
    int maxnode = 0;

    if (!nodeset)
        return 0;

    if (!virNumaNodesetIsAvailable(nodeset))
        return -1;

    maxnode = numa_max_node();
    /* Clamp to the highest bit nodemask_t can hold.  nodemask_t has
     * NUMA_NUM_NODES bits (indices 0..NUMA_NUM_NODES-1); the previous
     * clamp to NUMA_NUM_NODES was off by one and let nodemask_set()
     * write one bit past the end of the mask. */
    maxnode = maxnode < NUMA_NUM_NODES ? maxnode : NUMA_NUM_NODES - 1;

    /* Convert nodemask to NUMA bitmask. */
    nodemask_zero(&mask);
    bit = -1;
    while ((bit = virBitmapNextSetBit(nodeset, bit)) >= 0) {
        if (bit > maxnode) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("NUMA node %d is out of range"), bit);
            return -1;
        }
        nodemask_set(&mask, bit);
    }

    switch (mode) {
    case VIR_DOMAIN_NUMATUNE_MEM_STRICT:
        /* Bind strictly; reset the bind policy afterwards so later
         * allocations in this process are not affected. */
        numa_set_bind_policy(1);
        numa_set_membind(&mask);
        numa_set_bind_policy(0);
        break;

    case VIR_DOMAIN_NUMATUNE_MEM_PREFERRED:
    {
        int nnodes = 0;
        for (i = 0; i < NUMA_NUM_NODES; i++) {
            if (nodemask_isset(&mask, i)) {
                node = i;
                nnodes++;
            }
        }

        /* 'preferred' takes a single node only. */
        if (nnodes != 1) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("NUMA memory tuning in 'preferred' mode "
                             "only supports single node"));
            goto cleanup;
        }

        numa_set_bind_policy(0);
        numa_set_preferred(node);
    }
    break;

    case VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE:
        numa_set_interleave_mask(&mask);
        break;

    case VIR_DOMAIN_NUMATUNE_MEM_LAST:
        break;
    }
    ret = 0;

 cleanup:
    return ret;
}
/*
 * Set up the process NUMA memory policy from @numatune, using either the
 * statically configured nodemask or the advisory @nodemask from numad
 * (automatic placement).
 *
 * Returns 0 on success (or when there is nothing to do), -1 on error.
 */
int virNumaSetupMemoryPolicy(virNumaTuneDef numatune,
                             virBitmapPtr nodemask)
{
    nodemask_t mask;
    int mode = -1;
    int node = -1;
    int ret = -1;
    int i = 0;
    int maxnode = 0;
    virBitmapPtr tmp_nodemask = NULL;

    if (numatune.memory.placement_mode ==
        VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_STATIC) {
        if (!numatune.memory.nodemask)
            return 0;
        VIR_DEBUG("Set NUMA memory policy with specified nodeset");
        tmp_nodemask = numatune.memory.nodemask;
    } else if (numatune.memory.placement_mode ==
               VIR_NUMA_TUNE_MEM_PLACEMENT_MODE_AUTO) {
        VIR_DEBUG("Set NUMA memory policy with advisory nodeset from numad");
        tmp_nodemask = nodemask;
    } else {
        return 0;
    }

    if (numa_available() < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       "%s", _("Host kernel is not aware of NUMA."));
        return -1;
    }

    /* maxnode is a count (highest node index + 1); valid node indices
     * are 0..maxnode-1. */
    maxnode = numa_max_node() + 1;

    /* Convert nodemask to NUMA bitmask. */
    nodemask_zero(&mask);
    i = -1;
    while ((i = virBitmapNextSetBit(tmp_nodemask, i)) >= 0) {
        /* The old `i > maxnode || i > NUMA_NUM_NODES` test was off by
         * one on both bounds: i == maxnode is already past the last
         * real node, and i == NUMA_NUM_NODES would make nodemask_set()
         * write past the end of nodemask_t. */
        if (i >= maxnode || i >= NUMA_NUM_NODES) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Nodeset is out of range, host cannot support "
                             "NUMA node bigger than %d"), i);
            return -1;
        }
        nodemask_set(&mask, i);
    }

    mode = numatune.memory.mode;

    if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
        /* Bind strictly; reset the bind policy afterwards so later
         * allocations in this process are not affected. */
        numa_set_bind_policy(1);
        numa_set_membind(&mask);
        numa_set_bind_policy(0);
    } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) {
        int nnodes = 0;
        for (i = 0; i < NUMA_NUM_NODES; i++) {
            if (nodemask_isset(&mask, i)) {
                node = i;
                nnodes++;
            }
        }

        /* 'preferred' takes a single node only. */
        if (nnodes != 1) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("NUMA memory tuning in 'preferred' mode "
                             "only supports single node"));
            goto cleanup;
        }

        numa_set_bind_policy(0);
        numa_set_preferred(node);
    } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) {
        numa_set_interleave_mask(&mask);
    } else {
        /* XXX: Shouldn't go here, as we already do checking when
         * parsing domain XML.
         */
        virReportError(VIR_ERR_XML_ERROR,
                       "%s", _("Invalid mode for memory NUMA tuning."));
        goto cleanup;
    }

    ret = 0;

 cleanup:
    return ret;
}
size_t remap_interleaved_2m_pages(void* addr, size_t pages, int prot, bool shared /* = false */) { #ifdef __linux__ assert(reinterpret_cast<uintptr_t>(addr) % size2m == 0); assert(addr != nullptr); if (pages == 0) return 0; #ifdef HAVE_NUMA const int maxNode = numa_max_node(); bitmask* memMask = nullptr; bitmask* interleaveMask = nullptr; bitmask* mask = nullptr; if (maxNode > 0) { memMask = numa_get_membind(); interleaveMask = numa_get_interleave_mask(); mask = numa_allocate_nodemask(); } #else constexpr int maxNode = 0; #endif int node = -1; int failed = 0; // consecutive failure count int mapped_count = 0; do { #ifdef HAVE_NUMA if (maxNode > 0) { if (++node > maxNode) node = 0; if (!numa_node_allowed(node)) { // Numa policy forbids allocation on node if (++failed > maxNode) break; continue; } numa_bitmask_setbit(mask, node); numa_set_membind(mask); numa_bitmask_clearbit(mask, node); } #endif // Fail early if we don't have huge pages reserved. if (get_huge2m_info(node).free_hugepages > 0 && mmap_2m_impl(addr, prot, shared, true /* MAP_FIXED */)) { addr = (char*)addr + size2m; ++mapped_count; failed = 0; continue; } // We failed on node, give up if we have failed on all nodes if (++failed > maxNode) break; } while (mapped_count < pages); #ifdef HAVE_NUMA if (mask) { numa_set_membind(memMask); numa_set_interleave_mask(interleaveMask); numa_bitmask_free(mask); numa_bitmask_free(interleaveMask); numa_bitmask_free(memMask); } #endif return mapped_count; #else // not linux return 0; #endif }
/*
 * numactl entry point: parse command-line options, apply the requested
 * NUMA policy / CPU binding (either to the process or, when a shared
 * memory segment is attached via --shm/--file, to that segment), and
 * finally exec the target command.
 *
 * Returns only via exit()/exec; the trailing `return 0` is unreachable.
 */
int main(int ac, char **av)
{
	int c, i, nnodes=0;
	long node=-1;
	char *end;
	char shortopts[array_len(opts)*2 + 1];
	struct bitmask *mask = NULL;

	get_short_opts(opts,shortopts);
	while ((c = getopt_long(ac, av, shortopts, opts, NULL)) != -1) {
		switch (c) {
		case 's': /* --show */
			show();
			exit(0);
		case 'H': /* --hardware */
			nopolicy();
			hardware();
			exit(0);
		case 'i': /* --interleave */
			checknuma();
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			setpolicy(MPOL_INTERLEAVE);
			/* With an attached shm segment, apply the policy to
			 * the segment only; otherwise to the process. */
			if (shmfd >= 0)
				numa_interleave_memory(shmptr, shmlen, mask);
			else
				numa_set_interleave_mask(mask);
			checkerror("setting interleave mask");
			break;
		case 'N': /* --cpunodebind */
		case 'c': /* --cpubind */
			dontshm("-c/--cpubind/--cpunodebind");
			checknuma();
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			check_cpubind(do_shm);
			did_cpubind = 1;
			numa_run_on_node_mask(mask);
			checkerror("sched_setaffinity");
			break;
		case 'C': /* --physcpubind */
		{
			struct bitmask *cpubuf;
			dontshm("-C/--physcpubind");
			cpubuf = numa_parse_cpustring(optarg);
			if (!cpubuf) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			check_cpubind(do_shm);
			did_cpubind = 1;
			numa_sched_setaffinity(0, cpubuf);
			checkerror("sched_setaffinity");
			free(cpubuf);
			break;
		}
		case 'm': /* --membind */
			checknuma();
			setpolicy(MPOL_BIND);
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			/* Temporarily switch libnuma to strict binding. */
			numa_set_bind_policy(1);
			if (shmfd >= 0) {
				numa_tonodemask_memory(shmptr, shmlen, mask);
			} else {
				numa_set_membind(mask);
			}
			numa_set_bind_policy(0);
			checkerror("setting membind");
			break;
		case 'p': /* --preferred */
			checknuma();
			setpolicy(MPOL_PREFERRED);
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			/* --preferred accepts exactly one node. */
			for (i=0; i<mask->size; i++) {
				if (numa_bitmask_isbitset(mask, i)) {
					node = i;
					nnodes++;
				}
			}
			if (nnodes != 1)
				usage();
			/* NOTE(review): mask is freed here but not reset to
			 * NULL, so a later --verify on the same command line
			 * would hand the dangling pointer to verify_shm() —
			 * confirm that option ordering rules make this
			 * impossible. */
			numa_bitmask_free(mask);
			errno = 0;
			numa_set_bind_policy(0);
			if (shmfd >= 0)
				numa_tonode_memory(shmptr, shmlen, node);
			else
				numa_set_preferred(node);
			checkerror("setting preferred node");
			break;
		case 'l': /* --local */
			checknuma();
			setpolicy(MPOL_DEFAULT);
			errno = 0;
			if (shmfd >= 0)
				numa_setlocal_memory(shmptr, shmlen);
			else
				numa_set_localalloc();
			checkerror("local allocation");
			break;
		case 'S': /* --shm */
			check_cpubind(did_cpubind);
			nopolicy();
			attach_sysvshm(optarg, "--shm");
			shmattached = 1;
			break;
		case 'f': /* --file */
			check_cpubind(did_cpubind);
			nopolicy();
			attach_shared(optarg, "--file");
			shmattached = 1;
			break;
		case 'L': /* --length */
			noshm("--length");
			shmlen = memsize(optarg);
			break;
		case 'M': /* --shmmode */
			noshm("--shmmode");
			/* Octal, like chmod. */
			shmmode = strtoul(optarg, &end, 8);
			if (end == optarg || *end)
				usage();
			break;
		case 'd': /* --dump */
			if (shmfd < 0)
				complain(
				"Cannot do --dump without shared memory.\n");
			dump_shm();
			do_dump = 1;
			break;
		case 'D': /* --dump-nodes */
			if (shmfd < 0)
				complain(
				"Cannot do --dump-nodes without shared memory.\n");
			dump_shm_nodes();
			do_dump = 1;
			break;
		case 't': /* --strict */
			did_strict = 1;
			numa_set_strict(1);
			break;
		case 'I': /* --shmid */
			shmid = strtoul(optarg, &end, 0);
			if (end == optarg || *end)
				usage();
			break;
		case 'u': /* --huge */
			noshm("--huge");
			shmflags |= SHM_HUGETLB;
			break;
		case 'o': /* --offset */
			noshm("--offset");
			shmoffset = memsize(optarg);
			break;
		case 'T': /* --touch */
			needshm("--touch");
			check_shmbeyond("--touch");
			numa_police_memory(shmptr, shmlen);
			break;
		case 'V': /* --verify */
			needshm("--verify");
			if (set_policy < 0)
				complain("Need a policy first to verify");
			check_shmbeyond("--verify");
			/* Touch the pages so the kernel commits placement,
			 * then compare against the requested policy. */
			numa_police_memory(shmptr, shmlen);
			if (!mask)
				complain("Need a mask to verify");
			else
				verify_shm(set_policy, mask);
			break;
		default:
			usage();
		}
	}

	av += optind;
	ac -= optind;

	/* Shared-memory mode: all requested work happened in the option
	 * loop above; there is no command to exec. */
	if (shmfd >= 0) {
		if (*av)
			usage();
		exit(exitcode);
	}

	if (did_strict)
		fprintf(stderr,
			"numactl: warning. Strict flag for process ignored.\n");

	if (do_dump)
		usage_msg("cannot do --dump|--dump-shm for process");

	if (shmoption)
		usage_msg("shm related option %s for process", shmoption);

	if (*av == NULL)
		usage();

	/* Replace ourselves with the target command, which inherits the
	 * policy/affinity set above. */
	execvp(*av, av);

	complain("execution of `%s': %s\n", av[0], strerror(errno));

	return 0; /* not reached */
}
/*
 * Assign each shepherd to a NUMA node (round-robin over the nodes the
 * host reports), install an interleave mask covering the used nodes,
 * and build per-shepherd distance tables plus a list of the other
 * shepherds (sorted by distance when there is more than one).
 *
 * Returns QTHREAD_SUCCESS, or QTHREAD_THIRD_PARTY_ERROR when libnuma
 * reports that NUMA is unavailable.
 */
int INTERNAL qt_affinity_gendists(qthread_shepherd_t *sheps,
                                  qthread_shepherd_id_t nshepherds)
{   /*{{{ */
    const size_t num_extant_nodes = numa_max_node() + 1;
    nodemask_t   bmask;

    /* Cast size_t/typedef'd arguments to match the %u specifiers:
     * passing a size_t where unsigned int is expected is undefined
     * behavior on LP64 platforms. */
    qthread_debug(AFFINITY_FUNCTIONS,
                  "sheps(%p), nshepherds(%u), num_extant_nodes:%u\n",
                  sheps, (unsigned)nshepherds, (unsigned)num_extant_nodes);
    if (numa_available() == -1) {
        return QTHREAD_THIRD_PARTY_ERROR;
    }
    nodemask_zero(&bmask);
    /* assign nodes */
    qthread_debug(AFFINITY_DETAILS, "assign nodes...\n");
    for (size_t i = 0; i < nshepherds; ++i) {
        sheps[i].node = i % num_extant_nodes;
        qthread_debug(AFFINITY_DETAILS, "set bit %u in bmask\n",
                      (unsigned)(i % num_extant_nodes));
        nodemask_set(&bmask, i % num_extant_nodes);
    }
    qthread_debug(AFFINITY_DETAILS, "numa_set_interleave_mask\n");
    numa_set_interleave_mask(&bmask);
    qthread_debug(AFFINITY_DETAILS, "querying distances...\n");
    /* truly ancient versions of libnuma (in the changelog, this is
     * considered "pre-history") do not have numa_distance() */
    for (qthread_shepherd_id_t i = 0; i < nshepherds; i++) {
        qthread_debug(AFFINITY_DETAILS, "i = %u < %u...\n",
                      (unsigned)i, (unsigned)nshepherds);
        const unsigned int node_i = sheps[i].node;
        size_t j, k;
        sheps[i].shep_dists = calloc(nshepherds, sizeof(unsigned int));
        sheps[i].sorted_sheplist = calloc(nshepherds - 1,
                                          sizeof(qthread_shepherd_id_t));
        qthread_debug(AFFINITY_DETAILS, "allocs %p %p\n",
                      sheps[i].shep_dists, sheps[i].sorted_sheplist);
        /* NOTE(review): allocation failure is only caught by assert(),
         * which compiles out under NDEBUG — confirm callers tolerate a
         * NULL table here or add a real error path. */
        assert(sheps[i].shep_dists);
        assert(sheps[i].sorted_sheplist);
        for (j = 0; j < nshepherds; j++) {
            const unsigned int node_j = sheps[j].node;
#if QTHREAD_NUMA_DISTANCE_WORKING
            if ((node_i != QTHREAD_NO_NODE) && (node_j != QTHREAD_NO_NODE) &&
                (node_i != node_j)) {
                sheps[i].shep_dists[j] = numa_distance(node_i, node_j);
            } else {
#endif
                /* XXX too arbitrary */
                if (i == j) {
                    sheps[i].shep_dists[j] = 0;
                } else {
                    sheps[i].shep_dists[j] = 20;
                }
#if QTHREAD_NUMA_DISTANCE_WORKING
            }
#endif
            qthread_debug(AFFINITY_DETAILS,
                          "shep %u to shep %u distance: %u\n",
                          (unsigned)i, (unsigned)j, sheps[i].shep_dists[j]);
        }
        /* Collect every other shepherd's id... */
        k = 0;
        for (j = 0; j < nshepherds; j++) {
            if (j != i) {
                sheps[i].sorted_sheplist[k++] = j;
            }
        }
        /* ...and order them by distance from shepherd i. */
        if (nshepherds > 1) {
            sort_sheps(sheps[i].shep_dists, sheps[i].sorted_sheplist,
                       nshepherds);
        }
    }
    return QTHREAD_SUCCESS;
}   /*}}} */
/*
 * Apply the container's <numatune> memory policy to the controller
 * process so that guest memory is allocated per configuration.
 *
 * Returns 0 on success (or when no nodemask is configured), -1 on error.
 */
static int virLXCControllerSetupNUMAPolicy(virLXCControllerPtr ctrl)
{
    nodemask_t mask;
    int mode = -1;
    int node = -1;
    int ret = -1;
    int i = 0;
    int maxnode = 0;
    bool warned = false;

    if (!ctrl->def->numatune.memory.nodemask)
        return 0;

    VIR_DEBUG("Setting NUMA memory policy");

    if (numa_available() < 0) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       "%s", _("Host kernel is not aware of NUMA."));
        return -1;
    }

    /* maxnode is a count (highest node index + 1); valid node indices
     * are 0..maxnode-1. */
    maxnode = numa_max_node() + 1;

    /* Convert nodemask to NUMA bitmask. */
    nodemask_zero(&mask);
    i = -1;
    while ((i = virBitmapNextSetBit(ctrl->def->numatune.memory.nodemask,
                                    i)) >= 0) {
        /* nodemask_t holds bits 0..NUMA_NUM_NODES-1; the old `>` test
         * was off by one and let nodemask_set() write past the mask. */
        if (i >= NUMA_NUM_NODES) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Host cannot support NUMA node %d"), i);
            return -1;
        }
        /* Likewise off by one: i == maxnode is already past the last
         * real node, so it must trigger the warning too. */
        if (i >= maxnode && !warned) {
            VIR_WARN("nodeset is out of range, there is only %d NUMA "
                     "nodes on host", maxnode);
            warned = true;
        }
        nodemask_set(&mask, i);
    }

    mode = ctrl->def->numatune.memory.mode;

    if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) {
        /* Bind strictly; reset the bind policy afterwards so later
         * allocations in this process are not affected. */
        numa_set_bind_policy(1);
        numa_set_membind(&mask);
        numa_set_bind_policy(0);
    } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_PREFERRED) {
        int nnodes = 0;
        for (i = 0; i < NUMA_NUM_NODES; i++) {
            if (nodemask_isset(&mask, i)) {
                node = i;
                nnodes++;
            }
        }

        /* 'preferred' takes a single node only. */
        if (nnodes != 1) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           "%s", _("NUMA memory tuning in 'preferred' mode "
                                   "only supports single node"));
            goto cleanup;
        }

        numa_set_bind_policy(0);
        numa_set_preferred(node);
    } else if (mode == VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE) {
        numa_set_interleave_mask(&mask);
    } else {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Unable to set NUMA policy %s"),
                       virDomainNumatuneMemModeTypeToString(mode));
        goto cleanup;
    }

    ret = 0;

 cleanup:
    return ret;
}