/*
 * Parse the string @a maskStr containing a hex number (with or without
 * leading "0x") and set nodemask accordingly.
 *
 * If the string is not a valid hex number, all bits in nodemask are set.
 */
static void parseNUMAmask(struct bitmask *nodemask, char *maskStr, int32_t rank)
{
    char *mask, *curchar, *endptr;
    size_t len;
    uint32_t curbit;
    uint16_t i, j, digit;

    mask = maskStr;
    if (strncmp(maskStr, "0x", 2) == 0) {
        /* skip "0x", treat always as hex */
        mask += 2;
    }

    mask = ustrdup(mask); /* gets destroyed */

    len = strlen(mask);
    curchar = mask + (len - 1);
    curbit = 0;
    for (i = len; i > 0; i--) {
        digit = strtol(curchar, &endptr, 16);
        if (*endptr != '\0') {
            mlog("%s: error parsing memory mask '%s'\n", __func__, maskStr);
            goto error;
        }

        for (j = 0; j < 4; j++) {
            if (digit & (1 << j)) {
                if ((long int)(curbit + j) > numa_max_node()) {
                    mlog("%s: invalid memory mask entry '%s' for rank %d\n",
                         __func__, maskStr, rank);
                    fprintf(stderr, "Invalid memory mask entry '%s' for rank"
                            " %d\n", maskStr, rank);
                    goto error;
                }
                if (numa_bitmask_isbitset(numa_get_mems_allowed(),
                                          curbit + j)) {
                    numa_bitmask_setbit(nodemask, curbit + j);
                } else {
                    mlog("%s: setting bit %u in memory mask not allowed in"
                         " rank %d\n", __func__, curbit + j, rank);
                    fprintf(stderr, "Not allowed to set bit %u in memory mask"
                            " of rank %d\n", curbit + j, rank);
                }
            }
        }
        curbit += 4;
        *curchar = '\0';
        curchar--;
    }

    ufree(mask);
    return;

error:
    ufree(mask);
    numa_bitmask_setall(nodemask);
}
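A standalone sketch of the same digit-to-bit mapping may help when reading the loop above: the parser walks the hex string from its least significant digit and examines four node bits per digit. The program below is an illustration only, not psslurm code; it uses plain libnuma calls and standard C instead of the ustrdup()/ufree()/mlog() helpers, hard-codes an example mask, and builds with -lnuma.

/*
 * Illustration only (assumed standalone program, not part of psslurm):
 * translate a hex mask string such as "0x5" into NUMA node bits 0 and 2.
 */
#include <numa.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *maskStr = "0x5";          /* example mask: nodes 0 and 2 */

    if (numa_available() == -1) {
        fprintf(stderr, "NUMA not available\n");
        return 1;
    }

    struct bitmask *nodemask = numa_allocate_nodemask();
    const char *hex = (strncmp(maskStr, "0x", 2) == 0) ? maskStr + 2 : maskStr;

    /* walk the hex digits from least significant to most significant */
    for (size_t pos = strlen(hex); pos > 0; pos--) {
        char c = hex[pos - 1];
        int digit = (c >= '0' && c <= '9') ? c - '0'
                  : (c >= 'a' && c <= 'f') ? c - 'a' + 10
                  : (c >= 'A' && c <= 'F') ? c - 'A' + 10 : -1;
        if (digit < 0) {
            fprintf(stderr, "invalid mask '%s'\n", maskStr);
            numa_free_nodemask(nodemask);
            return 1;
        }
        unsigned base = 4 * (unsigned)(strlen(hex) - pos);
        for (unsigned j = 0; j < 4; j++) {
            /* set a bit only for nodes that actually exist on this machine */
            if ((digit & (1 << j)) && (long)(base + j) <= numa_max_node()) {
                numa_bitmask_setbit(nodemask, base + j);
            }
        }
    }

    printf("node 0 set: %d, node 2 set: %d\n",
           numa_bitmask_isbitset(nodemask, 0),
           numa_bitmask_isbitset(nodemask, 2));
    numa_free_nodemask(nodemask);
    return 0;
}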
int init_virtual_topology(config_t* cfg, cpu_model_t* cpu_model, virtual_topology_t** virtual_topologyp)
{
    char* mc_pci_file;
    char* str;
    char* saveptr;
    char* token = NULL;
    int* physical_node_ids;
    physical_node_t** physical_nodes = NULL;
    int num_physical_nodes;
    int n, v, i, j, sibling_idx, node_i_idx;
    int node_id;
    physical_node_t* node_i, *node_j, *sibling_node;
    int ret;
    int min_distance;
    int hyperthreading;
    struct bitmask* mem_nodes;
    virtual_topology_t* virtual_topology;

    __cconfig_lookup_string(cfg, "topology.physical_nodes", &str);

    // parse the physical nodes string
    physical_node_ids = calloc(numa_num_possible_nodes(), sizeof(*physical_node_ids));
    num_physical_nodes = 0;
    while ((token = strtok_r(str, ",", &saveptr)) != NULL) {
        if (num_physical_nodes >= numa_num_possible_nodes()) {
            // we're being asked to run on more nodes than are available
            free(physical_node_ids);
            ret = E_ERROR;
            goto done;
        }
        physical_node_ids[num_physical_nodes++] = atoi(token);
        str = NULL;
    }
    physical_nodes = calloc(num_physical_nodes, sizeof(*physical_nodes));

    // select those nodes we can run on (e.g. not constrained by any numactl)
    mem_nodes = numa_get_mems_allowed();
    for (i = 0, n = 0; i < num_physical_nodes; i++) {
        node_id = physical_node_ids[i];
        if (numa_bitmask_isbitset(mem_nodes, node_id)) {
            physical_nodes[n] = malloc(sizeof(**physical_nodes));
            physical_nodes[n]->node_id = node_id;
            // TODO: what if we want to avoid using only a single hardware context of a hyperthreaded core?
            physical_nodes[n]->cpu_bitmask = numa_allocate_cpumask();
            numa_node_to_cpus(node_id, physical_nodes[n]->cpu_bitmask);
            __cconfig_lookup_bool(cfg, "topology.hyperthreading", &hyperthreading);
            if (hyperthreading) {
                physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask);
            } else {
                DBG_LOG(INFO, "Not using hyperthreading.\n");
                // disable the upper half of the processors in the bitmask
                physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask) / 2;
                int fc = first_cpu(physical_nodes[n]->cpu_bitmask);
                for (j = fc + system_num_cpus()/2; j < fc + system_num_cpus()/2 + physical_nodes[n]->num_cpus; j++) {
                    if (numa_bitmask_isbitset(physical_nodes[n]->cpu_bitmask, j)) {
                        numa_bitmask_clearbit(physical_nodes[n]->cpu_bitmask, j);
                    }
                }
            }
            n++;
        }
    }
    free(physical_node_ids);
    num_physical_nodes = n;

    // if the pci bus topology of each physical node is not provided then discover it
    if (__cconfig_lookup_string(cfg, "topology.mc_pci", &mc_pci_file) == CONFIG_FALSE ||
        load_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes) != E_SUCCESS)
    {
        discover_mc_pci_topology(cpu_model, physical_nodes, num_physical_nodes);
        save_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes);
    }

    // form virtual nodes by grouping physical nodes that are close to each other
    virtual_topology = malloc(sizeof(*virtual_topology));
    virtual_topology->num_virtual_nodes = num_physical_nodes / 2 + num_physical_nodes % 2;
    virtual_topology->virtual_nodes = calloc(virtual_topology->num_virtual_nodes,
                                             sizeof(*(virtual_topology->virtual_nodes)));

    for (i = 0, v = 0; i < num_physical_nodes; i++) {
        min_distance = INT_MAX;
        sibling_node = NULL;
        sibling_idx = -1;
        if ((node_i = physical_nodes[i]) == NULL) {
            continue;
        }
        for (j = i+1; j < num_physical_nodes; j++) {
            if ((node_j = physical_nodes[j]) == NULL) {
                continue;
            }
            if (numa_distance(node_i->node_id, node_j->node_id) < min_distance) {
                min_distance = numa_distance(node_i->node_id, node_j->node_id);
                sibling_node = node_j;
                sibling_idx = j;
            }
        }
        if (sibling_node) {
            physical_nodes[i] = physical_nodes[sibling_idx] = NULL;
            virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v];
            virtual_node->dram_node = node_i;
            virtual_node->nvram_node = sibling_node;
            virtual_node->node_id = v;
            virtual_node->cpu_model = cpu_model;
            DBG_LOG(INFO, "Fusing physical nodes %d %d into virtual node %d\n",
                    node_i->node_id, sibling_node->node_id, virtual_node->node_id);
            v++;
        }
    }

    // any physical node that is not paired with another physical node is
    // formed into a virtual node on its own
    if (2*v < num_physical_nodes) {
        for (i = 0; i < num_physical_nodes; i++) {
            if ((node_i = physical_nodes[i]) == NULL) {
                continue;
            }
            virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v];
            virtual_node->dram_node = virtual_node->nvram_node = node_i;
            virtual_node->node_id = v;
            DBG_LOG(WARNING, "Forming physical node %d into virtual node %d without a sibling node.\n",
                    node_i->node_id, virtual_node->node_id);
            v++;
        }
    }

    *virtual_topologyp = virtual_topology;
    ret = E_SUCCESS;

done:
    free(physical_nodes);
    return ret;
}
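The pairing loop above relies on numa_distance() to decide which physical nodes to fuse into a DRAM/NVRAM virtual node: each node is paired with its closest remaining peer. The short program below is an illustration only, not part of the library; it prints the SLIT-style distance matrix so the expected pairings on a given machine can be checked by eye (build with -lnuma).

/*
 * Illustration only: dump the NUMA distance matrix that drives the
 * virtual-node pairing. numa_distance() returns the ACPI SLIT value,
 * where 10 means "local".
 */
#include <numa.h>
#include <stdio.h>

int main(void)
{
    if (numa_available() == -1) {
        fprintf(stderr, "NUMA not available\n");
        return 1;
    }

    int max = numa_max_node();

    printf("      ");
    for (int j = 0; j <= max; j++) printf("%6d", j);
    printf("\n");

    for (int i = 0; i <= max; i++) {
        printf("%6d", i);
        for (int j = 0; j <= max; j++) {
            printf("%6d", numa_distance(i, j));
        }
        printf("\n");
    }
    return 0;
}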
/**
 * @brief Do memory binding.
 *
 * This is handling the binding types map_mem, mask_mem and rank.
 * The types local (default) and none are handled directly by the daemon.
 *
 * When using libnuma with API v1, this is a noop, just giving a warning.
 *
 * @param step Step structure
 *
 * @param task Task structure
 *
 * @return No return value.
 */
void doMemBind(Step_t *step, PStask_t *task)
{
# ifndef HAVE_NUMA_ALLOCATE_NODEMASK
    mlog("%s: psslurm does not support memory binding types map_mem, mask_mem"
         " and rank with libnuma v1\n", __func__);
    fprintf(stderr, "Memory binding type not supported with the libnuma"
            " version in use\n");
    return;
# else
    const char delimiters[] = ",";
    uint32_t lTID;
    char *next, *saveptr, *ents, *myent, *endptr;
    char **entarray;
    unsigned int numents;
    uint16_t mynode;

    struct bitmask *nodemask = NULL;

    if (!(step->memBindType & MEM_BIND_MAP)
        && !(step->memBindType & MEM_BIND_MASK)
        && !(step->memBindType & MEM_BIND_RANK)) {
        /* things are handled elsewhere */
        return;
    }

    if (!PSIDnodes_bindMem(PSC_getMyID()) || getenv("__PSI_NO_MEMBIND")) {
        // info messages already printed in doClamps()
        return;
    }

    if (numa_available() == -1) {
        fprintf(stderr, "NUMA not available:");
        return;
    }

    nodemask = numa_allocate_nodemask();
    if (!nodemask) {
        fprintf(stderr, "Allocation of nodemask failed:");
        return;
    }

    lTID = getLocalRankID(task->rank, step, step->localNodeId);

    if (step->memBindType & MEM_BIND_RANK) {
        if (lTID > (unsigned int)numa_max_node()) {
            mlog("%s: memory binding to ranks not possible for rank %d."
                 " (local rank %u > #numa_nodes %d)\n", __func__,
                 task->rank, lTID, numa_max_node());
            fprintf(stderr, "Memory binding to ranks not possible for rank %d,"
                    " local rank %u larger than max numa node %d.",
                    task->rank, lTID, numa_max_node());
            if (nodemask) numa_free_nodemask(nodemask);
            return;
        }
        if (numa_bitmask_isbitset(numa_get_mems_allowed(), lTID)) {
            numa_bitmask_setbit(nodemask, lTID);
        } else {
            mlog("%s: setting bit %u in memory mask not allowed in rank"
                 " %d\n", __func__, lTID, task->rank);
            fprintf(stderr, "Not allowed to set bit %u in memory mask"
                    " of rank %d\n", lTID, task->rank);
        }
        numa_set_membind(nodemask);
        if (nodemask) numa_free_nodemask(nodemask);
        return;
    }

    ents = ustrdup(step->memBind);
    entarray = umalloc(step->tasksToLaunch[step->localNodeId] * sizeof(char*));
    numents = 0;
    myent = NULL;
    entarray[0] = NULL;

    next = strtok_r(ents, delimiters, &saveptr);
    while (next && (numents < step->tasksToLaunch[step->localNodeId])) {
        entarray[numents++] = next;
        if (numents == lTID + 1) {
            myent = next;
            break;
        }
        next = strtok_r(NULL, delimiters, &saveptr);
    }

    if (!myent && numents) {
        myent = entarray[lTID % numents];
    }

    if (!myent) {
        numa_set_membind(numa_all_nodes_ptr);
        if (step->memBindType & MEM_BIND_MASK) {
            mlog("%s: invalid mem mask string '%s'\n", __func__, ents);
        }
        else if (step->memBindType & MEM_BIND_MAP) {
            mlog("%s: invalid mem map string '%s'\n", __func__, ents);
        }
        goto cleanup;
    }

    if (step->memBindType & MEM_BIND_MAP) {
        if (strncmp(myent, "0x", 2) == 0) {
            mynode = strtoul(myent + 2, &endptr, 16);
        } else {
            mynode = strtoul(myent, &endptr, 10);
        }

        if (*endptr == '\0' && mynode <= numa_max_node()) {
            if (numa_bitmask_isbitset(numa_get_mems_allowed(), mynode)) {
                numa_bitmask_setbit(nodemask, mynode);
            } else {
                mlog("%s: setting bit %d in memory mask not allowed in rank"
                     " %d\n", __func__, mynode, task->rank);
                fprintf(stderr, "Not allowed to set bit %d in memory mask"
                        " of rank %d\n", mynode, task->rank);
            }
        } else {
            mlog("%s: invalid memory map entry '%s' (%d) for rank %d\n",
                 __func__, myent, mynode, task->rank);
            fprintf(stderr, "Invalid memory map entry '%s' for rank %d\n",
                    myent, task->rank);
            numa_set_membind(numa_all_nodes_ptr);
            goto cleanup;
        }
        mdbg(PSSLURM_LOG_PART, "%s: (bind_map) node %i local task %i"
             " memstr '%s'\n", __func__, step->localNodeId, lTID, myent);
    } else if (step->memBindType & MEM_BIND_MASK) {
        parseNUMAmask(nodemask, myent, task->rank);
    }

    numa_set_membind(nodemask);

cleanup:
    ufree(ents);
    ufree(entarray);
    if (nodemask) numa_free_nodemask(nodemask);
# endif

    return;
}