/* return the numa node for the given cpuid */
extern uint16_t slurm_get_numa_node(uint16_t cpuid)
{
    uint16_t maxcpus = 0, nnid = 0;
    int size, retry, max_node;
    unsigned long *cpu_mask;

    maxcpus = conf->sockets * conf->cores * conf->threads;
    if (cpuid >= maxcpus)
        return 0;

    if (numa_array) {
        return numa_array[cpuid];
    }

    /* need to load the numa_array */
    max_node = numa_max_node();

    /* The required size of the mask buffer for numa_node_to_cpus()
     * is goofed up. The third argument is supposed to be the size
     * of the mask, which is an array of unsigned longs. The *unit*
     * of the third argument is unclear - should it be in bytes or
     * in unsigned longs??? Since I don't know, I'm using this retry
     * loop to try and determine an acceptable size. If anyone can
     * fix this interaction, please do!! */
    size = 8;
    cpu_mask = xmalloc(sizeof(unsigned long) * size);
    retry = 0;
    while (retry++ < 8 && numa_node_to_cpus(nnid, cpu_mask, size) < 0) {
        size *= 2;
        xrealloc(cpu_mask, sizeof(unsigned long) * size);
    }
    if (retry >= 8) {
        xfree(cpu_mask);
        error("NUMA problem with numa_node_to_cpus arguments");
        return 0;
    }

    numa_array = xmalloc(sizeof(uint16_t) * maxcpus);
    _add_numa_mask_to_array(cpu_mask, size, maxcpus, nnid);

    while (nnid++ < max_node) {
        if (numa_node_to_cpus(nnid, cpu_mask, size) < 0) {
            error("NUMA problem - numa_node_to_cpus 2nd call fail");
            xfree(cpu_mask);
            xfree(numa_array);
            numa_array = NULL;
            return 0;
        }
        _add_numa_mask_to_array(cpu_mask, size, maxcpus, nnid);
    }
    xfree(cpu_mask);

    return numa_array[cpuid];
}
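The retry loop above works around the ambiguity of the third argument in the libnuma v1 signature of numa_node_to_cpus() (buffer length in bytes vs. unsigned longs). The v2 interface sidesteps the question by taking a struct bitmask that libnuma sizes itself; a minimal sketch of that variant follows (a standalone illustration, not part of the Slurm code above):

#include <numa.h>
#include <stdio.h>

/* Minimal sketch using the libnuma v2 bitmask API: list the CPUs of one
 * node without having to guess the length unit of a raw mask buffer. */
static void list_node_cpus(int node)
{
    struct bitmask *cpus;

    if (numa_available() < 0)           /* no NUMA support on this system */
        return;

    cpus = numa_allocate_cpumask();     /* sized for all configured CPUs */
    if (numa_node_to_cpus(node, cpus) == 0) {
        for (unsigned int i = 0; i < cpus->size; i++)
            if (numa_bitmask_isbitset(cpus, i))
                printf("node %d: cpu %u\n", node, i);
    }
    numa_free_cpumask(cpus);
}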
int main(void)
{
    int i, k, w, ncpus;
    struct bitmask *cpus;
    int maxnode = numa_num_configured_nodes() - 1;

    if (numa_available() < 0) {
        printf("no numa\n");
        exit(1);
    }

    cpus = numa_allocate_cpumask();
    ncpus = cpus->size;

    for (i = 0; i <= maxnode; i++) {
        if (numa_node_to_cpus(i, cpus) < 0) {
            printf("node %d failed to convert\n", i);
        }
        printf("%d: ", i);
        w = 0;
        for (k = 0; k < ncpus; k++)
            if (numa_bitmask_isbitset(cpus, k))
                printf(" %s%d", w++ > 0 ? "," : "", k);
        putchar('\n');
    }
    return 0;
}
/*---------------------------------------------------------------------------*/
static int numa_node_to_cpusmask(int node, uint64_t *cpusmask, int *nr)
{
    struct bitmask *mask;
    uint64_t bmask = 0;
    int retval = -1;
    unsigned int i;

    mask = numa_allocate_cpumask();
    retval = numa_node_to_cpus(node, mask);
    if (retval < 0)
        goto cleanup;

    *nr = 0;
    for (i = 0; i < mask->size && i < 64; i++) {
        if (numa_bitmask_isbitset(mask, i)) {
            cpusmask_set_bit(i, &bmask);
            (*nr)++;
        }
    }

    retval = 0;
cleanup:
    *cpusmask = bmask;

    numa_free_cpumask(mask);
    return retval;
}
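A short, hypothetical caller for the helper above (assuming it is visible in the same translation unit):

#include <inttypes.h>
#include <stdio.h>

/* Hypothetical caller: report the CPU count and the 64-bit CPU mask of
 * NUMA node 0 as computed by numa_node_to_cpusmask() above. */
static void report_node0_cpusmask(void)
{
    uint64_t mask = 0;
    int ncpus = 0;

    if (numa_node_to_cpusmask(0, &mask, &ncpus) == 0)
        printf("node 0: %d cpus, mask 0x%016" PRIx64 "\n", ncpus, mask);
}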
int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                   void *(*start_routine) (void *), void *arg)
{
    int core;
    int ret;
    cpu_set_t mask;
    CPU_ZERO(&mask);

    ret = old_pthread_create(thread, attr, start_routine, arg);
    if (!get_shm()->active)
        return ret;

    core = get_next_core();
    if (!get_shm()->per_node) {
        CPU_SET(core, &mask);
    } else {
        int i, node = numa_node_of_cpu(core);
        struct bitmask *bmp = numa_allocate_cpumask();
        numa_node_to_cpus(node, bmp);
        for (i = 0; i < numa_num_configured_cpus(); i++) {
            if (numa_bitmask_isbitset(bmp, i))
                CPU_SET(i, &mask);
        }
        numa_free_cpumask(bmp);
    }
    old_pthread_setaffinity_np(*thread, sizeof(mask), &mask);

    VERBOSE("-> Set affinity to %d\n", core);

    return ret;
}
int
virNumaGetNodeCPUs(int node, virBitmapPtr *cpus)
{
    unsigned long *mask = NULL;
    unsigned long *allonesmask = NULL;
    virBitmapPtr cpumap = NULL;
    int ncpus = 0;
    int max_n_cpus = virNumaGetMaxCPUs();
    int mask_n_bytes = max_n_cpus / 8;
    size_t i;
    int ret = -1;

    *cpus = NULL;

    if (VIR_ALLOC_N(mask, mask_n_bytes / sizeof(*mask)) < 0)
        goto cleanup;

    if (VIR_ALLOC_N(allonesmask, mask_n_bytes / sizeof(*mask)) < 0)
        goto cleanup;

    memset(allonesmask, 0xff, mask_n_bytes);

    /* The first time this returns -1, ENOENT if node doesn't exist... */
    if (numa_node_to_cpus(node, mask, mask_n_bytes) < 0) {
        VIR_WARN("NUMA topology for cell %d is not available, ignoring", node);
        ret = -2;
        goto cleanup;
    }

    /* second, third... times it returns an all-1's mask */
    if (memcmp(mask, allonesmask, mask_n_bytes) == 0) {
        VIR_DEBUG("NUMA topology for cell %d is invalid, ignoring", node);
        ret = -2;
        goto cleanup;
    }

    if (!(cpumap = virBitmapNew(max_n_cpus)))
        goto cleanup;

    for (i = 0; i < max_n_cpus; i++) {
        if (MASK_CPU_ISSET(mask, i)) {
            ignore_value(virBitmapSetBit(cpumap, i));
            ncpus++;
        }
    }

    *cpus = cpumap;
    cpumap = NULL;
    ret = ncpus;

 cleanup:
    VIR_FREE(mask);
    VIR_FREE(allonesmask);
    virBitmapFree(cpumap);
    return ret;
}
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_nodeToCpus
  (JNIEnv *env, jobject obj, jint node, jlongArray array)
{
    unsigned long *buf = (unsigned long *)
        (*env)->GetPrimitiveArrayCritical(env, (jarray) array, 0);
    int len = (int) (*env)->GetArrayLength(env, array);
    int ret = numa_node_to_cpus((int) node, buf, len * 8);
    (*env)->ReleasePrimitiveArrayCritical(env, (jarray) array, buf, 0);
    if (ret != 0)
        throwException(env, obj, errno);
}
/**
 * \brief get an array of cores with a certain placement
 */
static coreid_t* placement(uint32_t n, bool do_fill)
{
    coreid_t* result = malloc(sizeof(coreid_t)*n);
    uint32_t numa_nodes = numa_max_node()+1;
    uint32_t num_cores = numa_num_configured_cpus();
    struct bitmask* nodes[numa_nodes];

    for (int i = 0; i < numa_nodes; i++) {
        nodes[i] = numa_allocate_cpumask();
        numa_node_to_cpus(i, nodes[i]);
    }

    int num_taken = 0;
    if (numa_available() == 0) {
        if (do_fill) {
            for (int i = 0; i < numa_nodes; i++) {
                for (int j = 0; j < num_cores; j++) {
                    if (numa_bitmask_isbitset(nodes[i], j)) {
                        result[num_taken] = j;
                        num_taken++;
                    }
                    if (num_taken == n) {
                        return result;
                    }
                }
            }
        } else {
            uint8_t ith_of_node = 0;
            // go through numa nodes
            for (int i = 0; i < numa_nodes; i++) {
                // go through cores and see if part of numa node
                for (int j = 0; j < num_cores; j++) {
                    // take the ith core of the node
                    if (numa_bitmask_isbitset(nodes[i], j)) {
                        int index = i+ith_of_node*numa_nodes;
                        if (index < n) {
                            result[i+ith_of_node*numa_nodes] = j;
                            num_taken++;
                            ith_of_node++;
                        }
                    }
                    if (num_taken == n) {
                        return result;
                    }
                }
                ith_of_node = 0;
            }
        }
    } else {
        printf("Libnuma not available \n");
        return NULL;
    }
    return NULL;
}
static void regular_nodes_init(void)
{
    int i, nodes_num = numa_num_configured_nodes();
    struct bitmask *node_cpus = numa_allocate_cpumask();

    regular_nodes_mask = numa_allocate_nodemask();

    for (i = 0; i < nodes_num; i++) {
        numa_node_to_cpus(i, node_cpus);
        if (numa_bitmask_weight(node_cpus))
            numa_bitmask_setbit(regular_nodes_mask, i);
    }
    numa_bitmask_free(node_cpus);
}
void print_node_cpus(int node)
{
    int i, err;
    struct bitmask *cpus;

    cpus = numa_allocate_cpumask();
    err = numa_node_to_cpus(node, cpus);
    if (err >= 0) {
        for (i = 0; i < cpus->size; i++)
            if (numa_bitmask_isbitset(cpus, i))
                printf(" %d", i);
    }
    putchar('\n');
}
/// This function tries to fill the bandwidth array based on knowledge about known CPU models
static int fill_bandwidth_values_heuristically(int* bandwidth, int bandwidth_len)
{
    int ret = MEMKIND_ERROR_UNAVAILABLE; // Default error returned if heuristic approach fails
    int i, nodes_num, memory_only_nodes_num = 0;
    struct bitmask *memory_only_nodes, *node_cpus;

    if (is_cpu_xeon_phi_x200() == 0) {
        log_info("Known CPU model detected: Intel(R) Xeon Phi(TM) x200.");
        nodes_num = numa_num_configured_nodes();

        // Check if number of numa-nodes meets expectations for
        // supported configurations of Intel Xeon Phi x200
        if (nodes_num != 2 && nodes_num != 4 && nodes_num != 8) {
            return ret;
        }

        memory_only_nodes = numa_allocate_nodemask();
        node_cpus = numa_allocate_cpumask();

        for (i = 0; i < nodes_num; i++) {
            numa_node_to_cpus(i, node_cpus);
            if (numa_bitmask_weight(node_cpus) == 0) {
                memory_only_nodes_num++;
                numa_bitmask_setbit(memory_only_nodes, i);
            }
        }

        // Check if the number of memory-only nodes equals the number of memory+cpu nodes.
        // If so, change ret to 0 (success) and fill the bandwidth table.
        if (memory_only_nodes_num == (nodes_num - memory_only_nodes_num)) {
            ret = 0;
            assign_arbitrary_bandwidth_values(bandwidth, bandwidth_len, memory_only_nodes);
        }

        numa_bitmask_free(memory_only_nodes);
        numa_bitmask_free(node_cpus);
    }

    return ret;
}
static int _get_cpu_masks(int num_numa_nodes, int32_t *numa_array,
                          cpu_set_t **cpuMasks)
{
    struct bitmask **remaining_numa_node_cpus = NULL, *collective;
    unsigned long **numa_node_cpus = NULL;
    int i, j, at_least_one_cpu = 0, rc = 0;
    cpu_set_t *cpusetptr;
    char *bitmask_str = NULL;

    if (numa_available()) {
        CRAY_ERR("Libnuma not available");
        return -1;
    }

    /*
     * numa_node_cpus: The CPUs available to the NUMA node.
     * numa_all_cpus_ptr: all CPUs on which the calling task may execute.
     * remaining_numa_node_cpus: Bitwise-AND of the above two to get all of
     *                           the CPUs that the task can run on in this
     *                           NUMA node.
     * collective: Collects all of the CPUs as a precaution.
     */
    remaining_numa_node_cpus = xmalloc(num_numa_nodes *
                                       sizeof(struct bitmask *));
    collective = numa_allocate_cpumask();
    numa_node_cpus = xmalloc(num_numa_nodes * sizeof(unsigned long *));
    for (i = 0; i < num_numa_nodes; i++) {
        remaining_numa_node_cpus[i] = numa_allocate_cpumask();
        numa_node_cpus[i] = xmalloc(sizeof(unsigned long) *
                                    NUM_INTS_TO_HOLD_ALL_CPUS);
        rc = numa_node_to_cpus(numa_array[i], numa_node_cpus[i],
                               NUM_INTS_TO_HOLD_ALL_CPUS);
        if (rc) {
            CRAY_ERR("numa_node_to_cpus failed: Return code %d", rc);
        }
        for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) {
            (remaining_numa_node_cpus[i]->maskp[j]) =
                (numa_node_cpus[i][j]) & (numa_all_cpus_ptr->maskp[j]);
            collective->maskp[j] |= (remaining_numa_node_cpus[i]->maskp[j]);
        }
    }

    /*
     * Ensure that we have not masked off all of the CPUs.
     * If we have, just re-enable them all. Better to clear them all than
     * none of them.
     */
    for (j = 0; j < collective->size; j++) {
        if (numa_bitmask_isbitset(collective, j)) {
            at_least_one_cpu = 1;
        }
    }
    if (!at_least_one_cpu) {
        for (i = 0; i < num_numa_nodes; i++) {
            for (j = 0;
                 j < (remaining_numa_node_cpus[i]->size /
                      (sizeof(unsigned long) * 8));
                 j++) {
                (remaining_numa_node_cpus[i]->maskp[j]) =
                    (numa_all_cpus_ptr->maskp[j]);
            }
        }
    }

    if (debug_flags & DEBUG_FLAG_TASK) {
        bitmask_str = NULL;
        for (i = 0; i < num_numa_nodes; i++) {
            for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) {
                xstrfmtcat(bitmask_str, "%6lx ", numa_node_cpus[i][j]);
            }
        }
        info("%sBitmask: Allowed CPUs for NUMA Node", bitmask_str);
        xfree(bitmask_str);
        bitmask_str = NULL;

        for (i = 0; i < num_numa_nodes; i++) {
            for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) {
                xstrfmtcat(bitmask_str, "%6lx ", numa_all_cpus_ptr->maskp[j]);
            }
        }
        info("%sBitmask: Allowed CPUs for cpuset", bitmask_str);
        xfree(bitmask_str);
        bitmask_str = NULL;

        for (i = 0; i < num_numa_nodes; i++) {
            for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) {
                xstrfmtcat(bitmask_str, "%6lx ",
                           remaining_numa_node_cpus[i]->maskp[j]);
            }
        }
        info("%sBitmask: Allowed CPUs between cpuset and NUMA Node",
             bitmask_str);
        xfree(bitmask_str);
    }

    // Convert bitmasks to cpu_set_t types
    cpusetptr = xmalloc(num_numa_nodes * sizeof(cpu_set_t));

    for (i = 0; i < num_numa_nodes; i++) {
        CPU_ZERO(&cpusetptr[i]);
        for (j = 0; j < remaining_numa_node_cpus[i]->size; j++) {
            if (numa_bitmask_isbitset(remaining_numa_node_cpus[i], j)) {
                CPU_SET(j, &cpusetptr[i]);
            }
        }
        if (debug_flags & DEBUG_FLAG_TASK) {
            info("CPU_COUNT() of set: %d", CPU_COUNT(&cpusetptr[i]));
        }
    }

    *cpuMasks = cpusetptr;

    // Freeing Everything
    numa_free_cpumask(collective);
    for (i = 0; i < num_numa_nodes; i++) {
        xfree(numa_node_cpus[i]);
        numa_free_cpumask(remaining_numa_node_cpus[i]);
    }
    xfree(numa_node_cpus);
    xfree(remaining_numa_node_cpus);

    return 0;
}
int nodeCapsInitNUMA(virCapsPtr caps)
{
    int n;
    unsigned long *mask = NULL;
    unsigned long *allonesmask = NULL;
    int *cpus = NULL;
    int ret = -1;
    int max_n_cpus = NUMA_MAX_N_CPUS;

    if (numa_available() < 0)
        return 0;

    int mask_n_bytes = max_n_cpus / 8;

    if (VIR_ALLOC_N(mask, mask_n_bytes / sizeof *mask) < 0)
        goto cleanup;

    if (VIR_ALLOC_N(allonesmask, mask_n_bytes / sizeof *mask) < 0)
        goto cleanup;

    memset(allonesmask, 0xff, mask_n_bytes);

    for (n = 0; n <= numa_max_node(); n++) {
        int i;
        int ncpus;
        /* The first time this returns -1, ENOENT if node doesn't exist... */
        if (numa_node_to_cpus(n, mask, mask_n_bytes) < 0) {
            VIR_WARN("NUMA topology for cell %d of %d not available, ignoring",
                     n, numa_max_node()+1);
            continue;
        }
        /* second, third... times it returns an all-1's mask */
        if (memcmp(mask, allonesmask, mask_n_bytes) == 0) {
            VIR_DEBUG("NUMA topology for cell %d of %d is all ones, ignoring",
                      n, numa_max_node()+1);
            continue;
        }

        for (ncpus = 0, i = 0; i < max_n_cpus; i++)
            if (MASK_CPU_ISSET(mask, i))
                ncpus++;

        if (VIR_ALLOC_N(cpus, ncpus) < 0)
            goto cleanup;

        for (ncpus = 0, i = 0; i < max_n_cpus; i++)
            if (MASK_CPU_ISSET(mask, i))
                cpus[ncpus++] = i;

        if (virCapabilitiesAddHostNUMACell(caps, n, ncpus, cpus) < 0)
            goto cleanup;

        VIR_FREE(cpus);
    }

    ret = 0;

 cleanup:
    VIR_FREE(cpus);
    VIR_FREE(mask);
    VIR_FREE(allonesmask);
    return ret;
}
int init_virtual_topology(config_t* cfg, cpu_model_t* cpu_model, virtual_topology_t** virtual_topologyp)
{
    char* mc_pci_file;
    char* str;
    char* saveptr;
    char* token = "NULL";
    int* physical_node_ids;
    physical_node_t** physical_nodes;
    int num_physical_nodes;
    int n, v, i, j, sibling_idx, node_i_idx;
    int node_id;
    physical_node_t* node_i, *node_j, *sibling_node;
    int ret;
    int min_distance;
    int hyperthreading;
    struct bitmask* mem_nodes;
    virtual_topology_t* virtual_topology;

    __cconfig_lookup_string(cfg, "topology.physical_nodes", &str);

    // parse the physical nodes string
    physical_node_ids = calloc(numa_num_possible_nodes(), sizeof(*physical_node_ids));
    num_physical_nodes = 0;
    while (token = strtok_r(str, ",", &saveptr)) {
        physical_node_ids[num_physical_nodes] = atoi(token);
        str = NULL;
        if (++num_physical_nodes > numa_num_possible_nodes()) {
            // we're being asked to run on more nodes than available
            free(physical_node_ids);
            ret = E_ERROR;
            goto done;
        }
    }
    physical_nodes = calloc(num_physical_nodes, sizeof(*physical_nodes));

    // select those nodes we can run on (e.g. not constrained by any numactl)
    mem_nodes = numa_get_mems_allowed();
    for (i = 0, n = 0; i < num_physical_nodes; i++) {
        node_id = physical_node_ids[i];
        if (numa_bitmask_isbitset(mem_nodes, node_id)) {
            physical_nodes[n] = malloc(sizeof(**physical_nodes));
            physical_nodes[n]->node_id = node_id;
            // TODO: what if we want to avoid using only a single hardware context of a hyperthreaded core?
            physical_nodes[n]->cpu_bitmask = numa_allocate_cpumask();
            numa_node_to_cpus(node_id, physical_nodes[n]->cpu_bitmask);
            __cconfig_lookup_bool(cfg, "topology.hyperthreading", &hyperthreading);
            if (hyperthreading) {
                physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask);
            } else {
                DBG_LOG(INFO, "Not using hyperthreading.\n");
                // disable the upper half of the processors in the bitmask
                physical_nodes[n]->num_cpus = num_cpus(physical_nodes[n]->cpu_bitmask) / 2;
                int fc = first_cpu(physical_nodes[n]->cpu_bitmask);
                for (j = fc + system_num_cpus()/2;
                     j < fc + system_num_cpus()/2 + physical_nodes[n]->num_cpus; j++) {
                    if (numa_bitmask_isbitset(physical_nodes[n]->cpu_bitmask, j)) {
                        numa_bitmask_clearbit(physical_nodes[n]->cpu_bitmask, j);
                    }
                }
            }
            n++;
        }
    }
    free(physical_node_ids);
    num_physical_nodes = n;

    // if pci bus topology of each physical node is not provided then discover it
    if (__cconfig_lookup_string(cfg, "topology.mc_pci", &mc_pci_file) == CONFIG_FALSE ||
        (__cconfig_lookup_string(cfg, "topology.mc_pci", &mc_pci_file) == CONFIG_TRUE &&
         load_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes) != E_SUCCESS))
    {
        discover_mc_pci_topology(cpu_model, physical_nodes, num_physical_nodes);
        save_mc_pci_topology(mc_pci_file, physical_nodes, num_physical_nodes);
    }

    // form virtual nodes by grouping physical nodes that are close to each other
    virtual_topology = malloc(sizeof(*virtual_topology));
    virtual_topology->num_virtual_nodes = num_physical_nodes / 2 + num_physical_nodes % 2;
    virtual_topology->virtual_nodes = calloc(virtual_topology->num_virtual_nodes,
                                             sizeof(*(virtual_topology->virtual_nodes)));

    for (i = 0, v = 0; i < num_physical_nodes; i++) {
        min_distance = INT_MAX;
        sibling_node = NULL;
        sibling_idx = -1;
        if ((node_i = physical_nodes[i]) == NULL) {
            continue;
        }
        for (j = i+1; j < num_physical_nodes; j++) {
            if ((node_j = physical_nodes[j]) == NULL) {
                continue;
            }
            if (numa_distance(node_i->node_id, node_j->node_id) < min_distance) {
                min_distance = numa_distance(node_i->node_id, node_j->node_id);
                sibling_node = node_j;
                sibling_idx = j;
            }
        }
        if (sibling_node) {
            physical_nodes[i] = physical_nodes[sibling_idx] = NULL;
            virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v];
            virtual_node->dram_node = node_i;
            virtual_node->nvram_node = sibling_node;
            virtual_node->node_id = v;
            virtual_node->cpu_model = cpu_model;
            DBG_LOG(INFO, "Fusing physical nodes %d %d into virtual node %d\n",
                    node_i->node_id, sibling_node->node_id, virtual_node->node_id);
            v++;
        }
    }

    // any physical node that is not paired with another physical node is
    // formed into a virtual node on its own
    if (2*v < num_physical_nodes) {
        for (i = 0; i < num_physical_nodes; i++) {
            if ((node_i = physical_nodes[i]) == NULL) {
                continue;
            }
            virtual_node_t* virtual_node = &virtual_topology->virtual_nodes[v];
            virtual_node->dram_node = virtual_node->nvram_node = node_i;
            virtual_node->node_id = v;
            DBG_LOG(WARNING, "Forming physical node %d into virtual node %d without a sibling node.\n",
                    node_i->node_id, virtual_node->node_id);
            v++;
        }
    }

    *virtual_topologyp = virtual_topology;
    ret = E_SUCCESS;

done:
    free(physical_nodes);
    return ret;
}
char * build_default_affinity_string (int shuffle)
{
    int nr_nodes = numa_num_configured_nodes();
    int nr_cores = numa_num_configured_cpus();

    char * str;
    int str_size = 512;
    int str_written = 0;

    int i;
    struct bitmask ** bm = (struct bitmask**) malloc(sizeof(struct bitmask*) * nr_nodes);

    for (i = 0; i < nr_nodes; i++) {
        bm[i] = numa_allocate_cpumask();
        numa_node_to_cpus(i, bm[i]);
    }

    str = (char*) malloc(str_size * sizeof(char));
    assert(str);

    if (!shuffle) {
        for (i = 0; i < nr_nodes; i++) {
            int j;
            for (j = 0; j < nr_cores; j++) {
                if (numa_bitmask_isbitset(bm[i], j)) {
                    add_core_to_str(&str, &str_size, &str_written, j);
                }
            }
        }
    } else {
        int next_node = 0;
        for (i = 0; i < nr_cores; i++) {
            int idx = (i / nr_nodes) + 1;
            int found = 0;
            int j = 0;
            do {
                if (numa_bitmask_isbitset(bm[next_node], j)) {
                    found++;
                }
                if (found == idx) {
                    add_core_to_str(&str, &str_size, &str_written, j);
                    break;
                }
                j = (j + 1) % nr_cores;
            } while (found != idx);
            next_node = (next_node + 1) % nr_nodes;
        }
    }

    if (str_written) {
        str[str_written - 1] = 0;
    }

    return str;
}
int main(int argc, const char **argv)
{
    int num_cpus = numa_num_task_cpus();
    printf("num cpus: %d\n", num_cpus);

    printf("numa available: %d\n", numa_available());
    numa_set_localalloc();

    struct bitmask *bm = numa_bitmask_alloc(num_cpus);
    for (int i = 0; i <= numa_max_node(); ++i) {
        numa_node_to_cpus(i, bm);
        printf("numa node %d ", i);
        print_bitmask(bm);
        printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.));
    }
    numa_bitmask_free(bm);

    puts("");

    char *x;
    const size_t cache_line_size = 64;
    const size_t array_size = 100*1000*1000;
    size_t ntrips = 2;

#pragma omp parallel
    {
        assert(omp_get_num_threads() == num_cpus);
        int tid = omp_get_thread_num();

        pin_to_core(tid);
        if (tid == 0)
            x = (char *) numa_alloc_local(array_size);

        // {{{ single access
#pragma omp barrier
        for (size_t i = 0; i < num_cpus; ++i) {
            if (tid == i) {
                double t = measure_access(x, array_size, ntrips);
                printf("sequential core %d -> core 0 : BW %g MB/s\n",
                       i, array_size*ntrips*cache_line_size / t / 1e6);
            }
#pragma omp barrier
        }
        // }}}

        // {{{ everybody contends for one
        {
            if (tid == 0) puts("");

#pragma omp barrier
            double t = measure_access(x, array_size, ntrips);
#pragma omp barrier

            for (size_t i = 0; i < num_cpus; ++i) {
                if (tid == i)
                    printf("all-contention core %d -> core 0 : BW %g MB/s\n",
                           tid, array_size*ntrips*cache_line_size / t / 1e6);
#pragma omp barrier
            }
        }
        // }}}

        // {{{ zero and someone else contending
        if (tid == 0) puts("");

#pragma omp barrier
        for (size_t i = 1; i < num_cpus; ++i) {
            double t;
            if (tid == i || tid == 0)
                t = measure_access(x, array_size, ntrips);

#pragma omp barrier
            if (tid == 0) {
                printf("two-contention core %d -> core 0 : BW %g MB/s\n",
                       tid, array_size*ntrips*cache_line_size / t / 1e6);
            }
#pragma omp barrier
            if (tid == i) {
                printf("two-contention core %d -> core 0 : BW %g MB/s\n\n",
                       tid, array_size*ntrips*cache_line_size / t / 1e6);
            }
#pragma omp barrier
        }
    }
    numa_free(x, array_size);

    return 0;
}
void myhbwmalloc_init(void)
{
    /* set to NULL before trying to initialize. if we return before
     * successful creation of the mspace, then it will still be NULL,
     * and we can use that in subsequent library calls to determine
     * that the library failed to initialize. */
    myhbwmalloc_mspace = NULL;

    /* verbose printout? */
    myhbwmalloc_verbose = 0;
    {
        char * env_char = getenv("HBWMALLOC_VERBOSE");
        if (env_char != NULL) {
            myhbwmalloc_verbose = 1;
            printf("hbwmalloc: HBWMALLOC_VERBOSE set\n");
        }
    }

    /* fail hard or soft? */
    myhbwmalloc_hardfail = 1;
    {
        char * env_char = getenv("HBWMALLOC_SOFTFAIL");
        if (env_char != NULL) {
            myhbwmalloc_hardfail = 0;
            printf("hbwmalloc: HBWMALLOC_SOFTFAIL set\n");
        }
    }

    /* set the atexit handler that will destroy the mspace and free the numa allocation */
    atexit(myhbwmalloc_final);

    /* detect and configure use of NUMA memory nodes */
    {
        int max_possible_node    = numa_max_possible_node();
        int num_possible_nodes   = numa_num_possible_nodes();
        int max_numa_nodes       = numa_max_node();
        int num_configured_nodes = numa_num_configured_nodes();
        int num_configured_cpus  = numa_num_configured_cpus();
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_max_possible_node()    = %d\n", max_possible_node);
            printf("hbwmalloc: numa_num_possible_nodes()   = %d\n", num_possible_nodes);
            printf("hbwmalloc: numa_max_node()             = %d\n", max_numa_nodes);
            printf("hbwmalloc: numa_num_configured_nodes() = %d\n", num_configured_nodes);
            printf("hbwmalloc: numa_num_configured_cpus()  = %d\n", num_configured_cpus);
        }
        /* FIXME this is a hack. assumes HBW is only numa node 1. */
        if (num_configured_nodes <= 2) {
            myhbwmalloc_numa_node = num_configured_nodes - 1;
        } else {
            fprintf(stderr, "hbwmalloc: we support only 2 numa nodes, not %d\n",
                    num_configured_nodes);
        }

        if (myhbwmalloc_verbose) {
            for (int i = 0; i < num_configured_nodes; i++) {
                unsigned max_numa_cpus = numa_num_configured_cpus();
                struct bitmask * mask = numa_bitmask_alloc(max_numa_cpus);
                int rc = numa_node_to_cpus(i, mask);
                if (rc != 0) {
                    fprintf(stderr, "hbwmalloc: numa_node_to_cpus failed\n");
                } else {
                    printf("hbwmalloc: numa node %d cpu mask:", i);
                    for (unsigned j = 0; j < max_numa_cpus; j++) {
                        int bit = numa_bitmask_isbitset(mask, j);
                        printf(" %d", bit);
                    }
                    printf("\n");
                }
                numa_bitmask_free(mask);
            }
            fflush(stdout);
        }
    }

#if 0 /* unused */
    /* see if the user specifies a slab size */
    size_t slab_size_requested = 0;
    {
        char * env_char = getenv("HBWMALLOC_BYTES");
        if (env_char != NULL) {
            long units = 1L;
            if      ( NULL != strstr(env_char,"G") ) units = 1000000000L;
            else if ( NULL != strstr(env_char,"M") ) units = 1000000L;
            else if ( NULL != strstr(env_char,"K") ) units = 1000L;
            else                                     units = 1L;

            int num_count = strspn(env_char, "0123456789");
            memset(&env_char[num_count], ' ', strlen(env_char) - num_count);
            slab_size_requested = units * atol(env_char);
        }
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: requested slab_size_requested = %zu\n", slab_size_requested);
        }
    }
#endif

    /* see what libnuma says is available */
    size_t myhbwmalloc_slab_size;
    {
        int node = myhbwmalloc_numa_node;
        long long freemem;
        long long maxmem = numa_node_size64(node, &freemem);
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_node_size64 says maxmem=%lld freemem=%lld for numa node %d\n",
                   maxmem, freemem, node);
        }
        myhbwmalloc_slab_size = freemem;
    }

    /* assume threads, disable if MPI knows otherwise, then allow user to override. */
    int multithreaded = 1;
#ifdef HAVE_MPI
    int nprocs;
    {
        int is_init, is_final;
        MPI_Initialized(&is_init);
        MPI_Finalized(&is_final);
        if (is_init && !is_final) {
            MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
        }

        /* give equal portion to every MPI process */
        myhbwmalloc_slab_size /= nprocs;

        /* if the user initializes MPI with MPI_Init or
         * MPI_Init_thread(MPI_THREAD_SINGLE), they assert there
         * are no threads at all, which means we can skip the
         * malloc mspace lock.
         *
         * if the user lies to MPI, they deserve any bad thing
         * that comes of it. */
        int provided;
        MPI_Query_thread(&provided);
        if (provided == MPI_THREAD_SINGLE) {
            multithreaded = 0;
        } else {
            multithreaded = 1;
        }

        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: MPI processes = %d (threaded = %d)\n", nprocs, multithreaded);
            printf("hbwmalloc: myhbwmalloc_slab_size = %zu\n", myhbwmalloc_slab_size);
        }
    }
#endif

    /* user can assert that hbwmalloc and friends need not be thread-safe */
    {
        char * env_char = getenv("HBWMALLOC_LOCKLESS");
        if (env_char != NULL) {
            multithreaded = 0;
            if (myhbwmalloc_verbose) {
                printf("hbwmalloc: user has disabled locking in mspaces by setting HBWMALLOC_LOCKLESS\n");
            }
        }
    }

    myhbwmalloc_slab = numa_alloc_onnode(myhbwmalloc_slab_size, myhbwmalloc_numa_node);
    if (myhbwmalloc_slab == NULL) {
        fprintf(stderr, "hbwmalloc: numa_alloc_onnode returned NULL for size = %zu\n",
                myhbwmalloc_slab_size);
        return;
    } else {
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_alloc_onnode succeeded for size %zu\n",
                   myhbwmalloc_slab_size);
        }

        /* part (less than 128*sizeof(size_t) bytes) of this space is used for bookkeeping,
         * so the capacity must be at least this large */
        if (myhbwmalloc_slab_size < 128*sizeof(size_t)) {
            fprintf(stderr, "hbwmalloc: not enough space for mspace bookkeeping\n");
            return;
        }

        /* see above regarding if the user lies to MPI. */
        int locked = multithreaded;

        myhbwmalloc_mspace = create_mspace_with_base(myhbwmalloc_slab, myhbwmalloc_slab_size, locked);
        if (myhbwmalloc_mspace == NULL) {
            fprintf(stderr, "hbwmalloc: create_mspace_with_base returned NULL\n");
            return;
        } else if (myhbwmalloc_verbose) {
            printf("hbwmalloc: create_mspace_with_base succeeded for size %zu\n",
                   myhbwmalloc_slab_size);
        }
    }
}
static uint32_t* placement(uint32_t n, bool do_fill, bool hyper)
{
    uint32_t* result = (uint32_t*) malloc(sizeof(uint32_t)*n);
    uint32_t numa_nodes = numa_max_node()+1;
    uint32_t num_cores = 0;
    if (hyper) {
        num_cores = numa_num_configured_cpus()/2;
    } else {
        num_cores = numa_num_configured_cpus();
    }

    struct bitmask* nodes[numa_nodes];

    for (int i = 0; i < numa_nodes; i++) {
        nodes[i] = numa_allocate_cpumask();
        numa_node_to_cpus(i, nodes[i]);
    }

    int num_taken = 0;
    if (numa_available() == 0) {
        if (do_fill) {
            for (int i = 0; i < numa_nodes; i++) {
                for (int j = 0; j < num_cores; j++) {
                    if (numa_bitmask_isbitset(nodes[i], j)) {
                        result[num_taken] = j;
                        num_taken++;
                    }
                    if (num_taken == n) {
                        return result;
                    }
                }
            }
        } else {
            int cores_per_node = n/numa_nodes;
            int rest = n - (cores_per_node*numa_nodes);
            int taken_per_node = 0;

            for (int i = 0; i < numa_nodes; i++) {
                for (int j = 0; j < num_cores; j++) {
                    if (numa_bitmask_isbitset(nodes[i], j)) {
                        if (taken_per_node == cores_per_node) {
                            if (rest > 0) {
                                result[num_taken] = j;
                                num_taken++;
                                rest--;
                                if (num_taken == n) {
                                    return result;
                                }
                            }
                            break;
                        }
                        result[num_taken] = j;
                        num_taken++;
                        taken_per_node++;
                        if (num_taken == n) {
                            return result;
                        }
                    }
                }
                taken_per_node = 0;
            }
        }
    } else {
        printf("Libnuma not available \n");
        return NULL;
    }
    return NULL;
}
/**
 * @brief Returns an array of cores of size req_cores chosen
 * round-robin from NUMA nodes in batches of req_step.
 *
 * @param req_step The step width - how many cores should be picked
 * from each NUMA node in each iteration. Use a negative value
 * for a "fill"-strategy, where NUMA nodes are completely filled
 * before moving on to the next one.
 */
void placement(size_t req_cores, size_t req_step, coreid_t *cores)
{
    // For convenience, allows to lookup 2*n for n in 0..n/2
    if (req_step == 0)
        req_step = 1;

    size_t max_node = numa_max_node();
    size_t num_cores = numa_num_configured_cpus();
    size_t cores_per_node = num_cores/(max_node+1);

    printf("req_cores: %zu\n", req_cores);
    printf("req_step: %zu\n", req_step);
    printf("cores / NUMA node: %zu\n", cores_per_node);
    printf("max_node: %zu\n", max_node);

    size_t num_selected = 0;
    size_t curr_numa_idx = 0;

    // How many cores to choose from each NUMA node
    size_t choose_per_node[max_node+1];
    memset(choose_per_node, 0, sizeof(size_t)*(max_node+1));

    // Step 1:
    // Figure out how many cores to choose from each node
    while (num_selected < req_cores) {
        // How many cores should be chosen in this step?
        // At most req_step, bounded by what is still needed and by how
        // many cores the current node can still contribute.
        size_t num_choose = min(min(req_step, req_cores-num_selected),
                                cores_per_node-choose_per_node[curr_numa_idx]);

        // Increment counter indicating how many to choose from this node
        choose_per_node[curr_numa_idx] += num_choose;
        num_selected += num_choose;

        // Move on to the next NUMA node
        curr_numa_idx = (curr_numa_idx + 1) % (max_node+1);
    }

    // Step 2:
    // Get the cores from each NUMA node
    //
    // hyperthreads? -> should have higher core IDs, and hence picked in
    // the end.
    struct bitmask *mask = numa_allocate_cpumask();
    size_t idx = 0;
    for (size_t i = 0; i <= max_node; i++) {
        dbg_printf("node %2zu choosing %2zu\n", i, choose_per_node[i]);

        // Determine which cores are on node i
        numa_node_to_cpus(i, mask);

        size_t chosen = 0;
        for (coreid_t p = 0; p < num_cores && chosen < choose_per_node[i]; p++) {
            // Is processor p on node i?
            if (numa_bitmask_isbitset(mask, p)) {
                cores[idx++] = p;
                chosen++;
                dbg_printf("Choosing %" PRIuCOREID " on node %zu\n", p, i);
            }
        }
    }

    assert(idx == req_cores);
}
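A brief usage sketch (illustrative only; it assumes the coreid_t type and the placement() function shown above, and the core count of 8 is arbitrary):

#include <stdio.h>

/* Illustrative caller: pick 8 cores, 2 per NUMA node per round-robin pass,
 * then print which core each slot was assigned. */
static void show_placement(void)
{
    coreid_t cores[8];

    placement(8, 2, cores);
    for (size_t i = 0; i < 8; i++)
        printf("slot %zu -> core %u\n", i, (unsigned) cores[i]);
}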