static void memkind_hbw_closest_numanode_init(void)
{
    struct memkind_hbw_closest_numanode_t *g = &memkind_hbw_closest_numanode_g;
    int *bandwidth = NULL;
    int num_unique = 0;
    int high_bandwidth = 0;
    int node;
    struct bandwidth_nodes_t *bandwidth_nodes = NULL;
    char *hbw_nodes_env;
    struct bitmask *hbw_nodes_bm;

    g->num_cpu = numa_num_configured_cpus();
    g->closest_numanode = (int *)je_malloc(sizeof(int) * g->num_cpu);
    bandwidth = (int *)je_malloc(sizeof(int) * NUMA_NUM_NODES);
    if (!(g->closest_numanode && bandwidth)) {
        g->init_err = MEMKIND_ERROR_MALLOC;
    }
    if (!g->init_err) {
        /* MEMKIND_HBW_NODES overrides bandwidth detection: listed nodes get
         * bandwidth 2, all others get 1. */
        hbw_nodes_env = getenv("MEMKIND_HBW_NODES");
        if (hbw_nodes_env) {
            hbw_nodes_bm = numa_parse_nodestring(hbw_nodes_env);
            if (!hbw_nodes_bm) {
                g->init_err = MEMKIND_ERROR_ENVIRON;
            }
            else {
                for (node = 0; node < NUMA_NUM_NODES; ++node) {
                    if (numa_bitmask_isbitset(hbw_nodes_bm, node)) {
                        bandwidth[node] = 2;
                    }
                    else {
                        bandwidth[node] = 1;
                    }
                }
                numa_bitmask_free(hbw_nodes_bm);
            }
        }
        else {
            g->init_err = parse_node_bandwidth(NUMA_NUM_NODES, bandwidth,
                                               MEMKIND_BANDWIDTH_PATH);
        }
    }
    if (!g->init_err) {
        g->init_err = create_bandwidth_nodes(NUMA_NUM_NODES, bandwidth,
                                             &num_unique, &bandwidth_nodes);
    }
    if (!g->init_err) {
        /* A single bandwidth class means there are no distinct
         * high-bandwidth nodes on this system. */
        if (num_unique == 1) {
            g->init_err = MEMKIND_ERROR_UNAVAILABLE;
        }
    }
    if (!g->init_err) {
        high_bandwidth = bandwidth_nodes[num_unique - 1].bandwidth;
        g->init_err = set_closest_numanode(num_unique, bandwidth_nodes,
                                           high_bandwidth, g->num_cpu,
                                           g->closest_numanode);
    }
    if (bandwidth_nodes) {
        je_free(bandwidth_nodes);
    }
    if (bandwidth) {
        je_free(bandwidth);
    }
    if (g->init_err) {
        if (g->closest_numanode) {
            je_free(g->closest_numanode);
            g->closest_numanode = NULL;
        }
    }
}
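/* A minimal sketch (not part of the original sources) of how the per-CPU
 * table built above can be consumed: look up the closest high-bandwidth
 * node for the CPU the calling thread runs on.  The accessor name and the
 * use of MEMKIND_ERROR_RUNTIME here are assumptions for illustration. */
#include <sched.h>

static int memkind_hbw_get_closest_numanode(int *node)
{
    struct memkind_hbw_closest_numanode_t *g = &memkind_hbw_closest_numanode_g;
    int cpu = sched_getcpu();              /* CPU the calling thread runs on */

    if (g->init_err) {
        return g->init_err;                /* the table was never built */
    }
    if (cpu < 0 || cpu >= g->num_cpu) {
        return MEMKIND_ERROR_RUNTIME;      /* defensive range check */
    }
    *node = g->closest_numanode[cpu];
    return 0;
}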
void myhbwmalloc_init(void)
{
    /* set to NULL before trying to initialize.  if we return before
     * successful creation of the mspace, then it will still be NULL,
     * and we can use that in subsequent library calls to determine
     * that the library failed to initialize. */
    myhbwmalloc_mspace = NULL;

    /* verbose printout? */
    myhbwmalloc_verbose = 0;
    {
        char *env_char = getenv("HBWMALLOC_VERBOSE");
        if (env_char != NULL) {
            myhbwmalloc_verbose = 1;
            printf("hbwmalloc: HBWMALLOC_VERBOSE set\n");
        }
    }

    /* fail hard or soft? */
    myhbwmalloc_hardfail = 1;
    {
        char *env_char = getenv("HBWMALLOC_SOFTFAIL");
        if (env_char != NULL) {
            myhbwmalloc_hardfail = 0;
            printf("hbwmalloc: HBWMALLOC_SOFTFAIL set\n");
        }
    }

    /* set the atexit handler that will destroy the mspace and free the numa allocation */
    atexit(myhbwmalloc_final);

    /* detect and configure use of NUMA memory nodes */
    {
        int max_possible_node    = numa_max_possible_node();
        int num_possible_nodes   = numa_num_possible_nodes();
        int max_numa_nodes       = numa_max_node();
        int num_configured_nodes = numa_num_configured_nodes();
        int num_configured_cpus  = numa_num_configured_cpus();
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_max_possible_node()    = %d\n", max_possible_node);
            printf("hbwmalloc: numa_num_possible_nodes()   = %d\n", num_possible_nodes);
            printf("hbwmalloc: numa_max_node()             = %d\n", max_numa_nodes);
            printf("hbwmalloc: numa_num_configured_nodes() = %d\n", num_configured_nodes);
            printf("hbwmalloc: numa_num_configured_cpus()  = %d\n", num_configured_cpus);
        }
        /* FIXME this is a hack.  assumes HBW is only numa node 1. */
        if (num_configured_nodes <= 2) {
            myhbwmalloc_numa_node = num_configured_nodes - 1;
        } else {
            fprintf(stderr, "hbwmalloc: we support only 2 numa nodes, not %d\n",
                    num_configured_nodes);
        }

        if (myhbwmalloc_verbose) {
            for (int i = 0; i < num_configured_nodes; i++) {
                unsigned max_numa_cpus = numa_num_configured_cpus();
                struct bitmask *mask = numa_bitmask_alloc(max_numa_cpus);
                int rc = numa_node_to_cpus(i, mask);
                if (rc != 0) {
                    fprintf(stderr, "hbwmalloc: numa_node_to_cpus failed\n");
                } else {
                    printf("hbwmalloc: numa node %d cpu mask:", i);
                    for (unsigned j = 0; j < max_numa_cpus; j++) {
                        int bit = numa_bitmask_isbitset(mask, j);
                        printf(" %d", bit);
                    }
                    printf("\n");
                }
                numa_bitmask_free(mask);
            }
            fflush(stdout);
        }
    }

#if 0 /* unused */
    /* see if the user specifies a slab size */
    size_t slab_size_requested = 0;
    {
        char *env_char = getenv("HBWMALLOC_BYTES");
        if (env_char != NULL) {
            long units = 1L;
            if      (NULL != strstr(env_char, "G")) units = 1000000000L;
            else if (NULL != strstr(env_char, "M")) units = 1000000L;
            else if (NULL != strstr(env_char, "K")) units = 1000L;
            else                                    units = 1L;
            int num_count = strspn(env_char, "0123456789");
            memset(&env_char[num_count], ' ', strlen(env_char) - num_count);
            slab_size_requested = units * atol(env_char);
        }
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: requested slab_size_requested = %zu\n", slab_size_requested);
        }
    }
#endif

    /* see what libnuma says is available */
    size_t myhbwmalloc_slab_size;
    {
        int node = myhbwmalloc_numa_node;
        long long freemem;
        long long maxmem = numa_node_size64(node, &freemem);
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_node_size64 says maxmem=%lld freemem=%lld for numa node %d\n",
                   maxmem, freemem, node);
        }
        myhbwmalloc_slab_size = freemem;
    }

    /* assume threads, disable if MPI knows otherwise, then allow user to override. */
    int multithreaded = 1;
#ifdef HAVE_MPI
    int nprocs = 1; /* initialized so the division below is safe when MPI is not active */
    {
        int is_init, is_final;
        MPI_Initialized(&is_init);
        MPI_Finalized(&is_final);
        if (is_init && !is_final) {
            MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

            /* give equal portion to every MPI process */
            myhbwmalloc_slab_size /= nprocs;

            /* if the user initializes MPI with MPI_Init or
             * MPI_Init_thread(MPI_THREAD_SINGLE), they assert there
             * are no threads at all, which means we can skip the
             * malloc mspace lock.
             *
             * if the user lies to MPI, they deserve any bad thing
             * that comes of it. */
            int provided;
            MPI_Query_thread(&provided);
            if (provided == MPI_THREAD_SINGLE) {
                multithreaded = 0;
            } else {
                multithreaded = 1;
            }
        }
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: MPI processes = %d (threaded = %d)\n", nprocs, multithreaded);
            printf("hbwmalloc: myhbwmalloc_slab_size = %zu\n", myhbwmalloc_slab_size);
        }
    }
#endif

    /* user can assert that hbwmalloc and friends need not be thread-safe */
    {
        char *env_char = getenv("HBWMALLOC_LOCKLESS");
        if (env_char != NULL) {
            multithreaded = 0;
            if (myhbwmalloc_verbose) {
                printf("hbwmalloc: user has disabled locking in mspaces by setting HBWMALLOC_LOCKLESS\n");
            }
        }
    }

    myhbwmalloc_slab = numa_alloc_onnode(myhbwmalloc_slab_size, myhbwmalloc_numa_node);
    if (myhbwmalloc_slab == NULL) {
        fprintf(stderr, "hbwmalloc: numa_alloc_onnode returned NULL for size = %zu\n",
                myhbwmalloc_slab_size);
        return;
    } else {
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_alloc_onnode succeeded for size %zu\n",
                   myhbwmalloc_slab_size);
        }

        /* part (less than 128*sizeof(size_t) bytes) of this space is used for bookkeeping,
         * so the capacity must be at least this large */
        if (myhbwmalloc_slab_size < 128 * sizeof(size_t)) {
            fprintf(stderr, "hbwmalloc: not enough space for mspace bookkeeping\n");
            return;
        }

        /* see above regarding if the user lies to MPI. */
        int locked = multithreaded;
        myhbwmalloc_mspace = create_mspace_with_base(myhbwmalloc_slab,
                                                     myhbwmalloc_slab_size, locked);
        if (myhbwmalloc_mspace == NULL) {
            fprintf(stderr, "hbwmalloc: create_mspace_with_base returned NULL\n");
            return;
        } else if (myhbwmalloc_verbose) {
            printf("hbwmalloc: create_mspace_with_base succeeded for size %zu\n",
                   myhbwmalloc_slab_size);
        }
    }
}
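/* A hedged sketch (not from the original sources) of a malloc-style entry
 * point on top of the mspace created above.  mspace_malloc() belongs to the
 * same dlmalloc mspace API as create_mspace_with_base(); falling back to
 * plain malloc() on soft fail is one plausible reading of
 * myhbwmalloc_hardfail, not a confirmed policy. */
void *myhbwmalloc_malloc(size_t size)
{
    if (myhbwmalloc_mspace == NULL) {
        /* initialization failed or never ran */
        if (myhbwmalloc_hardfail) {
            fprintf(stderr, "hbwmalloc: mspace invalid\n");
            return NULL;
        }
        return malloc(size); /* soft fail: fall back to the default heap */
    }
    return mspace_malloc(myhbwmalloc_mspace, size);
}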
static uint32_t *placement(uint32_t n, bool do_fill, bool hyper)
{
    /* Check for libnuma before doing any allocation. */
    if (numa_available() != 0) {
        printf("Libnuma not available\n");
        return NULL;
    }

    uint32_t *result = (uint32_t *)malloc(sizeof(uint32_t) * n);
    uint32_t numa_nodes = numa_max_node() + 1;
    /* With hyperthreading, only consider one hardware thread per core. */
    uint32_t num_cores = hyper ? numa_num_configured_cpus() / 2
                               : numa_num_configured_cpus();

    /* CPU mask for every NUMA node. */
    struct bitmask *nodes[numa_nodes];
    for (uint32_t i = 0; i < numa_nodes; i++) {
        nodes[i] = numa_allocate_cpumask();
        numa_node_to_cpus(i, nodes[i]);
    }

    uint32_t num_taken = 0;
    if (do_fill) {
        /* Fill strategy: exhaust each NUMA node before moving to the next. */
        for (uint32_t i = 0; i < numa_nodes && num_taken < n; i++) {
            for (uint32_t j = 0; j < num_cores && num_taken < n; j++) {
                if (numa_bitmask_isbitset(nodes[i], j)) {
                    result[num_taken++] = j;
                }
            }
        }
    } else {
        /* Spread strategy: distribute cores evenly across nodes, handing
         * the remainder out one extra core per node. */
        uint32_t cores_per_node = n / numa_nodes;
        uint32_t rest = n - cores_per_node * numa_nodes;
        for (uint32_t i = 0; i < numa_nodes && num_taken < n; i++) {
            uint32_t taken_per_node = 0;
            for (uint32_t j = 0; j < num_cores && num_taken < n; j++) {
                if (numa_bitmask_isbitset(nodes[i], j)) {
                    if (taken_per_node == cores_per_node) {
                        if (rest > 0) {
                            result[num_taken++] = j;
                            rest--;
                        }
                        break;
                    }
                    result[num_taken++] = j;
                    taken_per_node++;
                }
            }
        }
    }

    /* Release the per-node masks before returning. */
    for (uint32_t i = 0; i < numa_nodes; i++) {
        numa_bitmask_free(nodes[i]);
    }
    if (num_taken < n) {
        /* Not enough cores available to satisfy the request. */
        free(result);
        return NULL;
    }
    return result;
}
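/* A hypothetical driver (not in the original sources) showing one way to use
 * placement(): request four cores spread across nodes and pin the calling
 * thread to the first one via pthread_setaffinity_np(). */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    uint32_t *cores = placement(4, /* do_fill = */ false, /* hyper = */ false);
    if (cores == NULL) {
        return EXIT_FAILURE;
    }
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cores[0], &set);
    pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
    for (uint32_t i = 0; i < 4; i++) {
        printf("slot %u -> core %u\n", i, cores[i]);
    }
    free(cores);
    return EXIT_SUCCESS;
}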
EvenNumaObj()
{
    num_cpus_ = numa_num_configured_cpus();
    num_mem_nodes_ = numa_num_configured_nodes();
    LOG(INFO) << "num_cpus = " << num_cpus_
              << " num_mem_nodes = " << num_mem_nodes_;
}
char *build_default_affinity_string(int shuffle)
{
    int nr_nodes = numa_num_configured_nodes();
    int nr_cores = numa_num_configured_cpus();
    char *str;
    int str_size = 512;
    int str_written = 0;
    int i;

    /* CPU mask for every NUMA node. */
    struct bitmask **bm = (struct bitmask **)malloc(sizeof(struct bitmask *) * nr_nodes);
    for (i = 0; i < nr_nodes; i++) {
        bm[i] = numa_allocate_cpumask();
        numa_node_to_cpus(i, bm[i]);
    }

    str = (char *)malloc(str_size * sizeof(char));
    assert(str);

    if (!shuffle) {
        /* Node order: list all cores of node 0, then node 1, and so on. */
        for (i = 0; i < nr_nodes; i++) {
            int j;
            for (j = 0; j < nr_cores; j++) {
                if (numa_bitmask_isbitset(bm[i], j)) {
                    add_core_to_str(&str, &str_size, &str_written, j);
                }
            }
        }
    } else {
        /* Round-robin: take the next unused core of each node in turn. */
        int next_node = 0;
        for (i = 0; i < nr_cores; i++) {
            int idx = (i / nr_nodes) + 1;  /* pick the idx-th set bit of this node */
            int found = 0;
            int j = 0;
            do {
                if (numa_bitmask_isbitset(bm[next_node], j)) {
                    found++;
                }
                if (found == idx) {
                    add_core_to_str(&str, &str_size, &str_written, j);
                    break;
                }
                j = (j + 1) % nr_cores;
            } while (found != idx);
            next_node = (next_node + 1) % nr_nodes;
        }
    }

    if (str_written) {
        str[str_written - 1] = 0;  /* drop the trailing separator */
    }

    /* The per-node masks are no longer needed. */
    for (i = 0; i < nr_nodes; i++) {
        numa_bitmask_free(bm[i]);
    }
    free(bm);

    return str;
}
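/* A sketch of the add_core_to_str() helper the function above relies on; the
 * real implementation lives elsewhere in the sources.  Appending "<core>,"
 * and growing the buffer as needed matches the trailing-separator trim
 * above, but the details here are assumptions. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void add_core_to_str(char **str, int *str_size, int *str_written, int core)
{
    char buf[16];
    int len = snprintf(buf, sizeof(buf), "%d,", core);

    while (*str_written + len + 1 > *str_size) {
        *str_size *= 2;                      /* double the buffer when full */
        *str = (char *)realloc(*str, *str_size);
        assert(*str);
    }
    memcpy(*str + *str_written, buf, len + 1);  /* copy including the NUL */
    *str_written += len;
}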
static int memkind_store(void *memptr, void **mmapptr, struct memkind **kind,
                         size_t *req_size, size_t *size, int mode)
{
    static int table_len = 0;
    static int is_init = 0;
    static memkind_table_node_t *table = NULL;
    static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
    int err = 0;
    int hash, i;
    memkind_list_node_t *storeptr, *lastptr;

    if (!is_init && *mmapptr == NULL) {
        return -1;
    }

    if (!is_init) {
        /* Double-checked locking: initialize the hash table exactly once. */
        pthread_mutex_lock(&init_mutex);
        if (!is_init) {
            table_len = numa_num_configured_cpus();
            table = jemk_malloc(sizeof(memkind_table_node_t) * table_len);
            if (table == NULL) {
                err = MEMKIND_ERROR_MALLOC;
            }
            else {
                for (i = 0; i < table_len; ++i) {
                    pthread_mutex_init(&(table[i].mutex), NULL);
                    table[i].list = NULL;
                }
                is_init = 1;
            }
        }
        pthread_mutex_unlock(&init_mutex);
    }

    if (is_init) {
        hash = ptr_hash(memptr, table_len);
        pthread_mutex_lock(&(table[hash].mutex));
        if (mode == GBTLB_STORE_REMOVE || mode == GBTLB_STORE_QUERY) {
            /*
               memkind_store() call is a query:
               GBTLB_STORE_REMOVE -> query; if found, remove the entry and
                                     return the address and size;
               GBTLB_STORE_QUERY  -> query; if found, return the entry.
            */
            storeptr = table[hash].list;
            lastptr = NULL;
            while (storeptr && storeptr->ptr != memptr) {
                lastptr = storeptr;
                storeptr = storeptr->next;
            }
            if (storeptr == NULL) {
                err = MEMKIND_ERROR_RUNTIME;
            }
            if (!err) {
                *mmapptr = storeptr->mmapptr;
                *size = storeptr->size;
                *req_size = storeptr->requested_size;
                *kind = storeptr->kind;
            }
            if (!err && mode == GBTLB_STORE_REMOVE) {
                /* Unlink the node from the bucket's list and free it. */
                if (lastptr) {
                    lastptr->next = storeptr->next;
                }
                else {
                    table[hash].list = storeptr->next;
                }
                jemk_free(storeptr);
            }
        }
        else { /* memkind_store() call is a store */
            storeptr = table[hash].list;
            table[hash].list = (memkind_list_node_t *)jemk_malloc(sizeof(memkind_list_node_t));
            if (table[hash].list == NULL) {
                table[hash].list = storeptr;  /* restore the bucket head */
                err = MEMKIND_ERROR_MALLOC;
            }
            else {
                table[hash].list->ptr = memptr;
                table[hash].list->mmapptr = *mmapptr;
                table[hash].list->size = *size;
                table[hash].list->requested_size = *req_size;
                table[hash].list->kind = *kind;
                table[hash].list->next = storeptr;
            }
        }
        pthread_mutex_unlock(&(table[hash].mutex));
    }
    else {
        err = MEMKIND_ERROR_MALLOC;
    }
    return err;
}
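/* A sketch of the ptr_hash() helper used above; the real hash lives
 * elsewhere in the sources.  Hashing the page number of the pointer into
 * [0, table_len) is one plausible implementation, shown for illustration
 * only. */
#include <stdint.h>

static int ptr_hash(void *ptr, int table_len)
{
    return (int)(((uintptr_t)ptr >> 12) % (unsigned int)table_len); /* 4 KiB pages */
}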
/**
 * @brief Returns an array of cores of size req_cores chosen
 * round-robin from NUMA nodes in batches of req_step.
 *
 * @param req_step The step width - how many cores should be picked
 * from each NUMA node in each iteration. Pass a negative value
 * (which wraps to a huge size_t) for a "fill" strategy, where NUMA
 * nodes are completely filled before moving on to the next one.
 */
void placement(size_t req_cores, size_t req_step, coreid_t *cores)
{
    /* Treat step 0 as 1 so the selection loop below always makes progress. */
    if (req_step == 0) req_step = 1;

    size_t max_node = numa_max_node();
    size_t num_cores = numa_num_configured_cpus();
    size_t cores_per_node = num_cores / (max_node + 1);

    printf("req_cores: %zu\n", req_cores);
    printf("req_step: %zu\n", req_step);
    printf("cores / NUMA node: %zu\n", cores_per_node);
    printf("max_node: %zu\n", max_node);

    size_t num_selected = 0;
    size_t curr_numa_idx = 0;

    // How many cores to choose from each NUMA node
    size_t choose_per_node[max_node + 1];
    memset(choose_per_node, 0, sizeof(size_t) * (max_node + 1));

    // Step 1:
    // Figure out how many cores to choose from each node
    while (num_selected < req_cores) {
        // How many cores should be chosen in this step?  At most req_step,
        // bounded by what is still needed and by the capacity left on the
        // current node.
        size_t num_choose = min(min(req_step, req_cores - num_selected),
                                cores_per_node - choose_per_node[curr_numa_idx]);

        // Increment counter indicating how many to choose from this node
        choose_per_node[curr_numa_idx] += num_choose;
        num_selected += num_choose;

        // Move on to the next NUMA node
        curr_numa_idx = (curr_numa_idx + 1) % (max_node + 1);
    }

    // Step 2:
    // Get the cores from each NUMA node
    //
    // hyperthreads? -> should have higher core IDs, and hence picked in
    // the end.
    struct bitmask *mask = numa_allocate_cpumask();
    size_t idx = 0;
    for (size_t i = 0; i <= max_node; i++) {
        dbg_printf("node %2zu choosing %2zu\n", i, choose_per_node[i]);

        // Determine which cores are on node i
        numa_node_to_cpus(i, mask);

        size_t chosen = 0;
        for (coreid_t p = 0; p < num_cores && chosen < choose_per_node[i]; p++) {
            // Is processor p on node i?
            if (numa_bitmask_isbitset(mask, p)) {
                cores[idx++] = p;
                chosen++;
                dbg_printf("Choosing %" PRIuCOREID " on node %zu\n", p, i);
            }
        }
    }
    numa_free_cpumask(mask);

    assert(idx == req_cores);
}
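/* A minimal driver (assumptions: coreid_t is an unsigned integer typedef,
 * and min()/dbg_printf()/PRIuCOREID come from the same original headers).
 * Requests eight cores, two per NUMA node per round. */
#include <stdio.h>

int main(void)
{
    enum { N = 8 };
    coreid_t cores[N];

    placement(N, 2, cores);   /* round-robin, batches of two */
    for (size_t i = 0; i < N; i++) {
        printf("core[%zu] = %u\n", i, (unsigned)cores[i]);
    }
    return 0;
}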