TEST(StorageManagerTest, DifferentNUMANodeBlobTestWithEviction) {
  EvictionPolicy *eviction_policy =
      LRUKEvictionPolicyFactory::ConstructLRUKEvictionPolicy(
          2, std::chrono::seconds(100));
  EvictionPolicy::Status status;
  static constexpr std::size_t kNumSlots = 10;
  const block_id_domain block_domain = 1000;
  // Set the max_memory_usage to 4 GB.
  const size_t max_memory_usage = 2000;
  std::unique_ptr<StorageManager> storage_manager;
  storage_manager.reset(
      new StorageManager("temp_storage", block_domain, max_memory_usage, eviction_policy));

  const std::size_t num_numa_nodes = numa_num_configured_nodes();
  block_id blob_id;
  MutableBlobReference blob_obj;
  char* blob_memory;
  BlobReference new_blob_obj;
  const char* new_blob_memory;
  std::size_t new_numa_node = 0;

  for (std::size_t numa_node = 0; numa_node < num_numa_nodes; ++numa_node) {
    blob_id = storage_manager->createBlob(kNumSlots, numa_node);
    blob_obj = storage_manager->getBlobMutable(blob_id, numa_node);
    blob_memory = static_cast<char*>(blob_obj->getMemoryMutable());

    // Write some contents into the memory.
    for (std::size_t i = 0; i < kNumSlots * kSlotSizeBytes; ++i) {
      blob_memory[i] = static_cast<char>(i);
    }

    // Dereference the blob.
    blob_obj.release();

    // Choose a blob for eviction.
    status = storage_manager->eviction_policy_->chooseBlockToEvict(&blob_id);
    ASSERT_EQ(EvictionPolicy::Status::kOk, status);

    // Save the blob to disk.
    storage_manager->saveBlockOrBlob(blob_id, true);

    // Evict the blob from the buffer pool.
    storage_manager->evictBlockOrBlob(blob_id);

    // Inform the eviction policy that this blob has been evicted.
    storage_manager->eviction_policy_->blockEvicted(blob_id);

    new_numa_node = (numa_node + 1) % num_numa_nodes;
    new_blob_obj = storage_manager->getBlob(blob_id, new_numa_node);
    new_blob_memory = static_cast<const char*>(new_blob_obj->getMemory());

    // Read the contents of the blob by giving a different NUMA node hint and
    // verify if we still read the same blob that we actually wrote to.
    for (std::size_t i = 0; i < kNumSlots * kSlotSizeBytes; ++i) {
      EXPECT_EQ(static_cast<char>(i), new_blob_memory[i]);
    }
  }
}
/**
 * @brief construct global 'cohort' lock.
 *
 * This lock performs handovers in three levels: First within
 * the same NUMA node, then within the same ArgoDSM node, and
 * finally over ArgoDSM nodes.
 */
cohort_lock()
    : has_global_lock(false),
      numanodes(1),  // sane default
      numahandover(0),
      nodelockowner(NO_OWNER),
      tas_flag(argo::conew_<bool>(false)),
      global_lock(new argo::globallock::global_tas_lock(tas_flag)),
      node_lock(new argo::locallock::ticket_lock()) {
  int num_cpus = sysconf(_SC_NPROCESSORS_CONF);  // sane default
  numa_mapping.resize(num_cpus, 0);
#ifdef ARGO_USE_LIBNUMA
  /* use libnuma only if it is actually available */
  if (numa_available() != -1) {
    numanodes = numa_num_configured_nodes();
    /* Initialize the NUMA map */
    for (int i = 0; i < num_cpus; ++i) {
      numa_mapping[i] = numa_node_of_cpu(i);
    }
  }
#endif
  /* initialize hierarchy components */
  handovers = new int[numanodes]();
  local_lock = new argo::locallock::mcs_lock[numanodes];
}
int main(void)
{
    int i, k, w, ncpus;
    struct bitmask *cpus;
    int maxnode = numa_num_configured_nodes() - 1;

    if (numa_available() < 0) {
        printf("no numa\n");
        exit(1);
    }
    cpus = numa_allocate_cpumask();
    ncpus = cpus->size;

    for (i = 0; i <= maxnode; i++) {
        if (numa_node_to_cpus(i, cpus) < 0) {
            printf("node %d failed to convert\n", i);
        }
        printf("%d: ", i);
        w = 0;
        for (k = 0; k < ncpus; k++)
            if (numa_bitmask_isbitset(cpus, k))
                printf(" %s%d", w > 0 ? "," : "", k);
        putchar('\n');
    }
    return 0;
}
/**
 * \brief sets the memory allocation mask.
 *
 * \param nodemask  bitmap representing the nodes
 *
 * The task will only allocate memory from the nodes set in nodemask.
 *
 * An empty mask, or a mask containing nodes that are not allowed,
 * results in an error.
 */
errval_t numa_set_membind(struct bitmap *nodemask)
{
    assert(numa_alloc_bind_mask);
    assert(numa_alloc_interleave_mask);

    if (!nodemask) {
        return NUMA_ERR_BITMAP_PARSE;
    }

    if (bitmap_get_nbits(nodemask) < NUMA_MAX_NUMNODES) {
        NUMA_WARNING("supplied membind mask (%p) has too few bits!", nodemask);
        return NUMA_ERR_BITMAP_RANGE;
    }

    /* copy new membind mask and clear out invalid bits */
    bitmap_copy(numa_alloc_bind_mask, nodemask);
    bitmap_clear_range(numa_alloc_bind_mask, numa_num_configured_nodes(),
                       bitmap_get_nbits(numa_alloc_bind_mask));

    if (bitmap_get_weight(numa_alloc_bind_mask) == 0) {
        /* cannot bind to no node, restore with the all-nodes pointer */
        bitmap_copy(numa_alloc_bind_mask, numa_all_nodes_ptr);
        return NUMA_ERR_NUMA_MEMBIND;
    }

    /* disable interleaving mode */
    bitmap_clear_all(numa_alloc_interleave_mask);

    return SYS_ERR_OK;
}
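For comparison, here is a minimal usage sketch of the same binding pattern with the standard Linux libnuma call of the same name (which takes a struct bitmask rather than Barrelfish's struct bitmap); the choice of node 0 and the allocation size are purely illustrative:

#include <numa.h>
#include <stdio.h>

int main(void)
{
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA not available\n");
        return 1;
    }

    /* restrict all future allocations of this task to node 0 */
    struct bitmask *nodes = numa_allocate_nodemask();
    numa_bitmask_setbit(nodes, 0);
    numa_set_membind(nodes);
    numa_bitmask_free(nodes);

    /* this allocation is now satisfied from node 0 */
    void *buf = numa_alloc(1 << 20);
    numa_free(buf, 1 << 20);
    return 0;
}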
/**
 * @brief Add a mapping between a block and the NUMA node on which the block
 *        is placed.
 *
 * @param block The block_id of the block for which the NUMA node mapping is
 *        added.
 * @param numa_node The numa node id on which the block is placed.
 **/
void addBlockToNUMANodeMap(const block_id block, const int numa_node) {
  // Make sure that the block doesn't have a mapping already.
  // A block will be mapped to only one NUMA node.
  // A NUMA node will be associated with a block only once.
  DCHECK(block_to_numa_node_map_.find(block) == block_to_numa_node_map_.end());
  DCHECK_GT(numa_num_configured_nodes(), numa_node)
      << "NUMA node above the valid value.";
  block_to_numa_node_map_[block] = numa_node;
}
/**
 * @brief Constructor.
 *
 * @param num_partitions The number of partitions of the Catalog Relation.
 *        This would be the same as the number of partitions in the
 *        Partition Scheme of the relation.
 **/
explicit NUMAPlacementScheme(const std::size_t num_partitions)
    : num_numa_nodes_(numa_num_configured_nodes()),
      num_partitions_(num_partitions) {
  // Assign each partition to exactly one NUMA node.
  // Partitions are assigned in a round robin way to NUMA nodes.
  for (std::size_t part_id = 0; part_id < num_partitions_; ++part_id) {
    partition_to_numa_node_map_[part_id] = part_id % num_numa_nodes_;
  }
}
void check_all_numa_nodes(int policy, void *ptr, size_t size)
{
    if (policy != MPOL_INTERLEAVE && policy != MPOL_DEFAULT)
        return;

    unique_bitmask_ptr expected_bitmask = make_nodemask_ptr();

    for (int i = 0; i < numa_num_configured_nodes(); i++) {
        numa_bitmask_setbit(expected_bitmask.get(), i);
    }

    check_numa_nodes(expected_bitmask, policy, ptr, size);
}
static void regular_nodes_init(void)
{
    int i, nodes_num = numa_num_configured_nodes();
    struct bitmask *node_cpus = numa_allocate_cpumask();

    regular_nodes_mask = numa_allocate_nodemask();
    for (i = 0; i < nodes_num; i++) {
        /* query node i (the original queried a fixed node 0 on every
         * iteration), so each node's CPU set is actually inspected */
        numa_node_to_cpus(i, node_cpus);
        if (numa_bitmask_weight(node_cpus))
            numa_bitmask_setbit(regular_nodes_mask, i);
    }
    numa_bitmask_free(node_cpus);
}
numa_init(void)
{
    int max, i;

    if (sizes_set)
        return;

    set_sizes();
    /* numa_all_nodes should represent existing nodes on this system */
    max = numa_num_configured_nodes();
    for (i = 0; i < max; i++)
        nodemask_set_compat((nodemask_t *)&numa_all_nodes, i);
    memset(&numa_no_nodes, 0, sizeof(numa_no_nodes));
}
TEST(StorageManagerTest, DifferentNUMANodeBlobTest) {
  std::unique_ptr<StorageManager> storage_manager;
  static constexpr std::size_t kNumSlots = 10;
  storage_manager.reset(new StorageManager("temp_storage"));

  const std::size_t num_numa_nodes = numa_num_configured_nodes();
  block_id blob_id;
  MutableBlobReference blob_obj;
  char* blob_memory;
  BlobReference new_blob_obj;
  const char* new_blob_memory;
  std::size_t new_numa_node = 0;

  for (std::size_t numa_node = 0; numa_node < num_numa_nodes; ++numa_node) {
    blob_id = storage_manager->createBlob(kNumSlots, numa_node);
    blob_obj = storage_manager->getBlobMutable(blob_id, numa_node);
    blob_memory = static_cast<char*>(blob_obj->getMemoryMutable());

    // Write some contents into the memory.
    for (std::size_t i = 0; i < kNumSlots * kSlotSizeBytes; ++i) {
      blob_memory[i] = static_cast<char>(i);
    }

    // Dereference the blob.
    blob_obj.release();

    new_numa_node = (numa_node + 1) % num_numa_nodes;
    new_blob_obj = storage_manager->getBlob(blob_id, new_numa_node);
    new_blob_memory = static_cast<const char*>(new_blob_obj->getMemory());

    // Read the contents of the blob by giving a different NUMA node hint and
    // verify if we still read the same blob that we actually wrote to.
    for (std::size_t i = 0; i < kNumSlots * kSlotSizeBytes; ++i) {
      EXPECT_EQ(static_cast<char>(i), new_blob_memory[i]);
    }
  }
}
/// This function tries to fill the bandwidth array based on knowledge about known CPU models
static int fill_bandwidth_values_heuristically(int *bandwidth, int bandwidth_len)
{
    int ret = MEMKIND_ERROR_UNAVAILABLE; // Default error returned if the heuristic approach fails
    int i, nodes_num, memory_only_nodes_num = 0;
    struct bitmask *memory_only_nodes, *node_cpus;

    if (is_cpu_xeon_phi_x200() == 0) {
        log_info("Known CPU model detected: Intel(R) Xeon Phi(TM) x200.");
        nodes_num = numa_num_configured_nodes();

        // Check if the number of NUMA nodes matches the supported
        // configurations of Intel Xeon Phi x200
        if (nodes_num != 2 && nodes_num != 4 && nodes_num != 8) {
            return ret;
        }

        memory_only_nodes = numa_allocate_nodemask();
        node_cpus = numa_allocate_cpumask();

        for (i = 0; i < nodes_num; i++) {
            numa_node_to_cpus(i, node_cpus);
            if (numa_bitmask_weight(node_cpus) == 0) {
                memory_only_nodes_num++;
                numa_bitmask_setbit(memory_only_nodes, i);
            }
        }

        // Check if the number of memory-only nodes equals the number of memory+cpu nodes.
        // If so, set ret to 0 (success) and fill the bandwidth table.
        if (memory_only_nodes_num == (nodes_num - memory_only_nodes_num)) {
            ret = 0;
            assign_arbitrary_bandwidth_values(bandwidth, bandwidth_len, memory_only_nodes);
        }

        numa_bitmask_free(memory_only_nodes);
        numa_bitmask_free(node_cpus);
    }

    return ret;
}
static void assign_arbitrary_bandwidth_values(int *bandwidth, int bandwidth_len,
                                              struct bitmask *hbw_nodes)
{
    int i, nodes_num = numa_num_configured_nodes();

    // Assigning arbitrary bandwidth values for nodes:
    // 2 - high BW node (if bit set in hbw_nodes nodemask),
    // 1 - low BW node,
    // 0 - node not present
    for (i = 0; i < NUMA_NUM_NODES; i++) {
        if (i >= nodes_num) {
            bandwidth[i] = 0;
        } else if (numa_bitmask_isbitset(hbw_nodes, i)) {
            bandwidth[i] = 2;
        } else {
            bandwidth[i] = 1;
        }
    }
}
/**
 * \brief sets the memory interleave mask for the current task to nodemask
 *
 * \param nodemask  bitmask representing the nodes
 *
 * All new memory allocations are page interleaved over all nodes in the
 * interleave mask. Interleaving can be turned off again by passing an
 * empty mask.
 *
 * This bitmask is considered to be a hint. Fallback to other nodes may be possible.
 */
void numa_set_interleave_mask(struct bitmap *nodemask)
{
    assert(numa_alloc_interleave_mask);

    if (!nodemask) {
        bitmap_clear_all(numa_alloc_interleave_mask);
        return;
    }

    if (bitmap_get_nbits(nodemask) < NUMA_MAX_NUMNODES) {
        NUMA_WARNING("supplied interleave mask (%p) has too few bits!", nodemask);
        return;
    }

    bitmap_copy(numa_alloc_interleave_mask, nodemask);

    /* clear out the invalid nodes */
    bitmap_clear_range(numa_alloc_interleave_mask, numa_num_configured_nodes(),
                       bitmap_get_nbits(numa_alloc_interleave_mask));

    /* clear the bind mask as we are using interleaving mode now */
    bitmap_clear_all(numa_alloc_bind_mask);
}
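As a point of reference, a minimal sketch of the call with the same name in standard Linux libnuma, interleaving subsequent allocations over every configured node; the buffer size is arbitrary:

#include <numa.h>

int main(void)
{
    if (numa_available() < 0)
        return 1;

    /* interleave all future allocations over every configured node */
    numa_set_interleave_mask(numa_all_nodes_ptr);

    /* pages of this buffer are spread round-robin across the nodes */
    void *buf = numa_alloc(8 << 20);
    numa_free(buf, 8 << 20);

    /* passing an empty mask turns interleaving off again */
    numa_set_interleave_mask(numa_no_nodes_ptr);
    return 0;
}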
TEST(StorageManagerTest, NUMAAwareBlobTest) {
  std::unique_ptr<StorageManager> storage_manager;
  static constexpr std::size_t kNumSlots = 10;
  storage_manager.reset(new StorageManager("temp_storage"));

  const std::size_t num_numa_nodes = numa_num_configured_nodes();
  block_id blob_id;
  MutableBlobReference blob_obj;
  char* blob_memory;
  BlobReference new_blob_obj;
  const char* new_blob_memory;

  for (std::size_t numa_node = 0; numa_node < num_numa_nodes; ++numa_node) {
    blob_id = storage_manager->createBlob(kNumSlots, numa_node);
    blob_obj = storage_manager->getBlobMutable(blob_id, numa_node);
    blob_memory = static_cast<char*>(blob_obj->getMemoryMutable());

    // Write some contents into the memory.
    for (std::size_t i = 0; i < kNumSlots * kSlotSizeBytes; ++i) {
      blob_memory[i] = static_cast<char>(i);
    }

    // Dereference the blob.
    blob_obj.release();

    new_blob_obj = storage_manager->getBlob(blob_id, numa_node);
    new_blob_memory = static_cast<const char*>(new_blob_obj->getMemory());

    // Read the contents of the blob on the same NUMA node on which the blob
    // was created and verify if they match with what we wrote into the blob.
    for (std::size_t i = 0; i < kNumSlots * kSlotSizeBytes; ++i) {
      EXPECT_EQ(static_cast<char>(i), new_blob_memory[i]);
    }
  }
}
EvenNumaObj() {
  num_cpus_ = numa_num_configured_cpus();
  num_mem_nodes_ = numa_num_configured_nodes();
  LOG(INFO) << "num_cpus = " << num_cpus_
            << " num_mem_nodes = " << num_mem_nodes_;
}
void myhbwmalloc_init(void)
{
    /* set to NULL before trying to initialize.  if we return before
     * successful creation of the mspace, then it will still be NULL,
     * and we can use that in subsequent library calls to determine
     * that the library failed to initialize. */
    myhbwmalloc_mspace = NULL;

    /* verbose printout? */
    myhbwmalloc_verbose = 0;
    {
        char * env_char = getenv("HBWMALLOC_VERBOSE");
        if (env_char != NULL) {
            myhbwmalloc_verbose = 1;
            printf("hbwmalloc: HBWMALLOC_VERBOSE set\n");
        }
    }

    /* fail hard or soft? */
    myhbwmalloc_hardfail = 1;
    {
        char * env_char = getenv("HBWMALLOC_SOFTFAIL");
        if (env_char != NULL) {
            myhbwmalloc_hardfail = 0;
            printf("hbwmalloc: HBWMALLOC_SOFTFAIL set\n");
        }
    }

    /* set the atexit handler that will destroy the mspace and free the numa allocation */
    atexit(myhbwmalloc_final);

    /* detect and configure use of NUMA memory nodes */
    {
        int max_possible_node    = numa_max_possible_node();
        int num_possible_nodes   = numa_num_possible_nodes();
        int max_numa_nodes       = numa_max_node();
        int num_configured_nodes = numa_num_configured_nodes();
        int num_configured_cpus  = numa_num_configured_cpus();
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_max_possible_node()    = %d\n", max_possible_node);
            printf("hbwmalloc: numa_num_possible_nodes()   = %d\n", num_possible_nodes);
            printf("hbwmalloc: numa_max_node()             = %d\n", max_numa_nodes);
            printf("hbwmalloc: numa_num_configured_nodes() = %d\n", num_configured_nodes);
            printf("hbwmalloc: numa_num_configured_cpus()  = %d\n", num_configured_cpus);
        }
        /* FIXME this is a hack.  assumes HBW is only numa node 1. */
        if (num_configured_nodes <= 2) {
            myhbwmalloc_numa_node = num_configured_nodes - 1;
        } else {
            fprintf(stderr, "hbwmalloc: we support only 2 numa nodes, not %d\n",
                    num_configured_nodes);
        }

        if (myhbwmalloc_verbose) {
            for (int i = 0; i < num_configured_nodes; i++) {
                unsigned max_numa_cpus = numa_num_configured_cpus();
                struct bitmask * mask = numa_bitmask_alloc(max_numa_cpus);
                int rc = numa_node_to_cpus(i, mask);
                if (rc != 0) {
                    fprintf(stderr, "hbwmalloc: numa_node_to_cpus failed\n");
                } else {
                    printf("hbwmalloc: numa node %d cpu mask:", i);
                    for (unsigned j = 0; j < max_numa_cpus; j++) {
                        int bit = numa_bitmask_isbitset(mask, j);
                        printf(" %d", bit);
                    }
                    printf("\n");
                }
                numa_bitmask_free(mask);
            }
            fflush(stdout);
        }
    }

#if 0 /* unused */
    /* see if the user specifies a slab size */
    size_t slab_size_requested = 0;
    {
        char * env_char = getenv("HBWMALLOC_BYTES");
        if (env_char != NULL) {
            long units = 1L;
            if      ( NULL != strstr(env_char, "G") ) units = 1000000000L;
            else if ( NULL != strstr(env_char, "M") ) units = 1000000L;
            else if ( NULL != strstr(env_char, "K") ) units = 1000L;
            else                                      units = 1L;

            int num_count = strspn(env_char, "0123456789");
            memset(&env_char[num_count], ' ', strlen(env_char) - num_count);
            slab_size_requested = units * atol(env_char);
        }
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: requested slab_size_requested = %zu\n", slab_size_requested);
        }
    }
#endif

    /* see what libnuma says is available */
    size_t myhbwmalloc_slab_size;
    {
        int node = myhbwmalloc_numa_node;
        long long freemem;
        long long maxmem = numa_node_size64(node, &freemem);
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_node_size64 says maxmem=%lld freemem=%lld for numa node %d\n",
                   maxmem, freemem, node);
        }
        myhbwmalloc_slab_size = freemem;
    }

    /* assume threads, disable if MPI knows otherwise, then allow user to override. */
    int multithreaded = 1;
#ifdef HAVE_MPI
    int nprocs;
    {
        int is_init, is_final;
        MPI_Initialized(&is_init);
        MPI_Finalized(&is_final);
        if (is_init && !is_final) {
            MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
        }

        /* give equal portion to every MPI process */
        myhbwmalloc_slab_size /= nprocs;

        /* if the user initializes MPI with MPI_Init or
         * MPI_Init_thread(MPI_THREAD_SINGLE), they assert there
         * are no threads at all, which means we can skip the
         * malloc mspace lock.
         *
         * if the user lies to MPI, they deserve any bad thing
         * that comes of it. */
        int provided;
        MPI_Query_thread(&provided);
        if (provided == MPI_THREAD_SINGLE) {
            multithreaded = 0;
        } else {
            multithreaded = 1;
        }

        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: MPI processes = %d (threaded = %d)\n", nprocs, multithreaded);
            printf("hbwmalloc: myhbwmalloc_slab_size = %zu\n", myhbwmalloc_slab_size);
        }
    }
#endif

    /* user can assert that hbwmalloc and friends need not be thread-safe */
    {
        char * env_char = getenv("HBWMALLOC_LOCKLESS");
        if (env_char != NULL) {
            multithreaded = 0;
            if (myhbwmalloc_verbose) {
                printf("hbwmalloc: user has disabled locking in mspaces by setting HBWMALLOC_LOCKLESS\n");
            }
        }
    }

    myhbwmalloc_slab = numa_alloc_onnode(myhbwmalloc_slab_size, myhbwmalloc_numa_node);
    if (myhbwmalloc_slab == NULL) {
        fprintf(stderr, "hbwmalloc: numa_alloc_onnode returned NULL for size = %zu\n",
                myhbwmalloc_slab_size);
        return;
    } else {
        if (myhbwmalloc_verbose) {
            printf("hbwmalloc: numa_alloc_onnode succeeded for size %zu\n", myhbwmalloc_slab_size);
        }

        /* part (less than 128*sizeof(size_t) bytes) of this space is used for bookkeeping,
         * so the capacity must be at least this large */
        if (myhbwmalloc_slab_size < 128 * sizeof(size_t)) {
            fprintf(stderr, "hbwmalloc: not enough space for mspace bookkeeping\n");
            return;
        }

        /* see above regarding if the user lies to MPI. */
        int locked = multithreaded;

        myhbwmalloc_mspace = create_mspace_with_base(myhbwmalloc_slab, myhbwmalloc_slab_size, locked);
        if (myhbwmalloc_mspace == NULL) {
            fprintf(stderr, "hbwmalloc: create_mspace_with_base returned NULL\n");
            return;
        } else if (myhbwmalloc_verbose) {
            printf("hbwmalloc: create_mspace_with_base succeeded for size %zu\n",
                   myhbwmalloc_slab_size);
        }
    }
}
void force_move_pages(const void *data_, const size_t n, const size_t selem,
                      const enum numa_distrib_type distrib,
                      const size_t distrib_parameter)
{
    const char *data = (const char *)data_;
    const size_t elem_per_page = ASSUMED_PAGE_SIZE / selem;
    const size_t np = n / elem_per_page;
    int status[np];
    int nodes[np];
    const char *pages[np];
    size_t i;
    long res;

#ifndef __MIC__
    const int nmn = numa_num_configured_nodes();
    // fprintf(stderr, "%s:%d elem_per_page = %zd, nmn = %d ; np = %zd\n", __PRETTY_FUNCTION__, __LINE__, elem_per_page, nmn, np);

    for (i = 0; i < np; i++) {
        pages[i] = data + i * ASSUMED_PAGE_SIZE;
        switch (distrib) {
        case HYDRO_NUMA_NONE:
            nodes[i] = -1;
            break;
        case HYDRO_NUMA_INTERLEAVED:
            nodes[i] = i % nmn;
            break;
        case HYDRO_NUMA_ONE_BLOCK_PER_NODE: {
            const size_t ppernode = np / nmn;
            size_t nnode = i / ppernode;
            if (nnode > (nmn - 1))
                nnode = nmn - 1;
            nodes[i] = nnode;
        } break;
        case HYDRO_NUMA_SIZED_BLOCK_RR: {
            const size_t numb = i / (distrib_parameter / elem_per_page);
            size_t nnode = numb % nmn;
            nodes[i] = nnode;
        } break;
        }
    }

    if (HYDRO_NUMA_NONE != distrib) {
        res = move_pages(0, np, (void **)pages, nodes, status, MPOL_MF_MOVE);
    } else {
        res = move_pages(0, np, (void **)pages, NULL, status, MPOL_MF_MOVE);
    }

    if (res != 0) {
        fprintf(stderr, "%s:%d: move_pages -> errno = %d\n", __PRETTY_FUNCTION__, __LINE__, errno);
    } else {
        int last_node = status[0];
        const char *last;
        const char *cur = data;
        // fprintf(stderr, "%s:%d: move_pages for %p of %zd elements (%zd bytes)\n", __PRETTY_FUNCTION__, __LINE__, data, n, n * selem);
        // fprintf(stderr, "\t%d: %p ... ", last_node, cur);
        last = cur;
        for (i = 1; i < np; i++) {
            if (status[i] != last_node) {
                cur += ASSUMED_PAGE_SIZE;
                // fprintf(stderr, "%p (%llu)\n", cur, (unsigned long long)cur - (unsigned long long)last);
                last_node = status[i];
                // fprintf(stderr, "\t%d: %p ... ", last_node, cur);
                last = cur;
            } else {
                cur += ASSUMED_PAGE_SIZE;
            }
        }
        // fprintf(stderr, "%p (%llu)\n", cur, (unsigned long long)cur - (unsigned long long)last);
    }
#endif
}
char *build_default_affinity_string(int shuffle)
{
    int nr_nodes = numa_num_configured_nodes();
    int nr_cores = numa_num_configured_cpus();

    char *str;
    int str_size = 512;
    int str_written = 0;

    int i;
    struct bitmask **bm = (struct bitmask **)malloc(sizeof(struct bitmask *) * nr_nodes);

    for (i = 0; i < nr_nodes; i++) {
        bm[i] = numa_allocate_cpumask();
        numa_node_to_cpus(i, bm[i]);
    }

    str = (char *)malloc(str_size * sizeof(char));
    assert(str);

    if (!shuffle) {
        for (i = 0; i < nr_nodes; i++) {
            int j;
            for (j = 0; j < nr_cores; j++) {
                if (numa_bitmask_isbitset(bm[i], j)) {
                    add_core_to_str(&str, &str_size, &str_written, j);
                }
            }
        }
    } else {
        int next_node = 0;
        for (i = 0; i < nr_cores; i++) {
            int idx = (i / nr_nodes) + 1;
            int found = 0;
            int j = 0;
            do {
                if (numa_bitmask_isbitset(bm[next_node], j)) {
                    found++;
                }
                if (found == idx) {
                    add_core_to_str(&str, &str_size, &str_written, j);
                    break;
                }
                j = (j + 1) % nr_cores;
            } while (found != idx);
            next_node = (next_node + 1) % nr_nodes;
        }
    }

    if (str_written) {
        str[str_written - 1] = 0;
    }

    return str;
}
/**
 * \brief allocates size bytes of memory, page interleaved over the nodes
 *        specified in the nodemask.
 *
 * \param size      size of the memory region in bytes
 * \param nodemask  subset of nodes to consider for allocation
 * \param pagesize  preferred page size to be used
 *
 * \returns pointer to the mapped memory region
 *
 * Should only be used for large areas consisting of multiple pages.
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_interleaved_subset(size_t size, size_t pagesize,
                                    struct bitmap *nodemask)
{
    errval_t err;

    /* clear out invalid bits */
    bitmap_clear_range(nodemask, numa_num_configured_nodes(),
                       bitmap_get_nbits(nodemask));

    /* get the number of nodes */
    nodeid_t nodes = bitmap_get_weight(nodemask);
    if (nodes == 0) {
        return NULL;
    }

    NUMA_DEBUG_ALLOC("allocating interleaved using %" PRIuNODEID " nodes\n", nodes);

    assert(nodes <= numa_num_configured_nodes());

    vregion_flags_t flags;
    validate_page_size(&pagesize, &flags);
    size_t stride = pagesize;

    size_t node_size = size / nodes;
    node_size = (node_size + pagesize - 1) & ~(pagesize - 1);

    /* update total size as this may change due to rounding of node sizes */
    size = nodes * node_size;

    /*
     * XXX: we may want to keep track of numa alloced frames
     */

    struct memobj_numa *memobj = calloc(1, sizeof(struct memobj_numa));
    err = memobj_create_numa(memobj, size, 0, numa_num_configured_nodes(), stride);
    if (err_is_fail(err)) {
        return NULL;
    }

    bitmap_bit_t node = bitmap_get_first(nodemask);
    nodeid_t node_idx = 0;
    while (node != BITMAP_BIT_NONE) {
        struct capref frame;
        err = numa_frame_alloc_on_node(&frame, node_size, (nodeid_t)node, NULL);
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "numa_frame_alloc_on_node");
            goto out_err;
        }
        memobj->m.f.fill(&memobj->m, node_idx, frame, 0);
        ++node_idx;
        node = bitmap_get_next(nodemask, node);
    }

    struct vregion *vreg = calloc(1, sizeof(struct vregion));
    if (vreg == NULL) {
        goto out_err;
    }
    err = vregion_map_aligned(vreg, get_current_vspace(), &memobj->m, 0, size,
                              flags, pagesize);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vregion_map_aligned");
        goto out_err;
    }

    err = memobj->m.f.pagefault(&memobj->m, vreg, 0, 0);
    if (err_is_fail(err)) {
        vregion_destroy(vreg);
        free(vreg);
        DEBUG_ERR(err, "memobj.m.f.pagefault");
        goto out_err;
    }

    // XXX - Is this right?
    return (void *)(uintptr_t)vregion_get_base_addr(vreg);

out_err:
    for (int i = 0; i < node_idx; ++i) {
        struct capref frame;
        memobj->m.f.unfill(&memobj->m, node_idx, &frame, NULL);
        cap_delete(frame);
    }

    return NULL;
}
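Standard Linux libnuma exposes a function with the same name (taking a struct bitmask and no page-size argument); a minimal usage sketch, where the 16 MiB size and the two-node mask are chosen purely for illustration:

#include <numa.h>
#include <stdio.h>

int main(void)
{
    if (numa_available() < 0)
        return 1;

    /* interleave a 16 MiB region over nodes 0 and 1 (if node 1 exists) */
    struct bitmask *nodes = numa_allocate_nodemask();
    numa_bitmask_setbit(nodes, 0);
    if (numa_num_configured_nodes() > 1)
        numa_bitmask_setbit(nodes, 1);

    size_t len = 16UL << 20;
    void *buf = numa_alloc_interleaved_subset(len, nodes);
    if (buf == NULL)
        fprintf(stderr, "allocation failed\n");
    else
        numa_free(buf, len);   /* must be released with numa_free() */

    numa_bitmask_free(nodes);
    return 0;
}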
void* StorageManager::allocateSlots(const std::size_t num_slots,
                                    const int numa_node) {
#if defined(QUICKSTEP_HAVE_MMAP_LINUX_HUGETLB)
  static constexpr int kLargePageMmapFlags
      = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
#elif defined(QUICKSTEP_HAVE_MMAP_BSD_SUPERPAGE)
  static constexpr int kLargePageMmapFlags
      = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER;
#endif

  makeRoomForBlockOrBlob(num_slots);
  void *slots = nullptr;

#if defined(QUICKSTEP_HAVE_MMAP_LINUX_HUGETLB) || defined(QUICKSTEP_HAVE_MMAP_BSD_SUPERPAGE)
  slots = mmap(nullptr,
               num_slots * kSlotSizeBytes,
               PROT_READ | PROT_WRITE,
               kLargePageMmapFlags,
               -1, 0);

  // Fallback to regular mmap() if large page allocation failed. Even on
  // systems with large page support, large page allocation may fail if the
  // user running the executable is not a member of hugetlb_shm_group on
  // Linux, or if all the reserved hugepages are already in use.
  if (slots == MAP_FAILED) {
    slots = mmap(nullptr,
                 num_slots * kSlotSizeBytes,
                 PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS,
                 -1, 0);
  }
  if (slots == MAP_FAILED) {
    slots = nullptr;
  }
#elif defined(QUICKSTEP_HAVE_MMAP_PLAIN)
  slots = mmap(nullptr,
               num_slots * kSlotSizeBytes,
               PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS,
               -1, 0);
  if (slots == MAP_FAILED) {
    slots = nullptr;
  }
#else
  slots = malloc_with_alignment(num_slots * kSlotSizeBytes, kCacheLineBytes);
  if (slots != nullptr) {
    memset(slots, 0x0, num_slots * kSlotSizeBytes);
  }
#endif

  if (slots == nullptr) {
    throw OutOfMemory();
  }

#if defined(QUICKSTEP_HAVE_LIBNUMA)
  if (numa_node != -1) {
    DEBUG_ASSERT(numa_node < numa_num_configured_nodes());
    struct bitmask *numa_node_bitmask = numa_allocate_nodemask();
    // numa_node can be 0 through n-1, where n is the num of NUMA nodes.
    numa_bitmask_setbit(numa_node_bitmask, numa_node);
    long mbind_status = mbind(slots,  // NOLINT(runtime/int)
                              num_slots * kSlotSizeBytes,
                              MPOL_PREFERRED,
                              numa_node_bitmask->maskp,
                              numa_node_bitmask->size,
                              0);
    numa_free_nodemask(numa_node_bitmask);
    if (mbind_status == -1) {
      LOG(WARNING) << "mbind() failed with errno " << errno << " ("
                   << std::strerror(errno) << ")";
    }
  }
#endif  // QUICKSTEP_HAVE_LIBNUMA

  total_memory_usage_ += num_slots;
  return slots;
}
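The NUMA-specific part of the routine above reduces to an mmap-plus-mbind pattern; a stripped-down sketch of just that pattern in plain C, where the helper name alloc_on_node and its parameters are hypothetical and chosen for illustration:

#include <errno.h>
#include <numa.h>
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

/* Map an anonymous region and ask the kernel to prefer placing its pages
 * on the given NUMA node (MPOL_PREFERRED falls back to other nodes if the
 * preferred one has no free memory). */
static void *alloc_on_node(size_t bytes, int numa_node)
{
    void *mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED)
        return NULL;

    struct bitmask *mask = numa_allocate_nodemask();
    numa_bitmask_setbit(mask, numa_node);
    if (mbind(mem, bytes, MPOL_PREFERRED, mask->maskp, mask->size, 0) == -1)
        fprintf(stderr, "mbind() failed: %s\n", strerror(errno));
    numa_free_nodemask(mask);

    return mem;
}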
void p_setup()
{
#if SWEET_THREADING || SWEET_REXI_THREAD_PARALLEL_SUM
	if (omp_in_parallel())
	{
		std::cerr << "ERROR: NUMAMemManager may not be initialized within parallel region!" << std::endl;
		std::cerr << "       Call NUMAMemManager::setup() at program start" << std::endl;
		exit(1);
	}
#endif

	if (setup_done)
		return;

	const char* env_verbosity = getenv("NUMA_BLOCK_ALLOC_VERBOSITY");
	if (env_verbosity == nullptr)
		verbosity = 0;
	else
		verbosity = atoi(env_verbosity);

#if NUMA_BLOCK_ALLOCATOR_TYPE == 0

	if (verbosity > 0)
		std::cout << "NUMA block alloc: Using default system's allocator" << std::endl;

	num_alloc_domains = 1;
	getThreadLocalDomainIdRef() = 0;

#elif NUMA_BLOCK_ALLOCATOR_TYPE == 1

	if (verbosity > 0)
		std::cout << "NUMA block alloc: Using NUMA node granularity" << std::endl;

	/*
	 * NUMA granularity
	 */
	num_alloc_domains = numa_num_configured_nodes();

	if (verbosity > 0)
		std::cout << "num_alloc_domains: " << num_alloc_domains << std::endl;

	// set NUMA id in case that master thread has a different id than the first thread
	int cpuid = sched_getcpu();
	getThreadLocalDomainIdRef() = numa_node_of_cpu(cpuid);

#if SWEET_THREADING || SWEET_REXI_THREAD_PARALLEL_SUM
#pragma omp parallel
	{
		int cpuid = sched_getcpu();
		getThreadLocalDomainIdRef() = numa_node_of_cpu(cpuid);
	}
#else
	getThreadLocalDomainIdRef() = 0;
#endif

#elif NUMA_BLOCK_ALLOCATOR_TYPE == 2

	if (verbosity > 0)
		std::cout << "NUMA block alloc: Using allocator based on thread granularity" << std::endl;

	/*
	 * Thread granularity, use this also per default
	 */
#if SWEET_THREADING || SWEET_REXI_THREAD_PARALLEL_SUM
	num_alloc_domains = omp_get_max_threads();
#else
	num_alloc_domains = 1;
#endif

	if (verbosity > 0)
		std::cout << "num_alloc_domains: " << num_alloc_domains << std::endl;

	// set NUMA id in case that master thread has a different id than the first thread
#if SWEET_THREADING || SWEET_REXI_THREAD_PARALLEL_SUM
	getThreadLocalDomainIdRef() = omp_get_thread_num();

#pragma omp parallel
	getThreadLocalDomainIdRef() = omp_get_thread_num();
#else
	getThreadLocalDomainIdRef() = 0;
#endif

#elif NUMA_BLOCK_ALLOCATOR_TYPE == 3

	if (verbosity > 0)
		std::cout << "NUMA block alloc: Using non-numa single memory block chain" << std::endl;

	num_alloc_domains = 1;
	getThreadLocalDomainIdRef() = 0;

#else
#	error "Invalid NUMA_BLOCK_ALLOCATOR_TYPE"
#endif

#if SWEET_THREADING || SWEET_REXI_THREAD_PARALLEL_SUM
	if (verbosity > 0)
	{
#pragma omp parallel
		{
#pragma omp critical
			{
				std::cout << "   thread id " << omp_get_thread_num()
				          << " is assigned to memory allocator domain "
				          << getThreadLocalDomainIdRef() << std::endl;
			}
		}
	}
#endif

	domain_block_groups.resize(num_alloc_domains);

#if 0
	// TODO: care about first-touch policy
	for (auto& n : domain_block_groups)
	{
		std::size_t S = num_alloc_domains * 10;

		// preallocate S different sizes of blocks which should be sufficient
		n.block_groups.reserve(S);
	}
#endif

	setup_done = true;
}