mask_type decode_mapping(hwloc_topology const& t, mapping_type& m,
    std::vector<mask_type>& affinities, std::size_t thread_index,
    error_code& ec)
{
    std::size_t size = affinities.size();
    mask_type mask;
    switch (m[0].type_) {
    case spec_type::socket:
        // requested top level is a socket
        mask = decode_mapping_socket(t, m, size, thread_index, ec);
        break;

    case spec_type::numanode:
        // requested top level is a NUMA node
        mask = decode_mapping_numanode(t, m, size, thread_index, ec);
        break;

    case spec_type::unknown:
        // no top level is requested
        mask = decode_mapping0_unknown(t, m, size,
            t.get_machine_affinity_mask(), 0, thread_index, ec);
        break;

    default:
        HPX_THROWS_IF(ec, bad_parameter, "decode_mapping",
            boost::str(boost::format("unexpected specification type at "
                "index zero: %x (%s)") %
                    static_cast<unsigned>(m[0].type_) %
                    spec_type::type_name(m[0].type_)));
        return mask_type();
    }
    return mask;
}
mask_type decode_mapping_core(hwloc_topology const& t, mapping_type& m,
    std::size_t size, mask_type mask, std::size_t core_base_index,
    std::size_t thread_index, error_code& ec)
{
    bounds_type b = extract_bounds(m[1], size, ec);
    if (ec) return mask_type();

    // We have to account for the thread index at this level if there are
    // no specifications related to processing units.
    std::size_t index = std::size_t(-1);
    if (m[2].type_ == spec_type::unknown && b.size() > 1)
        index = thread_index;

    mask_type core_mask = mask_type();
    resize(core_mask, size);

    std::size_t core_index = 0;
    for (bounds_type::const_iterator it = b.begin(); it != b.end();
         ++it, ++core_index)
    {
        if (index == std::size_t(-1) || core_index == index)
        {
            core_mask |= t.init_core_affinity_mask_from_core(
                std::size_t(*it + core_base_index), mask_type());
        }
    }

    core_base_index += std::size_t(*b.begin());
    if (thread_index != std::size_t(-1) && b.size() > 1)
        core_base_index += thread_index;

    std::size_t base_index = 0;
    for (std::size_t i = 0; i != core_base_index; ++i)
        base_index += t.get_number_of_core_pus(i);

    return decode_mapping1_unknown(t, m, size, mask & core_mask,
        base_index, thread_index, ec);
}
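// The get_number_of_core_pus() accumulation loop above converts a core index
// into a flat processing-unit (PU) base index by summing the PU counts of all
// preceding cores. Below is a minimal standalone sketch of just that
// arithmetic, assuming the per-core PU counts are already available in a
// std::vector; the function and parameter names are illustrative only, not
// part of the HPX API.

#include <cstddef>
#include <numeric>
#include <vector>

std::size_t pu_base_index_for_core(
    std::vector<std::size_t> const& pus_per_core, std::size_t core_index)
{
    // sum the PU counts of all cores preceding core_index, mirroring the
    // base_index loop in decode_mapping_core()
    return std::accumulate(pus_per_core.begin(),
        pus_per_core.begin() + static_cast<std::ptrdiff_t>(core_index),
        std::size_t(0));
}

// e.g. with two hardware threads per core, pus_per_core == {2, 2, 2, 2} and
// core_index == 3 yields a PU base index of 6.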
mask_type decode_mapping_pu(hwloc_topology const& t, mapping_type& m,
    std::size_t size, mask_type mask, std::size_t pu_base_index,
    std::size_t thread_index, error_code& ec)
{
    bounds_type b = extract_bounds(m[2], size, ec);
    if (ec) return mask_type();

    std::size_t index = std::size_t(-1);
    if (b.size() > 1)
        index = thread_index;

    mask_type pu_mask = mask_type();
    resize(pu_mask, size);

    std::size_t pu_index = 0;
    for (bounds_type::const_iterator it = b.begin(); it != b.end();
         ++it, ++pu_index)
    {
        if (index == std::size_t(-1) || pu_index == index)
            pu_mask |= t.init_thread_affinity_mask(
                std::size_t(*it + pu_base_index));
    }

    return mask & pu_mask;
}
mask_type decode_mapping_numanode(hwloc_topology const& t, mapping_type& m,
    std::size_t size, std::size_t thread_index, error_code& ec)
{
    bounds_type b = extract_bounds(m[0], size, ec);
    if (ec) return mask_type();

    std::size_t index = std::size_t(-1);
    if (m[1].type_ == spec_type::unknown &&
        m[2].type_ == spec_type::unknown && b.size() > 1)
    {
        index = thread_index;
    }

    mask_type mask = mask_type();
    resize(mask, size);

    std::size_t node_index = 0;
    for (bounds_type::const_iterator it = b.begin(); it != b.end();
         ++it, ++node_index)
    {
        if (index == std::size_t(-1) || node_index == index)
            mask |= t.init_numa_node_affinity_mask_from_numa_node(
                std::size_t(*it));
    }

    std::size_t node_base_index = std::size_t(*b.begin());
    if (thread_index != std::size_t(-1) && b.size() > 1)
        node_base_index += thread_index;

    std::size_t base_index = 0;
    for (std::size_t i = 0; i != node_base_index; ++i)
        base_index += t.get_number_of_numa_node_cores(i);

    return decode_mapping0_unknown(t, m, size, mask, base_index,
        thread_index, ec);
}
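// All three specification levels above (socket/NUMA node, core, PU) follow
// the same pattern: expand the bounds specification into a list of object
// indices, then OR the affinity masks of the selected objects into a single
// mask, optionally narrowing the selection to the entry matching the worker
// thread's index. Below is a self-contained sketch of that accumulation, with
// std::bitset standing in for mask_type and the per-object masks supplied
// directly; the names, the 64-bit width, and the simplified narrowing rule
// are assumptions of this sketch, not HPX API.

#include <bitset>
#include <cstddef>
#include <vector>

using sketch_mask = std::bitset<64>;

sketch_mask accumulate_masks(std::vector<std::size_t> const& bounds,
    std::vector<sketch_mask> const& object_masks, std::size_t thread_index,
    bool narrow_to_thread)
{
    // if several objects are selected and no lower level narrows the choice
    // further, pick only the entry matching the worker thread's index
    std::size_t wanted = (narrow_to_thread && bounds.size() > 1) ?
        thread_index : std::size_t(-1);

    sketch_mask mask;
    std::size_t pos = 0;
    for (std::size_t idx : bounds)
    {
        if (wanted == std::size_t(-1) || pos == wanted)
            mask |= object_masks.at(idx);
        ++pos;
    }
    return mask;
}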
namespace hpx { namespace threads
{
    mask_type noop_topology::empty_mask =
        mask_type(noop_topology::hardware_concurrency());
}}
void on_start_thread(std::size_t num_thread)
{
    if (nullptr == queues_[num_thread])
    {
        queues_[num_thread] =
            new thread_queue_type(max_queue_thread_count_);

        if (num_thread < high_priority_queues_.size())
        {
            high_priority_queues_[num_thread] =
                new thread_queue_type(max_queue_thread_count_);
        }
    }

    // forward this call to all queues etc.
    if (num_thread < high_priority_queues_.size())
        high_priority_queues_[num_thread]->on_start_thread(num_thread);

    if (num_thread == queues_.size()-1)
        low_priority_queue_.on_start_thread(num_thread);

    queues_[num_thread]->on_start_thread(num_thread);

    std::size_t num_threads = queues_.size();

    // get numa domain masks of all queues...
    std::vector<mask_type> numa_masks(num_threads);
    std::vector<mask_type> core_masks(num_threads);
    for (std::size_t i = 0; i != num_threads; ++i)
    {
        std::size_t num_pu = get_pu_num(i);
        numa_masks[i] =
            topology_.get_numa_node_affinity_mask(num_pu, numa_sensitive_ != 0);
        core_masks[i] =
            topology_.get_core_affinity_mask(num_pu, numa_sensitive_ != 0);
    }

    // iterate over the number of threads again to determine where to
    // steal from
    std::ptrdiff_t radius =
        static_cast<std::ptrdiff_t>((num_threads / 2.0) + 0.5);
    victim_threads_[num_thread].reserve(num_threads);

    std::size_t num_pu = get_pu_num(num_thread);
    mask_cref_type pu_mask =
        topology_.get_thread_affinity_mask(num_pu, numa_sensitive_ != 0);
    mask_cref_type numa_mask = numa_masks[num_thread];
    mask_cref_type core_mask = core_masks[num_thread];

    // we allow the thread on the boundary of the NUMA domain to steal
    mask_type first_mask = mask_type();
    resize(first_mask, mask_size(pu_mask));

    std::size_t first = find_first(numa_mask);
    if (first != std::size_t(-1))
        set(first_mask, first);
    else
        first_mask = pu_mask;

    auto iterate = [&](hpx::util::function_nonser<bool(std::size_t)> f)
    {
        // check our neighbors in a radial fashion (left and right
        // alternating, increasing distance each iteration)
        int i = 1;
        for (/**/; i < radius; ++i)
        {
            std::ptrdiff_t left =
                (static_cast<std::ptrdiff_t>(num_thread) - i) %
                    static_cast<std::ptrdiff_t>(num_threads);
            if (left < 0)
                left = num_threads + left;

            if (f(std::size_t(left)))
            {
                victim_threads_[num_thread].push_back(
                    static_cast<std::size_t>(left));
            }

            std::size_t right = (num_thread + i) % num_threads;
            if (f(right))
            {
                victim_threads_[num_thread].push_back(right);
            }
        }
        if ((num_threads % 2) == 0)
        {
            std::size_t right = (num_thread + i) % num_threads;
            if (f(right))
            {
                victim_threads_[num_thread].push_back(right);
            }
        }
    };

    // check for threads which share the same core...
    iterate(
        [&](std::size_t other_num_thread)
        {
            return any(core_mask & core_masks[other_num_thread]);
        }
    );

    // check for threads which share the same numa domain...
    iterate(
        [&](std::size_t other_num_thread)
        {
            return !any(core_mask & core_masks[other_num_thread]) &&
                any(numa_mask & numa_masks[other_num_thread]);
        }
    );

    // check for the rest and if we are numa aware
    if (numa_sensitive_ != 2 && any(first_mask & pu_mask))
    {
        iterate(
            [&](std::size_t other_num_thread)
            {
                return !any(numa_mask & numa_masks[other_num_thread]);
            }
        );
    }
}
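// The iterate() helper above visits potential steal victims radially: for
// each distance d = 1, 2, ... it first probes d positions to the left and
// then d positions to the right (modulo the number of worker threads), and
// for an even thread count it probes one extra position on the right so the
// thread directly opposite is still covered. Below is a self-contained sketch
// that only computes this visiting order, dropping the per-victim predicate
// (same core / same NUMA domain) used above; the function name is
// illustrative, not part of the scheduler's interface.

#include <cstddef>
#include <vector>

std::vector<std::size_t> radial_victim_order(
    std::size_t num_thread, std::size_t num_threads)
{
    std::vector<std::size_t> order;
    std::ptrdiff_t radius =
        static_cast<std::ptrdiff_t>((num_threads / 2.0) + 0.5);

    std::ptrdiff_t i = 1;
    for (/**/; i < radius; ++i)
    {
        std::ptrdiff_t left =
            (static_cast<std::ptrdiff_t>(num_thread) - i) %
                static_cast<std::ptrdiff_t>(num_threads);
        if (left < 0)
            left += static_cast<std::ptrdiff_t>(num_threads);
        order.push_back(static_cast<std::size_t>(left));

        order.push_back(
            (num_thread + static_cast<std::size_t>(i)) % num_threads);
    }

    if ((num_threads % 2) == 0)
    {
        order.push_back(
            (num_thread + static_cast<std::size_t>(i)) % num_threads);
    }
    return order;
}

// e.g. radial_victim_order(0, 4) yields {3, 1, 2}: the immediate left and
// right neighbors first, then the thread directly opposite.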