/*
 * Where is this process currently bound? (layout string)
 */
static int get_layout_current_binding(char str[OMPI_AFFINITY_STRING_MAX])
{
    int ret;
    hwloc_obj_t root;
    hwloc_cpuset_t boundset, rootset;
    bool bound = false;

    /* get our root object */
    root = hwloc_get_root_obj(opal_hwloc_topology);
    rootset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);

    /* get our bindings */
    boundset = hwloc_bitmap_alloc();
    if (hwloc_get_cpubind(opal_hwloc_topology, boundset,
                          HWLOC_CPUBIND_PROCESS) < 0) {
        /* we are NOT bound if get_cpubind fails, nor can we be bound -
           the environment does not support it */
        bound = false;
    } else {
        /* we are bound if the two cpusets are not equal, or if there
           is only ONE PU available to us */
        if (0 != hwloc_bitmap_compare(boundset, rootset) ||
            opal_hwloc_base_single_cpu(rootset) ||
            opal_hwloc_base_single_cpu(boundset)) {
            bound = true;
        }
    }

    /* If we are not bound, indicate that */
    if (!bound) {
        strncat(str, not_bound_str, OMPI_AFFINITY_STRING_MAX - 1);
        ret = OMPI_SUCCESS;
    }

    /* If we are bound, print it out */
    else {
        ret = opal_hwloc_base_cset2mapstr(str, OMPI_AFFINITY_STRING_MAX,
                                          opal_hwloc_topology, boundset);
        if (OPAL_ERR_NOT_BOUND == ret) {
            strncpy(str, not_bound_str, OMPI_AFFINITY_STRING_MAX - 1);
            ret = OMPI_SUCCESS;
        }
    }
    hwloc_bitmap_free(boundset);

    return ret;
}
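/*
 * Illustrative sketch (not from the source above): the same "am I bound?"
 * test using only raw hwloc 1.x calls - compare the process binding against
 * the machine's full cpuset. Standalone example code, not part of Open MPI.
 */
#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_cpuset_t bound;
    hwloc_obj_t root;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    root = hwloc_get_root_obj(topo);
    bound = hwloc_bitmap_alloc();

    if (hwloc_get_cpubind(topo, bound, HWLOC_CPUBIND_PROCESS) < 0) {
        /* get_cpubind failing means the environment cannot bind at all */
        printf("binding not supported\n");
    } else if (hwloc_bitmap_isequal(bound, root->cpuset)) {
        /* covering every PU on the machine is treated as "not bound" */
        printf("not bound\n");
    } else {
        char *str;
        hwloc_bitmap_list_asprintf(&str, bound);
        printf("bound to PUs %s\n", str);
        free(str);
    }

    hwloc_bitmap_free(bound);
    hwloc_topology_destroy(topo);
    return 0;
}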
static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
                                  int32_t my_smp_rank,
                                  int n)
{
    size_t length, length_payload;
    sm_fifo_t *my_fifos;
    int my_mem_node, num_mem_nodes, i, rc;
    mca_mpool_base_resources_t *res = NULL;
    mca_btl_sm_component_t* m = &mca_btl_sm_component;

    /* Assume we don't have hwloc support and fill in dummy info */
    mca_btl_sm_component.mem_node = my_mem_node = 0;
    mca_btl_sm_component.num_mem_nodes = num_mem_nodes = 1;

#if OPAL_HAVE_HWLOC
    /* If we have hwloc support, then get accurate information */
    if (NULL != opal_hwloc_topology) {
        i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
                                               HWLOC_OBJ_NODE, 0,
                                               OPAL_HWLOC_AVAILABLE);

        /* If we find >0 NUMA nodes, then investigate further */
        if (i > 0) {
            int numa=0, w;
            unsigned n_bound=0;
            hwloc_cpuset_t avail;
            hwloc_obj_t obj;

            /* JMS This tells me how many numa nodes are *available*,
               but it's not how many are being used *by this job*.
               Note that this is the value we've previously used (from
               the previous carto-based implementation), but it really
               should be improved to be how many NUMA nodes are being
               used *in this job*. */
            mca_btl_sm_component.num_mem_nodes = num_mem_nodes = i;

            /* if we are not bound, then there is nothing further to do */
            if (NULL != ompi_process_info.cpuset) {
                /* count the number of NUMA nodes to which we are bound */
                for (w=0; w < i; w++) {
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                                                       HWLOC_OBJ_NODE, 0, w,
                                                                       OPAL_HWLOC_AVAILABLE))) {
                        continue;
                    }
                    /* get that NUMA node's available cpus */
                    avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    /* see if we intersect */
                    if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
                        n_bound++;
                        numa = w;
                    }
                }
                /* if we are located on more than one NUMA, or we didn't find
                 * a NUMA we are on, then not much we can do
                 */
                if (1 == n_bound) {
                    mca_btl_sm_component.mem_node = my_mem_node = numa;
                } else {
                    mca_btl_sm_component.mem_node = my_mem_node = -1;
                }
            }
        }
    }
#endif

    if (NULL == (res = calloc(1, sizeof(*res)))) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* lookup shared memory pool */
    mca_btl_sm_component.sm_mpools =
        (mca_mpool_base_module_t **)calloc(num_mem_nodes,
                                           sizeof(mca_mpool_base_module_t *));

    /* Disable memory binding, because each MPI process will claim pages in the
     * mpool for their local NUMA node */
    res->mem_node = -1;

    if (OMPI_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
        free(res);
        return rc;
    }

    /* now that res is fully populated, create the thing */
    mca_btl_sm_component.sm_mpools[0] =
        mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name,
                                     sm_btl, res);

    /* Sanity check to ensure that we found it */
    if (NULL == mca_btl_sm_component.sm_mpools[0]) {
        free(res);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[0];

    mca_btl_sm_component.sm_mpool_base =
        mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]);

    /* create a list of peers */
    mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**)
        calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == mca_btl_sm_component.sm_peers) {
        free(res);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* remember that node rank zero is already attached */
    if (0 != my_smp_rank) {
        if (OMPI_SUCCESS != (rc = sm_segment_attach(m))) {
            free(res);
            return rc;
        }
    }

    /* it is now safe to free the mpool resources */
    free(res);

    /* check to make sure number of local procs is within the
     * specified limits */
    if (mca_btl_sm_component.sm_max_procs > 0 &&
        mca_btl_sm_component.num_smp_procs + n >
        mca_btl_sm_component.sm_max_procs) {
        return OMPI_ERROR;
    }

    mca_btl_sm_component.shm_fifo =
        (volatile sm_fifo_t **)mca_btl_sm_component.sm_seg->module_data_addr;
    mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n);
    mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n);

    /* set the base of the shared memory segment */
    mca_btl_sm_component.shm_bases[mca_btl_sm_component.my_smp_rank] =
        (char*)mca_btl_sm_component.sm_mpool_base;
    mca_btl_sm_component.shm_mem_nodes[mca_btl_sm_component.my_smp_rank] =
        (uint16_t)my_mem_node;

    /* initialize the array of fifo's "owned" by this process */
    if (NULL == (my_fifos = (sm_fifo_t*)mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t)))) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    /* cache the pointer to the 2d fifo array. These addresses
     * are valid in the current process space */
    mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);

    if (NULL == mca_btl_sm_component.fifo) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    mca_btl_sm_component.fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    mca_btl_sm_component.mem_nodes = (uint16_t *)malloc(sizeof(uint16_t) * n);
    if (NULL == mca_btl_sm_component.mem_nodes) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* initialize fragment descriptor free lists */

    /* allocation will be for the fragment descriptor and payload buffer */
    length = sizeof(mca_btl_sm_frag1_t);
    length_payload = sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.eager_limit;
    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_eager, length,
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag1_t),
                                length_payload, opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if (OMPI_SUCCESS != i) {
        return i;
    }

    length = sizeof(mca_btl_sm_frag2_t);
    length_payload = sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.max_frag_size;
    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_max, length,
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag2_t),
                                length_payload, opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if (OMPI_SUCCESS != i) {
        return i;
    }

    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_user,
                                sizeof(mca_btl_sm_user_t),
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_user_t),
                                sizeof(mca_btl_sm_hdr_t), opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if (OMPI_SUCCESS != i) {
        return i;
    }

    mca_btl_sm_component.num_outstanding_frags = 0;

    mca_btl_sm_component.num_pending_sends = 0;
    i = opal_free_list_init(&mca_btl_sm_component.pending_send_fl,
                            sizeof(btl_sm_pending_send_item_t),
                            OBJ_CLASS(opal_free_list_item_t),
                            16, -1, 32);
    if (OMPI_SUCCESS != i) {
        return i;
    }

    /* set flag indicating btl has been inited */
    sm_btl->btl_inited = true;

    return OMPI_SUCCESS;
}
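/*
 * Illustrative sketch (not from the source above): the NUMA test that
 * sm_btl_first_time_init performs - count how many NUMA nodes intersect the
 * current binding, and accept a single memory node only when exactly one
 * intersects. Plain hwloc 1.x calls; standalone example code only.
 */
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_cpuset_t bound;
    int i, n, n_bound = 0, numa = -1;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    bound = hwloc_bitmap_alloc();
    if (0 == hwloc_get_cpubind(topo, bound, HWLOC_CPUBIND_PROCESS)) {
        n = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NODE);
        for (i = 0; i < n; i++) {
            hwloc_obj_t obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_NODE, i);
            if (NULL != obj && hwloc_bitmap_intersects(obj->cpuset, bound)) {
                n_bound++;
                numa = i;
            }
        }
        /* exactly one intersecting NUMA node -> that is "our" mem_node;
         * zero or several -> fall back to -1, as the code above does */
        printf("mem_node = %d (%d NUMA node(s) intersected)\n",
               (1 == n_bound) ? numa : -1, n_bound);
    }

    hwloc_bitmap_free(bound);
    hwloc_topology_destroy(topo);
    return 0;
}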
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported",
                               true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported",
                                   true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal",
                                   true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology,
                                                               proc->locale,
                                                               OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)proc->locale->userdata;
            data->num_bound++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(proc->locale->type), idx);
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, proc->locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                               true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* error out if adding a proc would cause overload and that wasn't allowed,
             * and it wasn't a default binding policy (i.e., the user requested it)
             */
            if (ncpus < data->num_bound &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
                OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload",
                               true, opal_hwloc_base_print_binding(map->binding),
                               node->name, data->num_bound, ncpus);
                return ORTE_ERR_SILENT;
            }
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
            /* record the location */
            proc->bind_location = proc->locale;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                proc->cpu_bitmap,
                                hwloc_obj_type_string(proc->locale->type),
                                idx, node->name);
        }
    }

    return ORTE_SUCCESS;
}
static int bind_downwards(orte_job_t *jdata,
                          orte_node_t *node,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
    hwloc_cpuset_t cpus;
    unsigned int ncpus;
    opal_hwloc_obj_data_t *data;
    int total_cpus;
    hwloc_cpuset_t totalcpuset;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;
    totalcpuset = hwloc_bitmap_alloc();

    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* ignore procs that have already been bound - should
         * never happen, but safer
         */
        if (NULL != proc->cpu_bitmap) {
            continue;
        }
        /* we don't know if the target is a direct child of this locale,
         * or if it is some depth below it, so we have to conduct a bit
         * of a search. Let hwloc find the min usage one for us.
         */
        trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology,
                                                                  proc->locale,
                                                                  target, cache_level);
        if (NULL == trg_obj) {
            /* there aren't any such targets under this object */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                           true, node->name);
            hwloc_bitmap_free(totalcpuset);
            return ORTE_ERR_SILENT;
        }
        /* record the location */
        proc->bind_location = trg_obj;
        /* start with a clean slate */
        hwloc_bitmap_zero(totalcpuset);
        total_cpus = 0;
        nxt_obj = trg_obj;
        do {
            if (NULL == nxt_obj) {
                /* could not find enough cpus to meet request */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                               true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            trg_obj = nxt_obj;
            /* get the number of cpus under this location */
            ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s GOT %d CPUS",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
            /* track the number bound */
            if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
                data = OBJ_NEW(opal_hwloc_obj_data_t);
                trg_obj->userdata = data;
            }
            data->num_bound++;
            /* error out if adding a proc would cause overload and that wasn't allowed,
             * and it wasn't a default binding policy (i.e., the user requested it)
             */
            if (ncpus < data->num_bound &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload",
                                   true, opal_hwloc_base_print_binding(map->binding),
                                   node->name, data->num_bound, ncpus);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                } else {
                    /* if this is the default binding policy, then just don't
                     * bind this proc
                     */
                    data->num_bound--;  // maintain count
                    /* show the proc as not bound */
                    proc->bind_location = NULL;
                    hwloc_bitmap_zero(totalcpuset);
                    break;
                }
            }
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
            hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
            /* track total #cpus */
            total_cpus += ncpus;
            /* move to the next location, in case we need it */
            nxt_obj = trg_obj->next_cousin;
        } while (total_cpus < orte_rmaps_base.cpus_per_rank);
        hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, totalcpuset);
        if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
            char tmp1[1024], tmp2[1024];
            if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1),
                                                               node->topology, totalcpuset)) {
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s PROC %s ON %s IS NOT BOUND",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name);
            } else {
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology, totalcpuset);
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name,
                            tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(totalcpuset);

    return ORTE_SUCCESS;
}
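/*
 * Illustrative sketch (not from the source above): the cousin-walk that
 * bind_downwards uses for cpus-per-rank > 1 - OR successive same-depth
 * siblings (->next_cousin) into one cpuset until enough PUs are gathered.
 * The demand of 2 PUs is an arbitrary example value.
 */
#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_obj_t obj;
    hwloc_cpuset_t total;
    int want = 2, have = 0;
    char *str;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    total = hwloc_bitmap_alloc();
    hwloc_bitmap_zero(total);

    /* start at the first core and accumulate across cousins */
    for (obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, 0);
         NULL != obj && have < want;
         obj = obj->next_cousin) {
        hwloc_bitmap_or(total, total, obj->cpuset);
        have = hwloc_bitmap_weight(total);   /* PUs gathered so far */
    }

    hwloc_bitmap_list_asprintf(&str, total);
    printf("gathered %d PUs: %s\n", have, str);
    free(str);
    hwloc_bitmap_free(total);
    hwloc_topology_destroy(topo);
    return 0;
}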
static int bind_upwards(orte_job_t *jdata,
                        orte_node_t *node,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    opal_hwloc_obj_data_t *data;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* ignore procs that have already been bound - should
         * never happen, but safer
         */
        if (NULL != proc->cpu_bitmap) {
            continue;
        }
        /* bozo check */
        if (NULL == proc->locale) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BIND UPWARDS: LOCALE FOR PROC %s IS NULL",
                                ORTE_NAME_PRINT(&proc->name));
            return ORTE_ERR_SILENT;
        }
        /* starting at the locale, move up thru the parents
         * to find the target object type
         */
        for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind:upward target %s type %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                hwloc_obj_type_string(target),
                                hwloc_obj_type_string(obj->type));
            if (target == obj->type) {
                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                    continue;
                }
                /* get its index */
                if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj,
                                                                   OPAL_HWLOC_AVAILABLE))) {
                    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                    return ORTE_ERR_SILENT;
                }
                /* track the number bound */
                data = (opal_hwloc_obj_data_t*)obj->userdata;
                data->num_bound++;
                /* get the number of cpus under this location */
                if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                                   true, node->name);
                    return ORTE_ERR_SILENT;
                }
                /* error out if adding a proc would cause overload and that wasn't allowed,
                 * and it wasn't a default binding policy (i.e., the user requested it)
                 */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
                    OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload",
                                   true, opal_hwloc_base_print_binding(map->binding),
                                   node->name, data->num_bound, ncpus);
                    return ORTE_ERR_SILENT;
                }
                /* bind it here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
                /* record the location */
                proc->bind_location = obj;
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name),
                                    proc->cpu_bitmap,
                                    hwloc_obj_type_string(target),
                                    idx, node->name);
                break;
            }
        }
        if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
            /* didn't find anyone to bind to - this is an error
             * unless the user specified if-supported
             */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found",
                           true, opal_hwloc_base_print_binding(map->binding),
                           node->name);
            return ORTE_ERR_SILENT;
        }
    }

    return ORTE_SUCCESS;
}
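/*
 * Illustrative sketch (not from the source above): the upward search pattern
 * used by bind_upwards - start at a leaf object and climb ->parent links
 * until the target type is reached. hwloc 1.x naming (HWLOC_OBJ_SOCKET),
 * matching the surrounding code; example code only.
 */
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_obj_t obj;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    /* treat PU 0 as the "locale", then look upward for its socket */
    obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, 0);
    for (obj = (NULL == obj) ? NULL : obj->parent; NULL != obj; obj = obj->parent) {
        if (HWLOC_OBJ_SOCKET == obj->type) {
            printf("PU 0 sits in %s #%u\n",
                   hwloc_obj_type_string(obj->type), obj->logical_index);
            break;
        }
    }

    hwloc_topology_destroy(topo);
    return 0;
}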
/* recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr
 */
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;
    int nprocs;
    hwloc_cpuset_t avail, cpus, childcpus;
    int n, limit, nmax, nunder, idx, idxmax = 0;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: pruning level %d", *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        --(*level);
        prune(jobid, app_idx, node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        /* get the available cpuset */
        avail = opal_hwloc_base_get_available_cpus(node->topology, obj);

        /* look at the intersection of this object's cpuset and that
         * of each proc in the job/app - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            if (proc->name.jobid != jobid ||
                proc->app_idx != app_idx) {
                continue;
            }
            locale = NULL;
            /* note: error out only if the locale attribute is NOT found */
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                                    (void**)&locale, OPAL_PTR)) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return;
            }
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            if (hwloc_bitmap_intersects(avail, cpus)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this a starting point
             */

            /* find the first level that has more than
             * one child beneath it - if all levels
             * have only one child, then return this
             * object
             */
            top = find_split(node->topology, obj);
            hwloc_obj_type_snprintf(dang, 64, top, 1);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                childcpus = opal_hwloc_base_get_available_cpus(node->topology, top->children[k]);
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    if (proc->name.jobid != jobid ||
                        proc->app_idx != app_idx) {
                        continue;
                    }
                    locale = NULL;
                    /* note: error out only if the locale attribute is NOT found */
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                                            (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        return;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
                    if (hwloc_bitmap_intersects(childcpus, cpus)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            node->slots_inuse--;
            if (node->slots_inuse < 0) {
                node->slots_inuse = 0;
            }
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }

    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    --(*level);
    prune(jobid, app_idx, node, level, nmapped);
    return;

 error:
    opal_output(0, "INFINITE LOOP");
}
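/*
 * Illustrative sketch (not from the source above): one plausible reading of
 * the find_split() step in prune() - descend through levels that have only a
 * single child until an object with real fan-out (arity > 1) is reached, so
 * procs can then be removed round-robin across its children. The actual
 * find_split() is not shown in this section, so this helper is an assumption.
 */
#include <hwloc.h>
#include <stdio.h>

static hwloc_obj_t find_split_like(hwloc_obj_t obj)
{
    /* follow lone children downward; stop at the first fan-out (or a leaf) */
    while (1 == obj->arity) {
        obj = obj->children[0];
    }
    return obj;
}

int main(void)
{
    hwloc_topology_t topo;
    hwloc_obj_t top;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    top = find_split_like(hwloc_get_root_obj(topo));
    printf("first split below root: %s (arity %u)\n",
           hwloc_obj_type_string(top->type), top->arity);

    hwloc_topology_destroy(topo);
    return 0;
}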
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale, sib;
    char *cpu_bitmap;
    bool found;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported",
                               true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported",
                                   true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal",
                                   true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* bozo check */
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                                    (void**)&locale, OPAL_PTR)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale",
                               true, ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, locale,
                                                               OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            data = (opal_hwloc_obj_data_t*)locale->userdata;
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                               true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* if we don't have enough cpus to support this additional proc, try
             * shifting the location to a cousin that can support it - the important
             * thing is that we maintain the same level in the topology */
            if (ncpus < (data->num_bound+1)) {
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s bind_in_place: searching right",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                sib = locale;
                found = false;
                while (NULL != (sib = sib->next_cousin)) {
                    data = (opal_hwloc_obj_data_t*)sib->userdata;
                    ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                    if (data->num_bound < ncpus) {
                        found = true;
                        locale = sib;
                        break;
                    }
                }
                if (!found) {
                    /* try the other direction */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s bind_in_place: searching left",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    sib = locale;
                    while (NULL != (sib = sib->prev_cousin)) {
                        data = (opal_hwloc_obj_data_t*)sib->userdata;
                        ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                        if (data->num_bound < ncpus) {
                            found = true;
                            locale = sib;
                            break;
                        }
                    }
                }
                if (!found) {
                    /* no place to put this - see if overload is allowed */
                    if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                            /* if the user specified a binding policy, then we cannot meet
                             * it since overload isn't allowed, so error out - have the
                             * message indicate that setting overload allowed will remove
                             * this restriction */
                            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload",
                                           true, opal_hwloc_base_print_binding(map->binding),
                                           node->name, data->num_bound, ncpus);
                            return ORTE_ERR_SILENT;
                        } else {
                            /* if we have the default binding policy, then just don't bind */
                            OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                            unbind_procs(jdata);
                            return ORTE_SUCCESS;
                        }
                    }
                }
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)locale->userdata;  // just in case it changed
            data->num_bound++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(locale->type), idx);
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
            orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP,
                               ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
            /* update the location, in case it changed */
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND,
                               ORTE_ATTR_LOCAL, locale, OPAL_PTR);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                cpu_bitmap,
                                hwloc_obj_type_string(locale->type),
                                idx, node->name);
            if (NULL != cpu_bitmap) {
                free(cpu_bitmap);
            }
        }
    }

    return ORTE_SUCCESS;
}
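/*
 * Illustrative sketch (not from the source above): the consumer side of the
 * ORTE_PROC_CPU_BITMAP string set above - parse the list form back into a
 * bitmap and apply it with hwloc. The "0-1" value is a hypothetical example;
 * the real string comes from the map.
 */
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_cpuset_t cpus;
    const char *cpu_bitmap = "0-1";   /* example value */

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    cpus = hwloc_bitmap_alloc();
    if (0 == hwloc_bitmap_list_sscanf(cpus, cpu_bitmap) &&
        0 == hwloc_set_cpubind(topo, cpus, HWLOC_CPUBIND_PROCESS)) {
        printf("bound to %s\n", cpu_bitmap);
    } else {
        printf("could not bind to %s\n", cpu_bitmap);
    }

    hwloc_bitmap_free(cpus);
    hwloc_topology_destroy(topo);
    return 0;
}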
static int bind_upwards(orte_job_t *jdata,
                        orte_node_t *node,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale;
    char *cpu_bitmap;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* bozo check */
        if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                                (void**)&locale, OPAL_PTR)) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale",
                           true, ORTE_NAME_PRINT(&proc->name));
            return ORTE_ERR_SILENT;
        }
        /* starting at the locale, move up thru the parents
         * to find the target object type
         */
        cpu_bitmap = NULL;
        for (obj = locale->parent; NULL != obj; obj = obj->parent) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind:upward target %s type %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                hwloc_obj_type_string(target),
                                hwloc_obj_type_string(obj->type));
            if (target == obj->type) {
                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                    continue;
                }
                /* get its index */
                if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj,
                                                                   OPAL_HWLOC_AVAILABLE))) {
                    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                    return ORTE_ERR_SILENT;
                }
                /* track the number bound */
                data = (opal_hwloc_obj_data_t*)obj->userdata;
                data->num_bound++;
                /* get the number of cpus under this location */
                if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                                   true, node->name);
                    return ORTE_ERR_SILENT;
                }
                /* error out if adding a proc would cause overload and that wasn't allowed,
                 * and it wasn't a default binding policy (i.e., the user requested it)
                 */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                    if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                        /* if the user specified a binding policy, then we cannot meet
                         * it since overload isn't allowed, so error out - have the
                         * message indicate that setting overload allowed will remove
                         * this restriction */
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload",
                                       true, opal_hwloc_base_print_binding(map->binding),
                                       node->name, data->num_bound, ncpus);
                        return ORTE_ERR_SILENT;
                    } else {
                        /* if we have the default binding policy, then just don't bind */
                        OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                        unbind_procs(jdata);
                        return ORTE_SUCCESS;
                    }
                }
                /* bind it here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
                orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP,
                                   ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
                /* record the location */
                orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND,
                                   ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name),
                                    cpu_bitmap,
                                    hwloc_obj_type_string(target),
                                    idx, node->name);
                break;
            }
        }
        if (NULL == cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
            /* didn't find anyone to bind to - this is an error
             * unless the user specified if-supported
             */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found",
                           true, opal_hwloc_base_print_binding(map->binding),
                           node->name);
            return ORTE_ERR_SILENT;
        }
        if (NULL != cpu_bitmap) {
            free(cpu_bitmap);
        }
    }

    return ORTE_SUCCESS;
}
static int bind_upwards(orte_job_t *jdata,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported",
                               true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported",
                                   true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal",
                                   true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* clear the topology of any prior usage numbers */
        opal_hwloc_base_clear_usage(node->topology);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* bozo check */
            if (NULL == proc->locale) {
                opal_output(0, "BIND UPWARDS: LOCALE FOR PROC %s IS NULL",
                            ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* starting at the locale, move up thru the parents
             * to find the target object type
             */
            for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) {
                opal_output(0, "%s bind:upward target %s type %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            hwloc_obj_type_string(target),
                            hwloc_obj_type_string(obj->type));
                if (target == obj->type) {
                    if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                        continue;
                    }
                    /* get its index */
                    if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj,
                                                                       OPAL_HWLOC_AVAILABLE))) {
                        return ORTE_ERR_SILENT;
                    }
                    /* track the number bound */
                    data = (opal_hwloc_obj_data_t*)obj->userdata;
                    data->num_bound++;
                    /* get the number of cpus under this location */
                    if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                                       true, node->name);
                        return ORTE_ERR_SILENT;
                    }
                    /* error out if adding a proc would cause overload and that wasn't allowed */
                    if (ncpus < data->num_bound &&
                        !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload",
                                       true, opal_hwloc_base_print_binding(map->binding),
                                       node->name, data->num_bound, ncpus);
                        return ORTE_ERR_SILENT;
                    }
                    /* bind it here */
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                    hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&proc->name),
                                        proc->cpu_bitmap,
                                        hwloc_obj_type_string(target),
                                        idx, node->name);
                    break;
                }
            }
            if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
                /* didn't find anyone to bind to - this is an error
                 * unless the user specified if-supported
                 */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found",
                               true, opal_hwloc_base_print_binding(map->binding),
                               node->name);
                return ORTE_ERR_SILENT;
            }
        }
    }

    return ORTE_SUCCESS;
}
static int bind_downwards(orte_job_t *jdata,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
    hwloc_cpuset_t cpus;
    unsigned int ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    int total_cpus;
    hwloc_cpuset_t totalcpuset;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;
    totalcpuset = hwloc_bitmap_alloc();

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported",
                               true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported",
                                   true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal",
                                   true, node->name);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* clear the topology of any prior usage numbers */
        opal_hwloc_base_clear_usage(node->topology);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* we don't know if the target is a direct child of this locale,
             * or if it is some depth below it, so we have to conduct a bit
             * of a search. Let hwloc find the min usage one for us.
             */
            trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology,
                                                                      proc->locale,
                                                                      target, cache_level);
            if (NULL == trg_obj) {
                /* there aren't any such targets under this object */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                               true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            /* start with a clean slate */
            hwloc_bitmap_zero(totalcpuset);
            total_cpus = 0;
            nxt_obj = trg_obj;
            do {
                if (NULL == nxt_obj) {
                    /* could not find enough cpus to meet request */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus",
                                   true, node->name);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
                trg_obj = nxt_obj;
                /* get the number of cpus under this location */
                ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s GOT %d CPUS",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
                /* track the number bound */
                if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
                    data = OBJ_NEW(opal_hwloc_obj_data_t);
                    trg_obj->userdata = data;
                }
                data->num_bound++;
                /* error out if adding a proc would cause overload and that wasn't allowed */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload",
                                   true, opal_hwloc_base_print_binding(map->binding),
                                   node->name, data->num_bound, ncpus);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
                /* bind the proc here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
                hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
                total_cpus += ncpus;
                /* move to the next location, in case we need it */
                nxt_obj = trg_obj->next_cousin;
            } while (total_cpus < orte_rmaps_base.cpus_per_rank);
            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, totalcpuset);
            if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
                char tmp1[1024], tmp2[1024];
                opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), totalcpuset);
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), totalcpuset);
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name,
                            tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(totalcpuset);

    return ORTE_SUCCESS;
}
int orte_ess_base_proc_binding(void)
{
#if OPAL_HAVE_HWLOC
    hwloc_obj_t node, obj;
    hwloc_cpuset_t cpus, nodeset;
    hwloc_obj_type_t target;
    unsigned int cache_level = 0;
    struct hwloc_topology_support *support;
    char *map;
    int ret;
    char *error;

    /* Determine if we were pre-bound or not */
    if (NULL != getenv("OMPI_MCA_orte_bound_at_launch")) {
        orte_proc_is_bound = true;
        if (NULL != (map = getenv("OMPI_MCA_orte_base_applied_binding"))) {
            orte_proc_applied_binding = hwloc_bitmap_alloc();
            if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) {
                error = "applied_binding parse";
                goto error;
            }
        }
    }

    /* see if we were bound when launched */
    if (!orte_proc_is_bound) {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Not bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* we were not bound at launch */
        if (NULL != opal_hwloc_topology) {
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
            /* get our node object */
            node = hwloc_get_root_obj(opal_hwloc_topology);
            nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node);
            /* get our bindings */
            cpus = hwloc_bitmap_alloc();
            if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) {
                /* we are NOT bound if get_cpubind fails, nor can we be bound - the
                 * environment does not support it
                 */
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Binding not supported",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto MOVEON;
            }
            /* we are bound if the two cpusets are not equal,
             * or if there is only ONE cpu available to us
             */
            if (0 != hwloc_bitmap_compare(cpus, nodeset) ||
                opal_hwloc_base_single_cpu(nodeset) ||
                opal_hwloc_base_single_cpu(cpus)) {
                /* someone external set it - indicate it is set
                 * so that we know
                 */
                orte_proc_is_bound = true;
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Process was externally bound",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            } else if (support->cpubind->set_thisproc_cpubind &&
                       OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                       OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                /* the system is capable of doing processor affinity, but it
                 * has not yet been set - see if a slot_list was given
                 */
                hwloc_bitmap_zero(cpus);
                if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
                                                                               opal_hwloc_topology,
                                                                               cpus))) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    /* try to find a level and index for this location */
                    opal_hwloc_base_get_level_and_index(cpus,
                                                        &orte_process_info.bind_level,
                                                        &orte_process_info.bind_idx);
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    orte_proc_is_bound = true;
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                         "%s Process bound according to slot_list",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else {
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    /* get the node rank */
                    if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) {
                        /* this is not an error - could be due to being
                         * direct launched - so just ignore and leave
                         * us unbound
                         */
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process not bound - no node rank available",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                        goto MOVEON;
                    }
                    /* if the binding policy is hwthread, then we bind to the nrank-th
                     * hwthread on this node
                     */
                    if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU, 0,
                                                                           orte_process_info.my_node_rank,
                                                                           OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting hwthread object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to hwthread",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        /* if the binding policy is core, then we bind to the nrank-th
                         * core on this node
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0,
                                                                           orte_process_info.my_node_rank,
                                                                           OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            error = "Setting processor affinity failed";
                            ret = ORTE_ERROR;
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to core",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else {
                        /* for all higher binding policies, we bind to the specified
                         * object that the nrank-th core belongs to
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0,
                                                                           orte_process_info.my_node_rank,
                                                                           OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 1;
                            orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 2;
                            orte_process_info.bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 3;
                            orte_process_info.bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_SOCKET;
                            orte_process_info.bind_level = OPAL_HWLOC_SOCKET_LEVEL;
                        } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_NODE;
                            orte_process_info.bind_level = OPAL_HWLOC_NUMA_LEVEL;
                        } else {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Binding policy not known";
                            goto error;
                        }
                        for (obj = obj->parent; NULL != obj; obj = obj->parent) {
                            if (target == obj->type) {
                                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                                    continue;
                                }
                                /* this is the place! */
                                cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                                if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                    ret = ORTE_ERROR;
                                    error = "Setting processor affinity failed";
                                    goto error;
                                }
                                orte_process_info.bind_idx =
                                    opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
                                                                obj, OPAL_HWLOC_LOGICAL);
                                orte_proc_is_bound = true;
                                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                                     "%s Process bound to %s",
                                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                     opal_hwloc_base_print_level(orte_process_info.bind_level)));
                                break;
                            }
                        }
                        if (!orte_proc_is_bound) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                    }
                }
            }
        }
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Process bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

 MOVEON:
    /* get or update our local cpuset - it will get used multiple
     * times, so it's more efficient to keep a global copy
     */
    opal_hwloc_base_get_local_cpuset();

    /* report bindings, if requested */
    if (opal_hwloc_report_bindings) {
        char bindings[64];
        hwloc_obj_t root;
        hwloc_cpuset_t cpus;
        /* get the root object for this node */
        root = hwloc_get_root_obj(opal_hwloc_topology);
        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
        /* we are not bound if this equals our cpuset */
        if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
            opal_output(0, "%s is not bound",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        } else {
            hwloc_bitmap_list_snprintf(bindings, 64, opal_hwloc_my_cpuset);
            opal_output(0, "%s is bound to cpus %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        bindings);
        }
    }

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    return ORTE_ERR_SILENT;

#else
    return ORTE_SUCCESS;
#endif
}
int orte_ess_base_proc_binding(void) { hwloc_obj_t node, obj; hwloc_cpuset_t cpus, nodeset; hwloc_obj_type_t target; unsigned int cache_level = 0; struct hwloc_topology_support *support; char *map; int ret; char *error=NULL; hwloc_cpuset_t mycpus; /* Determine if we were pre-bound or not */ if (NULL != getenv(OPAL_MCA_PREFIX"orte_bound_at_launch")) { orte_proc_is_bound = true; if (NULL != (map = getenv(OPAL_MCA_PREFIX"orte_base_applied_binding"))) { orte_proc_applied_binding = hwloc_bitmap_alloc(); if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) { error = "applied_binding parse"; goto error; } } } /* see if we were bound when launched */ if (!orte_proc_is_bound) { OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Not bound at launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* we were not bound at launch */ if (NULL == opal_hwloc_topology) { /* there is nothing we can do, so just return */ return ORTE_SUCCESS; } support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology); /* get our node object */ node = hwloc_get_root_obj(opal_hwloc_topology); nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node); /* get our bindings */ cpus = hwloc_bitmap_alloc(); if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) { /* we are NOT bound if get_cpubind fails, nor can we be bound - the * environment does not support it */ hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Binding not supported", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* we are bound if the two cpusets are not equal, * or if there is only ONE cpu available to us */ if (0 != hwloc_bitmap_compare(cpus, nodeset) || opal_hwloc_base_single_cpu(nodeset) || opal_hwloc_base_single_cpu(cpus)) { /* someone external set it - indicate it is set * so that we know */ orte_proc_is_bound = true; hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process was externally bound", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (support->cpubind->set_thisproc_cpubind && OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { /* the system is capable of doing processor affinity, but it * has not yet been set - see if a slot_list was given */ hwloc_bitmap_zero(cpus); if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list, opal_hwloc_topology, OPAL_HWLOC_LOGICAL, cpus))) { error = "Setting processor affinity failed"; hwloc_bitmap_free(cpus); goto error; } if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { error = "Setting processor affinity failed"; hwloc_bitmap_free(cpus); goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); hwloc_bitmap_free(cpus); orte_proc_is_bound = true; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound according to slot_list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* cleanup */ hwloc_bitmap_free(cpus); /* get the node rank */ if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) { /* this is not an error - could be due to being * direct launched - so just ignore and leave * us unbound */ OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process not bound - no node rank available", 
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    goto MOVEON;
                }
                /* if the binding policy is hwthread, then we bind to the nrank-th
                 * hwthread on this node
                 */
                if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU,
                                                                       0, orte_process_info.my_node_rank,
                                                                       OPAL_HWLOC_LOGICAL))) {
                        ret = ORTE_ERR_NOT_FOUND;
                        error = "Getting hwthread object";
                        goto error;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        ret = ORTE_ERROR;
                        error = "Setting processor affinity failed";
                        goto error;
                    }
                    hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus);
                    /* do NOT free cpus here - get_available_cpus() returns the
                     * cpuset cached on the object, not a copy; the core and
                     * higher-level branches below correctly leave it alone */
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                         "%s Process bound to hwthread",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    /* if the binding policy is core, then we bind to the nrank-th
                     * core on this node
                     */
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                       0, orte_process_info.my_node_rank,
                                                                       OPAL_HWLOC_LOGICAL))) {
                        ret = ORTE_ERR_NOT_FOUND;
                        error = "Getting core object";
                        goto error;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        ret = ORTE_ERROR;
                        error = "Setting processor affinity failed";
                        goto error;
                    }
                    hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus);
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                         "%s Process bound to core",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else {
                    /* for all higher binding policies, we bind to the specified
                     * object that the nrank-th core belongs to
                     */
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                       0, orte_process_info.my_node_rank,
                                                                       OPAL_HWLOC_LOGICAL))) {
                        ret = ORTE_ERR_NOT_FOUND;
                        error = "Getting core object";
                        goto error;
                    }
                    if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_CACHE;
                        cache_level = 1;
                    } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_CACHE;
                        cache_level = 2;
                    } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_CACHE;
                        cache_level = 3;
                    } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_SOCKET;
                    } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_NODE;
                    } else {
                        ret = ORTE_ERR_NOT_FOUND;
                        error = "Binding policy not known";
                        goto error;
                    }
                    /* walk up the topology tree until we find the enclosing
                     * object of the target type (and, for caches, depth) */
                    for (obj = obj->parent; NULL != obj; obj = obj->parent) {
                        if (target == obj->type) {
                            if (HWLOC_OBJ_CACHE == target &&
                                cache_level != obj->attr->cache.depth) {
                                continue;
                            }
                            /* this is the place!
                             */
                            cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                            if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                ret = ORTE_ERROR;
                                error = "Setting processor affinity failed";
                                goto error;
                            }
                            hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus);
                            orte_proc_is_bound = true;
                            OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                                 "%s Process bound to %s",
                                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                 hwloc_obj_type_string(target)));
                            break;
                        }
                    }
                    if (!orte_proc_is_bound) {
                        ret = ORTE_ERROR;
                        error = "Setting processor affinity failed";
                        goto error;
                    }
                }
            }
        } else {
            /* not externally bound, and binding is unsupported or was not
             * requested - release the scratch bitmap so it does not leak */
            hwloc_bitmap_free(cpus);
        }
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                             "%s Process bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

MOVEON:
    /* get or update our local cpuset - it will get used multiple
     * times, so it's more efficient to keep a global copy
     */
    opal_hwloc_base_get_local_cpuset();

    /* get the cpus we are bound to */
    mycpus = hwloc_bitmap_alloc();
    if (hwloc_get_cpubind(opal_hwloc_topology, mycpus, HWLOC_CPUBIND_PROCESS) < 0) {
        if (NULL != orte_process_info.cpuset) {
            free(orte_process_info.cpuset);
            orte_process_info.cpuset = NULL;
        }
        if (opal_hwloc_report_bindings ||
            4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            opal_output(0, "MCW rank %d is not bound", ORTE_PROC_MY_NAME->vpid);
        }
    } else {
        /* store/update the string representation of our local binding */
        if (NULL != orte_process_info.cpuset) {
            free(orte_process_info.cpuset);
            orte_process_info.cpuset = NULL;
        }
        hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, mycpus);
        /* report the binding, if requested */
        if (opal_hwloc_report_bindings ||
            4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            char tmp1[1024], tmp2[1024];
            if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1),
                                                               opal_hwloc_topology, mycpus)) {
                opal_output(0, "MCW rank %d is not bound (or bound to all available processors)",
                            ORTE_PROC_MY_NAME->vpid);
            } else {
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), opal_hwloc_topology, mycpus);
                opal_output(0, "MCW rank %d bound to %s: %s",
                            ORTE_PROC_MY_NAME->vpid, tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(mycpus);

    /* push our cpuset so others can calculate our locality */
    if (NULL != orte_process_info.cpuset) {
        OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_CPUSET,
                              orte_process_info.cpuset, OPAL_STRING);
    }

    return ORTE_SUCCESS;

error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    return ORTE_ERR_SILENT;
}
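
/*
 * Another standalone sketch (again not from the ORTE sources): the "higher
 * binding policies" branch above walks obj->parent until it reaches the
 * enclosing socket/cache/NUMA object; hwloc provides the documented helper
 * hwloc_get_ancestor_obj_by_type() for exactly that walk. Binding to the
 * socket containing core 0 is an arbitrary illustration of the same idea.
 */
#include <stdio.h>
#include <stdlib.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_obj_t core, sock;
    char *str = NULL;

    if (0 != hwloc_topology_init(&topo) || 0 != hwloc_topology_load(topo)) {
        fprintf(stderr, "topology setup failed\n");
        return 1;
    }
    core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, 0);
    /* equivalent of the for (obj = obj->parent; ...) loop above */
    sock = (NULL == core) ? NULL
        : hwloc_get_ancestor_obj_by_type(topo, HWLOC_OBJ_SOCKET, core);
    if (NULL == sock ||
        0 > hwloc_set_cpubind(topo, sock->cpuset, HWLOC_CPUBIND_PROCESS)) {
        fprintf(stderr, "could not bind to the containing socket\n");
        hwloc_topology_destroy(topo);
        return 1;
    }
    hwloc_bitmap_list_asprintf(&str, sock->cpuset);
    printf("bound to socket cpus %s\n", str);
    free(str);
    hwloc_topology_destroy(topo);
    return 0;
}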