/*
 * Compute the process-to-resource bindings for a job.
 *
 * Translates the job's binding and mapping policies into hwloc object
 * types, then either binds in place (bind level == map level), binds
 * upwards (bind level is above the map level in the hwloc tree), or
 * binds downwards (round-robin assignment of procs to objects below
 * the map level).  The up/down decision is made per node, since nodes
 * may have different topologies with different relative depths.
 *
 * @param jdata  the job whose procs are to be bound (must have a map)
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM for an unrecognized policy,
 *         ORTE_ERR_SILENT when a help message has already been shown,
 *         or the error code propagated from a bind helper.
 */
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
{
    hwloc_obj_type_t hwb, hwm;    /* hwloc types for the bind and map levels */
    unsigned clvl = 0, clvm = 0;  /* cache levels - only meaningful when the
                                   * corresponding type is HWLOC_OBJ_CACHE */
    opal_binding_policy_t bind;
    orte_mapping_policy_t map;
    orte_node_t *node;
    int i, rc;
    struct hwloc_topology_support *support;
    bool force_down = false;
    hwloc_cpuset_t totalcpuset;
    int bind_depth, map_depth;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: compute bindings for job %s with policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));

    map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping);
    bind = OPAL_GET_BINDING_POLICY(jdata->map->binding);

    if (ORTE_MAPPING_BYUSER == map) {
        /* user specified binding by rankfile - nothing for us to do */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_CPUSET == bind) {
        /* cpuset was given - setup the bindings */
        if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (OPAL_BIND_TO_NONE == bind) {
        /* no binding requested */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_BOARD == bind) {
        /* doesn't do anything at this time */
        return ORTE_SUCCESS;
    }

    /* binding requested - convert the binding level to the hwloc obj type */
    switch (bind) {
    case OPAL_BIND_TO_NUMA:
        hwb = HWLOC_OBJ_NODE;
        break;
    case OPAL_BIND_TO_SOCKET:
        hwb = HWLOC_OBJ_SOCKET;
        break;
    case OPAL_BIND_TO_L3CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 3;
        break;
    case OPAL_BIND_TO_L2CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 2;
        break;
    case OPAL_BIND_TO_L1CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 1;
        break;
    case OPAL_BIND_TO_CORE:
        hwb = HWLOC_OBJ_CORE;
        break;
    case OPAL_BIND_TO_HWTHREAD:
        hwb = HWLOC_OBJ_PU;
        break;
    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* do the same for the mapping policy */
    switch (map) {
    case ORTE_MAPPING_BYNODE:
    case ORTE_MAPPING_BYSLOT:
    case ORTE_MAPPING_SEQ:
        hwm = HWLOC_OBJ_MACHINE;
        break;
    case ORTE_MAPPING_BYDIST:
    case ORTE_MAPPING_BYNUMA:
        hwm = HWLOC_OBJ_NODE;
        break;
    case ORTE_MAPPING_BYSOCKET:
        hwm = HWLOC_OBJ_SOCKET;
        break;
    case ORTE_MAPPING_BYL3CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 3;
        break;
    case ORTE_MAPPING_BYL2CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 2;
        break;
    case ORTE_MAPPING_BYL1CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 1;
        break;
    case ORTE_MAPPING_BYCORE:
        hwm = HWLOC_OBJ_CORE;
        break;
    case ORTE_MAPPING_BYHWTHREAD:
        hwm = HWLOC_OBJ_PU;
        break;
    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* if the job was mapped by the corresponding target, then
     * we bind in place
     *
     * otherwise, we have to bind either up or down the hwloc
     * tree. If we are binding upwards (e.g., mapped to hwthread
     * but binding to core), then we just climb the tree to find
     * the first matching object.
     *
     * if we are binding downwards (e.g., mapped to node and bind
     * to core), then we have to do a round-robin assigment of
     * procs to the resources below.
     */
    if (ORTE_MAPPING_BYDIST == map) {
        rc = ORTE_SUCCESS;
        if (OPAL_BIND_TO_NUMA == bind) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - dist to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else if (OPAL_BIND_TO_NUMA < bind) {
            /* bind every proc downwards */
            force_down = true;
            goto execute;
        }
        /* if the binding policy is less than numa, then we are unbound - so
         * just ignore this and return (should have been caught in prior
         * tests anyway as only options meeting that criteria are "none"
         * and "board")
         */
        return rc;
    }

    /* now deal with the remaining binding policies based on hardware */
    if (bind == map) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps: bindings for job %s - bind in place",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    /* we need to handle the remaining binding options on a per-node
     * basis because different nodes could potentially have different
     * topologies, with different relative depths for the two levels
     */
 execute:
    /* initialize - freed at cleanup on EVERY exit path below, so the
     * bitmap cannot leak regardless of which error (or success) path
     * we take out of the loop */
    totalcpuset = hwloc_bitmap_alloc();
    rc = ORTE_SUCCESS;

    for (i = 0; i < jdata->map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(jdata->map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported",
                                   true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal",
                                   true, node->name);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        if (force_down) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        } else {
            /* determine the relative depth on this node */
            if (HWLOC_OBJ_CACHE == hwb) {
                /* must use a unique function because blasted hwloc
                 * just doesn't deal with caches very well...sigh
                 */
                bind_depth = hwloc_get_cache_type_depth(node->topology, clvl, -1);
            } else {
                bind_depth = hwloc_get_type_depth(node->topology, hwb);
            }
            if (0 > bind_depth) {
                /* didn't find such an object */
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
                               true, hwloc_obj_type_string(hwb), node->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
            if (HWLOC_OBJ_CACHE == hwm) {
                /* must use a unique function because blasted hwloc
                 * just doesn't deal with caches very well...sigh
                 */
                map_depth = hwloc_get_cache_type_depth(node->topology, clvm, -1);
            } else {
                map_depth = hwloc_get_type_depth(node->topology, hwm);
            }
            if (0 > map_depth) {
                /* didn't find such an object */
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
                               true, hwloc_obj_type_string(hwm), node->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind_depth: %d map_depth %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                bind_depth, map_depth);
            /* deeper numeric depth == lower in the tree, so a bind level
             * deeper than the map level means we assign downwards */
            if (bind_depth > map_depth) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
            } else {
                if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
            }
        }
    }
    rc = ORTE_SUCCESS;

 cleanup:
    hwloc_bitmap_free(totalcpuset);
    return rc;
}
/*
 * Compute process-to-resource bindings for the given job by dispatching
 * on the requested binding policy, pairing each binding level against
 * the mapping level to decide between bind-in-place, bind-downwards,
 * and bind-upwards.
 *
 * Returns ORTE_SUCCESS, an error code propagated from a bind helper,
 * ORTE_ERR_BAD_PARAM is not produced here, and ORTE_ERR_NOT_SUPPORTED
 * for a binding policy with no handler below.
 *
 * NOTE(review): another definition of this same function appears earlier
 * in this file; two external definitions of one symbol in a single
 * translation unit will not compile - confirm which variant is intended
 * to survive.
 */
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
{
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: compute bindings for job %s with policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));

    /* NOTE(review): this reads the framework-global orte_rmaps_base.mapping,
     * while every other mapping-policy check below reads
     * jdata->map->mapping - confirm the asymmetry is intentional */
    if (ORTE_MAPPING_BYUSER == ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) {
        /* user specified binding by rankfile - nothing for us to do */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* cpuset was given - setup the bindings */
        if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) ||
        OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        /* no binding requested */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_BOARD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        /* doesn't do anything at this time */
        return ORTE_SUCCESS;
    }

    /* binding requested */
    /* if the job was mapped by the corresponding target, then
     * we bind in place
     *
     * otherwise, we have to bind either up or down the hwloc
     * tree. If we are binding upwards (e.g., mapped to hwthread
     * but binding to core), then we just climb the tree to find
     * the first matching object.
     *
     * if we are binding downwards (e.g., mapped to node and bind
     * to core), then we have to do a round-robin assigment of
     * procs to the resources below.
     */
    if (ORTE_MAPPING_BYDIST == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        int rc = ORTE_SUCCESS;
        if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - dist to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else if (OPAL_BIND_TO_NUMA < OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
            /* binding target is below the numa level - bind each proc
             * downwards at the requested hwloc object type/cache level */
            if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
                    ORTE_ERROR_LOG(rc);
                }
            } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
                    ORTE_ERROR_LOG(rc);
                }
            } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
                    ORTE_ERROR_LOG(rc);
                }
            } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
                    ORTE_ERROR_LOG(rc);
                }
            } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
                    ORTE_ERROR_LOG(rc);
                }
            } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
                    ORTE_ERROR_LOG(rc);
                }
            }
        }
        /* if the binding policy is less than numa, then we are unbound - so
         * just ignore this and return (should have been caught in prior
         * tests anyway as only options meeting that criteria are "none"
         * and "board")
         */
        return rc;
    } else if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - hwthread to hwthread",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_PU, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* HW threads are at the bottom, so we must have mapped somewhere
         * above this level - and we need to bind downwards
         */
        if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - core to core",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy used is less than bycore, then it is a
         * downward binding - i.e., the locale is above the binding location.
         * for example, if we map-to-socket and bind-to-core, then we compare
         * the mapping value of ORTE_MAPPING_BYCORE to ORTE_MAPPING_BYSOCKET.
         * In this case, BYCORE > BYSOCKET, so we know that the locale is
         * above the desired binding level (sockets are at a higher level than
         * the desired core binding level), and we will have to bind downwards
         */
        if (ORTE_MAPPING_BYCORE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - L1cache to L1cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l1cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL1CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - L2cache to L2cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l2cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL2CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - L3cache to L3cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l3cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL3CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - socket to socket",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than bysocket, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYSOCKET > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - numa to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than numa, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYNUMA > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else {
        /* no handler for the requested binding policy */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
        return ORTE_ERR_NOT_SUPPORTED;
    }
    /* NOTE(review): unreachable - every branch above returns */
    return ORTE_SUCCESS;
}