/* * Sequentially map the ranks according to the placement in the * specified hostfile */ static int orte_rmaps_seq_map(orte_job_t *jdata) { orte_job_map_t *map; orte_app_context_t *app; int i, n; orte_std_cntr_t j; opal_list_item_t *item; orte_node_t *node, *nd; seq_node_t *sq, *save=NULL, *seq;; orte_vpid_t vpid; orte_std_cntr_t num_nodes; int rc; opal_list_t default_seq_list; opal_list_t node_list, *seq_list, sq_list; orte_proc_t *proc; mca_base_component_t *c = &mca_rmaps_seq_component.base_version; char *hosts = NULL, *sep, *eptr; FILE *fp; opal_hwloc_resource_type_t rtype; OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base_framework.framework_output, "%s rmaps:seq called on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* this mapper can only handle initial launch * when seq mapping is desired - allow * restarting of failed apps */ if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: job %s is being restarted - seq cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper) { if (0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: job %s not using sequential mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } /* we need to process it */ goto process; } if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* I don't know how to do these - defer */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: job %s not using seq mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } process: opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* convenience def */ map = jdata->map; /* if there is a default hostfile, go and get its ordered list of nodes */ OBJ_CONSTRUCT(&default_seq_list, opal_list_t); if (NULL != orte_default_hostfile) { char *hstname = NULL; /* open the file */ fp = fopen(orte_default_hostfile, "r"); if (NULL == fp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; goto error; } while (NULL != (hstname = orte_getline(fp))) { if (0 == strlen(hstname)) { free(hstname); /* blank line - ignore */ continue; } if( '#' == hstname[0] ) { free(hstname); /* Comment line - ignore */ continue; } sq = OBJ_NEW(seq_node_t); if (NULL != (sep = strchr(hstname, ' '))) { *sep = '\0'; sep++; /* remove any trailing space */ eptr = sep + strlen(sep) - 1; while (eptr > sep && isspace(*eptr)) { eptr--; } *(eptr+1) = 0; sq->cpuset = strdup(sep); } // Strip off the FQDN if present, ignore IP addresses if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname) ) { char *ptr; if (NULL != (ptr = strchr(hstname, '.'))) { *ptr = '\0'; } } sq->hostname = hstname; opal_list_append(&default_seq_list, &sq->super); } fclose(fp); } /* start at the beginning... */ vpid = 0; jdata->num_procs = 0; if (0 < opal_list_get_size(&default_seq_list)) { save = (seq_node_t*)opal_list_get_first(&default_seq_list); } /* default to LOGICAL processors */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, NULL, OPAL_BOOL)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using PHYSICAL processors"); rtype = OPAL_HWLOC_PHYSICAL; } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using LOGICAL processors"); rtype = OPAL_HWLOC_LOGICAL; } /* initialize all the nodes as not included in this job map */ for (j=0; j < orte_node_pool->size; j++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) { ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } } /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } /* dash-host trumps hostfile */ if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using dash-host nodes on app %s", app->app); OBJ_CONSTRUCT(&node_list, opal_list_t); /* dash host entries cannot specify cpusets, so used the std function to retrieve the list */ if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(&node_list, hosts))) { ORTE_ERROR_LOG(rc); free(hosts); goto error; } free(hosts); /* transfer the list to a seq_node_t list */ OBJ_CONSTRUCT(&sq_list, opal_list_t); while (NULL != (nd = (orte_node_t*)opal_list_remove_first(&node_list))) { sq = OBJ_NEW(seq_node_t); sq->hostname = strdup(nd->name); opal_list_append(&sq_list, &sq->super); OBJ_RELEASE(nd); } OBJ_DESTRUCT(&node_list); seq_list = &sq_list; } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) { char *hstname; if (NULL == hosts) { rc = ORTE_ERR_NOT_FOUND; goto error; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using hostfile %s nodes on app %s", hosts, app->app); OBJ_CONSTRUCT(&sq_list, opal_list_t); /* open the file */ fp = fopen(hosts, "r"); if (NULL == fp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; OBJ_DESTRUCT(&sq_list); goto error; } while (NULL != (hstname = orte_getline(fp))) { if (0 == strlen(hstname)) { free(hstname); /* blank line - ignore */ continue; } if( '#' == hstname[0] ) { free(hstname); /* Comment line - ignore */ continue; } sq = OBJ_NEW(seq_node_t); if (NULL != (sep = strchr(hstname, ' '))) { *sep = '\0'; sep++; /* remove any trailing space */ eptr = sep + strlen(sep) - 1; while (eptr > sep && isspace(*eptr)) { eptr--; } *(eptr+1) = 0; sq->cpuset = strdup(sep); } // Strip off the FQDN if present, ignore IP addresses if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname) ) { char *ptr; if (NULL != (ptr = strchr(hstname, '.'))) { (*ptr) = '\0'; } } sq->hostname = hstname; opal_list_append(&sq_list, &sq->super); } fclose(fp); free(hosts); seq_list = &sq_list; } else if (0 < opal_list_get_size(&default_seq_list)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using default hostfile nodes on app %s", app->app); seq_list = &default_seq_list; } else { /* can't do anything - no nodes available! */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); return ORTE_ERR_SILENT; } /* check for nolocal and remove the head node, if required */ if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) { for (item = opal_list_get_first(seq_list); item != opal_list_get_end(seq_list); item = opal_list_get_next(item) ) { seq = (seq_node_t*)item; /* need to check ifislocal because the name in the * hostfile may not have been FQDN, while name returned * by gethostname may have been (or vice versa) */ if (orte_ifislocal(seq->hostname)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: removing head node %s", seq->hostname); opal_list_remove_item(seq_list, item); OBJ_RELEASE(item); /* "un-retain" it */ } } } if (NULL == seq_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(seq_list))) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); return ORTE_ERR_SILENT; } /* if num_procs wasn't specified, set it now */ if (0 == app->num_procs) { app->num_procs = num_nodes; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: setting num procs to %s for app %s", ORTE_VPID_PRINT(app->num_procs), app->app); } else if (num_nodes < app->num_procs) { orte_show_help("help-orte-rmaps-base.txt", "seq:not-enough-resources", true, app->num_procs, num_nodes); return ORTE_ERR_SILENT; } if (seq_list == &default_seq_list) { sq = save; } else { sq = (seq_node_t*)opal_list_get_first(seq_list); } for (n=0; n < app->num_procs; n++) { /* find this node on the global array - this is necessary so * that our mapping gets saved on that array as the objects * returned by the hostfile function are -not- on the array */ node = NULL; for (j=0; j < orte_node_pool->size; j++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) { continue; } if (0 == strcmp(sq->hostname, node->name)) { break; } } if (NULL == node) { /* wasn't found - that is an error */ orte_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:resource-not-found", true, sq->hostname); rc = ORTE_ERR_SILENT; goto error; } /* ensure the node is in the map */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { OBJ_RETAIN(node); opal_pointer_array_add(map->nodes, node); jdata->map->num_nodes++; ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); } proc = orte_rmaps_base_setup_proc(jdata, node, i); if ((node->slots < (int)node->num_procs) || (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, node->num_procs, app->app); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); rc = ORTE_ERR_SILENT; goto error; } /* flag the node as oversubscribed so that sched-yield gets * properly set */ ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED); /* check for permission */ if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) { /* if we weren't given a directive either way, then we will error out * as the #slots were specifically given, either by the host RM or * via hostfile/dash-host */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return ORTE_ERR_SILENT; } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { /* if we were explicitly told not to oversubscribe, then don't */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return ORTE_ERR_SILENT; } } } /* assign the vpid */ proc->name.vpid = vpid++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: assign proc %s to node %s for app %s", ORTE_VPID_PRINT(proc->name.vpid), sq->hostname, app->app); /* record the cpuset, if given */ if (NULL != sq->cpuset) { hwloc_cpuset_t bitmap; char *cpu_bitmap; if (NULL == node->topology) { /* not allowed - for sequential cpusets, we must have * the topology info */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name); rc = ORTE_ERR_SILENT; goto error; } /* if we are using hwthreads as cpus and binding to hwthreads, then * we can just copy the cpuset across as it already specifies things * at that level */ if (opal_hwloc_use_hwthreads_as_cpus && OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { cpu_bitmap = strdup(sq->cpuset); } else { /* setup the bitmap */ bitmap = hwloc_bitmap_alloc(); /* parse the slot_list to find the socket and core */ if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(sq->cpuset, node->topology, rtype, bitmap))) { ORTE_ERROR_LOG(rc); hwloc_bitmap_free(bitmap); goto error; } /* note that we cannot set the proc locale to any specific object * as the slot list may have assigned it to more than one - so * leave that field NULL */ /* set the proc to the specified map */ hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap); hwloc_bitmap_free(bitmap); } orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: binding proc %s to cpuset %s bitmap %s", ORTE_VPID_PRINT(proc->name.vpid), sq->cpuset, cpu_bitmap); /* we are going to bind to cpuset since the user is specifying the cpus */ OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CPUSET); /* note that the user specified the mapping */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYUSER); ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN); /* cleanup */ free(cpu_bitmap); } else { hwloc_obj_t locale; /* assign the locale - okay for the topo to be null as * it just means it wasn't returned */ if (NULL != node->topology) { locale = hwloc_get_root_obj(node->topology); orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, locale, OPAL_PTR); } } /* add to the jdata proc array */ if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); goto error; } /* move to next node */ sq = (seq_node_t*)opal_list_get_next(&sq->super); } /** track the total number of processes we mapped */ jdata->num_procs += app->num_procs; /* cleanup the node list if it came from this app_context */ if (seq_list != &default_seq_list) { OPAL_LIST_DESTRUCT(seq_list); } else { save = sq; } } return ORTE_SUCCESS; error: OPAL_LIST_DESTRUCT(&default_seq_list); return rc; }
int orte_rmaps_base_compute_bindings(orte_job_t *jdata) { hwloc_obj_type_t hwb, hwm; unsigned clvl=0, clvm=0; opal_binding_policy_t bind; orte_mapping_policy_t map; orte_node_t *node; int i, rc; struct hwloc_topology_support *support; bool force_down = false; hwloc_cpuset_t totalcpuset; int bind_depth, map_depth; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: compute bindings for job %s with policy %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping); bind = OPAL_GET_BINDING_POLICY(jdata->map->binding); if (ORTE_MAPPING_BYUSER == map) { /* user specified binding by rankfile - nothing for us to do */ return ORTE_SUCCESS; } if (OPAL_BIND_TO_CPUSET == bind) { int rc; /* cpuset was given - setup the bindings */ if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) { ORTE_ERROR_LOG(rc); } return rc; } if (OPAL_BIND_TO_NONE == bind) { /* no binding requested */ return ORTE_SUCCESS; } if (OPAL_BIND_TO_BOARD == bind) { /* doesn't do anything at this time */ return ORTE_SUCCESS; } /* binding requested - convert the binding level to the hwloc obj type */ switch (bind) { case OPAL_BIND_TO_NUMA: hwb = HWLOC_OBJ_NODE; break; case OPAL_BIND_TO_SOCKET: hwb = HWLOC_OBJ_SOCKET; break; case OPAL_BIND_TO_L3CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 3; break; case OPAL_BIND_TO_L2CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 2; break; case OPAL_BIND_TO_L1CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 1; break; case OPAL_BIND_TO_CORE: hwb = HWLOC_OBJ_CORE; break; case OPAL_BIND_TO_HWTHREAD: hwb = HWLOC_OBJ_PU; break; default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } /* do the same for the mapping policy */ switch (map) { case ORTE_MAPPING_BYNODE: case ORTE_MAPPING_BYSLOT: case ORTE_MAPPING_SEQ: hwm = HWLOC_OBJ_MACHINE; break; case ORTE_MAPPING_BYDIST: case ORTE_MAPPING_BYNUMA: hwm = HWLOC_OBJ_NODE; break; case ORTE_MAPPING_BYSOCKET: hwm = HWLOC_OBJ_SOCKET; break; case ORTE_MAPPING_BYL3CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 3; break; case ORTE_MAPPING_BYL2CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 2; break; case ORTE_MAPPING_BYL1CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 1; break; case ORTE_MAPPING_BYCORE: hwm = HWLOC_OBJ_CORE; break; case ORTE_MAPPING_BYHWTHREAD: hwm = HWLOC_OBJ_PU; break; default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } /* if the job was mapped by the corresponding target, then * we bind in place * * otherwise, we have to bind either up or down the hwloc * tree. If we are binding upwards (e.g., mapped to hwthread * but binding to core), then we just climb the tree to find * the first matching object. * * if we are binding downwards (e.g., mapped to node and bind * to core), then we have to do a round-robin assigment of * procs to the resources below. */ if (ORTE_MAPPING_BYDIST == map) { int rc = ORTE_SUCCESS; if (OPAL_BIND_TO_NUMA == bind) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - dist to numa", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_NUMA < bind) { /* bind every proc downwards */ force_down = true; goto execute; } /* if the binding policy is less than numa, then we are unbound - so * just ignore this and return (should have been caught in prior * tests anyway as only options meeting that criteria are "none" * and "board") */ return rc; } /* now deal with the remaining binding policies based on hardware */ if (bind == map) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - bind in place", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) { ORTE_ERROR_LOG(rc); } return rc; } /* we need to handle the remaining binding options on a per-node * basis because different nodes could potentially have different * topologies, with different relative depths for the two levels */ execute: /* initialize */ totalcpuset = hwloc_bitmap_alloc(); for (i=0; i < jdata->map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } if (!orte_do_not_launch) { /* if we don't want to launch, then we are just testing the system, * so ignore questions about support capabilities */ support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology); /* check if topology supports cpubind - have to be careful here * as Linux doesn't currently support thread-level binding. This * may change in the future, though, and it isn't clear how hwloc * interprets the current behavior. So check both flags to be sure. */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { if (!OPAL_BINDING_REQUIRED(jdata->map->binding) || !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* we are not required to bind, so ignore this */ continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); hwloc_bitmap_free(totalcpuset); return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here * as hwloc treats this differently than I (at least) would have * expected. Per hwloc, Linux memory binding is at the thread, * and not process, level. Thus, hwloc sets the "thisproc" flag * to "false" on all Linux systems, and uses the "thisthread" flag * to indicate binding capability - don't warn if the user didn't * specifically request binding */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); hwloc_bitmap_free(totalcpuset); return ORTE_ERR_SILENT; } } } /* some systems do not report cores, and so we can get a situation where our * default binding policy will fail for no necessary reason. So if we are * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to bind-to core by default on node %s as no cores detected", node->name); continue; } /* we share topologies in order * to save space, so we need to reset the usage info to reflect * our own current state */ reset_usage(node, jdata->jobid); if (force_down) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } else { /* determine the relative depth on this node */ if (HWLOC_OBJ_CACHE == hwb) { /* must use a unique function because blasted hwloc * just doesn't deal with caches very well...sigh */ bind_depth = hwloc_get_cache_type_depth(node->topology, clvl, -1); } else { bind_depth = hwloc_get_type_depth(node->topology, hwb); } if (0 > bind_depth) { /* didn't find such an object */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects", true, hwloc_obj_type_string(hwb), node->name); return ORTE_ERR_SILENT; } if (HWLOC_OBJ_CACHE == hwm) { /* must use a unique function because blasted hwloc * just doesn't deal with caches very well...sigh */ map_depth = hwloc_get_cache_type_depth(node->topology, clvm, -1); } else { map_depth = hwloc_get_type_depth(node->topology, hwm); } if (0 > map_depth) { /* didn't find such an object */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects", true, hwloc_obj_type_string(hwm), node->name); return ORTE_ERR_SILENT; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind_depth: %d map_depth %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), bind_depth, map_depth); if (bind_depth > map_depth) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } } } return ORTE_SUCCESS; }
int orte_rmaps_base_set_ranking_policy(orte_ranking_policy_t *policy, orte_mapping_policy_t mapping, char *spec) { orte_mapping_policy_t map; orte_ranking_policy_t tmp; char **ck; size_t len; /* set default */ tmp = 0; if (NULL == spec) { /* check for map-by object directives - we set the * ranking to match if one was given */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { map = ORTE_GET_MAPPING_POLICY(mapping); switch (map) { case ORTE_MAPPING_BYSLOT: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT); break; case ORTE_MAPPING_BYNODE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NODE); break; case ORTE_MAPPING_BYCORE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_CORE); break; case ORTE_MAPPING_BYL1CACHE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L1CACHE); break; case ORTE_MAPPING_BYL2CACHE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L2CACHE); break; case ORTE_MAPPING_BYL3CACHE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L3CACHE); break; case ORTE_MAPPING_BYSOCKET: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SOCKET); break; case ORTE_MAPPING_BYNUMA: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NUMA); break; case ORTE_MAPPING_BYBOARD: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_BOARD); break; case ORTE_MAPPING_BYHWTHREAD: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_HWTHREAD); break; default: /* anything not tied to a specific hw obj can rank by slot */ ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT); break; } } else { /* if no map-by was given, default to by-slot */ ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT); } } else { ck = opal_argv_split(spec, ':'); if (2 < opal_argv_count(ck)) { /* incorrect format */ orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", policy); opal_argv_free(ck); return ORTE_ERR_SILENT; } if (2 == opal_argv_count(ck)) { if (0 == strncasecmp(ck[1], "span", strlen(ck[1]))) { ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_SPAN); } else if (0 == strncasecmp(ck[1], "fill", strlen(ck[1]))) { ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_FILL); } else { /* unrecognized modifier */ orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, ck[1]); opal_argv_free(ck); return ORTE_ERR_SILENT; } } len = strlen(ck[0]); if (0 == strncasecmp(ck[0], "slot", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT); } else if (0 == strncasecmp(ck[0], "node", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NODE); } else if (0 == strncasecmp(ck[0], "hwthread", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_HWTHREAD); } else if (0 == strncasecmp(ck[0], "core", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_CORE); } else if (0 == strncasecmp(ck[0], "l1cache", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L1CACHE); } else if (0 == strncasecmp(ck[0], "l2cache", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L2CACHE); } else if (0 == strncasecmp(ck[0], "l3cache", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L3CACHE); } else if (0 == strncasecmp(ck[0], "socket", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SOCKET); } else if (0 == strncasecmp(ck[0], "numa", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NUMA); } else if (0 == strncasecmp(ck[0], "board", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_BOARD); } else { orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", rmaps_base_ranking_policy); opal_argv_free(ck); return ORTE_ERR_SILENT; } opal_argv_free(ck); ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_GIVEN); } *policy = tmp; return ORTE_SUCCESS; }
static int ppr_mapper(orte_job_t *jdata) { int rc = ORTE_SUCCESS, j, n; mca_base_component_t *c=&mca_rmaps_ppr_component.base_version; orte_node_t *node; orte_proc_t *proc; orte_app_context_t *app; orte_vpid_t total_procs, nprocs_mapped; opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL; #if OPAL_HAVE_HWLOC hwloc_obj_t obj; hwloc_obj_type_t lowest; unsigned cache_level=0; unsigned int nobjs, i; bool pruning_reqd = false; opal_hwloc_level_t level; #endif opal_list_t node_list; opal_list_item_t *item; orte_std_cntr_t num_slots; orte_app_idx_t idx; char **ppr_req, **ck; size_t len; bool initial_map=true; /* only handle initial launch of loadbalanced * or NPERxxx jobs - allow restarting of failed apps */ if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s being restarted - ppr cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper && 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s not using ppr mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL == jdata->map->ppr || ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* not for us */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s not using ppr mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: mapping job %s with ppr %s", ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* initialize */ memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t)); /* parse option */ n=0; ppr_req = opal_argv_split(jdata->map->ppr, ','); for (j=0; NULL != ppr_req[j]; j++) { /* split on the colon */ ck = opal_argv_split(ppr_req[j], ':'); if (2 != opal_argv_count(ck)) { /* must provide a specification */ orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr); opal_argv_free(ppr_req); opal_argv_free(ck); return ORTE_ERR_SILENT; } len = strlen(ck[1]); if (0 == strncasecmp(ck[1], "node", len)) { ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10); ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE); start = OPAL_HWLOC_NODE_LEVEL; n++; #if OPAL_HAVE_HWLOC } else if (0 == strncasecmp(ck[1], "hwthread", len) || 0 == strncasecmp(ck[1], "thread", len)) { ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10); start = OPAL_HWLOC_HWTHREAD_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD); n++; } else if (0 == strncasecmp(ck[1], "core", len)) { ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_CORE_LEVEL) { start = OPAL_HWLOC_CORE_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE); } n++; } else if (0 == strncasecmp(ck[1], "socket", len) || 0 == strncasecmp(ck[1], "skt", len)) { ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_SOCKET_LEVEL) { start = OPAL_HWLOC_SOCKET_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET); } n++; } else if (0 == strncasecmp(ck[1], "l1cache", len)) { ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_L1CACHE_LEVEL) { start = OPAL_HWLOC_L1CACHE_LEVEL; cache_level = 1; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE); } n++; } else if (0 == strncasecmp(ck[1], "l2cache", len)) { ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_L2CACHE_LEVEL) { start = OPAL_HWLOC_L2CACHE_LEVEL; cache_level = 2; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE); } n++; } else if (0 == strncasecmp(ck[1], "l3cache", len)) { ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_L3CACHE_LEVEL) { start = OPAL_HWLOC_L3CACHE_LEVEL; cache_level = 3; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE); } n++; } else if (0 == strncasecmp(ck[1], "numa", len)) { ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_NUMA_LEVEL) { start = OPAL_HWLOC_NUMA_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA); } n++; #endif } else { /* unknown spec */ orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr); opal_argv_free(ppr_req); opal_argv_free(ck); return ORTE_ERR_SILENT; } opal_argv_free(ck); } opal_argv_free(ppr_req); /* if nothing was given, that's an error */ if (0 == n) { opal_output(0, "NOTHING GIVEN"); return ORTE_ERR_SILENT; } #if OPAL_HAVE_HWLOC /* if more than one level was specified, then pruning will be reqd */ if (1 < n) { pruning_reqd = true; } #endif opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s assigned policy %s", ORTE_JOBID_PRINT(jdata->jobid), orte_rmaps_base_print_mapping(jdata->map->mapping)); #if OPAL_HAVE_HWLOC /* convenience */ level = start; lowest = opal_hwloc_levels[start]; #endif for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { continue; } /* if the number of total procs was given, set that * limit - otherwise, set to max so we simply fill * all the nodes with the pattern */ if (0 < app->num_procs) { total_procs = app->num_procs; } else { total_procs = ORTE_VPID_MAX; } /* get the available nodes */ OBJ_CONSTRUCT(&node_list, opal_list_t); if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, initial_map, false))) { ORTE_ERROR_LOG(rc); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ initial_map = false; /* if a bookmark exists from some prior mapping, set us to start there */ jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata); /* cycle across the nodes */ nprocs_mapped = 0; for (item = opal_list_get_first(&node_list); item != opal_list_get_end(&node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; #if OPAL_HAVE_HWLOC /* bozo check */ if (NULL == node->topology) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); rc = ORTE_ERR_SILENT; goto error; } #endif /* add the node to the map, if needed */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); goto error; } ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); OBJ_RETAIN(node); /* maintain accounting on object */ jdata->map->num_nodes++; } /* if we are mapping solely at the node level, just put * that many procs on this node */ if (OPAL_HWLOC_NODE_LEVEL == start) { #if OPAL_HAVE_HWLOC obj = hwloc_get_root_obj(node->topology); #endif for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) { rc = ORTE_ERR_OUT_OF_RESOURCE; goto error; } nprocs_mapped++; #if OPAL_HAVE_HWLOC orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); #endif } #if OPAL_HAVE_HWLOC } else { /* get the number of lowest resources on this node */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, lowest, cache_level, OPAL_HWLOC_AVAILABLE); /* map the specified number of procs to each such resource on this node, * recording the locale of each proc so we know its cpuset */ for (i=0; i < nobjs; i++) { obj = opal_hwloc_base_get_obj_by_type(node->topology, lowest, cache_level, i, OPAL_HWLOC_AVAILABLE); for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) { rc = ORTE_ERR_OUT_OF_RESOURCE; goto error; } nprocs_mapped++; orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); } } if (pruning_reqd) { /* go up the ladder and prune the procs according to * the specification, adjusting the count of procs on the * node as we go */ level--; prune(jdata->jobid, idx, node, &level, &nprocs_mapped); } #endif } /* set the total slots used */ if ((int)node->num_procs <= node->slots) { node->slots_inuse = (int)node->num_procs; } else { node->slots_inuse = node->slots; } /* if no-oversubscribe was specified, check to see if * we have violated the total slot specification - regardless, * if slots_max was given, we are not allowed to violate it! */ if ((node->slots < (int)node->num_procs) || (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, node->num_procs, app->app); rc = ORTE_ERR_SILENT; goto error; } /* flag the node as oversubscribed so that sched-yield gets * properly set */ ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED); } /* if we haven't mapped all the procs, continue on to the * next node */ if (total_procs == nprocs_mapped) { break; } } if (0 == app->num_procs) { app->num_procs = nprocs_mapped; } if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) { /* couldn't map them all */ orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs", true, app->app, app->num_procs, jdata->map->ppr); rc = ORTE_ERR_SILENT; goto error; } /* compute vpids and add proc objects to the job */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { ORTE_ERROR_LOG(rc); goto error; } /* track the total number of processes we mapped - must update * this AFTER we compute vpids so that computation is done * correctly */ jdata->num_procs += app->num_procs; while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); } return ORTE_SUCCESS; error: while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); return rc; }
/* * Sequentially map the ranks according to the placement in the * specified hostfile */ static int orte_rmaps_seq_map(orte_job_t *jdata) { orte_job_map_t *map; orte_app_context_t *app; int i, n; orte_std_cntr_t j; opal_list_item_t *item; orte_node_t *node, *nd, *save=NULL; orte_vpid_t vpid; orte_std_cntr_t num_nodes; int rc; opal_list_t *default_node_list=NULL; opal_list_t *node_list=NULL; orte_proc_t *proc; mca_base_component_t *c = &mca_rmaps_seq_component.base_version; OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, "%s rmaps:seq mapping job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* this mapper can only handle initial launch * when seq mapping is desired - allow * restarting of failed apps */ if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:seq: job %s is being restarted - seq cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper && 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:seq: job %s not using sequential mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* I don't know how to do these - defer */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:seq: job %s not using seq mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:seq: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* conveniece def */ map = jdata->map; /* if there is a default hostfile, go and get its ordered list of nodes */ if (NULL != orte_default_hostfile) { default_node_list = OBJ_NEW(opal_list_t); if (ORTE_SUCCESS != (rc = orte_util_get_ordered_host_list(default_node_list, orte_default_hostfile))) { ORTE_ERROR_LOG(rc); goto error; } } /* start at the beginning... */ vpid = 0; jdata->num_procs = 0; if (NULL != default_node_list) { save = (orte_node_t*)opal_list_get_first(default_node_list); } /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } /* dash-host trumps hostfile */ if (NULL != app->dash_host) { node_list = OBJ_NEW(opal_list_t); if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(node_list, app->dash_host))) { ORTE_ERROR_LOG(rc); goto error; } nd = (orte_node_t*)opal_list_get_first(node_list); } else if (NULL != app->hostfile) { node_list = OBJ_NEW(opal_list_t); if (ORTE_SUCCESS != (rc = orte_util_get_ordered_host_list(node_list, app->hostfile))) { ORTE_ERROR_LOG(rc); goto error; } nd = (orte_node_t*)opal_list_get_first(node_list); } else if (NULL != default_node_list) { node_list = default_node_list; nd = save; } else { /* can't do anything - no nodes available! */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); return ORTE_ERR_SILENT; } /* check for nolocal and remove the head node, if required */ if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) { for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item) ) { node = (orte_node_t*)item; /* need to check ifislocal because the name in the * hostfile may not have been FQDN, while name returned * by gethostname may have been (or vice versa) */ if (opal_ifislocal(node->name)) { opal_list_remove_item(node_list, item); OBJ_RELEASE(item); /* "un-retain" it */ } } } if (NULL == node_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(node_list))) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); return ORTE_ERR_SILENT; } /* if num_procs wasn't specified, set it now */ if (0 == app->num_procs) { app->num_procs = num_nodes; } for (n=0; n < app->num_procs; n++) { /* find this node on the global array - this is necessary so * that our mapping gets saved on that array as the objects * returned by the hostfile function are -not- on the array */ node = NULL; for (j=0; j < orte_node_pool->size; j++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) { continue; } if (0 == strcmp(nd->name, node->name)) { break; } } if (NULL == node) { /* wasn't found - that is an error */ orte_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:resource-not-found", true, nd->name); rc = ORTE_ERR_SILENT; goto error; } /* ensure the node is in the map */ if (!node->mapped) { OBJ_RETAIN(node); opal_pointer_array_add(map->nodes, node); node->mapped = true; } proc = orte_rmaps_base_setup_proc(jdata, node, i); if ((node->slots < (int)node->num_procs) || (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, node->num_procs, app->app); rc = ORTE_ERR_SILENT; goto error; } /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } /* assign the vpid */ proc->name.vpid = vpid++; #if OPAL_HAVE_HWLOC /* assign the locale - okay for the topo to be null as * it just means it wasn't returned */ if (NULL != node->topology) { proc->locale = hwloc_get_root_obj(node->topology); } #endif /* add to the jdata proc array */ if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); goto error; } /* move to next node */ nd = (orte_node_t*)opal_list_get_next((opal_list_item_t*)nd); } /** track the total number of processes we mapped */ jdata->num_procs += app->num_procs; /* cleanup the node list if it came from this app_context */ if (node_list != default_node_list) { while (NULL != (item = opal_list_remove_first(node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(node_list); } else { save = nd; } } return ORTE_SUCCESS; error: if (NULL != default_node_list) { while (NULL != (item = opal_list_remove_first(default_node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(default_node_list); } if (NULL != node_list) { while (NULL != (item = opal_list_remove_first(node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(node_list); } return rc; }
/** * Function for finding and opening either all MCA components, or the one * that was specifically requested via a MCA parameter. */ static int orte_rmaps_base_open(mca_base_open_flag_t flags) { int rc; /* init the globals */ OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t); orte_rmaps_base.slot_list = NULL; orte_rmaps_base.mapping = 0; orte_rmaps_base.ranking = 0; orte_rmaps_base.device = NULL; /* if a topology file was given, then set our topology * from it. Even though our actual topology may differ, * mpirun only needs to see the compute node topology * for mapping purposes */ if (NULL != rmaps_base_topo_file) { if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) { orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file); return ORTE_ERR_SILENT; } } /* check for violations that has to be detected before we parse the mapping option */ if (NULL != orte_rmaps_base.ppr) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--ppr, -ppr", "--map-by ppr:<pattern>", "rmaps_base_pattern, rmaps_ppr_pattern", "rmaps_base_mapping_policy=ppr:<pattern>"); /* if the mapping policy is NULL, then we can proceed */ if (NULL == rmaps_base_mapping_policy) { asprintf(&rmaps_base_mapping_policy, "ppr:%s", orte_rmaps_base.ppr); } else { return ORTE_ERR_SILENT; } } if (1 < orte_rmaps_base.cpus_per_rank) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--cpus-per-proc, -cpus-per-proc, --cpus-per-rank, -cpus-per-rank", "--map-by <obj>:PE=N, default <obj>=NUMA", "rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N, default <obj>=NUMA"); } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping, &orte_rmaps_base.device, rmaps_base_mapping_policy))) { return rc; } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking, orte_rmaps_base.mapping, rmaps_base_ranking_policy))) { return rc; } if (rmaps_base_bycore) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--bycore, -bycore", "--map-by core", "rmaps_base_bycore", "rmaps_base_mapping_policy=core"); /* set mapping policy to bycore - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYCORE) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bycore", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYCORE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bycore - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_CORE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bycore", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_CORE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_byslot) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--byslot, -byslot", "--map-by slot", "rmaps_base_byslot", "rmaps_base_mapping_policy=slot"); /* set mapping policy to byslot - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to byslot - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_bynode) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--bynode, -bynode", "--map-by node", "rmaps_base_bynode", "rmaps_base_mapping_policy=node"); /* set mapping policy to bynode - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bynode - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (1 < orte_rmaps_base.cpus_per_rank) { /* if we were asked for multiple cpus/proc, then we have to * bind to those cpus - any other binding policy is an * error */ if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) { if (opal_hwloc_use_hwthreads_as_cpus) { if (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, orte_rmaps_base.cpus_per_rank, "use-hwthreads-as-cpus", opal_hwloc_base_print_binding(opal_hwloc_binding_policy), "bind-to hwthread"); return ORTE_ERR_SILENT; } } else if (OPAL_BIND_TO_CORE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, orte_rmaps_base.cpus_per_rank, "cores as cpus", opal_hwloc_base_print_binding(opal_hwloc_binding_policy), "bind-to core"); return ORTE_ERR_SILENT; } } else { if (opal_hwloc_use_hwthreads_as_cpus) { OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD); } else { OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE); } } /* we also need to ensure we are mapping to a high-enough level to have * multiple cpus beneath it - by default, we'll go to the NUMA level */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { if (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYHWTHREAD || (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYCORE && !opal_hwloc_use_hwthreads_as_cpus)) { orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low-init", true); return ORTE_ERR_SILENT; } } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s rmaps:base pe/rank set - setting mapping to BYNUMA", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNUMA); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); } } if (orte_rmaps_base_pernode) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ orte_rmaps_base.ppr = strdup("1:node"); } if (0 < orte_rmaps_base_n_pernode) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ asprintf(&orte_rmaps_base.ppr, "%d:node", orte_rmaps_base_n_pernode); } if (0 < orte_rmaps_base_n_persocket) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ asprintf(&orte_rmaps_base.ppr, "%d:socket", orte_rmaps_base_n_persocket); } /* Should we schedule on the local node or not? */ if (rmaps_base_no_schedule_local) { orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL; } /* Should we oversubscribe or not? */ if (rmaps_base_no_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); } /** force oversubscription permission */ if (rmaps_base_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); /* also set the overload allowed flag */ opal_hwloc_binding_policy |= OPAL_BIND_ALLOW_OVERLOAD; } /* should we display a detailed (developer-quality) version of the map after determining it? */ if (rmaps_base_display_devel_map) { orte_rmaps_base.display_map = true; orte_devel_level_output = true; } /* should we display a diffable report of proc locations after determining it? */ if (rmaps_base_display_diffable_map) { orte_rmaps_base.display_map = true; orte_display_diffable_output = true; } /* Open up all available components */ rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags); /* check to see if any component indicated a problem */ if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { /* the component would have already reported the error, so * tell the rest of the chain to shut up */ return ORTE_ERR_SILENT; } /* All done */ return rc; }
char* orte_rmaps_base_print_mapping(orte_mapping_policy_t mapping) { char *ret, *map, *mymap, *tmp; orte_rmaps_print_buffers_t *ptr; if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { return "CONFLICTED"; } ptr = get_print_buffer(); if (NULL == ptr) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return orte_rmaps_print_null; } /* cycle around the ring */ if (ORTE_RMAPS_PRINT_NUM_BUFS == ptr->cntr) { ptr->cntr = 0; } switch(ORTE_GET_MAPPING_POLICY(mapping)) { case ORTE_MAPPING_BYNODE: map = "BYNODE"; break; case ORTE_MAPPING_BYBOARD: map = "BYBOARD"; break; case ORTE_MAPPING_BYNUMA: map = "BYNUMA"; break; case ORTE_MAPPING_BYSOCKET: map = "BYSOCKET"; break; case ORTE_MAPPING_BYL3CACHE: map = "BYL3CACHE"; break; case ORTE_MAPPING_BYL2CACHE: map = "BYL2CACHE"; break; case ORTE_MAPPING_BYL1CACHE: map = "BYL1CACHE"; break; case ORTE_MAPPING_BYCORE: map = "BYCORE"; break; case ORTE_MAPPING_BYHWTHREAD: map = "BYHWTHREAD"; break; case ORTE_MAPPING_BYSLOT: map = "BYSLOT"; break; case ORTE_MAPPING_SEQ: map = "SEQUENTIAL"; break; case ORTE_MAPPING_BYUSER: map = "BYUSER"; break; case ORTE_MAPPING_BYDIST: map = "MINDIST"; break; default: if (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { map = "PPR"; } else { map = "UNKNOWN"; } } if (0 != strcmp(map, "PPR") && (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping))) { asprintf(&mymap, "%s[PPR]:", map); } else { asprintf(&mymap, "%s:", map); } if (ORTE_MAPPING_NO_USE_LOCAL & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { asprintf(&tmp, "%sNO_USE_LOCAL,", mymap); free(mymap); mymap = tmp; } if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { asprintf(&tmp, "%sNOOVERSUBSCRIBE,", mymap); free(mymap); mymap = tmp; } else if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { asprintf(&tmp, "%sOVERSUBSCRIBE,", mymap); free(mymap); mymap = tmp; } if (ORTE_MAPPING_SPAN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { asprintf(&tmp, "%sSPAN,", mymap); free(mymap); mymap = tmp; } /* remove the trailing mark */ mymap[strlen(mymap)-1] = '\0'; snprintf(ptr->buffers[ptr->cntr], ORTE_RMAPS_PRINT_MAX_SIZE, "%s", mymap); free(mymap); ret = ptr->buffers[ptr->cntr]; ptr->cntr++; return ret; }
/* * Create a round-robin mapping for the job. */ static int orte_rmaps_rr_map(orte_job_t *jdata) { orte_app_context_t *app; int i; opal_list_t node_list; opal_list_item_t *item; orte_std_cntr_t num_slots; int rc; mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version; bool initial_map=true; /* this mapper can only handle initial launch * when rr mapping is desired - allow * restarting of failed apps */ if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: job %s is being restarted - rr cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper && 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: job %s not using rr mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (ORTE_MAPPING_RR < ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* I don't know how to do these - defer */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: job %s not using rr mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* start at the beginning... */ jdata->num_procs = 0; /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } /* setup the nodelist here in case we jump to error */ OBJ_CONSTRUCT(&node_list, opal_list_t); /* if the number of processes wasn't specified, then we know there can be only * one app_context allowed in the launch, and that we are to launch it across * all available slots. We'll double-check the single app_context rule first */ if (0 == app->num_procs && 1 < jdata->num_apps) { orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np", true, jdata->num_apps, NULL); rc = ORTE_ERR_SILENT; goto error; } /* for each app_context, we have to get the list of nodes that it can * use since that can now be modified with a hostfile and/or -host * option */ if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, initial_map, false))) { ORTE_ERROR_LOG(rc); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ initial_map = false; /* if a bookmark exists from some prior mapping, set us to start there */ jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata); if (0 == app->num_procs) { /* set the num_procs to equal the number of slots on these * mapped nodes, taking into account the number of cpus/rank */ app->num_procs = num_slots / orte_rmaps_base.cpus_per_rank; /* sometimes, we have only one "slot" assigned, but may * want more than one cpu/rank - so ensure we always wind * up with at least one proc */ if (0 == app->num_procs) { app->num_procs = 1; } } /* Make assignments */ if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_bynode(jdata, app, &node_list, num_slots, app->num_procs); } else if (ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); #if OPAL_HAVE_HWLOC } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, HWLOC_OBJ_PU, 0); if (ORTE_ERR_NOT_FOUND == rc) { /* if the mapper couldn't map by this object because * it isn't available, but the error allows us to try * byslot, then do so */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); } } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, HWLOC_OBJ_CORE, 0); if (ORTE_ERR_NOT_FOUND == rc) { /* if the mapper couldn't map by this object because * it isn't available, but the error allows us to try * byslot, then do so */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); } } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, HWLOC_OBJ_CACHE, 1); if (ORTE_ERR_NOT_FOUND == rc) { /* if the mapper couldn't map by this object because * it isn't available, but the error allows us to try * byslot, then do so */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); } } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, HWLOC_OBJ_CACHE, 2); if (ORTE_ERR_NOT_FOUND == rc) { /* if the mapper couldn't map by this object because * it isn't available, but the error allows us to try * byslot, then do so */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); } } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, HWLOC_OBJ_CACHE, 3); if (ORTE_ERR_NOT_FOUND == rc) { /* if the mapper couldn't map by this object because * it isn't available, but the error allows us to try * byslot, then do so */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); } } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, HWLOC_OBJ_SOCKET, 0); if (ORTE_ERR_NOT_FOUND == rc) { /* if the mapper couldn't map by this object because * it isn't available, but the error allows us to try * byslot, then do so */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); } } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, HWLOC_OBJ_NODE, 0); if (ORTE_ERR_NOT_FOUND == rc) { /* if the mapper couldn't map by this object because * it isn't available, but the error allows us to try * byslot, then do so */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); } #endif } else { /* unrecognized mapping directive */ orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", orte_rmaps_base_print_mapping(jdata->map->mapping)); rc = ORTE_ERR_SILENT; goto error; } if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto error; } /* compute vpids and add proc objects to the job - do this after * each app_context so that the ranks within each context are * contiguous */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { ORTE_ERROR_LOG(rc); return rc; } /* track the total number of processes we mapped - must update * this value AFTER we compute vpids so that computation * is done correctly */ jdata->num_procs += app->num_procs; /* cleanup the node list - it can differ from one app_context * to another, so we have to get it every time */ while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); } return ORTE_SUCCESS; error: while(NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); return rc; }
int orte_rmaps_base_compute_bindings(orte_job_t *jdata) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: compute bindings for job %s with policy %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); if (ORTE_MAPPING_BYUSER == ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) { /* user specified binding by rankfile - nothing for us to do */ return ORTE_SUCCESS; } if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { int rc; /* cpuset was given - setup the bindings */ if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) { ORTE_ERROR_LOG(rc); } return rc; } if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) || OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { /* no binding requested */ return ORTE_SUCCESS; } if (OPAL_BIND_TO_BOARD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { /* doesn't do anything at this time */ return ORTE_SUCCESS; } /* binding requested */ /* if the job was mapped by the corresponding target, then * we bind in place * * otherwise, we have to bind either up or down the hwloc * tree. If we are binding upwards (e.g., mapped to hwthread * but binding to core), then we just climb the tree to find * the first matching object. * * if we are binding downwards (e.g., mapped to node and bind * to core), then we have to do a round-robin assigment of * procs to the resources below. */ if (ORTE_MAPPING_BYDIST == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { int rc = ORTE_SUCCESS; if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - dist to numa", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_NUMA < OPAL_GET_BINDING_POLICY(jdata->map->binding)) { if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) { ORTE_ERROR_LOG(rc); } } } /* if the binding policy is less than numa, then we are unbound - so * just ignore this and return (should have been caught in prior * tests anyway as only options meeting that criteria are "none" * and "board") */ return rc; } else if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { int rc; if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - hwthread to hwthread", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_PU, 0))) { ORTE_ERROR_LOG(rc); } return rc; } /* HW threads are at the bottom, so we must have mapped somewhere * above this level - and we need to bind downwards */ if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) { ORTE_ERROR_LOG(rc); } return rc; } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { int rc; if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - core to core", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CORE, 0))) { ORTE_ERROR_LOG(rc); } return rc; } /* if the mapping policy used is less than bycore, then it is a * downward binding - i.e., the locale is above the binding location. * for example, if we map-to-socket and bind-to-core, then we compare * the mapping value of ORTE_MAPPING_BYCORE to ORTE_MAPPING_BYSOCKET. * In this case, BYCORE > BYSOCKET, so we know that the locale is * above the desired binding level (sockets are at a higher level than * the desired core binding level), and we will have to bind downwards */ if (ORTE_MAPPING_BYCORE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) { ORTE_ERROR_LOG(rc); } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CORE, 0))) { ORTE_ERROR_LOG(rc); } } return rc; } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { int rc; if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - L1cache to L1cache", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 1))) { ORTE_ERROR_LOG(rc); } return rc; } /* if the mapping policy is less than l1cache, then it is a * downward binding */ if (ORTE_MAPPING_BYL1CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) { ORTE_ERROR_LOG(rc); } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 1))) { ORTE_ERROR_LOG(rc); } } return rc; } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { int rc; if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - L2cache to L2cache", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 2))) { ORTE_ERROR_LOG(rc); } return rc; } /* if the mapping policy is less than l2cache, then it is a * downward binding */ if (ORTE_MAPPING_BYL2CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) { ORTE_ERROR_LOG(rc); } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 2))) { ORTE_ERROR_LOG(rc); } } return rc; } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { int rc; if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - L3cache to L3cache", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 3))) { ORTE_ERROR_LOG(rc); } return rc; } /* if the mapping policy is less than l3cache, then it is a * downward binding */ if (ORTE_MAPPING_BYL3CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) { ORTE_ERROR_LOG(rc); } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 3))) { ORTE_ERROR_LOG(rc); } } return rc; } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { int rc; if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - socket to socket", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_SOCKET, 0))) { ORTE_ERROR_LOG(rc); } return rc; } /* if the mapping policy is less than bysocket, then it is a * downward binding */ if (ORTE_MAPPING_BYSOCKET > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) { ORTE_ERROR_LOG(rc); } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_SOCKET, 0))) { ORTE_ERROR_LOG(rc); } } return rc; } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) { int rc; if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - numa to numa", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) { ORTE_ERROR_LOG(rc); } return rc; } /* if the mapping policy is less than numa, then it is a * downward binding */ if (ORTE_MAPPING_BYNUMA > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_NODE, 0))) { ORTE_ERROR_LOG(rc); } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_NODE, 0))) { ORTE_ERROR_LOG(rc); } } return rc; } else { ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); return ORTE_ERR_NOT_SUPPORTED; } return ORTE_SUCCESS; }
/** * Function for finding and opening either all MCA components, or the one * that was specifically requested via a MCA parameter. */ static int orte_rmaps_base_open(mca_base_open_flag_t flags) { int rc; /* init the globals */ OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t); orte_rmaps_base.ppr = NULL; orte_rmaps_base.slot_list = NULL; orte_rmaps_base.mapping = 0; orte_rmaps_base.ranking = 0; #if OPAL_HAVE_HWLOC /* if a topology file was given, then set our topology * from it. Even though our actual topology may differ, * mpirun only needs to see the compute node topology * for mapping purposes */ if (NULL != rmaps_base_topo_file) { if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) { orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file); return ORTE_ERR_SILENT; } } #endif if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping, &orte_rmaps_base.device, rmaps_base_mapping_policy))) { return rc; } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking, orte_rmaps_base.mapping, rmaps_base_ranking_policy))) { return rc; } if (rmaps_base_byslot) { /* set mapping policy to byslot - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to byslot - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_bynode) { /* set mapping policy to bynode - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bynode - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } /* Should we schedule on the local node or not? */ if (rmaps_base_no_schedule_local) { orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL; } /* Should we oversubscribe or not? */ if (rmaps_base_no_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); } /** force oversubscription permission */ if (rmaps_base_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); } /* should we display a detailed (developer-quality) version of the map after determining it? */ if (rmaps_base_display_devel_map) { orte_rmaps_base.display_map = true; orte_devel_level_output = true; } /* should we display a diffable report of proc locations after determining it? */ if (rmaps_base_display_diffable_map) { orte_rmaps_base.display_map = true; orte_display_diffable_output = true; } /* Open up all available components */ rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags); /* check to see if any component indicated a problem */ if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { /* the component would have already reported the error, so * tell the rest of the chain to shut up */ return ORTE_ERR_SILENT; } /* All done */ return rc; }
static int assign_locations(orte_job_t *jdata) { int i, j, m, n; mca_base_component_t *c=&mca_rmaps_ppr_component.base_version; orte_node_t *node; orte_proc_t *proc; orte_app_context_t *app; hwloc_obj_type_t level; hwloc_obj_t obj; unsigned int cache_level=0; int ppr, cnt, nobjs, nprocs_mapped; char **ppr_req, **ck; if (NULL == jdata->map->last_mapper || 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s not using ppr assign: %s", ORTE_JOBID_PRINT(jdata->jobid), (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: assigning locations for job %s with ppr %s policy %s", ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr, orte_rmaps_base_print_mapping(jdata->map->mapping)); /* pickup the object level */ if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { level = HWLOC_OBJ_MACHINE; } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { level = HWLOC_OBJ_PU; } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { level = HWLOC_OBJ_CORE; } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { level = HWLOC_OBJ_SOCKET; } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { level = HWLOC_OBJ_L1CACHE; cache_level = 1; } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { level = HWLOC_OBJ_L2CACHE; cache_level = 2; } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { level = HWLOC_OBJ_L3CACHE; cache_level = 3; } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { level = HWLOC_OBJ_NUMANODE; } else { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_TAKE_NEXT_OPTION; } /* get the ppr value */ ppr_req = opal_argv_split(jdata->map->ppr, ','); ck = opal_argv_split(ppr_req[0], ':'); ppr = strtol(ck[0], NULL, 10); opal_argv_free(ck); opal_argv_free(ppr_req); /* start assigning procs to objects, filling each object as we go until * all procs are assigned. */ for (n=0; n < jdata->apps->size; n++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { continue; } nprocs_mapped = 0; for (m=0; m < jdata->map->nodes->size; m++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { continue; } if (NULL == node->topology || NULL == node->topology->topo) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); return ORTE_ERR_SILENT; } if (HWLOC_OBJ_MACHINE == level) { obj = hwloc_get_root_obj(node->topology->topo); for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } if (proc->name.jobid != jdata->jobid) { continue; } orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); } } else { /* get the number of resources on this node at this level */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, level, cache_level, OPAL_HWLOC_AVAILABLE); /* map the specified number of procs to each such resource on this node, * recording the locale of each proc so we know its cpuset */ for (i=0; i < nobjs; i++) { cnt = 0; obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, level, cache_level, i, OPAL_HWLOC_AVAILABLE); for (j=0; j < node->procs->size && cnt < ppr && nprocs_mapped < app->num_procs; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } if (proc->name.jobid != jdata->jobid) { continue; } /* if we already assigned it, then skip */ if (orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, NULL, OPAL_PTR)) { continue; } nprocs_mapped++; cnt++; orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); } } } } } return ORTE_SUCCESS; }
/********************************* * Parsing Functions *********************************/ int rmaps_lama_process_alias_params(orte_job_t *jdata) { int exit_status = ORTE_SUCCESS; /* * Mapping options * Note: L1, L2, L3 are not exposed in orterun to the user, so * there is no need to specify them here. */ if( NULL == rmaps_lama_cmd_map ) { /* orte_rmaps_base.mapping */ switch( ORTE_GET_MAPPING_POLICY(jdata->map->mapping) ) { case ORTE_MAPPING_BYNODE: /* rmaps_lama_cmd_map = strdup("nbNsL3L2L1ch"); */ rmaps_lama_cmd_map = strdup("nbsch"); break; case ORTE_MAPPING_BYBOARD: /* rmaps_lama_cmd_map = strdup("bnNsL3L2L1ch"); */ orte_show_help("help-orte-rmaps-lama.txt", "invalid mapping option", true, "by board", "mapping by board not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; break; case ORTE_MAPPING_BYNUMA: /* rmaps_lama_cmd_map = strdup("NbnsL3L2L1ch"); */ rmaps_lama_cmd_map = strdup("Nbnsch"); break; case ORTE_MAPPING_BYSOCKET: /* rmaps_lama_cmd_map = strdup("sNbnL3L2L1ch"); */ rmaps_lama_cmd_map = strdup("sbnch"); break; case ORTE_MAPPING_BYL3CACHE: rmaps_lama_cmd_map = strdup("L3sNbnL2L1ch"); break; case ORTE_MAPPING_BYL2CACHE: rmaps_lama_cmd_map = strdup("L2sNbnL1ch"); break; case ORTE_MAPPING_BYL1CACHE: rmaps_lama_cmd_map = strdup("L1sNbnch"); break; case ORTE_MAPPING_BYCORE: case ORTE_MAPPING_BYSLOT: /* rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); */ rmaps_lama_cmd_map = strdup("csbnh"); break; case ORTE_MAPPING_BYHWTHREAD: /* rmaps_lama_cmd_map = strdup("hcL1L2L3sNbn"); */ rmaps_lama_cmd_map = strdup("hcsbn"); break; case ORTE_MAPPING_RR: orte_show_help("help-orte-rmaps-lama.txt", "invalid mapping option", true, "round robin", "mapping by round robin not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; case ORTE_MAPPING_SEQ: orte_show_help("help-orte-rmaps-lama.txt", "invalid mapping option", true, "sequential", "mapping by sequential not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; case ORTE_MAPPING_BYUSER: orte_show_help("help-orte-rmaps-lama.txt", "invalid mapping option", true, "by user", "mapping by user not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; default: /* * Default is map-by core */ rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); break; } } /* * Binding Options */ if( NULL == rmaps_lama_cmd_bind ) { /* * No binding specified, use default */ if( !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) || !OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) || OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { rmaps_lama_cmd_bind = NULL; } switch( OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { case OPAL_BIND_TO_BOARD: /* rmaps_lama_cmd_bind = strdup("1b"); */ orte_show_help("help-orte-rmaps-lama.txt", "invalid binding option", true, "by board", "binding to board not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; break; case OPAL_BIND_TO_NUMA: rmaps_lama_cmd_bind = strdup("1N"); break; case OPAL_BIND_TO_SOCKET: rmaps_lama_cmd_bind = strdup("1s"); break; case OPAL_BIND_TO_L3CACHE: rmaps_lama_cmd_bind = strdup("1L3"); break; case OPAL_BIND_TO_L2CACHE: rmaps_lama_cmd_bind = strdup("1L2"); break; case OPAL_BIND_TO_L1CACHE: rmaps_lama_cmd_bind = strdup("1L1"); break; case OPAL_BIND_TO_CORE: rmaps_lama_cmd_bind = strdup("1c"); break; case OPAL_BIND_TO_HWTHREAD: rmaps_lama_cmd_bind = strdup("1h"); break; case OPAL_BIND_TO_CPUSET: orte_show_help("help-orte-rmaps-lama.txt", "invalid binding option", true, "by CPU set", "binding to CPU set not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; break; default: rmaps_lama_cmd_bind = NULL; break; } } /* * Ordering (a.k.a. Ranking) Options */ if( NULL == rmaps_lama_cmd_ordering ) { /* orte_rmaps_base.ranking */ switch( ORTE_GET_RANKING_POLICY(jdata->map->ranking) ) { case ORTE_RANK_BY_SLOT: rmaps_lama_cmd_ordering = strdup("s"); break; case ORTE_RANK_BY_NODE: case ORTE_RANK_BY_NUMA: case ORTE_RANK_BY_SOCKET: case ORTE_RANK_BY_L3CACHE: case ORTE_RANK_BY_L2CACHE: case ORTE_RANK_BY_L1CACHE: case ORTE_RANK_BY_CORE: case ORTE_RANK_BY_HWTHREAD: rmaps_lama_cmd_ordering = strdup("n"); break; case ORTE_RANK_BY_BOARD: /* rmaps_lama_cmd_ordering = strdup("n"); */ orte_show_help("help-orte-rmaps-lama.txt", "invalid ordering option", true, "by board", "ordering by board not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; break; default: rmaps_lama_cmd_ordering = strdup("n"); break; } } /* * MPPR */ if( NULL == rmaps_lama_cmd_mppr ) { /* * The ppr is given in the map */ if( NULL != jdata->map->ppr) { rmaps_lama_cmd_mppr = rmaps_lama_covert_ppr(jdata->map->ppr); } } /* * Oversubscription */ if( ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping) ) { rmaps_lama_can_oversubscribe = false; } else { rmaps_lama_can_oversubscribe = true; } /* * Display revised values */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: Revised Parameters -----"); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: Map : %s", rmaps_lama_cmd_map); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: Bind : %s", rmaps_lama_cmd_bind); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: MPPR : %s", rmaps_lama_cmd_mppr); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: Order : %s", rmaps_lama_cmd_ordering); cleanup: return exit_status; }
static int staged_mapper(orte_job_t *jdata) { mca_base_component_t *c=&mca_rmaps_staged_component.base_version; int i, j, k, rc; orte_app_context_t *app; opal_list_t node_list, desired; orte_std_cntr_t num_slots; orte_proc_t *proc; orte_node_t *node, *next; bool work_to_do = false, first_pass = false; opal_list_item_t *item, *it2; char *cptr, **minimap; orte_vpid_t load; /* only use this mapper if it was specified */ if (NULL == jdata->map->req_mapper || 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name) || ORTE_MAPPING_STAGED != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* I wasn't specified */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:staged: job %s not using staged mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "%s mca:rmaps:staged: mapping job %s with %d procs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), (int)jdata->num_procs); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* if there are no nodes in the map, then this is our first * pass thru this job */ if (0 == jdata->map->num_nodes) { first_pass = true; } /* we assume that the app_contexts are in priority order, * with the highest priority being the first entry in the * job's app_context array. Loop across the app_contexts * in order, looking for apps that have not been * fully mapped */ for (i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } /* has it been fully mapped? */ if (ORTE_APP_STATE_ALL_MAPPED <= app->state) { continue; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s mca:rmaps:staged: working app %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app); /* find nodes that meet any constraints provided in the form of * -hostfile or -host directives */ OBJ_CONSTRUCT(&node_list, opal_list_t); /* get nodes based on a strict interpretation of the location hints */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, false, true))) { /* we were unable to get any nodes that match those * specified in the app */ if (ORTE_ERR_RESOURCE_BUSY == rc) { /* if the return is "busy", then at least one of the * specified resources must exist, but no slots are * currently available. This means there is at least * a hope of eventually being able to map this app * within its specified constraints, so continue working */ if (orte_soft_locations) { /* if soft locations were given, then we know that * none of the nodes in this allocation are available, * so there is no point in continuing to check the * remaining apps */ while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); goto complete; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s mca:rmaps:staged: all nodes for this app are currently busy", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); OBJ_DESTRUCT(&node_list); continue; } else { /* this indicates that there are no nodes that match * the specified constraints, so there is no hope of * ever being able to execute this app. This is an * unrecoverable error - note that a return of * "silent" means that the function already printed * an error message, so the error_log will print nothing */ ORTE_ERROR_LOG(rc); return rc; } } /* if a max number of procs/node was given for this * app, remove all nodes from the list that exceed * that limit */ if (0 < app->max_procs_per_node) { item = opal_list_get_first(&node_list); while (item != opal_list_get_end(&node_list)) { it2 = opal_list_get_next(item); node = (orte_node_t*)item; if (app->max_procs_per_node <= node->num_procs) { opal_list_remove_item(&node_list, item); OBJ_RELEASE(item); } item = it2; } } /* if we have no available nodes, then move on to next app */ if (0 == opal_list_get_size(&node_list)) { OBJ_DESTRUCT(&node_list); continue; } /* if the app specified locations, soft or not, search the list of nodes * for those that match the requested locations and move those * to the desired list so we use them first */ if (NULL != app->dash_host) { OBJ_CONSTRUCT(&desired, opal_list_t); /* no particular order is required */ for (j=0; j < opal_argv_count(app->dash_host); j++) { minimap = opal_argv_split(app->dash_host[j], ','); for (k=0; k < opal_argv_count(minimap); k++) { cptr = minimap[k]; for (item = opal_list_get_first(&node_list); item != opal_list_get_end(&node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; if (0 == strcmp(node->name, cptr) || (0 == strcmp("localhost", cptr) && 0 == strcmp(node->name, orte_process_info.nodename))) { opal_list_remove_item(&node_list, item); opal_list_append(&desired, item); opal_output_verbose(10, orte_rmaps_base_framework.framework_output, "%s mca:rmaps:staged: placing node %s on desired list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name); break; } } } opal_argv_free(minimap); } /* if no nodes made the transition and the app specified soft * locations, then we can skip to look at the non-desired list */ if (0 == opal_list_get_size(&desired)) { OBJ_DESTRUCT(&desired); if (orte_soft_locations) { goto process; } else { /* move on to next app */ continue; } } /* cycle thru the procs for this app and attempt to map them * to the desired nodes using a load-balancing algo */ for (j=0; j < app->procs.size; j++) { if (NULL == (proc = opal_pointer_array_get_item(&app->procs, j))) { continue; } if (ORTE_PROC_STATE_UNDEF != proc->state) { /* this proc has already been mapped or executed */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s mca:rmaps:staged: proc %s has already been mapped", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name)); continue; } /* flag that there is at least one proc still to * be executed */ work_to_do = true; /* track number mapped */ jdata->num_mapped++; /* find the lightest-loaded node on the desired list */ node = NULL; load = ORTE_VPID_MAX; for (item = opal_list_get_first(&desired); item != opal_list_get_end(&desired); item = opal_list_get_next(item)) { next = (orte_node_t*)item; if (next->num_procs < load) { node = next; load = next->num_procs; } } /* put the proc there */ proc->node = node; proc->nodename = node->name; /* the local rank is the number of procs * on this node from this job - we don't * directly track this number, so it must * be found by looping across the node->procs * array and counting it each time. For now, * since we don't use this value in this mode * of operation, just set it to something arbitrary */ proc->local_rank = node->num_procs; /* the node rank is simply the number of procs * on the node at this time */ proc->node_rank = node->num_procs; /* track number of procs on node and number of slots used */ node->num_procs++; node->slots_inuse++; opal_output_verbose(10, orte_rmaps_base_framework.framework_output, "%s Proc %s on node %s: slots %d inuse %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name, (int)node->slots, (int)node->slots_inuse); if (node->slots_inuse == node->slots) { opal_list_remove_item(&desired, &node->super); OBJ_RELEASE(node); } if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(proc); return rc; } /* retain the proc struct so that we correctly track its release */ OBJ_RETAIN(proc); proc->state = ORTE_PROC_STATE_INIT; /* flag the proc as updated so it will be included * in the next pidmap message */ proc->updated =true; /* add the node to the map, if needed */ if (!node->mapped) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); return rc; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ jdata->map->num_nodes++; } if (0 == opal_list_get_size(&desired)) { /* nothing more we can do */ break; } } /* clear the list */ while (NULL != (item = opal_list_remove_first(&desired))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&desired); } process: for (j=0; j < app->procs.size; j++) { if (NULL == (proc = opal_pointer_array_get_item(&app->procs, j))) { continue; } if (ORTE_PROC_STATE_UNDEF != proc->state) { /* this proc has already been mapped or executed */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s mca:rmaps:staged: proc %s has already been mapped", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name)); continue; } /* find the lightest-loaded node on the node list */ node = NULL; load = ORTE_VPID_MAX; for (item = opal_list_get_first(&node_list); item != opal_list_get_end(&node_list); item = opal_list_get_next(item)) { next = (orte_node_t*)item; if (next->num_procs < load) { node = next; load = next->num_procs; } } /* flag that there is at least one proc still to * be executed */ work_to_do = true; /* track number mapped */ jdata->num_mapped++; /* map this proc to the first available slot */ OBJ_RETAIN(node); /* maintain accounting on object */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s mca:rmaps:staged: assigning proc %s to node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name); proc->node = node; proc->nodename = node->name; /* the local rank is the number of procs * on this node from this job - we don't * directly track this number, so it must * be found by looping across the node->procs * array and counting it each time. For now, * since we don't use this value in this mode * of operation, just set it to something arbitrary */ proc->local_rank = node->num_procs; /* the node rank is simply the number of procs * on the node at this time */ proc->node_rank = node->num_procs; /* track number of procs on node and number of slots used */ node->num_procs++; node->slots_inuse++; opal_output_verbose(10, orte_rmaps_base_framework.framework_output, "%s Proc %s on node %s: slots %d inuse %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name, (int)node->slots, (int)node->slots_inuse); if (node->slots_inuse == node->slots) { opal_list_remove_item(&node_list, &node->super); OBJ_RELEASE(node); } if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(proc); return rc; } /* retain the proc struct so that we correctly track its release */ OBJ_RETAIN(proc); proc->state = ORTE_PROC_STATE_INIT; /* flag the proc as updated so it will be included * in the next pidmap message */ proc->updated =true; /* add the node to the map, if needed */ if (!node->mapped) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); return rc; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ jdata->map->num_nodes++; } if (0 == opal_list_get_size(&node_list)) { /* nothing more we can do */ break; } } /* clear the list */ while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); } complete: /* if there isn't at least one proc that can be launched, * then indicate that we don't need to proceed with the * launch sequence */ if (!work_to_do) { return ORTE_ERR_RESOURCE_BUSY; } /* flag that the job was updated so it will be * included in the pidmap message */ jdata->updated = true; /* if we successfully mapped ALL procs in the first pass, * then this job is capable of supporting MPI procs */ if (first_pass && jdata->num_mapped == jdata->num_procs) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s mca:rmaps:staged: job %s is MPI-capable", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid)); jdata->gang_launched = true; } return ORTE_SUCCESS; }