static int orte_rmaps_rank_file_open(void) { /* ensure we flag mapping by user */ #if OPAL_HAVE_HWLOC if (NULL != opal_hwloc_base_slot_list || NULL != orte_rankfile) { #else if (NULL != orte_rankfile) { #endif if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { /* if a non-default mapping is already specified, then we * have an error */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "RANK_FILE", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_CONFLICTED); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYUSER); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* we are going to bind to cpuset since the user is specifying the cpus */ OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET); /* make us first */ my_priority = 10000; } return ORTE_SUCCESS; } static int orte_rmaps_rank_file_query(mca_base_module_t **module, int *priority) { *priority = my_priority; *module = (mca_base_module_t *)&orte_rmaps_rank_file_module; return ORTE_SUCCESS; }
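/*
 * Illustrative note (not part of the original source): this component only
 * claims a job when the user supplied a rankfile (or an hwloc slot list),
 * and it then raises my_priority to 10000 so the framework's query loop
 * prefers it over the default round-robin mappers.  A hypothetical rankfile
 * that would trigger this path could look like:
 *
 *     rank 0=nodeA slot=0:0-1
 *     rank 1=nodeA slot=0:2-3
 *     rank 2=nodeB slot=1
 *
 * Each line pins one rank to a host and an optional socket:core range; the
 * exact syntax accepted is defined by the rank_file parser, so treat this as
 * a sketch rather than a definitive format reference.
 */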
/* * Sequentially map the ranks according to the placement in the * specified hostfile */ static int orte_rmaps_seq_map(orte_job_t *jdata) { orte_job_map_t *map; orte_app_context_t *app; int i, n; orte_std_cntr_t j; opal_list_item_t *item; orte_node_t *node, *nd, *save=NULL; orte_vpid_t vpid; orte_std_cntr_t num_nodes; int rc; opal_list_t *default_node_list=NULL; opal_list_t *node_list=NULL; orte_proc_t *proc; mca_base_component_t *c = &mca_rmaps_seq_component.base_version; OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, "%s rmaps:seq mapping job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* this mapper can only handle initial launch * when seq mapping is desired - allow * restarting of failed apps */ if (ORTE_JOB_CONTROL_RESTART & jdata->controls) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:seq: job %s is being restarted - seq cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper && 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:seq: job %s not using sequential mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* I don't know how to do these - defer */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:seq: job %s not using seq mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:seq: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* convenience def */ map = jdata->map; /* if there is a default hostfile, go and get its ordered list of nodes */ if (NULL != orte_default_hostfile) { default_node_list = OBJ_NEW(opal_list_t); if (ORTE_SUCCESS != (rc = orte_util_get_ordered_host_list(default_node_list, orte_default_hostfile))) { ORTE_ERROR_LOG(rc); goto error; } } /* start at the beginning... */ vpid = 0; jdata->num_procs = 0; if (NULL != default_node_list) { save = (orte_node_t*)opal_list_get_first(default_node_list); } /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } /* dash-host trumps hostfile */ if (NULL != app->dash_host) { node_list = OBJ_NEW(opal_list_t); if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(node_list, app->dash_host))) { ORTE_ERROR_LOG(rc); goto error; } nd = (orte_node_t*)opal_list_get_first(node_list); } else if (NULL != app->hostfile) { node_list = OBJ_NEW(opal_list_t); if (ORTE_SUCCESS != (rc = orte_util_get_ordered_host_list(node_list, app->hostfile))) { ORTE_ERROR_LOG(rc); goto error; } nd = (orte_node_t*)opal_list_get_first(node_list); } else if (NULL != default_node_list) { node_list = default_node_list; nd = save; } else { /* can't do anything - no nodes available!
*/ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); return ORTE_ERR_SILENT; } /* check for nolocal and remove the head node, if required */ if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) { for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item) ) { node = (orte_node_t*)item; /* need to check ifislocal because the name in the * hostfile may not have been FQDN, while name returned * by gethostname may have been (or vice versa) */ if (opal_ifislocal(node->name)) { opal_list_remove_item(node_list, item); OBJ_RELEASE(item); /* "un-retain" it */ } } } if (NULL == node_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(node_list))) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); return ORTE_ERR_SILENT; } /* if num_procs wasn't specified, set it now */ if (0 == app->num_procs) { app->num_procs = num_nodes; } for (n=0; n < app->num_procs; n++) { /* find this node on the global array - this is necessary so * that our mapping gets saved on that array as the objects * returned by the hostfile function are -not- on the array */ node = NULL; for (j=0; j < orte_node_pool->size; j++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) { continue; } if (0 == strcmp(nd->name, node->name)) { break; } } if (NULL == node) { /* wasn't found - that is an error */ orte_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:resource-not-found", true, nd->name); rc = ORTE_ERR_SILENT; goto error; } /* ensure the node is in the map */ if (!node->mapped) { OBJ_RETAIN(node); opal_pointer_array_add(map->nodes, node); node->mapped = true; } proc = orte_rmaps_base_setup_proc(jdata, node, i); if ((node->slots < (int)node->num_procs) || (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, node->num_procs, app->app); rc = ORTE_ERR_SILENT; goto error; } /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } /* assign the vpid */ proc->name.vpid = vpid++; #if OPAL_HAVE_HWLOC /* assign the locale - okay for the topo to be null as * it just means it wasn't returned */ if (NULL != node->topology) { proc->locale = hwloc_get_root_obj(node->topology); } #endif /* add to the jdata proc array */ if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); goto error; } /* move to next node */ nd = (orte_node_t*)opal_list_get_next((opal_list_item_t*)nd); } /** track the total number of processes we mapped */ jdata->num_procs += app->num_procs; /* cleanup the node list if it came from this app_context */ if (node_list != default_node_list) { while (NULL != (item = opal_list_remove_first(node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(node_list); } else { save = nd; } } return ORTE_SUCCESS; error: if (NULL != default_node_list) { while (NULL != (item = opal_list_remove_first(default_node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(default_node_list); } if (NULL != node_list) { while (NULL != (item = opal_list_remove_first(node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(node_list); } return rc; }
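/*
 * Worked example (illustrative, not from the original source): with a
 * default hostfile containing
 *
 *     nodeA
 *     nodeA
 *     nodeB
 *
 * and three processes requested, the loop above consumes one hostfile entry
 * per process, so vpids 0 and 1 land on nodeA and vpid 2 lands on nodeB.
 * If a process count was not given, app->num_procs is set to the number of
 * entries (3 here).  The "save" pointer remembers where the walk stopped so
 * the next app_context resumes from that point in the default hostfile.
 */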
/* * Function for selecting one component from all those that are * available. */ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) { orte_job_t *jdata; orte_job_map_t *map; int rc; bool did_map; opal_list_item_t *item; orte_rmaps_base_selected_module_t *mod; orte_job_t *parent; orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; /* convenience */ jdata = caddy->jdata; jdata->state = ORTE_JOB_STATE_MAP; /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A * NULL MAP FIELD * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY * DIDN'T SET IT */ if (NULL == jdata->map) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: creating new map for job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* create a map object where we will store the results */ map = OBJ_NEW(orte_job_map_t); if (NULL == map) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* load it with the system defaults */ map->mapping = orte_rmaps_base.mapping; map->ranking = orte_rmaps_base.ranking; #if OPAL_HAVE_HWLOC map->binding = opal_hwloc_binding_policy; #endif if (NULL != orte_rmaps_base.ppr) { map->ppr = strdup(orte_rmaps_base.ppr); } map->cpus_per_rank = orte_rmaps_base.cpus_per_rank; map->display_map = orte_rmaps_base.display_map; /* assign the map object to this job */ jdata->map = map; } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: setting mapping policies for job %s", ORTE_JOBID_PRINT(jdata->jobid)); if (!jdata->map->display_map) { jdata->map->display_map = orte_rmaps_base.display_map; } /* set the default mapping policy IFF it wasn't provided */ if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping); } if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)); } /* ditto for rank and bind policies */ if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking); } #if OPAL_HAVE_HWLOC if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { jdata->map->binding = opal_hwloc_binding_policy; } #endif } #if OPAL_HAVE_HWLOC /* if we are not going to launch, then we need to set any * undefined topologies to match our own so the mapper * can operate */ if (orte_do_not_launch) { orte_node_t *node; hwloc_topology_t t0; int i; node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); t0 = node->topology; for (i=1; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } if (NULL == node->topology) { node->topology = t0; } } } #endif /* cycle thru the available mappers until one agrees to map * the job */ did_map = false; if (1 == opal_list_get_size(&orte_rmaps_base.selected_modules)) { /* forced selection */ mod = 
(orte_rmaps_base_selected_module_t*)opal_list_get_first(&orte_rmaps_base.selected_modules); jdata->map->req_mapper = strdup(mod->component->mca_component_name); } for (item = opal_list_get_first(&orte_rmaps_base.selected_modules); item != opal_list_get_end(&orte_rmaps_base.selected_modules); item = opal_list_get_next(item)) { mod = (orte_rmaps_base_selected_module_t*)item; if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata)) || ORTE_ERR_RESOURCE_BUSY == rc) { did_map = true; break; } /* mappers return "next option" if they didn't attempt to * map the job. anything else is a true error. */ if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) { /* the map was done but nothing could be mapped * for launch as all the resources were busy */ OBJ_RELEASE(caddy); return; } /* if we get here without doing the map, or with zero procs in * the map, then that's an error */ if (!did_map || 0 == jdata->num_procs) { orte_show_help("help-orte-rmaps-base.txt", "failed-map", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* compute and save local ranks */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } #if OPAL_HAVE_HWLOC /* compute and save bindings */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } #endif /* set the offset so shared memory components can potentially * connect to any spawned jobs */ jdata->offset = orte_total_procs; /* track the total number of procs launched by us */ orte_total_procs += jdata->num_procs; /* if it is a dynamic spawn, save the bookmark on the parent's job too */ if (ORTE_JOBID_INVALID != jdata->originator.jobid) { if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) { parent->bookmark = jdata->bookmark; } } /* if we wanted to display the map, now is the time to do it - ignore * daemon job */ if (jdata->map->display_map) { char *output=NULL; int i, j; orte_node_t *node; orte_proc_t *proc; if (orte_display_diffable_output) { /* intended solely to test mapping methods, this output * can become quite long when testing at scale. Rather * than enduring all the malloc/free's required to * create an arbitrary-length string, custom-generate * the output a line at a time here */ /* display just the procs in a diffable format */ opal_output(orte_clean_output, "<map>\n\t<jobid=%s>\n\t<offset=%s>", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset)); fflush(stderr); /* loop through nodes */ for (i=0; i < jdata->map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? 
"UNKNOWN" : node->name); fflush(stderr); for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } #if OPAL_HAVE_HWLOC { char locale[64]; if (NULL != proc->locale) { hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset); } opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s>", ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx, (unsigned long)proc->local_rank, (unsigned long)proc->node_rank, locale, (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap); } #else opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>", ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx, (unsigned long)proc->local_rank, (unsigned long)proc->node_rank); #endif fflush(stderr); } opal_output(orte_clean_output, "\t</host>"); fflush(stderr); } #if OPAL_HAVE_HWLOC { opal_hwloc_locality_t locality; orte_proc_t *p0; /* test locality - for the first node, print the locality of each proc relative to the first one */ node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0); p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0); opal_output(orte_clean_output, "\t<locality>"); for (j=1; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } locality = opal_hwloc_base_get_relative_locality(node->topology, p0->cpu_bitmap, proc->cpu_bitmap); opal_output(orte_clean_output, "\t\t<rank=%s rank=%s locality=%s>", ORTE_VPID_PRINT(p0->name.vpid), ORTE_VPID_PRINT(proc->name.vpid), opal_hwloc_base_print_locality(locality)); } opal_output(orte_clean_output, "\t</locality>\n</map>"); fflush(stderr); } #else opal_output(orte_clean_output, "\n</map>"); fflush(stderr); #endif } else { opal_output(orte_clean_output, " Data for JOB %s offset %s", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset)); opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP); if (orte_xml_output) { fprintf(orte_xml_fp, "%s\n", output); fflush(orte_xml_fp); } else { opal_output(orte_clean_output, "%s", output); } free(output); } } /* set the job state to the next position */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE); /* cleanup */ OBJ_RELEASE(caddy); }
int orte_rmaps_rr_byslot(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs) { int rc, i, nprocs_mapped; orte_node_t *node; orte_proc_t *proc; int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0; #if OPAL_HAVE_HWLOC hwloc_obj_t obj=NULL; #endif float balance; bool add_one=false; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu", ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs); /* check to see if we can map all the procs */ if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); return ORTE_ERR_SILENT; } #if OPAL_HAVE_HWLOC /* if we will and are allowed to oversubscribe, and binding was given, then * we really should warn the user that we cannot bind */ if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { if ((OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding) || OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { /* RHC: don't emit this warning at this time while we try to * determine the best path forward. See * https://svn.open-mpi.org/trac/ompi/ticket/4345 * for an explanation orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed", true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank); OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); */ } } else { /* don't default to bound */ OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); } #endif } /* first pass: map the number of procs to each node until we * map all specified procs or use all allocated slots */ nprocs_mapped = 0; OPAL_LIST_FOREACH(node, node_list, orte_node_t) { opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot working node %s", node->name); #if OPAL_HAVE_HWLOC /* get the root object as we are not assigning * locale here except at the node level */ if (NULL != node->topology) { obj = hwloc_get_root_obj(node->topology); } #endif if (node->slots <= node->slots_inuse) { opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot node %s is full - skipping", node->name); continue; } /* assign a number of procs equal to the number of available * slots divided by the number of cpus/rank the user * requested */ num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot assigning %d procs to node %s", (int)num_procs_to_assign, node->name); for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) { /* add this node to the map - do it only once */ if (!node->mapped) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); return rc; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; #if OPAL_HAVE_HWLOC proc->locale = obj; #endif } }
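/*
 * Worked example for the first pass above (numbers are illustrative): a node
 * with slots=8 and slots_inuse=2 under cpus_per_rank=2 gets
 * num_procs_to_assign = (8 - 2) / 2 = 3, so at most three more procs are
 * placed on it in this pass.  Per the "first pass" comment above, any procs
 * still unmapped once every node's free slots are consumed are handled by
 * the remainder of the function (not shown here).
 */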
/* * Query the registry for all nodes allocated to a specified app_context */ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots, orte_app_context_t *app, orte_mapping_policy_t policy, bool initial_map, bool silent) { opal_list_item_t *item, *next; orte_node_t *node, *nd, *nptr; orte_std_cntr_t num_slots; orte_std_cntr_t i; int rc; orte_job_t *daemons; bool novm; opal_list_t nodes; char *hosts; /** set default answer */ *total_num_slots = 0; /* get the daemon job object */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); /* see if we have a vm or not */ novm = orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL); /* if this is NOT a managed allocation, then we use the nodes * that were specified for this app - there is no need to collect * all available nodes and "filter" them */ if (!orte_managed_allocation) { OBJ_CONSTRUCT(&nodes, opal_list_t); /* if the app provided a dash-host, and we are not treating * them as requested or "soft" locations, then use those nodes */ hosts = NULL; if (!orte_soft_locations && orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using dash_host %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts)); if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, false))) { ORTE_ERROR_LOG(rc); free(hosts); return rc; } free(hosts); } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) { /* otherwise, if the app provided a hostfile, then use that */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts)); if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, hosts))) { free(hosts); ORTE_ERROR_LOG(rc); return rc; } free(hosts); } else if (NULL != orte_rankfile) { /* use the rankfile, if provided */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using rankfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_rankfile)); if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_rankfile))) { ORTE_ERROR_LOG(rc); return rc; } if (0 == opal_list_get_size(&nodes)) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s nothing found in given rankfile", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); OBJ_DESTRUCT(&nodes); return ORTE_ERR_BAD_PARAM; } } else if (NULL != orte_default_hostfile) { /* fall back to the default hostfile, if provided */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using default hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_default_hostfile)); if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_default_hostfile))) { ORTE_ERROR_LOG(rc); return rc; } /* this is a special case - we always install a default * hostfile, but it is empty. If the user didn't remove it * or put something into it, then we will have pursued that * option and found nothing. 
This isn't an error, we just need * to add all the known nodes */ if (0 == opal_list_get_size(&nodes)) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s nothing in default hostfile - using known nodes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto addknown; } } else { /* if nothing else was available, then use all known nodes, which * will include ourselves */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using known nodes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto addknown; } /** if we still don't have anything */ if (0 == opal_list_get_size(&nodes)) { if (!silent) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); } OBJ_DESTRUCT(&nodes); return ORTE_ERR_SILENT; } /* find the nodes in our node array and assemble them * in daemon order if the vm was launched */ while (NULL != (item = opal_list_remove_first(&nodes))) { nptr = (orte_node_t*)item; nd = NULL; for (i=0; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } if (0 != strcmp(node->name, nptr->name)) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s DOESNT MATCH NODE %s", node->name, nptr->name)); continue; } /* ignore nodes that are marked as do-not-use for this mapping */ if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED NO_USE", node->name)); /* reset the state so it can be used another time */ node->state = ORTE_NODE_STATE_UP; continue; } if (ORTE_NODE_STATE_DOWN == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS DOWN", node->name)); continue; } if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED NO_INCLUDE", node->name)); /* not to be used */ continue; } /* if this node wasn't included in the vm (e.g., by -host), ignore it, * unless we are mapping prior to launching the vm */ if (NULL == node->daemon && !novm) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s HAS NO DAEMON", node->name)); continue; } /* retain a copy for our use in case the item gets * destructed along the way */ OBJ_RETAIN(node); if (initial_map) { /* if this is the first app_context we * are getting for an initial map of a job, * then mark all nodes as unmapped */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } if (NULL == nd || NULL == nd->daemon || NULL == node->daemon || nd->daemon->name.vpid < node->daemon->name.vpid) { /* just append to end */ opal_list_append(allocated_nodes, &node->super); nd = node; } else { /* starting from end, put this node in daemon-vpid order */ while (node->daemon->name.vpid < nd->daemon->name.vpid) { if (opal_list_get_begin(allocated_nodes) == opal_list_get_prev(&nd->super)) { /* insert at beginning */ opal_list_prepend(allocated_nodes, &node->super); goto moveon1; } nd = (orte_node_t*)opal_list_get_prev(&nd->super); } item = opal_list_get_next(&nd->super); if (item == opal_list_get_end(allocated_nodes)) { /* we are at the end - just append */ opal_list_append(allocated_nodes, &node->super); } else { nd = (orte_node_t*)item; opal_list_insert_pos(allocated_nodes, item, &node->super); } moveon1: /* reset us back to the end for the next node */ nd = (orte_node_t*)opal_list_get_last(allocated_nodes); } } OBJ_RELEASE(nptr); } OBJ_DESTRUCT(&nodes); /* now prune for usage and compute total 
slots */ goto complete; } addknown: /* if the hnp was allocated, include it unless flagged not to */ if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "HNP IS MARKED NO_USE")); /* clear this for future use, but don't include it */ node->state = ORTE_NODE_STATE_UP; } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) { OBJ_RETAIN(node); if (initial_map) { /* if this is the first app_context we * are getting for an initial map of a job, * then mark all nodes as unmapped */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } opal_list_append(allocated_nodes, &node->super); } } } /* add everything in the node pool that can be used - add them * in daemon order, which may be different than the order in the * node pool. Since an empty list is passed into us, the list at * this point either has the HNP node or nothing, and the HNP * node obviously has a daemon on it (us!) */ if (0 == opal_list_get_size(allocated_nodes)) { /* the list is empty */ nd = NULL; } else { nd = (orte_node_t*)opal_list_get_last(allocated_nodes); } for (i=1; i < orte_node_pool->size; i++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { /* ignore nodes that are marked as do-not-use for this mapping */ if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED NO_USE", node->name)); /* reset the state so it can be used another time */ node->state = ORTE_NODE_STATE_UP; continue; } if (ORTE_NODE_STATE_DOWN == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED DOWN", node->name)); continue; } if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED NO_INCLUDE", node->name)); /* not to be used */ continue; } /* if this node wasn't included in the vm (e.g., by -host), ignore it, * unless we are mapping prior to launching the vm */ if (NULL == node->daemon && !novm) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s HAS NO DAEMON", node->name)); continue; } /* retain a copy for our use in case the item gets * destructed along the way */ OBJ_RETAIN(node); if (initial_map) { /* if this is the first app_context we * are getting for an initial map of a job, * then mark all nodes as unmapped */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } if (NULL == nd || NULL == nd->daemon || NULL == node->daemon || nd->daemon->name.vpid < node->daemon->name.vpid) { /* just append to end */ opal_list_append(allocated_nodes, &node->super); nd = node; } else { /* starting from end, put this node in daemon-vpid order */ while (node->daemon->name.vpid < nd->daemon->name.vpid) { if (opal_list_get_begin(allocated_nodes) == opal_list_get_prev(&nd->super)) { /* insert at beginning */ opal_list_prepend(allocated_nodes, &node->super); goto moveon; } nd = (orte_node_t*)opal_list_get_prev(&nd->super); } item = opal_list_get_next(&nd->super); if (item == opal_list_get_end(allocated_nodes)) { /* we are at the end - just append */ opal_list_append(allocated_nodes, &node->super); } else { nd = (orte_node_t*)item; opal_list_insert_pos(allocated_nodes, item, &node->super); } moveon: /* reset us back to the end for the next node */ nd = 
(orte_node_t*)opal_list_get_last(allocated_nodes); } } } OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Starting with %d nodes in list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)opal_list_get_size(allocated_nodes))); /** check that anything is here */ if (0 == opal_list_get_size(allocated_nodes)) { if (!silent) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); } return ORTE_ERR_SILENT; } /* filter the nodes thru any hostfile and dash-host options */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Filtering thru apps", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, allocated_nodes, true)) && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); return rc; } OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Retained %d nodes in list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)opal_list_get_size(allocated_nodes))); complete: /* remove all nodes that are already at max usage, and * compute the total number of allocated slots while * we do so */ num_slots = 0; item = opal_list_get_first(allocated_nodes); while (item != opal_list_get_end(allocated_nodes)) { /** save the next pointer in case we remove this node */ next = opal_list_get_next(item); /** check to see if this node is fully used - remove if so */ node = (orte_node_t*)item; if (0 != node->slots_max && node->slots_inuse > node->slots_max) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Removing node %s: max %d inuse %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots_max, node->slots_inuse)); opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ } else if (node->slots <= node->slots_inuse && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { /* remove the node as fully used */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Removing node %s slots %d inuse %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots, node->slots_inuse)); opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ } else if (node->slots > node->slots_inuse) { /* add the available slots */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s node %s has %d slots available", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots - node->slots_inuse)); num_slots += node->slots - node->slots_inuse; } else if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { /* nothing needed to do here - we don't add slots to the * count as we don't have any available. 
Just let the mapper * do what it needs to do to meet the request */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s node %s is fully used, but available for oversubscrition", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); } else { /* if we cannot use it, remove it from list */ opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ } /** go on to next item */ item = next; } /* Sanity check to make sure we have resources available */ if (0 == opal_list_get_size(allocated_nodes)) { if (silent) { /* let the caller know that the resources exist, * but are currently busy */ return ORTE_ERR_RESOURCE_BUSY; } else { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:all-available-resources-used", true); return ORTE_ERR_SILENT; } } /* pass back the total number of available slots */ *total_num_slots = num_slots; if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) { opal_output(0, "AVAILABLE NODES FOR MAPPING:"); for (item = opal_list_get_first(allocated_nodes); item != opal_list_get_end(allocated_nodes); item = opal_list_get_next(item)) { node = (orte_node_t*)item; opal_output(0, " node: %s daemon: %s", node->name, (NULL == node->daemon) ? "NULL" : ORTE_VPID_PRINT(node->daemon->name.vpid)); } } return ORTE_SUCCESS; }
int orte_rmaps_rr_bynode(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs) { int j, nprocs_mapped, lag, delta; orte_node_t *node; orte_proc_t *proc; opal_list_item_t *item; int num_procs_to_assign, navg, idx; int extra_procs_to_assign=0, nxtra_nodes=0; #if OPAL_HAVE_HWLOC hwloc_obj_t obj=NULL; #endif float balance; bool add_one=false; bool oversubscribed=false; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by node for job %s app %d slots %d num_procs %lu", ORTE_JOBID_PRINT(jdata->jobid), (int)app->idx, (int)num_slots, (unsigned long)num_procs); /* quick check to see if we can map all the procs */ if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); return ORTE_ERR_SILENT; } oversubscribed = true; } /* divide the procs evenly across all nodes - this is the * average we have to maintain as we go, but we adjust * the number on each node to reflect its available slots. * Obviously, if all nodes have the same number of slots, * then the avg is what we get on each node - this is * the most common situation. */ navg = app->num_procs / opal_list_get_size(node_list); if (0 == navg) { /* if there are less procs than nodes, we have to * place at least one/node */ navg = 1; } /* compute how many extra procs to put on each node */ balance = (float)(app->num_procs - (navg * (float)opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list); extra_procs_to_assign = (int)balance; if (0 < (balance - (float)extra_procs_to_assign)) { /* compute how many nodes need an extra proc */ nxtra_nodes = app->num_procs - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list)); /* add one so that we add an extra proc to the first nodes * until all procs are mapped */ extra_procs_to_assign++; /* flag that we added one */ add_one = true; } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by node navg %d extra_procs %d extra_nodes %d", navg, extra_procs_to_assign, nxtra_nodes); nprocs_mapped = 0; lag = 0; for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; #if OPAL_HAVE_HWLOC /* get the root object as we are not assigning * locale except at the node level */ if (NULL != node->topology) { obj = hwloc_get_root_obj(node->topology); } #endif /* add this node to the map, but only do so once */ if (!node->mapped) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(idx); return idx; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } /* compute the number of procs to go on this node */ if (add_one) { if (0 == nxtra_nodes) { --extra_procs_to_assign; add_one = false; } else { --nxtra_nodes; } } if (oversubscribed) { /* everybody just takes their share */ num_procs_to_assign = navg + extra_procs_to_assign; } else { /* if we are not oversubscribed, then there are enough * slots to handle all the procs. 
However, not every * node will have the same number of slots, so we * have to track how many procs to "shift" elsewhere * to make up the difference */ if (node->slots <= node->slots_inuse) { /* if there are no extras to take, then we can * ignore this node */ if (0 == extra_procs_to_assign) { /* update how many we are lagging behind */ lag += navg; continue; } /* everybody has to take at least the extras */ num_procs_to_assign = extra_procs_to_assign; /* update how many we are lagging behind */ lag += navg; } else { /* if slots < avg (adjusted for cpus/proc), then take all */ if ((node->slots - node->slots_inuse) < (navg * orte_rmaps_base.cpus_per_rank)) { num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank; /* update how many we are lagging behind */ lag += navg - num_procs_to_assign; } else { /* take the avg plus as much of the "lag" as we can */ delta = 0; if (0 < lag) { delta = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) - navg; if (lag < delta) { delta = lag; } lag -= delta; } num_procs_to_assign = navg; } /* add in the extras */ num_procs_to_assign += extra_procs_to_assign; } } for (j=0; j < num_procs_to_assign && nprocs_mapped < app->num_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; #if OPAL_HAVE_HWLOC proc->locale = obj; #endif } /* not all nodes are equal, so only set oversubscribed for * this node if it is in that state */ if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) { /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } if (nprocs_mapped == app->num_procs) { /* we are done */ break; } } /* if we have some remaining lag, then put one/node until * all are assigned */ for (item = opal_list_get_first(node_list); 0 < lag && item != opal_list_get_end(node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; lag--; #if OPAL_HAVE_HWLOC proc->locale = obj; #endif /* not all nodes are equal, so only set oversubscribed for * this node if it is in that state */ if (node->slots < ((int)node->num_procs * orte_rmaps_base.cpus_per_rank)) { /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } if (nprocs_mapped == app->num_procs) { /* we are done */ break; } } return ORTE_SUCCESS; }
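/*
 * Illustrative sketch (not part of the original file): the averaging
 * arithmetic used by the bynode mapper above, isolated into a hypothetical
 * helper.  For example, nprocs=10 over nnodes=3 yields navg=3 with one node
 * taking an extra proc (4 + 3 + 3).  Variable names mirror the code above.
 */
static void rmaps_sketch_bynode_balance(int nprocs, int nnodes,
                                        int *navg, int *extra_procs_to_assign,
                                        int *nxtra_nodes)
{
    float balance;

    *navg = nprocs / nnodes;
    if (0 == *navg) {
        /* fewer procs than nodes - place at least one per node */
        *navg = 1;
    }
    balance = (float)(nprocs - (*navg * nnodes)) / (float)nnodes;
    *extra_procs_to_assign = (int)balance;
    *nxtra_nodes = 0;
    if (0 < (balance - (float)*extra_procs_to_assign)) {
        /* some nodes must take one more proc than the others */
        *nxtra_nodes = nprocs - ((*navg + *extra_procs_to_assign) * nnodes);
        (*extra_procs_to_assign)++;
    }
}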
int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy, char **device, char *spec) { char **ck, **ck2; orte_mapping_policy_t tmp; int i; size_t len; /* set defaults */ tmp = 0; *device = NULL; if (NULL == spec) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET); ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SPAN); ORTE_UNSET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN); } else { ck = opal_argv_split(spec, ':'); if (2 < opal_argv_count(ck)) { /* incorrect format */ orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", rmaps_base_mapping_policy); opal_argv_free(ck); return ORTE_ERR_SILENT; } if (2 == opal_argv_count(ck)) { /* if the policy is "dist", then we set the policy to that value * and save the second argument as the device */ #if OPAL_HAVE_HWLOC if (0 == strncasecmp(ck[0], "dist", strlen(ck[0]))) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYDIST); ck2 = opal_argv_split(ck[1], ','); if (ck2[0] != NULL) { *device = strdup(ck2[0]); for (i=1; NULL != ck2[i]; i++) { if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) { ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SPAN); } } } opal_argv_free(ck2); goto setpolicy; } #endif ck2 = opal_argv_split(ck[1], ','); for (i=0; NULL != ck2[i]; i++) { if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) { ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SPAN); } else if (0 == strncasecmp(ck2[i], "oversubscribe", strlen(ck2[i]))) { ORTE_UNSET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SUBSCRIBE_GIVEN); } else if (0 == strncasecmp(ck2[i], "nooversubscribe", strlen(ck2[i]))) { ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SUBSCRIBE_GIVEN); } else { /* unrecognized modifier */ orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, "mapping", ck2[i]); opal_argv_free(ck); opal_argv_free(ck2); return ORTE_ERR_SILENT; } } opal_argv_free(ck2); } len = strlen(ck[0]); if (0 == strncasecmp(ck[0], "slot", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSLOT); } else if (0 == strncasecmp(ck[0], "node", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYNODE); #if OPAL_HAVE_HWLOC } else if (0 == strncasecmp(ck[0], "core", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYCORE); } else if (0 == strncasecmp(ck[0], "l1cache", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL1CACHE); } else if (0 == strncasecmp(ck[0], "l2cache", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL2CACHE); } else if (0 == strncasecmp(ck[0], "l3cache", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL3CACHE); } else if (0 == strncasecmp(ck[0], "socket", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET); } else if (0 == strncasecmp(ck[0], "numa", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYNUMA); } else if (0 == strncasecmp(ck[0], "board", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYBOARD); } else if (0 == strncasecmp(ck[0], "hwthread", len)) { ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYHWTHREAD); /* if we are mapping processes to individual hwthreads, then * we need to treat those hwthreads as separate cpus */ opal_hwloc_use_hwthreads_as_cpus = true; #endif } else { orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", rmaps_base_mapping_policy); opal_argv_free(ck); return ORTE_ERR_SILENT; } opal_argv_free(ck); ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN); } #if
OPAL_HAVE_HWLOC setpolicy: #endif *policy = tmp; return ORTE_SUCCESS; }
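/*
 * Illustrative usage sketch (not part of the original file): how a caller
 * might exercise the parser above.  The spec strings are made-up examples;
 * error handling is elided.  Policy names may be abbreviated because the
 * strncasecmp() calls compare only strlen(ck[0]) characters.
 */
static void rmaps_sketch_mapping_policy_examples(void)
{
    orte_mapping_policy_t pol = 0;
    char *dev = NULL;

    /* "socket:span" -> ORTE_MAPPING_BYSOCKET with the SPAN directive */
    (void) orte_rmaps_base_set_mapping_policy(&pol, &dev, "socket:span");

    /* "slot:oversubscribe" -> ORTE_MAPPING_BYSLOT, no-oversubscribe cleared */
    (void) orte_rmaps_base_set_mapping_policy(&pol, &dev, "slot:oversubscribe");

    /* hwloc builds only: "dist:mlx4_0" -> ORTE_MAPPING_BYDIST, dev = "mlx4_0" */
    (void) orte_rmaps_base_set_mapping_policy(&pol, &dev, "dist:mlx4_0");
    if (NULL != dev) {
        free(dev);
    }
}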
static int ppr_mapper(orte_job_t *jdata) { int rc = ORTE_SUCCESS, j, n; mca_base_component_t *c=&mca_rmaps_ppr_component.base_version; orte_node_t *node; orte_proc_t *proc; orte_app_context_t *app; orte_vpid_t total_procs, nprocs_mapped; opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL; #if OPAL_HAVE_HWLOC hwloc_obj_t obj; hwloc_obj_type_t lowest; unsigned cache_level=0; unsigned int nobjs, i; bool pruning_reqd = false; opal_hwloc_level_t level; #endif opal_list_t node_list; opal_list_item_t *item; orte_std_cntr_t num_slots; orte_app_idx_t idx; char **ppr_req, **ck; size_t len; bool initial_map=true; /* only handle initial launch of loadbalanced * or NPERxxx jobs - allow restarting of failed apps */ if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s being restarted - ppr cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper && 0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s not using ppr mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL == jdata->map->ppr || ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* not for us */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s not using ppr mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: mapping job %s with ppr %s", ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* initialize */ memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t)); /* parse option */ n=0; ppr_req = opal_argv_split(jdata->map->ppr, ','); for (j=0; NULL != ppr_req[j]; j++) { /* split on the colon */ ck = opal_argv_split(ppr_req[j], ':'); if (2 != opal_argv_count(ck)) { /* must provide a specification */ orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr); opal_argv_free(ppr_req); opal_argv_free(ck); return ORTE_ERR_SILENT; } len = strlen(ck[1]); if (0 == strncasecmp(ck[1], "node", len)) { ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10); ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE); start = OPAL_HWLOC_NODE_LEVEL; n++; #if OPAL_HAVE_HWLOC } else if (0 == strncasecmp(ck[1], "hwthread", len) || 0 == strncasecmp(ck[1], "thread", len)) { ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10); start = OPAL_HWLOC_HWTHREAD_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD); n++; } else if (0 == strncasecmp(ck[1], "core", len)) { ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_CORE_LEVEL) { start = OPAL_HWLOC_CORE_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE); } n++; } else if (0 == strncasecmp(ck[1], "socket", len) || 0 == strncasecmp(ck[1], "skt", len)) { ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_SOCKET_LEVEL) { start = OPAL_HWLOC_SOCKET_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET); } n++; } else if (0 == strncasecmp(ck[1], "l1cache", len)) { ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 
10); if (start < OPAL_HWLOC_L1CACHE_LEVEL) { start = OPAL_HWLOC_L1CACHE_LEVEL; cache_level = 1; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE); } n++; } else if (0 == strncasecmp(ck[1], "l2cache", len)) { ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_L2CACHE_LEVEL) { start = OPAL_HWLOC_L2CACHE_LEVEL; cache_level = 2; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE); } n++; } else if (0 == strncasecmp(ck[1], "l3cache", len)) { ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_L3CACHE_LEVEL) { start = OPAL_HWLOC_L3CACHE_LEVEL; cache_level = 3; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE); } n++; } else if (0 == strncasecmp(ck[1], "numa", len)) { ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10); if (start < OPAL_HWLOC_NUMA_LEVEL) { start = OPAL_HWLOC_NUMA_LEVEL; ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA); } n++; #endif } else { /* unknown spec */ orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr); opal_argv_free(ppr_req); opal_argv_free(ck); return ORTE_ERR_SILENT; } opal_argv_free(ck); } opal_argv_free(ppr_req); /* if nothing was given, that's an error */ if (0 == n) { opal_output(0, "NOTHING GIVEN"); return ORTE_ERR_SILENT; } #if OPAL_HAVE_HWLOC /* if more than one level was specified, then pruning will be reqd */ if (1 < n) { pruning_reqd = true; } #endif opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s assigned policy %s", ORTE_JOBID_PRINT(jdata->jobid), orte_rmaps_base_print_mapping(jdata->map->mapping)); #if OPAL_HAVE_HWLOC /* convenience */ level = start; lowest = opal_hwloc_levels[start]; #endif for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { continue; } /* if the number of total procs was given, set that * limit - otherwise, set to max so we simply fill * all the nodes with the pattern */ if (0 < app->num_procs) { total_procs = app->num_procs; } else { total_procs = ORTE_VPID_MAX; } /* get the available nodes */ OBJ_CONSTRUCT(&node_list, opal_list_t); if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, initial_map, false))) { ORTE_ERROR_LOG(rc); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ initial_map = false; /* if a bookmark exists from some prior mapping, set us to start there */ jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata); /* cycle across the nodes */ nprocs_mapped = 0; for (item = opal_list_get_first(&node_list); item != opal_list_get_end(&node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; #if OPAL_HAVE_HWLOC /* bozo check */ if (NULL == node->topology) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); rc = ORTE_ERR_SILENT; goto error; } #endif /* add the node to the map, if needed */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); goto error; } ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); OBJ_RETAIN(node); /* maintain accounting on object */ jdata->map->num_nodes++; } /* if we are mapping solely at the node level, just put * that many procs on this node */ if (OPAL_HWLOC_NODE_LEVEL == start) { #if OPAL_HAVE_HWLOC obj = 
hwloc_get_root_obj(node->topology); #endif for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) { rc = ORTE_ERR_OUT_OF_RESOURCE; goto error; } nprocs_mapped++; #if OPAL_HAVE_HWLOC orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); #endif } #if OPAL_HAVE_HWLOC } else { /* get the number of lowest resources on this node */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, lowest, cache_level, OPAL_HWLOC_AVAILABLE); /* map the specified number of procs to each such resource on this node, * recording the locale of each proc so we know its cpuset */ for (i=0; i < nobjs; i++) { obj = opal_hwloc_base_get_obj_by_type(node->topology, lowest, cache_level, i, OPAL_HWLOC_AVAILABLE); for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) { rc = ORTE_ERR_OUT_OF_RESOURCE; goto error; } nprocs_mapped++; orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); } } if (pruning_reqd) { /* go up the ladder and prune the procs according to * the specification, adjusting the count of procs on the * node as we go */ level--; prune(jdata->jobid, idx, node, &level, &nprocs_mapped); } #endif } /* set the total slots used */ if ((int)node->num_procs <= node->slots) { node->slots_inuse = (int)node->num_procs; } else { node->slots_inuse = node->slots; } /* if no-oversubscribe was specified, check to see if * we have violated the total slot specification - regardless, * if slots_max was given, we are not allowed to violate it! */ if ((node->slots < (int)node->num_procs) || (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, node->num_procs, app->app); rc = ORTE_ERR_SILENT; goto error; } /* flag the node as oversubscribed so that sched-yield gets * properly set */ ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED); } /* if we haven't mapped all the procs, continue on to the * next node */ if (total_procs == nprocs_mapped) { break; } } if (0 == app->num_procs) { app->num_procs = nprocs_mapped; } if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) { /* couldn't map them all */ orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs", true, app->app, app->num_procs, jdata->map->ppr); rc = ORTE_ERR_SILENT; goto error; } /* compute vpids and add proc objects to the job */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { ORTE_ERROR_LOG(rc); goto error; } /* track the total number of processes we mapped - must update * this AFTER we compute vpids so that computation is done * correctly */ jdata->num_procs += app->num_procs; while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); } return ORTE_SUCCESS; error: while (NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&node_list); return rc; }
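/*
 * Illustrative examples of the jdata->map->ppr strings handled above (values
 * are made up): the string is a comma-separated list of "<count>:<resource>"
 * pairs, so "2:node" places two procs on every node and "1:socket" places
 * one per socket.  A spec naming more than one level, e.g. "4:core,1:socket",
 * sets pruning_reqd so that prune() walks up from the core level trimming
 * procs until the one-per-socket limit is also satisfied.
 */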
int orte_rmaps_rr_byslot(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs) { int i, nprocs_mapped; orte_node_t *node; orte_proc_t *proc; int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0; hwloc_obj_t obj=NULL; float balance; bool add_one=false; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu", ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs); /* check to see if we can map all the procs */ if (num_slots < (int)app->num_procs) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app, orte_process_info.nodename); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return ORTE_ERR_SILENT; } } /* first pass: map the number of procs to each node until we * map all specified procs or use all allocated slots */ nprocs_mapped = 0; OPAL_LIST_FOREACH(node, node_list, orte_node_t) { opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot working node %s", node->name); /* get the root object as we are not assigning * locale here except at the node level */ if (NULL != node->topology && NULL != node->topology->topo) { obj = hwloc_get_root_obj(node->topology->topo); } if (node->slots <= node->slots_inuse) { opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot node %s is full - skipping", node->name); continue; } /* assign a number of procs equal to the number of available slots */ num_procs_to_assign = node->slots - node->slots_inuse; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot assigning %d procs to node %s", (int)num_procs_to_assign, node->name); for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) { /* add this node to the map - do it only once */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); ++(jdata->map->num_nodes); } if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); } }
/** * Function for finding and opening either all MCA components, or the one * that was specifically requested via a MCA parameter. */ static int orte_rmaps_base_open(mca_base_open_flag_t flags) { int rc; /* init the globals */ OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t); orte_rmaps_base.ppr = NULL; orte_rmaps_base.slot_list = NULL; orte_rmaps_base.mapping = 0; orte_rmaps_base.ranking = 0; #if OPAL_HAVE_HWLOC /* if a topology file was given, then set our topology * from it. Even though our actual topology may differ, * mpirun only needs to see the compute node topology * for mapping purposes */ if (NULL != rmaps_base_topo_file) { if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) { orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file); return ORTE_ERR_SILENT; } } #endif if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping, &orte_rmaps_base.device, rmaps_base_mapping_policy))) { return rc; } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking, orte_rmaps_base.mapping, rmaps_base_ranking_policy))) { return rc; } if (rmaps_base_byslot) { /* set mapping policy to byslot - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to byslot - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_bynode) { /* set mapping policy to bynode - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bynode - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, 
ORTE_RANKING_GIVEN); } /* Should we schedule on the local node or not? */ if (rmaps_base_no_schedule_local) { orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL; } /* Should we oversubscribe or not? */ if (rmaps_base_no_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); } /** force oversubscription permission */ if (rmaps_base_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); } /* should we display a detailed (developer-quality) version of the map after determining it? */ if (rmaps_base_display_devel_map) { orte_rmaps_base.display_map = true; orte_devel_level_output = true; } /* should we display a diffable report of proc locations after determining it? */ if (rmaps_base_display_diffable_map) { orte_rmaps_base.display_map = true; orte_display_diffable_output = true; } /* Open up all available components */ rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags); /* check to see if any component indicated a problem */ if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { /* the component would have already reported the error, so * tell the rest of the chain to shut up */ return ORTE_ERR_SILENT; } /* All done */ return rc; }
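/*
 * Illustrative sketch (standalone, not ORTE code): the conflict checks above
 * depend on packing a mapping "policy" and a set of "directive" flags into a
 * single integer, so a later option can detect that an earlier option already
 * chose a policy.  The macro names and the 8/8 bit split below are assumptions
 * made only for illustration - the real ORTE bit layout may differ.
 */
#include <stdio.h>
#include <stdint.h>

typedef uint16_t demo_mapping_policy_t;

/* low byte: the policy value; high byte: directive flags */
#define DEMO_GET_POLICY(m)        ((m) & 0x00ff)
#define DEMO_SET_POLICY(m, p)     ((m) = ((m) & 0xff00) | ((p) & 0x00ff))
#define DEMO_GET_DIRECTIVES(m)    ((m) & 0xff00)
#define DEMO_SET_DIRECTIVE(m, d)  ((m) |= (d))

#define DEMO_MAPPING_BYSLOT   0x01
#define DEMO_MAPPING_BYNODE   0x02
#define DEMO_MAPPING_GIVEN    0x0100   /* a policy was explicitly requested */

int main(void)
{
    demo_mapping_policy_t mapping = 0;

    /* first option: map by slot */
    DEMO_SET_POLICY(mapping, DEMO_MAPPING_BYSLOT);
    DEMO_SET_DIRECTIVE(mapping, DEMO_MAPPING_GIVEN);

    /* a second option tries to map by node - detect the redefinition */
    if ((DEMO_MAPPING_GIVEN & DEMO_GET_DIRECTIVES(mapping)) &&
        DEMO_GET_POLICY(mapping) != DEMO_MAPPING_BYNODE) {
        printf("error: mapping policy already set to 0x%02x\n",
               DEMO_GET_POLICY(mapping));
        return 1;
    }
    return 0;
}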
static int byobj_span(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs, hwloc_obj_type_t target, unsigned cache_level) { int i, j, nprocs_mapped, lag, delta, navg; orte_node_t *node; orte_proc_t *proc; opal_list_item_t *item; int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0; int extra_procs_to_assign=0, nxtra_nodes=0, idx; hwloc_obj_t obj=NULL; unsigned int nobjs; float balance; bool add_one=false; bool oversubscribed=false; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping span by %s for job %s slots %d num_procs %lu", hwloc_obj_type_string(target), ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs); /* quick check to see if we can map all the procs - can't * do more because we don't know how many total objects exist * across all the nodes */ if (num_slots < (int)app->num_procs) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); return ORTE_ERR_SILENT; } oversubscribed = true; } /* divide the procs evenly across all nodes - this is the * average we have to maintain as we go, but we adjust * the number on each node to reflect its available slots. * Obviously, if all nodes have the same number of slots, * then the avg is what we get on each node - this is * the most common situation. */ navg = app->num_procs / opal_list_get_size(node_list); if (0 == navg) { /* if there are less procs than nodes, we have to * place at least one/node */ navg = 1; } /* compute how many extra procs to put on each node */ balance = (float)((jdata->num_procs + app->num_procs) - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list); extra_procs_to_assign = (int)balance; if (0 < (balance - (float)extra_procs_to_assign)) { /* compute how many nodes need an extra proc */ nxtra_nodes = (jdata->num_procs + app->num_procs) - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list)); /* add one so that we add an extra proc to the first nodes * until all procs are mapped */ extra_procs_to_assign++; /* flag that we added one */ add_one = true; } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by %s navg %d extra_procs %d extra_nodes %d", hwloc_obj_type_string(target), navg, extra_procs_to_assign, nxtra_nodes); nprocs_mapped = 0; lag = 0; for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; /* bozo check */ if (NULL == node->topology) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); return ORTE_ERR_SILENT; } /* add this node to the map, if reqd */ if (!node->mapped) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(idx); return idx; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } /* compute the number of procs to go on this node */ if (add_one) { if (0 == nxtra_nodes) { --extra_procs_to_assign; add_one = false; } else { --nxtra_nodes; } } if (oversubscribed) { /* everybody just takes their share */ num_procs_to_assign = navg + extra_procs_to_assign; } else { /* if we are not oversubscribed, then there are enough * slots to handle all the procs. 
However, not every * node will have the same number of slots, so we * have to track how many procs to "shift" elsewhere * to make up the difference */ if (node->slots <= node->slots_inuse) { /* if there are no extras to take, then we can * safely remove this node as we don't need it */ if (0 == extra_procs_to_assign) { opal_pointer_array_set_item(jdata->map->nodes, node->index, NULL); OBJ_RELEASE(node); --(jdata->map->num_nodes); /* update how many we are lagging behind */ lag += navg; continue; } /* everybody has to take at least the extras */ num_procs_to_assign = extra_procs_to_assign; /* update how many we are lagging behind */ lag += navg; } else { /* if slots < avg, then take all */ if ((node->slots - node->slots_inuse) < navg) { num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign; /* update how many we are lagging behind */ lag += navg - (node->slots - node->slots_inuse); } else { /* take the avg plus as much of the "lag" as we can */ delta = 0; if (0 < lag) { delta = (node->slots - node->slots_inuse) - navg; if (lag < delta) { delta = lag; } lag -= delta; } num_procs_to_assign = navg + delta + extra_procs_to_assign; } } } /* get the number of objects of this type on this node */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE); opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name); /* compute the number of procs to go on each object */ nperobj = num_procs_to_assign / nobjs; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: placing %d procs on each object", nperobj); if ((int)(nperobj * nobjs) < num_procs_to_assign) { /* compute how many objs need an extra proc */ nxtra_objs = num_procs_to_assign - nperobj * nobjs; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs); } /* loop through the number of objects */ for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) { /* get the hwloc object */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } /* map the reqd number of procs */ if (0 < nxtra_objs) { nprocs = nperobj + 1; --nxtra_objs; } else { nprocs = nperobj; } for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; proc->locale = obj; } /* keep track of the node we last used */ jdata->bookmark = node; } /* not all nodes are equal, so only set oversubscribed for * this node if it is in that state */ if (node->slots < (int)node->num_procs) { /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } if (nprocs_mapped == app->num_procs) { /* we are done */ break; } } return ORTE_SUCCESS; }
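/*
 * Illustrative sketch (standalone, not ORTE code): the span mapper above
 * spreads procs across nodes by computing an average per node and then giving
 * one extra proc to the first few nodes until the remainder is consumed.
 * This toy program reproduces just that arithmetic; variable names mirror the
 * ones used above, the input values are invented, and procs already mapped by
 * earlier app contexts are ignored.
 */
#include <stdio.h>

int main(void)
{
    int num_procs = 10;          /* procs to place (example value) */
    int num_nodes = 4;           /* nodes in the list (example value) */
    int navg, extra_procs_to_assign, nxtra_nodes = 0;
    int assigned = 0;
    float balance;
    int add_one = 0;

    navg = num_procs / num_nodes;
    if (0 == navg) {
        navg = 1;                /* fewer procs than nodes: at least one each */
    }
    balance = (float)(num_procs - navg * num_nodes) / (float)num_nodes;
    extra_procs_to_assign = (int)balance;
    if (0 < (balance - (float)extra_procs_to_assign)) {
        /* some nodes must absorb one more proc than the others */
        nxtra_nodes = num_procs - (navg + extra_procs_to_assign) * num_nodes;
        extra_procs_to_assign++;
        add_one = 1;
    }

    for (int n = 0; n < num_nodes; n++) {
        if (add_one) {
            if (0 == nxtra_nodes) {
                --extra_procs_to_assign;   /* remainder consumed: drop back */
                add_one = 0;
            } else {
                --nxtra_nodes;
            }
        }
        int nprocs = navg + extra_procs_to_assign;
        assigned += nprocs;
        printf("node %d gets %d procs\n", n, nprocs);
    }
    printf("total assigned: %d of %d\n", assigned, num_procs);
    return 0;
}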
/* mapping by hwloc object looks a lot like mapping by node, * but has the added complication of possibly having different * numbers of objects on each node */ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs, hwloc_obj_type_t target, unsigned cache_level) { int i, j, nprocs_mapped; orte_node_t *node; orte_proc_t *proc; opal_list_item_t *item; int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0; int extra_procs_to_assign=0, nxtra_nodes=0, idx; hwloc_obj_t obj=NULL; unsigned int nobjs; float balance; bool add_one=false; /* there are two modes for mapping by object: span and not-span. The * span mode essentially operates as if there was just a single * "super-node" in the system - i.e., it balances the load across * all objects of the indicated type regardless of their location. * In essence, it acts as if we placed one proc on each object, cycling * across all objects on all nodes, and then wrapped around to place * another proc on each object, doing so until all procs were placed. * * In contrast, the non-span mode operates similar to byslot mapping. * All slots on each node are filled, assigning each proc to an object * on that node in a balanced fashion, and then the mapper moves on * to the next node. Thus, procs tend to be "front loaded" onto the * list of nodes, as opposed to being "load balanced" in the span mode */ if (ORTE_MAPPING_SPAN & jdata->map->mapping) { return byobj_span(jdata, app, node_list, num_slots, num_procs, target, cache_level); } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping no-span by %s for job %s slots %d num_procs %lu", hwloc_obj_type_string(target), ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs); /* quick check to see if we can map all the procs - can't * do more because we don't know how many total objects exist * across all the nodes */ if (num_slots < (app->num_procs * orte_rmaps_base.cpus_per_rank)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); return ORTE_ERR_SILENT; } /* compute how many extra procs to put on each node */ if (1 == opal_list_get_size(node_list)) { /* if there is only one node, then they all have to go on it */ extra_procs_to_assign = app->num_procs; } else { balance = (float)(((jdata->num_procs + app->num_procs)*orte_rmaps_base.cpus_per_rank) - num_slots) / (float)opal_list_get_size(node_list); extra_procs_to_assign = (int)balance; if (0 < (balance - (float)extra_procs_to_assign)) { /* compute how many nodes need an extra proc */ nxtra_nodes = ((jdata->num_procs + app->num_procs)*orte_rmaps_base.cpus_per_rank) - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list)); /* add one so that we add an extra proc to the first nodes * until all procs are mapped */ extra_procs_to_assign++; /* flag that we added one */ add_one = true; } } } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping no-span by %s extra_procs %d extra_nodes %d", hwloc_obj_type_string(target), extra_procs_to_assign, nxtra_nodes); nprocs_mapped = 0; for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; /* bozo check */ if (NULL == node->topology) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); 
return ORTE_ERR_SILENT; } /* add this node to the map, if reqd */ if (!node->mapped) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(idx); return idx; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } /* compute the number of procs to go on this node */ if (add_one) { if (0 == nxtra_nodes) { --extra_procs_to_assign; add_one = false; } else { --nxtra_nodes; } } if (node->slots <= node->slots_inuse) { /* everybody takes at least the extras */ num_procs_to_assign = extra_procs_to_assign; } else { num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank + extra_procs_to_assign; if (app->num_procs < num_procs_to_assign) { /* might have more slots than procs */ num_procs_to_assign = app->num_procs; } } /* get the number of objects of this type on this node */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE); opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: nprocs-to-assign %d for %d objs on node %s", num_procs_to_assign, nobjs, node->name); /* if there are no objects of this type, then report the error * and abort - this can happen, for example, on systems that * don't report "sockets" as an independent object. However, IF * this object is the default one - i.e., not specified by the * user - then we can fall back to mapping by slot */ if (0 == nobjs) { if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects", true, hwloc_obj_type_string(target), node->name); return ORTE_ERR_SILENT; } else { /* this was the default mapping policy, so clear the map * of any prior work and indicate that map-by slot is reqd */ for (i=0; i < jdata->map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } for (idx=0; idx < node->procs->size; idx++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, idx))) { continue; } if (proc->name.jobid != jdata->jobid) { continue; } --node->num_procs; OBJ_RELEASE(proc); opal_pointer_array_set_item(node->procs, idx, NULL); } if (0 == node->num_procs) { node->mapped = false; OBJ_RELEASE(node); opal_pointer_array_set_item(jdata->map->nodes, i, NULL); } } return ORTE_ERR_NOT_SUPPORTED; } } /* compute the number of procs to go on each object */ nperobj = num_procs_to_assign / nobjs; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: placing %d procs on each object", nperobj); if ((int)(nperobj * nobjs) < num_procs_to_assign) { /* compute how many objs need an extra proc */ nxtra_objs = num_procs_to_assign - nperobj * nobjs; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs); } /* loop through the number of objects */ for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) { /* get the hwloc object */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } /* map the reqd number of procs */ if (0 < nxtra_objs) { nprocs = nperobj + 1; --nxtra_objs; } else { nprocs = nperobj; } for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, 
app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; proc->locale = obj; } } /* not all nodes are equal, so only set oversubscribed for * this node if it is in that state */ if (node->slots < (int)node->num_procs) { /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } if (nprocs_mapped == app->num_procs) { /* we are done */ break; } } return ORTE_SUCCESS; }
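/*
 * Illustrative sketch (standalone, not ORTE code): once the byobj mappers
 * above know how many procs a node should take, they split that count evenly
 * across the hwloc objects on the node and let the first few objects absorb
 * the remainder, one extra proc each.  This toy function shows only that
 * split; the object count and proc count are invented inputs.
 */
#include <stdio.h>

static void split_across_objects(int num_procs_to_assign, int nobjs)
{
    int nperobj = num_procs_to_assign / nobjs;                 /* base share */
    int nxtra_objs = num_procs_to_assign - nperobj * nobjs;    /* remainder  */

    for (int i = 0; i < nobjs; i++) {
        int nprocs = (i < nxtra_objs) ? nperobj + 1 : nperobj;
        printf("object %d gets %d procs\n", i, nprocs);
    }
}

int main(void)
{
    split_across_objects(7, 3);   /* e.g. 7 procs over 3 sockets -> 3,2,2 */
    return 0;
}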
int orte_rmaps_rr_byslot(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs) { int rc, i, nprocs_mapped; orte_node_t *node; orte_proc_t *proc; opal_list_item_t *item; int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0; #if OPAL_HAVE_HWLOC hwloc_obj_t obj=NULL; #endif float balance; bool add_one=false; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu", ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs); /* check to see if we can map all the procs */ if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); return ORTE_ERR_SILENT; } } /* first pass: map the number of procs to each node until we * map all specified procs or use all allocated slots */ nprocs_mapped = 0; for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot working node %s", node->name); #if OPAL_HAVE_HWLOC /* get the root object as we are not assigning * locale here except at the node level */ if (NULL != node->topology) { obj = hwloc_get_root_obj(node->topology); } #endif if (node->slots <= node->slots_inuse) { opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot node %s is full - skipping", node->name); continue; } /* assign a number of procs equal to the number of available * slots divided by the number of cpus/rank the user * requested */ num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot assigning %d procs to node %s", (int)num_procs_to_assign, node->name); for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) { /* add this node to the map - do it only once */ if (!node->mapped) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); return rc; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; #if OPAL_HAVE_HWLOC proc->locale = obj; #endif } } if (nprocs_mapped == app->num_procs) { /* we are done */ return ORTE_SUCCESS; } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot job %s is oversubscribed - performing second pass", ORTE_JOBID_PRINT(jdata->jobid)); /* second pass: if we haven't mapped everyone yet, it is * because we are oversubscribed. 
Figure out how many procs * to add */ balance = (float)(app->num_procs - nprocs_mapped) / (float)opal_list_get_size(node_list); extra_procs_to_assign = (int)balance; if (0 < (balance - (float)extra_procs_to_assign)) { /* compute how many nodes need an extra proc */ nxtra_nodes = app->num_procs - nprocs_mapped - (extra_procs_to_assign * opal_list_get_size(node_list)); /* add one so that we add an extra proc to the first nodes * until all procs are mapped */ extra_procs_to_assign++; /* flag that we added one */ add_one = true; } for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot working node %s", node->name); #if OPAL_HAVE_HWLOC /* get the root object as we are not assigning * locale except at the node level */ if (NULL != node->topology) { obj = hwloc_get_root_obj(node->topology); } #endif /* add this node to the map - do it only once */ if (!node->mapped) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); return rc; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } if (add_one) { if (0 == nxtra_nodes) { --extra_procs_to_assign; add_one = false; } else { --nxtra_nodes; } } num_procs_to_assign = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) + extra_procs_to_assign; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot adding up to %d procs to node %s", num_procs_to_assign, node->name); for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; #if OPAL_HAVE_HWLOC proc->locale = obj; #endif } /* not all nodes are equal, so only set oversubscribed for * this node if it is in that state */ if (node->slots < (int)node->num_procs) { /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } } return ORTE_SUCCESS; }
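/*
 * Illustrative sketch (standalone, not ORTE code): the byslot mapper above
 * works in two passes - fill each node's free slots first, then, if procs
 * remain, spread the overflow evenly across the nodes.  This toy program
 * mimics that control flow with plain arrays; the slot counts and proc count
 * are invented, and cpus-per-rank is taken as 1.
 */
#include <stdio.h>

#define NNODES 3

int main(void)
{
    int slots[NNODES]  = { 4, 2, 2 };   /* allocated slots per node */
    int mapped[NNODES] = { 0, 0, 0 };   /* procs placed per node */
    int num_procs = 12;
    int nprocs_mapped = 0;

    /* first pass: fill available slots */
    for (int n = 0; n < NNODES && nprocs_mapped < num_procs; n++) {
        while (mapped[n] < slots[n] && nprocs_mapped < num_procs) {
            mapped[n]++;
            nprocs_mapped++;
        }
    }

    /* second pass: oversubscribe - spread the leftover procs evenly,
     * with the first few nodes taking one extra */
    if (nprocs_mapped < num_procs) {
        int leftover = num_procs - nprocs_mapped;
        int extra = leftover / NNODES;
        int nxtra_nodes = leftover - extra * NNODES;
        for (int n = 0; n < NNODES; n++) {
            int take = extra + ((n < nxtra_nodes) ? 1 : 0);
            mapped[n] += take;
            nprocs_mapped += take;
        }
    }

    for (int n = 0; n < NNODES; n++) {
        printf("node %d: %d procs on %d slots%s\n", n, mapped[n], slots[n],
               (mapped[n] > slots[n]) ? " (oversubscribed)" : "");
    }
    return 0;
}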
/** * Function for finding and opening either all MCA components, or the one * that was specifically requested via a MCA parameter. */ static int orte_rmaps_base_open(mca_base_open_flag_t flags) { int rc; /* init the globals */ OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t); orte_rmaps_base.slot_list = NULL; orte_rmaps_base.mapping = 0; orte_rmaps_base.ranking = 0; orte_rmaps_base.device = NULL; /* if a topology file was given, then set our topology * from it. Even though our actual topology may differ, * mpirun only needs to see the compute node topology * for mapping purposes */ if (NULL != rmaps_base_topo_file) { if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) { orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file); return ORTE_ERR_SILENT; } } /* check for violations that has to be detected before we parse the mapping option */ if (NULL != orte_rmaps_base.ppr) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--ppr, -ppr", "--map-by ppr:<pattern>", "rmaps_base_pattern, rmaps_ppr_pattern", "rmaps_base_mapping_policy=ppr:<pattern>"); /* if the mapping policy is NULL, then we can proceed */ if (NULL == rmaps_base_mapping_policy) { asprintf(&rmaps_base_mapping_policy, "ppr:%s", orte_rmaps_base.ppr); } else { return ORTE_ERR_SILENT; } } if (1 < orte_rmaps_base.cpus_per_rank) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--cpus-per-proc, -cpus-per-proc, --cpus-per-rank, -cpus-per-rank", "--map-by <obj>:PE=N, default <obj>=NUMA", "rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N, default <obj>=NUMA"); } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping, &orte_rmaps_base.device, rmaps_base_mapping_policy))) { return rc; } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking, orte_rmaps_base.mapping, rmaps_base_ranking_policy))) { return rc; } if (rmaps_base_bycore) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--bycore, -bycore", "--map-by core", "rmaps_base_bycore", "rmaps_base_mapping_policy=core"); /* set mapping policy to bycore - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYCORE) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bycore", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYCORE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bycore - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_CORE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bycore", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_CORE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_byslot) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--byslot, -byslot", "--map-by slot", "rmaps_base_byslot", "rmaps_base_mapping_policy=slot"); /* set mapping 
policy to byslot - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to byslot - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_bynode) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--bynode, -bynode", "--map-by node", "rmaps_base_bynode", "rmaps_base_mapping_policy=node"); /* set mapping policy to bynode - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bynode - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (1 < orte_rmaps_base.cpus_per_rank) { /* if we were asked for multiple cpus/proc, then we have to * bind to those cpus - any other binding policy is an * error */ if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) { if (opal_hwloc_use_hwthreads_as_cpus) { if (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, orte_rmaps_base.cpus_per_rank, "use-hwthreads-as-cpus", opal_hwloc_base_print_binding(opal_hwloc_binding_policy), "bind-to hwthread"); return ORTE_ERR_SILENT; } } else if (OPAL_BIND_TO_CORE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, orte_rmaps_base.cpus_per_rank, "cores as cpus", opal_hwloc_base_print_binding(opal_hwloc_binding_policy), "bind-to core"); return 
ORTE_ERR_SILENT; } } else { if (opal_hwloc_use_hwthreads_as_cpus) { OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD); } else { OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE); } } /* we also need to ensure we are mapping to a high-enough level to have * multiple cpus beneath it - by default, we'll go to the NUMA level */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { if (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYHWTHREAD || (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYCORE && !opal_hwloc_use_hwthreads_as_cpus)) { orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low-init", true); return ORTE_ERR_SILENT; } } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s rmaps:base pe/rank set - setting mapping to BYNUMA", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNUMA); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); } } if (orte_rmaps_base_pernode) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ orte_rmaps_base.ppr = strdup("1:node"); } if (0 < orte_rmaps_base_n_pernode) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ asprintf(&orte_rmaps_base.ppr, "%d:node", orte_rmaps_base_n_pernode); } if (0 < orte_rmaps_base_n_persocket) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ asprintf(&orte_rmaps_base.ppr, "%d:socket", orte_rmaps_base_n_persocket); } /* Should we schedule on the local node or not? */ if (rmaps_base_no_schedule_local) { orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL; } /* Should we oversubscribe or not? 
*/ if (rmaps_base_no_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); } /** force oversubscription permission */ if (rmaps_base_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); /* also set the overload allowed flag */ opal_hwloc_binding_policy |= OPAL_BIND_ALLOW_OVERLOAD; } /* should we display a detailed (developer-quality) version of the map after determining it? */ if (rmaps_base_display_devel_map) { orte_rmaps_base.display_map = true; orte_devel_level_output = true; } /* should we display a diffable report of proc locations after determining it? */ if (rmaps_base_display_diffable_map) { orte_rmaps_base.display_map = true; orte_display_diffable_output = true; } /* Open up all available components */ rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags); /* check to see if any component indicated a problem */ if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { /* the component would have already reported the error, so * tell the rest of the chain to shut up */ return ORTE_ERR_SILENT; } /* All done */ return rc; }
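/*
 * Illustrative sketch (standalone, not ORTE code): the deprecated
 * -npernode/-npersocket options handled above are translated into a "ppr"
 * (processes-per-resource) string of the form "<N>:<object>" before the
 * mappers ever see them.  This fragment shows just that translation;
 * asprintf() is a GNU extension, hence the _GNU_SOURCE define, and the
 * example count is invented.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int n_pernode = 2;        /* e.g. what "-npernode 2" would request */
    char *ppr = NULL;

    if (0 > asprintf(&ppr, "%d:node", n_pernode)) {
        return 1;             /* allocation failed */
    }
    printf("mapping policy becomes ppr:%s\n", ppr);
    free(ppr);
    return 0;
}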
int orte_rmaps_rr_byslot(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs) { int rc, i, nprocs_mapped; orte_node_t *node; orte_proc_t *proc; int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0; #if OPAL_HAVE_HWLOC hwloc_obj_t obj=NULL; #endif float balance; bool add_one=false; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu", ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs); /* check to see if we can map all the procs */ if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); return ORTE_ERR_SILENT; } } /* first pass: map the number of procs to each node until we * map all specified procs or use all allocated slots */ nprocs_mapped = 0; OPAL_LIST_FOREACH(node, node_list, orte_node_t) { opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot working node %s", node->name); #if OPAL_HAVE_HWLOC /* get the root object as we are not assigning * locale here except at the node level */ if (NULL != node->topology) { obj = hwloc_get_root_obj(node->topology); } #endif if (node->slots <= node->slots_inuse) { opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot node %s is full - skipping", node->name); continue; } /* assign a number of procs equal to the number of available * slots divided by the number of cpus/rank the user * requested */ num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot assigning %d procs to node %s", (int)num_procs_to_assign, node->name); for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) { /* add this node to the map - do it only once */ if (!node->mapped) { if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(rc); return rc; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; #if OPAL_HAVE_HWLOC proc->locale = obj; #endif } }
int orte_rmaps_base_set_ranking_policy(orte_ranking_policy_t *policy, orte_mapping_policy_t mapping, char *spec) { orte_mapping_policy_t map; orte_ranking_policy_t tmp; char **ck; size_t len; /* set default */ tmp = 0; if (NULL == spec) { /* check for map-by object directives - we set the * ranking to match if one was given */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { map = ORTE_GET_MAPPING_POLICY(mapping); switch (map) { case ORTE_MAPPING_BYSLOT: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT); break; case ORTE_MAPPING_BYNODE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NODE); break; case ORTE_MAPPING_BYCORE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_CORE); break; case ORTE_MAPPING_BYL1CACHE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L1CACHE); break; case ORTE_MAPPING_BYL2CACHE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L2CACHE); break; case ORTE_MAPPING_BYL3CACHE: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L3CACHE); break; case ORTE_MAPPING_BYSOCKET: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SOCKET); break; case ORTE_MAPPING_BYNUMA: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NUMA); break; case ORTE_MAPPING_BYBOARD: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_BOARD); break; case ORTE_MAPPING_BYHWTHREAD: ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_HWTHREAD); break; default: /* anything not tied to a specific hw obj can rank by slot */ ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT); break; } } else { /* if no map-by was given, default to by-slot */ ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT); } } else { ck = opal_argv_split(spec, ':'); if (2 < opal_argv_count(ck)) { /* incorrect format */ orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", policy); opal_argv_free(ck); return ORTE_ERR_SILENT; } if (2 == opal_argv_count(ck)) { if (0 == strncasecmp(ck[1], "span", strlen(ck[1]))) { ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_SPAN); } else if (0 == strncasecmp(ck[1], "fill", strlen(ck[1]))) { ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_FILL); } else { /* unrecognized modifier */ orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, ck[1]); opal_argv_free(ck); return ORTE_ERR_SILENT; } } len = strlen(ck[0]); if (0 == strncasecmp(ck[0], "slot", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT); } else if (0 == strncasecmp(ck[0], "node", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NODE); } else if (0 == strncasecmp(ck[0], "hwthread", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_HWTHREAD); } else if (0 == strncasecmp(ck[0], "core", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_CORE); } else if (0 == strncasecmp(ck[0], "l1cache", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L1CACHE); } else if (0 == strncasecmp(ck[0], "l2cache", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L2CACHE); } else if (0 == strncasecmp(ck[0], "l3cache", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L3CACHE); } else if (0 == strncasecmp(ck[0], "socket", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SOCKET); } else if (0 == strncasecmp(ck[0], "numa", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NUMA); } else if (0 == strncasecmp(ck[0], "board", len)) { ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_BOARD); } else { orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", rmaps_base_ranking_policy); opal_argv_free(ck); return ORTE_ERR_SILENT; } opal_argv_free(ck); ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_GIVEN); } *policy = tmp; return ORTE_SUCCESS; }
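/*
 * Illustrative sketch (standalone, not ORTE code): the ranking-policy parser
 * above accepts a spec of the form "<policy>[:<modifier>]", matches the
 * policy name by case-insensitive prefix, and treats "span"/"fill" as the
 * only legal modifiers.  This toy parser shows the same shape using only
 * libc; the table of names is abbreviated and the input string is invented.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>   /* strncasecmp */

static int parse_ranking_spec(const char *spec, const char **policy_out,
                              const char **modifier_out)
{
    static const char *known[] = { "slot", "node", "core", "socket", NULL };
    char buf[64];
    char *colon;

    snprintf(buf, sizeof(buf), "%s", spec);
    *modifier_out = NULL;

    /* at most one ":modifier" section is allowed */
    if (NULL != (colon = strchr(buf, ':'))) {
        *colon = '\0';
        if (0 == strncasecmp(colon + 1, "span", strlen(colon + 1))) {
            *modifier_out = "span";
        } else if (0 == strncasecmp(colon + 1, "fill", strlen(colon + 1))) {
            *modifier_out = "fill";
        } else {
            return -1;   /* unrecognized modifier */
        }
    }

    /* prefix match against the known policy names */
    for (int i = 0; NULL != known[i]; i++) {
        if (0 == strncasecmp(buf, known[i], strlen(buf))) {
            *policy_out = known[i];
            return 0;
        }
    }
    return -1;           /* unrecognized policy */
}

int main(void)
{
    const char *policy, *modifier;

    if (0 == parse_ranking_spec("sock:span", &policy, &modifier)) {
        printf("rank by %s, modifier %s\n", policy,
               modifier ? modifier : "(none)");
    }
    return 0;
}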
static void recv_data(int fd, short args, void *cbdata) { bool found; int i, rc; orte_node_t *nd, *nd2; opal_list_t nds, ndtmp; opal_list_item_t *item, *itm; char recv_msg[8192]; int nbytes, idx, sjob; char **alloc, *nodelist, *tpn; local_jobtracker_t *ptr, *jtrk; local_apptracker_t *aptrk; orte_app_context_t *app; orte_jobid_t jobid; orte_job_t *jdata; char **dash_host = NULL; opal_output_verbose(2, orte_ras_base_framework.framework_output, "%s ras:slurm: dynamic allocation - data recvd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* read the data from the socket and put it in the * nodes field of op */ memset(recv_msg, 0, sizeof(recv_msg)); nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1); opal_output_verbose(2, orte_ras_base_framework.framework_output, "%s ras:slurm: dynamic allocation msg: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg); /* check if we got something */ if (0 == nbytes || 0 == strlen(recv_msg) || strstr(recv_msg, "failure") != NULL) { /* show an error here - basically, a "nothing was available" * message */ orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, (0 == strlen(recv_msg)) ? "NO MSG" : recv_msg); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED); return; } /* break the message into its component parts, separated by colons */ alloc = opal_argv_split(recv_msg, ':'); /* the first section contains the ORTE jobid for this allocation */ tpn = strchr(alloc[0], '='); orte_util_convert_string_to_jobid(&jobid, tpn+1); /* get the corresponding job object */ jdata = orte_get_job_data_object(jobid); jtrk = NULL; /* find the associated tracking object */ for (item = opal_list_get_first(&jobs); item != opal_list_get_end(&jobs); item = opal_list_get_next(item)) { ptr = (local_jobtracker_t*)item; if (ptr->jobid == jobid) { jtrk = ptr; break; } } if (NULL == jtrk) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER"); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); return; } /* stop the timeout event */ opal_event_del(&jtrk->timeout_ev); /* cycle across all the remaining parts - each is the allocation for * an app in this job */ OBJ_CONSTRUCT(&nds, opal_list_t); OBJ_CONSTRUCT(&ndtmp, opal_list_t); idx = -1; sjob = -1; nodelist = NULL; tpn = NULL; for (i=1; NULL != alloc[i]; i++) { if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); if (NULL != nodelist) { free(nodelist); } if (NULL != tpn) { free(tpn); } return; } if (idx < 0) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); free(nodelist); free(tpn); return; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) { orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); free(nodelist); free(tpn); return; } /* release the current dash_host as that contained the *desired* allocation */ orte_remove_attribute(&app->attributes, ORTE_APP_DASH_HOST); /* track the Slurm jobid */ if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) { aptrk = OBJ_NEW(local_apptracker_t); opal_pointer_array_set_item(&jtrk->apps, idx, aptrk); } aptrk->sjob = sjob; /* since the 
nodelist/tpn may contain regular expressions, parse them */ if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) { ORTE_ERROR_LOG(rc); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED); opal_argv_free(alloc); free(nodelist); free(tpn); return; } /* transfer the discovered nodes to our node list, and construct * the new dash_host entry to match what was allocated */ while (NULL != (item = opal_list_remove_first(&ndtmp))) { nd = (orte_node_t*)item; opal_argv_append_nosize(&dash_host, nd->name); /* check for duplicates */ found = false; for (itm = opal_list_get_first(&nds); itm != opal_list_get_end(&nds); itm = opal_list_get_next(itm)) { nd2 = (orte_node_t*)itm; if (0 == strcmp(nd->name, nd2->name)) { found = true; nd2->slots += nd->slots; OBJ_RELEASE(item); break; } } if (!found) { /* append the new node to our list */ opal_list_append(&nds, item); } } /* cleanup */ free(nodelist); free(tpn); } /* cleanup */ opal_argv_free(alloc); OBJ_DESTRUCT(&ndtmp); if (NULL != dash_host) { tpn = opal_argv_join(dash_host, ','); orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, (void*)tpn, OPAL_STRING); opal_argv_free(dash_host); free(tpn); } if (opal_list_is_empty(&nds)) { /* if we get here, then we were able to contact slurm, * which means we are in an actively managed cluster. * However, slurm indicated that nothing is currently * available that meets our requirements. This is a fatal * situation - we do NOT have the option of running on * user-specified hosts as the cluster is managed. */ OBJ_DESTRUCT(&nds); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } /* store the found nodes */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nds); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); return; } OBJ_DESTRUCT(&nds); /* default to no-oversubscribe-allowed for managed systems */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); } /* flag that the allocation is managed */ orte_managed_allocation = true; /* move the job along */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE); /* all done */ return; }
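/*
 * Illustrative sketch (standalone, not ORTE code): the dynamic-allocation
 * handler above receives one text message, splits it on ':' into a jobid
 * section followed by one section per app, and reads the jobid from the text
 * after the '='.  This fragment shows only that splitting with libc; the
 * message layout used here is a simplified assumption, not the exact Slurm
 * protocol.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char recv_msg[] = "jobid=1234:alloc-for-app0:alloc-for-app1";
    char *tok = strtok(recv_msg, ":");
    char *eq;
    int idx = 0;

    if (NULL == tok || NULL == (eq = strchr(tok, '='))) {
        fprintf(stderr, "malformed allocation message\n");
        return 1;
    }
    printf("jobid string: %s\n", eq + 1);

    /* each remaining section describes the allocation for one app */
    while (NULL != (tok = strtok(NULL, ":"))) {
        printf("app %d allocation: %s\n", idx++, tok);
    }
    return 0;
}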
/* * Sequentially map the ranks according to the placement in the * specified hostfile */ static int orte_rmaps_seq_map(orte_job_t *jdata) { orte_job_map_t *map; orte_app_context_t *app; int i, n; orte_std_cntr_t j; opal_list_item_t *item; orte_node_t *node, *nd; seq_node_t *sq, *save=NULL, *seq;; orte_vpid_t vpid; orte_std_cntr_t num_nodes; int rc; opal_list_t default_seq_list; opal_list_t node_list, *seq_list, sq_list; orte_proc_t *proc; mca_base_component_t *c = &mca_rmaps_seq_component.base_version; char *hosts = NULL, *sep, *eptr; FILE *fp; opal_hwloc_resource_type_t rtype; OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base_framework.framework_output, "%s rmaps:seq called on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* this mapper can only handle initial launch * when seq mapping is desired - allow * restarting of failed apps */ if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: job %s is being restarted - seq cannot map", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } if (NULL != jdata->map->req_mapper) { if (0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: job %s not using sequential mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } /* we need to process it */ goto process; } if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* I don't know how to do these - defer */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: job %s not using seq mapper", ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } process: opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(c->mca_component_name); /* convenience def */ map = jdata->map; /* if there is a default hostfile, go and get its ordered list of nodes */ OBJ_CONSTRUCT(&default_seq_list, opal_list_t); if (NULL != orte_default_hostfile) { char *hstname = NULL; /* open the file */ fp = fopen(orte_default_hostfile, "r"); if (NULL == fp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; goto error; } while (NULL != (hstname = orte_getline(fp))) { if (0 == strlen(hstname)) { free(hstname); /* blank line - ignore */ continue; } if( '#' == hstname[0] ) { free(hstname); /* Comment line - ignore */ continue; } sq = OBJ_NEW(seq_node_t); if (NULL != (sep = strchr(hstname, ' '))) { *sep = '\0'; sep++; /* remove any trailing space */ eptr = sep + strlen(sep) - 1; while (eptr > sep && isspace(*eptr)) { eptr--; } *(eptr+1) = 0; sq->cpuset = strdup(sep); } // Strip off the FQDN if present, ignore IP addresses if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname) ) { char *ptr; if (NULL != (ptr = strchr(hstname, '.'))) { *ptr = '\0'; } } sq->hostname = hstname; opal_list_append(&default_seq_list, &sq->super); } fclose(fp); } /* start at the beginning... 
*/ vpid = 0; jdata->num_procs = 0; if (0 < opal_list_get_size(&default_seq_list)) { save = (seq_node_t*)opal_list_get_first(&default_seq_list); } /* default to LOGICAL processors */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, NULL, OPAL_BOOL)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using PHYSICAL processors"); rtype = OPAL_HWLOC_PHYSICAL; } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using LOGICAL processors"); rtype = OPAL_HWLOC_LOGICAL; } /* initialize all the nodes as not included in this job map */ for (j=0; j < orte_node_pool->size; j++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) { ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } } /* cycle through the app_contexts, mapping them sequentially */ for(i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } /* dash-host trumps hostfile */ if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using dash-host nodes on app %s", app->app); OBJ_CONSTRUCT(&node_list, opal_list_t); /* dash host entries cannot specify cpusets, so used the std function to retrieve the list */ if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(&node_list, hosts))) { ORTE_ERROR_LOG(rc); free(hosts); goto error; } free(hosts); /* transfer the list to a seq_node_t list */ OBJ_CONSTRUCT(&sq_list, opal_list_t); while (NULL != (nd = (orte_node_t*)opal_list_remove_first(&node_list))) { sq = OBJ_NEW(seq_node_t); sq->hostname = strdup(nd->name); opal_list_append(&sq_list, &sq->super); OBJ_RELEASE(nd); } OBJ_DESTRUCT(&node_list); seq_list = &sq_list; } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) { char *hstname; if (NULL == hosts) { rc = ORTE_ERR_NOT_FOUND; goto error; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using hostfile %s nodes on app %s", hosts, app->app); OBJ_CONSTRUCT(&sq_list, opal_list_t); /* open the file */ fp = fopen(hosts, "r"); if (NULL == fp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; OBJ_DESTRUCT(&sq_list); goto error; } while (NULL != (hstname = orte_getline(fp))) { if (0 == strlen(hstname)) { free(hstname); /* blank line - ignore */ continue; } if( '#' == hstname[0] ) { free(hstname); /* Comment line - ignore */ continue; } sq = OBJ_NEW(seq_node_t); if (NULL != (sep = strchr(hstname, ' '))) { *sep = '\0'; sep++; /* remove any trailing space */ eptr = sep + strlen(sep) - 1; while (eptr > sep && isspace(*eptr)) { eptr--; } *(eptr+1) = 0; sq->cpuset = strdup(sep); } // Strip off the FQDN if present, ignore IP addresses if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname) ) { char *ptr; if (NULL != (ptr = strchr(hstname, '.'))) { (*ptr) = '\0'; } } sq->hostname = hstname; opal_list_append(&sq_list, &sq->super); } fclose(fp); free(hosts); seq_list = &sq_list; } else if (0 < opal_list_get_size(&default_seq_list)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: using default hostfile nodes on app %s", app->app); seq_list = &default_seq_list; } else { /* can't do anything - no nodes available! 
*/ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); return ORTE_ERR_SILENT; } /* check for nolocal and remove the head node, if required */ if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) { for (item = opal_list_get_first(seq_list); item != opal_list_get_end(seq_list); item = opal_list_get_next(item) ) { seq = (seq_node_t*)item; /* need to check ifislocal because the name in the * hostfile may not have been FQDN, while name returned * by gethostname may have been (or vice versa) */ if (orte_ifislocal(seq->hostname)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: removing head node %s", seq->hostname); opal_list_remove_item(seq_list, item); OBJ_RELEASE(item); /* "un-retain" it */ } } } if (NULL == seq_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(seq_list))) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); return ORTE_ERR_SILENT; } /* if num_procs wasn't specified, set it now */ if (0 == app->num_procs) { app->num_procs = num_nodes; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: setting num procs to %s for app %s", ORTE_VPID_PRINT(app->num_procs), app->app); } else if (num_nodes < app->num_procs) { orte_show_help("help-orte-rmaps-base.txt", "seq:not-enough-resources", true, app->num_procs, num_nodes); return ORTE_ERR_SILENT; } if (seq_list == &default_seq_list) { sq = save; } else { sq = (seq_node_t*)opal_list_get_first(seq_list); } for (n=0; n < app->num_procs; n++) { /* find this node on the global array - this is necessary so * that our mapping gets saved on that array as the objects * returned by the hostfile function are -not- on the array */ node = NULL; for (j=0; j < orte_node_pool->size; j++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) { continue; } if (0 == strcmp(sq->hostname, node->name)) { break; } } if (NULL == node) { /* wasn't found - that is an error */ orte_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:resource-not-found", true, sq->hostname); rc = ORTE_ERR_SILENT; goto error; } /* ensure the node is in the map */ if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { OBJ_RETAIN(node); opal_pointer_array_add(map->nodes, node); jdata->map->num_nodes++; ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); } proc = orte_rmaps_base_setup_proc(jdata, node, i); if ((node->slots < (int)node->num_procs) || (0 < node->slots_max && node->slots_max < (int)node->num_procs)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, node->num_procs, app->app); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); rc = ORTE_ERR_SILENT; goto error; } /* flag the node as oversubscribed so that sched-yield gets * properly set */ ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED); /* check for permission */ if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) { /* if we weren't given a directive either way, then we will error out * as the #slots were specifically given, either by the host RM or * via hostfile/dash-host */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return ORTE_ERR_SILENT; } else if 
(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { /* if we were explicitly told not to oversubscribe, then don't */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return ORTE_ERR_SILENT; } } } /* assign the vpid */ proc->name.vpid = vpid++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: assign proc %s to node %s for app %s", ORTE_VPID_PRINT(proc->name.vpid), sq->hostname, app->app); /* record the cpuset, if given */ if (NULL != sq->cpuset) { hwloc_cpuset_t bitmap; char *cpu_bitmap; if (NULL == node->topology) { /* not allowed - for sequential cpusets, we must have * the topology info */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name); rc = ORTE_ERR_SILENT; goto error; } /* if we are using hwthreads as cpus and binding to hwthreads, then * we can just copy the cpuset across as it already specifies things * at that level */ if (opal_hwloc_use_hwthreads_as_cpus && OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { cpu_bitmap = strdup(sq->cpuset); } else { /* setup the bitmap */ bitmap = hwloc_bitmap_alloc(); /* parse the slot_list to find the socket and core */ if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(sq->cpuset, node->topology, rtype, bitmap))) { ORTE_ERROR_LOG(rc); hwloc_bitmap_free(bitmap); goto error; } /* note that we cannot set the proc locale to any specific object * as the slot list may have assigned it to more than one - so * leave that field NULL */ /* set the proc to the specified map */ hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap); hwloc_bitmap_free(bitmap); } orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:seq: binding proc %s to cpuset %s bitmap %s", ORTE_VPID_PRINT(proc->name.vpid), sq->cpuset, cpu_bitmap); /* we are going to bind to cpuset since the user is specifying the cpus */ OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CPUSET); /* note that the user specified the mapping */ ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYUSER); ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN); /* cleanup */ free(cpu_bitmap); } else { hwloc_obj_t locale; /* assign the locale - okay for the topo to be null as * it just means it wasn't returned */ if (NULL != node->topology) { locale = hwloc_get_root_obj(node->topology); orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, locale, OPAL_PTR); } } /* add to the jdata proc array */ if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); goto error; } /* move to next node */ sq = (seq_node_t*)opal_list_get_next(&sq->super); } /** track the total number of processes we mapped */ jdata->num_procs += app->num_procs; /* cleanup the node list if it came from this app_context */ if (seq_list != &default_seq_list) { OPAL_LIST_DESTRUCT(seq_list); } else { save = sq; } } return ORTE_SUCCESS; error: OPAL_LIST_DESTRUCT(&default_seq_list); return rc; }
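/*
 * Illustrative sketch (standalone, not ORTE code): the sequential mapper
 * above reads a hostfile one line at a time, ignores blank and '#' comment
 * lines, splits an optional cpuset off after the first space, and strips the
 * domain part of the hostname.  This toy reader shows the same parsing with
 * plain stdio; the file name and line format are assumptions, and the
 * IP-address special case is ignored.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *fp = fopen("hostfile", "r");   /* hypothetical input file */
    char line[256];

    if (NULL == fp) {
        perror("hostfile");
        return 1;
    }
    while (NULL != fgets(line, sizeof(line), fp)) {
        char *nl, *sep, *dot;

        if (NULL != (nl = strchr(line, '\n'))) {
            *nl = '\0';
        }
        if ('\0' == line[0] || '#' == line[0]) {
            continue;                    /* blank or comment line */
        }
        /* anything after the first space is treated as a cpuset spec */
        sep = strchr(line, ' ');
        if (NULL != sep) {
            *sep = '\0';
            sep++;
        }
        /* keep only the short hostname, as when FQDNs are not retained */
        if (NULL != (dot = strchr(line, '.'))) {
            *dot = '\0';
        }
        printf("host %s cpuset %s\n", line, sep ? sep : "(none)");
    }
    fclose(fp);
    return 0;
}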
char* orte_rmaps_base_print_mapping(orte_mapping_policy_t mapping) { char *ret, *map, *mymap, *tmp; orte_rmaps_print_buffers_t *ptr; if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { return "CONFLICTED"; } ptr = get_print_buffer(); if (NULL == ptr) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return orte_rmaps_print_null; } /* cycle around the ring */ if (ORTE_RMAPS_PRINT_NUM_BUFS == ptr->cntr) { ptr->cntr = 0; } switch(ORTE_GET_MAPPING_POLICY(mapping)) { case ORTE_MAPPING_BYNODE: map = "BYNODE"; break; case ORTE_MAPPING_BYBOARD: map = "BYBOARD"; break; case ORTE_MAPPING_BYNUMA: map = "BYNUMA"; break; case ORTE_MAPPING_BYSOCKET: map = "BYSOCKET"; break; case ORTE_MAPPING_BYL3CACHE: map = "BYL3CACHE"; break; case ORTE_MAPPING_BYL2CACHE: map = "BYL2CACHE"; break; case ORTE_MAPPING_BYL1CACHE: map = "BYL1CACHE"; break; case ORTE_MAPPING_BYCORE: map = "BYCORE"; break; case ORTE_MAPPING_BYHWTHREAD: map = "BYHWTHREAD"; break; case ORTE_MAPPING_BYSLOT: map = "BYSLOT"; break; case ORTE_MAPPING_SEQ: map = "SEQUENTIAL"; break; case ORTE_MAPPING_BYUSER: map = "BYUSER"; break; case ORTE_MAPPING_BYDIST: map = "MINDIST"; break; default: if (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { map = "PPR"; } else { map = "UNKNOWN"; } } if (0 != strcmp(map, "PPR") && (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping))) { asprintf(&mymap, "%s[PPR]:", map); } else { asprintf(&mymap, "%s:", map); } if (ORTE_MAPPING_NO_USE_LOCAL & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { asprintf(&tmp, "%sNO_USE_LOCAL,", mymap); free(mymap); mymap = tmp; } if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { asprintf(&tmp, "%sNOOVERSUBSCRIBE,", mymap); free(mymap); mymap = tmp; } else if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { asprintf(&tmp, "%sOVERSUBSCRIBE,", mymap); free(mymap); mymap = tmp; } if (ORTE_MAPPING_SPAN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) { asprintf(&tmp, "%sSPAN,", mymap); free(mymap); mymap = tmp; } /* remove the trailing mark */ mymap[strlen(mymap)-1] = '\0'; snprintf(ptr->buffers[ptr->cntr], ORTE_RMAPS_PRINT_MAX_SIZE, "%s", mymap); free(mymap); ret = ptr->buffers[ptr->cntr]; ptr->cntr++; return ret; }
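/*
 * Illustrative sketch only (not part of ORTE): the print function above
 * hands back a char* that the caller never frees, by writing into a small
 * ring of buffers so that several recent return values stay valid at once
 * (e.g. multiple uses inside one output statement).  The real code keeps
 * the ring in thread-specific storage via get_print_buffer(); the minimal,
 * hypothetical version below uses a plain static ring and invented
 * DEMO_* flag names just to show the pattern, including the
 * "append modifiers, strip the trailing comma" step.
 */
#include <stdio.h>
#include <string.h>

#define DEMO_NUM_BUFS        4
#define DEMO_BUF_SIZE        64

#define DEMO_FLAG_NOLOCAL    0x01
#define DEMO_FLAG_NOOVERSUB  0x02
#define DEMO_FLAG_SPAN       0x04

static const char *demo_print_flags(unsigned flags)
{
    static char bufs[DEMO_NUM_BUFS][DEMO_BUF_SIZE];
    static int cntr = 0;
    char *out;
    size_t len;

    /* cycle around the ring so recent return values remain usable */
    if (DEMO_NUM_BUFS == cntr) {
        cntr = 0;
    }
    out = bufs[cntr++];
    out[0] = '\0';

    if (flags & DEMO_FLAG_NOLOCAL)   strncat(out, "NO_USE_LOCAL,",    DEMO_BUF_SIZE - strlen(out) - 1);
    if (flags & DEMO_FLAG_NOOVERSUB) strncat(out, "NOOVERSUBSCRIBE,", DEMO_BUF_SIZE - strlen(out) - 1);
    if (flags & DEMO_FLAG_SPAN)      strncat(out, "SPAN,",            DEMO_BUF_SIZE - strlen(out) - 1);

    /* remove the trailing mark, as the real code does */
    len = strlen(out);
    if (0 < len && ',' == out[len - 1]) {
        out[len - 1] = '\0';
    }
    return out;
}

int main(void)
{
    /* both strings stay valid because they occupy different ring slots */
    printf("a=%s b=%s\n",
           demo_print_flags(DEMO_FLAG_NOLOCAL | DEMO_FLAG_SPAN),
           demo_print_flags(DEMO_FLAG_NOOVERSUB));
    return 0;
}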
/* * Function for selecting one component from all those that are * available. */ void orte_ras_base_allocate(int fd, short args, void *cbdata) { int rc; orte_job_t *jdata; opal_list_t nodes; orte_node_t *node; orte_std_cntr_t i; orte_app_context_t *app; orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* convenience */ jdata = caddy->jdata; /* if we already did this, don't do it again - the pool of * global resources is set. */ if (orte_ras_base.allocation_read) { OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate allocation already read", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto next_state; } orte_ras_base.allocation_read = true; /* Otherwise, we have to create * the initial set of resources that will delineate all * further operations serviced by this HNP. This list will * contain ALL nodes that can be used by any subsequent job. * * In other words, if a node isn't found in this step, then * no job launched by this HNP will be able to utilize it. */ /* construct a list to hold the results */ OBJ_CONSTRUCT(&nodes, opal_list_t); /* if a component was selected, then we know we are in a managed * environment. - the active module will return a list of what it found */ if (NULL != orte_ras_base.active_module) { /* read the allocation */ if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(jdata, &nodes))) { if (ORTE_ERR_ALLOCATION_PENDING == rc) { /* an allocation request is underway, so just do nothing */ OBJ_DESTRUCT(&nodes); OBJ_RELEASE(caddy); return; } if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) { /* this module indicates that nodes will be discovered * on a bootstrap basis, so all we do here is add our * own node to the list */ goto addlocal; } if (ORTE_ERR_TAKE_NEXT_OPTION == rc) { /* we have an active module, but it is unable to * allocate anything for this job - this indicates * that it isn't a fatal error, but could be if * an allocation is required */ if (orte_allocation_required) { /* an allocation is required, so this is fatal */ OBJ_DESTRUCT(&nodes); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } else { /* an allocation is not required, so we can just * run on the local node - go add it */ goto addlocal; } } ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } /* If something came back, save it and we are done */ if (!opal_list_is_empty(&nodes)) { /* store the results in the global resource pool - this removes the * list items */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } OBJ_DESTRUCT(&nodes); /* default to no-oversubscribe-allowed for managed systems */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); } /* flag that the allocation is managed */ orte_managed_allocation = true; goto DISPLAY; } else if (orte_allocation_required) { /* if nothing was found, and an allocation is * required, then error out */ OBJ_DESTRUCT(&nodes); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); 
OBJ_RELEASE(caddy); return; } OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate nothing found in module - proceeding to hostfile", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* nothing was found, or no active module was alive. Our next * option is to look for a hostfile and assign our global * pool from there. * * Individual hostfile names, if given, are included * in the app_contexts for this job. We therefore need to * retrieve the app_contexts for the job, and then cycle * through them to see if anything is there. The parser will * add the nodes found in each hostfile to our list - i.e., * the resulting list contains the UNION of all nodes specified * in hostfiles from across all app_contexts * * We then continue to add any hosts provided by dash-host and * the default hostfile, if we have it. We will then filter out * all the non-desired hosts (i.e., those not specified by * -host and/or -hostfile) when we start the mapping process * * Note that any relative node syntax found in the hostfiles will * generate an error in this scenario, so only non-relative syntax * can be present */ if (NULL != orte_default_hostfile) { OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate parsing default hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_default_hostfile)); /* a default hostfile was provided - parse it */ if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_default_hostfile))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } for (i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } if (NULL != app->hostfile) { OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate adding hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->hostfile)); /* hostfile was specified - parse it and add it to the list */ if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, app->hostfile))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); /* set an error event */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } else if (!orte_soft_locations && NULL != app->dash_host) { /* if we are using soft locations, then any dash-host would * just include desired nodes and not required. 
We don't want * to pick them up here as this would mean the request was * always satisfied - instead, we want to allow the request * to fail later on and use whatever nodes are actually * available */ OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate adding dash_hosts", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, app->dash_host))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } } /* if something was found in the hostfile(s), we use that as our global * pool - set it and we are done */ if (!opal_list_is_empty(&nodes)) { /* store the results in the global resource pool - this removes the * list items */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* cleanup */ OBJ_DESTRUCT(&nodes); goto DISPLAY; } OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate nothing found in hostfiles - checking for rankfile", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* Our next option is to look for a rankfile - if one was provided, we * will use its nodes to create a default allocation pool */ if (NULL != orte_rankfile) { /* check the rankfile for node information */ if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_rankfile))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return ; } } /* if something was found in rankfile, we use that as our global * pool - set it and we are done */ if (!opal_list_is_empty(&nodes)) { /* store the results in the global resource pool - this removes the * list items */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* rankfile is considered equivalent to an RM allocation */ if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); } /* cleanup */ OBJ_DESTRUCT(&nodes); goto DISPLAY; } OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate nothing found in rankfile - inserting current node", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); addlocal: /* if nothing was found by any of the above methods, then we have no * earthly idea what to do - so just add the local host */ node = OBJ_NEW(orte_node_t); if (NULL == node) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* use the same name we got in orte_process_info so we avoid confusion in * the session directories */ node->name = strdup(orte_process_info.nodename); node->state = ORTE_NODE_STATE_UP; node->slots_inuse = 0; node->slots_max = 0; node->slots = 1; opal_list_append(&nodes, &node->super); /* store the results in the global resource pool - this removes the * list items */ if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } OBJ_DESTRUCT(&nodes); DISPLAY: /* shall we display the results? 
*/
    if (4 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
        orte_ras_base_display_alloc();
    }

 next_state:
    /* are we to report this event? */
    if (orte_report_events) {
        if (ORTE_SUCCESS != (rc = orte_util_comm_report_event(ORTE_COMM_EVENT_ALLOCATE))) {
            ORTE_ERROR_LOG(rc);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            /* don't fall through - caddy has already been released and the
             * job must not advance to ALLOCATION_COMPLETE */
            return;
        }
    }

    /* set total slots alloc */
    jdata->total_slots_alloc = orte_ras_base.total_slots_alloc;

    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
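/*
 * Illustrative sketch only (not part of ORTE): orte_ras_base_allocate()
 * above is essentially a priority-ordered fallback chain -- active RAS
 * module, then hostfile(s)/dash-host, then rankfile, then the local node
 * as a last resort.  A minimal, hypothetical rendering of that control
 * flow is shown below; the demo_* names are invented and each "source"
 * here is a stub that simply reports whether it found anything.
 */
#include <stdio.h>
#include <stddef.h>

/* a source appends host names to the list and returns how many it added;
 * returning 0 means "nothing here, try the next source" */
typedef int (*demo_source_fn)(const char **list, int max);

static int demo_from_rm(const char **list, int max)       { (void)list; (void)max; return 0; }
static int demo_from_hostfile(const char **list, int max) { (void)list; (void)max; return 0; }
static int demo_from_rankfile(const char **list, int max) { (void)list; (void)max; return 0; }

static int demo_from_localhost(const char **list, int max)
{
    /* last resort: just run on the node we are sitting on */
    if (max < 1) {
        return 0;
    }
    list[0] = "localhost";
    return 1;
}

int main(void)
{
    const char *nodes[16];
    demo_source_fn sources[] = {
        demo_from_rm, demo_from_hostfile, demo_from_rankfile, demo_from_localhost
    };
    int n = 0;
    size_t i;

    for (i = 0; i < sizeof(sources) / sizeof(sources[0]); i++) {
        n = sources[i](nodes, 16);
        if (0 < n) {
            break;   /* the first source that yields anything wins */
        }
    }
    printf("allocation holds %d node(s), first is %s\n",
           n, 0 < n ? nodes[0] : "(none)");
    return 0;
}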
/********************************* * Parsing Functions *********************************/ int rmaps_lama_process_alias_params(orte_job_t *jdata) { int exit_status = ORTE_SUCCESS; /* * Mapping options * Note: L1, L2, L3 are not exposed in orterun to the user, so * there is no need to specify them here. */ if( NULL == rmaps_lama_cmd_map ) { /* orte_rmaps_base.mapping */ switch( ORTE_GET_MAPPING_POLICY(jdata->map->mapping) ) { case ORTE_MAPPING_BYNODE: /* rmaps_lama_cmd_map = strdup("nbNsL3L2L1ch"); */ rmaps_lama_cmd_map = strdup("nbsch"); break; case ORTE_MAPPING_BYBOARD: /* rmaps_lama_cmd_map = strdup("bnNsL3L2L1ch"); */ orte_show_help("help-orte-rmaps-lama.txt", "invalid mapping option", true, "by board", "mapping by board not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; break; case ORTE_MAPPING_BYNUMA: /* rmaps_lama_cmd_map = strdup("NbnsL3L2L1ch"); */ rmaps_lama_cmd_map = strdup("Nbnsch"); break; case ORTE_MAPPING_BYSOCKET: /* rmaps_lama_cmd_map = strdup("sNbnL3L2L1ch"); */ rmaps_lama_cmd_map = strdup("sbnch"); break; case ORTE_MAPPING_BYL3CACHE: rmaps_lama_cmd_map = strdup("L3sNbnL2L1ch"); break; case ORTE_MAPPING_BYL2CACHE: rmaps_lama_cmd_map = strdup("L2sNbnL1ch"); break; case ORTE_MAPPING_BYL1CACHE: rmaps_lama_cmd_map = strdup("L1sNbnch"); break; case ORTE_MAPPING_BYCORE: case ORTE_MAPPING_BYSLOT: /* rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); */ rmaps_lama_cmd_map = strdup("csbnh"); break; case ORTE_MAPPING_BYHWTHREAD: /* rmaps_lama_cmd_map = strdup("hcL1L2L3sNbn"); */ rmaps_lama_cmd_map = strdup("hcsbn"); break; case ORTE_MAPPING_RR: orte_show_help("help-orte-rmaps-lama.txt", "invalid mapping option", true, "round robin", "mapping by round robin not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; case ORTE_MAPPING_SEQ: orte_show_help("help-orte-rmaps-lama.txt", "invalid mapping option", true, "sequential", "mapping by sequential not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; case ORTE_MAPPING_BYUSER: orte_show_help("help-orte-rmaps-lama.txt", "invalid mapping option", true, "by user", "mapping by user not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; default: /* * Default is map-by core */ rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); break; } } /* * Binding Options */ if( NULL == rmaps_lama_cmd_bind ) { /* * No binding specified, use default */ if( !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) || !OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) || OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { rmaps_lama_cmd_bind = NULL; } switch( OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { case OPAL_BIND_TO_BOARD: /* rmaps_lama_cmd_bind = strdup("1b"); */ orte_show_help("help-orte-rmaps-lama.txt", "invalid binding option", true, "by board", "binding to board not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; break; case OPAL_BIND_TO_NUMA: rmaps_lama_cmd_bind = strdup("1N"); break; case OPAL_BIND_TO_SOCKET: rmaps_lama_cmd_bind = strdup("1s"); break; case OPAL_BIND_TO_L3CACHE: rmaps_lama_cmd_bind = strdup("1L3"); break; case OPAL_BIND_TO_L2CACHE: rmaps_lama_cmd_bind = strdup("1L2"); break; case OPAL_BIND_TO_L1CACHE: rmaps_lama_cmd_bind = strdup("1L1"); break; case OPAL_BIND_TO_CORE: rmaps_lama_cmd_bind = strdup("1c"); break; case OPAL_BIND_TO_HWTHREAD: rmaps_lama_cmd_bind = strdup("1h"); break; case OPAL_BIND_TO_CPUSET: orte_show_help("help-orte-rmaps-lama.txt", "invalid binding option", true, "by CPU set", "binding to CPU set not 
supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; break; default: rmaps_lama_cmd_bind = NULL; break; } } /* * Ordering (a.k.a. Ranking) Options */ if( NULL == rmaps_lama_cmd_ordering ) { /* orte_rmaps_base.ranking */ switch( ORTE_GET_RANKING_POLICY(jdata->map->ranking) ) { case ORTE_RANK_BY_SLOT: rmaps_lama_cmd_ordering = strdup("s"); break; case ORTE_RANK_BY_NODE: case ORTE_RANK_BY_NUMA: case ORTE_RANK_BY_SOCKET: case ORTE_RANK_BY_L3CACHE: case ORTE_RANK_BY_L2CACHE: case ORTE_RANK_BY_L1CACHE: case ORTE_RANK_BY_CORE: case ORTE_RANK_BY_HWTHREAD: rmaps_lama_cmd_ordering = strdup("n"); break; case ORTE_RANK_BY_BOARD: /* rmaps_lama_cmd_ordering = strdup("n"); */ orte_show_help("help-orte-rmaps-lama.txt", "invalid ordering option", true, "by board", "ordering by board not supported by LAMA"); exit_status = ORTE_ERR_NOT_SUPPORTED; goto cleanup; break; default: rmaps_lama_cmd_ordering = strdup("n"); break; } } /* * MPPR */ if( NULL == rmaps_lama_cmd_mppr ) { /* * The ppr is given in the map */ if( NULL != jdata->map->ppr) { rmaps_lama_cmd_mppr = rmaps_lama_covert_ppr(jdata->map->ppr); } } /* * Oversubscription */ if( ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping) ) { rmaps_lama_can_oversubscribe = false; } else { rmaps_lama_can_oversubscribe = true; } /* * Display revised values */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: Revised Parameters -----"); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: Map : %s", rmaps_lama_cmd_map); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: Bind : %s", rmaps_lama_cmd_bind); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: MPPR : %s", rmaps_lama_cmd_mppr); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps:lama: Order : %s", rmaps_lama_cmd_ordering); cleanup: return exit_status; }