/* Build the "virtual machine" for the daemon job: collect every available
 * node (from the allocation plus any hostfile/-host info attached to other
 * known jobs), add each to the daemon job's map, and create one daemon
 * (orte_proc_t) per node.
 *
 * @param jdata  the daemon job object; its map and procs array are updated
 * @return ORTE_SUCCESS, or an ORTE error code on failure
 *
 * Ownership note: orte_rmaps_base_get_target_nodes (and the hostfile /
 * dash-host helpers) return RETAINED node references in node_list, so every
 * node removed from the list must either be handed to map->nodes or
 * explicitly released.  All exits funnel through "cleanup" so the list is
 * always drained — OBJ_DESTRUCT on an opal_list does NOT release its items.
 */
int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
{
    orte_job_t *jdat;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_job_map_t *map;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_app_context_t *app;
    orte_std_cntr_t num_slots;
    int rc, i, n;
    bool ignored;

    /* get the daemon app if provided - may include -host or hostfile
     * info about available nodes */
    app = (orte_app_context_t *) opal_pointer_array_get_item(jdata->apps, 0);
    map = jdata->map;

    /* get the list of all available nodes that do not already
     * have a daemon on them */
    OBJ_CONSTRUCT(&node_list, opal_list_t);
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list,
                                                               &num_slots,
                                                               app,
                                                               map->policy))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* check all other known jobs to see if they have something to
     * add to the allocation - we won't have seen these and the
     * daemon job won't have any in its app */
    for (i=0; i < orte_job_data->size; i++) {
        if (NULL == (jdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        for (n=0; n < jdat->apps->size; n++) {
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdat->apps, n))) {
                continue;
            }
            if (NULL != app->hostfile) {
                /* hostfile was specified - parse it and add it to the list.
                 * The function automatically ignores duplicates */
                if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&node_list,
                                                                       &ignored,
                                                                       app->hostfile))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;   /* BUGFIX: drain the list instead of leaking its nodes */
                }
            }
            if (NULL != app->dash_host) {
                /* parse and add to list, ignoring duplicates */
                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&node_list,
                                                                        &ignored,
                                                                        app->dash_host))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;   /* BUGFIX: drain the list instead of leaking its nodes */
                }
            }
        }
    }

    /* add all these nodes to the map */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        node = (orte_node_t*)item;
        /* if this is my node, ignore it - we are already here */
        if (0 == strcmp(node->name, orte_process_info.nodename)) {
            /* BUGFIX: we own the reference we just removed from the list;
             * release it instead of silently leaking the node object */
            OBJ_RELEASE(node);
            continue;
        }
        opal_pointer_array_add(map->nodes, (void*)node);
        ++(map->num_nodes);
        /* if this node already has a daemon, release that object
         * to maintain bookkeeping */
        if (NULL != node->daemon) {
            OBJ_RELEASE(node->daemon);
        }
        /* create a new daemon object for this node */
        proc = OBJ_NEW(orte_proc_t);
        if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;   /* BUGFIX: was a bare return that leaked the remaining list */
        }
        proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
        if (ORTE_VPID_MAX-1 <= jdata->num_procs) {
            /* no more daemon vpids available */
            orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
            OBJ_RELEASE(proc);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;   /* BUGFIX: was a bare return that leaked the remaining list */
        }
        proc->name.vpid = jdata->num_procs;  /* take the next available vpid */
        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
        ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
        proc->node = node;
        proc->nodename = node->name;
        OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                             "%s rmaps:base:setup_vm add new daemon %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc->name)));
        /* add the daemon to the daemon job object */
        if (0 > (rc = opal_pointer_array_add(jdata->procs, (void*)proc))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(proc);  /* BUGFIX: proc was leaked on this path */
            goto cleanup;
        }
        ++jdata->num_procs;
        /* point the node to the daemon */
        node->daemon = proc;
        OBJ_RETAIN(proc);  /* maintain accounting */
        /* track number of daemons to be launched */
        ++map->num_new_daemons;
        /* and their starting vpid */
        if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
            map->daemon_vpid_start = proc->name.vpid;
        }
    }
    rc = ORTE_SUCCESS;

cleanup:
    /* release any nodes still on the list, then destruct the list itself */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
/* Map a job using a "processes-per-resource" (ppr) pattern such as
 * "2:socket" or "4:node".  Parses the comma-separated ppr specification,
 * maps the requested count of procs onto each instance of the lowest
 * specified resource on every available node, then (when hwloc is present
 * and more than one level was given) prunes upward via prune().
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_TAKE_NEXT_OPTION when this mapper does
 * not apply, or an error code (ORTE_ERR_SILENT after a show_help).
 */
static int ppr_mapper(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, j, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    unsigned cache_level=0;
    unsigned int nobjs, i;
    bool pruning_reqd = false;
    opal_hwloc_level_t level;
#endif
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool initial_map=true;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s being restarted - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize the file-scope ppr[] counts array
     * (presumably one slot per opal_hwloc level - defined elsewhere) */
    memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t));

    /* parse the ppr option: comma-separated list of <count>:<resource> pairs.
     * "start" tracks the deepest (most fine-grained) level requested */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    for (j=0; NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr",
                           true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        /* prefix match on the resource name, so e.g. "n" matches "node" */
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
#endif
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option",
                           true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    opal_argv_free(ppr_req);

    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
#if OPAL_HAVE_HWLOC
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }
#endif

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

#if OPAL_HAVE_HWLOC
    /* convenience: cache the deepest requested level and its hwloc type */
    level = start;
    lowest = opal_hwloc_levels[start];
#endif

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* cycle across the nodes */
        nprocs_mapped = 0;
        for (item = opal_list_get_first(&node_list);
             item != opal_list_get_end(&node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
            /* bozo check */
            if (NULL == node->topology) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
#endif
            /* add the node to the map, if needed */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);  /* maintain accounting on object */
                jdata->map->num_nodes++;
            }

            /* if we are mapping solely at the node level, just put
             * that many procs on this node */
            if (OPAL_HWLOC_NODE_LEVEL == start) {
#if OPAL_HAVE_HWLOC
                obj = hwloc_get_root_obj(node->topology);
#endif
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
#if OPAL_HAVE_HWLOC
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
#endif
                }
#if OPAL_HAVE_HWLOC
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);
                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }
                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go */
                    level--;
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
#endif
            }

            /* set the total slots used */
            if ((int)node->num_procs <= node->slots) {
                node->slots_inuse = (int)node->num_procs;
            } else {
                node->slots_inuse = node->slots;
            }

            /* if no-oversubscribe was specified, check to see if
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it! */
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
            }

            /* stop cycling across nodes once every requested proc
             * has been mapped */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }

        /* if no proc count was given, record how many we actually mapped */
        if (0 == app->num_procs) {
            app->num_procs = nprocs_mapped;
        }

        /* a finite count was requested but could not be satisfied */
        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, jdata->map->ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* compute vpids and add proc objects to the job */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this AFTER we compute vpids so that computation is done
         * correctly */
        jdata->num_procs += app->num_procs;

        /* release this app's node list before moving to the next app */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }
    return ORTE_SUCCESS;

error:
    /* drain and destruct whatever remains of the current node list */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
#if OPAL_HAVE_HWLOC
/* Helper: attempt an object-level round-robin mapping; if the target hwloc
 * object type does not exist on the topology (ORTE_ERR_NOT_FOUND), fall back
 * to byslot mapping and record the changed policy on the job map. */
static int rr_map_byobj_with_fallback(orte_job_t *jdata, orte_app_context_t *app,
                                      opal_list_t *node_list, orte_std_cntr_t num_slots,
                                      hwloc_obj_type_t target, unsigned cache_level)
{
    int rc;

    rc = orte_rmaps_rr_byobj(jdata, app, node_list, num_slots,
                             app->num_procs, target, cache_level);
    if (ORTE_ERR_NOT_FOUND == rc) {
        /* if the mapper couldn't map by this object because
         * it isn't available, but the error allows us to try
         * byslot, then do so */
        ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
        rc = orte_rmaps_rr_byslot(jdata, app, node_list, num_slots, app->num_procs);
    }
    return rc;
}
#endif

/*
 * Create a round-robin mapping for the job.
 *
 * Cycles across the job's app_contexts, obtains the usable node list for
 * each, and dispatches to the bynode/byslot/byobj round-robin engines
 * according to the map's mapping policy.
 *
 * @param jdata  job to be mapped; map, bookmark and num_procs are updated
 * @return ORTE_SUCCESS, ORTE_ERR_TAKE_NEXT_OPTION if this mapper does not
 *         apply, or an error code
 */
static int orte_rmaps_rr_map(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    int rc;
    mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version;
    bool initial_map=true;

    /* this mapper can only handle initial launch
     * when rr mapping is desired - allow
     * restarting of failed apps */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s is being restarted - rr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (ORTE_MAPPING_RR < ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rr: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* start at the beginning... */
    jdata->num_procs = 0;

    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }

        /* setup the nodelist here in case we jump to error */
        OBJ_CONSTRUCT(&node_list, opal_list_t);

        /* if the number of processes wasn't specified, then we know there can be only
         * one app_context allowed in the launch, and that we are to launch it across
         * all available slots. We'll double-check the single app_context rule first */
        if (0 == app->num_procs && 1 < jdata->num_apps) {
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np",
                           true, jdata->num_apps, NULL);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        if (0 == app->num_procs) {
            /* set the num_procs to equal the number of slots on these
             * mapped nodes, taking into account the number of cpus/rank */
            app->num_procs = num_slots / orte_rmaps_base.cpus_per_rank;
            /* sometimes, we have only one "slot" assigned, but may
             * want more than one cpu/rank - so ensure we always wind
             * up with at least one proc */
            if (0 == app->num_procs) {
                app->num_procs = 1;
            }
        }

        /* Make assignments according to the mapping policy */
        if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_bynode(jdata, app, &node_list, num_slots, app->num_procs);
        } else if (ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs);
#if OPAL_HAVE_HWLOC
        } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = rr_map_byobj_with_fallback(jdata, app, &node_list, num_slots, HWLOC_OBJ_PU, 0);
        } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = rr_map_byobj_with_fallback(jdata, app, &node_list, num_slots, HWLOC_OBJ_CORE, 0);
        } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = rr_map_byobj_with_fallback(jdata, app, &node_list, num_slots, HWLOC_OBJ_CACHE, 1);
        } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = rr_map_byobj_with_fallback(jdata, app, &node_list, num_slots, HWLOC_OBJ_CACHE, 2);
        } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = rr_map_byobj_with_fallback(jdata, app, &node_list, num_slots, HWLOC_OBJ_CACHE, 3);
        } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = rr_map_byobj_with_fallback(jdata, app, &node_list, num_slots, HWLOC_OBJ_SOCKET, 0);
        } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = rr_map_byobj_with_fallback(jdata, app, &node_list, num_slots, HWLOC_OBJ_NODE, 0);
#endif
        } else {
            /* unrecognized mapping directive */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy",
                           true, "mapping",
                           orte_rmaps_base_print_mapping(jdata->map->mapping));
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* compute vpids and add proc objects to the job - do this after
         * each app_context so that the ranks within each context are
         * contiguous */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            goto error;   /* BUGFIX: was a bare return that leaked node_list */
        }

        /* track the total number of processes we mapped - must update
         * this value AFTER we compute vpids so that computation
         * is done correctly */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list - it can differ from one app_context
         * to another, so we have to get it every time */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }

    return ORTE_SUCCESS;

error:
    /* drain and destruct the current node list */
    while(NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
/* Place the specified #procs (map->npernode) on each node, up to the
 * specified total number of procs (if one was given).
 *
 * @param jdata  job to be mapped; num_procs is incremented per app
 * @return ORTE_SUCCESS or an error code (ORTE_ERR_SILENT after show_help)
 *
 * All exits route through the "error" label so node_list is always
 * drained and destructed.
 */
static int npernode(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int np, nprocs;
    int num_nodes;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* loop through the app_contexts */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* use the number of procs if one was given, otherwise fill nodes */
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            np = INT_MAX;
        }
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* loop through the list of nodes */
        num_nodes = opal_list_get_size(&node_list);
        nprocs = 0;
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            node = (orte_node_t*)item;
            /* put the specified number of procs on each node */
            for (j=0; j < jdata->map->npernode && nprocs < np; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place, then that is an error */
                    if (ORTE_ERR_NODE_FULLY_USED != rc ||
                        j < jdata->map->npernode-1) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(node);
                        goto error;
                    }
                }
                nprocs++;
            }
            OBJ_RELEASE(node);
        }
        /* update the number of procs in the job */
        jdata->num_procs += nprocs;

        /* if the user requested a specific number of procs and
         * the total number of procs we were able to assign
         * doesn't equal the number requested, then we have a
         * problem */
        if (0 < app->num_procs && nprocs < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs",
                           true, app->app, app->num_procs,
                           "number of nodes", num_nodes,
                           "npernode", jdata->map->npernode);
            rc = ORTE_ERR_SILENT;
            goto error;   /* BUGFIX: was a bare return that skipped list cleanup */
        }

        /* compute vpids and add proc objects to the job - this has to be
         * done after each app_context is mapped in order to keep the
         * vpids contiguous within an app_context */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto error;   /* BUGFIX: was a bare return that skipped list cleanup */
        }
    }

error:
    /* normal exit also falls through here with rc == ORTE_SUCCESS */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
/*
 * Create a load balanced mapping for the job by assigning a constant
 * #procs/node, with leftovers being spread one/node starting from the
 * bookmarked node.
 *
 * @param jdata  job to be mapped; bookmark and num_procs are updated
 * @return ORTE_SUCCESS or an error code (ORTE_ERR_SILENT after show_help)
 *
 * All exits route through the "error" label so node_list is always
 * drained and destructed.
 */
static int loadbalance(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j;
    opal_list_t node_list;
    orte_std_cntr_t num_nodes, num_slots;
    int rc=ORTE_SUCCESS, np, nprocs;
    int ppn = 0;
    opal_list_item_t *item, *start;
    orte_node_t *node;

    /* setup */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* compute total #procs we are going to add and the total number of nodes available */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* get the nodes and #slots available for this app_context */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            /* set the num_procs to the #slots */
            np = num_slots;
        }
        num_nodes = opal_list_get_size(&node_list);
        if (0 == num_nodes) {
            /* BUGFIX: guard the ppn division below against divide-by-zero
             * should the target-node list come back empty */
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        /* compute the base ppn */
        ppn = np / num_nodes;

        /* if a bookmark exists from some prior mapping, set us to start there */
        start = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* loop through the list of nodes until we either assign all the procs
         * or return to the starting point */
        item = start;
        nprocs = 0;
        do {
            node = (orte_node_t*)item;
            /* put the base number of procs on each node */
            for (j=0; j < ppn; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place, then that is an error */
                    if (ORTE_ERR_NODE_FULLY_USED != rc || j < ppn-1) {
                        ORTE_ERROR_LOG(rc);
                        goto error;
                    }
                }
                nprocs++;
            }
            /* move to next node, wrapping to the front of the list */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            } else {
                item = opal_list_get_next(item);
            }
        } while (item != start && nprocs < np);

        /* save the bookmark */
        jdata->bookmark = node;

        /* if we haven't assigned all the procs, then loop through the list
         * again, assigning 1 per node until all are assigned */
        item = start;
        while (nprocs < np) {
            node = (orte_node_t*)item;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 &node_list, jdata->map->oversubscribe,
                                                                 false, NULL))) {
                /* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
                if (ORTE_ERR_NODE_FULLY_USED != rc) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
            }
            nprocs++;
            /* move to next node, wrapping to the front of the list */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            } else {
                item = opal_list_get_next(item);
            }
        }
        /* save the bookmark */
        jdata->bookmark = node;

        /* update the number of procs in the job */
        jdata->num_procs += nprocs;

        /* cleanup this app's node list before the next iteration */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }

        /* if the user requested a specific number of procs and
         * the total number of procs we were able to assign
         * doesn't equal the number requested, then we have a
         * problem */
        if (0 < app->num_procs && nprocs < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs",
                           true, app->app, app->num_procs,
                           "number of slots", nprocs,
                           "number of nodes", num_nodes);
            rc = ORTE_ERR_SILENT;
            goto error;   /* BUGFIX: was a bare return that skipped OBJ_DESTRUCT */
        }

        /* compute vpids and add proc objects to the job - this has to be
         * done after each app_context is mapped in order to keep the
         * vpids contiguous within an app_context */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto error;   /* BUGFIX: was a bare return that skipped OBJ_DESTRUCT */
        }
    }

error:
    /* normal exit also falls through here with rc == ORTE_SUCCESS */
    while(NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
/*
 * Staged mapper: maps procs for a job in priority order of its
 * app_contexts, placing each unmapped proc on the currently
 * lightest-loaded eligible node.  Nodes named via -host are tried
 * first (the "desired" list); remaining procs fall to the general
 * node list.  Used for staged execution - procs left unmapped here
 * are retried on a later pass once resources free up.
 *
 * Returns ORTE_ERR_TAKE_NEXT_OPTION if this mapper was not explicitly
 * requested, ORTE_ERR_RESOURCE_BUSY if nothing could be mapped this
 * pass, an error code on hard failure, or ORTE_SUCCESS.
 */
static int staged_mapper(orte_job_t *jdata)
{
    mca_base_component_t *c = &mca_rmaps_staged_component.base_version;
    int i, j, k, rc;
    orte_app_context_t *app;
    opal_list_t node_list, desired;
    orte_std_cntr_t num_slots;
    orte_proc_t *proc;
    orte_node_t *node, *next;
    bool work_to_do = false, first_pass = false;
    opal_list_item_t *item, *it2;
    char *cptr, **minimap;
    orte_vpid_t load;

    /* only use this mapper if it was specified */
    if (NULL == jdata->map->req_mapper ||
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name) ||
        ORTE_MAPPING_STAGED != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I wasn't specified */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:staged: job %s not using staged mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                        "%s mca:rmaps:staged: mapping job %s with %d procs",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid), (int)jdata->num_procs);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* if there are no nodes in the map, then this is our first
     * pass thru this job
     */
    if (0 == jdata->map->num_nodes) {
        first_pass = true;
    }

    /* we assume that the app_contexts are in priority order,
     * with the highest priority being the first entry in the
     * job's app_context array. Loop across the app_contexts
     * in order, looking for apps that have not been
     * fully mapped
     */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* has it been fully mapped? */
        if (ORTE_APP_STATE_ALL_MAPPED <= app->state) {
            continue;
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "%s mca:rmaps:staged: working app %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app);

        /* find nodes that meet any constraints provided in the form of
         * -hostfile or -host directives
         */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        /* get nodes based on a strict interpretation of the location hints */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots,
                                                                   app, jdata->map->mapping,
                                                                   false, true))) {
            /* we were unable to get any nodes that match those
             * specified in the app
             */
            if (ORTE_ERR_RESOURCE_BUSY == rc) {
                /* if the return is "busy", then at least one of the
                 * specified resources must exist, but no slots are
                 * currently available. This means there is at least
                 * a hope of eventually being able to map this app
                 * within its specified constraints, so continue working
                 */
                if (orte_soft_locations) {
                    /* if soft locations were given, then we know that
                     * none of the nodes in this allocation are available,
                     * so there is no point in continuing to check the
                     * remaining apps
                     */
                    while (NULL != (item = opal_list_remove_first(&node_list))) {
                        OBJ_RELEASE(item);
                    }
                    OBJ_DESTRUCT(&node_list);
                    goto complete;
                }
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s mca:rmaps:staged: all nodes for this app are currently busy",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                OBJ_DESTRUCT(&node_list);
                continue;
            } else {
                /* this indicates that there are no nodes that match
                 * the specified constraints, so there is no hope of
                 * ever being able to execute this app. This is an
                 * unrecoverable error - note that a return of
                 * "silent" means that the function already printed
                 * an error message, so the error_log will print nothing
                 */
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }

        /* if a max number of procs/node was given for this
         * app, remove all nodes from the list that exceed
         * that limit
         */
        if (0 < app->max_procs_per_node) {
            item = opal_list_get_first(&node_list);
            while (item != opal_list_get_end(&node_list)) {
                /* grab the successor first - removing item invalidates
                 * its next pointer for iteration purposes */
                it2 = opal_list_get_next(item);
                node = (orte_node_t*)item;
                if (app->max_procs_per_node <= node->num_procs) {
                    opal_list_remove_item(&node_list, item);
                    OBJ_RELEASE(item);
                }
                item = it2;
            }
        }

        /* if we have no available nodes, then move on to next app */
        if (0 == opal_list_get_size(&node_list)) {
            OBJ_DESTRUCT(&node_list);
            continue;
        }

        /* if the app specified locations, soft or not, search the list of nodes
         * for those that match the requested locations and move those
         * to the desired list so we use them first
         */
        if (NULL != app->dash_host) {
            OBJ_CONSTRUCT(&desired, opal_list_t);
            /* no particular order is required */
            for (j=0; j < opal_argv_count(app->dash_host); j++) {
                /* each -host entry may itself be a comma-separated list */
                minimap = opal_argv_split(app->dash_host[j], ',');
                for (k=0; k < opal_argv_count(minimap); k++) {
                    cptr = minimap[k];
                    for (item = opal_list_get_first(&node_list);
                         item != opal_list_get_end(&node_list);
                         item = opal_list_get_next(item)) {
                        node = (orte_node_t*)item;
                        /* "localhost" matches our own node name */
                        if (0 == strcmp(node->name, cptr) ||
                            (0 == strcmp("localhost", cptr) &&
                             0 == strcmp(node->name, orte_process_info.nodename))) {
                            opal_list_remove_item(&node_list, item);
                            opal_list_append(&desired, item);
                            opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                                "%s mca:rmaps:staged: placing node %s on desired list",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                node->name);
                            break;
                        }
                    }
                }
                opal_argv_free(minimap);
            }

            /* if no nodes made the transition and the app specified soft
             * locations, then we can skip to look at the non-desired list
             */
            if (0 == opal_list_get_size(&desired)) {
                OBJ_DESTRUCT(&desired);
                if (orte_soft_locations) {
                    goto process;
                } else {
                    /* move on to next app
                     * NOTE(review): this path leaves node_list constructed
                     * and populated - presumably freed on a later pass,
                     * TODO confirm against caller lifecycle */
                    continue;
                }
            }

            /* cycle thru the procs for this app and attempt to map them
             * to the desired nodes using a load-balancing algo
             */
            for (j=0; j < app->procs.size; j++) {
                if (NULL == (proc = opal_pointer_array_get_item(&app->procs, j))) {
                    continue;
                }
                if (ORTE_PROC_STATE_UNDEF != proc->state) {
                    /* this proc has already been mapped or executed */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s mca:rmaps:staged: proc %s has already been mapped",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&proc->name));
                    continue;
                }
                /* flag that there is at least one proc still to
                 * be executed
                 */
                work_to_do = true;
                /* track number mapped */
                jdata->num_mapped++;
                /* find the lightest-loaded node on the desired list */
                node = NULL;
                load = ORTE_VPID_MAX;
                for (item = opal_list_get_first(&desired);
                     item != opal_list_get_end(&desired);
                     item = opal_list_get_next(item)) {
                    next = (orte_node_t*)item;
                    if (next->num_procs < load) {
                        node = next;
                        load = next->num_procs;
                    }
                }
                /* put the proc there
                 * NOTE(review): unlike the general branch below, node is
                 * NOT retained before being stored in proc->node here -
                 * verify the refcount accounting is intentional */
                proc->node = node;
                proc->nodename = node->name;
                /* the local rank is the number of procs
                 * on this node from this job - we don't
                 * directly track this number, so it must
                 * be found by looping across the node->procs
                 * array and counting it each time. For now,
                 * since we don't use this value in this mode
                 * of operation, just set it to something arbitrary
                 */
                proc->local_rank = node->num_procs;
                /* the node rank is simply the number of procs
                 * on the node at this time
                 */
                proc->node_rank = node->num_procs;
                /* track number of procs on node and number of slots used */
                node->num_procs++;
                node->slots_inuse++;
                opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                    "%s Proc %s on node %s: slots %d inuse %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name),
                                    node->name, (int)node->slots,
                                    (int)node->slots_inuse);
                /* node is now full - drop it from the candidate list
                 * (the list's reference is released; node stays alive
                 * via the map/daemon references) */
                if (node->slots_inuse == node->slots) {
                    opal_list_remove_item(&desired, &node->super);
                    OBJ_RELEASE(node);
                }
                if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(proc);
                    return rc;
                }
                /* retain the proc struct so that we correctly track its release */
                OBJ_RETAIN(proc);
                proc->state = ORTE_PROC_STATE_INIT;
                /* flag the proc as updated so it will be included
                 * in the next pidmap message
                 */
                proc->updated = true;
                /* add the node to the map, if needed */
                if (!node->mapped) {
                    if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    node->mapped = true;
                    OBJ_RETAIN(node);  /* maintain accounting on object */
                    jdata->map->num_nodes++;
                }
                if (0 == opal_list_get_size(&desired)) {
                    /* nothing more we can do */
                    break;
                }
            }
            /* clear the list */
            while (NULL != (item = opal_list_remove_first(&desired))) {
                OBJ_RELEASE(item);
            }
            OBJ_DESTRUCT(&desired);
        }

 process:
        /* map any procs still unplaced onto the general node list,
         * again favoring the lightest-loaded node */
        for (j=0; j < app->procs.size; j++) {
            if (NULL == (proc = opal_pointer_array_get_item(&app->procs, j))) {
                continue;
            }
            if (ORTE_PROC_STATE_UNDEF != proc->state) {
                /* this proc has already been mapped or executed */
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s mca:rmaps:staged: proc %s has already been mapped",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name));
                continue;
            }
            /* find the lightest-loaded node on the node list */
            node = NULL;
            load = ORTE_VPID_MAX;
            for (item = opal_list_get_first(&node_list);
                 item != opal_list_get_end(&node_list);
                 item = opal_list_get_next(item)) {
                next = (orte_node_t*)item;
                if (next->num_procs < load) {
                    node = next;
                    load = next->num_procs;
                }
            }
            /* flag that there is at least one proc still to
             * be executed
             */
            work_to_do = true;
            /* track number mapped */
            jdata->num_mapped++;
            /* map this proc to the first available slot */
            OBJ_RETAIN(node);  /* maintain accounting on object */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s mca:rmaps:staged: assigning proc %s to node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name), node->name);
            proc->node = node;
            proc->nodename = node->name;
            /* the local rank is the number of procs
             * on this node from this job - we don't
             * directly track this number, so it must
             * be found by looping across the node->procs
             * array and counting it each time. For now,
             * since we don't use this value in this mode
             * of operation, just set it to something arbitrary
             */
            proc->local_rank = node->num_procs;
            /* the node rank is simply the number of procs
             * on the node at this time
             */
            proc->node_rank = node->num_procs;
            /* track number of procs on node and number of slots used */
            node->num_procs++;
            node->slots_inuse++;
            opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                "%s Proc %s on node %s: slots %d inuse %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                node->name, (int)node->slots,
                                (int)node->slots_inuse);
            /* node is now full - drop the list's reference to it */
            if (node->slots_inuse == node->slots) {
                opal_list_remove_item(&node_list, &node->super);
                OBJ_RELEASE(node);
            }
            if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(proc);
                return rc;
            }
            /* retain the proc struct so that we correctly track its release */
            OBJ_RETAIN(proc);
            proc->state = ORTE_PROC_STATE_INIT;
            /* flag the proc as updated so it will be included
             * in the next pidmap message
             */
            proc->updated = true;
            /* add the node to the map, if needed */
            if (!node->mapped) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                node->mapped = true;
                OBJ_RETAIN(node);  /* maintain accounting on object */
                jdata->map->num_nodes++;
            }
            if (0 == opal_list_get_size(&node_list)) {
                /* nothing more we can do */
                break;
            }
        }
        /* clear the list */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }

 complete:
    /* if there isn't at least one proc that can be launched,
     * then indicate that we don't need to proceed with the
     * launch sequence
     */
    if (!work_to_do) {
        return ORTE_ERR_RESOURCE_BUSY;
    }

    /* flag that the job was updated so it will be
     * included in the pidmap message
     */
    jdata->updated = true;

    /* if we successfully mapped ALL procs in the first pass,
     * then this job is capable of supporting MPI procs
     */
    if (first_pass && jdata->num_mapped == jdata->num_procs) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "%s mca:rmaps:staged: job %s is MPI-capable",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_JOBID_PRINT(jdata->jobid));
        jdata->gang_launched = true;
    }

    return ORTE_SUCCESS;
}