int orte_ess_base_proc_binding(void)
{
#if OPAL_HAVE_HWLOC
    hwloc_obj_t node, obj;
    hwloc_cpuset_t cpus, nodeset;
    hwloc_obj_type_t target;
    unsigned int cache_level = 0;
    struct hwloc_topology_support *support;
    char *map;
    int ret;
    char *error;

    /* Determine if we were pre-bound or not */
    if (NULL != getenv("OMPI_MCA_orte_bound_at_launch")) {
        orte_proc_is_bound = true;
        if (NULL != (map = getenv("OMPI_MCA_orte_base_applied_binding"))) {
            orte_proc_applied_binding = hwloc_bitmap_alloc();
            if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) {
                error = "applied_binding parse";
                goto error;
            }
        }
    }

    /* see if we were bound when launched */
    if (!orte_proc_is_bound) {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Not bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* we were not bound at launch */
        if (NULL != opal_hwloc_topology) {
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
            /* get our node object */
            node = hwloc_get_root_obj(opal_hwloc_topology);
            nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node);
            /* get our bindings */
            cpus = hwloc_bitmap_alloc();
            if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) {
                /* we are NOT bound if get_cpubind fails, nor can we be bound - the
                 * environment does not support it
                 */
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Binding not supported",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto MOVEON;
            }
            /* we are bound if the two cpusets are not equal,
             * or if there is only ONE cpu available to us
             */
            if (0 != hwloc_bitmap_compare(cpus, nodeset) ||
                opal_hwloc_base_single_cpu(nodeset) ||
                opal_hwloc_base_single_cpu(cpus)) {
                /* someone external set it - indicate it is set
                 * so that we know
                 */
                orte_proc_is_bound = true;
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Process was externally bound",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            } else if (support->cpubind->set_thisproc_cpubind &&
                       OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                       OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                /* the system is capable of doing processor affinity, but it
                 * has not yet been set - see if a slot_list was given
                 */
                hwloc_bitmap_zero(cpus);
                if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
                                                                               opal_hwloc_topology, cpus))) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        /* ensure we report a meaningful error code */
                        ret = ORTE_ERROR;
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    /* try to find a level and index for this location */
                    opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level,
                                                        &orte_process_info.bind_idx);
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    orte_proc_is_bound = true;
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                         "%s Process bound according to slot_list",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else {
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    /* get the node rank */
                    if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) {
                        /* this is not an error - could be due to being
                         * direct launched - so just ignore and leave
                         * us unbound
                         */
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process not bound - no node rank available",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                        goto MOVEON;
                    }
                    /* if the binding policy is hwthread, then we bind to the nrank-th
                     * hwthread on this node
                     */
                    if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU, 0,
                                                                           orte_process_info.my_node_rank,
                                                                           OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting hwthread object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to hwthread",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        /* if the binding policy is core, then we bind to the nrank-th
                         * core on this node
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0,
                                                                           orte_process_info.my_node_rank,
                                                                           OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to core",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else {
                        /* for all higher binding policies, we bind to the specified
                         * object that the nrank-th core belongs to
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0,
                                                                           orte_process_info.my_node_rank,
                                                                           OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 1;
                            orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 2;
                            orte_process_info.bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 3;
                            orte_process_info.bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_SOCKET;
                            orte_process_info.bind_level = OPAL_HWLOC_SOCKET_LEVEL;
                        } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_NODE;
                            orte_process_info.bind_level = OPAL_HWLOC_NUMA_LEVEL;
                        } else {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Binding policy not known";
                            goto error;
                        }
                        for (obj = obj->parent; NULL != obj; obj = obj->parent) {
                            if (target == obj->type) {
                                if (HWLOC_OBJ_CACHE == target &&
                                    cache_level != obj->attr->cache.depth) {
                                    continue;
                                }
                                /* this is the place! */
                                cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                                if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                    ret = ORTE_ERROR;
                                    error = "Setting processor affinity failed";
                                    goto error;
                                }
                                orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
                                                                                         obj, OPAL_HWLOC_LOGICAL);
                                orte_proc_is_bound = true;
                                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                                     "%s Process bound to %s",
                                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                     opal_hwloc_base_print_level(orte_process_info.bind_level)));
                                break;
                            }
                        }
                        if (!orte_proc_is_bound) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                    }
                }
            }
        }
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Process bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

 MOVEON:
    /* get or update our local cpuset - it will get used multiple
     * times, so it's more efficient to keep a global copy
     */
    opal_hwloc_base_get_local_cpuset();

    /* report bindings, if requested */
    if (opal_hwloc_report_bindings) {
        char bindings[64];
        hwloc_obj_t root;
        hwloc_cpuset_t cpus;
        /* get the root object for this node */
        root = hwloc_get_root_obj(opal_hwloc_topology);
        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
        /* we are not bound if this equals our cpuset */
        if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
            opal_output(0, "%s is not bound",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        } else {
            hwloc_bitmap_list_snprintf(bindings, 64, opal_hwloc_my_cpuset);
            opal_output(0, "%s is bound to cpus %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), bindings);
        }
    }

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ORTE_ERR_SILENT;

#else
    return ORTE_SUCCESS;
#endif
}
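/*
 * Illustrative usage sketch (an assumption, not part of the original
 * source): an ESS component would typically invoke the routine above
 * from its setup path, after topology discovery. The enclosing
 * function name below is hypothetical; only orte_ess_base_proc_binding()
 * itself is defined in this file.
 */
#if 0
static int example_ess_component_setup(void)
{
    int ret;

    /* bind this process (or detect a pre-existing external binding) */
    if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }
    return ORTE_SUCCESS;
}
#endif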
/* only APPS call this function - daemons have their own */
int orte_util_decode_pidmap(opal_byte_object_t *bo)
{
    orte_vpid_t i, num_procs, *vptr, daemon;
    orte_vpid_t *daemons=NULL;
    orte_local_rank_t *local_rank=NULL;
    orte_node_rank_t *node_rank=NULL;
#if OPAL_HAVE_HWLOC
    opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr;
    unsigned int *bind_idx=NULL, pbidx, *uiptr;
#endif
    opal_hwloc_locality_t locality;
    orte_std_cntr_t n;
    opal_buffer_t buf;
    int rc;
    orte_proc_state_t *states = NULL;
    orte_app_idx_t *app_idx = NULL;
    int32_t *restarts = NULL;
    orte_process_name_t proc, dmn;
    orte_namelist_t *nm;
    opal_list_t jobs;
    char *hostname;

    /* xfer the byte object to a buffer for unpacking */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORTE_SUCCESS != (rc = opal_dss.load(&buf, bo->bytes, bo->size))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    n = 1;
    /* cycle through the buffer */
    OBJ_CONSTRUCT(&jobs, opal_list_t);
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(&buf, &proc.jobid, &n, ORTE_JOBID))) {
        OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                             "%s orte:util:decode:pidmap working job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(proc.jobid)));
        /* record the jobid */
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = proc.jobid;
        opal_list_append(&jobs, &nm->super);

        /* unpack and store the number of procs */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_procs, &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NPROCS, &num_procs, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

#if OPAL_HAVE_HWLOC
        /* unpack and store the binding level */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &bind_level, &n, OPAL_HWLOC_LEVEL_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* store it */
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_LEVEL, &bind_level, OPAL_HWLOC_LEVEL_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* set mine */
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            orte_process_info.bind_level = bind_level;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                             "%s orte:util:decode:pidmap nprocs %s bind level %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_VPID_PRINT(num_procs),
                             opal_hwloc_base_print_level(bind_level)));
#endif
        /* allocate memory for the daemon info */
        daemons = (orte_vpid_t*)malloc(num_procs * sizeof(orte_vpid_t));
        /* unpack it in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, daemons, &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* allocate memory for local ranks */
        local_rank = (orte_local_rank_t*)malloc(num_procs*sizeof(orte_local_rank_t));
        /* unpack them in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, local_rank, &n, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.my_local_rank = local_rank[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_LOCALRANK,
                                                    &orte_process_info.my_local_rank, ORTE_LOCAL_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }

        /* allocate memory for node ranks */
        node_rank = (orte_node_rank_t*)malloc(num_procs*sizeof(orte_node_rank_t));
        /* unpack node ranks in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, node_rank, &n, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.my_node_rank = node_rank[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_NODERANK,
                                                    &orte_process_info.my_node_rank, ORTE_NODE_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }

#if OPAL_HAVE_HWLOC
        /* allocate memory for bind_idx */
        bind_idx = (unsigned int*)malloc(num_procs*sizeof(unsigned int));
        /* unpack bind_idx in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, bind_idx, &n, OPAL_UINT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.bind_idx = bind_idx[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_BIND_INDEX,
                                                    &orte_process_info.bind_idx, OPAL_UINT))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
#endif
        /* allocate memory for states */
        states = (orte_proc_state_t*)malloc(num_procs*sizeof(orte_proc_state_t));
        /* unpack states in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, states, &n, ORTE_PROC_STATE))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(states);
        states = NULL;

        /* allocate memory for app_idx's */
        app_idx = (orte_app_idx_t*)malloc(num_procs*sizeof(orte_app_idx_t));
        /* unpack app_idx's in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, app_idx, &n, ORTE_APP_IDX))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(app_idx);
        app_idx = NULL;

        /* allocate memory for restarts */
        restarts = (int32_t*)malloc(num_procs*sizeof(int32_t));
        /* unpack restarts in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, restarts, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(restarts);
        restarts = NULL;

        /* set the daemon jobid */
        dmn.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid);

        /* xfer the data */
        for (i=0; i < num_procs; i++) {
            if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
                i == ORTE_PROC_MY_NAME->vpid) {
                continue;
            }
            proc.vpid = i;
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &daemons[i], ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* lookup and store the hostname for this proc */
            dmn.vpid = daemons[i];
            if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank[i], ORTE_LOCAL_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank[i], ORTE_NODE_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
#if OPAL_HAVE_HWLOC
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_INDEX, &bind_idx[i], OPAL_UINT))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((10, orte_nidmap_output,
                                 "%s orte:util:decode:pidmap proc %s host %s lrank %d nrank %d bindidx %u",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc), hostname,
                                 (int)local_rank[i], (int)node_rank[i], bind_idx[i]));
#endif
        }
        /* release data */
        free(daemons);
        daemons = NULL;
        free(local_rank);
        local_rank = NULL;
        free(node_rank);
        node_rank = NULL;
#if OPAL_HAVE_HWLOC
        free(bind_idx);
        bind_idx = NULL;
#endif
        /* setup for next cycle */
        n = 1;
    }
    if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    rc = ORTE_SUCCESS;

    /* now that we have all the data, we are guaranteed
     * to know our own node, so go back and record the
     * locality of each proc relative to me
     */
    while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&jobs))) {
        proc.jobid = nm->name.jobid;
        /* recover the number of procs in this job */
        vptr = &num_procs;
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_NPROCS, (void**)&vptr, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        for (i=0; i < num_procs; i++) {
            if (ORTE_PROC_MY_NAME->vpid == i &&
                ORTE_PROC_MY_NAME->jobid == proc.jobid) {
                /* this is me */
                continue;
            }
            proc.vpid = i;
            /* recover the daemon for this proc */
            vptr = &daemon;
            if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_DAEMON_VPID, (void**)&vptr, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (daemon == ORTE_PROC_MY_DAEMON->vpid) {
                OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                     "%s orte:util:decode:pidmap proc %s shares node",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc)));
                /* we share a node, so add them to the count of peers
                 * sharing the node with me
                 */
                orte_process_info.num_local_peers++;
#if OPAL_HAVE_HWLOC
                /* retrieve the bind level for the other proc's job */
                lvptr = &pbind;
                proc.vpid = ORTE_VPID_INVALID;
                if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_LEVEL, (void**)&lvptr, OPAL_HWLOC_LEVEL_T))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
                /* retrieve the other proc's bind idx */
                uiptr = &pbidx;
                proc.vpid = i;
                if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_INDEX, (void**)&uiptr, OPAL_UINT))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
                /* we share a node - see what else we share */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 orte_process_info.bind_level,
                                                                 orte_process_info.bind_idx,
                                                                 pbind, pbidx);
#else
                locality = OPAL_PROC_ON_NODE;
#endif
            } else {
                /* we don't share a node */
                OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                     "%s orte:util:decode:pidmap proc %s does NOT share node [my daemon %s, their daemon %s]",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc),
                                     ORTE_VPID_PRINT(ORTE_PROC_MY_DAEMON->vpid),
                                     ORTE_VPID_PRINT(daemon)));
                locality = OPAL_PROC_NON_LOCAL;
            }
            /* store the locality */
            OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                 "%s orte:util:decode:pidmap set proc %s locality to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc),
                                 opal_hwloc_base_print_locality(locality)));
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
        /* release the list item so it doesn't leak */
        OBJ_RELEASE(nm);
    }

 cleanup:
    if (NULL != daemons) {
        free(daemons);
    }
    if (NULL != local_rank) {
        free(local_rank);
    }
    if (NULL != node_rank) {
        free(node_rank);
    }
#if OPAL_HAVE_HWLOC
    if (NULL != bind_idx) {
        free(bind_idx);
    }
#endif
    if (NULL != states) {
        free(states);
    }
    if (NULL != app_idx) {
        free(app_idx);
    }
    if (NULL != restarts) {
        free(restarts);
    }
    OBJ_DESTRUCT(&buf);
    return rc;
}
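/*
 * Illustrative usage sketch (an assumption, not part of the original
 * source): an app would normally receive the encoded pidmap as a byte
 * object inside its launch message and hand it to the decoder above.
 * The enclosing function and variable names are hypothetical.
 */
#if 0
static int example_process_launch_msg(opal_byte_object_t *pidmap_bo)
{
    int rc;

    /* unpack the per-proc map sent by the HNP */
    if (ORTE_SUCCESS != (rc = orte_util_decode_pidmap(pidmap_bo))) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
#endif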
/*
 * Function for mapping a job across the allocated nodes, cycling
 * through the available mapper modules until one accepts the job.
 */
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
    orte_job_t *jdata;
    orte_job_map_t *map;
    int rc;
    bool did_map;
    opal_list_item_t *item;
    orte_rmaps_base_selected_module_t *mod;
    orte_job_t *parent;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* convenience */
    jdata = caddy->jdata;

    /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT
     * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN
     * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
     */
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
     * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
     * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
     * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
     * NULL MAP FIELD
     * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
     * DIDN'T SET IT
     */
    if (NULL == jdata->map) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: creating new map for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        /* create a map object where we will store the results */
        map = OBJ_NEW(orte_job_map_t);
        if (NULL == map) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* load it with the system defaults */
        map->mapping = orte_rmaps_base.mapping;
        map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
        map->binding = opal_hwloc_binding_policy;
#endif
        if (NULL != orte_rmaps_base.ppr) {
            map->ppr = strdup(orte_rmaps_base.ppr);
        }
        map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        map->display_map = orte_rmaps_base.display_map;
        /* assign the map object to this job */
        jdata->map = map;
    } else {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: setting mapping policies for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (!jdata->map->display_map) {
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        /* set the default mapping policy IFF it wasn't provided */
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping);
        }
        if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping,
                                       ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping));
        }
        /* ditto for rank and bind policies */
        if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
            ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking);
        }
#if OPAL_HAVE_HWLOC
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            jdata->map->binding = opal_hwloc_binding_policy;
        }
#endif
    }

#if OPAL_HAVE_HWLOC
    /* if we are not going to launch, then we need to set any
     * undefined topologies to match our own so the mapper
     * can operate
     */
    if (orte_do_not_launch) {
        orte_node_t *node;
        hwloc_topology_t t0;
        int i;
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        t0 = node->topology;
        for (i=1; i < orte_node_pool->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                continue;
            }
            if (NULL == node->topology) {
                node->topology = t0;
            }
        }
    }
#endif

    /* cycle thru the available mappers until one agrees to map
     * the job
     */
    did_map = false;
    for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
         item != opal_list_get_end(&orte_rmaps_base.selected_modules);
         item = opal_list_get_next(item)) {
        mod = (orte_rmaps_base_selected_module_t*)item;
        if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
            did_map = true;
            break;
        }
        /* mappers return "next option" if they didn't attempt to
         * map the job. anything else is a true error.
         */
        if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
            ORTE_ERROR_LOG(rc);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }

    /* if we get here without doing the map, or with zero procs in
     * the map, then that's an error
     */
    if (!did_map || 0 == jdata->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

#if OPAL_HAVE_HWLOC
    /* compute and save bindings */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
#endif

    /* if it is a dynamic spawn, save the bookmark on the parent's job too */
    if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
        if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) {
            parent->bookmark = jdata->bookmark;
        }
    }

    /* if we wanted to display the map, now is the time to do it - ignore
     * daemon job
     */
    if (jdata->map->display_map) {
        char *output;
        int i, j;
        orte_node_t *node;
        orte_proc_t *proc;

        if (orte_display_diffable_output) {
            /* intended solely to test mapping methods, this output
             * can become quite long when testing at scale. Rather
             * than enduring all the malloc/free's required to
             * create an arbitrary-length string, custom-generate
             * the output a line at a time here
             */
            /* display just the procs in a diffable format */
            opal_output(orte_clean_output, "<map>");
            fflush(stderr);
            /* loop through nodes */
            for (i=0; i < jdata->map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
                    continue;
                }
                opal_output(orte_clean_output, "\t<host name=%s>",
                            (NULL == node->name) ? "UNKNOWN" : node->name);
                fflush(stderr);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
#if OPAL_HAVE_HWLOC
                    {
                        /* default the locale string in case the proc has none */
                        char locale[64] = "UNKNOWN";

                        if (NULL != proc->locale) {
                            hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset);
                        }
                        opal_output(orte_clean_output,
                                    "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s[%s:%u]>",
                                    ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
                                    (unsigned long)proc->local_rank,
                                    (unsigned long)proc->node_rank, locale,
                                    (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap,
                                    opal_hwloc_base_print_level(jdata->map->bind_level),
                                    proc->bind_idx);
                    }
#else
                    opal_output(orte_clean_output,
                                "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>",
                                ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx,
                                (unsigned long)proc->local_rank,
                                (unsigned long)proc->node_rank);
#endif
                    fflush(stderr);
                }
                opal_output(orte_clean_output, "\t</host>");
                fflush(stderr);
            }
#if OPAL_HAVE_HWLOC
            {
                opal_hwloc_locality_t locality;
                orte_proc_t *p0;

                /* test locality - for the first node, print the locality of each
                 * proc relative to the first one
                 */
                node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
                p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
                opal_output(orte_clean_output, "\t<locality>");
                for (j=1; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    locality = opal_hwloc_base_get_relative_locality(node->topology,
                                                                     jdata->map->bind_level,
                                                                     p0->bind_idx,
                                                                     jdata->map->bind_level,
                                                                     proc->bind_idx);
                    opal_output(orte_clean_output,
                                "\t\t<bind_level=%s rank=%s bind_idx=%u rank=%s bind_idx=%u locality=%s>",
                                opal_hwloc_base_print_level(jdata->map->bind_level),
                                ORTE_VPID_PRINT(p0->name.vpid), p0->bind_idx,
                                ORTE_VPID_PRINT(proc->name.vpid), proc->bind_idx,
                                opal_hwloc_base_print_locality(locality));
                }
                opal_output(orte_clean_output, "\t</locality>\n</map>");
                fflush(stderr);
            }
#else
            opal_output(orte_clean_output, "\n</map>");
            fflush(stderr);
#endif
        } else {
            opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "%s\n", output);
                fflush(orte_xml_fp);
            } else {
                opal_output(orte_clean_output, "%s", output);
            }
            free(output);
        }
    }

    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}