Example #1
int orte_ess_base_proc_binding(void)
{
#if OPAL_HAVE_HWLOC
    hwloc_obj_t node, obj;
    hwloc_cpuset_t cpus, nodeset;
    hwloc_obj_type_t target;
    unsigned int cache_level = 0;
    struct hwloc_topology_support *support;
    char *map;
    int ret;
    char *error;

    /* Determine if we were pre-bound or not */
    if (NULL != getenv("OMPI_MCA_orte_bound_at_launch")) {
        orte_proc_is_bound = true;
        if (NULL != (map = getenv("OMPI_MCA_orte_base_applied_binding"))) {
            orte_proc_applied_binding = hwloc_bitmap_alloc();
            if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) {
                error = "applied_binding parse";
                goto error;
            }
        }
    }

    /* see if we were bound when launched */
    if (!orte_proc_is_bound) {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Not bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* we were not bound at launch */
        if (NULL != opal_hwloc_topology) {
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
            /* get our node object */
            node = hwloc_get_root_obj(opal_hwloc_topology);
            nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node);
            /* get our bindings */
            cpus = hwloc_bitmap_alloc();
            if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) {
                /* we are NOT bound if get_cpubind fails, nor can we be bound - the
                 * environment does not support it
                 */
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Binding not supported",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto MOVEON;
            }
            /* we are bound if the two cpusets are not equal,
             * or if there is only ONE cpu available to us
             */
            if (0 != hwloc_bitmap_compare(cpus, nodeset) ||
                opal_hwloc_base_single_cpu(nodeset) ||
                opal_hwloc_base_single_cpu(cpus)) {
                /* someone external set it - indicate it is set
                 * so that we know
                 */
                orte_proc_is_bound = true;
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Process was externally bound",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            } else if (support->cpubind->set_thisproc_cpubind &&
                       OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                       OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                /* the system is capable of doing processor affinity, but it
                 * has not yet been set - see if a slot_list was given
                 */
                hwloc_bitmap_zero(cpus);
                if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
                                                                               opal_hwloc_topology, cpus))) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    /* try to find a level and index for this location */
                    opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level, &orte_process_info.bind_idx);
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    orte_proc_is_bound = true;
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                         "%s Process bound according to slot_list",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else {
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    /* get the node rank */
                    if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) {
                        /* this is not an error - could be due to being
                         * direct launched - so just ignore and leave
                         * us unbound
                         */
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process not bound - no node rank available",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                        goto MOVEON;
                    }
                    /* if the binding policy is hwthread, then we bind to the nrank-th
                     * hwthread on this node
                     */
                    if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting hwthread object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to hwthread",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        /* if the binding policy is core, then we bind to the nrank-th
                         * core on this node
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            error = "Setting processor affinity failed";
                            ret = ORTE_ERROR;
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to core",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else {
                        /* for all higher binding policies, we bind to the specified
                         * object that the nrank-th core belongs to
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 1;
                            orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 2;
                            orte_process_info.bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 3;
                            orte_process_info.bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_SOCKET;
                            orte_process_info.bind_level = OPAL_HWLOC_SOCKET_LEVEL;
                        } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_NODE;
                            orte_process_info.bind_level = OPAL_HWLOC_NUMA_LEVEL;
                        } else {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Binding policy not known";
                            goto error;
                        }
                        for (obj = obj->parent; NULL != obj; obj = obj->parent) {
                            if (target == obj->type) {
                                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                                    continue;
                                }
                                /* this is the place! */
                                cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                                if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                    ret = ORTE_ERROR;
                                    error = "Setting processor affinity failed";
                                    goto error;
                                }
                                orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
                                                                                         obj, OPAL_HWLOC_LOGICAL);
                                orte_proc_is_bound = true;
                                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                                     "%s Process bound to %s",
                                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                     opal_hwloc_base_print_level(orte_process_info.bind_level)));
                                break;
                            }
                        }
                        if (!orte_proc_is_bound) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                    }
                }
            }
        }
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Process bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

 MOVEON:
    /* get or update our local cpuset - it will get used multiple
     * times, so it's more efficient to keep a global copy
     */
    opal_hwloc_base_get_local_cpuset();
    /* report bindings, if requested */
    if (opal_hwloc_report_bindings) {
        char bindings[64];
        hwloc_obj_t root;
        hwloc_cpuset_t cpus;
        /* get the root object for this node */
        root = hwloc_get_root_obj(opal_hwloc_topology);
        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
        /* we are not bound if this equals our cpuset */
        if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
            opal_output(0, "%s is not bound",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        } else {
            hwloc_bitmap_list_snprintf(bindings, 64, opal_hwloc_my_cpuset);
            opal_output(0, "%s is bound to cpus %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        bindings);
        }
    }

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ORTE_ERR_SILENT;

#else
    return ORTE_SUCCESS;
#endif
}
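
The heart of Example #1 is the external-binding check: the process compares its current cpubind against the node's cpuset and treats a mismatch (or a single-CPU set) as an externally applied binding. Below is a minimal standalone sketch of that check using stock hwloc only; the topology setup and the use of the complete cpuset (instead of ORTE's available-CPU filter via opal_hwloc_base_get_available_cpus) are simplifying assumptions for illustration, not ORTE code. Compile with: gcc check_bound.c -lhwloc

#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_bitmap_t cpus;
    hwloc_const_cpuset_t full;
    char buf[256];

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    /* the complete set of CPUs on this machine */
    full = hwloc_topology_get_complete_cpuset(topo);

    /* the CPUs this process is currently allowed to run on */
    cpus = hwloc_bitmap_alloc();
    if (hwloc_get_cpubind(topo, cpus, HWLOC_CPUBIND_PROCESS) < 0) {
        /* same conclusion the ORTE code draws: the environment
         * does not support binding at all */
        printf("binding not supported in this environment\n");
    } else if (hwloc_bitmap_isequal(cpus, full)) {
        printf("not bound: cpubind covers the whole machine\n");
    } else {
        hwloc_bitmap_list_snprintf(buf, sizeof(buf), cpus);
        printf("externally bound to cpus %s\n", buf);
    }

    hwloc_bitmap_free(cpus);
    hwloc_topology_destroy(topo);
    return 0;
}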
Example #2
/* only APPS call this function - daemons have their own */
int orte_util_decode_pidmap(opal_byte_object_t *bo)
{
    orte_vpid_t i, num_procs, *vptr, daemon;
    orte_vpid_t *daemons=NULL;
    orte_local_rank_t *local_rank=NULL;
    orte_node_rank_t *node_rank=NULL;
#if OPAL_HAVE_HWLOC
    opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr;
    unsigned int *bind_idx=NULL, pbidx, *uiptr;
#endif
    opal_hwloc_locality_t locality;
    orte_std_cntr_t n;
    opal_buffer_t buf;
    int rc;
    orte_proc_state_t *states = NULL;
    orte_app_idx_t *app_idx = NULL;
    int32_t *restarts = NULL;
    orte_process_name_t proc, dmn;
    orte_namelist_t *nm;
    opal_list_t jobs;
    char *hostname;

    /* xfer the byte object to a buffer for unpacking */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORTE_SUCCESS != (rc = opal_dss.load(&buf, bo->bytes, bo->size))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    n = 1;
    /* cycle through the buffer */
    OBJ_CONSTRUCT(&jobs, opal_list_t);
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(&buf, &proc.jobid, &n, ORTE_JOBID))) {
        OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                             "%s orte:util:decode:pidmap working job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(proc.jobid)));
        /* record the jobid */
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = proc.jobid;
        opal_list_append(&jobs, &nm->super);

        /* unpack and store the number of procs */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_procs, &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NPROCS, &num_procs, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

#if OPAL_HAVE_HWLOC
        /* unpack and store the binding level */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &bind_level, &n, OPAL_HWLOC_LEVEL_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* store it */
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_LEVEL, &bind_level, OPAL_HWLOC_LEVEL_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* set mine */
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            orte_process_info.bind_level = bind_level;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                             "%s orte:util:decode:pidmap nprocs %s bind level %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_VPID_PRINT(num_procs),
                             opal_hwloc_base_print_level(bind_level)));
#endif

        /* allocate memory for the daemon info */
        daemons = (orte_vpid_t*)malloc(num_procs * sizeof(orte_vpid_t));
        /* unpack it in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, daemons, &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* allocate memory for local ranks */
        local_rank = (orte_local_rank_t*)malloc(num_procs*sizeof(orte_local_rank_t));
        /* unpack them in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, local_rank, &n, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.my_local_rank = local_rank[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_LOCALRANK,
                                                    &orte_process_info.my_local_rank, ORTE_LOCAL_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
        
        /* allocate memory for node ranks */
        node_rank = (orte_node_rank_t*)malloc(num_procs*sizeof(orte_node_rank_t));
        /* unpack node ranks in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, node_rank, &n, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.my_node_rank = node_rank[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_NODERANK,
                                                    &orte_process_info.my_node_rank, ORTE_NODE_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
        
#if OPAL_HAVE_HWLOC
        /* allocate memory for bind_idx */
        bind_idx = (unsigned int*)malloc(num_procs*sizeof(unsigned int));
        /* unpack bind_idx in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, bind_idx, &n, OPAL_UINT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.bind_idx = bind_idx[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_BIND_INDEX,
                                                    &orte_process_info.bind_idx, OPAL_UINT))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
#endif

        /* allocate memory for states */
        states = (orte_proc_state_t*)malloc(num_procs*sizeof(orte_proc_state_t));
        /* unpack states in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, states, &n, ORTE_PROC_STATE))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(states);
        states = NULL;

        /* allocate memory for app_idx's */
        app_idx = (orte_app_idx_t*)malloc(num_procs*sizeof(orte_app_idx_t));
        /* unpack app_idx's in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, app_idx, &n, ORTE_APP_IDX))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(app_idx);
        app_idx = NULL;

        /* allocate memory for restarts */
        restarts = (int32_t*)malloc(num_procs*sizeof(int32_t));
        /* unpack restarts in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, restarts, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(restarts);
        restarts = NULL;

        /* set the daemon jobid */
        dmn.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid);

        /* xfer the data */
        for (i=0; i < num_procs; i++) {
            if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
                i == ORTE_PROC_MY_NAME->vpid) {
                continue;
            }
            proc.vpid = i;
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &daemons[i], ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* lookup and store the hostname for this proc */
            dmn.vpid = daemons[i];
            if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank[i], ORTE_LOCAL_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank[i], ORTE_NODE_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
#if OPAL_HAVE_HWLOC
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_INDEX, &bind_idx[i], OPAL_UINT))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((10, orte_nidmap_output,
                                 "%s orte:util:decode:pidmap proc %s host %s lrank %d nrank %d bindidx %u",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc), hostname,
                                 (int)local_rank[i], (int)node_rank[i], bind_idx[i]));
#endif
        }
        /* release data */
        free(daemons);
        daemons = NULL;
        free(local_rank);
        local_rank = NULL;
        free(node_rank);
        node_rank = NULL;
#if OPAL_HAVE_HWLOC
        free(bind_idx);
        bind_idx = NULL;
#endif
        /* setup for next cycle */
        n = 1;
    }
    if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    rc = ORTE_SUCCESS;

    /* now that we have all the data, we are guaranteed
     * to know our own node, so go back and record the
     * locality of each proc relative to me
     */
    while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&jobs))) {
        proc.jobid = nm->name.jobid;
        /* recover the number of procs in this job */
        vptr = &num_procs;
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_NPROCS, (void**)&vptr, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        for (i=0; i < num_procs; i++) {
            if (ORTE_PROC_MY_NAME->vpid == i &&
                ORTE_PROC_MY_NAME->jobid == proc.jobid) {
                /* this is me */
                continue;
            }
            proc.vpid = i;
            /* recover the daemon for this proc */
            vptr = &daemon;
            if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_DAEMON_VPID, (void**)&vptr, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (daemon == ORTE_PROC_MY_DAEMON->vpid) {
                OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                     "%s orte:util:decode:pidmap proc %s shares node",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc)));
                /* we share a node, so add them to the count of peers
                 * sharing the node with me
                 */
                orte_process_info.num_local_peers++;
#if OPAL_HAVE_HWLOC
                /* retrieve the bind level for the other proc's job */
                lvptr = &pbind;
                proc.vpid = ORTE_VPID_INVALID;
                if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_LEVEL, (void**)&lvptr, OPAL_HWLOC_LEVEL_T))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }

                /* retrieve the other proc's bind idx */
                uiptr = &pbidx;
                proc.vpid = i;
                if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_INDEX, (void**)&uiptr, OPAL_UINT))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }

                /* we share a node - see what else we share */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 orte_process_info.bind_level,
                                                                 orte_process_info.bind_idx,
                                                                 pbind, pbidx);
#else
                locality = OPAL_PROC_ON_NODE;
#endif
            } else {
                /* we don't share a node */
                OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                     "%s orte:util:decode:pidmap proc %s does NOT node [my daemon %s, their daemon %s]",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc),
                                     ORTE_VPID_PRINT(ORTE_PROC_MY_DAEMON->vpid),
                                     ORTE_VPID_PRINT(daemon)));
                locality = OPAL_PROC_NON_LOCAL;
            }
            /* store the locality */
            OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                 "%s orte:util:decode:pidmap set proc %s locality to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc),
                                 opal_hwloc_base_print_locality(locality)));
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
    }

 cleanup:
    if (NULL != daemons) {
        free(daemons);
    }
    if (NULL != local_rank) {
        free(local_rank);
    }
    if (NULL != node_rank) {
        free(node_rank);
    }
#if OPAL_HAVE_HWLOC
    if (NULL != bind_idx) {
        free(bind_idx);
    }
#endif
    if (NULL != states) {
        free(states);
    }
    if (NULL != app_idx) {
        free(app_idx);
    }
    if (NULL != restarts) {
        free(restarts);
    }
    OBJ_DESTRUCT(&buf);
    return rc;
}
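
orte_util_decode_pidmap applies one pattern over and over: unpack a per-job count, then pull each per-proc array out of the buffer "in one shot". The sketch below illustrates that count-then-bulk-array decode on a hypothetical flat layout (a uint32_t count followed by that many uint32_t daemon vpids); the real function goes through the opal_dss pack/unpack engine, whose wire format this does not reproduce.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int decode_daemon_map(const uint8_t *bytes, size_t size,
                             uint32_t **daemons_out, uint32_t *nprocs_out)
{
    uint32_t nprocs;
    uint32_t *daemons;
    size_t need;

    if (size < sizeof(nprocs)) return -1;
    memcpy(&nprocs, bytes, sizeof(nprocs));          /* count first */

    need = sizeof(nprocs) + (size_t)nprocs * sizeof(uint32_t);
    if (size < need) return -1;                      /* truncated buffer */

    daemons = malloc(nprocs * sizeof(uint32_t));
    if (NULL == daemons) return -1;
    /* then the whole array in one shot */
    memcpy(daemons, bytes + sizeof(nprocs), nprocs * sizeof(uint32_t));

    *daemons_out = daemons;
    *nprocs_out = nprocs;
    return 0;
}

int main(void)
{
    uint8_t buf[sizeof(uint32_t) * 4];
    uint32_t n = 3, vpids[3] = {0, 0, 1}, *out = NULL, count = 0;

    memcpy(buf, &n, sizeof(n));
    memcpy(buf + sizeof(n), vpids, sizeof(vpids));

    if (0 == decode_daemon_map(buf, sizeof(buf), &out, &count)) {
        for (uint32_t i = 0; i < count; i++)
            printf("proc %u runs under daemon %u\n", i, out[i]);
        free(out);
    }
    return 0;
}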
/*
 * Function for mapping a job: cycle through the available mapper
 * modules until one agrees to map it.
 */
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
    orte_job_t *jdata;
    orte_job_map_t *map;
    int rc;
    bool did_map;
    opal_list_item_t *item;
    orte_rmaps_base_selected_module_t *mod;
    orte_job_t *parent;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* convenience */
    jdata = caddy->jdata;

    /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT
     * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN
     * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
     */
    
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
     * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
     * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
     * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
     * NULL MAP FIELD
     * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
     * DIDN'T SET IT
     */        
    if (NULL == jdata->map) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: creating new map for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        /* create a map object where we will store the results */
        map = OBJ_NEW(orte_job_map_t);
        if (NULL == map) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* load it with the system defaults */
        map->mapping = orte_rmaps_base.mapping;
        map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
        map->binding = opal_hwloc_binding_policy;
#endif
        if (NULL != orte_rmaps_base.ppr) {
            map->ppr = strdup(orte_rmaps_base.ppr);
        }
        map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        map->display_map = orte_rmaps_base.display_map;
        /* assign the map object to this job */
        jdata->map = map;
    } else {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: setting mapping policies for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));

        if (!jdata->map->display_map) {
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        /* set the default mapping policy IFF it wasn't provided */
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping);
        }
        if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping));
        }
        /* ditto for rank and bind policies */
        if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
            ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking);
        }
#if OPAL_HAVE_HWLOC
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            jdata->map->binding = opal_hwloc_binding_policy;
        }
#endif
    }

#if OPAL_HAVE_HWLOC
    /* if we are not going to launch, then we need to set any
     * undefined topologies to match our own so the mapper
     * can operate
     */
    if (orte_do_not_launch) {
        orte_node_t *node;
        hwloc_topology_t t0;
        int i;
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        t0 = node->topology;
        for (i=1; i < orte_node_pool->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                continue;
            }
            if (NULL == node->topology) {
                node->topology = t0;
            }
        }
    }
#endif

    /* cycle thru the available mappers until one agrees to map
     * the job
     */
    did_map = false;
    for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
         item != opal_list_get_end(&orte_rmaps_base.selected_modules);
         item = opal_list_get_next(item)) {
        mod = (orte_rmaps_base_selected_module_t*)item;
        if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
            did_map = true;
            break;
        }
        /* mappers return "next option" if they didn't attempt to
         * map the job. anything else is a true error.
         */
        if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
            ORTE_ERROR_LOG(rc);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }
    /* if we get here without doing the map, or with zero procs in
     * the map, then that's an error
     */
    if (!did_map || 0 == jdata->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    
#if OPAL_HAVE_HWLOC
    /* compute and save bindings */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
#endif
    
    /* if it is a dynamic spawn, save the bookmark on the parent's job too */
    if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
        if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) {
            parent->bookmark = jdata->bookmark;
        }
    }

    /* if we wanted to display the map, now is the time to do it - ignore
     * daemon job
     */
    if (jdata->map->display_map) {
        char *output;
        int i, j;
        orte_node_t *node;
        orte_proc_t *proc;

        if (orte_display_diffable_output) {
            /* intended solely to test mapping methods, this output
             * can become quite long when testing at scale. Rather
             * than enduring all the malloc/free's required to
             * create an arbitrary-length string, custom-generate
             * the output a line at a time here
             */
            /* display just the procs in a diffable format */
            opal_output(orte_clean_output, "<map>");
            fflush(stderr);
            /* loop through nodes */
            for (i=0; i < jdata->map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
                    continue;
                }
                opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? "UNKNOWN" : node->name);
                fflush(stderr);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
#if OPAL_HAVE_HWLOC
                    {
                        /* initialize so the buffer is never printed
                         * uninitialized when no locale was assigned */
                        char locale[64] = "UNKNOWN";

                        if (NULL != proc->locale) {
                            hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset);
                        }
                        opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s[%s:%u]>",
                                    ORTE_VPID_PRINT(proc->name.vpid),  (long)proc->app_idx,
                                    (unsigned long)proc->local_rank,
                                    (unsigned long)proc->node_rank, locale,
                                    (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap,
                                    opal_hwloc_base_print_level(jdata->map->bind_level), proc->bind_idx);
                    }
#else
                    opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>",
                                ORTE_VPID_PRINT(proc->name.vpid),  (long)proc->app_idx,
                                (unsigned long)proc->local_rank,
                                (unsigned long)proc->node_rank);
#endif
                    fflush(stderr);
                }
                opal_output(orte_clean_output, "\t</host>");
                fflush(stderr);
            }
#if OPAL_HAVE_HWLOC
            {
                opal_hwloc_locality_t locality;
                orte_proc_t *p0;

                /* test locality - for the first node, print the locality of each proc relative to the first one */
                node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
                p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
                opal_output(orte_clean_output, "\t<locality>");
                for (j=1; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    locality = opal_hwloc_base_get_relative_locality(node->topology,
                                                                     jdata->map->bind_level,
                                                                     p0->bind_idx,
                                                                     jdata->map->bind_level,
                                                                     proc->bind_idx);
                    opal_output(orte_clean_output, "\t\t<bind_level=%s rank=%s bind_idx=%u rank=%s bind_idx=%u locality=%s>",
                                opal_hwloc_base_print_level(jdata->map->bind_level),
                                ORTE_VPID_PRINT(p0->name.vpid),
                                p0->bind_idx, ORTE_VPID_PRINT(proc->name.vpid),
                                proc->bind_idx, opal_hwloc_base_print_locality(locality));
                }
                opal_output(orte_clean_output, "\t</locality>\n</map>");
                fflush(stderr);
            }
#else
            opal_output(orte_clean_output, "\n</map>");
            fflush(stderr);
#endif
        } else {
            opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "%s\n", output);
                fflush(orte_xml_fp);
            } else {
                opal_output(orte_clean_output, "%s", output);
            }
            free(output);
        }
    }
    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
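
The selection loop in this function is a chain-of-responsibility scan: each selected mapper module gets one chance to map the job, ORTE_ERR_TAKE_NEXT_OPTION means "I pass", and any other failure aborts the scan. Here is a hedged standalone sketch of that control flow, with hypothetical stand-in mappers and return codes replacing the ORTE module list and error constants.

#include <stdio.h>

enum { SUCCESS = 0, TAKE_NEXT_OPTION = 1, FATAL = 2 };

typedef int (*map_job_fn)(const char *job);

/* hypothetical mappers: the first declines, the second accepts */
static int ppr_mapper(const char *job) { (void)job; return TAKE_NEXT_OPTION; }
static int rr_mapper(const char *job)  { printf("round-robin mapped %s\n", job); return SUCCESS; }

static int map_job(const char *job, map_job_fn *mappers, int n)
{
    for (int i = 0; i < n; i++) {
        int rc = mappers[i](job);
        if (SUCCESS == rc) return SUCCESS;        /* someone took the job */
        if (TAKE_NEXT_OPTION != rc) return rc;    /* true error: stop scanning */
        /* mapper declined: fall through to the next one */
    }
    return FATAL;   /* nobody agreed to map the job */
}

int main(void)
{
    map_job_fn mappers[] = { ppr_mapper, rr_mapper };
    return map_job("job-1", mappers, 2) == SUCCESS ? 0 : 1;
}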