Example #1
/*
 * Where is this process currently bound? (layout string)
 */
static int get_layout_current_binding(char str[OMPI_AFFINITY_STRING_MAX])
{
    int ret;
    hwloc_obj_t root;
    hwloc_cpuset_t boundset, rootset;
    bool bound = false;

    /* get our root object */
    root = hwloc_get_root_obj(opal_hwloc_topology);
    rootset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);

    /* get our bindings */
    boundset = hwloc_bitmap_alloc();
    if (hwloc_get_cpubind(opal_hwloc_topology, boundset,
                          HWLOC_CPUBIND_PROCESS) < 0) {
        /* we are NOT bound if get_cpubind fails, nor can we be bound
           - the environment does not support it */
        bound = false;
    } else {
        /* we are bound if the two cpusets are not equal, or if there
           is only ONE PU available to us */
        if (0 != hwloc_bitmap_compare(boundset, rootset) ||
            opal_hwloc_base_single_cpu(rootset) ||
            opal_hwloc_base_single_cpu(boundset)) {
            bound = true;
        }
    }

    /* If we are not bound, indicate that */
    if (!bound) {
        strncat(str, not_bound_str, OMPI_AFFINITY_STRING_MAX - 1);
        ret = OMPI_SUCCESS;
    }

    /* If we are bound, print it out */
    else {
        ret = opal_hwloc_base_cset2mapstr(str, OMPI_AFFINITY_STRING_MAX,
                                          opal_hwloc_topology,
                                          boundset);
        if (OPAL_ERR_NOT_BOUND == ret) {
            strncpy(str, not_bound_str, OMPI_AFFINITY_STRING_MAX - 1);
            ret = OMPI_SUCCESS;
        }
    }
    hwloc_bitmap_free(boundset);

    return ret;
}
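For readers who want to see the underlying hwloc calls without the OPAL wrappers, here is a minimal standalone sketch of the same check: load a topology, query the process binding, and compare it against the root cpuset. The program shape, the simplified equality test, and the omission of availability filtering are illustrative assumptions, not part of Open MPI.

#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

/* Standalone sketch: report whether this process appears to be bound,
 * using only public hwloc calls (no OPAL caching or availability masks). */
int main(void)
{
    hwloc_topology_t topo;
    hwloc_cpuset_t bound;
    hwloc_obj_t root;
    char *str = NULL;

    if (hwloc_topology_init(&topo) < 0 || hwloc_topology_load(topo) < 0) {
        return 1;
    }
    root = hwloc_get_root_obj(topo);
    bound = hwloc_bitmap_alloc();

    if (hwloc_get_cpubind(topo, bound, HWLOC_CPUBIND_PROCESS) < 0) {
        /* binding queries not supported - treat as unbound */
        printf("Not bound\n");
    } else if (hwloc_bitmap_isequal(bound, root->cpuset)) {
        /* our cpuset covers the whole machine - effectively unbound */
        printf("Not bound\n");
    } else {
        hwloc_bitmap_list_asprintf(&str, bound);
        printf("Bound to PUs: %s\n", str);
        free(str);
    }

    hwloc_bitmap_free(bound);
    hwloc_topology_destroy(topo);
    return 0;
}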
Example #2
static int
sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
                       int32_t my_smp_rank,
                       int n)
{
    size_t length, length_payload;
    sm_fifo_t *my_fifos;
    int my_mem_node, num_mem_nodes, i, rc;
    mca_mpool_base_resources_t *res = NULL;
    mca_btl_sm_component_t* m = &mca_btl_sm_component;

    /* Assume we don't have hwloc support and fill in dummy info */
    mca_btl_sm_component.mem_node = my_mem_node = 0;
    mca_btl_sm_component.num_mem_nodes = num_mem_nodes = 1;

#if OPAL_HAVE_HWLOC
    /* If we have hwloc support, then get accurate information */
    if (NULL != opal_hwloc_topology) {
        i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
                                               HWLOC_OBJ_NODE, 0,
                                               OPAL_HWLOC_AVAILABLE);

        /* If we find >0 NUMA nodes, then investigate further */
        if (i > 0) {
            int numa=0, w;
            unsigned n_bound=0;
            hwloc_cpuset_t avail;
            hwloc_obj_t obj;

            /* JMS This tells me how many numa nodes are *available*,
               but it's not how many are being used *by this job*.
               Note that this is the value we've previously used (from
               the previous carto-based implementation), but it really
               should be improved to be how many NUMA nodes are being
               used *in this job*. */
            mca_btl_sm_component.num_mem_nodes = num_mem_nodes = i;

            /* if we are bound (i.e., we have a cpuset), then count the
             * number of NUMA nodes to which we are bound; otherwise,
             * there is nothing further to do */
            if (NULL != ompi_process_info.cpuset) {
                for (w=0; w < i; w++) {
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                                                       HWLOC_OBJ_NODE, 0, w,
                                                                       OPAL_HWLOC_AVAILABLE))) {
                        continue;
                    }
                    /* get that NUMA node's available cpus */
                    avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    /* see if we intersect */
                    if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
                        n_bound++;
                        numa = w;
                    }
                }
                /* if we are located on more than one NUMA, or we didn't find
                 * a NUMA we are on, then not much we can do
                 */
                if (1 == n_bound) {
                    mca_btl_sm_component.mem_node = my_mem_node = numa;
                } else {
                    mca_btl_sm_component.mem_node = my_mem_node = -1;
                }
            }
        }
    }
#endif

    if (NULL == (res = calloc(1, sizeof(*res)))) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* lookup shared memory pool */
    mca_btl_sm_component.sm_mpools =
        (mca_mpool_base_module_t **)calloc(num_mem_nodes,
                                           sizeof(mca_mpool_base_module_t *));
    if (NULL == mca_btl_sm_component.sm_mpools) {
        free(res);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Disable memory binding, because each MPI process will claim pages in the
     * mpool for their local NUMA node */
    res->mem_node = -1;

    if (OMPI_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
        free(res);
        return rc;
    }
    /* now that res is fully populated, create the thing */
    mca_btl_sm_component.sm_mpools[0] =
        mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name,
                                     sm_btl, res);
    /* Sanity check to ensure that we found it */
    if (NULL == mca_btl_sm_component.sm_mpools[0]) {
        free(res);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[0];

    mca_btl_sm_component.sm_mpool_base =
        mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]);

    /* create a list of peers */
    mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**)
        calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == mca_btl_sm_component.sm_peers) {
        free(res);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* remember that node rank zero is already attached */
    if (0 != my_smp_rank) {
        if (OMPI_SUCCESS != (rc = sm_segment_attach(m))) {
            free(res);
            return rc;
        }
    }

    /* it is now safe to free the mpool resources */
    free(res);

    /* check to make sure number of local procs is within the
     * specified limits */
    if(mca_btl_sm_component.sm_max_procs > 0 &&
       mca_btl_sm_component.num_smp_procs + n >
       mca_btl_sm_component.sm_max_procs) {
        return OMPI_ERROR;
    }

    mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.sm_seg->module_data_addr;
    mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n);
    mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n);

    /* set the base of the shared memory segment */
    mca_btl_sm_component.shm_bases[mca_btl_sm_component.my_smp_rank] =
        (char*)mca_btl_sm_component.sm_mpool_base;
    mca_btl_sm_component.shm_mem_nodes[mca_btl_sm_component.my_smp_rank] =
        (uint16_t)my_mem_node;

    /* initialize the array of fifo's "owned" by this process */
    if(NULL == (my_fifos = (sm_fifo_t*)mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t))))
        return OMPI_ERR_OUT_OF_RESOURCE;

    mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    /* cache the pointer to the 2d fifo array.  These addresses
     * are valid in the current process space */
    mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);

    if(NULL == mca_btl_sm_component.fifo)
        return OMPI_ERR_OUT_OF_RESOURCE;

    mca_btl_sm_component.fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    mca_btl_sm_component.mem_nodes = (uint16_t *) malloc(sizeof(uint16_t) * n);
    if(NULL == mca_btl_sm_component.mem_nodes)
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* initialize fragment descriptor free lists */

    /* allocation will be for the fragment descriptor and payload buffer */
    length = sizeof(mca_btl_sm_frag1_t);
    length_payload =
        sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.eager_limit;
    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_eager, length,
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag1_t),
                                length_payload, opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if ( OMPI_SUCCESS != i )
        return i;

    length = sizeof(mca_btl_sm_frag2_t);
    length_payload =
        sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.max_frag_size;
    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_max, length,
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag2_t),
                                length_payload, opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if ( OMPI_SUCCESS != i )
        return i;

    i = ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_user,
                                sizeof(mca_btl_sm_user_t),
                                opal_cache_line_size, OBJ_CLASS(mca_btl_sm_user_t),
                                sizeof(mca_btl_sm_hdr_t), opal_cache_line_size,
                                mca_btl_sm_component.sm_free_list_num,
                                mca_btl_sm_component.sm_free_list_max,
                                mca_btl_sm_component.sm_free_list_inc,
                                mca_btl_sm_component.sm_mpool);
    if ( OMPI_SUCCESS != i )
        return i;

    mca_btl_sm_component.num_outstanding_frags = 0;

    mca_btl_sm_component.num_pending_sends = 0;
    i = opal_free_list_init(&mca_btl_sm_component.pending_send_fl,
                            sizeof(btl_sm_pending_send_item_t),
                            OBJ_CLASS(opal_free_list_item_t),
                            16, -1, 32);
    if ( OMPI_SUCCESS != i )
        return i;

    /* set flag indicating btl has been inited */
    sm_btl->btl_inited = true;

    return OMPI_SUCCESS;
}
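The hwloc block at the top of this function can be hard to follow through the OPAL wrappers; the sketch below shows the same "how many NUMA nodes does my cpuset touch" test in plain hwloc 1.x (where HWLOC_OBJ_NODE is the NUMA-node type). The helper name and parameters are hypothetical.

#include <hwloc.h>

/* Sketch: count how many NUMA nodes intersect a given cpuset and remember
 * the index of the last one that does - the same test used above to decide
 * whether this process is confined to a single memory node. */
static int count_numa_intersections(hwloc_topology_t topo,
                                    hwloc_const_cpuset_t mycpuset,
                                    int *last_numa)
{
    int w, n_bound = 0;
    int nnuma = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NODE);
    hwloc_obj_t obj;

    for (w = 0; w < nnuma; w++) {
        obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_NODE, w);
        if (NULL != obj && hwloc_bitmap_intersects(obj->cpuset, mycpuset)) {
            n_bound++;
            *last_numa = w;
        }
    }
    return n_bound;   /* the caller treats a count of 1 as "single memory node" */
}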
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail unnecessarily. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, proc->locale, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)proc->locale->userdata;
            data->num_bound++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(proc->locale->type), idx);
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, proc->locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* error out if adding a proc would cause overload and that wasn't allowed,
             * and it wasn't a default binding policy (i.e., the user requested it)
             */
            if (ncpus < data->num_bound &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
                OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                               opal_hwloc_base_print_binding(map->binding), node->name,
                               data->num_bound, ncpus);
                return ORTE_ERR_SILENT;
            }
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
            /* record the location */
            proc->bind_location = proc->locale;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                proc->cpu_bitmap,
                                hwloc_obj_type_string(proc->locale->type),
                                idx, node->name);
        }
    }

    return ORTE_SUCCESS;
}
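Each binding computed above ends up in proc->cpu_bitmap as an hwloc "list" string produced by hwloc_bitmap_list_asprintf. The short sketch below (helper name assumed) shows the round trip between a cpuset and that string form, which is how the binding eventually reaches the launched process.

#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch: cpu_bitmap strings are hwloc "list" strings such as "0-3,8".
 * They round-trip through these two calls. */
static void cpuset_string_roundtrip(hwloc_const_cpuset_t cpus)
{
    char *str = NULL;
    hwloc_cpuset_t parsed = hwloc_bitmap_alloc();

    hwloc_bitmap_list_asprintf(&str, cpus);   /* cpuset -> "0-3,8" */
    hwloc_bitmap_list_sscanf(parsed, str);    /* "0-3,8" -> cpuset */

    printf("cpu_bitmap = %s (round-trip %s)\n", str,
           hwloc_bitmap_isequal(cpus, parsed) ? "ok" : "mismatch");

    free(str);
    hwloc_bitmap_free(parsed);
}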
static int bind_downwards(orte_job_t *jdata,
                          orte_node_t *node,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
    hwloc_cpuset_t cpus;
    unsigned int ncpus;
    opal_hwloc_obj_data_t *data;
    int total_cpus;
    hwloc_cpuset_t totalcpuset;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;
    totalcpuset = hwloc_bitmap_alloc();

    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* ignore procs that have already been bound - should
         * never happen, but safer
         */
        if (NULL != proc->cpu_bitmap) {
            continue;
        }
        /* we don't know if the target is a direct child of this locale,
         * or if it is some depth below it, so we have to conduct a bit
         * of a search. Let the hwloc support code find the least-bound
         * target under this object for us.
         */
        trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology,
                                                                  proc->locale,
                                                                  target, cache_level);
        if (NULL == trg_obj) {
            /* there aren't any such targets under this object */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
            hwloc_bitmap_free(totalcpuset);
            return ORTE_ERR_SILENT;
        }
        /* record the location */
        proc->bind_location = trg_obj;
        /* start with a clean slate */
        hwloc_bitmap_zero(totalcpuset);
        total_cpus = 0;
        nxt_obj = trg_obj;
        do {
            if (NULL == nxt_obj) {
                /* could not find enough cpus to meet request */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            trg_obj = nxt_obj;
            /* get the number of cpus under this location */
            ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s GOT %d CPUS",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
            /* track the number bound */
            if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
                data = OBJ_NEW(opal_hwloc_obj_data_t);
                trg_obj->userdata = data;
            }
            data->num_bound++;
            /* error out if adding a proc would cause overload and that wasn't allowed,
             * and it wasn't a default binding policy (i.e., the user requested it)
             */
            if (ncpus < data->num_bound &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                   opal_hwloc_base_print_binding(map->binding), node->name,
                                   data->num_bound, ncpus);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                } else {
                    /* if this is the default binding policy, then just don't
                     * bind this proc
                     */
                    data->num_bound--;  // maintain count
                    /* show the proc as not bound */
                    proc->bind_location = NULL;
                    hwloc_bitmap_zero(totalcpuset);
                    break;
                }
            }
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
            hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
            /* track total #cpus */
            total_cpus += ncpus;
            /* move to the next location, in case we need it */
            nxt_obj = trg_obj->next_cousin;
        } while (total_cpus < orte_rmaps_base.cpus_per_rank);
        hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, totalcpuset);
        if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
            char tmp1[1024], tmp2[1024];
            if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1),
                                                               node->topology, totalcpuset)) {
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s PROC %s ON %s IS NOT BOUND",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name);
            } else {
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology, totalcpuset);
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name,
                            tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(totalcpuset);
    
    return ORTE_SUCCESS;
}
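The do/while above keeps OR-ing cousin cpusets into totalcpuset until cpus_per_rank PUs have been collected. A condensed sketch of that accumulation, using raw obj->cpuset instead of the OPAL "available cpus" wrapper (function name assumed):

#include <hwloc.h>

/* Sketch: OR together the cpusets of consecutive cousins until at least
 * cpus_per_rank PUs are collected; fail if we run out of objects. */
static int collect_cpus(hwloc_obj_t start, int cpus_per_rank,
                        hwloc_cpuset_t result)
{
    hwloc_obj_t obj = start;
    int total = 0;

    hwloc_bitmap_zero(result);
    while (total < cpus_per_rank) {
        if (NULL == obj) {
            return -1;                          /* not enough cpus available */
        }
        hwloc_bitmap_or(result, result, obj->cpuset);
        total += hwloc_bitmap_weight(obj->cpuset);
        obj = obj->next_cousin;
    }
    return 0;
}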
static int bind_upwards(orte_job_t *jdata,
                        orte_node_t *node,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    opal_hwloc_obj_data_t *data;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;


    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* ignore procs that have already been bound - should
         * never happen, but safer
         */
        if (NULL != proc->cpu_bitmap) {
            continue;
        }
        /* bozo check */
        if (NULL == proc->locale) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BIND UPWARDS: LOCALE FOR PROC %s IS NULL",
                                ORTE_NAME_PRINT(&proc->name));
            return ORTE_ERR_SILENT;
        }
        /* starting at the locale, move up thru the parents
         * to find the target object type
         */
        for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind:upward target %s type %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                hwloc_obj_type_string(target),
                                hwloc_obj_type_string(obj->type));
            if (target == obj->type) {
                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                    continue;
                }
                /* get its index */
                if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
                    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                    return ORTE_ERR_SILENT;
                }
                /* track the number bound */
                data = (opal_hwloc_obj_data_t*)obj->userdata;
                data->num_bound++;
                /* get the number of cpus under this location */
                if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                    return ORTE_ERR_SILENT;
                }
                /* error out if adding a proc would cause overload and that wasn't allowed,
                 * and it wasn't a default binding policy (i.e., the user requested it)
                 */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
                    OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                   opal_hwloc_base_print_binding(map->binding), node->name,
                                   data->num_bound, ncpus);
                    return ORTE_ERR_SILENT;
                }
                /* bind it here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
                /* record the location */
                proc->bind_location = obj;
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name),
                                    proc->cpu_bitmap,
                                    hwloc_obj_type_string(target),
                                    idx, node->name);
                break;
            }
        }
        if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
            /* didn't find anyone to bind to - this is an error
             * unless the user specified if-supported
             */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
                           opal_hwloc_base_print_binding(map->binding), node->name);
            return ORTE_ERR_SILENT;
        }
    }

    return ORTE_SUCCESS;
}
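For non-cache targets, the parent walk above is what hwloc's own ancestor helper does; caches need the manual loop because hwloc 1.x has a single HWLOC_OBJ_CACHE type and the depth must be checked by hand. A sketch (helper name assumed):

#include <hwloc.h>

/* Sketch: find the nearest ancestor of the requested type, treating caches
 * specially so the requested cache level is honored (hwloc 1.x model). */
static hwloc_obj_t find_target_above(hwloc_topology_t topo,
                                     hwloc_obj_t locale,
                                     hwloc_obj_type_t target,
                                     unsigned cache_level)
{
    hwloc_obj_t obj;

    if (HWLOC_OBJ_CACHE != target) {
        /* built-in helper: walks parents until the type matches */
        return hwloc_get_ancestor_obj_by_type(topo, target, locale);
    }
    for (obj = locale->parent; NULL != obj; obj = obj->parent) {
        if (HWLOC_OBJ_CACHE == obj->type &&
            cache_level == obj->attr->cache.depth) {
            return obj;
        }
    }
    return NULL;
}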
Example #6
/* recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr
 */
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;
    int nprocs;
    hwloc_cpuset_t avail, cpus, childcpus;
    int n, limit, nmax, nunder, idx, idxmax = 0;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: pruning level %d",
                        *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        --(*level);
        prune(jobid, app_idx, node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        /* get the available cpuset */
        avail = opal_hwloc_base_get_available_cpus(node->topology, obj);

        /* look at the intersection of this object's cpuset and that
         * of each proc in the job/app - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            if (proc->name.jobid != jobid ||
                proc->app_idx != app_idx) {
                continue;
            }
            locale = NULL;
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return;
            }
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            if (hwloc_bitmap_intersects(avail, cpus)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this is a starting point
             */

            /* find the first level that has more than
             * one child beneath it - if all levels
             * have only one child, then return this
             * object
             */
            top = find_split(node->topology, obj);
            hwloc_obj_type_snprintf(dang, 64, top, 1);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                childcpus = opal_hwloc_base_get_available_cpus(node->topology, top->children[k]);
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    if (proc->name.jobid != jobid ||
                        proc->app_idx != app_idx) {
                        continue;
                    }
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        return;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
                    if (hwloc_bitmap_intersects(childcpus, cpus)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            node->slots_inuse--;
            if (node->slots_inuse < 0) {
                node->slots_inuse = 0;
            }
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }
    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    --(*level);
    prune(jobid, app_idx, node, level, nmapped);
    return;

 error:
    opal_output(0, "INFINITE LOOP");
}
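find_split() is called above but not shown. As a rough guess at its shape - the comment asks for "the first level that has more than one child beneath it" - something like the following descent would do; this is a sketch, not the actual implementation.

#include <hwloc.h>

/* Sketch: walk down through single-child levels and return the first object
 * that actually branches, so procs can be removed round-robin across its
 * children; if nothing branches, the leaf itself comes back. */
static hwloc_obj_t find_first_branch_point(hwloc_obj_t start)
{
    hwloc_obj_t obj = start;

    while (1 == obj->arity) {
        obj = obj->first_child;
    }
    return obj;
}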
Example #7
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale, sib;
    char *cpu_bitmap;
    bool found;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail unnecessarily. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* bozo check */
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, locale, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            data = (opal_hwloc_obj_data_t*)locale->userdata;
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* if we don't have enough cpus to support this additional proc, try
             * shifting the location to a cousin that can support it - the important
             * thing is that we maintain the same level in the topology */
            if (ncpus < (data->num_bound+1)) {
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s bind_in_place: searching right",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                sib = locale;
                found = false;
                while (NULL != (sib = sib->next_cousin)) {
                    data = (opal_hwloc_obj_data_t*)sib->userdata;
                    ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                    if (data->num_bound < ncpus) {
                        found = true;
                        locale = sib;
                        break;
                    }
                }
                if (!found) {
                    /* try the other direction */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s bind_in_place: searching left",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    sib = locale;
                    while (NULL != (sib = sib->prev_cousin)) {
                        data = (opal_hwloc_obj_data_t*)sib->userdata;
                        ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                        if (data->num_bound < ncpus) {
                            found = true;
                            locale = sib;
                            break;
                        }
                    }
                }
                if (!found) {
                    /* no place to put this - see if overload is allowed */
                    if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                            /* if the user specified a binding policy, then we cannot meet
                             * it since overload isn't allowed, so error out - have the
                             * message indicate that setting overload allowed will remove
                             * this restriction */
                            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                           opal_hwloc_base_print_binding(map->binding), node->name,
                                           data->num_bound, ncpus);
                            return ORTE_ERR_SILENT;
                        } else {
                            /* if we have the default binding policy, then just don't bind */
                            OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                            unbind_procs(jdata);
                            return ORTE_SUCCESS;
                        }
                    }
                }
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)locale->userdata;  // just in case it changed
            data->num_bound++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(locale->type), idx);
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
            orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
            /* update the location, in case it changed */
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, locale, OPAL_PTR);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                cpu_bitmap, hwloc_obj_type_string(locale->type),
                                idx, node->name);
            if (NULL != cpu_bitmap) {
                free(cpu_bitmap);
            }
        }
    }

    return ORTE_SUCCESS;
}
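The "searching right" / "searching left" cousin scan in the middle of this function can be abstracted into a small helper. The sketch below hides the per-object bookkeeping (which the real code reads from opal_hwloc_obj_data_t userdata) behind hypothetical callbacks.

#include <hwloc.h>

/* Sketch: starting from a full object, scan along the same topology level
 * (right, then left) for a cousin whose bound count is still below its cpu
 * count.  num_bound_of() and ncpus_of() stand in for the real bookkeeping. */
static hwloc_obj_t find_cousin_with_room(hwloc_obj_t locale,
                                         unsigned (*num_bound_of)(hwloc_obj_t),
                                         unsigned (*ncpus_of)(hwloc_obj_t))
{
    hwloc_obj_t sib;

    for (sib = locale->next_cousin; NULL != sib; sib = sib->next_cousin) {
        if (num_bound_of(sib) < ncpus_of(sib)) {
            return sib;                 /* room found to the right */
        }
    }
    for (sib = locale->prev_cousin; NULL != sib; sib = sib->prev_cousin) {
        if (num_bound_of(sib) < ncpus_of(sib)) {
            return sib;                 /* room found to the left */
        }
    }
    return NULL;                        /* level is full */
}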
Example #8
static int bind_upwards(orte_job_t *jdata,
                        orte_node_t *node,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale;
    char *cpu_bitmap;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;


    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* bozo check */
        if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
            return ORTE_ERR_SILENT;
        }
        /* starting at the locale, move up thru the parents
         * to find the target object type
         */
        cpu_bitmap = NULL;
        for (obj = locale->parent; NULL != obj; obj = obj->parent) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind:upward target %s type %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                hwloc_obj_type_string(target),
                                hwloc_obj_type_string(obj->type));
            if (target == obj->type) {
                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                    continue;
                }
                /* get its index */
                if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
                    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                    return ORTE_ERR_SILENT;
                }
                /* track the number bound */
                data = (opal_hwloc_obj_data_t*)obj->userdata;
                data->num_bound++;
                /* get the number of cpus under this location */
                if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                    return ORTE_ERR_SILENT;
                }
                /* error out if adding a proc would cause overload and that wasn't allowed,
                 * and it wasn't a default binding policy (i.e., the user requested it)
                 */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                    if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                        /* if the user specified a binding policy, then we cannot meet
                         * it since overload isn't allowed, so error out - have the
                         * message indicate that setting overload allowed will remove
                         * this restriction */
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                       opal_hwloc_base_print_binding(map->binding), node->name,
                                       data->num_bound, ncpus);
                        return ORTE_ERR_SILENT;
                    } else {
                        /* if we have the default binding policy, then just don't bind */
                        OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                        unbind_procs(jdata);
                        return ORTE_SUCCESS;
                    }
                }
                /* bind it here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
                orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
                /* record the location */
                orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name),
                                    cpu_bitmap,
                                    hwloc_obj_type_string(target),
                                    idx, node->name);
                break;
            }
        }
        if (NULL == cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
            /* didn't find anyone to bind to - this is an error
             * unless the user specified if-supported
             */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
                           opal_hwloc_base_print_binding(map->binding), node->name);
            return ORTE_ERR_SILENT;
        }
        if (NULL != cpu_bitmap) {
            free(cpu_bitmap);
        }
    }

    return ORTE_SUCCESS;
}
static int bind_upwards(orte_job_t *jdata,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* clear the topology of any prior usage numbers */
        opal_hwloc_base_clear_usage(node->topology);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* bozo check */
            if (NULL == proc->locale) {
                opal_output(0, "BIND UPWARDS: LOCALE FOR PROC %s IS NULL", ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* starting at the locale, move up thru the parents
             * to find the target object type
             */
            for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) {
                opal_output(0, "%s bind:upward target %s type %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            hwloc_obj_type_string(target),
                            hwloc_obj_type_string(obj->type));
                if (target == obj->type) {
                    if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                        continue;
                    }
                    /* get its index */
                    if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
                        return ORTE_ERR_SILENT;
                    }
                    /* track the number bound */
                    data = (opal_hwloc_obj_data_t*)obj->userdata;
                    data->num_bound++;
                    /* get the number of cpus under this location */
                    if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                        return ORTE_ERR_SILENT;
                    }
                    /* error out if adding a proc would cause overload and that wasn't allowed */
                    if (ncpus < data->num_bound &&
                        !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                       opal_hwloc_base_print_binding(map->binding), node->name,
                                       data->num_bound, ncpus);
                        return ORTE_ERR_SILENT;
                    }
                    /* bind it here */
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                    hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&proc->name),
                                        proc->cpu_bitmap,
                                        hwloc_obj_type_string(target),
                                        idx, node->name);
                    break;
                }
            }
            if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
                /* didn't find any object to bind to - this is an error
                 * unless the user specified the if-supported modifier
                 */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
                               opal_hwloc_base_print_binding(map->binding), node->name);
                return ORTE_ERR_SILENT;
            }
        }
    }

    return ORTE_SUCCESS;
}
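/*
 * A minimal standalone sketch of the upward walk used above, assuming only
 * the plain hwloc 1.x API (<hwloc.h>) rather than the opal_hwloc helpers:
 * starting from a leaf object (e.g., the proc's locale), climb the parent
 * chain until an object of the requested type - and, for caches, the
 * requested depth - is found.
 */
static hwloc_obj_t walk_up_to_type(hwloc_obj_t start,
                                   hwloc_obj_type_t target,
                                   unsigned cache_level)
{
    hwloc_obj_t obj;

    for (obj = start->parent; NULL != obj; obj = obj->parent) {
        if (target != obj->type) {
            continue;
        }
        /* for caches, also match the requested level (L1, L2, ...) */
        if (HWLOC_OBJ_CACHE == target &&
            cache_level != obj->attr->cache.depth) {
            continue;
        }
        return obj;
    }
    return NULL;  /* no enclosing object of the requested type */
}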
static int bind_downwards(orte_job_t *jdata,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
    hwloc_cpuset_t cpus;
    unsigned int ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    int total_cpus;
    hwloc_cpuset_t totalcpuset;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;
    totalcpuset = hwloc_bitmap_alloc();

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* clear the topology of any prior usage numbers */
        opal_hwloc_base_clear_usage(node->topology);

        /* cycle through the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - this should
             * never happen, but it is safer to check
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* we don't know if the target is a direct child of this locale,
             * or if it lies some depth below it, so we have to search for it.
             * Let hwloc find the least-used (minimum-bound) target for us.
             */
            trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology,
                                                                      proc->locale,
                                                                      target, cache_level);
            if (NULL == trg_obj) {
                /* there aren't any such targets under this object */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            /* start with a clean slate */
            hwloc_bitmap_zero(totalcpuset);
            total_cpus = 0;
            nxt_obj = trg_obj;
            do {
                if (NULL == nxt_obj) {
                    /* could not find enough cpus to meet request */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
                trg_obj = nxt_obj;
                /* get the number of cpus under this location */
                ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s GOT %d CPUS",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
                /* track the number bound */
                if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
                    data = OBJ_NEW(opal_hwloc_obj_data_t);
                    trg_obj->userdata = data;
                }
                data->num_bound++;
                /* error out if adding a proc would cause overload and that wasn't allowed */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                   opal_hwloc_base_print_binding(map->binding), node->name,
                                   data->num_bound, ncpus);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
                /* bind the proc here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
                hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
                total_cpus += ncpus;
                /* move to the next location, in case we need it */
                nxt_obj = trg_obj->next_cousin;
            } while (total_cpus < orte_rmaps_base.cpus_per_rank);
            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, totalcpuset);
            if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
                char tmp1[1024], tmp2[1024];
                opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), totalcpuset);
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), totalcpuset);
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name,
                            tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(totalcpuset);

    return ORTE_SUCCESS;
}
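/*
 * A minimal standalone sketch of the cousin walk used by bind_downwards,
 * assuming only the plain hwloc 1.x API: OR successive next_cousin cpusets
 * together until at least cpus_per_rank PUs have been accumulated. Unlike
 * the code above, this uses the raw obj->cpuset rather than Open MPI's
 * cached "available" cpusets. The caller owns (and must free) the result;
 * NULL is returned if the request cannot be met.
 */
static hwloc_cpuset_t accumulate_cousins(hwloc_obj_t start, int cpus_per_rank)
{
    hwloc_cpuset_t total = hwloc_bitmap_alloc();
    hwloc_obj_t obj = start;

    hwloc_bitmap_zero(total);
    while (hwloc_bitmap_weight(total) < cpus_per_rank) {
        if (NULL == obj) {
            /* ran out of peer objects before meeting the request */
            hwloc_bitmap_free(total);
            return NULL;
        }
        hwloc_bitmap_or(total, total, obj->cpuset);
        obj = obj->next_cousin;
    }
    return total;
}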
Example #11
0
int orte_ess_base_proc_binding(void)
{
#if OPAL_HAVE_HWLOC
    hwloc_obj_t node, obj;
    hwloc_cpuset_t cpus, nodeset;
    hwloc_obj_type_t target;
    unsigned int cache_level = 0;
    struct hwloc_topology_support *support;
    char *map;
    int ret;
    char *error;

    /* Determine if we were pre-bound or not */
    if (NULL != getenv("OMPI_MCA_orte_bound_at_launch")) {
        orte_proc_is_bound = true;
        if (NULL != (map = getenv("OMPI_MCA_orte_base_applied_binding"))) {
            orte_proc_applied_binding = hwloc_bitmap_alloc();
            if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) {
                error = "applied_binding parse";
                goto error;
            }
        }
    }

    /* see if we were bound when launched */
    if (!orte_proc_is_bound) {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Not bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* we were not bound at launch */
        if (NULL != opal_hwloc_topology) {
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
            /* get our node object */
            node = hwloc_get_root_obj(opal_hwloc_topology);
            nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node);
            /* get our bindings */
            cpus = hwloc_bitmap_alloc();
            if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) {
                /* we are NOT bound if get_cpubind fails, nor can we be bound - the
                 * environment does not support it
                 */
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Binding not supported",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto MOVEON;
            }
            /* we are bound if the two cpusets are not equal,
             * or if there is only ONE cpu available to us
             */
            if (0 != hwloc_bitmap_compare(cpus, nodeset) ||
                opal_hwloc_base_single_cpu(nodeset) ||
                opal_hwloc_base_single_cpu(cpus)) {
                /* someone external set the binding - record that it is
                 * set so that we know
                 */
                orte_proc_is_bound = true;
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Process was externally bound",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            } else if (support->cpubind->set_thisproc_cpubind &&
                       OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                       OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                /* the system is capable of doing processor affinity, but it
                 * has not yet been set - see if a slot_list was given
                 */
                hwloc_bitmap_zero(cpus);
                if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
                                                                               opal_hwloc_topology, cpus))) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    /* try to find a level and index for this location */
                    opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level, &orte_process_info.bind_idx);
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    orte_proc_is_bound = true;
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                         "%s Process bound according to slot_list",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else {
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    /* get the node rank */
                    if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) {
                        /* this is not an error - could be due to being
                         * direct launched - so just ignore and leave
                         * us unbound
                         */
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process not bound - no node rank available",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                        goto MOVEON;
                    }
                    /* if the binding policy is hwthread, then we bind to the nrank-th
                     * hwthread on this node
                     */
                    if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting hwthread object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to hwthread",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        /* if the binding policy is core, then we bind to the nrank-th
                         * core on this node
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            error = "Setting processor affinity failed";
                            ret = ORTE_ERROR;
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to core",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else {
                        /* for all higher binding policies, we bind to the specified
                         * object that the nrank-th core belongs to
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 1;
                            orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 2;
                            orte_process_info.bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 3;
                            orte_process_info.bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_SOCKET;
                            orte_process_info.bind_level = OPAL_HWLOC_SOCKET_LEVEL;
                        } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_NODE;
                            orte_process_info.bind_level = OPAL_HWLOC_NUMA_LEVEL;
                        } else {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Binding policy not known";
                            goto error;
                        }
                        for (obj = obj->parent; NULL != obj; obj = obj->parent) {
                            if (target == obj->type) {
                                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                                    continue;
                                }
                                /* this is the place! */
                                cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                                if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                    ret = ORTE_ERROR;
                                    error = "Setting processor affinity failed";
                                    goto error;
                                }
                                orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
                                                                                         obj, OPAL_HWLOC_LOGICAL);
                                orte_proc_is_bound = true;
                                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                                     "%s Process bound to %s",
                                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                     opal_hwloc_base_print_level(orte_process_info.bind_level)));
                                break;
                            }
                        }
                        if (!orte_proc_is_bound) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                    }
                }
            }
        }
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Process bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

 MOVEON:
    /* get or update our local cpuset - it will get used multiple
     * times, so it's more efficient to keep a global copy
     */
    opal_hwloc_base_get_local_cpuset();
    /* report bindings, if requested */
    if (opal_hwloc_report_bindings) {
        char bindings[64];
        hwloc_obj_t root;
        hwloc_cpuset_t cpus;
        /* get the root object for this node */
        root = hwloc_get_root_obj(opal_hwloc_topology);
        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
        /* we are not bound if this equals our cpuset */
        if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
            opal_output(0, "%s is not bound",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        } else {
            hwloc_bitmap_list_snprintf(bindings, 64, opal_hwloc_my_cpuset);
            opal_output(0, "%s is bound to cpus %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        bindings);
        }
    }

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ORTE_ERR_SILENT;

#else
    return ORTE_SUCCESS;
#endif
}
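/*
 * A minimal standalone sketch of the bind-to-core branch above, assuming
 * only the plain hwloc 1.x API: bind the calling process to the idx-th
 * logical core and return the binding as a list string (e.g., "0-3,8"),
 * which the caller must free. NULL is returned if the core does not exist
 * or the environment refuses the binding.
 */
static char *bind_self_to_core(hwloc_topology_t topo, unsigned idx)
{
    hwloc_obj_t core;
    char *cpuset_str = NULL;

    core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, idx);
    if (NULL == core) {
        return NULL;
    }
    if (0 > hwloc_set_cpubind(topo, core->cpuset, HWLOC_CPUBIND_PROCESS)) {
        return NULL;
    }
    hwloc_bitmap_list_asprintf(&cpuset_str, core->cpuset);
    return cpuset_str;
}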
Example #12
0
int orte_ess_base_proc_binding(void)
{
    hwloc_obj_t node, obj;
    hwloc_cpuset_t cpus, nodeset;
    hwloc_obj_type_t target;
    unsigned int cache_level = 0;
    struct hwloc_topology_support *support;
    char *map;
    int ret;
    char *error=NULL;
    hwloc_cpuset_t mycpus;

    /* Determine if we were pre-bound or not */
    if (NULL != getenv(OPAL_MCA_PREFIX"orte_bound_at_launch")) {
        orte_proc_is_bound = true;
        if (NULL != (map = getenv(OPAL_MCA_PREFIX"orte_base_applied_binding"))) {
            orte_proc_applied_binding = hwloc_bitmap_alloc();
            if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) {
                error = "applied_binding parse";
                goto error;
            }
        }
    }

    /* see if we were bound when launched */
    if (!orte_proc_is_bound) {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                             "%s Not bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* we were not bound at launch */
        if (NULL == opal_hwloc_topology) {
            /* there is nothing we can do, so just return */
            return ORTE_SUCCESS;
        }
        support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
        /* get our node object */
        node = hwloc_get_root_obj(opal_hwloc_topology);
        nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node);
        /* get our bindings */
        cpus = hwloc_bitmap_alloc();
        if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) {
            /* we are NOT bound if get_cpubind fails, nor can we be bound - the
             * environment does not support it
             */
            hwloc_bitmap_free(cpus);
            OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                 "%s Binding not supported",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto MOVEON;
        }
        /* we are bound if the two cpusets are not equal,
         * or if there is only ONE cpu available to us
         */
        if (0 != hwloc_bitmap_compare(cpus, nodeset) ||
            opal_hwloc_base_single_cpu(nodeset) ||
            opal_hwloc_base_single_cpu(cpus)) {
            /* someone external set the binding - record that it is
             * set so that we know
             */
            orte_proc_is_bound = true;
            hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus);
            hwloc_bitmap_free(cpus);
            OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                 "%s Process was externally bound",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        } else if (support->cpubind->set_thisproc_cpubind &&
                   OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                   OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
            /* the system is capable of doing processor affinity, but it
             * has not yet been set - see if a slot_list was given
             */
            hwloc_bitmap_zero(cpus);
            if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
                                                                           opal_hwloc_topology,
                                                                           OPAL_HWLOC_LOGICAL, cpus))) {
                    error = "Setting processor affinity failed";
                    hwloc_bitmap_free(cpus);
                    goto error;
                }
                if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                    error = "Setting processor affinity failed";
                    hwloc_bitmap_free(cpus);
                    goto error;
                }
                hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus);
                hwloc_bitmap_free(cpus);
                orte_proc_is_bound = true;
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                     "%s Process bound according to slot_list",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            } else {
                /* cleanup */
                hwloc_bitmap_free(cpus);
                /* get the node rank */
                if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) {
                    /* this is not an error - could be due to being
                     * direct launched - so just ignore and leave
                     * us unbound
                     */
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                         "%s Process not bound - no node rank available",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    goto MOVEON;
                }
                /* if the binding policy is hwthread, then we bind to the nrank-th
                 * hwthread on this node
                 */
                if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU,
                                                                       0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                        ret = ORTE_ERR_NOT_FOUND;
                        error = "Getting hwthread object";
                        goto error;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        ret = ORTE_ERROR;
                        error = "Setting processor affinity failed";
                        goto error;
                    }
                    hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus);
                    hwloc_bitmap_free(cpus);
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                         "%s Process bound to hwthread",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    /* if the binding policy is core, then we bind to the nrank-th
                     * core on this node
                     */
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                       0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                        ret = ORTE_ERR_NOT_FOUND;
                        error = "Getting core object";
                        goto error;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        error = "Setting processor affinity failed";
                        ret = ORTE_ERROR;
                        goto error;
                    }
                    hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus);
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                         "%s Process bound to core",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else {
                    /* for all higher binding policies, we bind to the specified
                     * object that the nrank-th core belongs to
                     */
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                       0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                        ret = ORTE_ERR_NOT_FOUND;
                        error = "Getting core object";
                        goto error;
                    }
                    if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_CACHE;
                        cache_level = 1;
                    } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_CACHE;
                        cache_level = 2;
                    } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_CACHE;
                        cache_level = 3;
                    } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_SOCKET;
                    } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        target = HWLOC_OBJ_NODE;
                    } else {
                        ret = ORTE_ERR_NOT_FOUND;
                        error = "Binding policy not known";
                        goto error;
                    }
                    for (obj = obj->parent; NULL != obj; obj = obj->parent) {
                        if (target == obj->type) {
                            if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                                continue;
                            }
                            /* this is the place! */
                            cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                            if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                ret = ORTE_ERROR;
                                error = "Setting processor affinity failed";
                                goto error;
                            }
                            hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus);
                            orte_proc_is_bound = true;
                            OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                                                 "%s Process bound to %s",
                                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                 hwloc_obj_type_string(target)));
                            break;
                        }
                    }
                    if (!orte_proc_is_bound) {
                        ret = ORTE_ERROR;
                        error = "Setting processor affinity failed";
                        goto error;
                    }
                }
            }
        }
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                             "%s Process bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

 MOVEON:
    /* get or update our local cpuset - it will get used multiple
     * times, so it's more efficient to keep a global copy
     */
    opal_hwloc_base_get_local_cpuset();

    /* get the cpus we are bound to */
    mycpus = hwloc_bitmap_alloc();
    if (hwloc_get_cpubind(opal_hwloc_topology,
                          mycpus,
                          HWLOC_CPUBIND_PROCESS) < 0) {
        if (NULL != orte_process_info.cpuset) {
            free(orte_process_info.cpuset);
            orte_process_info.cpuset = NULL;
        }
        if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            opal_output(0, "MCW rank %d is not bound",
                        ORTE_PROC_MY_NAME->vpid);
        }
    } else {
        /* store/update the string representation of our local binding */
        if (NULL != orte_process_info.cpuset) {
            free(orte_process_info.cpuset);
            orte_process_info.cpuset = NULL;
        }
        hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, mycpus);
        /* report the binding, if requested */
        if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            char tmp1[1024], tmp2[1024];
            if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), opal_hwloc_topology, mycpus)) {
                opal_output(0, "MCW rank %d is not bound (or bound to all available processors)", ORTE_PROC_MY_NAME->vpid);
            } else {
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), opal_hwloc_topology, mycpus);
                opal_output(0, "MCW rank %d bound to %s: %s",
                            ORTE_PROC_MY_NAME->vpid, tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(mycpus);
    /* push our cpuset so others can calculate our locality */
    if (NULL != orte_process_info.cpuset) {
        OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_CPUSET,
                              orte_process_info.cpuset, OPAL_STRING);
    }
    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ORTE_ERR_SILENT;
}
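/*
 * A minimal standalone sketch of the string round-trip used above to pass a
 * binding between processes (e.g., the applied_binding environment value),
 * assuming only the plain hwloc 1.x API plus <stdlib.h>:
 * hwloc_bitmap_list_asprintf() renders the cpuset as a list string and
 * hwloc_bitmap_list_sscanf() reconstructs a bitmap from it.
 * Returns 0 on success.
 */
static int cpuset_string_roundtrip(hwloc_const_cpuset_t in, hwloc_cpuset_t out)
{
    char *str = NULL;
    int rc;

    if (0 > hwloc_bitmap_list_asprintf(&str, in)) {
        return -1;
    }
    rc = hwloc_bitmap_list_sscanf(out, str);  /* 0 on success, -1 on error */
    free(str);
    return rc;
}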