Example #1
0
/*
 * Sequentially map the ranks according to the placement in the
 * specified hostfile
 */
static int orte_rmaps_seq_map(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_app_context_t *app;
    int i, n;
    orte_std_cntr_t j;
    opal_list_item_t *item;
    orte_node_t *node, *nd;
    seq_node_t *sq, *save=NULL, *seq;;
    orte_vpid_t vpid;
    orte_std_cntr_t num_nodes;
    int rc;
    opal_list_t default_seq_list;
    opal_list_t node_list, *seq_list, sq_list;
    orte_proc_t *proc;
    mca_base_component_t *c = &mca_rmaps_seq_component.base_version;
    char *hosts = NULL, *sep, *eptr;
    FILE *fp;
    opal_hwloc_resource_type_t rtype;

    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base_framework.framework_output,
                         "%s rmaps:seq called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* this mapper can only handle initial launch
     * when seq mapping is desired - allow
     * restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: job %s is being restarted - seq cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper) {
        if (0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
            /* a mapper has been specified, and it isn't me */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: job %s not using sequential mapper",
                                ORTE_JOBID_PRINT(jdata->jobid));
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
        /* we need to process it */
        goto process;
    }
    if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: job %s not using seq mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

process:
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:seq: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* convenience def */
    map = jdata->map;

    /* if there is a default hostfile, go and get its ordered list of nodes */
    OBJ_CONSTRUCT(&default_seq_list, opal_list_t);
    if (NULL != orte_default_hostfile) {
        char *hstname = NULL;
        /* open the file */
        fp = fopen(orte_default_hostfile, "r");
        if (NULL == fp) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto error;
        }
        while (NULL != (hstname = orte_getline(fp))) {
            if (0 == strlen(hstname)) {
                free(hstname);
                /* blank line - ignore */
                continue;
            }
            if( '#' == hstname[0] ) {
                free(hstname);
                /* Comment line - ignore */
                continue;
            }
            sq = OBJ_NEW(seq_node_t);
            if (NULL != (sep = strchr(hstname, ' '))) {
                *sep = '\0';
                sep++;
                /* remove any trailing space */
                eptr = sep + strlen(sep) - 1;
                while (eptr > sep && isspace(*eptr)) {
                    eptr--;
                }
                *(eptr+1) = 0;
                sq->cpuset = strdup(sep);
            }

            // Strip off the FQDN if present, ignore IP addresses
            if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname) ) {
                char *ptr;
                if (NULL != (ptr = strchr(hstname, '.'))) {
                    *ptr = '\0';
                }
            }

            sq->hostname = hstname;
            opal_list_append(&default_seq_list, &sq->super);
        }
        fclose(fp);
    }

    /* start at the beginning... */
    vpid = 0;
    jdata->num_procs = 0;
    if (0 < opal_list_get_size(&default_seq_list)) {
        save = (seq_node_t*)opal_list_get_first(&default_seq_list);
    }

    /* default to LOGICAL processors */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, NULL, OPAL_BOOL)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: using PHYSICAL processors");
        rtype = OPAL_HWLOC_PHYSICAL;
    } else {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: using LOGICAL processors");
        rtype = OPAL_HWLOC_LOGICAL;
    }

    /* initialize all the nodes as not included in this job map */
    for (j=0; j < orte_node_pool->size; j++) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
        }
    }

    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }

        /* dash-host trumps hostfile */
        if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using dash-host nodes on app %s", app->app);
            OBJ_CONSTRUCT(&node_list, opal_list_t);
            /* dash host entries cannot specify cpusets, so used the std function to retrieve the list */
            if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(&node_list, hosts))) {
                ORTE_ERROR_LOG(rc);
                free(hosts);
                goto error;
            }
            free(hosts);
            /* transfer the list to a seq_node_t list */
            OBJ_CONSTRUCT(&sq_list, opal_list_t);
            while (NULL != (nd = (orte_node_t*)opal_list_remove_first(&node_list))) {
                sq = OBJ_NEW(seq_node_t);
                sq->hostname = strdup(nd->name);
                opal_list_append(&sq_list, &sq->super);
                OBJ_RELEASE(nd);
            }
            OBJ_DESTRUCT(&node_list);
            seq_list = &sq_list;
        } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
            char *hstname;
            if (NULL == hosts) {
                rc = ORTE_ERR_NOT_FOUND;
                goto error;
            }
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using hostfile %s nodes on app %s", hosts, app->app);
            OBJ_CONSTRUCT(&sq_list, opal_list_t);
            /* open the file */
            fp = fopen(hosts, "r");
            if (NULL == fp) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                OBJ_DESTRUCT(&sq_list);
                goto error;
            }
            while (NULL != (hstname = orte_getline(fp))) {
                if (0 == strlen(hstname)) {
                    free(hstname);
                    /* blank line - ignore */
                    continue;
                }
                if( '#' == hstname[0] ) {
                    free(hstname);
                    /* Comment line - ignore */
                    continue;
                }
                sq = OBJ_NEW(seq_node_t);
                if (NULL != (sep = strchr(hstname, ' '))) {
                    *sep = '\0';
                    sep++;
                    /* remove any trailing space */
                    eptr = sep + strlen(sep) - 1;
                    while (eptr > sep && isspace(*eptr)) {
                        eptr--;
                    }
                    *(eptr+1) = 0;
                    sq->cpuset = strdup(sep);
                }

                // Strip off the FQDN if present, ignore IP addresses
                if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname) ) {
                    char *ptr;
                    if (NULL != (ptr = strchr(hstname, '.'))) {
                        (*ptr) = '\0';
                    }
                }

                sq->hostname = hstname;
                opal_list_append(&sq_list, &sq->super);
            }
            fclose(fp);
            free(hosts);
            seq_list = &sq_list;
        } else if (0 < opal_list_get_size(&default_seq_list)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using default hostfile nodes on app %s", app->app);
            seq_list = &default_seq_list;
        } else {
            /* can't do anything - no nodes available! */
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
            return ORTE_ERR_SILENT;
        }

        /* check for nolocal and remove the head node, if required */
        if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) {
            for (item  = opal_list_get_first(seq_list);
                    item != opal_list_get_end(seq_list);
                    item  = opal_list_get_next(item) ) {
                seq = (seq_node_t*)item;
                /* need to check ifislocal because the name in the
                 * hostfile may not have been FQDN, while name returned
                 * by gethostname may have been (or vice versa)
                 */
                if (orte_ifislocal(seq->hostname)) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:seq: removing head node %s", seq->hostname);
                    opal_list_remove_item(seq_list, item);
                    OBJ_RELEASE(item);  /* "un-retain" it */
                }
            }
        }

        if (NULL == seq_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(seq_list))) {
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
            return ORTE_ERR_SILENT;
        }

        /* if num_procs wasn't specified, set it now */
        if (0 == app->num_procs) {
            app->num_procs = num_nodes;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: setting num procs to %s for app %s",
                                ORTE_VPID_PRINT(app->num_procs), app->app);
        } else if (num_nodes < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "seq:not-enough-resources", true,
                           app->num_procs, num_nodes);
            return ORTE_ERR_SILENT;
        }

        if (seq_list == &default_seq_list) {
            sq = save;
        } else {
            sq = (seq_node_t*)opal_list_get_first(seq_list);
        }
        for (n=0; n < app->num_procs; n++) {
            /* find this node on the global array - this is necessary so
             * that our mapping gets saved on that array as the objects
             * returned by the hostfile function are -not- on the array
             */
            node = NULL;
            for (j=0; j < orte_node_pool->size; j++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
                    continue;
                }
                if (0 == strcmp(sq->hostname, node->name)) {
                    break;
                }
            }
            if (NULL == node) {
                /* wasn't found - that is an error */
                orte_show_help("help-orte-rmaps-seq.txt",
                               "orte-rmaps-seq:resource-not-found",
                               true, sq->hostname);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* ensure the node is in the map */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                OBJ_RETAIN(node);
                opal_pointer_array_add(map->nodes, node);
                jdata->map->num_nodes++;
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
            }
            proc = orte_rmaps_base_setup_proc(jdata, node, i);
            if ((node->slots < (int)node->num_procs) ||
                    (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
                /* check for permission */
                if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
                    /* if we weren't given a directive either way, then we will error out
                     * as the #slots were specifically given, either by the host RM or
                     * via hostfile/dash-host */
                    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, app->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        return ORTE_ERR_SILENT;
                    } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                        /* if we were explicitly told not to oversubscribe, then don't */
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, app->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        return ORTE_ERR_SILENT;
                    }
                }
            }
            /* assign the vpid */
            proc->name.vpid = vpid++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: assign proc %s to node %s for app %s",
                                ORTE_VPID_PRINT(proc->name.vpid), sq->hostname, app->app);

            /* record the cpuset, if given */
            if (NULL != sq->cpuset) {
                hwloc_cpuset_t bitmap;
                char *cpu_bitmap;
                if (NULL == node->topology) {
                    /* not allowed - for sequential cpusets, we must have
                     * the topology info
                     */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* if we are using hwthreads as cpus and binding to hwthreads, then
                 * we can just copy the cpuset across as it already specifies things
                 * at that level */
                if (opal_hwloc_use_hwthreads_as_cpus &&
                        OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    cpu_bitmap = strdup(sq->cpuset);
                } else {
                    /* setup the bitmap */
                    bitmap = hwloc_bitmap_alloc();
                    /* parse the slot_list to find the socket and core */
                    if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(sq->cpuset, node->topology, rtype, bitmap))) {
                        ORTE_ERROR_LOG(rc);
                        hwloc_bitmap_free(bitmap);
                        goto error;
                    }
                    /* note that we cannot set the proc locale to any specific object
                     * as the slot list may have assigned it to more than one - so
                     * leave that field NULL
                     */
                    /* set the proc to the specified map */
                    hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap);
                    hwloc_bitmap_free(bitmap);
                }
                orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:seq: binding proc %s to cpuset %s bitmap %s",
                                    ORTE_VPID_PRINT(proc->name.vpid), sq->cpuset, cpu_bitmap);
                /* we are going to bind to cpuset since the user is specifying the cpus */
                OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CPUSET);
                /* note that the user specified the mapping */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYUSER);
                ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN);
                /* cleanup */
                free(cpu_bitmap);
            } else {
                hwloc_obj_t locale;

                /* assign the locale - okay for the topo to be null as
                 * it just means it wasn't returned
                 */
                if (NULL != node->topology) {
                    locale = hwloc_get_root_obj(node->topology);
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                                       ORTE_ATTR_LOCAL, locale, OPAL_PTR);
                }
            }

            /* add to the jdata proc array */
            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }
            /* move to next node */
            sq = (seq_node_t*)opal_list_get_next(&sq->super);
        }

        /** track the total number of processes we mapped */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list if it came from this app_context */
        if (seq_list != &default_seq_list) {
            OPAL_LIST_DESTRUCT(seq_list);
        } else {
            save = sq;
        }
    }

    return ORTE_SUCCESS;

error:
    OPAL_LIST_DESTRUCT(&default_seq_list);
    return rc;
}
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
{
    hwloc_obj_type_t hwb, hwm;
    unsigned clvl=0, clvm=0;
    opal_binding_policy_t bind;
    orte_mapping_policy_t map;
    orte_node_t *node;
    int i, rc;
    struct hwloc_topology_support *support;
    bool force_down = false;
    hwloc_cpuset_t totalcpuset;
    int bind_depth, map_depth;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: compute bindings for job %s with policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));

    map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping);
    bind = OPAL_GET_BINDING_POLICY(jdata->map->binding);

    if (ORTE_MAPPING_BYUSER == map) {
        /* user specified binding by rankfile - nothing for us to do */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_CPUSET == bind) {
        int rc;
        /* cpuset was given - setup the bindings */
        if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (OPAL_BIND_TO_NONE == bind) {
        /* no binding requested */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_BOARD == bind) {
        /* doesn't do anything at this time */
        return ORTE_SUCCESS;
    }

    /* binding requested - convert the binding level to the hwloc obj type */
    switch (bind) {
    case OPAL_BIND_TO_NUMA:
        hwb = HWLOC_OBJ_NODE;
        break;
    case OPAL_BIND_TO_SOCKET:
        hwb = HWLOC_OBJ_SOCKET;
        break;
    case OPAL_BIND_TO_L3CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 3;
        break;
    case OPAL_BIND_TO_L2CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 2;
        break;
    case OPAL_BIND_TO_L1CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 1;
        break;
    case OPAL_BIND_TO_CORE:
        hwb = HWLOC_OBJ_CORE;
        break;
    case OPAL_BIND_TO_HWTHREAD:
        hwb = HWLOC_OBJ_PU;
        break;
    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* do the same for the mapping policy */
    switch (map) {
    case ORTE_MAPPING_BYNODE:
    case ORTE_MAPPING_BYSLOT:
    case ORTE_MAPPING_SEQ:
        hwm = HWLOC_OBJ_MACHINE;
        break;
    case ORTE_MAPPING_BYDIST:
    case ORTE_MAPPING_BYNUMA:
        hwm = HWLOC_OBJ_NODE;
        break;
    case ORTE_MAPPING_BYSOCKET:
        hwm = HWLOC_OBJ_SOCKET;
        break;
    case ORTE_MAPPING_BYL3CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 3;
        break;
    case ORTE_MAPPING_BYL2CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 2;
        break;
    case ORTE_MAPPING_BYL1CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 1;
        break;
    case ORTE_MAPPING_BYCORE:
        hwm = HWLOC_OBJ_CORE;
        break;
    case ORTE_MAPPING_BYHWTHREAD:
        hwm = HWLOC_OBJ_PU;
        break;
    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* if the job was mapped by the corresponding target, then
     * we bind in place
     *
     * otherwise, we have to bind either up or down the hwloc
     * tree. If we are binding upwards (e.g., mapped to hwthread
     * but binding to core), then we just climb the tree to find
     * the first matching object.
     *
     * if we are binding downwards (e.g., mapped to node and bind
     * to core), then we have to do a round-robin assigment of
     * procs to the resources below.
     */

    if (ORTE_MAPPING_BYDIST == map) {
        int rc = ORTE_SUCCESS;
        if (OPAL_BIND_TO_NUMA == bind) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - dist to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else if (OPAL_BIND_TO_NUMA < bind) {
            /* bind every proc downwards */
            force_down = true;
            goto execute;
        }
        /* if the binding policy is less than numa, then we are unbound - so
         * just ignore this and return (should have been caught in prior
         * tests anyway as only options meeting that criteria are "none"
         * and "board")
         */
        return rc;
    }

    /* now deal with the remaining binding policies based on hardware */
    if (bind == map) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps: bindings for job %s - bind in place",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    /* we need to handle the remaining binding options on a per-node
     * basis because different nodes could potentially have different
     * topologies, with different relative depths for the two levels
     */
 execute:
    /* initialize */
    totalcpuset = hwloc_bitmap_alloc();

    for (i=0; i < jdata->map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(jdata->map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        if (force_down) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        } else {
            /* determine the relative depth on this node */
            if (HWLOC_OBJ_CACHE == hwb) {
                /* must use a unique function because blasted hwloc
                 * just doesn't deal with caches very well...sigh
                 */
                bind_depth = hwloc_get_cache_type_depth(node->topology, clvl, -1);
            } else {
                bind_depth = hwloc_get_type_depth(node->topology, hwb);
            }
            if (0 > bind_depth) {
                /* didn't find such an object */
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
                               true, hwloc_obj_type_string(hwb), node->name);
                return ORTE_ERR_SILENT;
            }
            if (HWLOC_OBJ_CACHE == hwm) {
                /* must use a unique function because blasted hwloc
                 * just doesn't deal with caches very well...sigh
                 */
                map_depth = hwloc_get_cache_type_depth(node->topology, clvm, -1);
            } else {
                map_depth = hwloc_get_type_depth(node->topology, hwm);
            }
            if (0 > map_depth) {
                /* didn't find such an object */
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
                               true, hwloc_obj_type_string(hwm), node->name);
                return ORTE_ERR_SILENT;
            }
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind_depth: %d map_depth %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                bind_depth, map_depth);
            if (bind_depth > map_depth) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            } else {
                if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
Example #3
0
int orte_rmaps_base_set_ranking_policy(orte_ranking_policy_t *policy,
                                       orte_mapping_policy_t mapping,
                                       char *spec)
{
    orte_mapping_policy_t map;
    orte_ranking_policy_t tmp;
    char **ck;
    size_t len;

    /* set default */
    tmp = 0;

    if (NULL == spec) {
        /* check for map-by object directives - we set the
         * ranking to match if one was given
         */
        if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
            map = ORTE_GET_MAPPING_POLICY(mapping);
            switch (map) {
            case ORTE_MAPPING_BYSLOT:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT);
                break;
            case ORTE_MAPPING_BYNODE:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NODE);
                break;
            case ORTE_MAPPING_BYCORE:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_CORE);
                break;
            case ORTE_MAPPING_BYL1CACHE:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L1CACHE);
                break;
            case ORTE_MAPPING_BYL2CACHE:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L2CACHE);
                break;
            case ORTE_MAPPING_BYL3CACHE:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L3CACHE);
                break;
            case ORTE_MAPPING_BYSOCKET:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SOCKET);
                break;
            case ORTE_MAPPING_BYNUMA:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NUMA);
                break;
            case ORTE_MAPPING_BYBOARD:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_BOARD);
                break;
            case ORTE_MAPPING_BYHWTHREAD:
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_HWTHREAD);
                break;
            default:
                /* anything not tied to a specific hw obj can rank by slot */
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT);
                break;
            }
        } else {
            /* if no map-by was given, default to by-slot */
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT);
        }
    } else {
        ck = opal_argv_split(spec, ':');
        if (2 < opal_argv_count(ck)) {
            /* incorrect format */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", policy);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        if (2 == opal_argv_count(ck)) {
            if (0 == strncasecmp(ck[1], "span", strlen(ck[1]))) {
                ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_SPAN);
            } else if (0 == strncasecmp(ck[1], "fill", strlen(ck[1]))) {
                ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_FILL);
            } else {
                /* unrecognized modifier */
                orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, ck[1]);
                opal_argv_free(ck);
                return ORTE_ERR_SILENT;
            }
        }
        len = strlen(ck[0]);
        if (0 == strncasecmp(ck[0], "slot", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT);
        } else if (0 == strncasecmp(ck[0], "node", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NODE);
        } else if (0 == strncasecmp(ck[0], "hwthread", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_HWTHREAD);
        } else if (0 == strncasecmp(ck[0], "core", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_CORE);
        } else if (0 == strncasecmp(ck[0], "l1cache", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L1CACHE);
        } else if (0 == strncasecmp(ck[0], "l2cache", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L2CACHE);
        } else if (0 == strncasecmp(ck[0], "l3cache", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L3CACHE);
        } else if (0 == strncasecmp(ck[0], "socket", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SOCKET);
        } else if (0 == strncasecmp(ck[0], "numa", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NUMA);
        } else if (0 == strncasecmp(ck[0], "board", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_BOARD);
        } else {
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", rmaps_base_ranking_policy);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
        ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_GIVEN);
    }

    *policy = tmp;
    return ORTE_SUCCESS;
}
Example #4
0
static int ppr_mapper(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, j, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    unsigned cache_level=0;
    unsigned int nobjs, i;
    bool pruning_reqd = false;
    opal_hwloc_level_t level;
#endif
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool initial_map=true;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s being restarted - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize */
    memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t));

    /* parse option */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    for (j=0; NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
#endif
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    opal_argv_free(ppr_req);
    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
#if OPAL_HAVE_HWLOC
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }
#endif

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

#if OPAL_HAVE_HWLOC
    /* convenience */
    level = start;
    lowest = opal_hwloc_levels[start];
#endif

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern
         */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* cycle across the nodes */
        nprocs_mapped = 0;
        for (item = opal_list_get_first(&node_list);
             item != opal_list_get_end(&node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
            /* bozo check */
            if (NULL == node->topology) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
#endif
            /* add the node to the map, if needed */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);  /* maintain accounting on object */
                jdata->map->num_nodes++;
            }
            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (OPAL_HWLOC_NODE_LEVEL == start) {
#if OPAL_HAVE_HWLOC
                obj = hwloc_get_root_obj(node->topology);
#endif
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
#if OPAL_HAVE_HWLOC
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
#endif
                }
#if OPAL_HAVE_HWLOC
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }

                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go
                     */
                    level--;
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
#endif
            }

            /* set the total slots used */
            if ((int)node->num_procs <= node->slots) {
                node->slots_inuse = (int)node->num_procs;
            } else {
                node->slots_inuse = node->slots;
            }

            /* if no-oversubscribe was specified, check to see if
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it!
             */
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
            }

            /* if we haven't mapped all the procs, continue on to the
             * next node
             */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }
        if (0 == app->num_procs) {
            app->num_procs = nprocs_mapped;
        }
        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, jdata->map->ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        /* compute vpids and add proc objects to the job */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this AFTER we compute vpids so that computation is done
         * correctly
         */
        jdata->num_procs += app->num_procs;

        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }
    return ORTE_SUCCESS;

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
Example #5
0
/*
 * Sequentially map the ranks according to the placement in the
 * specified hostfile
 */
static int orte_rmaps_seq_map(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_app_context_t *app;
    int i, n;
    orte_std_cntr_t j;
    opal_list_item_t *item;
    orte_node_t *node, *nd, *save=NULL;
    orte_vpid_t vpid;
    orte_std_cntr_t num_nodes;
    int rc;
    opal_list_t *default_node_list=NULL;
    opal_list_t *node_list=NULL;
    orte_proc_t *proc;
    mca_base_component_t *c = &mca_rmaps_seq_component.base_version;

    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
                         "%s rmaps:seq mapping job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* this mapper can only handle initial launch
     * when seq mapping is desired - allow
     * restarting of failed apps
     */
    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:seq: job %s is being restarted - seq cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:seq: job %s not using sequential mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:seq: job %s not using seq mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:seq: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* conveniece def */
    map = jdata->map;
      
    /* if there is a default hostfile, go and get its ordered list of nodes */
    if (NULL != orte_default_hostfile) {
        default_node_list = OBJ_NEW(opal_list_t);
        if (ORTE_SUCCESS != (rc = orte_util_get_ordered_host_list(default_node_list, orte_default_hostfile))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
    }
    
    /* start at the beginning... */
    vpid = 0;
    jdata->num_procs = 0;
    if (NULL != default_node_list) {
        save = (orte_node_t*)opal_list_get_first(default_node_list);
    }
    
    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
    
        /* dash-host trumps hostfile */
        if (NULL != app->dash_host) {
            node_list = OBJ_NEW(opal_list_t);
            if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(node_list, app->dash_host))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }            
            nd = (orte_node_t*)opal_list_get_first(node_list);
        } else if (NULL != app->hostfile) {
            node_list = OBJ_NEW(opal_list_t);
            if (ORTE_SUCCESS != (rc = orte_util_get_ordered_host_list(node_list, app->hostfile))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }            
            nd = (orte_node_t*)opal_list_get_first(node_list);
        } else if (NULL != default_node_list) {
            node_list = default_node_list;
            nd = save;
        } else {
            /* can't do anything - no nodes available! */
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
            return ORTE_ERR_SILENT;
        }
        
        /* check for nolocal and remove the head node, if required */
        if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) {
            for (item  = opal_list_get_first(node_list);
                 item != opal_list_get_end(node_list);
                 item  = opal_list_get_next(item) ) {
                node = (orte_node_t*)item;
                /* need to check ifislocal because the name in the
                 * hostfile may not have been FQDN, while name returned
                 * by gethostname may have been (or vice versa)
                 */
                if (opal_ifislocal(node->name)) {
                    opal_list_remove_item(node_list, item);
                    OBJ_RELEASE(item);  /* "un-retain" it */
                }
            }
        }
            
        if (NULL == node_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(node_list))) {
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
            return ORTE_ERR_SILENT;
        }

        /* if num_procs wasn't specified, set it now */
        if (0 == app->num_procs) {
            app->num_procs = num_nodes;
        }
        
        for (n=0; n < app->num_procs; n++) {
            /* find this node on the global array - this is necessary so
             * that our mapping gets saved on that array as the objects
             * returned by the hostfile function are -not- on the array
             */
            node = NULL;
            for (j=0; j < orte_node_pool->size; j++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
                    continue;
                } 
                if (0 == strcmp(nd->name, node->name)) {
                    break;
                }
            }
            if (NULL == node) {
                /* wasn't found - that is an error */
                orte_show_help("help-orte-rmaps-seq.txt",
                               "orte-rmaps-seq:resource-not-found",
                               true, nd->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* ensure the node is in the map */
            if (!node->mapped) {
                OBJ_RETAIN(node);
                opal_pointer_array_add(map->nodes, node);
                node->mapped = true;
            }
            proc = orte_rmaps_base_setup_proc(jdata, node, i);
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                node->oversubscribed = true;
            }
            /* assign the vpid */
            proc->name.vpid = vpid++;

#if OPAL_HAVE_HWLOC
            /* assign the locale - okay for the topo to be null as
             * it just means it wasn't returned
             */
            if (NULL != node->topology) {
                proc->locale = hwloc_get_root_obj(node->topology);
            }
#endif

            /* add to the jdata proc array */
            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }
            /* move to next node */
            nd = (orte_node_t*)opal_list_get_next((opal_list_item_t*)nd);
        }

        /** track the total number of processes we mapped */
        jdata->num_procs += app->num_procs;
        
        /* cleanup the node list if it came from this app_context */
        if (node_list != default_node_list) {
            while (NULL != (item = opal_list_remove_first(node_list))) {
                OBJ_RELEASE(item);
            }
            OBJ_RELEASE(node_list);
        } else {
            save = nd;
        }
    }

    return ORTE_SUCCESS;

 error:
    if (NULL != default_node_list) {
        while (NULL != (item = opal_list_remove_first(default_node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_RELEASE(default_node_list);
    }
    if (NULL != node_list) {
        while (NULL != (item = opal_list_remove_first(node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_RELEASE(node_list);
    }
    
    return rc;
}
Example #6
0
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
static int orte_rmaps_base_open(mca_base_open_flag_t flags)
{
    int rc;

    /* init the globals */
    OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t);
    orte_rmaps_base.slot_list = NULL;
    orte_rmaps_base.mapping = 0;
    orte_rmaps_base.ranking = 0;
    orte_rmaps_base.device = NULL;

    /* if a topology file was given, then set our topology
     * from it. Even though our actual topology may differ,
     * mpirun only needs to see the compute node topology
     * for mapping purposes
     */
    if (NULL != rmaps_base_topo_file) {
        if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) {
            orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file);
            return ORTE_ERR_SILENT;
        }
    }

    /* check for violations that has to be detected before we parse the mapping option */
    if (NULL != orte_rmaps_base.ppr) {
        orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
                       "--ppr, -ppr", "--map-by ppr:<pattern>",
                       "rmaps_base_pattern, rmaps_ppr_pattern",
                       "rmaps_base_mapping_policy=ppr:<pattern>");
        /* if the mapping policy is NULL, then we can proceed */
        if (NULL == rmaps_base_mapping_policy) {
            asprintf(&rmaps_base_mapping_policy, "ppr:%s", orte_rmaps_base.ppr);
        } else {
            return ORTE_ERR_SILENT;
        }
    }
    if (1 < orte_rmaps_base.cpus_per_rank) {
        orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
                       "--cpus-per-proc, -cpus-per-proc, --cpus-per-rank, -cpus-per-rank",
                       "--map-by <obj>:PE=N, default <obj>=NUMA",
                       "rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N, default <obj>=NUMA");
    }

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping,
                                                                 &orte_rmaps_base.device,
                                                                 rmaps_base_mapping_policy))) {
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking,
                                                                 orte_rmaps_base.mapping,
                                                                 rmaps_base_ranking_policy))) {
        return rc;
    }

    if (rmaps_base_bycore) {
        orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
                       "--bycore, -bycore", "--map-by core",
                       "rmaps_base_bycore", "rmaps_base_mapping_policy=core");
        /* set mapping policy to bycore - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYCORE) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "bycore", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYCORE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to bycore - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_CORE) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "bycore", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_CORE);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }

    if (rmaps_base_byslot) {
        orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
                       "--byslot, -byslot", "--map-by slot",
                       "rmaps_base_byslot", "rmaps_base_mapping_policy=slot");
        /* set mapping policy to byslot - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to byslot - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }

    if (rmaps_base_bynode) {
        orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
                       "--bynode, -bynode", "--map-by node",
                       "rmaps_base_bynode", "rmaps_base_mapping_policy=node");
        /* set mapping policy to bynode - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) {
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to bynode - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }

    if (1 < orte_rmaps_base.cpus_per_rank) {
        /* if we were asked for multiple cpus/proc, then we have to
         * bind to those cpus - any other binding policy is an
         * error
         */
        if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
            if (opal_hwloc_use_hwthreads_as_cpus) {
                if (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) &&
                    OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true,
                                   orte_rmaps_base.cpus_per_rank, "use-hwthreads-as-cpus",
                                   opal_hwloc_base_print_binding(opal_hwloc_binding_policy),
                                   "bind-to hwthread");
                    return ORTE_ERR_SILENT;
                }
            } else if (OPAL_BIND_TO_CORE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) &&
                       OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true,
                               orte_rmaps_base.cpus_per_rank, "cores as cpus",
                               opal_hwloc_base_print_binding(opal_hwloc_binding_policy),
                               "bind-to core");
                return ORTE_ERR_SILENT;
            }
        } else {
            if (opal_hwloc_use_hwthreads_as_cpus) {
                OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
            } else {
                OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
            }
        }
        /* we also need to ensure we are mapping to a high-enough level to have
         * multiple cpus beneath it - by default, we'll go to the NUMA level */
        if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
            if (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYHWTHREAD ||
              (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYCORE &&
              !opal_hwloc_use_hwthreads_as_cpus)) {
                orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low-init", true);
                return ORTE_ERR_SILENT;
            }
        } else {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s rmaps:base pe/rank set - setting mapping to BYNUMA",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNUMA);
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        }
    }

    if (orte_rmaps_base_pernode) {
        /* there is no way to resolve this conflict, so if something else was
         * given, we have no choice but to error out
         */
        if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        /* ensure we set the mapping policy to ppr */
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* define the ppr */
        orte_rmaps_base.ppr = strdup("1:node");
    }

    if (0 < orte_rmaps_base_n_pernode) {
        /* there is no way to resolve this conflict, so if something else was
         * given, we have no choice but to error out
         */
        if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        /* ensure we set the mapping policy to ppr */
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* define the ppr */
        asprintf(&orte_rmaps_base.ppr, "%d:node", orte_rmaps_base_n_pernode);
    }

    if (0 < orte_rmaps_base_n_persocket) {
        /* there is no way to resolve this conflict, so if something else was
         * given, we have no choice but to error out
         */
        if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        /* ensure we set the mapping policy to ppr */
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* define the ppr */
        asprintf(&orte_rmaps_base.ppr, "%d:socket", orte_rmaps_base_n_persocket);
    }

    /* Should we schedule on the local node or not? */
    if (rmaps_base_no_schedule_local) {
        orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL;
    }

    /* Should we oversubscribe or not? */
    if (rmaps_base_no_oversubscribe) {
        if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
    }

    /** force oversubscription permission */
    if (rmaps_base_oversubscribe) {
        if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
        /* also set the overload allowed flag */
        opal_hwloc_binding_policy |= OPAL_BIND_ALLOW_OVERLOAD;
    }

    /* should we display a detailed (developer-quality) version of the map after determining it? */
    if (rmaps_base_display_devel_map) {
        orte_rmaps_base.display_map = true;
        orte_devel_level_output = true;
    }

    /* should we display a diffable report of proc locations after determining it? */
    if (rmaps_base_display_diffable_map) {
        orte_rmaps_base.display_map = true;
        orte_display_diffable_output = true;
    }

    /* Open up all available components */
    rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags);

    /* check to see if any component indicated a problem */
    if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
        /* the component would have already reported the error, so
         * tell the rest of the chain to shut up
         */
        return ORTE_ERR_SILENT;
    }

    /* All done */
    return rc;
}
Example #7
0
char* orte_rmaps_base_print_mapping(orte_mapping_policy_t mapping)
{
    char *ret, *map, *mymap, *tmp;
    orte_rmaps_print_buffers_t *ptr;

    if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        return "CONFLICTED";
    }

    ptr = get_print_buffer();
    if (NULL == ptr) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return orte_rmaps_print_null;
    }
    /* cycle around the ring */
    if (ORTE_RMAPS_PRINT_NUM_BUFS == ptr->cntr) {
        ptr->cntr = 0;
    }

    switch(ORTE_GET_MAPPING_POLICY(mapping)) {
    case ORTE_MAPPING_BYNODE:
        map = "BYNODE";
        break;
    case ORTE_MAPPING_BYBOARD:
        map = "BYBOARD";
        break;
    case ORTE_MAPPING_BYNUMA:
        map = "BYNUMA";
        break;
    case ORTE_MAPPING_BYSOCKET:
        map = "BYSOCKET";
        break;
    case ORTE_MAPPING_BYL3CACHE:
        map = "BYL3CACHE";
        break;
    case ORTE_MAPPING_BYL2CACHE:
        map = "BYL2CACHE";
        break;
    case ORTE_MAPPING_BYL1CACHE:
        map = "BYL1CACHE";
        break;
    case ORTE_MAPPING_BYCORE:
        map = "BYCORE";
        break;
    case ORTE_MAPPING_BYHWTHREAD:
        map = "BYHWTHREAD";
        break;
    case ORTE_MAPPING_BYSLOT:
        map = "BYSLOT";
        break;
    case ORTE_MAPPING_SEQ:
        map = "SEQUENTIAL";
        break;
    case ORTE_MAPPING_BYUSER:
        map = "BYUSER";
        break;
    case ORTE_MAPPING_BYDIST:
        map = "MINDIST";
        break;
    default:
        if (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
            map = "PPR";
        } else {
            map = "UNKNOWN";
        }
    }
    if (0 != strcmp(map, "PPR") && (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping))) {
        asprintf(&mymap, "%s[PPR]:", map);
    } else {
        asprintf(&mymap, "%s:", map);
    }
    if (ORTE_MAPPING_NO_USE_LOCAL & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        asprintf(&tmp, "%sNO_USE_LOCAL,", mymap);
        free(mymap);
        mymap = tmp;
    }
    if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        asprintf(&tmp, "%sNOOVERSUBSCRIBE,", mymap);
        free(mymap);
        mymap = tmp;
    } else if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        asprintf(&tmp, "%sOVERSUBSCRIBE,", mymap);
        free(mymap);
        mymap = tmp;
    }
    if (ORTE_MAPPING_SPAN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
        asprintf(&tmp, "%sSPAN,", mymap);
        free(mymap);
        mymap = tmp;
    }

    /* remove the trailing mark */
    mymap[strlen(mymap)-1] = '\0';

    snprintf(ptr->buffers[ptr->cntr], ORTE_RMAPS_PRINT_MAX_SIZE, "%s", mymap);
    free(mymap);
    ret = ptr->buffers[ptr->cntr];
    ptr->cntr++;

    return ret;
}
Example #8
0
/*
 * Create a round-robin mapping for the job.
 */
static int orte_rmaps_rr_map(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    int rc;
    mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version;
    bool initial_map=true;

    /* this mapper can only handle initial launch
     * when rr mapping is desired - allow
     * restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s is being restarted - rr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (ORTE_MAPPING_RR < ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rr: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* start at the beginning... */
    jdata->num_procs = 0;

    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }

        /* setup the nodelist here in case we jump to error */
        OBJ_CONSTRUCT(&node_list, opal_list_t);

        /* if the number of processes wasn't specified, then we know there can be only
         * one app_context allowed in the launch, and that we are to launch it across
         * all available slots. We'll double-check the single app_context rule first
         */
        if (0 == app->num_procs && 1 < jdata->num_apps) {
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np",
                           true, jdata->num_apps, NULL);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        if (0 == app->num_procs) {
            /* set the num_procs to equal the number of slots on these
             * mapped nodes, taking into account the number of cpus/rank
             */
            app->num_procs = num_slots / orte_rmaps_base.cpus_per_rank;
            /* sometimes, we have only one "slot" assigned, but may
             * want more than one cpu/rank - so ensure we always wind
             * up with at least one proc */
            if (0 == app->num_procs) {
                app->num_procs = 1;
            }
        }

        /* Make assignments */
        if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_bynode(jdata, app, &node_list, num_slots,
                                      app->num_procs);
        } else if (ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                      app->num_procs);
#if OPAL_HAVE_HWLOC
        } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_PU, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CORE, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 1);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 2);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 3);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_SOCKET, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_NODE, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
#endif
        } else {
            /* unrecognized mapping directive */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy",
                           true, "mapping",
                           orte_rmaps_base_print_mapping(jdata->map->mapping));
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* compute vpids and add proc objects to the job - do this after
         * each app_context so that the ranks within each context are
         * contiguous
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* track the total number of processes we mapped - must update
         * this value AFTER we compute vpids so that computation
         * is done correctly
         */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list - it can differ from one app_context
         * to another, so we have to get it every time
         */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }

    return ORTE_SUCCESS;

 error:
    while(NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
{
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: compute bindings for job %s with policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));

    if (ORTE_MAPPING_BYUSER == ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) {
        /* user specified binding by rankfile - nothing for us to do */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        /* cpuset was given - setup the bindings */
        if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) ||
        OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        /* no binding requested */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_BOARD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        /* doesn't do anything at this time */
        return ORTE_SUCCESS;
    }

    /* binding requested */
    /* if the job was mapped by the corresponding target, then
     * we bind in place
     *
     * otherwise, we have to bind either up or down the hwloc
     * tree. If we are binding upwards (e.g., mapped to hwthread
     * but binding to core), then we just climb the tree to find
     * the first matching object.
     *
     * if we are binding downwards (e.g., mapped to node and bind
     * to core), then we have to do a round-robin assigment of
     * procs to the resources below.
     */
            
    if (ORTE_MAPPING_BYDIST == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        int rc = ORTE_SUCCESS;
        if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - dist to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else if (OPAL_BIND_TO_NUMA < OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
            if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
                    ORTE_ERROR_LOG(rc);
                } 
            } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
                    ORTE_ERROR_LOG(rc);
                } 
            } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
                    ORTE_ERROR_LOG(rc);
                } 
            } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
                    ORTE_ERROR_LOG(rc);
                } 
            } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
                    ORTE_ERROR_LOG(rc);
                } 
            } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
                    ORTE_ERROR_LOG(rc);
                } 
            }
        }
        /* if the binding policy is less than numa, then we are unbound - so
         * just ignore this and return (should have been caught in prior
         * tests anyway as only options meeting that criteria are "none"
         * and "board")
         */
        return rc;
    } else if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - hwthread to hwthread",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_PU, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* HW threads are at the bottom, so we must have mapped somewhere
         * above this level - and we need to bind downwards
         */
        if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - core to core",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy used is less than bycore, then it is a
         * downward binding - i.e., the locale is above the binding location.
         * for example, if we map-to-socket and bind-to-core, then we compare
         * the mapping value of ORTE_MAPPING_BYCORE to ORTE_MAPPING_BYSOCKET.
         * In this case, BYCORE > BYSOCKET, so we know that the locale is
         * above the desired binding level (sockets are at a higher level than
         * the desired core binding level), and we will have to bind downwards
         */
        if (ORTE_MAPPING_BYCORE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CORE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - L1cache to L1cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l1cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL1CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 1))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - L2cache to L2cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l2cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL2CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 2))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - L3cache to L3cache",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than l3cache, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYL3CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 3))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - socket to socket",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than bysocket, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYSOCKET > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
        int rc;
        if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - numa to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
        /* if the mapping policy is less than numa, then it is a
         * downward binding
         */
        if (ORTE_MAPPING_BYNUMA > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        return rc;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
        return ORTE_ERR_NOT_SUPPORTED;
    }

    return ORTE_SUCCESS;
}
Example #10
0
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
static int orte_rmaps_base_open(mca_base_open_flag_t flags)
{
    int rc;

    /* init the globals */
    OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t);
    orte_rmaps_base.ppr = NULL;
    orte_rmaps_base.slot_list = NULL;
    orte_rmaps_base.mapping = 0;
    orte_rmaps_base.ranking = 0;

#if OPAL_HAVE_HWLOC
    /* if a topology file was given, then set our topology
     * from it. Even though our actual topology may differ,
     * mpirun only needs to see the compute node topology
     * for mapping purposes
     */
    if (NULL != rmaps_base_topo_file) {
        if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) {
            orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file);
            return ORTE_ERR_SILENT;
        }
    }
#endif

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping,
                                                                 &orte_rmaps_base.device,
                                                                 rmaps_base_mapping_policy))) {
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking,
                                                                 orte_rmaps_base.mapping,
                                                                 rmaps_base_ranking_policy))) {
        return rc;
    }

    if (rmaps_base_byslot) {
        /* set mapping policy to byslot - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to byslot - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }

    if (rmaps_base_bynode) {
        /* set mapping policy to bynode - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) {
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to bynode - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }

    /* Should we schedule on the local node or not? */
    if (rmaps_base_no_schedule_local) {
        orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL;
    }

    /* Should we oversubscribe or not? */
    if (rmaps_base_no_oversubscribe) {
        if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
    }

    /** force oversubscription permission */
    if (rmaps_base_oversubscribe) {
        if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
    }

    /* should we display a detailed (developer-quality) version of the map after determining it? */
    if (rmaps_base_display_devel_map) {
        orte_rmaps_base.display_map = true;
        orte_devel_level_output = true;
    }
   
    /* should we display a diffable report of proc locations after determining it? */
    if (rmaps_base_display_diffable_map) {
        orte_rmaps_base.display_map = true;
        orte_display_diffable_output = true;
    }

    /* Open up all available components */
    rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags);

    /* check to see if any component indicated a problem */
    if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
        /* the component would have already reported the error, so
         * tell the rest of the chain to shut up
         */
        return ORTE_ERR_SILENT;
    }

    /* All done */
    return rc;
}
Example #11
0
static int assign_locations(orte_job_t *jdata)
{
    int i, j, m, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    hwloc_obj_type_t level;
    hwloc_obj_t obj;
    unsigned int cache_level=0;
    int ppr, cnt, nobjs, nprocs_mapped;
    char **ppr_req, **ck;

    if (NULL == jdata->map->last_mapper ||
        0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr assign: %s",
                            ORTE_JOBID_PRINT(jdata->jobid),
                            (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper);
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: assigning locations for job %s with ppr %s policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr,
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

    /* pickup the object level */
    if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_MACHINE;
    } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_PU;
    } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_CORE;
    } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_SOCKET;
    } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L1CACHE;
        cache_level = 1;
    } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L2CACHE;
        cache_level = 2;
    } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L3CACHE;
        cache_level = 3;
    } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_NUMANODE;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    /* get the ppr value */
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    ck = opal_argv_split(ppr_req[0], ':');
    ppr = strtol(ck[0], NULL, 10);
    opal_argv_free(ck);
    opal_argv_free(ppr_req);

    /* start assigning procs to objects, filling each object as we go until
     * all procs are assigned. */
    for (n=0; n < jdata->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
            continue;
        }
        nprocs_mapped = 0;
        for (m=0; m < jdata->map->nodes->size; m++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
                continue;
            }
            if (NULL == node->topology || NULL == node->topology->topo) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                return ORTE_ERR_SILENT;
            }
            if (HWLOC_OBJ_MACHINE == level) {
                obj = hwloc_get_root_obj(node->topology->topo);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    if (proc->name.jobid != jdata->jobid) {
                        continue;
                    }
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                }
            } else {
                /* get the number of resources on this node at this level */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
                                                           level, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    cnt = 0;
                    obj = opal_hwloc_base_get_obj_by_type(node->topology->topo,
                                                          level, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < node->procs->size && cnt < ppr && nprocs_mapped < app->num_procs; j++) {
                        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                            continue;
                        }
                        if (proc->name.jobid != jdata->jobid) {
                            continue;
                        }
                        /* if we already assigned it, then skip */
                        if (orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, NULL, OPAL_PTR)) {
                            continue;
                        }
                        nprocs_mapped++;
                        cnt++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }
            }
        }
    }
    return ORTE_SUCCESS;
}
/*********************************
 * Parsing Functions
 *********************************/
int rmaps_lama_process_alias_params(orte_job_t *jdata)
{
    int exit_status = ORTE_SUCCESS;

    /*
     * Mapping options
     * Note: L1, L2, L3 are not exposed in orterun to the user, so
     *       there is no need to specify them here.
     */
    if( NULL == rmaps_lama_cmd_map ) {
        /* orte_rmaps_base.mapping */
        switch( ORTE_GET_MAPPING_POLICY(jdata->map->mapping) ) {
        case ORTE_MAPPING_BYNODE:
            /* rmaps_lama_cmd_map = strdup("nbNsL3L2L1ch"); */
            rmaps_lama_cmd_map = strdup("nbsch");
            break;
        case ORTE_MAPPING_BYBOARD:
            /* rmaps_lama_cmd_map = strdup("bnNsL3L2L1ch"); */
            orte_show_help("help-orte-rmaps-lama.txt",
                           "invalid mapping option",
                           true,
                           "by board", "mapping by board not supported by LAMA");
            exit_status = ORTE_ERR_NOT_SUPPORTED;
            goto cleanup;
            break;
        case ORTE_MAPPING_BYNUMA:
            /* rmaps_lama_cmd_map = strdup("NbnsL3L2L1ch"); */
            rmaps_lama_cmd_map = strdup("Nbnsch");
            break;
        case ORTE_MAPPING_BYSOCKET:
            /* rmaps_lama_cmd_map = strdup("sNbnL3L2L1ch"); */
            rmaps_lama_cmd_map = strdup("sbnch");
            break;
        case ORTE_MAPPING_BYL3CACHE:
            rmaps_lama_cmd_map = strdup("L3sNbnL2L1ch");
            break;
        case ORTE_MAPPING_BYL2CACHE:
            rmaps_lama_cmd_map = strdup("L2sNbnL1ch");
            break;
        case ORTE_MAPPING_BYL1CACHE:
            rmaps_lama_cmd_map = strdup("L1sNbnch");
            break;
        case ORTE_MAPPING_BYCORE:
        case ORTE_MAPPING_BYSLOT:
            /* rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); */
            rmaps_lama_cmd_map = strdup("csbnh");
            break;
        case ORTE_MAPPING_BYHWTHREAD:
            /* rmaps_lama_cmd_map = strdup("hcL1L2L3sNbn"); */
            rmaps_lama_cmd_map = strdup("hcsbn");
            break;
        case ORTE_MAPPING_RR:
            orte_show_help("help-orte-rmaps-lama.txt",
                           "invalid mapping option",
                           true,
                           "round robin", "mapping by round robin not supported by LAMA");
            exit_status = ORTE_ERR_NOT_SUPPORTED;
            goto cleanup;
        case ORTE_MAPPING_SEQ:
            orte_show_help("help-orte-rmaps-lama.txt",
                           "invalid mapping option",
                           true,
                           "sequential", "mapping by sequential not supported by LAMA");
            exit_status = ORTE_ERR_NOT_SUPPORTED;
            goto cleanup;
        case ORTE_MAPPING_BYUSER:
            orte_show_help("help-orte-rmaps-lama.txt",
                           "invalid mapping option",
                           true,
                           "by user", "mapping by user not supported by LAMA");
            exit_status = ORTE_ERR_NOT_SUPPORTED;
            goto cleanup;
        default:
            /*
             * Default is map-by core
             */
            rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh");
            break;
        }
    }

    /*
     * Binding Options
     */
    if( NULL == rmaps_lama_cmd_bind ) {
        /*
         * No binding specified, use default
         */
        if( !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) ||
            !OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
            OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding) ) {
            rmaps_lama_cmd_bind = NULL;
        }

        switch( OPAL_GET_BINDING_POLICY(jdata->map->binding) ) {
        case OPAL_BIND_TO_BOARD:
            /* rmaps_lama_cmd_bind = strdup("1b"); */
            orte_show_help("help-orte-rmaps-lama.txt",
                           "invalid binding option",
                           true,
                           "by board", "binding to board not supported by LAMA");
            exit_status = ORTE_ERR_NOT_SUPPORTED;
            goto cleanup;
            break;
        case OPAL_BIND_TO_NUMA:
            rmaps_lama_cmd_bind = strdup("1N");
            break;
        case OPAL_BIND_TO_SOCKET:
            rmaps_lama_cmd_bind = strdup("1s");
            break;
        case OPAL_BIND_TO_L3CACHE:
            rmaps_lama_cmd_bind = strdup("1L3");
            break;
        case OPAL_BIND_TO_L2CACHE:
            rmaps_lama_cmd_bind = strdup("1L2");
            break;
        case OPAL_BIND_TO_L1CACHE:
            rmaps_lama_cmd_bind = strdup("1L1");
            break;
        case OPAL_BIND_TO_CORE:
            rmaps_lama_cmd_bind = strdup("1c");
            break;
        case OPAL_BIND_TO_HWTHREAD:
            rmaps_lama_cmd_bind = strdup("1h");
            break;
        case OPAL_BIND_TO_CPUSET:
            orte_show_help("help-orte-rmaps-lama.txt",
                           "invalid binding option",
                           true,
                           "by CPU set", "binding to CPU set not supported by LAMA");
            exit_status = ORTE_ERR_NOT_SUPPORTED;
            goto cleanup;
            break;
        default:
            rmaps_lama_cmd_bind = NULL;
            break;
        }
    }

    /*
     * Ordering (a.k.a. Ranking) Options
     */
    if( NULL == rmaps_lama_cmd_ordering ) {
        /* orte_rmaps_base.ranking */
        switch( ORTE_GET_RANKING_POLICY(jdata->map->ranking) ) {
        case ORTE_RANK_BY_SLOT:
            rmaps_lama_cmd_ordering = strdup("s");
            break;
        case ORTE_RANK_BY_NODE:
        case ORTE_RANK_BY_NUMA:
        case ORTE_RANK_BY_SOCKET:
        case ORTE_RANK_BY_L3CACHE:
        case ORTE_RANK_BY_L2CACHE:
        case ORTE_RANK_BY_L1CACHE:
        case ORTE_RANK_BY_CORE:
        case ORTE_RANK_BY_HWTHREAD:
            rmaps_lama_cmd_ordering = strdup("n");
            break;
        case ORTE_RANK_BY_BOARD:
            /* rmaps_lama_cmd_ordering = strdup("n"); */
            orte_show_help("help-orte-rmaps-lama.txt",
                           "invalid ordering option",
                           true,
                           "by board", "ordering by board not supported by LAMA");
            exit_status = ORTE_ERR_NOT_SUPPORTED;
            goto cleanup;
            break;
        default:
            rmaps_lama_cmd_ordering = strdup("n");
            break;
        }
    }

    /*
     * MPPR
     */
    if( NULL == rmaps_lama_cmd_mppr ) {
        /*
         * The ppr is given in the map
         */
        if( NULL != jdata->map->ppr) {
            rmaps_lama_cmd_mppr = rmaps_lama_covert_ppr(jdata->map->ppr);
        }
    }

    /*
     * Oversubscription
     */
    if( ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping) ) {
        rmaps_lama_can_oversubscribe = false;
    }
    else {
        rmaps_lama_can_oversubscribe = true;
    }

    /*
     * Display revised values
     */
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:lama: Revised Parameters -----");
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:lama: Map   : %s",
                        rmaps_lama_cmd_map);
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:lama: Bind  : %s",
                        rmaps_lama_cmd_bind);
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:lama: MPPR  : %s",
                        rmaps_lama_cmd_mppr);
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:lama: Order : %s",
                        rmaps_lama_cmd_ordering);

 cleanup:
    return exit_status;
}
static int staged_mapper(orte_job_t *jdata)
{
    mca_base_component_t *c=&mca_rmaps_staged_component.base_version;
    int i, j, k, rc;
    orte_app_context_t *app;
    opal_list_t node_list, desired;
    orte_std_cntr_t num_slots;
    orte_proc_t *proc;
    orte_node_t *node, *next;
    bool work_to_do = false, first_pass = false;
    opal_list_item_t *item, *it2;
    char *cptr, **minimap;
    orte_vpid_t load;

    /* only use this mapper if it was specified */
    if (NULL == jdata->map->req_mapper ||
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name) ||
        ORTE_MAPPING_STAGED != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I wasn't specified */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:staged: job %s not using staged mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                        "%s mca:rmaps:staged: mapping job %s with %d procs",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid), (int)jdata->num_procs);
 
    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* if there are no nodes in the map, then this is our first
     * pass thru this job
     */
    if (0 == jdata->map->num_nodes) {
        first_pass = true;
    }

    /* we assume that the app_contexts are in priority order,
     * with the highest priority being the first entry in the
     * job's app_context array. Loop across the app_contexts
     * in order, looking for apps that have not been
     * fully mapped
     */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* has it been fully mapped? */
        if (ORTE_APP_STATE_ALL_MAPPED <= app->state) {
            continue;
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "%s mca:rmaps:staged: working app %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app);

        /* find nodes that meet any constraints provided in the form of
         * -hostfile or -host directives
         */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        /* get nodes based on a strict interpretation of the location hints */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                   jdata->map->mapping, false, true))) {
            /* we were unable to get any nodes that match those
             * specified in the app
             */
            if (ORTE_ERR_RESOURCE_BUSY == rc) {
                /* if the return is "busy", then at least one of the
                 * specified resources must exist, but no slots are
                 * currently available. This means there is at least
                 * a hope of eventually being able to map this app
                 * within its specified constraints, so continue working
                 */
                if (orte_soft_locations) {
                    /* if soft locations were given, then we know that
                     * none of the nodes in this allocation are available,
                     * so there is no point in continuing to check the
                     * remaining apps
                     */
                    while (NULL != (item = opal_list_remove_first(&node_list))) {
                        OBJ_RELEASE(item);
                    }
                    OBJ_DESTRUCT(&node_list);
                    goto complete;
                }
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s mca:rmaps:staged: all nodes for this app are currently busy",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                OBJ_DESTRUCT(&node_list);
                continue;
            } else {
                /* this indicates that there are no nodes that match
                 * the specified constraints, so there is no hope of
                 * ever being able to execute this app. This is an
                 * unrecoverable error - note that a return of
                 * "silent" means that the function already printed
                 * an error message, so the error_log will print nothing
                 */
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }

        /* if a max number of procs/node was given for this
         * app, remove all nodes from the list that exceed
         * that limit
         */
        if (0 < app->max_procs_per_node) {
            item = opal_list_get_first(&node_list);
            while (item != opal_list_get_end(&node_list)) {
                it2 = opal_list_get_next(item);
                node = (orte_node_t*)item;
                if (app->max_procs_per_node <= node->num_procs) {
                    opal_list_remove_item(&node_list, item);
                    OBJ_RELEASE(item);
                }
                item = it2;
            }
        }

        /* if we have no available nodes, then move on to next app */
        if (0 == opal_list_get_size(&node_list)) {
            OBJ_DESTRUCT(&node_list);
            continue;
        }

        /* if the app specified locations, soft or not, search the list of nodes
         * for those that match the requested locations and move those
         * to the desired list so we use them first
         */
        if (NULL != app->dash_host) {
            OBJ_CONSTRUCT(&desired, opal_list_t);
            /* no particular order is required */
            for (j=0; j < opal_argv_count(app->dash_host); j++) {
                minimap = opal_argv_split(app->dash_host[j], ',');
                for (k=0; k < opal_argv_count(minimap); k++) {
                    cptr = minimap[k];
                    for (item = opal_list_get_first(&node_list);
                         item != opal_list_get_end(&node_list);
                         item = opal_list_get_next(item)) {
                        node = (orte_node_t*)item;
                        if (0 == strcmp(node->name, cptr) ||
                            (0 == strcmp("localhost", cptr) &&
                             0 == strcmp(node->name, orte_process_info.nodename))) {
                            opal_list_remove_item(&node_list, item);
                            opal_list_append(&desired, item);
                            opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                                "%s mca:rmaps:staged: placing node %s on desired list",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                node->name);
                            break;
                        }
                    }
                }
                opal_argv_free(minimap);
            }
            /* if no nodes made the transition and the app specified soft
             * locations, then we can skip to look at the non-desired list
             */
            if (0 == opal_list_get_size(&desired)) {
                OBJ_DESTRUCT(&desired);
                if (orte_soft_locations) {
                    goto process;
                } else {
                    /* move on to next app */
                    continue;
                }
            }
            /* cycle thru the procs for this app and attempt to map them
             * to the desired nodes using a load-balancing algo
             */
            for (j=0; j < app->procs.size; j++) {
                if (NULL == (proc = opal_pointer_array_get_item(&app->procs, j))) {
                    continue;
                }
                if (ORTE_PROC_STATE_UNDEF != proc->state) {
                    /* this proc has already been mapped or executed */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s mca:rmaps:staged: proc %s has already been mapped",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&proc->name));
                    continue;
                }
                /* flag that there is at least one proc still to
                 * be executed
                 */
                work_to_do = true;
                /* track number mapped */
                jdata->num_mapped++;
                /* find the lightest-loaded node on the desired list */
                node = NULL;
                load = ORTE_VPID_MAX;
                for (item = opal_list_get_first(&desired);
                     item != opal_list_get_end(&desired);
                     item = opal_list_get_next(item)) {
                    next = (orte_node_t*)item;
                    if (next->num_procs < load) {
                        node = next;
                        load = next->num_procs;
                    }
                }
                /* put the proc there */
                proc->node = node;
                proc->nodename = node->name;
                /* the local rank is the number of procs
                 * on this node from this job - we don't
                 * directly track this number, so it must
                 * be found by looping across the node->procs
                 * array and counting it each time. For now,
                 * since we don't use this value in this mode
                 * of operation, just set it to something arbitrary
                 */
                proc->local_rank = node->num_procs;
                /* the node rank is simply the number of procs
                 * on the node at this time
                 */
                proc->node_rank = node->num_procs;
                /* track number of procs on node and number of slots used */
                node->num_procs++;
                node->slots_inuse++;
                opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                    "%s Proc %s on node %s: slots %d inuse %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name), node->name,
                                    (int)node->slots, (int)node->slots_inuse);
                if (node->slots_inuse == node->slots) {
                    opal_list_remove_item(&desired, &node->super);
                    OBJ_RELEASE(node);
                }
                if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(proc);
                    return rc;
                }
                /* retain the proc struct so that we correctly track its release */
                OBJ_RETAIN(proc);
                proc->state = ORTE_PROC_STATE_INIT;
                /* flag the proc as updated so it will be included
                 * in the next pidmap message
                 */
                proc->updated =true;
                /* add the node to the map, if needed */
                if (!node->mapped) {
                    if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    node->mapped = true;
                    OBJ_RETAIN(node);  /* maintain accounting on object */
                    jdata->map->num_nodes++;
                }
                if (0 == opal_list_get_size(&desired)) {
                    /* nothing more we can do */
                    break;
                }
            }
            /* clear the list */
            while (NULL != (item = opal_list_remove_first(&desired))) {
                OBJ_RELEASE(item);
            }
            OBJ_DESTRUCT(&desired);
        }

    process:
        for (j=0; j < app->procs.size; j++) {
            if (NULL == (proc = opal_pointer_array_get_item(&app->procs, j))) {
                continue;
            }
            if (ORTE_PROC_STATE_UNDEF != proc->state) {
                /* this proc has already been mapped or executed */
	        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
				    "%s mca:rmaps:staged: proc %s has already been mapped",
				    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
				    ORTE_NAME_PRINT(&proc->name));
                continue;
            }
            /* find the lightest-loaded node on the node list */
            node = NULL;
            load = ORTE_VPID_MAX;
            for (item = opal_list_get_first(&node_list);
                 item != opal_list_get_end(&node_list);
                 item = opal_list_get_next(item)) {
                next = (orte_node_t*)item;
                if (next->num_procs < load) {
                    node = next;
                    load = next->num_procs;
                }
            }
            /* flag that there is at least one proc still to
             * be executed
             */
            work_to_do = true;
            /* track number mapped */
            jdata->num_mapped++;
            /* map this proc to the first available slot */
            OBJ_RETAIN(node);  /* maintain accounting on object */    
	    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
				"%s mca:rmaps:staged: assigning proc %s to node %s",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
				ORTE_NAME_PRINT(&proc->name), node->name);
            proc->node = node;
            proc->nodename = node->name;
	    /* the local rank is the number of procs
	     * on this node from this job - we don't
	     * directly track this number, so it must
	     * be found by looping across the node->procs
	     * array and counting it each time. For now,
	     * since we don't use this value in this mode
	     * of operation, just set it to something arbitrary
	     */
	    proc->local_rank = node->num_procs;
	    /* the node rank is simply the number of procs
	     * on the node at this time
	     */
	    proc->node_rank = node->num_procs;
	    /* track number of procs on node and number of slots used */
            node->num_procs++;
            node->slots_inuse++;
            opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                "%s Proc %s on node %s: slots %d inuse %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name), node->name,
                                (int)node->slots, (int)node->slots_inuse);
            if (node->slots_inuse == node->slots) {
                opal_list_remove_item(&node_list, &node->super);
                OBJ_RELEASE(node);
            }
            if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(proc);
                return rc;
            }
            /* retain the proc struct so that we correctly track its release */
            OBJ_RETAIN(proc);
            proc->state = ORTE_PROC_STATE_INIT;
            /* flag the proc as updated so it will be included
             * in the next pidmap message
             */
            proc->updated =true;
            /* add the node to the map, if needed */
            if (!node->mapped) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                node->mapped = true;
                OBJ_RETAIN(node);  /* maintain accounting on object */
                jdata->map->num_nodes++;
            }
            if (0 == opal_list_get_size(&node_list)) {
                /* nothing more we can do */
                break;
            }
        }
	/* clear the list */
	while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
	}
	OBJ_DESTRUCT(&node_list);
    }

 complete:
    /* if there isn't at least one proc that can be launched,
     * then indicate that we don't need to proceed with the
     * launch sequence
     */
    if (!work_to_do) {
        return ORTE_ERR_RESOURCE_BUSY;
    }
 
    /* flag that the job was updated so it will be
     * included in the pidmap message
     */
    jdata->updated = true;

    /* if we successfully mapped ALL procs in the first pass,
     * then this job is capable of supporting MPI procs
     */
    if (first_pass && jdata->num_mapped == jdata->num_procs) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "%s mca:rmaps:staged: job %s is MPI-capable",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_JOBID_PRINT(jdata->jobid));
        jdata->gang_launched = true;
    }

    return ORTE_SUCCESS;
}