示例#1
0
/*
 * Add every candidate node to the daemon job's map and create one
 * daemon proc object per added node.
 *
 * The candidate list starts with the nodes returned by
 * orte_rmaps_base_get_target_nodes() for the daemon job's first app,
 * and is then extended with the hostfile / -host entries of every app
 * of every job currently stored in orte_job_data.
 *
 * @param jdata  daemon job object; its map (nodes, num_nodes,
 *               num_new_daemons, daemon_vpid_start), its procs array
 *               and its num_procs counter are updated
 *
 * @return ORTE_SUCCESS on success, ORTE_ERR_OUT_OF_RESOURCE if a proc
 *         object cannot be allocated or no vpid is left, otherwise the
 *         error code returned by the failing helper. On every exit
 *         path the local node list is destructed.
 */
int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
{
    orte_job_t *jdat;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_job_map_t *map;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_app_context_t *app;
    orte_std_cntr_t num_slots;
    int rc, i, n;
    bool ignored;

    /* get the daemon app if provided - may include -host or hostfile
     * info about available nodes
     */
    app = (orte_app_context_t *) opal_pointer_array_get_item(jdata->apps, 0);

    map = jdata->map;

    /* get the list of all available nodes that do not already
     * have a daemon on them
     */
    OBJ_CONSTRUCT(&node_list, opal_list_t);
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots,
                                                               app, map->policy))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* check all other known jobs to see if they have something to
     * add to the allocation - we won't have seen these and the
     * daemon job won't have any in its app
     */
    for (i=0; i < orte_job_data->size; i++) {
        if (NULL == (jdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        for (n=0; n < jdat->apps->size; n++) {
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdat->apps, n))) {
                continue;
            }
            if (NULL != app->hostfile) {
                /* hostfile was specified - parse it and add it to the list. The
                 * function automatically ignores duplicates
                 */
                if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&node_list,
                                                                       &ignored,
                                                                       app->hostfile))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
            }
            if (NULL != app->dash_host) {
                /* parse and add to list, ignoring duplicates */
                if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&node_list,
                                                                        &ignored,
                                                                        app->dash_host))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
            }
        }
    }

    /* add all these nodes to the map */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        node = (orte_node_t*)item;
        /* if this is my node, ignore it - we are already here.
         * NOTE(review): the node has been removed from the list and is
         * dropped here without a release - confirm whether the list
         * held a reference that should be given back.
         */
        if (0 == strcmp(node->name, orte_process_info.nodename)) {
            continue;
        }
        opal_pointer_array_add(map->nodes, (void*)node);
        ++(map->num_nodes);
        /* if this node already has a daemon, release that object
         * to maintain bookkeeping
         */
        if (NULL != node->daemon) {
            OBJ_RELEASE(node->daemon);
        }
        /* create a new daemon object for this node */
        proc = OBJ_NEW(orte_proc_t);
        if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        proc->name.jobid = ORTE_PROC_MY_HNP->jobid;
        if (ORTE_VPID_MAX-1 <= jdata->num_procs) {
            /* no more daemons available */
            orte_show_help("help-orte-rmaps-base.txt", "out-of-vpids", true);
            OBJ_RELEASE(proc);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        proc->name.vpid = jdata->num_procs;  /* take the next available vpid */
        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
        ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
        proc->node = node;
        proc->nodename = node->name;
        OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                             "%s rmaps:base:setup_vm add new daemon %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc->name)));
        /* add the daemon to the daemon job object */
        if (0 > (rc = opal_pointer_array_add(jdata->procs, (void*)proc))) {
            ORTE_ERROR_LOG(rc);
            /* the proc was never stored anywhere, so give it back. The
             * node fields were assigned above without taking a reference,
             * so detach them first to keep the release from touching
             * the node
             */
            proc->node = NULL;
            proc->nodename = NULL;
            OBJ_RELEASE(proc);
            goto cleanup;
        }
        ++jdata->num_procs;
        /* point the node to the daemon */
        node->daemon = proc;
        OBJ_RETAIN(proc);  /* maintain accounting */
        /* track number of daemons to be launched */
        ++map->num_new_daemons;
        /* and their starting vpid */
        if (ORTE_VPID_INVALID == map->daemon_vpid_start) {
            map->daemon_vpid_start = proc->name.vpid;
        }
    }
    rc = ORTE_SUCCESS;

 cleanup:
    /* single exit point so the list object is destructed on every path,
     * matching what the early error returns already did
     */
    OBJ_DESTRUCT(&node_list);
    return rc;
}
示例#2
0
/*
 * Map a job using a "procs-per-resource" (ppr) pattern.
 *
 * The pattern in jdata->map->ppr is a comma-separated list of
 * "<count>:<resource>" entries. Each entry stores its count in the
 * ppr[] table (defined outside this function) under the matching
 * level. The resource name is compared only over the length of the
 * text supplied, so abbreviations are accepted.
 *
 * Procs are placed on the objects of the "start" level of every target
 * node. When more than one entry was given, prune() is then called per
 * node, beginning one level above "start".
 *
 * @param jdata  job to be mapped
 *
 * @return ORTE_SUCCESS when every app was mapped,
 *         ORTE_ERR_TAKE_NEXT_OPTION when this mapper does not apply to
 *         the job, ORTE_ERR_SILENT after a help message has been shown,
 *         ORTE_ERR_OUT_OF_RESOURCE if a proc object could not be set
 *         up, or the error code of a failing helper.
 */
static int ppr_mapper(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, j, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    unsigned cache_level=0;
    unsigned int nobjs, i;
    bool pruning_reqd = false;
    opal_hwloc_level_t level;
#endif
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool initial_map=true;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s being restarted - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize
     * NOTE(review): this clears OPAL_HWLOC_HWTHREAD_LEVEL elements sized
     * as opal_hwloc_level_t, yet ppr[OPAL_HWLOC_HWTHREAD_LEVEL] is
     * written below - confirm against the declaration of ppr that the
     * whole table is covered.
     */
    memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t));

    /* parse option - n counts the recognized entries. The split result
     * is checked for NULL so that a pattern yielding no entries falls
     * through to the "nothing given" error instead of being dereferenced
     */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    for (j=0; NULL != ppr_req && NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        /* compare over the supplied length only, so that a prefix
         * of a resource name is accepted
         */
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            /* only move the starting level when this one compares higher */
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
#endif
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    if (NULL != ppr_req) {
        opal_argv_free(ppr_req);
    }
    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
#if OPAL_HAVE_HWLOC
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }
#endif

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

#if OPAL_HAVE_HWLOC
    /* convenience */
    level = start;
    lowest = opal_hwloc_levels[start];
#endif

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern
         */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* cycle across the nodes */
        nprocs_mapped = 0;
        for (item = opal_list_get_first(&node_list);
             item != opal_list_get_end(&node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
            /* bozo check */
            if (NULL == node->topology) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
#endif
            /* add the node to the map, if needed */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);  /* maintain accounting on object */
                jdata->map->num_nodes++;
            }
            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (OPAL_HWLOC_NODE_LEVEL == start) {
#if OPAL_HAVE_HWLOC
                obj = hwloc_get_root_obj(node->topology);
#endif
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
#if OPAL_HAVE_HWLOC
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
#endif
                }
#if OPAL_HAVE_HWLOC
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }

                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go. The level is handed to prune() by
                     * pointer and was previously decremented cumulatively,
                     * so restart from the mapping level for every node -
                     * otherwise each later node (and app) would begin its
                     * pruning at an ever lower level
                     */
                    level = start;
                    level--;
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
#endif
            }

            /* set the total slots used - capped at the node's slot count */
            if ((int)node->num_procs <= node->slots) {
                node->slots_inuse = (int)node->num_procs;
            } else {
                node->slots_inuse = node->slots;
            }

            /* if no-oversubscribe was specified, check to see if
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it!
             */
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
            }

            /* stop once everything requested has been mapped - otherwise
             * continue on to the next node
             */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }
        /* no count was requested, so record what the pattern produced */
        if (0 == app->num_procs) {
            app->num_procs = nprocs_mapped;
        }
        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, jdata->map->ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        /* compute vpids and add proc objects to the job */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this AFTER we compute vpids so that computation is done
         * correctly
         */
        jdata->num_procs += app->num_procs;

        /* the node list is rebuilt for every app, so empty it here */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }
    return ORTE_SUCCESS;

 error:
    /* only reached while node_list is constructed */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
示例#3
0
/*
 * Create a round-robin mapping for the job.
 *
 * Each app_context of the job is mapped in turn against its own list
 * of target nodes, using the routine that matches the job's mapping
 * policy. When an object-based policy returns ORTE_ERR_NOT_FOUND, the
 * job's policy is switched to by-slot and the app is mapped by slot
 * instead.
 *
 * @param jdata  job to be mapped; its num_procs is reset to zero and
 *               then accumulated across the app_contexts
 *
 * @return ORTE_SUCCESS when all app_contexts were mapped,
 *         ORTE_ERR_TAKE_NEXT_OPTION when this mapper does not apply to
 *         the job, ORTE_ERR_SILENT after a help message has been shown,
 *         or the error code of a failing helper. Every failure after
 *         the node list was constructed leaves through the common
 *         cleanup path.
 */
static int orte_rmaps_rr_map(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    int rc;
    mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version;
    bool initial_map=true;

    /* this mapper can only handle initial launch
     * when rr mapping is desired - allow
     * restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s is being restarted - rr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (ORTE_MAPPING_RR < ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rr: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* start at the beginning... */
    jdata->num_procs = 0;

    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }

        /* setup the nodelist here in case we jump to error */
        OBJ_CONSTRUCT(&node_list, opal_list_t);

        /* if the number of processes wasn't specified, then we know there can be only
         * one app_context allowed in the launch, and that we are to launch it across
         * all available slots. We'll double-check the single app_context rule first
         */
        if (0 == app->num_procs && 1 < jdata->num_apps) {
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np",
                           true, jdata->num_apps, NULL);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        if (0 == app->num_procs) {
            /* set the num_procs to equal the number of slots on these
             * mapped nodes, taking into account the number of cpus/rank
             */
            app->num_procs = num_slots / orte_rmaps_base.cpus_per_rank;
            /* sometimes, we have only one "slot" assigned, but may
             * want more than one cpu/rank - so ensure we always wind
             * up with at least one proc */
            if (0 == app->num_procs) {
                app->num_procs = 1;
            }
        }

        /* Make assignments */
        if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_bynode(jdata, app, &node_list, num_slots,
                                      app->num_procs);
        } else if (ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                      app->num_procs);
#if OPAL_HAVE_HWLOC
        } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_PU, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CORE, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 1);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 2);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 3);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_SOCKET, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
        } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_NODE, 0);
            if (ORTE_ERR_NOT_FOUND == rc) {
                /* if the mapper couldn't map by this object because
                 * it isn't available, but the error allows us to try
                 * byslot, then do so
                 */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
                rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                          app->num_procs);
            }
#endif
        } else {
            /* unrecognized mapping directive */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy",
                           true, "mapping",
                           orte_rmaps_base_print_mapping(jdata->map->mapping));
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* compute vpids and add proc objects to the job - do this after
         * each app_context so that the ranks within each context are
         * contiguous
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            /* leave through the common cleanup so the node list is
             * emptied and destructed like on every other failure
             */
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this value AFTER we compute vpids so that computation
         * is done correctly
         */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list - it can differ from one app_context
         * to another, so we have to get it every time
         */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }

    return ORTE_SUCCESS;

 error:
    /* only reached while node_list is constructed */
    while(NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
示例#4
0
/* Place a fixed number of procs (jdata->map->npernode) on each node
 * available to every app_context, stopping early once the app's requested
 * number of procs (if one was given) has been placed.
 *
 * @param jdata  job being mapped; jdata->num_procs is incremented by the
 *               number of procs placed for each app_context
 *
 * @return ORTE_SUCCESS when every app_context was mapped;
 *         ORTE_ERR_SILENT when fewer procs than the app requested could be
 *         placed (a help message has already been shown);
 *         otherwise the error code returned by the failing helper.
 *
 * Every exit, successful or not, passes through the cleanup label so the
 * local node list is always emptied and destructed.
 */
static int npernode(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int np, nprocs;
    int num_nodes;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* loop through the app_contexts */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* use the number of procs if one was given, otherwise place
         * procs until we run out of nodes
         */
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            np = INT_MAX;
        }
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* remember how many nodes we started with - the list is consumed
         * below, but the count is needed for the help message
         */
        num_nodes = opal_list_get_size(&node_list);
        nprocs = 0;
        /* loop through the list of nodes, taking each one off the list */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            node = (orte_node_t*)item;
            /* put the specified number of procs on each node */
            for (j=0; j < jdata->map->npernode && nprocs < np; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place on this node, then that is an error.
                     * A "fully used" result on the node's final proc is
                     * tolerated and that proc is still counted below.
                     */
                    if (ORTE_ERR_NODE_FULLY_USED != rc ||
                        j < jdata->map->npernode-1) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(node);
                        goto cleanup;
                    }
                }
                nprocs++;
            }
            OBJ_RELEASE(node);
        }
        /* update the number of procs in the job */
        jdata->num_procs += nprocs;
        /* if the user requested a specific number of procs and
         * the total number of procs we were able to assign
         * doesn't equal the number requested, then we have a
         * problem
         */
        if (0 < app->num_procs && nprocs < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
                           app->app, app->num_procs,
                           "number of nodes", num_nodes,
                           "npernode", jdata->map->npernode);
            /* go through cleanup so the node list gets destructed */
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
        /* compute vpids and add proc objects to the job - this has to be
         * done after each app_context is mapped in order to keep the
         * vpids contiguous within an app_context
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }

cleanup:
    /* shared exit path: release anything left on the list and destruct it */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
示例#5
0
/*
 * Create a load balanced mapping for the job by assigning a constant #procs/node, with
 * leftovers being spread one/node beginning at the same starting node.
 *
 * For each app_context the target node list is fetched, a base
 * procs-per-node value (np / number of nodes) is placed on every node, and
 * any remaining procs are then handed out one per node.
 *
 * @param jdata  job being mapped; jdata->num_procs is incremented by the
 *               number of procs placed and jdata->bookmark is set to the
 *               last node that was used
 *
 * @return ORTE_SUCCESS when every app_context was mapped;
 *         ORTE_ERR_SILENT when fewer procs than the app requested were
 *         placed (a help message has already been shown);
 *         otherwise the error code returned by the failing helper.
 *
 * Every exit, successful or not, passes through the cleanup label so the
 * local node list is always emptied and destructed.
 */
static int loadbalance(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j;
    opal_list_t node_list;
    orte_std_cntr_t num_nodes, num_slots;
    int rc=ORTE_SUCCESS, np, nprocs;
    int ppn = 0;
    opal_list_item_t *item, *start;
    orte_node_t *node;

    /* setup */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* compute total #procs we are going to add and the total number of nodes available */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* get the nodes and #slots available for this app_context */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            /* set the num_procs to the #slots */
            np = num_slots;
        }
        num_nodes = opal_list_get_size(&node_list);
        /* compute the base ppn
         * NOTE(review): this divides by zero if the node list is empty;
         * presumably orte_rmaps_base_get_target_nodes never reports success
         * with an empty list - confirm against that function.
         */
        ppn = np / num_nodes;
        /* if a bookmark exists from some prior mapping, set us to start there */
        start = orte_rmaps_base_get_starting_point(&node_list, jdata);
        /* loop through the list of nodes until we either assign all the procs
         * or return to the starting point
         */
        item = start;
        nprocs = 0;
        do {
            node = (orte_node_t*)item;
            /* put the base number of procs on each node */
            for (j=0; j < ppn; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place on this node, then that is an error
                     */
                    if (ORTE_ERR_NODE_FULLY_USED != rc ||
                        j < ppn-1) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                }
                nprocs++;
            }
            /* move to next node, wrapping around at the end of the list */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            }
            else {
                item = opal_list_get_next(item);
            }
        } while (item != start && nprocs < np);

        /* save the bookmark */
        jdata->bookmark = node;

        /* if we haven't assigned all the procs, then loop through the list
         * again, assigning 1 per node until all are assigned
         */
        item = start;
        while (nprocs < np) {
            node = (orte_node_t*)item;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 &node_list, jdata->map->oversubscribe,
                                                                 false, NULL))) {
                /* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
                if (ORTE_ERR_NODE_FULLY_USED != rc) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
            }
            nprocs++;
            /* move to next node, wrapping around at the end of the list */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            }
            else {
                item = opal_list_get_next(item);
            }
        }
        /* save the bookmark */
        jdata->bookmark = node;
        /* update the number of procs in the job */
        jdata->num_procs += nprocs;

        /* empty the list - it is refilled for the next app_context */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        /* if the user requested a specific number of procs and
         * the total number of procs we were able to assign
         * doesn't equal the number requested, then we have a
         * problem
         */
        if (0 < app->num_procs && nprocs < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
                           app->app, app->num_procs,
                           "number of slots", nprocs,
                           "number of nodes", num_nodes);
            /* go through cleanup so the node list gets destructed */
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
        /* compute vpids and add proc objects to the job - this has to be
         * done after each app_context is mapped in order to keep the
         * vpids contiguous within an app_context
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }

cleanup:
    /* shared exit path: release anything left on the list and destruct it */
    while(NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
static int staged_mapper(orte_job_t *jdata)
{
    mca_base_component_t *c=&mca_rmaps_staged_component.base_version;
    int i, j, k, rc;
    orte_app_context_t *app;
    opal_list_t node_list, desired;
    orte_std_cntr_t num_slots;
    orte_proc_t *proc;
    orte_node_t *node, *next;
    bool work_to_do = false, first_pass = false;
    opal_list_item_t *item, *it2;
    char *cptr, **minimap;
    orte_vpid_t load;

    /* only use this mapper if it was specified */
    if (NULL == jdata->map->req_mapper ||
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name) ||
        ORTE_MAPPING_STAGED != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I wasn't specified */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:staged: job %s not using staged mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                        "%s mca:rmaps:staged: mapping job %s with %d procs",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid), (int)jdata->num_procs);
 
    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* if there are no nodes in the map, then this is our first
     * pass thru this job
     */
    if (0 == jdata->map->num_nodes) {
        first_pass = true;
    }

    /* we assume that the app_contexts are in priority order,
     * with the highest priority being the first entry in the
     * job's app_context array. Loop across the app_contexts
     * in order, looking for apps that have not been
     * fully mapped
     */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* has it been fully mapped? */
        if (ORTE_APP_STATE_ALL_MAPPED <= app->state) {
            continue;
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "%s mca:rmaps:staged: working app %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app);

        /* find nodes that meet any constraints provided in the form of
         * -hostfile or -host directives
         */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        /* get nodes based on a strict interpretation of the location hints */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                   jdata->map->mapping, false, true))) {
            /* we were unable to get any nodes that match those
             * specified in the app
             */
            if (ORTE_ERR_RESOURCE_BUSY == rc) {
                /* if the return is "busy", then at least one of the
                 * specified resources must exist, but no slots are
                 * currently available. This means there is at least
                 * a hope of eventually being able to map this app
                 * within its specified constraints, so continue working
                 */
                if (orte_soft_locations) {
                    /* if soft locations were given, then we know that
                     * none of the nodes in this allocation are available,
                     * so there is no point in continuing to check the
                     * remaining apps
                     */
                    while (NULL != (item = opal_list_remove_first(&node_list))) {
                        OBJ_RELEASE(item);
                    }
                    OBJ_DESTRUCT(&node_list);
                    goto complete;
                }
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s mca:rmaps:staged: all nodes for this app are currently busy",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                OBJ_DESTRUCT(&node_list);
                continue;
            } else {
                /* this indicates that there are no nodes that match
                 * the specified constraints, so there is no hope of
                 * ever being able to execute this app. This is an
                 * unrecoverable error - note that a return of
                 * "silent" means that the function already printed
                 * an error message, so the error_log will print nothing
                 */
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }

        /* if a max number of procs/node was given for this
         * app, remove all nodes from the list that exceed
         * that limit
         */
        if (0 < app->max_procs_per_node) {
            item = opal_list_get_first(&node_list);
            while (item != opal_list_get_end(&node_list)) {
                it2 = opal_list_get_next(item);
                node = (orte_node_t*)item;
                if (app->max_procs_per_node <= node->num_procs) {
                    opal_list_remove_item(&node_list, item);
                    OBJ_RELEASE(item);
                }
                item = it2;
            }
        }

        /* if we have no available nodes, then move on to next app */
        if (0 == opal_list_get_size(&node_list)) {
            OBJ_DESTRUCT(&node_list);
            continue;
        }

        /* if the app specified locations, soft or not, search the list of nodes
         * for those that match the requested locations and move those
         * to the desired list so we use them first
         */
        if (NULL != app->dash_host) {
            OBJ_CONSTRUCT(&desired, opal_list_t);
            /* no particular order is required */
            for (j=0; j < opal_argv_count(app->dash_host); j++) {
                minimap = opal_argv_split(app->dash_host[j], ',');
                for (k=0; k < opal_argv_count(minimap); k++) {
                    cptr = minimap[k];
                    for (item = opal_list_get_first(&node_list);
                         item != opal_list_get_end(&node_list);
                         item = opal_list_get_next(item)) {
                        node = (orte_node_t*)item;
                        if (0 == strcmp(node->name, cptr) ||
                            (0 == strcmp("localhost", cptr) &&
                             0 == strcmp(node->name, orte_process_info.nodename))) {
                            opal_list_remove_item(&node_list, item);
                            opal_list_append(&desired, item);
                            opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                                "%s mca:rmaps:staged: placing node %s on desired list",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                node->name);
                            break;
                        }
                    }
                }
                opal_argv_free(minimap);
            }
            /* if no nodes made the transition and the app specified soft
             * locations, then we can skip to look at the non-desired list
             */
            if (0 == opal_list_get_size(&desired)) {
                OBJ_DESTRUCT(&desired);
                if (orte_soft_locations) {
                    goto process;
                } else {
                    /* move on to next app */
                    continue;
                }
            }
            /* cycle thru the procs for this app and attempt to map them
             * to the desired nodes using a load-balancing algo
             */
            for (j=0; j < app->procs.size; j++) {
                if (NULL == (proc = opal_pointer_array_get_item(&app->procs, j))) {
                    continue;
                }
                if (ORTE_PROC_STATE_UNDEF != proc->state) {
                    /* this proc has already been mapped or executed */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s mca:rmaps:staged: proc %s has already been mapped",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&proc->name));
                    continue;
                }
                /* flag that there is at least one proc still to
                 * be executed
                 */
                work_to_do = true;
                /* track number mapped */
                jdata->num_mapped++;
                /* find the lightest-loaded node on the desired list */
                node = NULL;
                load = ORTE_VPID_MAX;
                for (item = opal_list_get_first(&desired);
                     item != opal_list_get_end(&desired);
                     item = opal_list_get_next(item)) {
                    next = (orte_node_t*)item;
                    if (next->num_procs < load) {
                        node = next;
                        load = next->num_procs;
                    }
                }
                /* put the proc there */
                proc->node = node;
                proc->nodename = node->name;
                /* the local rank is the number of procs
                 * on this node from this job - we don't
                 * directly track this number, so it must
                 * be found by looping across the node->procs
                 * array and counting it each time. For now,
                 * since we don't use this value in this mode
                 * of operation, just set it to something arbitrary
                 */
                proc->local_rank = node->num_procs;
                /* the node rank is simply the number of procs
                 * on the node at this time
                 */
                proc->node_rank = node->num_procs;
                /* track number of procs on node and number of slots used */
                node->num_procs++;
                node->slots_inuse++;
                opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                    "%s Proc %s on node %s: slots %d inuse %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name), node->name,
                                    (int)node->slots, (int)node->slots_inuse);
                if (node->slots_inuse == node->slots) {
                    opal_list_remove_item(&desired, &node->super);
                    OBJ_RELEASE(node);
                }
                if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(proc);
                    return rc;
                }
                /* retain the proc struct so that we correctly track its release */
                OBJ_RETAIN(proc);
                proc->state = ORTE_PROC_STATE_INIT;
                /* flag the proc as updated so it will be included
                 * in the next pidmap message
                 */
                proc->updated =true;
                /* add the node to the map, if needed */
                if (!node->mapped) {
                    if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    node->mapped = true;
                    OBJ_RETAIN(node);  /* maintain accounting on object */
                    jdata->map->num_nodes++;
                }
                if (0 == opal_list_get_size(&desired)) {
                    /* nothing more we can do */
                    break;
                }
            }
            /* clear the list */
            while (NULL != (item = opal_list_remove_first(&desired))) {
                OBJ_RELEASE(item);
            }
            OBJ_DESTRUCT(&desired);
        }

    process:
        for (j=0; j < app->procs.size; j++) {
            if (NULL == (proc = opal_pointer_array_get_item(&app->procs, j))) {
                continue;
            }
            if (ORTE_PROC_STATE_UNDEF != proc->state) {
                /* this proc has already been mapped or executed */
	        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
				    "%s mca:rmaps:staged: proc %s has already been mapped",
				    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
				    ORTE_NAME_PRINT(&proc->name));
                continue;
            }
            /* find the lightest-loaded node on the node list */
            node = NULL;
            load = ORTE_VPID_MAX;
            for (item = opal_list_get_first(&node_list);
                 item != opal_list_get_end(&node_list);
                 item = opal_list_get_next(item)) {
                next = (orte_node_t*)item;
                if (next->num_procs < load) {
                    node = next;
                    load = next->num_procs;
                }
            }
            /* flag that there is at least one proc still to
             * be executed
             */
            work_to_do = true;
            /* track number mapped */
            jdata->num_mapped++;
            /* map this proc to the first available slot */
            OBJ_RETAIN(node);  /* maintain accounting on object */    
	    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
				"%s mca:rmaps:staged: assigning proc %s to node %s",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
				ORTE_NAME_PRINT(&proc->name), node->name);
            proc->node = node;
            proc->nodename = node->name;
	    /* the local rank is the number of procs
	     * on this node from this job - we don't
	     * directly track this number, so it must
	     * be found by looping across the node->procs
	     * array and counting it each time. For now,
	     * since we don't use this value in this mode
	     * of operation, just set it to something arbitrary
	     */
	    proc->local_rank = node->num_procs;
	    /* the node rank is simply the number of procs
	     * on the node at this time
	     */
	    proc->node_rank = node->num_procs;
	    /* track number of procs on node and number of slots used */
            node->num_procs++;
            node->slots_inuse++;
            opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                "%s Proc %s on node %s: slots %d inuse %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name), node->name,
                                (int)node->slots, (int)node->slots_inuse);
            if (node->slots_inuse == node->slots) {
                opal_list_remove_item(&node_list, &node->super);
                OBJ_RELEASE(node);
            }
            if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(proc);
                return rc;
            }
            /* retain the proc struct so that we correctly track its release */
            OBJ_RETAIN(proc);
            proc->state = ORTE_PROC_STATE_INIT;
            /* flag the proc as updated so it will be included
             * in the next pidmap message
             */
            proc->updated =true;
            /* add the node to the map, if needed */
            if (!node->mapped) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                node->mapped = true;
                OBJ_RETAIN(node);  /* maintain accounting on object */
                jdata->map->num_nodes++;
            }
            if (0 == opal_list_get_size(&node_list)) {
                /* nothing more we can do */
                break;
            }
        }
	/* clear the list */
	while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
	}
	OBJ_DESTRUCT(&node_list);
    }

 complete:
    /* if there isn't at least one proc that can be launched,
     * then indicate that we don't need to proceed with the
     * launch sequence
     */
    if (!work_to_do) {
        return ORTE_ERR_RESOURCE_BUSY;
    }
 
    /* flag that the job was updated so it will be
     * included in the pidmap message
     */
    jdata->updated = true;

    /* if we successfully mapped ALL procs in the first pass,
     * then this job is capable of supporting MPI procs
     */
    if (first_pass && jdata->num_mapped == jdata->num_procs) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "%s mca:rmaps:staged: job %s is MPI-capable",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_JOBID_PRINT(jdata->jobid));
        jdata->gang_launched = true;
    }

    return ORTE_SUCCESS;
}