Example #1
0
static int bind_upwards(orte_job_t *jdata,
                        orte_node_t *node,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale;
    char *cpu_bitmap;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;


    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* bozo check */
        if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
            return ORTE_ERR_SILENT;
        }
        /* starting at the locale, move up thru the parents
         * to find the target object type
         */
        cpu_bitmap = NULL;
        for (obj = locale->parent; NULL != obj; obj = obj->parent) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind:upward target %s type %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                hwloc_obj_type_string(target),
                                hwloc_obj_type_string(obj->type));
            if (target == obj->type) {
                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                    continue;
                }
                /* get its index */
                if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
                    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                    return ORTE_ERR_SILENT;
                }
                /* track the number bound */
                data = (opal_hwloc_obj_data_t*)obj->userdata;
                data->num_bound++;
                /* get the number of cpus under this location */
                if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                    return ORTE_ERR_SILENT;
                }
                /* error out if adding a proc would cause overload and that wasn't allowed,
                 * and it wasn't a default binding policy (i.e., the user requested it)
                 */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                    if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                        /* if the user specified a binding policy, then we cannot meet
                         * it since overload isn't allowed, so error out - have the
                         * message indicate that setting overload allowed will remove
                         * this restriction */
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                       opal_hwloc_base_print_binding(map->binding), node->name,
                                       data->num_bound, ncpus);
                        return ORTE_ERR_SILENT;
                    } else {
                        /* if we have the default binding policy, then just don't bind */
                        OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                        unbind_procs(jdata);
                        return ORTE_SUCCESS;
                    }
                }
                /* bind it here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
                orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
                /* record the location */
                orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name),
                                    cpu_bitmap,
                                    hwloc_obj_type_string(target),
                                    idx, node->name);
                break;
            }
        }
        if (NULL == cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
            /* didn't find anyone to bind to - this is an error
             * unless the user specified if-supported
             */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
                           opal_hwloc_base_print_binding(map->binding), node->name);
            return ORTE_ERR_SILENT;
        }
        if (NULL != cpu_bitmap) {
            free(cpu_bitmap);
        }
    }

    return ORTE_SUCCESS;
}
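
/* Editorial sketch (not from the original source): the function above
 * depends on hwloc's parent links to climb the topology. A minimal,
 * self-contained version of the same upward search against the public
 * hwloc API might look like the following; the helper name and the
 * omission of the cache-depth check are assumptions made for brevity.
 */
#include <hwloc.h>

static int bind_self_upwards(hwloc_topology_t topo,
                             hwloc_obj_t locale,
                             hwloc_obj_type_t target)
{
    hwloc_obj_t obj;

    /* climb the parent chain until an object of the target type appears */
    for (obj = locale->parent; NULL != obj; obj = obj->parent) {
        if (target == obj->type) {
            /* bind the calling process to everything under that object */
            return hwloc_set_cpubind(topo, obj->cpuset, HWLOC_CPUBIND_PROCESS);
        }
    }
    return -1;   /* no ancestor of the requested type exists */
}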
Example #2
0
static int setup_fork(orte_job_t *jdata,
                      orte_app_context_t *app)
{
    int i;
    char *param;
    bool oversubscribed;
    orte_node_t *node;
    char **envcpy, **nps, **firstranks;
    char *npstring, *firstrankstring;
    char *num_app_ctx;
    bool takeus = false;
    orte_app_context_t* tmp_app;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: setup_fork",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* if no personality was specified, then nothing to do */
    if (NULL == jdata->personality) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included */
        for (i=0; NULL != jdata->personality[i]; i++) {
            if (0 == strcmp(jdata->personality[i], "ompi")) {
                takeus = true;
                break;
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* see if the mapper thinks we are oversubscribed */
    oversubscribed = false;
    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_OVERSUBSCRIBED)) {
        oversubscribed = true;
    }

    /* setup base environment: copy the current environ and merge
       in the app context environ */
    if (NULL != app->env) {
        /* manually free original context->env to avoid a memory leak */
        char **tmp = app->env;
        envcpy = opal_environ_merge(orte_launch_environ, app->env);
        if (NULL != tmp) {
            opal_argv_free(tmp);
        }
    } else {
        envcpy = opal_argv_copy(orte_launch_environ);
    }
    app->env = envcpy;

    /* special case handling for --prefix: this is somewhat icky,
       but at least some users do this.  :-\ It is possible that
       when using --prefix, the user will also "-x PATH" and/or
       "-x LD_LIBRARY_PATH", which would therefore clobber the
       work that was done in the prior pls to ensure that we have
       the prefix at the beginning of the PATH and
       LD_LIBRARY_PATH.  So examine the context->env and see if we
       find PATH or LD_LIBRARY_PATH.  If found, that means the
       prior work was clobbered, and we need to re-prefix those
       variables. */
    param = NULL;
    orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&param, OPAL_STRING);
    for (i = 0; NULL != param && NULL != app->env && NULL != app->env[i]; ++i) {
        char *newenv;

        /* Reset PATH */
        if (0 == strncmp("PATH=", app->env[i], 5)) {
            asprintf(&newenv, "%s/bin:%s", param, app->env[i] + 5);
            opal_setenv("PATH", newenv, true, &app->env);
            free(newenv);
        }

        /* Reset LD_LIBRARY_PATH */
        else if (0 == strncmp("LD_LIBRARY_PATH=", app->env[i], 16)) {
            asprintf(&newenv, "%s/lib:%s", param, app->env[i] + 16);
            opal_setenv("LD_LIBRARY_PATH", newenv, true, &app->env);
            free(newenv);
        }
    }
    if (NULL != param) {
        free(param);
    }

    /* pass my contact info to the local proc so we can talk */
    opal_setenv("OMPI_MCA_orte_local_daemon_uri", orte_process_info.my_daemon_uri, true, &app->env);

    /* pass the hnp's contact info to the local proc in case it
     * needs it
     */
    if (NULL != orte_process_info.my_hnp_uri) {
        opal_setenv("OMPI_MCA_orte_hnp_uri", orte_process_info.my_hnp_uri, true, &app->env);
    }

    /* setup yield schedule - do not override any user-supplied directive! */
    if (oversubscribed) {
        opal_setenv("OMPI_MCA_mpi_yield_when_idle", "1", false, &app->env);
    } else {
        opal_setenv("OMPI_MCA_mpi_yield_when_idle", "0", false, &app->env);
    }

    /* set the app_context number into the environment */
    asprintf(&param, "%ld", (long)app->idx);
    opal_setenv("OMPI_MCA_orte_app_num", param, true, &app->env);
    free(param);

    /* although the total_slots_alloc is the universe size, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here. Also required by the ompi_attributes code!
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    asprintf(&param, "%ld", (long)jdata->total_slots_alloc);
    opal_setenv("OMPI_UNIVERSE_SIZE", param, true, &app->env);
    free(param);

    /* pass the number of nodes involved in this job */
    asprintf(&param, "%ld", (long)(jdata->map->num_nodes));
    opal_setenv("OMPI_MCA_orte_num_nodes", param, true, &app->env);
    free(param);

    /* pass a param telling the child what type and model of cpu we are on,
     * if we know it. If hwloc has the value, use what it knows. Otherwise,
     * see if we were explicitly given it and use that value.
     */
    hwloc_obj_t obj;
    char *htmp;
    if (NULL != opal_hwloc_topology) {
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUType")) ||
            NULL != (htmp = orte_local_cpu_type)) {
            opal_setenv("OMPI_MCA_orte_cpu_type", htmp, true, &app->env);
        }
        if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUModel")) ||
            NULL != (htmp = orte_local_cpu_model)) {
            opal_setenv("OMPI_MCA_orte_cpu_model", htmp, true, &app->env);
        }
    } else {
        if (NULL != orte_local_cpu_type) {
            opal_setenv("OMPI_MCA_orte_cpu_type", orte_local_cpu_type, true, &app->env);
        }
        if (NULL != orte_local_cpu_model) {
            opal_setenv("OMPI_MCA_orte_cpu_model", orte_local_cpu_model, true, &app->env);
        }
    }

    /* get shmem's best component name so we can provide a hint to the shmem
     * framework. the idea here is to have someone figure out what component to
     * select (via the shmem framework) and then have the rest of the
     * components in shmem obey that decision. for more details take a look at
     * the shmem framework in opal.
     */
    if (NULL != (param = opal_shmem_base_best_runnable_component_name())) {
        opal_setenv("OMPI_MCA_shmem_RUNTIME_QUERY_hint", param, true, &app->env);
        free(param);
    }

    /* Set an info MCA param that tells the launched processes that
     * any binding policy was applied by us (e.g., so that
     * MPI_INIT doesn't try to bind itself)
     */
    opal_setenv("OMPI_MCA_orte_bound_at_launch", "1", true, &app->env);

    /* tell the ESS to avoid the singleton component - but don't override
     * anything that may have been provided elsewhere
     */
    opal_setenv("OMPI_MCA_ess", "^singleton", false, &app->env);

    /* ensure that the spawned process ignores direct launch components,
     * but do not override anything we were given */
    opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray", false, &app->env);

    /* since we want to pass the name as separate components, make sure
     * that the "name" environmental variable is cleared!
     */
    opal_unsetenv("OMPI_MCA_orte_ess_name", &app->env);

    asprintf(&param, "%ld", (long)jdata->num_procs);
    opal_setenv("OMPI_MCA_orte_ess_num_procs", param, true, &app->env);

    /* although the num_procs is the comm_world size, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_SIZE", param, true, &app->env);
    free(param);

    /* users would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    asprintf(&param, "%ld", (long)jdata->num_local_procs);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_SIZE", param, true, &app->env);
    free(param);

    /* forcibly set the local tmpdir base and top session dir to match ours */
    opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env);
    /* TODO: should we use PMIx key to pass this data? */
    opal_setenv("OMPI_MCA_orte_top_session_dir", orte_process_info.top_session_dir, true, &app->env);
    opal_setenv("OMPI_MCA_orte_jobfam_session_dir", orte_process_info.jobfam_session_dir, true, &app->env);

    /* MPI-3 requires we provide some further info to the procs,
     * so we pass them as envars to avoid introducing further
     * ORTE calls in the MPI layer
     */
    asprintf(&num_app_ctx, "%lu", (unsigned long)jdata->num_apps);

    /* build some common envars we need to pass for MPI-3 compatibility */
    nps = NULL;
    firstranks = NULL;
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (tmp_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_argv_append_nosize(&nps, ORTE_VPID_PRINT(tmp_app->num_procs));
        opal_argv_append_nosize(&firstranks, ORTE_VPID_PRINT(tmp_app->first_rank));
    }
    npstring = opal_argv_join(nps, ' ');
    firstrankstring = opal_argv_join(firstranks, ' ');
    opal_argv_free(nps);
    opal_argv_free(firstranks);

    /* add the MPI-3 envars */
    opal_setenv("OMPI_NUM_APP_CTX", num_app_ctx, true, &app->env);
    opal_setenv("OMPI_FIRST_RANKS", firstrankstring, true, &app->env);
    opal_setenv("OMPI_APP_CTX_NUM_PROCS", npstring, true, &app->env);
    free(num_app_ctx);
    free(firstrankstring);
    free(npstring);
    return ORTE_SUCCESS;
}
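
/* Editorial sketch (not from the original source): the --prefix logic
 * above re-prefixes PATH so the Open MPI installation's bin directory
 * wins. The same rewrite reduced to plain libc, assuming a non-NULL
 * prefix string:
 */
#define _GNU_SOURCE   /* for asprintf */
#include <stdio.h>
#include <stdlib.h>

static void reprefix_path(const char *prefix)
{
    const char *old = getenv("PATH");
    char *newpath;

    if (NULL == old) {
        return;
    }
    /* prepend <prefix>/bin while keeping the original value behind it */
    if (0 <= asprintf(&newpath, "%s/bin:%s", prefix, old)) {
        setenv("PATH", newpath, 1);   /* overwrite, like opal_setenv(..., true, ...) */
        free(newpath);
    }
}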
Example #3
0
/* recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr
 */
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;
    int nprocs;
    hwloc_cpuset_t avail, cpus, childcpus;
    int n, limit, nmax, nunder, idx, idxmax = 0;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: pruning level %d",
                        *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        --(*level);
        prune(jobid, app_idx, node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        /* get the available cpuset */
        avail = opal_hwloc_base_get_available_cpus(node->topology, obj);

        /* look at the intersection of this object's cpuset and that
         * of each proc in the job/app - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            if (proc->name.jobid != jobid ||
                proc->app_idx != app_idx) {
                continue;
            }
            locale = NULL;
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return;
            }
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            if (hwloc_bitmap_intersects(avail, cpus)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this is a starting point
             */

            /* find the first level that has more than
             * one child beneath it - if all levels
             * have only one child, then return this
             * object
             */
            top = find_split(node->topology, obj);
            hwloc_obj_type_snprintf(dang, 64, top, 1);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                childcpus = opal_hwloc_base_get_available_cpus(node->topology, top->children[k]);
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    if (proc->name.jobid != jobid ||
                        proc->app_idx != app_idx) {
                        continue;
                    }
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        return;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
                    if (hwloc_bitmap_intersects(childcpus, cpus)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            node->slots_inuse--;
            if (node->slots_inuse < 0) {
                node->slots_inuse = 0;
            }
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }
    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    --(*level);
    prune(jobid, app_idx, node, level, nmapped);
    return;

 error:
    opal_output(0, "INFINITE LOOP");
}
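
/* Editorial sketch (not from the original source): find_split() is not
 * shown above; per the comment it locates the first descendant with
 * more than one child so procs can be removed round-robin across that
 * branch. A plausible stand-in using public hwloc fields (the descent
 * strategy is an assumption, not the original implementation):
 */
#include <hwloc.h>

static hwloc_obj_t find_split_sketch(hwloc_obj_t obj)
{
    /* walk down single-child chains; stop at the first real branch */
    while (1 == obj->arity) {
        obj = obj->children[0];
    }
    /* arity > 1 here, or 0 if the chain ended at a leaf */
    return obj;
}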
Example #4
0
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t n;
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char *nodelist_flat;
    char **nodelist_argv;
    char *name_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    int proc_vpid_index;
    bool failed_launch=true;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:slurm: LAUNCH DAEMONS CALLED",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:slurm: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* need integer value for command line parameter */
    asprintf(&jobid_string, "%lu", (unsigned long) daemons->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * SLURM srun OPTIONS
     */

    /* add the srun command */
    opal_argv_append(&argc, &argv, "srun");

    /* start one orted on each node */
    opal_argv_append(&argc, &argv, "--ntasks-per-node=1");

    /* alert us if any orteds die during startup */
    opal_argv_append(&argc, &argv, "--kill-on-bad-exit");

    /* ensure the orteds are not bound to a single processor,
     * just in case the TaskAffinity option is set by default.
     * This will *not* release the orteds from any cpu-set
     * constraint, but will ensure it doesn't get
     * bound to only one processor
     */
    opal_argv_append(&argc, &argv, "--cpu_bind=none");

    /* Append user defined arguments to srun */
    if ( NULL != mca_plm_slurm_component.custom_args ) {
        custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
        num_args       = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* create nodelist */
    nodelist_argv = NULL;

    for (n=0; n < map->nodes->size; n++ ) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
            continue;
        }
        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }

        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append_nosize(&nodelist_argv, node->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);

    /* if we are using all allocated nodes, then srun doesn't
     * require any further arguments
     */
    if (map->num_new_daemons < orte_num_allocated_nodes) {
        asprintf(&tmp, "--nodes=%lu", (unsigned long)map->num_new_daemons);
        opal_argv_append(&argc, &argv, tmp);
        free(tmp);

        asprintf(&tmp, "--nodelist=%s", nodelist_flat);
        opal_argv_append(&argc, &argv, tmp);
        free(tmp);
    }

    /* tell srun how many tasks to run */
    asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    OPAL_OUTPUT_VERBOSE((2, orte_plm_base_framework.framework_output,
                         "%s plm:slurm: launching on nodes %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));

    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv,
                                          "slurm", &proc_vpid_index,
                                          nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_slurm: unable to get daemon vpid as string");
        goto cleanup;
    }

    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(name_string);
    free(name_string);

    /* Copy the prefix-directory specified in the
       corresponding app_context.  If there are multiple,
       different prefix's in the app context, complain (i.e., only
       allow one --prefix option for the entire slurm run -- we
       don't support different --prefix'es for different nodes in
       the SLURM plm) */
    cur_prefix = NULL;
    for (n=0; n < state->jdata->apps->size; n++) {
        char * app_prefix_dir;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) {
            continue;
        }
        app_prefix_dir = NULL;
        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
        /* Check for already set cur_prefix_dir -- if different,
           complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp (cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                goto cleanup;
            }

            /* If not yet set, copy it; iff set, then it's the
             * same anyway
             */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                     "%s plm:slurm: Set prefix:%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     cur_prefix));
            }
            free(app_prefix_dir);
        }
    }

    /* protect the args in case someone has a script wrapper around srun */
    mca_base_cmd_line_wrap_args(argv);

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:slurm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }

    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }

    if(NULL != jobid_string) {
        free(jobid_string);
    }

    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
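
/* Editorial sketch (not from the original source): for two new daemons
 * the argv assembled above would resemble
 *
 *   srun --ntasks-per-node=1 --kill-on-bad-exit --cpu_bind=none
 *        --nodes=2 --nodelist=node01,node02 --ntasks=2 orted ...
 *
 * (node names invented for the example). The append-and-NULL-terminate
 * pattern behind opal_argv_append() reduces to plain C roughly as:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* grow a NULL-terminated argv by one copied string */
static void argv_append(int *argc, char ***argv, const char *arg)
{
    *argv = realloc(*argv, (*argc + 2) * sizeof(char *));
    (*argv)[*argc] = strdup(arg);
    (*argv)[*argc + 1] = NULL;   /* keep the vector exec()-ready */
    (*argc)++;
}

int main(void)
{
    int argc = 0;
    char **argv = NULL;

    argv_append(&argc, &argv, "srun");
    argv_append(&argc, &argv, "--ntasks-per-node=1");
    argv_append(&argc, &argv, "--kill-on-bad-exit");
    for (int i = 0; i < argc; i++) {
        printf("%s\n", argv[i]);
    }
    return 0;
}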
Example #5
0
static void track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata, *pptr;
    opal_buffer_t *alert;
    int rc, i;
    orte_plm_cmd_flag_t cmd;

    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                         "%s state:orted:track_procs called for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    if (NULL == (pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        /* don't update until we are told that all are done */
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_local_procs) {
            /* once everyone registers, notify the HNP */

            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted: notifying HNP all local registered",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            alert = OBJ_NEW(opal_buffer_t);
            /* pack registered command */
            cmd = ORTE_PLM_REGISTERED_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack all the local child vpids */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (pptr->name.jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                }
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        /* Release the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         * Do this after we handle termination in case the IOF needs
         * to check to see if all procs from the job are actually terminated
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* if this proc has not already recorded as terminated, then
         * update the accounting here */
        if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            jdata->num_terminated++;
        }
        /* update the proc state */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED);
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = state;
        /* Clean up the session directory as if we were the process
         * itself.  This covers the case where the process died abnormally
         * and didn't cleanup its own session directory.
         */
        orte_session_dir_finalize(proc);
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        if (orte_orteds_term_ordered &&
            0 == orte_routed.num_routes()) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                    ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                         "%s state:orted all routes gone but proc %s still alive",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&pdata->name)));
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* track job status */
        if (jdata->num_terminated == jdata->num_local_procs &&
            !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) {
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            alert = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the job info */
            if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
                ORTE_ERROR_LOG(rc);
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid)));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
            /* mark that we sent it so we ensure we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
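
/* Editorial sketch (not from the original source): TERMINATED above is
 * only activated once both the IOF-complete and waitpid-fired events
 * have arrived, in either order, and at most once. The rendezvous is
 * just a flag check (names invented for the example):
 */
#include <stdio.h>

#define FLAG_IOF_COMPLETE  0x1u
#define FLAG_WAITPID_FIRED 0x2u
#define FLAG_RECORDED      0x4u

static void on_event(unsigned *flags, unsigned flag)
{
    *flags |= flag;
    /* fire termination exactly once, when both precursors are set */
    if ((*flags & FLAG_IOF_COMPLETE) && (*flags & FLAG_WAITPID_FIRED) &&
        !(*flags & FLAG_RECORDED)) {
        *flags |= FLAG_RECORDED;
        printf("proc terminated\n");
    }
}

int main(void)
{
    unsigned flags = 0;
    on_event(&flags, FLAG_WAITPID_FIRED);   /* order does not matter */
    on_event(&flags, FLAG_IOF_COMPLETE);    /* termination fires here */
    return 0;
}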
Example #6
0
/*****************
 * Local Functions
 *****************/
static void notify_job_errors(orte_job_t *jdata, orte_proc_t *proc)
{
    int command = ORTE_DAEMON_ABORT_PROCS_CALLED;
    int rc;
    opal_buffer_t *buf;

    if (ORTE_JOB_STATE_ANY == jdata->state) {
        goto CLEANUP;
    }

    buf = OBJ_NEW(opal_buffer_t);
    /* pack the complete command flag */
    if (OPAL_SUCCESS !=
        (rc = opal_dss.pack(buf, &command, 1, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* pack the job */
    if (OPAL_SUCCESS !=
        (rc = opal_dss.pack(buf, &jdata, 1, ORTE_JOB))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* pack the proc name */
    if (OPAL_SUCCESS !=
        (rc = opal_dss.pack(buf, &proc, 1, ORTE_PROC))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }
    /* pack the proc state */
    if (OPAL_SUCCESS !=
        (rc = opal_dss.pack(buf, &proc->state, 1, ORTE_PROC_STATE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* pack the proc exit code */
    if (OPAL_SUCCESS !=
        (rc = opal_dss.pack(buf, &proc->exit_code, 1, ORTE_EXIT_CODE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* pack the node name */
    if (OPAL_SUCCESS !=
        (rc = opal_dss.pack(buf, &proc->node->name, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }


    OBJ_RETAIN(jdata);
    if (ORTE_SUCCESS !=
        (rc = orte_rml.send_buffer_nb(&jdata->originator, buf,
                     ORTE_RML_TAG_TOOL,
                     orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }


    OPAL_OUTPUT_VERBOSE((0, orte_state_base_framework.framework_output,
                        "%s state:orcmsd:notify job error %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid)));

    /* if this job is a continuously operating one, then don't do
     * anything further - just return here
     */
    if (NULL != jdata &&
        (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
         ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) {
        /* set state to ANY */
        jdata->state = ORTE_JOB_STATE_ANY;
    } else {
        /* give us a chance to stop the orcmsds */
        orte_plm.terminate_orteds();
    }

CLEANUP:
    return;
}
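
/* Editorial sketch (not from the original source): the OBJ_RETAIN just
 * before the non-blocking send above keeps jdata alive until the send
 * callback runs. The reference-counting discipline behind it, reduced
 * to plain C (type and helper names invented):
 */
#include <stdlib.h>

typedef struct { int refcount; } object_t;

static void obj_retain(object_t *o)
{
    o->refcount++;            /* a new holder of the object */
}

static void obj_release(object_t *o)
{
    if (0 == --o->refcount) {
        free(o);              /* last reference gone - destroy */
    }
}

static void send_complete_cb(object_t *o)
{
    obj_release(o);           /* the async layer drops its reference */
}

int main(void)
{
    object_t *o = calloc(1, sizeof(*o));
    o->refcount = 1;          /* caller's own reference */

    obj_retain(o);            /* hand a reference to the async send */
    send_complete_cb(o);      /* ...released when the send completes */
    obj_release(o);           /* caller done - object freed here */
    return 0;
}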
Example #7
0
/*
 * JOB
 * NOTE: We do not pack all of the job object's fields as many of them have no
 * value in sending them to another location. The only purpose in packing and
 * sending a job object is to communicate the data required to dynamically
 * spawn another job - so we only pack that limited set of required data.
 * Therefore, only unpack what was packed
 */
int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
                       int32_t *num_vals, opal_data_type_t type)
{
    int rc;
    int32_t i, k, n, count, bookmark;
    orte_job_t **jobs;
    orte_app_idx_t j;
    orte_attribute_t *kv;
    char *tmp;
    opal_value_t *val;
    opal_list_t *cache;

    /* unpack into array of orte_job_t objects */
    jobs = (orte_job_t**) dest;
    for (i=0; i < *num_vals; i++) {

        /* create the orte_job_t object */
        jobs[i] = OBJ_NEW(orte_job_t);
        if (NULL == jobs[i]) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* unpack the jobid */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                &(jobs[i]->jobid), &n, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* unpack the flags */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(jobs[i]->flags)), &n, ORTE_JOB_FLAGS_T))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the attributes */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
                                                         &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        for (k=0; k < count; k++) {
            n=1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv,
                                                             &n, ORTE_ATTRIBUTE))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            kv->local = ORTE_ATTR_GLOBAL;  // obviously not a local value
            opal_list_append(&jobs[i]->attributes, &kv->super);
        }
        /* unpack any job info */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
                                                         &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (0 < count){
            cache = OBJ_NEW(opal_list_t);
            orte_set_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR);
            for (k=0; k < count; k++) {
                n=1;
                if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &val,
                                                                 &n, OPAL_VALUE))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                opal_list_append(cache, &val->super);
            }
        }

        /* unpack the personality */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        for (k=0; k < count; k++) {
            n=1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &tmp, &n, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            opal_argv_append_nosize(&jobs[i]->personality, tmp);
            free(tmp);
        }

        /* unpack the num apps */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                (&(jobs[i]->num_apps)), &n, ORTE_APP_IDX))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* if there are apps, unpack them */
        if (0 < jobs[i]->num_apps) {
            orte_app_context_t *app;
            for (j=0; j < jobs[i]->num_apps; j++) {
                n = 1;
                if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                               &app, &n, ORTE_APP_CONTEXT))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                opal_pointer_array_add(jobs[i]->apps, app);
            }
        }

        /* unpack num procs and offset */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                (&(jobs[i]->num_procs)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                (&(jobs[i]->offset)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        if (0 < jobs[i]->num_procs) {
            /* check attributes to see if this job was fully
             * described in the launch msg */
            if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
                orte_proc_t *proc;
                for (j=0; j < jobs[i]->num_procs; j++) {
                    n = 1;
                    if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                   &proc, &n, ORTE_PROC))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    opal_pointer_array_add(jobs[i]->procs, proc);
                }
            }
        }

        /* unpack stdin target */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                            (&(jobs[i]->stdin_target)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the total slots allocated to the job */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(jobs[i]->total_slots_alloc)), &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* if the map is NULL, then we didn't pack it as there was
         * nothing to pack. Instead, we packed a flag to indicate whether or not
         * the map is included */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                            &j, &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (0 < j) {
            /* unpack the map */
            n = 1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                            (&(jobs[i]->map)), &n, ORTE_JOB_MAP))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }

        /* unpack the bookmark */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                            &bookmark, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (0 <= bookmark) {
            /* retrieve it */
            jobs[i]->bookmark = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, bookmark);
        }

        /* unpack the job state */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(jobs[i]->state)), &n, ORTE_JOB_STATE))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
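
/* Editorial sketch (not from the original source): each variable-length
 * field above travels as a count followed by that many items. The same
 * length-prefixed pattern over a raw byte buffer, with invented helper
 * names:
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* read one int32 from the buffer, advancing the cursor */
static int unpack_i32(const uint8_t **cur, const uint8_t *end, int32_t *out)
{
    if ((ptrdiff_t)sizeof(int32_t) > end - *cur) {
        return -1;   /* truncated buffer */
    }
    memcpy(out, *cur, sizeof(int32_t));
    *cur += sizeof(int32_t);
    return 0;
}

/* unpack a count, then that many values - mirrors the loops above */
static int32_t unpack_i32_list(const uint8_t **cur, const uint8_t *end,
                               int32_t *vals, int32_t max)
{
    int32_t count, k;

    if (0 != unpack_i32(cur, end, &count) || count < 0 || count > max) {
        return -1;
    }
    for (k = 0; k < count; k++) {
        if (0 != unpack_i32(cur, end, &vals[k])) {
            return -1;   /* fail fast, like the ORTE_ERROR_LOG paths */
        }
    }
    return count;
}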
Example #8
0
static int rank_fill(orte_job_t *jdata,
                     hwloc_obj_type_t target,
                     unsigned cache_level)
{
    orte_app_context_t *app;
    hwloc_obj_t obj;
    int num_objs, i, j, m, n, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc, *pptr;
    orte_vpid_t vpid;
    int cnt;
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rank_fill: for job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* if the ranking is fill, then we rank all the procs
     * within a given object before moving on to the next
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 1       4 5         8 9      12 13
     *     2 3       6 7        10 11     14 15
     */

    vpid = 0;
    for (n=0; n < jdata->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
            continue;
        }

        cnt = 0;
        for (m=0; m < jdata->map->nodes->size; m++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
                continue;
            }
            /* get the number of objects - only consider those we can actually use */
            num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target,
                                                          cache_level, OPAL_HWLOC_AVAILABLE);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rank_fill: found %d objects on node %s with %d procs",
                                num_objs, node->name, (int)node->num_procs);
            if (0 == num_objs) {
                return ORTE_ERR_NOT_SUPPORTED;
            }

            /* for each object */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target,
                                                      cache_level, i, OPAL_HWLOC_AVAILABLE);

                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:rank_fill: working object %d", i);

                /* cycle thru the procs on this node */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already assigned */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                     /* protect against bozo case */
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERROR);
                        return ORTE_ERROR;
                    }
                    /* ignore procs not on this object */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_fill: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid));
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;

                    /* insert the proc into the jdata array */
                    if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
                        OBJ_RELEASE(pptr);
                    }
                    OBJ_RETAIN(proc);
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
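
/* Editorial sketch (not from the original source): the diagram in the
 * comment above is "fill" ranking of 16 procs over 2 nodes x 2 objects
 * x 4 slots. A tiny standalone check of that ordering:
 */
#include <stdio.h>

int main(void)
{
    int vpid = 0;

    for (int node = 0; node < 2; node++) {
        for (int obj = 0; obj < 2; obj++) {
            /* fill: exhaust every slot under an object before moving on */
            for (int slot = 0; slot < 4; slot++) {
                printf("node %d obj %d -> rank %2d\n", node, obj, vpid++);
            }
        }
    }
    return 0;   /* prints ranks 0-3 under node0/obj0, 4-7 under obj1, ... */
}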
Example #9
0
/* we cannot use the RML to communicate with SLURM as it doesn't
 * understand our internal protocol, so we have to do a bare-bones
 * exchange based on sockets
 */
static int dyn_allocate(orte_job_t *jdata)
{
    char *cmd_str, **cmd=NULL, *tmp, *jstring;
    char *node_list;
    orte_app_context_t *app;
    int i;
    struct timeval tv;
    local_jobtracker_t *jtrk;
    int64_t i64, *i64ptr;

    if (NULL == mca_ras_slurm_component.config_file) {
        opal_output(0, "Cannot perform dynamic allocation as no Slurm configuration file provided");
        return ORTE_ERR_NOT_FOUND;
    }

    /* track this request */
    jtrk = OBJ_NEW(local_jobtracker_t);
    jtrk->jobid = jdata->jobid;
    opal_list_append(&jobs, &jtrk->super);

    /* construct the command - note that the jdata structure contains
     * a field for the minimum number of nodes required for the job.
     * The node list can be constructed from the union of all the nodes
     * contained in the dash_host field of the app_contexts. So you'll
     * need to do a little work to build the command. We don't currently
     * have a field in the jdata structure for "mandatory" vs "optional"
     * allocations, so we'll have to add that someday. Likewise, you may
     * want to provide a param to adjust the timeout value
     */
    /* construct the cmd string */
    opal_argv_append_nosize(&cmd, "allocate");
    /* add the jobid */
    orte_util_convert_jobid_to_string(&jstring, jdata->jobid);
    asprintf(&tmp, "jobid=%s", jstring);
    opal_argv_append_nosize(&cmd, tmp);
    free(tmp);
    free(jstring);
    /* if we want the allocation for all apps in one shot,
     * then tell slurm
     *
     * RHC: we don't currently have the ability to handle
     * rolling allocations in the rest of the code base
     */
#if 0
    if (!mca_ras_slurm_component.rolling_alloc) {
        opal_argv_append_nosize(&cmd, "return=all");
    }
#else
    opal_argv_append_nosize(&cmd, "return=all");
#endif

    /* pass the timeout */
    asprintf(&tmp, "timeout=%d", mca_ras_slurm_component.timeout);
    opal_argv_append_nosize(&cmd, tmp);
    free(tmp);

    /* for each app, add its allocation request info */
    i64ptr = &i64;
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* add the app id, preceded by a colon separator */
        asprintf(&tmp, ": app=%d", (int)app->idx);
        opal_argv_append_nosize(&cmd, tmp);
        free(tmp);
        /* add the number of process "slots" we need */
        asprintf(&tmp, "np=%d", app->num_procs);
        opal_argv_append_nosize(&cmd, tmp);
        free(tmp);
        /* if we were given a minimum number of nodes, pass it along */
        if (orte_get_attribute(&app->attributes, ORTE_APP_MIN_NODES, (void**)&i64ptr, OPAL_INT64)) {
            asprintf(&tmp, "N=%ld", (long int)i64);
            opal_argv_append_nosize(&cmd, tmp);
            free(tmp);
        }
        /* add the list of nodes, if one was given, ensuring
         * that each node only appears once
         */
        node_list = get_node_list(app);
        if (NULL != node_list) {
            asprintf(&tmp, "node_list=%s", node_list);
            opal_argv_append_nosize(&cmd, tmp);
            free(node_list);
            free(tmp);
        }
        /* add the mandatory/optional flag */
        if (orte_get_attribute(&app->attributes, ORTE_APP_MANDATORY, NULL, OPAL_BOOL)) {
            opal_argv_append_nosize(&cmd, "flag=mandatory");
        } else {
            opal_argv_append_nosize(&cmd, "flag=optional");
        }
    }

    /* assemble it into the final cmd to be sent */
    cmd_str = opal_argv_join(cmd, ' ');
    opal_argv_free(cmd);

    /* start a timer - if the response to our request doesn't appear
     * in the defined time, then we will error out as Slurm isn't
     * responding to us
     */
    opal_event_evtimer_set(orte_event_base, &jtrk->timeout_ev, timeout, jtrk);
    tv.tv_sec = mca_ras_slurm_component.timeout * 2;
    tv.tv_usec = 0;
    opal_event_evtimer_add(&jtrk->timeout_ev, &tv);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s slurm:dynalloc cmd_str = %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        cmd_str);

    if (send(socket_fd, cmd_str, strlen(cmd_str)+1, 0) < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
    }
    free(cmd_str);

    /* we cannot wait here for a response as we
     * are already in an event. So return a value
     * that indicates we are waiting for an
     * allocation so the base functions know
     * that they shouldn't progress the job
     */
    return ORTE_ERR_ALLOCATION_PENDING;
}
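Based on the asprintf/opal_argv_append_nosize assembly above, a request for a single-app job would serialize to a space-joined string along these lines (values are illustrative only; the exact grammar is owned by the Slurm-side dynalloc plugin):

allocate jobid=<jobid> timeout=30 : app=0 np=8 N=2 node_list=nodeA,nodeB flag=optional

Since the send is non-blocking and we are already inside an event, the function returns ORTE_ERR_ALLOCATION_PENDING; the reply arrives via a separate receive handler, and the timer armed above fires if Slurm never answers.
Example #10
0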
int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
                                 opal_list_t *nodes, bool remove)
{
    int rc=ORTE_ERR_TAKE_NEXT_OPTION;
    char *hosts;

    /* did the app_context contain a hostfile? */
    if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
        /* yes - filter the node list through the file, removing
         * any nodes not found in the file
         */
        if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(nodes, hosts, remove))) {
            ORTE_ERROR_LOG(rc);
            free(hosts);
            return rc;
        }
        /** check that something is left */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "-hostfile", hosts);
            free(hosts);
            return ORTE_ERR_SILENT;
        }
        free(hosts);
    }
    /* did the app_context contain an add-hostfile? */
    if (orte_get_attribute(&app->attributes, ORTE_APP_ADD_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
        /* yes - filter the node list through the file, removing
         * any nodes not found in the file
         */
        if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(nodes, hosts, remove))) {
            free(hosts);
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that something is left */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "-add-hostfile", hosts);
            free(hosts);
            return ORTE_ERR_SILENT;
        }
        free(hosts);
    }
    /* now filter the list through any -host specification */
    if (!orte_soft_locations &&
        orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
        if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(nodes, hosts, remove))) {
            ORTE_ERROR_LOG(rc);
            free(hosts);
            return rc;
        }
        /** check that something is left! */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "-host", hosts);
            free(hosts);
            return ORTE_ERR_SILENT;
        }
        free(hosts);
    }
    /* now filter the list through any add-host specification */
    if (orte_get_attribute(&app->attributes, ORTE_APP_ADD_HOST, (void**)&hosts, OPAL_STRING)) {
        if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(nodes, hosts, remove))) {
            ORTE_ERROR_LOG(rc);
            free(hosts);
            return rc;
        }
        /** check that something is left! */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "-add-host", hosts);
            free(hosts);
            return ORTE_ERR_SILENT;
        }
        free(hosts);
    }

    return rc;
}
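Note the return-code contract: rc starts as ORTE_ERR_TAKE_NEXT_OPTION and is only overwritten if at least one filter was actually applied, so a caller can distinguish "filtered" from "nothing to filter". A minimal sketch of the expected caller pattern (mirroring the call made in orte_rmaps_base_get_target_nodes later in this page):

/* illustrative caller - "nothing to filter" is not an error */
rc = orte_rmaps_base_filter_nodes(app, allocated_nodes, true);
if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) {
    ORTE_ERROR_LOG(rc);   /* a filter was present and failed */
    return rc;
}
Example #11
0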
/* process incoming messages in order of receipt */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_jobid_t job;
    orte_job_t *jdata, *parent;
    opal_buffer_t *answer;
    orte_vpid_t vpid;
    orte_proc_t *proc;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    int32_t rc=ORTE_SUCCESS, ret;
    orte_app_context_t *app, *child_app;
    orte_process_name_t name;
    pid_t pid;
    bool running;
    int i;
    char **env;
    char *prefix_dir;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive processing msg",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    switch (command) {
    case ORTE_PLM_LAUNCH_JOB_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive job launch command from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));

        /* unpack the job object */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        /* record the sender so we know who to respond to */
        jdata->originator.jobid = sender->jobid;
        jdata->originator.vpid = sender->vpid;

        /* get the parent's job object */
        if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
            /* if the prefix was set in the parent's job, we need to transfer
             * that prefix to the child's app_context so any further launch of
             * orteds can find the correct binary. There always has to be at
             * least one app_context in both parent and child, so we don't
             * need to check that here. However, be sure not to overwrite
             * the prefix if the user already provided it!
             */
            app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
            child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
            prefix_dir = NULL;
            if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) &&
                !orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) {
                orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
            }
            if (NULL != prefix_dir) {
                free(prefix_dir);
            }
        }

        /* if the user asked to forward any envars, cycle through the app contexts
         * in the comm_spawn request and add them
         */
        if (NULL != orte_forwarded_envars) {
            for (i=0; i < jdata->apps->size; i++) {
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                    continue;
                }
                env = opal_environ_merge(orte_forwarded_envars, app->env);
                opal_argv_free(app->env);
                app->env = env;
            }
        }

        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive adding hosts",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* process any add-hostfile and add-host options that were provided */
        if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        if (NULL != parent) {
            if (NULL == parent->bookmark) {
                /* find the sender's node in the job map */
                if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
                    /* set the bookmark so the child starts from that place - this means
                     * that the first child process could be co-located with the proc
                     * that called comm_spawn, assuming slots remain on that node. Otherwise,
                     * the procs will start on the next available node
                     */
                    jdata->bookmark = proc->node;
                }
            } else {
                jdata->bookmark = parent->bookmark;
            }
            /* provide the parent's last object */
            jdata->bkmark_obj = parent->bkmark_obj;
        }

        /* launch it */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive calling spawn",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
        break;
    ANSWER_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive - error on launch: %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));

        /* setup the response */
        answer = OBJ_NEW(opal_buffer_t);

        /* pack the error code to be returned */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
        }

        /* send the response back to the sender */
        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_PLM_PROXY,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;

    case ORTE_PLM_UPDATE_PROC_STATE:
        opal_output_verbose(5, orte_plm_base_framework.framework_output,
                            "%s plm:base:receive update proc state command from %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(sender));
        count = 1;
        while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {

            opal_output_verbose(5, orte_plm_base_framework.framework_output,
                                "%s plm:base:receive got update_proc_state for job %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_JOBID_PRINT(job));

            name.jobid = job;
            running = false;
            /* get the job object */
            jdata = orte_get_job_data_object(job);
            count = 1;
            while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID))) {
                if (ORTE_VPID_INVALID == vpid) {
                    /* flag indicates that this job is complete - move on */
                    break;
                }
                name.vpid = vpid;
                /* unpack the pid */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &count, OPAL_PID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                /* unpack the state */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &count, ORTE_PROC_STATE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                if (ORTE_PROC_STATE_RUNNING == state) {
                    running = true;
                }
                /* unpack the exit code */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }

                OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                                     "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));

                if (NULL != jdata) {
                    /* get the proc data object */
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        goto CLEANUP;
                    }
                    /* NEVER update the proc state before activating the state machine - let
                     * the state cbfunc update it as it may need to compare this
                     * state against the prior proc state */
                    proc->pid = pid;
                    proc->exit_code = exit_code;
                    ORTE_ACTIVATE_PROC_STATE(&name, state);
                }
            }
            /* record that we heard back from a daemon during app launch */
            if (running && NULL != jdata) {
                jdata->num_daemons_reported++;
                if (orte_report_launch_progress) {
                    if (0 == jdata->num_daemons_reported % 100 ||
                        jdata->num_daemons_reported == orte_process_info.num_procs) {
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
                    }
                }
            }
            /* prepare for next job */
            count = 1;
        }
        if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        break;

    case ORTE_PLM_REGISTERED_CMD:
        count=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto DEPART;
        }
        name.jobid = job;
        /* get the job object */
        if (NULL == (jdata = orte_get_job_data_object(job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto DEPART;
        }
        count=1;
        while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) {
            name.vpid = vpid;
            ORTE_ACTIVATE_PROC_STATE(&name, ORTE_PROC_STATE_REGISTERED);
            count=1;
        }
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        break;
    }

 CLEANUP:
 DEPART:
    /* see if an error occurred - if so, wakeup the HNP so we can exit */
    if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
        jdata = NULL;
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive done processing commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
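The unpack sequence in the launch case implies the sender-side packing order. A minimal sender sketch (hypothetical code; the pack order is inferred from the unpack calls above, and ORTE_RML_TAG_PLM is assumed to be the tag this handler is posted on):

/* hypothetical proxy-side sender - pack order mirrors the unpack order above */
opal_buffer_t *buf = OBJ_NEW(opal_buffer_t);
orte_plm_cmd_flag_t cmd = ORTE_PLM_LAUNCH_JOB_CMD;
int rc;

if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &cmd, 1, ORTE_PLM_CMD)) ||
    ORTE_SUCCESS != (rc = opal_dss.pack(buf, &jdata, 1, ORTE_JOB))) {
    ORTE_ERROR_LOG(rc);
    OBJ_RELEASE(buf);
} else if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_PLM,
                                             orte_rml_send_callback, NULL))) {
    ORTE_ERROR_LOG(rc);
    OBJ_RELEASE(buf);
}
Example #12
0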
/*
 * Query the registry for all nodes allocated to a specified app_context
 */
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
                                     orte_app_context_t *app, orte_mapping_policy_t policy,
                                     bool initial_map, bool silent)
{
    opal_list_item_t *item, *next;
    orte_node_t *node, *nd, *nptr;
    orte_std_cntr_t num_slots;
    orte_std_cntr_t i;
    int rc;
    orte_job_t *daemons;
    bool novm;
    opal_list_t nodes;
    char *hosts;

    /** set default answer */
    *total_num_slots = 0;
    
    /* get the daemon job object */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    /* see if we have a vm or not */
    novm = orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL);

    /* if this is NOT a managed allocation, then we use the nodes
     * that were specified for this app - there is no need to collect
     * all available nodes and "filter" them
     */
    if (!orte_managed_allocation) {
        OBJ_CONSTRUCT(&nodes, opal_list_t);
        /* if the app provided a dash-host, and we are not treating
         * them as requested or "soft" locations, then use those nodes
         */
        if (!orte_soft_locations &&
            orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using dash_host %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
            if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts))) {
                ORTE_ERROR_LOG(rc);
                free(hosts);
                return rc;
            }
            free(hosts);
        } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
            /* otherwise, if the app provided a hostfile, then use that */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, hosts))) {
                free(hosts);
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            free(hosts);
        } else if (NULL != orte_rankfile) {
            /* use the rankfile, if provided */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using rankfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 orte_rankfile));
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   orte_rankfile))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            if (0 == opal_list_get_size(&nodes)) {
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                     "%s nothing found in given rankfile",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                OBJ_DESTRUCT(&nodes);
                return ORTE_ERR_BAD_PARAM;
            }
        } else if (NULL != orte_default_hostfile) {
            /* fall back to the default hostfile, if provided */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using default hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 orte_default_hostfile));
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   orte_default_hostfile))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* this is a special case - we always install a default
             * hostfile, but it is empty. If the user didn't remove it
             * or put something into it, then we will have pursued that
             * option and found nothing. This isn't an error, we just need
             * to add all the known nodes
             */
            if (0 == opal_list_get_size(&nodes)) {
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                     "%s nothing in default hostfile - using known nodes",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto addknown;
            }
        } else {
            /* if nothing else was available, then use all known nodes, which
             * will include ourselves
             */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using known nodes",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto addknown;
        }
        /** if we still don't have anything */
        if (0 == opal_list_get_size(&nodes)) {
            if (!silent) {
                orte_show_help("help-orte-rmaps-base.txt",
                               "orte-rmaps-base:no-available-resources",
                               true);
            }
            OBJ_DESTRUCT(&nodes);
            return ORTE_ERR_SILENT;
        }
        /* find the nodes in our node array and assemble them
         * in daemon order if the vm was launched
         */
        while (NULL != (item = opal_list_remove_first(&nodes))) {
            nptr = (orte_node_t*)item;
            nd = NULL;
            for (i=0; i < orte_node_pool->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                    continue;
                }
                if (0 != strcmp(node->name, nptr->name)) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s DOESNT MATCH NODE %s",
                                         node->name, nptr->name));
                    continue;
                }
                /* ignore nodes that are marked as do-not-use for this mapping */
                if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s IS MARKED NO_USE", node->name));
                    /* reset the state so it can be used another time */
                    node->state = ORTE_NODE_STATE_UP;
                    continue;
                }
                if (ORTE_NODE_STATE_DOWN == node->state) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s IS DOWN", node->name));
                    continue;
                }
                if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s IS MARKED NO_INCLUDE", node->name));
                    /* not to be used */
                    continue;
                }
                /* if this node wasn't included in the vm (e.g., by -host), ignore it,
                 * unless we are mapping prior to launching the vm
                 */
                if (NULL == node->daemon && !novm) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s HAS NO DAEMON", node->name));
                    continue;
                }
                /* retain a copy for our use in case the item gets
                 * destructed along the way
                 */
                OBJ_RETAIN(node);
                if (initial_map) {
                    /* if this is the first app_context we
                     * are getting for an initial map of a job,
                     * then mark all nodes as unmapped
                     */
                    ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
                }
                if (NULL == nd || NULL == nd->daemon ||
                    NULL == node->daemon ||
                    nd->daemon->name.vpid < node->daemon->name.vpid) {
                    /* just append to end */
                    opal_list_append(allocated_nodes, &node->super);
                    nd = node;
                } else {
                    /* starting from end, put this node in daemon-vpid order */
                    while (node->daemon->name.vpid < nd->daemon->name.vpid) {
                        if (opal_list_get_begin(allocated_nodes) == opal_list_get_prev(&nd->super)) {
                            /* insert at beginning */
                            opal_list_prepend(allocated_nodes, &node->super);
                            goto moveon1;
                        }
                        nd = (orte_node_t*)opal_list_get_prev(&nd->super);
                    }
                    item = opal_list_get_next(&nd->super);
                    if (item == opal_list_get_end(allocated_nodes)) {
                        /* we are at the end - just append */
                        opal_list_append(allocated_nodes, &node->super);
                    } else {
                        nd = (orte_node_t*)item;
                        opal_list_insert_pos(allocated_nodes, item, &node->super);
                    }
                moveon1:
                    /* reset us back to the end for the next node */
                    nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
                }
            }
            OBJ_RELEASE(nptr);
        }
        OBJ_DESTRUCT(&nodes);
        /* now prune for usage and compute total slots */
        goto complete;
    }

 addknown:
    /* if the hnp was allocated, include it unless flagged not to */
    if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
            if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "HNP IS MARKED NO_USE"));
                /* clear this for future use, but don't include it */
                node->state = ORTE_NODE_STATE_UP;
            } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) {
                OBJ_RETAIN(node);
                if (initial_map) {
                    /* if this is the first app_context we
                     * are getting for an initial map of a job,
                     * then mark all nodes as unmapped
                     */
                    ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
                }
                opal_list_append(allocated_nodes, &node->super);
            }
        }
    }
    
    /* add everything in the node pool that can be used - add them
     * in daemon order, which may be different than the order in the
     * node pool. Since an empty list is passed into us, the list at
     * this point either has the HNP node or nothing, and the HNP
     * node obviously has a daemon on it (us!)
     */
    if (0 == opal_list_get_size(allocated_nodes)) {
        /* the list is empty */
        nd = NULL;
    } else {
        nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
    }
    for (i=1; i < orte_node_pool->size; i++) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
            /* ignore nodes that are marked as do-not-use for this mapping */
            if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "NODE %s IS MARKED NO_USE", node->name));
                /* reset the state so it can be used another time */
                node->state = ORTE_NODE_STATE_UP;
                continue;
            }
            if (ORTE_NODE_STATE_DOWN == node->state) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "NODE %s IS MARKED DOWN", node->name));
                continue;
            }
            if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "NODE %s IS MARKED NO_INCLUDE", node->name));
                /* not to be used */
                continue;
            }
            /* if this node wasn't included in the vm (e.g., by -host), ignore it,
             * unless we are mapping prior to launching the vm
             */
            if (NULL == node->daemon && !novm) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "NODE %s HAS NO DAEMON", node->name));
                continue;
            }
            /* retain a copy for our use in case the item gets
             * destructed along the way
             */
            OBJ_RETAIN(node);
            if (initial_map) {
                /* if this is the first app_context we
                 * are getting for an initial map of a job,
                 * then mark all nodes as unmapped
                 */
                ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
            }
            if (NULL == nd || NULL == nd->daemon ||
                NULL == node->daemon ||
                nd->daemon->name.vpid < node->daemon->name.vpid) {
                /* just append to end */
                opal_list_append(allocated_nodes, &node->super);
                nd = node;
            } else {
                /* starting from end, put this node in daemon-vpid order */
                while (node->daemon->name.vpid < nd->daemon->name.vpid) {
                    if (opal_list_get_begin(allocated_nodes) == opal_list_get_prev(&nd->super)) {
                        /* insert at beginning */
                        opal_list_prepend(allocated_nodes, &node->super);
                        goto moveon;
                    }
                    nd = (orte_node_t*)opal_list_get_prev(&nd->super);
                }
                item = opal_list_get_next(&nd->super);
                if (item == opal_list_get_end(allocated_nodes)) {
                    /* we are at the end - just append */
                    opal_list_append(allocated_nodes, &node->super);
                } else {
                    nd = (orte_node_t*)item;
                    opal_list_insert_pos(allocated_nodes, item, &node->super);
                }
            moveon:
                /* reset us back to the end for the next node */
                nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                         "%s Starting with %d nodes in list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)opal_list_get_size(allocated_nodes)));

    /** check that something is left */
    if (0 == opal_list_get_size(allocated_nodes)) {
        if (!silent) {
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
        }
        return ORTE_ERR_SILENT;
    }
    
    /* filter the nodes thru any hostfile and dash-host options */
    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                         "%s Filtering thru apps",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, allocated_nodes, true))
        && ORTE_ERR_TAKE_NEXT_OPTION != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                         "%s Retained %d nodes in list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)opal_list_get_size(allocated_nodes)));

 complete:
    /* remove all nodes that are already at max usage, and
     * compute the total number of allocated slots while
     * we do so
     */
    num_slots = 0;
    item  = opal_list_get_first(allocated_nodes);
    while (item != opal_list_get_end(allocated_nodes)) {
        /** save the next pointer in case we remove this node */
        next  = opal_list_get_next(item);
        /** check to see if this node is fully used - remove if so */
        node = (orte_node_t*)item;
        if (0 != node->slots_max && node->slots_inuse > node->slots_max) {
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s Removing node %s: max %d inuse %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name, node->slots_max, node->slots_inuse));
            opal_list_remove_item(allocated_nodes, item);
            OBJ_RELEASE(item);  /* "un-retain" it */
        } else if (node->slots <= node->slots_inuse &&
                   (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
            /* remove the node as fully used */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s Removing node %s slots %d inuse %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name, node->slots, node->slots_inuse));
            opal_list_remove_item(allocated_nodes, item);
            OBJ_RELEASE(item);  /* "un-retain" it */
        } else if (node->slots > node->slots_inuse) {
            /* add the available slots */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s node %s has %d slots available",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name, node->slots - node->slots_inuse));
            num_slots += node->slots - node->slots_inuse;
        } else if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
            /* nothing to do here - we don't add to the slot count since
             * none are available. Just let the mapper do what it needs
             * to do to meet the request
             */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s node %s is fully used, but available for oversubscription",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name));
        } else {
            /* if we cannot use it, remove it from list */
            opal_list_remove_item(allocated_nodes, item);
            OBJ_RELEASE(item);  /* "un-retain" it */
        }
        /** go on to next item */
        item = next;
    }

    /* Sanity check to make sure we have resources available */
    if (0 == opal_list_get_size(allocated_nodes)) {
        if (silent) {
            /* let the caller know that the resources exist,
             * but are currently busy
             */
            return ORTE_ERR_RESOURCE_BUSY;
        } else {
            orte_show_help("help-orte-rmaps-base.txt", 
                           "orte-rmaps-base:all-available-resources-used", true);
            return ORTE_ERR_SILENT;
        }
    }
    
    *total_num_slots = num_slots;
    
    if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
        opal_output(0, "AVAILABLE NODES FOR MAPPING:");
        for (item = opal_list_get_first(allocated_nodes);
             item != opal_list_get_end(allocated_nodes);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            opal_output(0, "    node: %s daemon: %s", node->name,
                        (NULL == node->daemon) ? "NULL" : ORTE_VPID_PRINT(node->daemon->name.vpid));
        }
    }

    return ORTE_SUCCESS;
}
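The pruning loop in the "complete" section applies three rules per node: a hard slots_max cap always removes the node, the soft slots limit removes it only when the policy forbids oversubscription, and otherwise the node is kept (adding to the slot total only when it still has free slots). Condensed as a predicate (a sketch for illustration, not part of the code base):

/* hypothetical predicate summarizing the pruning rules above */
static bool node_usable(orte_node_t *node, orte_mapping_policy_t policy)
{
    if (0 != node->slots_max && node->slots_inuse > node->slots_max) {
        return false;  /* hard cap exceeded - never usable */
    }
    if (node->slots <= node->slots_inuse &&
        (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
        return false;  /* full, and oversubscription is not allowed */
    }
    return true;       /* usable, possibly via oversubscription */
}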
Example #13
0
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale, sib;
    char *cpu_bitmap;
    bool found;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* bozo check */
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, locale, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            data = (opal_hwloc_obj_data_t*)locale->userdata;
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* if we don't have enough cpus to support this additional proc, try
             * shifting the location to a cousin that can support it - the important
             * thing is that we maintain the same level in the topology */
            if (ncpus < (data->num_bound+1)) {
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s bind_in_place: searching right",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                sib = locale;
                found = false;
                while (NULL != (sib = sib->next_cousin)) {
                    data = (opal_hwloc_obj_data_t*)sib->userdata;
                    ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                    if (data->num_bound < ncpus) {
                        found = true;
                        locale = sib;
                        break;
                    }
                }
                if (!found) {
                    /* try the other direction */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s bind_in_place: searching left",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    sib = locale;
                    while (NULL != (sib = sib->prev_cousin)) {
                        data = (opal_hwloc_obj_data_t*)sib->userdata;
                        ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                        if (data->num_bound < ncpus) {
                            found = true;
                            locale = sib;
                            break;
                        }
                    }
                }
                if (!found) {
                    /* no place to put this - see if overload is allowed */
                    if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                            /* if the user specified a binding policy, then we cannot meet
                             * it since overload isn't allowed, so error out - have the
                             * message indicate that setting overload allowed will remove
                             * this restriction */
                            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                           opal_hwloc_base_print_binding(map->binding), node->name,
                                           data->num_bound, ncpus);
                            return ORTE_ERR_SILENT;
                        } else {
                            /* if we have the default binding policy, then just don't bind */
                            OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                            unbind_procs(jdata);
                            return ORTE_SUCCESS;
                        }
                    }
                }
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)locale->userdata;  // just in case it changed
            data->num_bound++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(locale->type), idx);
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
            orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
            /* update the location, in case it changed */
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, locale, OPAL_PTR);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                cpu_bitmap, hwloc_obj_type_string(locale->type),
                                idx, node->name);
            if (NULL != cpu_bitmap) {
                free(cpu_bitmap);
            }
        }
    }

    return ORTE_SUCCESS;
}
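The sibling search above walks next_cousin and then prev_cousin links, which connect objects at the same depth, so the process stays at the requested topology level while shifting to an object with spare cpus. The same search, factored into a helper (a hypothetical sketch):

/* hypothetical helper - find a same-depth object with an unbound cpu slot */
static hwloc_obj_t find_unloaded_cousin(hwloc_topology_t topo, hwloc_obj_t start)
{
    hwloc_obj_t sib;
    opal_hwloc_obj_data_t *data;

    for (sib = start->next_cousin; NULL != sib; sib = sib->next_cousin) {
        data = (opal_hwloc_obj_data_t*)sib->userdata;
        if (NULL != data && data->num_bound < opal_hwloc_base_get_npus(topo, sib)) {
            return sib;  /* room to the right */
        }
    }
    for (sib = start->prev_cousin; NULL != sib; sib = sib->prev_cousin) {
        data = (opal_hwloc_obj_data_t*)sib->userdata;
        if (NULL != data && data->num_bound < opal_hwloc_base_get_npus(topo, sib)) {
            return sib;  /* room to the left */
        }
    }
    return NULL;  /* everything at this depth is already full */
}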
Example #14
0
static int bind_downwards(orte_job_t *jdata,
                          orte_node_t *node,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
    hwloc_cpuset_t cpus;
    unsigned int ncpus;
    opal_hwloc_obj_data_t *data;
    int total_cpus;
    hwloc_cpuset_t totalcpuset;
    hwloc_obj_t locale;
    char *cpu_bitmap;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;
    totalcpuset = hwloc_bitmap_alloc();

    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* bozo check */
        locale = NULL;
        if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
            hwloc_bitmap_free(totalcpuset);
            return ORTE_ERR_SILENT;
        }
        /* we don't know if the target is a direct child of this locale,
         * or if it is some depth below it, so we have to conduct a bit
         * of a search. Let hwloc find the min usage one for us.
         */
        trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology, locale,
                                                                  target, cache_level);
        if (NULL == trg_obj) {
            /* there aren't any such targets under this object */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
            hwloc_bitmap_free(totalcpuset);
            return ORTE_ERR_SILENT;
        }
        /* record the location */
        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, trg_obj, OPAL_PTR);
        /* start with a clean slate */
        hwloc_bitmap_zero(totalcpuset);
        total_cpus = 0;
        nxt_obj = trg_obj;
        do {
            if (NULL == nxt_obj) {
                /* could not find enough cpus to meet request */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            trg_obj = nxt_obj;
            /* get the number of cpus under this location */
            ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s GOT %d CPUS",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
            /* track the number bound */
            if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
                data = OBJ_NEW(opal_hwloc_obj_data_t);
                trg_obj->userdata = data;
            }
            data->num_bound++;
            /* error out if adding a proc would cause overload and that wasn't allowed,
             * and it wasn't a default binding policy (i.e., the user requested it)
             */
            if (ncpus < data->num_bound &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    /* if the user specified a binding policy, then we cannot meet
                     * it since overload isn't allowed, so error out - have the
                     * message indicate that setting overload allowed will remove
                     * this restriction */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                   opal_hwloc_base_print_binding(map->binding), node->name,
                                   data->num_bound, ncpus);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                } else {
                    /* if we have the default binding policy, then just don't bind */
                    OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                    unbind_procs(jdata);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_SUCCESS;
                }
            }
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
            hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
            /* track total #cpus */
            total_cpus += ncpus;
            /* move to the next location, in case we need it */
            nxt_obj = trg_obj->next_cousin;
        } while (total_cpus < orte_rmaps_base.cpus_per_rank);
        hwloc_bitmap_list_asprintf(&cpu_bitmap, totalcpuset);
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "%s PROC %s BITMAP %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), cpu_bitmap);
        orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
        if (NULL != cpu_bitmap) {
            free(cpu_bitmap);
        }
        if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
            char tmp1[1024], tmp2[1024];
            if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1),
                                                               node->topology, totalcpuset)) {
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s PROC %s ON %s IS NOT BOUND",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name);
            } else {
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology, totalcpuset);
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name,
                            tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(totalcpuset);
    
    return ORTE_SUCCESS;
}
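The do/while above is what supports cpus-per-rank bindings: it keeps OR-ing the cpusets of consecutive same-depth objects into totalcpuset until enough cpus have been gathered. Stripped of the bookkeeping, the accumulation looks like this (a simplified sketch that ignores the availability filtering the real code performs):

/* simplified accumulation - gather cpus from consecutive cousins */
hwloc_cpuset_t acc = hwloc_bitmap_alloc();
hwloc_obj_t cur = trg_obj;                   /* first target object */
int have = 0, want = 4;                      /* e.g. cpus-per-rank = 4 */

while (have < want && NULL != cur) {
    hwloc_bitmap_or(acc, acc, cur->cpuset);  /* add this object's cpus */
    have += hwloc_bitmap_weight(cur->cpuset);
    cur = cur->next_cousin;                  /* same depth, next object */
}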
Example #15
0
static void job_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_job_state_t jobstate;
    orte_exit_code_t sts;
    orte_proc_t *aborted_proc;
    opal_buffer_t *answer;
    int32_t rc, ret;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return;
    }

    /* if the jdata is NULL, then we abort as this
     * is reporting an unrecoverable error
     */
    if (NULL == caddy->jdata) {
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
        OBJ_RELEASE(caddy);
        return;
    }

    /* update the state */
    jdata = caddy->jdata;
    jobstate = caddy->job_state;
    jdata->state = jobstate;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:orcmsd_hnp: job %s reported state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

    if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
        ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
        ORTE_JOB_STATE_MAP_FAILED == jobstate ||
        ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
        if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
            /* this is the primary job */
            orte_never_launched = true;
        }
        /* disable routing as we may not have performed the daemon
         * wireup - e.g., in a managed environment, all the daemons
         * "phone home", but don't actually wireup into the routed
         * network until they receive the launch message
         */
        orte_routing_is_enabled = false;
        jdata->num_terminated = jdata->num_procs;
        ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
        /* if it was a dynamic spawn, then we better tell them this didn't work */
        if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
            rc = jobstate;
            answer = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
                ORTE_ERROR_LOG(ret);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                OBJ_RELEASE(caddy);
                return;
            }
            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(ret);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                OBJ_RELEASE(caddy);
                return;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                                 "%s errmgr:hnp sending dyn error release of job %s to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid),
                                 ORTE_NAME_PRINT(&jdata->originator)));
            if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
                                                   ORTE_RML_TAG_PLM_PROXY,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            }
        }
        OBJ_RELEASE(caddy);
        return;
    }

    if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
        ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
        /* the job object for this job will have been NULL'd
         * in the array if the job was solely local. If it isn't
         * NULL, then we need to tell everyone else to die
         */
        aborted_proc = NULL;
        if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
            sts = aborted_proc->exit_code;
            if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
                if (WIFSIGNALED(sts)) { /* died on signal */
#ifdef WCOREDUMP
                    if (WCOREDUMP(sts)) {
                        orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
                                       WTERMSIG(sts));
                        sts = WTERMSIG(sts);
                    } else {
                        orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
                                       WTERMSIG(sts));
                        sts = WTERMSIG(sts);
                    }
#else
                    orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
                                   WTERMSIG(sts));
                    sts = WTERMSIG(sts);
#endif /* WCOREDUMP */
                } else {
                    orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
                                   WEXITSTATUS(sts));
                    sts = WEXITSTATUS(sts);
                }
            }
        }
        /* if this is the daemon job, then we need to ensure we
         * output an error message indicating we couldn't launch the
         * daemons */
        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
        }
    }

    /* if the daemon job aborted and we haven't heard from everyone yet,
     * then this could well have been caused by a daemon not finding
     * a way back to us. In this case, output a message indicating a daemon
     * died without reporting. Otherwise, say nothing as we
     * likely already output an error message */
    if (ORTE_JOB_STATE_ABORTED == jobstate &&
        jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
        jdata->num_procs != jdata->num_reported) {
        orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
    }
        
    /* abort the job */
    ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
    /* set the global abnormal exit flag  */
    orte_abnormal_term_ordered = true;
    OBJ_RELEASE(caddy);
}
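
When a dynamic spawn fails, job_errors packs the failed job state and the jobid into an opal_buffer_t and sends it back to the originator over the RML. A hedged sketch of that reply pattern, factored into a helper; send_dyn_error is hypothetical (the shipped code inlines this), while the pack/send calls are the ones used above.

/* Sketch only: pack status + jobid, hand the buffer to the RML non-blocking
 * send; orte_rml_send_callback releases the buffer once the send completes. */
static int send_dyn_error(orte_process_name_t *requester,
                          orte_jobid_t jobid, int32_t status)
{
    int ret;
    opal_buffer_t *answer = OBJ_NEW(opal_buffer_t);

    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &status, 1, OPAL_INT32)) ||
        ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobid, 1, ORTE_JOBID))) {
        OBJ_RELEASE(answer);
        return ret;
    }
    if (0 > (ret = orte_rml.send_buffer_nb(requester, answer,
                                           ORTE_RML_TAG_PLM_PROXY,
                                           orte_rml_send_callback, NULL))) {
        OBJ_RELEASE(answer);   /* send never started - we still own it */
        return ret;
    }
    return ORTE_SUCCESS;
}
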
Example #16
static void dump_aborted_procs(void)
{
    orte_std_cntr_t i, n;
    orte_proc_t *proc, *pptr;
    orte_app_context_t *approc;
    orte_job_t *job;
    orte_node_t *node;
    
    /* find the job that caused the problem - be sure to start the loop
     * at 1 as the daemons are in 0 and will clearly be "running", so no
     * point in checking them
     */
    for (n=1; n < orte_job_data->size; n++) {
        if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
            /* the array is no longer left-justified, so we have to continue */
            continue;
        }
        if (ORTE_JOB_STATE_UNDEF != job->state &&
            ORTE_JOB_STATE_INIT != job->state &&
            ORTE_JOB_STATE_RUNNING != job->state &&
            ORTE_JOB_STATE_TERMINATED != job->state &&
            ORTE_JOB_STATE_ABORT_ORDERED != job->state) {

            /* cycle through and count the number that were killed or aborted */
            for (i=0; i < job->procs->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
                    /* the array is no longer left-justified, so we have to continue */
                    continue;
                }
                if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state ||
                    ORTE_PROC_STATE_FAILED_TO_LAUNCH == pptr->state) {
                    ++num_failed_start;
                } else if (ORTE_PROC_STATE_ABORTED == pptr->state) {
                    ++num_aborted;
                } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state) {
                    ++num_killed;
                } else if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == pptr->state) {
                    ++num_killed;
                }
            }

            /* this is a guilty party */
            proc = NULL;
            if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC, (void**)&proc, OPAL_PTR) ||
                NULL == proc) {
                continue;
            }

            approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
            node = proc->node;
            if (ORTE_JOB_STATE_FAILED_TO_START == job->state ||
                ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) {
                switch (proc->exit_code) {
                case ORTE_ERR_SILENT:
                    /* say nothing - it was already reported */
                    break;
                case ORTE_ERR_SYS_LIMITS_PIPES:
                    orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
                                   orte_basename, proc->node->name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_PIPE_SETUP_FAILURE:
                    orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
                                   orte_basename, proc->node->name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_SYS_LIMITS_CHILDREN:
                    orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
                                   orte_basename, proc->node->name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_FAILED_GET_TERM_ATTRS:
                    orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
                                   orte_basename, proc->node->name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_WDIR_NOT_FOUND:
                    orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
                                   orte_basename, approc->cwd,
                                   proc->node->name, (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_EXE_NOT_FOUND:
                    orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
                                   orte_basename, 
                                   (unsigned long)proc->name.vpid,
                                   orte_basename, 
                                   orte_basename, 
                                   proc->node->name, 
                                   approc->app);
                    break;
                case ORTE_ERR_EXE_NOT_ACCESSIBLE:
                    orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
                                   orte_basename, approc->app, proc->node->name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_MULTIPLE_AFFINITIES:
                    orte_show_help("help-orterun.txt",
                                   "orterun:multiple-paffinity-schemes", true, NULL);
                    break;
                case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
                    orte_show_help("help-orterun.txt",
                                   "orterun:topo-not-supported", 
                                   true, orte_process_info.nodename, "rankfile containing a slot_list of ", 
                                   NULL, approc->app);
                    break;
                case ORTE_ERR_INVALID_NODE_RANK:
                    orte_show_help("help-orterun.txt",
                                   "orterun:invalid-node-rank", true);
                    break;
                case ORTE_ERR_INVALID_LOCAL_RANK:
                    orte_show_help("help-orterun.txt",
                                   "orterun:invalid-local-rank", true);
                    break;
                case ORTE_ERR_NOT_ENOUGH_CORES:
                    orte_show_help("help-orterun.txt",
                                   "orterun:not-enough-resources", true,
                                   "sockets", node->name,
                                   "bind-to-core", approc->app);
                    break;
                case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
                    orte_show_help("help-orterun.txt",
                                   "orterun:topo-not-supported", 
                                   true, node->name, "bind-to-core", "",
                                   approc->app);
                    break;
                case ORTE_ERR_INVALID_PHYS_CPU:
                    orte_show_help("help-orterun.txt",
                                   "orterun:invalid-phys-cpu", true);
                    break;
                case ORTE_ERR_NOT_ENOUGH_SOCKETS:
                    orte_show_help("help-orterun.txt",
                                   "orterun:not-enough-resources", true,
                                   "sockets", node->name,
                                   "bind-to-socket", approc->app);
                    break;
                case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
                    orte_show_help("help-orterun.txt",
                                   "orterun:topo-not-supported", 
                                   true, node->name, "bind-to-socket", "",
                                   approc->app);
                    break;
                case ORTE_ERR_MODULE_NOT_FOUND:
                    orte_show_help("help-orterun.txt",
                                   "orterun:paffinity-missing-module", 
                                   true, node->name);
                    break;
                case ORTE_ERR_SLOT_LIST_RANGE:
                    orte_show_help("help-orterun.txt",
                                   "orterun:invalid-slot-list-range", 
                                   true, node->name, NULL);
                    break;
                case ORTE_ERR_PIPE_READ_FAILURE:
                    orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
                                   orte_basename, node->name, (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_SOCKET_NOT_AVAILABLE:
                    orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
                                   orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
                                   (unsigned long)proc->name.vpid);
                    break;

                default:
                    if (0 != proc->exit_code) {
                        orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
                                       orte_basename, ORTE_ERROR_NAME(proc->exit_code), node->name,
                                       (unsigned long)proc->name.vpid);
                    } else {
                        orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
                                       orte_basename, node->name);
                    }
                    break;
                }
            } else if (ORTE_JOB_STATE_ABORTED == job->state) {
                orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
                               orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
                               node->name, orte_basename);
            } else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) {  /* aborted by signal */
#ifdef HAVE_STRSIGNAL
                if (NULL != strsignal(WTERMSIG(proc->exit_code))) {
                    orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
                                   orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
                                   node->name, WTERMSIG(proc->exit_code),
                                   strsignal(WTERMSIG(proc->exit_code)));
                } else {
#endif
                    orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
                                   orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
                                   node->name, WTERMSIG(proc->exit_code));
#ifdef HAVE_STRSIGNAL
                }
#endif
            } else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
                orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
                               orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
                               node->name, orte_basename, orte_basename);
            } else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
                orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
                               ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                               ORTE_NAME_PRINT(&proc->name), node->name);
            } else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
                switch (proc->exit_code) {
                case ORTE_ERR_MEM_LIMIT_EXCEEDED:
                    orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
                                   ORTE_NAME_PRINT(&proc->name), node->name);
                    break;
                case ORTE_ERR_PROC_STALLED:
                    orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
                    break;

                default:
                    orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
                    break;
                }
            } else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
                orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
                               orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
            } else if (orte_abort_non_zero_exit &&
                       ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
                orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
                               orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
            }
            return;
        }
    }
}
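
dump_aborted_procs is essentially one large translation table from proc exit codes to help-file topics. An illustrative reduction of that idea is sketched below; exit_code_topic is hypothetical, and the real switch cannot collapse to such a lookup because each topic takes its own argument list, but the topics shown are the ones used above.

/* Sketch only: map a handful of the exit codes handled above to their
 * help-orterun.txt topics. */
static const char *exit_code_topic(int code)
{
    switch (code) {
    case ORTE_ERR_SYS_LIMITS_PIPES:    return "orterun:sys-limit-pipe";
    case ORTE_ERR_PIPE_SETUP_FAILURE:  return "orterun:pipe-setup-failure";
    case ORTE_ERR_SYS_LIMITS_CHILDREN: return "orterun:sys-limit-children";
    case ORTE_ERR_WDIR_NOT_FOUND:      return "orterun:wdir-not-found";
    default:                           return "orterun:proc-failed-to-start";
    }
}
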
Example #17
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_proc_t *pptr, *proct;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    int i;
    int32_t i32, *i32ptr;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:orcmsd_hnp: for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));
    
    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* could be a race condition */
        goto cleanup;
    }
    pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    /* we MUST handle a communication failure before doing anything else
     * as it requires some special care to avoid normal termination issues
     * for local application procs
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* is this to a daemon? */
        if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
            /* nope - ignore it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure to non-daemon proc - ignoring it",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* if this is my own connection, ignore it */
        if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure on my own connection - ignoring it",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* mark the daemon as gone */
        ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
        /* if we have ordered orteds to terminate or abort
         * is in progress, record it */
        if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure: daemons terminating - recording daemon %s as gone",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
            /* remove from dependent routes, if it is one */
            orte_routed.route_lost(proc);
            /* if all my routes and local children are gone, then terminate ourselves */
            if (0 == orte_routed.num_routes()) {
                for (i=0; i < orte_local_children->size; i++) {
                    if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                        ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
                        /* at least one is still alive */
                        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                             "%s Comm failure: at least one proc (%s) still alive",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             ORTE_NAME_PRINT(&proct->name)));
                        goto cleanup;
                    }
                }
                /* call our appropriate exit procedure */
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr_hnp: all routes and children gone - ordering exit",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            } else {
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                     "%s Comm failure: %d routes remain alive",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int)orte_routed.num_routes()));
            }
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s Comm failure: daemon %s - aborting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
        /* record the first one to fail */
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_COMM_FAILED;
            /* point to the lowest rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
        /* abort the system */
        notify_job_errors(jdata, pptr);
        goto cleanup;
    }

    /* update the proc state - can get multiple reports on a proc
     * depending on circumstances, so ensure we only do this once
     */
    if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
        pptr->state = state;
    }

    /* if we were ordered to terminate, mark this proc as dead and see if
     * any of our routes or local  children remain alive - if not, then
     * terminate ourselves. */
    if (orte_orteds_term_ordered) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
                    goto keep_going;
                }
            }
        }
        /* if all my routes and children are gone, then terminate
           ourselves nicely (i.e., this is a normal termination) */
        if (0 == orte_routed.num_routes()) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default:hnp all routes gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
        }
    }

 keep_going:
    /* ensure we record the failed proc properly so we can report
     * the error once we terminate
     */
    switch (state) {
    case ORTE_PROC_STATE_KILLED_BY_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s killed by cmd",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* we ordered this proc to die, so it isn't an abnormal termination
         * and we don't flag it as such
         */
        if (jdata->num_terminated >= jdata->num_procs) {
            /* this job has terminated */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
        /* don't abort the job as this isn't an abnormal termination */
        break;

    case ORTE_PROC_STATE_ABORTED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s aborted",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_ABORTED;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            notify_job_errors(jdata, pptr);
        }
        break;

    case ORTE_PROC_STATE_ABORTED_BY_SIG:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s aborted by signal",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            notify_job_errors(jdata, pptr);
        }
        break;

    case ORTE_PROC_STATE_TERM_WO_SYNC:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s terminated without sync",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* now treat a special case - if the proc exit'd without a required
             * sync, it may have done so with a zero exit code. We want to ensure
             * that the user realizes there was an error, so in this -one- case,
             * we overwrite the process' exit code with the default error code
             */
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            notify_job_errors(jdata, pptr);
        }
        break;

    case ORTE_PROC_STATE_FAILED_TO_START:
    case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            if (ORTE_PROC_STATE_FAILED_TO_START == state) {
                jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
            } else {
                jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
            }
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            notify_job_errors(jdata, pptr);
        }
        /* if this was a daemon, report it */
        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
            /* output a message indicating we failed to launch a daemon */
            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
        }
        break;

    case ORTE_PROC_STATE_CALLED_ABORT:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s called abort with exit code %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc), pptr->exit_code));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
            /* point to the first proc to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            notify_job_errors(jdata, pptr);
        }
        break;

    case ORTE_PROC_STATE_TERM_NON_ZERO:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s exited with non-zero status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             pptr->exit_code));
        ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        /* track the number of non-zero exits */
        i32 = 0;
        i32ptr = &i32;
        orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
        ++i32;
        orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
        if (orte_abort_non_zero_exit) {
            if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
                jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
                /* point to the first rank to cause the problem */
                orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
                /* retain the object so it doesn't get free'd */
                OBJ_RETAIN(pptr);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
                /* abnormal termination - abort, but only do it once
                 * to avoid creating a lot of confusion */
                notify_job_errors(jdata, pptr);
            }
        } else {
            /* user requested we consider this normal termination */
            if (jdata->num_terminated >= jdata->num_procs) {
                /* this job has terminated */
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            }
        }
        break;

    case ORTE_PROC_STATE_HEARTBEAT_FAILED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s heartbeat failed",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            notify_job_errors(jdata, pptr);
        }
        /* remove from dependent routes, if it is one */
        orte_routed.route_lost(proc);
        break;

    case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: unable to send message to proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* if this proc is one of my daemons, then we are truly
         * hosed - so just exit out
         */
        if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            break;
        }
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            notify_job_errors(jdata, pptr);
        }
        break;

    default:
        /* shouldn't get this, but terminate job if required */
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s default error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state)));
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
        break;
    }
    /* if the waitpid fired, be sure to let the state machine know */
    if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
        ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
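
Most arms of the switch in proc_errors repeat the same record-the-first-failure idiom: set the job state, attach the offending proc as ORTE_JOB_ABORTED_PROC, retain it, flag the job, update the exit status, and notify once. A hedged refactoring sketch of that idiom follows; record_abort is hypothetical (the shipped code inlines it per state), but every call in it appears above.

/* Sketch only: record the first failing proc exactly once, then abort. */
static void record_abort(orte_job_t *jdata, orte_proc_t *pptr,
                         orte_job_state_t newstate)
{
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
        return;  /* only the first failure is recorded */
    }
    jdata->state = newstate;
    /* point to the first rank to cause the problem */
    orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC,
                       ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
    OBJ_RETAIN(pptr);               /* the attribute holds a reference */
    ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
    ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
    notify_job_errors(jdata, pptr); /* abort the job, but only once */
}
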
Example #18
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char *nodelist_flat;
    char **nodelist_argv;
    int nodelist_argc;
    char *vpid_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    int proc_vpid_index;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t nnode;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:alps: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* need integer value for command line parameter */
    orte_util_convert_jobid_to_string(&jobid_string, daemons->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * ALPS aprun  OPTIONS
     */

    /* add the aprun command */
    opal_argv_append(&argc, &argv, mca_plm_alps_component.aprun_cmd);

    /* Append user defined arguments to aprun */
    if ( NULL != mca_plm_alps_component.custom_args ) {
        custom_strings = opal_argv_split(mca_plm_alps_component.custom_args, ' ');
        num_args       = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* number of processors needed */
    opal_argv_append(&argc, &argv, "-n");
    asprintf(&tmp, "%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);
    opal_argv_append(&argc, &argv, "-N");
    opal_argv_append(&argc, &argv, "1");
    opal_argv_append(&argc, &argv, "-cc");
    opal_argv_append(&argc, &argv, "none");
    /*
     * The settings below are necessary in the event that Open MPI was configured
     * with --disable-dlopen, which links the orteds against a number of unneeded
     * Cray libraries, including Cray PMI, whose ctor misbehaves when run under an
     * mpirun/orted based launch.
     *
     * The env. variables added below are forwarded by aprun and suppress the
     * action of the Cray PMI ctor.
     */
    opal_argv_append(&argc, &argv, "-e");
    opal_argv_append(&argc, &argv, "PMI_NO_PREINITIALIZE=1");
    opal_argv_append(&argc, &argv, "-e");
    opal_argv_append(&argc, &argv, "PMI_NO_FORK=1");

    /* create nodelist */
    nodelist_argv = NULL;
    nodelist_argc = 0;

    for (nnode=0; nnode < map->nodes->size; nnode++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
            continue;
        }

        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }

        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);

    /* alps doesn't need a nodelist when we are using all of the allocated
     * nodes, so only pass one when launching on a subset - or when running
     * without a batch scheduler, in which case no allocation was recorded
     */
    if ((map->num_new_daemons < orte_num_allocated_nodes) || (orte_num_allocated_nodes == 0)) {
        opal_argv_append(&argc, &argv, "-L");
        opal_argv_append(&argc, &argv, nodelist_flat);
    }


    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv,
                                          NULL,
                                          &proc_vpid_index,
                                          nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_alps: unable to create process name");
        goto cleanup;
    }

    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(vpid_string);
    free(vpid_string);

    if (mca_plm_alps_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "plm:alps: final top-level argv:");
            opal_output(0, "plm:alps:     %s", param);
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the
       corresponding app_context.  If there are multiple,
       different prefix's in the app context, complain (i.e., only
       allow one --prefix option for the entire alps run -- we
       don't support different --prefix'es for different nodes in
       the ALPS plm) */
    cur_prefix = NULL;
    for (i=0; i < state->jdata->apps->size; i++) {
        char *app_prefix_dir = NULL;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) {
            continue;
        }
        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
        /* Check for already set cur_prefix_dir -- if different,
           complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp (cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-alps.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                goto cleanup;
            }

            /* If not yet set, copy it; iff set, then it's the
               same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                if (mca_plm_alps_component.debug) {
                    opal_output (0, "plm:alps: Set prefix:%s",
                                 cur_prefix);
                }
            }
            free(app_prefix_dir);
        }
    }

    /* protect the args in case someone has a script wrapper around aprun */
    mca_base_cmd_line_wrap_args(argv);

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:alps: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }

    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }

    if(NULL != jobid_string) {
        free(jobid_string);
    }

    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
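
The launcher above builds the aprun command line incrementally with the opal_argv helpers. A minimal sketch of that pattern with illustrative values; build_aprun_argv and its arguments are hypothetical, while opal_argv_append/opal_argv_free are the real helpers used above.

/* Sketch only: assemble a small aprun argv the same way launch_daemons does. */
#include <stdio.h>
#include <stdlib.h>
#include "opal/util/argv.h"

static char **build_aprun_argv(unsigned long ndaemons, const char *nodelist)
{
    int argc = 0;
    char **argv = NULL;
    char *tmp;

    opal_argv_append(&argc, &argv, "aprun");
    opal_argv_append(&argc, &argv, "-n");       /* number of daemons to start */
    asprintf(&tmp, "%lu", ndaemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);
    opal_argv_append(&argc, &argv, "-N");       /* one daemon per node */
    opal_argv_append(&argc, &argv, "1");
    if (NULL != nodelist) {                     /* only when targeting a subset */
        opal_argv_append(&argc, &argv, "-L");
        opal_argv_append(&argc, &argv, nodelist);
    }
    return argv;  /* caller releases with opal_argv_free(argv) */
}
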
Example #19
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    int i;
    opal_buffer_t *buffer;
    char hostname[OPAL_MAXHOSTNAMELEN];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif

    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    bucket = OBJ_NEW(opal_buffer_t);

    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);

    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the environment when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);

    /* purge any ess/pmix flags set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
    opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ);

    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, sizeof(hostname));
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }

    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }

    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            res = hwloc_bitmap_alloc();
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    hwloc_bitmap_free(ours);
                    hwloc_bitmap_free(res);
                    goto DONE;
                }
                hwloc_bitmap_or(res, ours, pu->cpuset);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }

    if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_debug_failure) {
            orted_debug_failure = -1*orted_debug_failure;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_debug_failure_delay) {
                ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);

            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }

                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_oob_base_get_addr(&orte_process_info.my_daemon_uri);
    if (NULL == orte_process_info.my_daemon_uri) {
        /* no way to communicate */
        ret = ORTE_ERROR;
        goto DONE;
    }
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;

    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = strdup(orte_process_info.my_daemon_uri);
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }

    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
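    /* Illustrative sketch (not part of the original source): with the two
     * calls above, the progress engine effectively stops yielding the
     * processor when idle and instead blocks inside libevent until at
     * least one event fires, roughly:
     *
     *     opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);  // block until activity
     *
     * rather than looping with a short timeout and re-entering the
     * progress engine on every pass. */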

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        char **singenv=NULL, *string_key, *env_str;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        /* default to ompi for now */
        opal_argv_append_nosize(&jdata->personality, "ompi");
        orte_plm_base_create_jobid(jdata);
        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps; otherwise, other daemons
         * won't know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        jdata->num_apps = 1;

        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        proc->parent = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* add the node to the job map */
        OBJ_RETAIN(node);
        opal_pointer_array_add(jdata->map->nodes, node);
        jdata->map->num_nodes++;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial rank values */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* set the ORTE_JOB_TRANSPORT_KEY from the environment */
        orte_pre_condition_transports(jdata, NULL);

        /* register the singleton's nspace with our PMIx server */
        if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {
          ORTE_ERROR_LOG(ret);
          goto DONE;
        }
        /* use setup fork to create the envars needed by the singleton */
        if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* append the transport key to the envars needed by the singleton */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            goto DONE;
        }
        asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        opal_argv_append_nosize(&singenv, env_str);
        free(env_str);

        nptr = opal_argv_join(singenv, '*');
        opal_argv_free(singenv);

        /* create a string that contains our uri + sysinfo + PMIx server URI envars */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, nptr);
        free(sysinfo);
        free(nptr);
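        /* Illustration (values invented, not in the original source): the
         * string assembled above has the form
         *     <daemon-uri>[<sysinfo>]OMPI_MCA_foo=1*OMPI_MCA_bar=2
         * i.e. our contact URI, the bracketed cpu type/model string, and
         * the '*'-joined environment variables for the singleton. */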

        /* pass that info to the singleton */
        if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { /* need to add 1 to get the NULL */
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* cleanup */
        free(tmp);
        close(orted_globals.uri_pipe);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
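        /* Illustration (not in the original source): an environment entry
         * such as
         *     OMPI_MCA_btl=tcp,self
         * is split at the '=' by the loop above and appended to the orted
         * command line as
         *     -mca btl tcp,self
         * so any daemons we later launch on behalf of a comm_spawn inherit
         * the same MCA params. */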
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }
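    /* Sketch (an assumption about code elsewhere in this file, shown only
     * for context): the pipe_closed() handler registered above fires when
     * the singleton exits and its end of the pipe closes, and it simply
     * shuts the daemon down, along the lines of:
     *
     *     static void pipe_closed(int fd, short flags, void *arg)
     *     {
     *         opal_event_t *ev = (opal_event_t*)arg;
     *         opal_event_free(ev);
     *         ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
     *     }
     */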

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */
    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;
        opal_value_t val;

        /* set the contact info into our local database */
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }
        OBJ_CONSTRUCT(&val, opal_value_t);
        val.key = OPAL_PMIX_PROC_URI;
        val.type = OPAL_STRING;
        val.data.string = orte_parent_uri;
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&parent, &val))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&val);
            goto DONE;
        }
        val.key = NULL;
        val.data.string = NULL;
        OBJ_DESTRUCT(&val);

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }
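    /* Illustration (value invented): a URI as parsed above has the form
     *     "<jobid>.<vpid>;<transport contact info>"
     * e.g. "1234589568.3;tcp://10.0.0.7:40123" -- the process name before
     * the ';', the contact info after it. */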

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        orte_process_name_t target;
        target.jobid = ORTE_PROC_MY_NAME->jobid;

        if (orte_fwd_mpirun_port || orte_static_ports) {
            /* setup the rollup callback */
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
                                    ORTE_RML_PERSISTENT, rollup, NULL);
            target.vpid = ORTE_PROC_MY_NAME->vpid;
            /* since we will be waiting for any children to send us
             * their rollup info before sending to our parent, save
             * a little time in the launch phase by "warming up" the
             * connection to our parent while we wait for our children */
            buffer = OBJ_NEW(opal_buffer_t);  // zero-byte message
            if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                   ORTE_PROC_MY_PARENT, buffer,
                                                   ORTE_RML_TAG_WARMUP_CONNECTION,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        } else {
            target.vpid = 0;
        }

        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* get any connection info we may have pushed */
        {
            opal_value_t *val = NULL, *kv;
            opal_list_t *modex;
            int32_t flag;

            if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
                /* just pack a marker indicating we don't have any to share */
                flag = 0;
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            } else {
                /* the data is returned as a list of key-value pairs in the opal_value_t */
                if (OPAL_PTR == val->type) {
                    modex = (opal_list_t*)val->data.ptr;
                    flag = (int32_t)opal_list_get_size(modex);
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &kv, 1, OPAL_VALUE))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(buffer);
                            goto DONE;
                        }
                    }
                    OPAL_LIST_RELEASE(modex);
                } else {
                    opal_output(0, "VAL KEY: %s", (NULL == val->key) ? "NULL" : val->key);
                    /* single value */
                    flag = 1;
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &val, 1, OPAL_VALUE))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                }
                OBJ_RELEASE(val);
            }
Exemple #20
0
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;

    orte_proc_t *child, *ptr;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    int i;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted:proc_errors process %s error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors finalizing - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
        ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orted lifeline lost - exiting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* set our exit status */
        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
        /* kill our children */
        killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
        /* terminate - our routed children will see
         * us leave and automatically die
         */
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* must already be complete */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - we can't seem to trust that we will catch the waitpid
             * in this situation, so push this over to be handled as if
             * it were a waitpid trigger so we don't create a bunch of
             * duplicate code */
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto cleanup;
            }
            /* leave the exit code alone - process this as a waitpid */
            odls_base_default_wait_local_proc(child, NULL);
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default:orted daemon %s exited",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* if we are using static ports, then it is possible that the HNP
         * will not see this termination. So if the HNP didn't order us
         * to terminate, then we should ensure it knows */
        if (orte_static_ports && !orte_orteds_term_ordered) {
            /* send an alert to the HNP */
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto cleanup;
            }
            /* set the exit code to reflect the problem */
            child->exit_code = ORTE_ERR_COMM_FAILURE;
            /* pack only the data for this daemon - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }

            /* now pack the daemon's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting lost connection to daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc)));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            /* continue on */
            goto cleanup;
        }

        if (orte_orteds_term_ordered) {
            /* are any of my children still alive */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                             "%s errmgr:default:orted[%s(%d)] proc %s is alive",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             __FILE__, __LINE__,
                                             ORTE_NAME_PRINT(&child->name)));
                        goto cleanup;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            } else {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted not exiting, num_routes() == %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int)orte_routed.num_routes()));
            }
        }
        /* if not, then we can continue */
        goto cleanup;
    }

    if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }
    /* if this is not a local proc for this job, we can
     * ignore this call
     */
    if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors proc is not local - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted got state %s for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc)));

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        /* update the state */
        child->state = state;
        /* report this as abnormal termination to the HNP, unless we already have
         * done so for this job */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }

            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    if (ORTE_PROC_STATE_FAILED_TO_START == state ||
        ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
        /* update the proc state */
        child->state = state;
        /* count the proc as having "terminated" */
        jdata->num_terminated++;
        /* leave the error report in this case to the
         * state machine, which will receive notice
         * when all local procs have attempted to start
         * so that we send a consolidated error report
         * back to the HNP
         */
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if we were ordered to terminate, see if
         * any of our routes or local children remain alive - if not, then
         * terminate ourselves. */
        if (orte_orteds_term_ordered) {
            /* mark the child as no longer alive and update the counters, if necessary.
             * we have to do this here as we aren't going to send this to the state
             * machine, and we want to keep the bookkeeping accurate just in case */
            if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED);
                jdata->num_terminated++;
            }
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        goto keep_going;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            }
            /* no need to alert the HNP - we are already on our way out */
            goto cleanup;
        }

    keep_going:
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away - but
         * only do this once!
         */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            child->state = state;
            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
            ORTE_ERROR_LOG(rc);
            return;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));

        /* remove all of this job's children from the global list */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (jdata->jobid == ptr->name.jobid) {
                opal_pointer_array_set_item(orte_local_children, i, NULL);
                OBJ_RELEASE(ptr);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jdata->jobid);

        /* remove this job from our local job data since it is complete */
        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
        OBJ_RELEASE(jdata);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        return;
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
Exemple #21
0
static void orte_job_destruct(orte_job_t* job)
{
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_job_t *jdata;
    int n;
    orte_timer_t *evtimer;

    if (NULL == job) {
        /* probably just a race condition - just return */
        return;
    }
    
    if (orte_debug_flag) {
        opal_output(0, "%s Releasing job data for %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid));
    }
    
    for (n=0; n < job->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, n))) {
            continue;
        }
        OBJ_RELEASE(app);
    }
    OBJ_RELEASE(job->apps);
    
    /* release any pointers in the attributes */
    evtimer = NULL;
    if (orte_get_attribute(&job->attributes, ORTE_JOB_FAILURE_TIMER_EVENT,
                           (void**)&evtimer, OPAL_PTR)) {
        orte_remove_attribute(&job->attributes, ORTE_JOB_FAILURE_TIMER_EVENT);
        /* the timer is a pointer to orte_timer_t */
        OBJ_RELEASE(evtimer);
    }
    proc = NULL;
    if (orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC,
                           (void**)&proc, OPAL_PTR)) {
        orte_remove_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC);
        /* points to an orte_proc_t */
        OBJ_RELEASE(proc);
    }

    if (NULL != job->map) {
        OBJ_RELEASE(job->map);
        job->map = NULL;
    }
    
    for (n=0; n < job->procs->size; n++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(job->procs, n))) {
            continue;
        }
        OBJ_RELEASE(proc);
    }
    OBJ_RELEASE(job->procs);
    
    /* release the attributes */
    OPAL_LIST_DESTRUCT(&job->attributes);

    /* find the job in the global array */
    if (NULL != orte_job_data && ORTE_JOBID_INVALID != job->jobid) {
        for (n=0; n < orte_job_data->size; n++) {
            if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
                continue;
            }
            if (jdata->jobid == job->jobid) {
                /* set the entry to NULL */
                opal_pointer_array_set_item(orte_job_data, n, NULL);
                break;
            }
        }
    }
}
Exemple #22
0
static void job_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_job_state_t jobstate;
    opal_buffer_t *answer;
    int32_t rc, ret;
    int room, *rmptr;

    ORTE_ACQUIRE_OBJECT(caddy);

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return;
    }

    /* if the jdata is NULL, then we ignore it as this
     * is reporting an unrecoverable error
     */
    if (NULL == caddy->jdata) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        OBJ_RELEASE(caddy);
        return;
    }

    /* update the state */
    jdata = caddy->jdata;
    jobstate = caddy->job_state;
    jdata->state = jobstate;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:dvm: job %s reported state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

    if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        /* if the daemon job aborted and we haven't heard from everyone yet,
         * then this could well have been caused by a daemon not finding
         * a way back to us. In this case, output a message indicating a daemon
         * died without reporting. Otherwise, say nothing as we
         * likely already output an error message */
        if (ORTE_JOB_STATE_ABORTED == jobstate &&
            jdata->num_procs != jdata->num_reported) {
            orte_routing_is_enabled = false;
            orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
        }
        /* there really isn't much else we can do since the problem
         * is in the DVM itself, so best just to terminate */
        jdata->num_terminated = jdata->num_procs;
        /* activate the terminated state so we can exit */
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        OBJ_RELEASE(caddy);
        return;
    }

    /* all other cases involve jobs submitted to the DVM - therefore,
     * we only inform the submitter of the problem, but do NOT terminate
     * the DVM itself */

    rc = jobstate;
    answer = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(caddy);
        return;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(caddy);
        return;
    }
    /* pack the room number */
    rmptr = &room;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(caddy);
            return;
        }
    }
    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:dvm sending notification of job %s failure to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         ORTE_NAME_PRINT(&jdata->originator)));
    if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                           &jdata->originator, answer,
                                           ORTE_RML_TAG_LAUNCH_RESP,
                                           orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(answer);
    }
    /* ensure we terminate any processes left running in the DVM */
    _terminate_job(jdata->jobid);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
Exemple #23
0
static int setup_fork(orte_job_t *jdata,
                      orte_app_context_t *app)
{
    int i;
    bool takeus = false;
    char *p, *t2;
    char dir[MAXPATHLEN];

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included */
        for (i=0; NULL != jdata->personality[i]; i++) {
            if (0 == strcmp(jdata->personality[i], "singularity")) {
                takeus = true;
                break;
            }
        }
    }
    if (!takeus) {
        /* even if they didn't specify, check to see if
         * this involves a singularity container */
        if (0 != strcmp(app->argv[0],"singularity") &&
            0 != strcmp(app->argv[0],"sapprun") &&
            NULL == strstr(app->argv[0], ".sapp")) {
            /* guess not! */
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* set the singularity cache dir, unless asked not to do so */
    if (!orte_get_attribute(&app->attributes, ORTE_APP_NO_CACHEDIR, NULL, OPAL_BOOL)) {
        opal_setenv("SINGULARITY_CACHEDIR", orte_process_info.job_session_dir, true, &app->env);
        opal_setenv("SINGULARITY_CACHEDIR", orte_process_info.job_session_dir, true, &environ);
    }

    /* save our current directory */
    getcwd(dir, sizeof(dir));

    /* change to the working directory for this context */
    chdir(app->cwd);

    /* if the app contains .sapp, then we need to strip that
     * extension so singularity doesn't bark at us */
    if (NULL != strstr(app->argv[0], ".sapp")) {
        /* ensure the app is installed */
        opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                            "%s schizo:singularity: installing app %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->argv[0]);
        t2 = opal_basename(app->argv[0]);
        p = strstr(t2, ".sapp");
        *p = '\0'; // strip the extension
        if (0 < opal_output_get_verbosity(orte_schizo_base_framework.framework_output)) {
            (void)asprintf(&p, "singularity -vv install --runkey %s %s", t2, app->argv[0]);
        } else {
            (void)asprintf(&p, "singularity --quiet install --runkey %s %s", t2, app->argv[0]);
        }
        system(p);
        free(p);
        free(app->argv[0]);
        app->argv[0] = t2;
    }

    /* ensure that we use "singularity run" to execute this app */
    if (0 != strcmp(app->app, "singularity")) {
        opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                            "%s schizo:singularity: adding singularity cmd",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* change the app to the "singularity" command */
        free(app->app);
        app->app = strdup("singularity");
        opal_argv_prepend_nosize(&app->argv, "run");
        if (0 < opal_output_get_verbosity(orte_schizo_base_framework.framework_output)) {
            opal_argv_prepend_nosize(&app->argv, "-vv");
        } else {
            opal_argv_prepend_nosize(&app->argv, "--quiet");
        }
        opal_argv_prepend_nosize(&app->argv, "singularity");
    }
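    /* Illustration (not in the original source): an app that arrived as
     *     argv = { "myapp.sapp", "arg1" }
     * leaves this function as
     *     app->app = "singularity"
     *     argv     = { "singularity", "--quiet", "run", "myapp", "arg1" }
     * (with "-vv" in place of "--quiet" when verbose output is enabled). */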

    /* return to the original directory */
    chdir(dir);

    return ORTE_SUCCESS;
}
Exemple #24
0
/* stuff proc attributes for sending back to a proc */
int orte_pmix_server_register_nspace(orte_job_t *jdata)
{
    int rc;
    orte_proc_t *pptr;
    int i, k, n;
    opal_list_t *info, *pmap;
    opal_value_t *kv;
    orte_node_t *node, *mynode;
    opal_vpid_t vpid;
    char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist;
    orte_job_t *dmns;
    orte_job_map_t *map;
    orte_app_context_t *app;
    uid_t uid;
    gid_t gid;
    opal_list_t *cache;

    opal_output_verbose(2, orte_pmix_server_globals.output,
                        "%s register nspace for %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* setup the info list */
    info = OBJ_NEW(opal_list_t);
    uid = geteuid();
    gid = getegid();

    /* jobid */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOBID);
    kv->data.string = strdup(ORTE_JOBID_PRINT(jdata->jobid));
    kv->type = OPAL_STRING;
    opal_list_append(info, &kv->super);

    /* offset */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NPROC_OFFSET);
    kv->data.uint32 = jdata->offset;
    kv->type = OPAL_UINT32;
    opal_list_append(info, &kv->super);

    /* check for cached values to add to the job info */
    cache = NULL;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) &&
        NULL != cache) {
        while (NULL != (kv = (opal_value_t*)opal_list_remove_first(cache))) {
            opal_list_append(info, &kv->super);
        }
        orte_remove_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE);
        OBJ_RELEASE(cache);
    }

    /* assemble the node and proc map info */
    list = NULL;
    procs = NULL;
    map = jdata->map;
    for (i=0; i < map->nodes->size; i++) {
        micro = NULL;
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            opal_argv_append_nosize(&list, node->name);
            /* assemble all the ranks for this job that are on this node */
            for (k=0; k < node->procs->size; k++) {
                if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
                    if (jdata->jobid == pptr->name.jobid) {
                        opal_argv_append_nosize(&micro, ORTE_VPID_PRINT(pptr->name.vpid));
                    }
                }
            }
            /* assemble the rank/node map */
            if (NULL != micro) {
                tmp = opal_argv_join(micro, ',');
                opal_argv_free(micro);
                opal_argv_append_nosize(&procs, tmp);
                free(tmp);
            }
        }
    }
    /* let the PMIx server generate the nodemap regex */
    if (NULL != list) {
        tmp = opal_argv_join(list, ',');
        opal_argv_free(list);
        list = NULL;
        if (OPAL_SUCCESS != (rc = opal_pmix.generate_regex(tmp, &regex))) {
            ORTE_ERROR_LOG(rc);
            free(tmp);
            OPAL_LIST_RELEASE(info);
            return rc;
        }
        free(tmp);
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_NODE_MAP);
        kv->type = OPAL_STRING;
        kv->data.string = regex;
        opal_list_append(info, &kv->super);
    }

    /* let the PMIx server generate the procmap regex */
    if (NULL != procs) {
        tmp = opal_argv_join(procs, ';');
        opal_argv_free(procs);
        procs = NULL;
        if (OPAL_SUCCESS != (rc = opal_pmix.generate_ppn(tmp, &regex))) {
            ORTE_ERROR_LOG(rc);
            free(tmp);
            OPAL_LIST_RELEASE(info);
            return rc;
        }
        free(tmp);
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_PROC_MAP);
        kv->type = OPAL_STRING;
        kv->data.string = regex;
        opal_list_append(info, &kv->super);
    }
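    /* Illustration (the exact encoding is version-dependent; values are
     * invented): the PMIx regex generators compress the lists assembled
     * above, e.g. a node list "nid0000,nid0001,...,nid0063" into something
     * like "nid[4:0-63]", and a ppn list "0,1;2,3" (the ','-joined ranks on
     * each node, ';'-separated across nodes) into a similarly compact
     * ranges string. */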

    /* get our local node */
    if (NULL == (dmns = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(dmns->procs, ORTE_PROC_MY_NAME->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    mynode = pptr->node;
    if (NULL == mynode) {
        /* cannot happen */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    /* pass our node ID */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NODEID);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = mynode->index;
    opal_list_append(info, &kv->super);

    /* pass our node size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NODE_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = mynode->num_procs;
    opal_list_append(info, &kv->super);

    /* pass the number of nodes in the job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NUM_NODES);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = map->num_nodes;
    opal_list_append(info, &kv->super);

    /* univ size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_UNIV_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->total_slots_alloc;
    opal_list_append(info, &kv->super);

    /* job size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOB_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_procs;
    opal_list_append(info, &kv->super);

    /* number of apps in this job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOB_NUM_APPS);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_apps;
    opal_list_append(info, &kv->super);

    /* local size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_LOCAL_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_local_procs;
    opal_list_append(info, &kv->super);

    /* max procs */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_MAX_PROCS);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->total_slots_alloc;
    opal_list_append(info, &kv->super);

    /* register any local clients */
    vpid = ORTE_VPID_MAX;
    for (i=0; i < mynode->procs->size; i++) {
        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) {
            continue;
        }
        if (pptr->name.jobid == jdata->jobid) {
            if (pptr->name.vpid < vpid) {
                vpid = pptr->name.vpid;
            }
            /* go ahead and register this client */
            if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
                                                                       (void*)pptr, NULL, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        }
    }
    /* pass the local ldr */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_LOCALLDR);
    kv->type = OPAL_VPID;
    kv->data.name.vpid = vpid;
    opal_list_append(info, &kv->super);

    /* for each proc in this job, create an object that
     * includes the info describing the proc so the recipient has a complete
     * picture. This allows procs to connect to each other without
     * any further info exchange, assuming the underlying transports
     * support it. We also pass all the proc-specific data here so
     * that each proc can lookup info about every other proc in the job */

    for (n=0; n < map->nodes->size; n++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
            continue;
        }
        /* construct the list of local peers, while adding
         * each proc's locality info */
        list = NULL;
        procs = NULL;
        cpulist = NULL;
        peerlist = NULL;
        vpid = ORTE_VPID_MAX;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (pptr->name.jobid == jdata->jobid) {
                opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
                if (pptr->name.vpid < vpid) {
                    vpid = pptr->name.vpid;
                }
                /* note that we have to pass the cpuset for each local
                 * peer so locality can be computed */
                tmp = NULL;
                if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
                    if (NULL != tmp) {
                        opal_argv_append_nosize(&procs, tmp);
                        free(tmp);
                    } else {
                        opal_argv_append_nosize(&procs, "UNBOUND");
                    }
                } else {
                    opal_argv_append_nosize(&procs, "UNBOUND");
                }
            }
        }
        /* construct the list of peers for transmission */
        if (NULL != list) {
            peerlist = opal_argv_join(list, ',');
            opal_argv_free(list);
            list = NULL;
        }
        /* construct the list of cpusets for transmission */
        if (NULL != procs) {
            cpulist = opal_argv_join(procs, ':');
            opal_argv_free(procs);
            procs = NULL;
        }

        /* if this is me, then pass the peers and cpusets to myself
         * in order to maintain backward compatibility for the non-pmix
         * components in OPAL/pmix */
        if (node == mynode) {
            /* pass the list of peers */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(peerlist);
            opal_list_append(info, &kv->super);

            /* pass the list of cpusets */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(cpulist);
            opal_list_append(info, &kv->super);

        }

        /* now cycle across each proc on this node, passing all data that
         * varies by proc */
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            /* only consider procs from this job */
            if (pptr->name.jobid != jdata->jobid) {
                continue;
            }
            /* setup the proc map object */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_PROC_DATA);
            kv->type = OPAL_PTR;
            kv->data.ptr = OBJ_NEW(opal_list_t);
            opal_list_append(info, &kv->super);
            pmap = kv->data.ptr;

            /* must start with rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_RANK);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = pptr->name.vpid;
            opal_list_append(pmap, &kv->super);

            /* pass the list of peers */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(peerlist);
            opal_list_append(pmap, &kv->super);

            /* pass the list of cpusets */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(cpulist);
            opal_list_append(pmap, &kv->super);

            /* appnum */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_APPNUM);
            kv->type = OPAL_UINT32;
            kv->data.uint32 = pptr->app_idx;
            opal_list_append(pmap, &kv->super);

            /* app ldr */
            app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_APPLDR);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = app->first_rank;
            opal_list_append(pmap, &kv->super);

            /* global/univ rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = pptr->name.vpid + jdata->offset;
            opal_list_append(pmap, &kv->super);

            /* app rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_APP_RANK);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = pptr->app_rank;
            opal_list_append(pmap, &kv->super);

            /* app size */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_APP_SIZE);
            kv->type = OPAL_UINT32;
            kv->data.uint32 = app->num_procs;
            opal_list_append(pmap, &kv->super);  /* per-proc map -- appending to the job-level "info" list here would duplicate the key for every proc */

            /* local rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_RANK);
            kv->type = OPAL_UINT16;
            kv->data.uint16 = pptr->local_rank;
            opal_list_append(pmap, &kv->super);

            /* node rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_NODE_RANK);
            kv->type = OPAL_UINT16;
            kv->data.uint16 = pptr->node_rank;  /* match the OPAL_UINT16 type set above */
            opal_list_append(pmap, &kv->super);

            /* hostname */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_HOSTNAME);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(pptr->node->name);
            opal_list_append(pmap, &kv->super);

            /* node ID */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_NODEID);
            kv->type = OPAL_UINT32;
            kv->data.uint32 = pptr->node->index;
            opal_list_append(pmap, &kv->super);
        }
        /* cleanup */
        if (NULL != cpulist) {
            free(cpulist);
        }
        if (NULL != peerlist) {
            free(peerlist);
        }
    }

    /* mark the job as registered */
    orte_set_attribute(&jdata->attributes, ORTE_JOB_NSPACE_REGISTERED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);

    /* pass it down */
    /* we are in an event, so no need to callback */
    rc = opal_pmix.server_register_nspace(jdata->jobid,
                                          jdata->num_local_procs,
                                          info, NULL, NULL);
    OPAL_LIST_RELEASE(info);

    return rc;
}
Exemple #25
0
static void set(orte_job_t *jobdat,
                orte_proc_t *child,
                char ***environ_copy,
                int write_fd)
{
    hwloc_cpuset_t cpuset;
    hwloc_obj_t root;
    opal_hwloc_topo_data_t *sum;
    orte_app_context_t *context;
    int rc=ORTE_ERROR;
    char *msg, *param;
    char *cpu_bitmap;

    opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                        "%s hwloc:set on child %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == child) ? "NULL" : ORTE_NAME_PRINT(&child->name));

    if (NULL == jobdat || NULL == child) {
        /* nothing for us to do */
        opal_output_verbose(2, orte_rtc_base_framework.framework_output,
                            "%s hwloc:set jobdat %s child %s - nothing to do",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == jobdat) ? "NULL" : ORTE_JOBID_PRINT(jobdat->jobid),
                            (NULL == child) ? "NULL" : ORTE_NAME_PRINT(&child->name));
        return;
    }

    context = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx);

    /* Set process affinity, if given */
    cpu_bitmap = NULL;
    if (!orte_get_attribute(&child->attributes, ORTE_PROC_CPU_BITMAP, (void**)&cpu_bitmap, OPAL_STRING) ||
        NULL == cpu_bitmap || 0 == strlen(cpu_bitmap)) {
        /* if the daemon is bound, then we need to "free" this proc */
        if (NULL != orte_daemon_cores) {
            root = hwloc_get_root_obj(opal_hwloc_topology);
            if (NULL == root->userdata) {
                orte_rtc_base_send_warn_show_help(write_fd,
                                                  "help-orte-odls-default.txt", "incorrectly bound",
                                                  orte_process_info.nodename, context->app,
                                                  __FILE__, __LINE__);
            }
            sum = (opal_hwloc_topo_data_t*)root->userdata;
            /* bind this proc to all available processors */
            rc = hwloc_set_cpubind(opal_hwloc_topology, sum->available, 0);
            /* if we got an error and this wasn't a default binding policy, then report it */
            if (rc < 0  && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
                if (errno == ENOSYS) {
                    msg = "hwloc indicates cpu binding not supported";
                } else if (errno == EXDEV) {
                    msg = "hwloc indicates cpu binding cannot be enforced";
                } else {
                    char *tmp;
                    (void)hwloc_bitmap_list_asprintf(&tmp, sum->available);
                    asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"",
                             opal_strerror(rc), tmp);
                    free(tmp);
                }
                if (OPAL_BINDING_REQUIRED(jobdat->map->binding)) {
                    /* If binding is required, send an error up the pipe (which exits
                       -- it doesn't return). */
                    orte_rtc_base_send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                                                       "binding generic error",
                                                       orte_process_info.nodename, context->app, msg,
                                                       __FILE__, __LINE__);
                } else {
                    orte_rtc_base_send_warn_show_help(write_fd,
                                                      "help-orte-odls-default.txt", "not bound",
                                                      orte_process_info.nodename, context->app, msg,
                                                      __FILE__, __LINE__);
                    return;
                }
            }
        }
        if (0 == rc && opal_hwloc_report_bindings) {
            opal_output(0, "MCW rank %d is not bound (or bound to all available processors)", child->name.vpid);
            /* avoid reporting it twice */
            (void) mca_base_var_env_name ("hwloc_base_report_bindings", &param);
            opal_unsetenv(param, environ_copy);
            free(param);
        }
    } else {
        /* convert the list to a cpuset */
        cpuset = hwloc_bitmap_alloc();
        if (0 != (rc = hwloc_bitmap_list_sscanf(cpuset, cpu_bitmap))) {
            /* asprintf may fail and leave msg unset, so initialize it and
             * fall back to a static message - the allocated string is a
             * small, one-shot leak at worst */
            msg = NULL;
            asprintf(&msg, "hwloc_bitmap_list_sscanf returned \"%s\" for the string \"%s\"",
                     opal_strerror(rc), cpu_bitmap);
            if (NULL == msg) {
                msg = "failed to convert bitmap list to hwloc bitmap";
            }
            if (OPAL_BINDING_REQUIRED(jobdat->map->binding) &&
                OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
                /* If binding is required and a binding directive was explicitly
                 * given (i.e., we are not binding due to a default policy),
                 * send an error up the pipe (which exits -- it doesn't return).
                 */
                orte_rtc_base_send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                                                   "binding generic error",
                                                   orte_process_info.nodename,
                                                   context->app, msg,
                                                   __FILE__, __LINE__);
            } else {
                orte_rtc_base_send_warn_show_help(write_fd,
                                                  "help-orte-odls-default.txt", "not bound",
                                                  orte_process_info.nodename, context->app, msg,
                                                  __FILE__, __LINE__);
                free(cpu_bitmap);
                hwloc_bitmap_free(cpuset);
                return;
            }
        }
        /* bind as specified */
        rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0);
        /* if we got an error and this wasn't a default binding policy, then report it */
        if (rc < 0  && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
            /* track whether msg was heap-allocated - the ENOSYS/EXDEV
             * strings are static and must not be freed */
            bool msg_allocated = false;
            if (errno == ENOSYS) {
                msg = "hwloc indicates cpu binding not supported";
            } else if (errno == EXDEV) {
                msg = "hwloc indicates cpu binding cannot be enforced";
            } else {
                asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"",
                         opal_strerror(rc), cpu_bitmap);
                msg_allocated = true;
            }
            if (OPAL_BINDING_REQUIRED(jobdat->map->binding)) {
                /* If binding is required, send an error up the pipe (which exits
                   -- it doesn't return). */
                orte_rtc_base_send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                                                   "binding generic error",
                                                   orte_process_info.nodename, context->app, msg,
                                                   __FILE__, __LINE__);
            } else {
                orte_rtc_base_send_warn_show_help(write_fd,
                                                  "help-orte-odls-default.txt", "not bound",
                                                  orte_process_info.nodename, context->app, msg,
                                                  __FILE__, __LINE__);
                if (msg_allocated) {
                    free(msg);
                }
                hwloc_bitmap_free(cpuset);
                free(cpu_bitmap);
                return;
            }
            if (msg_allocated) {
                free(msg);
            }
        }
        if (0 == rc && opal_hwloc_report_bindings) {
            char tmp1[1024], tmp2[1024];
            hwloc_cpuset_t mycpus;
            /* get the cpus we are bound to */
            mycpus = hwloc_bitmap_alloc();
            if (hwloc_get_cpubind(opal_hwloc_topology,
                                  mycpus,
                                  HWLOC_CPUBIND_PROCESS) < 0) {
                opal_output(0, "MCW rank %d is not bound",
                            child->name.vpid);
            } else {
                if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), opal_hwloc_topology, mycpus)) {
                    opal_output(0, "MCW rank %d is not bound (or bound to all available processors)", child->name.vpid);
                } else {
                    opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), opal_hwloc_topology, mycpus);
                    opal_output(0, "MCW rank %d bound to %s: %s",
                                child->name.vpid, tmp1, tmp2);
                }
            }
            hwloc_bitmap_free(mycpus);
            /* avoid reporting it twice */
            (void) mca_base_var_env_name ("hwloc_base_report_bindings", &param);
            opal_unsetenv(param, environ_copy);
            free(param);
        }
        /* set memory affinity policy - if we get an error, don't report
         * anything unless the user actually specified the binding policy
         */
        rc = opal_hwloc_base_set_process_membind_policy();
        if (ORTE_SUCCESS != rc  && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
            if (errno == ENOSYS) {
                msg = "hwloc indicates memory binding not supported";
            } else if (errno == EXDEV) {
                msg = "hwloc indicates memory binding cannot be enforced";
            } else {
                msg = "failed to bind memory";
            }
            if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                /* If binding is required, send an error up the pipe (which exits
                   -- it doesn't return). */
                orte_rtc_base_send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                                                   "memory binding error",
                                                   orte_process_info.nodename, context->app, msg,
                                                   __FILE__, __LINE__);
            } else {
                orte_rtc_base_send_warn_show_help(write_fd,
                                                  "help-orte-odls-default.txt", "memory not bound",
                                                  orte_process_info.nodename, context->app, msg,
                                                  __FILE__, __LINE__);
                free(cpu_bitmap);
                hwloc_bitmap_free(cpuset);
                return;
            }
        }
        hwloc_bitmap_free(cpuset);
    }
    if (NULL != cpu_bitmap) {
        free(cpu_bitmap);
    }
}
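
For reference, the binding performed in the else branch of set() boils down to three public hwloc calls: parse the list string into a bitmap, bind the process, and map ENOSYS/EXDEV onto friendlier text. A minimal stand-alone sketch under those assumptions - the "0-1" list is a made-up stand-in for the ORTE_PROC_CPU_BITMAP attribute, and the pipe-based show-help reporting is reduced to stderr (compile with -lhwloc):

#include <errno.h>
#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_bitmap_t cpuset;
    const char *cpu_bitmap = "0-1";   /* stand-in for the ORTE_PROC_CPU_BITMAP string */

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);
    cpuset = hwloc_bitmap_alloc();

    /* convert the list to a cpuset, then bind - mirrors the else branch of set() */
    if (0 != hwloc_bitmap_list_sscanf(cpuset, cpu_bitmap)) {
        fprintf(stderr, "could not parse \"%s\"\n", cpu_bitmap);
    } else if (hwloc_set_cpubind(topo, cpuset, 0) < 0) {
        /* ENOSYS / EXDEV are the two cases set() reports with specific text */
        fprintf(stderr, "bind failed: %s\n",
                errno == ENOSYS ? "cpu binding not supported" :
                errno == EXDEV  ? "cpu binding cannot be enforced" :
                                  "generic error");
    }

    hwloc_bitmap_free(cpuset);
    hwloc_topology_destroy(topo);
    return 0;
}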
Example #26
static int rank_by(orte_job_t *jdata,
                   orte_app_context_t *app,
                   opal_list_t *nodes,
                   hwloc_obj_type_t target,
                   unsigned cache_level)
{
    hwloc_obj_t obj;
    int num_objs, i, j, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;
    int cnt;
    opal_pointer_array_t objs;
    bool all_done;
    opal_list_item_t *item;
    hwloc_obj_t locale;

    if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_span(jdata, app, nodes, target, cache_level);
    } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_fill(jdata, app, nodes, target, cache_level);
    }

    /* if ranking is not spanned or filled, then we
     * default to assign ranks sequentially across
     * target objects within a node until that node
     * is fully ranked, and then move on to the next
     * node
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 2       1 3         8 10      9 11
     *     4 6       5 7        12 14     13 15
     */

    /* setup the pointer array */
    OBJ_CONSTRUCT(&objs, opal_pointer_array_t);
    opal_pointer_array_init(&objs, 2, INT_MAX, 2);

    vpid = jdata->num_procs;
    cnt = 0;
    for (item = opal_list_get_first(nodes);
         item != opal_list_get_end(nodes);
         item = opal_list_get_next(item)) {
        node = (orte_node_t*)item;
        /* get the number of objects - only consider those we can actually use */
        num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                      cache_level, OPAL_HWLOC_AVAILABLE);
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rank_by: found %d objects on node %s with %d procs",
                            num_objs, node->name, (int)node->num_procs);
        if (0 == num_objs) {
            /* don't leak the object array on early return */
            OBJ_DESTRUCT(&objs);
            return ORTE_ERR_NOT_SUPPORTED;
        }
        /* collect all the objects */
        for (i=0; i < num_objs; i++) {
            obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                  cache_level, i, OPAL_HWLOC_AVAILABLE);
            opal_pointer_array_set_item(&objs, i, obj);
        }

        /* cycle across the objects, assigning a proc to each one,
         * until all procs have been assigned - unfortunately, since
         * more than this job may be mapped onto a node, the number
         * of procs on the node can't be used to tell us when we
         * are done. Instead, we have to just keep going until all
         * procs are ranked - which means we have to make one extra
         * pass thru the loop
         *
         * Perhaps someday someone will come up with a more efficient
         * algorithm, but this works for now.
         */
        all_done = false;
        while (!all_done && cnt < app->num_procs) {
            all_done = true;
            /* cycle across the objects */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);

                /* find the next proc on this object */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already ranked */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        continue;
                    }
                    /* ignore procs on other objects */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_by: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
                    /* insert the proc into the jdata array - no harm if already there */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&objs);
                        return rc;
                    }
                    /* flag that one was ranked */
                    all_done = false;
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                    /* move to next object */
                    break;
                }
            }
        }
    }

    /* cleanup */
    OBJ_DESTRUCT(&objs);

    return ORTE_SUCCESS;
}
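
The ordering that the diagram inside rank_by() describes can be reproduced with three nested loops: nodes outermost (each node is completed before the next), then repeated passes round-robining across that node's objects. A stand-alone sketch printing the same 2-node / 2-object / 16-rank pattern as the comment; the counts are illustrative, not taken from any real topology:

#include <stdio.h>

int main(void)
{
    const int nodes = 2, objs = 2, procs_per_obj = 4;
    int vpid = 0;

    /* complete each node before moving on; within a node, round-robin
     * ranks across its objects - yields Obj 0: 0 2 4 6, Obj 1: 1 3 5 7 */
    for (int n = 0; n < nodes; n++) {
        for (int pass = 0; pass < procs_per_obj; pass++) {
            for (int o = 0; o < objs; o++) {
                printf("node %d obj %d -> rank %d\n", n, o, vpid++);
            }
        }
    }
    return 0;
}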
Example #27
static int setup_child(orte_job_t *jdata,
                       orte_proc_t *child,
                       orte_app_context_t *app)
{
    char *param, *value;
    int rc, i;
    int32_t nrestarts=0, *nrptr;
    bool takeus = false;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: setup_child",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* if no personality was specified, then nothing to do */
    if (NULL == jdata->personality) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included */
        for (i=0; NULL != jdata->personality[i]; i++) {
            if (0 == strcmp(jdata->personality[i], "ompi")) {
                takeus = true;
                break;
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* setup the jobid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_jobid", value, true, &app->env);
    free(value);

    /* setup the vpid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_vpid", value, true, &app->env);

    /* although the vpid IS the process' rank within the job, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_RANK", value, true, &app->env);
    free(value);  /* done with this now */

    /* users would appreciate being given a public environmental variable
     * that also represents the local rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_LOCAL_RANK_INVALID == child->local_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->local_rank);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
    free(value);

    /* users would appreciate being given a public environmental variable
     * that also represents the node rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_NODE_RANK_INVALID == child->node_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->node_rank);
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
    /* set an mca param for it too */
    opal_setenv("OMPI_MCA_orte_ess_node_rank", value, true, &app->env);
    free(value);

    /* provide the identifier for the PMIx connection - the
     * PMIx connection is made prior to setting the process
     * name itself. Although in most cases the ID and the
     * process name are the same, it isn't necessarily
     * required */
    orte_util_convert_process_name_to_string(&value, &child->name);
    opal_setenv("PMIX_ID", value, true, &app->env);
    free(value);

    nrptr = &nrestarts;
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NRESTARTS, (void**)&nrptr, OPAL_INT32)) {
        /* pass the number of restarts for this proc - will be zero for
         * an initial start, but procs would like to know if they are being
         * restarted so they can take appropriate action
         */
        asprintf(&value, "%d", nrestarts);
        opal_setenv("OMPI_MCA_orte_num_restarts", value, true, &app->env);
        free(value);
    }

    /* if the proc should not barrier in orte_init, tell it */
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL)
        || 0 < nrestarts) {
        opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env);
    }

    /* if we are using staged execution, tell it */
    if (orte_staged_execution) {
        opal_setenv("OMPI_MCA_orte_staged_execution", "1", true, &app->env);
    }

    /* if the proc isn't going to forward IO, then we need to flag that
     * it has "completed" iof termination as otherwise it will never fire
     */
    if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
    }

    /* pass an envar so the proc can find any files it had prepositioned */
    param = orte_process_info.proc_session_dir;
    opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);

    /* if the user wanted the cwd to be the proc's session dir, then
     * switch to that location now
     */
    if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
        /* create the session dir - may not exist */
        if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(param, S_IRWXU))) {
            ORTE_ERROR_LOG(rc);
            /* doesn't exist with correct permissions, and/or we can't
             * create it - either way, we are done
             */
            return rc;
        }
        /* change to it */
        if (0 != chdir(param)) {
            return ORTE_ERROR;
        }
        /* It seems that chdir doesn't
         * adjust the $PWD enviro variable when it changes the directory. This
         * can cause a user to get a different response when doing getcwd vs
         * looking at the enviro variable. To keep this consistent, we explicitly
         * ensure that the PWD enviro variable matches the CWD we moved to.
         *
         * NOTE: if a user's program does a chdir(), then $PWD will once
         * again not match getcwd! This is beyond our control - we are only
         * ensuring they start out matching.
         */
        opal_setenv("PWD", param, true, &app->env);
        /* update the initial wdir value too */
        opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
    }
    return ORTE_SUCCESS;
}
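
setup_child() ultimately just serializes rank values into well-known environment variables. A minimal sketch of that handoff using POSIX setenv in place of opal_setenv (both overwrite when told to); the vpid/local/node values are hard-coded stand-ins for the child proc's fields:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    char value[32];
    unsigned long vpid = 3, local_rank = 1, node_rank = 1;

    /* mirror the OMPI_COMM_WORLD_* exports done in setup_child() */
    snprintf(value, sizeof(value), "%lu", vpid);
    setenv("OMPI_COMM_WORLD_RANK", value, 1);

    snprintf(value, sizeof(value), "%lu", local_rank);
    setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, 1);

    snprintf(value, sizeof(value), "%lu", node_rank);
    setenv("OMPI_COMM_WORLD_NODE_RANK", value, 1);

    printf("rank=%s local=%s node=%s\n",
           getenv("OMPI_COMM_WORLD_RANK"),
           getenv("OMPI_COMM_WORLD_LOCAL_RANK"),
           getenv("OMPI_COMM_WORLD_NODE_RANK"));
    return 0;
}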
Example #28
static int rank_span(orte_job_t *jdata,
                     orte_app_context_t *app,
                     opal_list_t *nodes,
                     hwloc_obj_type_t target,
                     unsigned cache_level)
{
    hwloc_obj_t obj;
    int num_objs, i, j, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;
    int cnt;
    opal_list_item_t *item;
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rank_span: for job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* if the ranking is spanned, then we perform the
     * ranking as if it was one big node - i.e., we
     * rank one proc on each object, step to the next object
     * moving across all the nodes, then wrap around to the
     * first object on the first node.
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 4       1 5         2 6       3 7
     *     8 12      9 13       10 14     11 15
     */

    /* In the interest of getting this committed in finite time,
     * just loop across the nodes and objects until all procs
     * are mapped
     */

    vpid = jdata->num_procs;
    cnt = 0;
    while (cnt < app->num_procs) {
        for (item = opal_list_get_first(nodes);
             item != opal_list_get_end(nodes);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            /* get the number of objects - only consider those we can actually use */
            num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                          cache_level, OPAL_HWLOC_AVAILABLE);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rank_span: found %d objects on node %s with %d procs",
                                num_objs, node->name, (int)node->num_procs);
            if (0 == num_objs) {
                return ORTE_ERR_NOT_SUPPORTED;
            }

            /* for each object */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                      cache_level, i, OPAL_HWLOC_AVAILABLE);

                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:rank_span: working object %d", i);

                /* cycle thru the procs on this node */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already assigned */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                    /* protect against bozo case */
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERROR);
                        return ORTE_ERROR;
                    }
                    /* ignore procs not on this object */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_span: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid));
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;

                    /* insert the proc into the jdata array - no harm if already there */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                    /* move to next object */
                    break;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
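
The span ordering differs from the default only in loop nesting: the pass over objects is outermost, so consecutive ranks step across every object of every node before wrapping around. A stand-alone sketch printing the same 2-node / 2-object pattern as the diagram above; counts are illustrative:

#include <stdio.h>

int main(void)
{
    const int nodes = 2, objs = 2, procs_per_obj = 4;
    int vpid = 0;

    /* one rank per object per sweep, across all nodes, then wrap -
     * yields Node 0 Obj 0: 0 4 8 12, Obj 1: 1 5 9 13, and so on */
    for (int pass = 0; pass < procs_per_obj; pass++) {
        for (int n = 0; n < nodes; n++) {
            for (int o = 0; o < objs; o++) {
                printf("node %d obj %d -> rank %d\n", n, o, vpid++);
            }
        }
    }
    return 0;
}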
Example #29
void orte_iof_mrorted_read_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    opal_buffer_t *buf=NULL;
    int rc;
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    orte_ns_cmp_bitmask_t mask;
    orte_job_t *jdata;
    orte_job_map_t *map;
    int i;
    bool write_out=false;
    orte_node_t *node;
    orte_proc_t *daemon;
    orte_jobid_t stdout_target, *jbptr;

    /* read up to the fragment size */
    numbytes = read(fd, data, sizeof(data));
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:mrorted:read handler read %d bytes from %s, fd %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         numbytes, ORTE_NAME_PRINT(&rev->name), fd));
    
    if (numbytes <= 0) {
        if (0 > numbytes) {
            /* either we have a connection error or it was a non-blocking read */
            if (EAGAIN == errno || EINTR == errno) {
                /* non-blocking, retry */
                opal_event_add(rev->ev, 0);
                return;
            } 

            OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                                 "%s iof:mrorted:read handler %s Error on connection:%d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&rev->name), fd));
        }
        /* numbytes must have been zero, so go down and close the fd etc */
        goto CLEAN_RETURN;
    }
    
    /* see if the user wanted the output directed to files */
    if (NULL != orte_output_filename) {
        /* find the sink for this rank */
        for (item = opal_list_get_first(&mca_iof_mr_orted_component.sinks);
             item != opal_list_get_end(&mca_iof_mr_orted_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
            /* if the target is set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
                continue;
            }
            /* if this sink isn't for output, ignore it */
            if (ORTE_IOF_STDIN & sink->tag) {
                continue;
            }

            mask = ORTE_NS_CMP_ALL;

            /* is this the desired proc? */
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
                /* output to the corresponding file */
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
                /* done */
                break;
            }
        }
    }
    
    if (ORTE_IOF_STDOUT & rev->tag) {
        /* see if we need to forward this output */
        stdout_target = ORTE_JOBID_INVALID;
        jbptr = &stdout_target;
        jdata = orte_get_job_data_object(rev->name.jobid);
        if (NULL == jdata ||
            !orte_get_attribute(&jdata->attributes, ORTE_JOB_STDOUT_TARGET, (void**)&jbptr, ORTE_JOBID)) {
            /* no job object or no target set - end of the chain, just output the info */
            write_out = true;
            goto PROCESS;
        }
        /* it goes to the next job in the chain */
        jdata = orte_get_job_data_object(stdout_target);
        map = jdata->map;
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            daemon = node->daemon;
            if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
                /* if it is me, then send the bytes down the stdin pipe
                 * for every local proc (they are all on my proct list)
                 */
                for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
                     item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
                     item = opal_list_get_next(item)) {
                    proct = (orte_iof_proc_t*)item;
                    if (proct->name.jobid == jdata->jobid) {
                        if (NULL == proct->sink) {
                            opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
                            continue;
                        }
                        orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev);
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                                     "%s sending %d bytes from stdout of %s to daemon %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&rev->name),
                                     ORTE_NAME_PRINT(&daemon->name)));
                
                /* send the data to the daemon so it can
                 * write it to all local procs from this job
                 */
                send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
            }
        }
    }
    
 PROCESS:
    if (write_out) {
        /* prep the buffer */
        buf = OBJ_NEW(opal_buffer_t);
    
        /* pack the stream first - we do this so that flow control messages can
         * consist solely of the tag
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->tag, 1, ORTE_IOF_TAG))) {
            ORTE_ERROR_LOG(rc);
            goto CLEAN_RETURN;
        }
    
        /* pack name of process that gave us this data */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->name, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto CLEAN_RETURN;
        }
    
        /* pack the data - only pack the #bytes we read! */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &data, numbytes, OPAL_BYTE))) {
            ORTE_ERROR_LOG(rc);
            goto CLEAN_RETURN;
        }

        /* start non-blocking RML call to forward received data */
        OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                             "%s iof:mrorted:read handler sending %d bytes to HNP",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes));
    
        orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
                                orte_rml_send_callback, NULL);
    }
    
    /* re-add the event */
    opal_event_add(rev->ev, 0);

    return;
   
 CLEAN_RETURN:
    /* must be an error, or zero bytes were read indicating that the
     * proc terminated this IOF channel - either way, find this proc
     * on our list and clean up
     */
    for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
         item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
         item = opal_list_get_next(item)) {
        proct = (orte_iof_proc_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
            /* found it - release corresponding event. This deletes
             * the read event and closes the file descriptor
             */
            if (rev->tag & ORTE_IOF_STDOUT) {
                if( NULL != proct->revstdout ) {
                    OBJ_RELEASE(proct->revstdout);
                }
            } else if (rev->tag & ORTE_IOF_STDERR) {
                if( NULL != proct->revstderr ) {
                    OBJ_RELEASE(proct->revstderr);
                }
            } else if (rev->tag & ORTE_IOF_STDDIAG) {
                if( NULL != proct->revstddiag ) {
                    OBJ_RELEASE(proct->revstddiag);
                }
            }
            /* check to see if they are all done */
            if (NULL == proct->revstdout &&
                NULL == proct->revstderr &&
                NULL == proct->revstddiag) {
                /* this proc's iof is complete */
                opal_list_remove_item(&mca_iof_mr_orted_component.procs, item);
                ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
                OBJ_RELEASE(proct);
            }
            break;
        }
    }
    if (NULL != buf) {
        OBJ_RELEASE(buf);
    }
    return;
}
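
The top of the handler is a standard non-blocking-read idiom: EAGAIN/EINTR mean re-arm and retry, zero bytes means the peer closed the channel, anything else is a connection error. A minimal sketch of just that triage with plain read(2) - the libevent re-arm done via opal_event_add is reduced to a return code the caller would act on, and MSG_MAX is a made-up stand-in for ORTE_IOF_BASE_MSG_MAX:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#define MSG_MAX 8192   /* stand-in for ORTE_IOF_BASE_MSG_MAX */

int handle_read(int fd)
{
    unsigned char data[MSG_MAX];
    ssize_t numbytes = read(fd, data, sizeof(data));

    if (numbytes < 0) {
        if (EAGAIN == errno || EINTR == errno) {
            return 1;   /* non-blocking: caller should re-add the event and retry */
        }
        perror("read"); /* real connection error */
        return -1;
    }
    if (0 == numbytes) {
        return 0;       /* EOF: peer closed this IOF channel */
    }
    printf("read %zd bytes\n", numbytes);
    return 1;
}

int main(void) { return handle_read(0) < 0 ? 1 : 0; }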
Example #30
int main(int argc, char *argv[])
{
    int rc, i, j;
    opal_cmd_line_t cmd_line;
    char *param, *value;
    orte_job_t *jdata=NULL;
    orte_app_context_t *app;
    char *uri, *ptr;

    /* Setup and parse the command line */
    memset(&myglobals, 0, sizeof(myglobals));
    /* find our basename (the name of the executable) so that we can
       use it in pretty-print error messages */
    myglobals.basename = opal_basename(argv[0]);

    opal_cmd_line_create(&cmd_line, cmd_line_init);
    mca_base_cmd_line_setup(&cmd_line);
    if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true,
                                                  argc, argv)) ) {
        if (OPAL_ERR_SILENT != rc) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(rc));
        }
        return rc;
    }

    /* print version if requested.  Do this before check for help so
       that --version --help works as one might expect. */
    if (myglobals.version) {
        char *str;
        str = opal_info_make_version_str("all",
                                         OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
                                         OPAL_RELEASE_VERSION,
                                         OPAL_GREEK_VERSION,
                                         OPAL_REPO_REV);
        if (NULL != str) {
            fprintf(stdout, "%s %s\n\nReport bugs to %s\n",
                    myglobals.basename, str, PACKAGE_BUGREPORT);
            free(str);
        }
        exit(0);
    }

    /* check if we are running as root - if we are, then only allow
     * us to proceed if the allow-run-as-root flag was given. Otherwise,
     * exit with a giant warning flag
     */
    if (0 == geteuid() && !myglobals.run_as_root) {
        fprintf(stderr, "--------------------------------------------------------------------------\n");
        if (myglobals.help) {
            fprintf(stderr, "%s cannot provide the help message when run as root\n", myglobals.basename);
        } else {
            /* show_help is not yet available, so print an error manually */
            fprintf(stderr, "%s has detected an attempt to run as root.\n", myglobals.basename);
        }
        fprintf(stderr, " This is *strongly* discouraged as any mistake (e.g., in defining TMPDIR) or bug can\n");
        fprintf(stderr, "result in catastrophic damage to the OS file system, leaving\n");
        fprintf(stderr, "your system in an unusable state.\n\n");
        fprintf(stderr, "You can override this protection by adding the --allow-run-as-root\n");
        fprintf(stderr, "option to your cmd line. However, we reiterate our strong advice\n");
        fprintf(stderr, "against doing so - please do so at your own risk.\n");
        fprintf(stderr, "--------------------------------------------------------------------------\n");
        exit(1);
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     * NOTE: It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) {
        exit(1);
    }

    /* Need to initialize OPAL so that install_dirs are filled in */
    if (OPAL_SUCCESS != opal_init(&argc, &argv)) {
        exit(1);
    }

    /* Check for help request */
    if (myglobals.help) {
        char *str, *args = NULL;
        char *project_name = NULL;
        if (0 == strcmp(myglobals.basename, "mpirun")) {
            project_name = "Open MPI";
        } else {
            project_name = "OpenRTE";
        }
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
                                    myglobals.basename, project_name, OPAL_VERSION,
                                    myglobals.basename, args,
                                    PACKAGE_BUGREPORT);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);

        /* If someone asks for help, that should be all we do */
        exit(0);
    }

    /* flag that I am the HNP */
    orte_process_info.proc_type = ORTE_PROC_HNP;

    /* Setup MCA params */
    orte_register_params();

    /* specify the DVM state machine */
    opal_setenv("OMPI_MCA_state", "dvm", true, &environ);

    /* Initialize our Open RTE environment */
    if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
        /* cannot call ORTE_ERROR_LOG as it could be the errmgr
         * never got loaded!
         */
        return rc;
    }
    /* finalize OPAL. As it was opened again from orte_init->opal_init
     * we continue to have a reference count on it. So we have to finalize it twice...
     */
    opal_finalize();

    /* check for request to report uri */
    uri = orte_rml.get_contact_info();
    if (NULL != myglobals.report_uri) {
        FILE *fp;
        if (0 == strcmp(myglobals.report_uri, "-")) {
            /* if '-', then output to stdout */
            printf("VMURI: %s\n", uri);
        } else if (0 == strcmp(myglobals.report_uri, "+")) {
            /* if '+', output to stderr */
            fprintf(stderr, "VMURI: %s\n", uri);
        } else if (0 == strncasecmp(myglobals.report_uri, "file:", strlen("file:"))) {
            ptr = strchr(myglobals.report_uri, ':');
            ++ptr;
            fp = fopen(ptr, "w");
            if (NULL == fp) {
                orte_show_help("help-orterun.txt", "orterun:write_file", false,
                               myglobals.basename, "pid", ptr);
                exit(0);
            }
            fprintf(fp, "%s\n", uri);
            fclose(fp);
        } else {
            fp = fopen(myglobals.report_uri, "w");
            if (NULL == fp) {
                orte_show_help("help-orterun.txt", "orterun:write_file", false,
                               myglobals.basename, "pid", myglobals.report_uri);
                exit(0);
            }
            fprintf(fp, "%s\n", uri);
            fclose(fp);
        }
        free(uri);
    } else {
        printf("VMURI: %s\n", uri);
    }

    /* get the daemon job object - was created by ess/hnp component */
    if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        orte_show_help("help-orterun.txt", "bad-job-object", true,
                       myglobals.basename);
        exit(0);
    }
    /* also should have created a daemon "app" */
    if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
        orte_show_help("help-orterun.txt", "bad-app-object", true,
                       myglobals.basename);
        exit(0);
    }

    /* Did the user specify a prefix, or want prefix by default? */
    if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) {
        size_t param_len;
        /* if both the prefix was given and we have a prefix
         * given above, check to see if they match
         */
        if (opal_cmd_line_is_taken(&cmd_line, "prefix") &&
            NULL != myglobals.prefix) {
            /* if they don't match, then that merits a warning */
            param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
            /* ensure we strip any trailing '/' */
            if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) {
                param[strlen(param)-1] = '\0';
            }
            value = strdup(myglobals.prefix);
            if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) {
                value[strlen(value)-1] = '\0';
            }
            if (0 != strcmp(param, value)) {
                orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
                               true, myglobals.basename, value, param);
                /* let the global-level prefix take precedence since we
                 * know that one is being used
                 */
                free(param);
                param = strdup(myglobals.prefix);
            }
            free(value);
        } else if (NULL != myglobals.prefix) {
            /* strdup so the unconditional free(param) below doesn't
             * free (and dangle) myglobals.prefix */
            param = strdup(myglobals.prefix);
        } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){
            /* must be --prefix alone */
            param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
        } else {
            /* --enable-orterun-prefix-default was given to orterun */
            param = strdup(opal_install_dirs.prefix);
        }

        if (NULL != param) {
            /* "Parse" the param, aka remove superfluous path_sep. */
            param_len = strlen(param);
            while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
                param[param_len-1] = '\0';
                param_len--;
                if (0 == param_len) {
                    orte_show_help("help-orterun.txt", "orterun:empty-prefix",
                                   true, myglobals.basename, myglobals.basename);
                    return ORTE_ERR_FATAL;
                }
            }
            orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING);
            free(param);
        }
    }

    /* Did the user specify a hostfile. Need to check for both
     * hostfile and machine file.
     * We can only deal with one hostfile per app context, otherwise give an error.
     */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
        if(1 < j) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, myglobals.basename, NULL);
            return ORTE_ERR_FATAL;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
        }
    }
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
        if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, myglobals.basename, NULL);
            return ORTE_ERR_FATAL;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
        }
    }

    /* Did the user specify any hosts? */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) {
        char **targ=NULL, *tval;
        for (i = 0; i < j; ++i) {
            value = opal_cmd_line_get_param(&cmd_line, "host", i, 0);
            opal_argv_append_nosize(&targ, value);
        }
        tval = opal_argv_join(targ, ',');
        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING);
        opal_argv_free(targ);
        free(tval);
    }
    OBJ_DESTRUCT(&cmd_line);

    /* setup to listen for commands sent specifically to me, even though I would probably
     * be the one sending them! Unfortunately, since I am a participating daemon,
     * there are times I need to send a command to "all daemons", and that means *I* have
     * to receive it too
     */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* override the notify_completed state so we can send a message
     * back to anyone who submits a job to us telling them the job
     * completed */
    if (ORTE_SUCCESS != (rc = orte_state.set_job_state_callback(ORTE_JOB_STATE_NOTIFY_COMPLETED, notify_requestor))) {
        ORTE_ERROR_LOG(rc);
        ORTE_UPDATE_EXIT_STATUS(rc);
        exit(orte_exit_status);
    }

    /* spawn the DVM - we skip the initial steps as this
     * isn't a user-level application */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATE);

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
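
The prefix clean-up in main() is worth isolating: strip trailing path separators one at a time, and treat an all-separator string as a fatal empty prefix. A stand-alone sketch of that loop; the input path and the PATH_SEP define are illustrative stand-ins for the parsed --prefix value and OPAL_PATH_SEP:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PATH_SEP "/"   /* stand-in for OPAL_PATH_SEP */

int main(void)
{
    char *param = strdup("/opt/ompi///");   /* stand-in for the --prefix value */
    size_t param_len = strlen(param);

    /* "Parse" the param, aka remove superfluous path_sep */
    while (0 == strcmp(PATH_SEP, &param[param_len - 1])) {
        param[--param_len] = '\0';
        if (0 == param_len) {
            fprintf(stderr, "empty prefix\n");
            free(param);
            return 1;
        }
    }
    printf("prefix: %s\n", param);   /* -> /opt/ompi */
    free(param);
    return 0;
}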