static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sender,
                                                      opal_buffer_t* buffer)
{
    opal_buffer_t answer;
    orte_std_cntr_t count;
    orte_job_t *jdata = NULL;
    orte_proc_t **procs = NULL;
    orte_process_name_t name;
    int rc;

    OBJ_CONSTRUCT(&answer, opal_buffer_t);

    /*
     * Unpack the data
     */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &name, &count, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto CLEANUP;
    }

    /*
     * Process the data
     */
    /* get the job data object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto CLEANUP;
    }
    /* get the proc object for it */
    procs = (orte_proc_t**)jdata->procs->addr;
    if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto CLEANUP;
    }

    /*
     * Send back the answer
     */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto CLEANUP;
    }

    if (0 > (rc = orte_rml.send_buffer(sender, &answer, ORTE_RML_TAG_FILEM_BASE_RESP, 0))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

 CLEANUP:
    OBJ_DESTRUCT(&answer);

}
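/*
 * Hedged sketch (not part of this excerpt): what the requesting side of
 * the exchange above could look like.  It assumes the request travels on
 * ORTE_RML_TAG_FILEM_BASE and is answered on ORTE_RML_TAG_FILEM_BASE_RESP,
 * matching the handler, and uses the same blocking rml/dss calls; treat
 * the function name and error handling as illustrative only.
 */
static int example_get_proc_node_name(orte_process_name_t *proc, char **node_name)
{
    opal_buffer_t request, answer;
    orte_std_cntr_t count = 1;
    int rc;

    OBJ_CONSTRUCT(&request, opal_buffer_t);
    OBJ_CONSTRUCT(&answer, opal_buffer_t);

    /* pack the name of the proc whose node we want to know */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&request, proc, 1, ORTE_NAME))) {
        goto cleanup;
    }
    /* ask the HNP, then block for the string reply */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &request,
                                       ORTE_RML_TAG_FILEM_BASE, 0))) {
        goto cleanup;
    }
    if (0 > (rc = orte_rml.recv_buffer(ORTE_PROC_MY_HNP, &answer,
                                       ORTE_RML_TAG_FILEM_BASE_RESP, 0))) {
        goto cleanup;
    }
    rc = opal_dss.unpack(&answer, node_name, &count, OPAL_STRING);

 cleanup:
    OBJ_DESTRUCT(&request);
    OBJ_DESTRUCT(&answer);
    return rc;
}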
static void poll_spawns(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    int i, rc;
    bool failed_launch = true;
    int local_err;
    tm_event_t event;

    /* TM poll for all the spawns */
    for (i = 0; i < launched; ++i) {
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != rc) {
            opal_output(0, "plm:tm: failed to poll for a spawned daemon, return status = %d", rc);
            goto cleanup;
        }
        if (TM_SUCCESS != local_err) {
            opal_output(0, "plm:tm: failed to spawn daemon, error code = %d", local_err );
            goto cleanup;
        }
    }
    failed_launch = false;

#if 0
    /* set a timer to tell us if one or more daemon's fails to start - use the
     * millisec/daemon timeout provided by the user to compute time
     */
    if (0 < orte_startup_timeout) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tm: setting startup timer for %d milliseconds",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_startup_timeout));
        ORTE_DETECT_TIMEOUT(map->num_new_daemons,
                            orte_startup_timeout*1000,
                            -1, failed_start, state->jdata);
    }
#endif
    
 cleanup:
    /* cleanup */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
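/*
 * Hedged note: poll_spawns() above and the TM launch_daemons() further
 * below share launch state that is not part of this excerpt.  In the
 * surrounding module it is presumably kept at file scope, declared before
 * both functions, roughly as:
 */
static bool connected = false;  /* set once plm_tm_connect() succeeds */
static int launched = 0;        /* number of daemons handed to tm_spawn() */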
/* after we allocate, we need to map the processes
 * so we know what nodes will be used
 */
static void allocation_complete(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = state->jdata;
    orte_job_t *daemons;

    jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;

    /* get the daemon job object */
    if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto done;
    }
    /* mark that we are not using a VM */
    daemons->controls |= ORTE_JOB_CONTROL_NO_VM;

#if OPAL_HAVE_HWLOC
    {
        hwloc_topology_t t;
        orte_node_t *node;
        int i;

        /* ensure that all nodes point to our topology - we
         * cannot support hetero nodes with this state machine
         */
        t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, 0);
        for (i=1; i < orte_node_pool->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                continue;
            }
            node->topology = t;
        }
    }
#endif
    
    /* move to the map stage */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);

 done:
    /* cleanup */
    OBJ_RELEASE(state);
}
static void plm_yarn_launch_apps(int fd, short args, void *cbdata)
{
    int rc;
    orte_job_t *jdata;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* convenience */
    jdata = caddy->jdata;

    if (ORTE_JOB_STATE_LAUNCH_APPS != caddy->job_state) {
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

    /* update job state */
    jdata->state = caddy->job_state;

    /* register recv callback for daemons sync request */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_YARN_SYNC_REQUEST,
                                                      ORTE_RML_PERSISTENT,
                                                      yarn_hnp_sync_recv, jdata))) {
        ORTE_ERROR_LOG(rc);
    }

    orte_plm_base_launch_apps(fd, args, cbdata);

    /* set up the heartbeat with the AM */
    opal_event_t *ev = (opal_event_t*)malloc(sizeof(opal_event_t));

    struct timeval delay;
    delay.tv_sec = 1;
    delay.tv_usec = 0;

    opal_event_evtimer_set(orte_event_base, ev, heartbeat_with_AM_cb, jdata);
    opal_event_evtimer_add(ev, &delay);
}
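/*
 * Hedged sketch: the body of heartbeat_with_AM_cb is not part of this
 * excerpt.  Since opal_event_evtimer_add() arms a one-shot timer, a
 * periodic heartbeat has to re-register itself each time it fires,
 * roughly as below.  The actual exchange with the YARN AM is omitted,
 * and the per-beat allocation simply mirrors the launch code above; a
 * real implementation might instead reuse one event.
 */
static void heartbeat_with_AM_cb(int fd, short event, void *cbdata)
{
    orte_job_t *jdata = (orte_job_t*)cbdata;
    opal_event_t *ev;
    struct timeval delay = {1, 0};

    /* ... send the heartbeat / process the AM's response here ... */

    /* re-arm the one-shot timer for the next beat */
    ev = (opal_event_t*)malloc(sizeof(opal_event_t));
    opal_event_evtimer_set(orte_event_base, ev, heartbeat_with_AM_cb, jdata);
    opal_event_evtimer_add(ev, &delay);
}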
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char *nodelist_flat;
    char **nodelist_argv;
    int nodelist_argc;
    char *vpid_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    int proc_vpid_index;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t nnode;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    bool failed_launch = true;   /* assume failure until the launch succeeds */

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:alps: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
            ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        }
        OBJ_RELEASE(state);
        return;
    }
    
    /* need integer value for command line parameter */
    orte_util_convert_jobid_to_string(&jobid_string, daemons->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * ALPS aprun  OPTIONS
     */

    /* add the aprun command */
    opal_argv_append(&argc, &argv, mca_plm_alps_component.aprun_cmd);

    /* Append user defined arguments to aprun */
    if (NULL != mca_plm_alps_component.custom_args) {
        custom_strings = opal_argv_split(mca_plm_alps_component.custom_args, ' ');
        num_args       = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* number of processors needed */
    opal_argv_append(&argc, &argv, "-n");
    asprintf(&tmp, "%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);
    opal_argv_append(&argc, &argv, "-N");
    opal_argv_append(&argc, &argv, "1");
    opal_argv_append(&argc, &argv, "-cc");
    opal_argv_append(&argc, &argv, "none");

    /* create nodelist */
    nodelist_argv = NULL;
    nodelist_argc = 0;

    for (nnode=0; nnode < map->nodes->size; nnode++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
            continue;
        }

        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (node->daemon_launched) {
            continue;
        }
        
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);

    /* if we are using all allocated nodes, then alps
     * doesn't need a nodelist
     */
    if (map->num_new_daemons < orte_num_allocated_nodes) {
        opal_argv_append(&argc, &argv, "-L");
        opal_argv_append(&argc, &argv, nodelist_flat);
    }


    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);
    
    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv,
                                          NULL,
                                          &proc_vpid_index,
                                          nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_alps: unable to create process name");
        goto cleanup;
    }

    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(vpid_string);
    free(vpid_string);

    if (mca_plm_alps_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "plm:alps: final top-level argv:");
            opal_output(0, "plm:alps:     %s", param);
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the
       corresponding app_context.  If there are multiple,
       different prefix's in the app context, complain (i.e., only
       allow one --prefix option for the entire alps run -- we
       don't support different --prefix'es for different nodes in
       the ALPS plm) */
    cur_prefix = NULL;
    for (i=0; i < state->jdata->apps->size; i++) {
        char *app_prefix_dir;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) {
            continue;
        }
        app_prefix_dir = app->prefix_dir;
        /* Check for already set cur_prefix_dir -- if different,
           complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp (cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-alps.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                goto cleanup;
            }

            /* If not yet set, copy it; iff set, then it's the
               same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                if (mca_plm_alps_component.debug) {
                    opal_output (0, "plm:alps: Set prefix:%s",
                                 cur_prefix);
                }
            }
        }
    }

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);
    
    if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:alps: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }
        
    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    
    if(NULL != jobid_string) {
        free(jobid_string);
    }
    
    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
/* When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map = NULL;
    orte_app_context_t *app;
    orte_node_t *node;
    int proc_vpid_index;
    char *param;
    char **env = NULL;
    char *var;
    char **argv = NULL;
    char **nodeargv;
    int argc = 0;
    int rc;
    orte_std_cntr_t i; 
    char *bin_base = NULL, *lib_base = NULL;
    tm_event_t *tm_events = NULL;
    tm_task_id *tm_task_ids = NULL;
    bool failed_launch = true;
    mode_t current_umask;
    char *nodelist;
    char* vpid_string;
    orte_job_t *daemons, *jdata;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;

    jdata = state->jdata;

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) {
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* setup the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }
    
    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        }
        OBJ_RELEASE(state);
        return;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm: launching vm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* Allocate a bunch of TM events to use for tm_spawn()ing */
    tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
    if (NULL == tm_events) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
    if (NULL == tm_task_ids) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* create a list of nodes in this launch */
    nodeargv = NULL;
    for (i = 0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        
        /* if this daemon already exists, don't launch it! */
        if (node->daemon_launched) {
            continue;
        }
        
        /* add to list */
        opal_argv_append_nosize(&nodeargv, node->name);
    }
    nodelist = opal_argv_join(nodeargv, ',');
    opal_argv_free(nodeargv);
    
    /* Add basic orted command line options */
    orte_plm_base_orted_append_basic_args(&argc, &argv, "tm",
                                          &proc_vpid_index,
                                          nodelist);
    free(nodelist);
    
    if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }

    rc = plm_tm_connect();
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }
    connected = true;

    /* Figure out the basenames for the libdir and bindir.  There is a
       lengthy comment about this in plm_rsh_module.c explaining all
       the rationale for how / why we're doing this. */
    lib_base = opal_basename(opal_install_dirs.libdir);
    bin_base = opal_basename(opal_install_dirs.bindir);

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    /* enable local launch by the orteds */
    var = mca_base_param_env_var ("plm");
    opal_setenv(var, "rsh", true, &env);
    free(var);
    
    /* add our umask -- see big note in orted.c */
    current_umask = umask(0);
    umask(current_umask);
    asprintf(&var, "0%o", current_umask);
    opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
    free(var);
    
    /* If we have a prefix, then modify the PATH and
        LD_LIBRARY_PATH environment variables. We only allow
        a single prefix to be specified. Since there will
        always be at least one app_context, we take it from
        there
    */
    app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
    if (NULL != app->prefix_dir) {
        char *newenv;
        
        for (i = 0; NULL != env && NULL != env[i]; ++i) {
            /* Reset PATH */
            if (0 == strncmp("PATH=", env[i], 5)) {
                asprintf(&newenv, "%s/%s:%s", 
                            app->prefix_dir, bin_base, env[i] + 5);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:tm: resetting PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     newenv));
                opal_setenv("PATH", newenv, true, &env);
                free(newenv);
            } 
            
            /* Reset LD_LIBRARY_PATH */
            else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
                asprintf(&newenv, "%s/%s:%s", 
                            app->prefix_dir, lib_base, env[i] + 16);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:tm: resetting LD_LIBRARY_PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     newenv));
                opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
                free(newenv);
            } 
        }
    }
    
    /* Iterate through each of the nodes and spin
     * up a daemon.
     */
    for (i = 0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        /* if this daemon already exists, don't launch it! */
        if (node->daemon_launched) {
            continue;
        }
 
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tm: launching on node %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name));
        
        /* setup process name */
        rc = orte_util_convert_vpid_to_string(&vpid_string, node->daemon->name.vpid);
        if (ORTE_SUCCESS != rc) {
            opal_output(0, "plm:tm: unable to get daemon vpid as string");
            exit(-1);
        }
        free(argv[proc_vpid_index]);
        argv[proc_vpid_index] = strdup(vpid_string);
        free(vpid_string);
        
        /* exec the daemon */
        if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
            param = opal_argv_join(argv, ' ');
            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s plm:tm: executing:\n\t%s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == param) ? "NULL" : param));
            if (NULL != param) free(param);
        }
        
        rc = tm_spawn(argc, argv, env, node->launch_id, tm_task_ids + launched, tm_events + launched);
        if (TM_SUCCESS != rc) {
            orte_show_help("help-plm-tm.txt", "tm-spawn-failed",
                           true, argv[0], node->name, node->launch_id);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        
        launched++;
    }

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:tm:launch: finished spawning orteds",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

 cleanup:
    /* cleanup */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    orte_job_map_t *map;
    int rc;
    bool failed_launch = true;
    orte_job_t *daemons;
    int launched_proc_num = 0;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:yarn:launch_daemons: LAUNCH DAEMONS CALLED",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }
        
    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:yarn:launch_daemons: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        if (ORTE_JOB_STATE_DAEMONS_REPORTED == daemons->state) {
            ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        }
        OBJ_RELEASE(state);
        return;
    }

    rc = common_launch_process(daemons, true, &launched_proc_num);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if all daemon procs launched successfully, then update the job's state */
    if (launched_proc_num == (daemons->num_procs - 1)) {
        /* indicate that the daemons for this job were launched */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:yarn:launch_daemons: all daemon procs launched successfully via the AM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

cleanup:
    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;

    orte_proc_t *child, *ptr;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc = ORTE_SUCCESS;
    orte_vpid_t null = ORTE_VPID_INVALID;
    orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL;
    int i;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:default_orted:proc_errors process %s error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        goto cleanup;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            goto cleanup;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - ignore */
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s errmgr:default:orted daemon %s exited",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:orted daemon %s was a lifeline - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc)));
            /* kill our children */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* terminate - our routed children will see
             * us leave and automatically die
             */
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            goto cleanup;
        }
        /* are any of my children still alive */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                if (child->alive && child->state < ORTE_PROC_STATE_UNTERMINATED) {
                    goto cleanup;
                }
            }
        }
        /* if all my routes and children are gone, then terminate
           ourselves nicely (i.e., this is a normal termination) */
        if (0 == orte_routed.num_routes()) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:default:orted all routes gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_TERMINATE(0);
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:default:orted not exiting, num_routes() == %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (int)orte_routed.num_routes()));
        }
        /* if not, then we can continue */
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* must already be complete */
        goto cleanup;
    }

    /* if there are no local procs for this job, we can
     * ignore this call
     */
    if (0 == jdata->num_local_procs) {
        goto cleanup;
    }

    /* find this proc in the local children */
    child = NULL;
    for (i=0; i < orte_local_children->size; i++) {
        if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
            continue;
        }
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &ptr->name, proc)) {
            child = ptr;
            break;
        }
    }
    if (NULL == child) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:default_orted got state %s for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc)));
 
    if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
        child->state = state;
        /* Decrement the number of local procs */
        jdata->num_local_procs--;
        /* kill this proc */
        killprocs(proc->jobid, proc->vpid);
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        if (!orte_abort_non_zero_exit) {
            /* leave the child in orte_local_children so we can
             * later send the state info after full job termination
             */
            child->state = state;
            child->waitpid_recvd = true;
            if (child->iof_complete) {
                /* the proc has terminated */
                child->alive = false;
                /* Clean up the session directory as if we were the process
                 * itself.  This covers the case where the process died abnormally
                 * and didn't cleanup its own session directory.
                 */
                orte_session_dir_finalize(&child->name);
                /* track job status */
                jdata->num_terminated++;
            }
            /* treat this as normal termination */
            goto REPORT_STATE;
        }
        /* report this as abnormal termination to the HNP */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return;
        }

        child->state = state;
        /* now pack the child's info */
        if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* remove the child from our local array as it is no longer alive */
        opal_pointer_array_set_item(orte_local_children, i, NULL);
        /* Decrement the number of local procs */
        jdata->num_local_procs--;

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&child->name),
                             jdata->num_local_procs));
        
        /* release the child object */
        OBJ_RELEASE(child);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
        }
        return;
    }

    if (ORTE_PROC_STATE_FAILED_TO_START == state ||
        ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
        /* update the proc state */
        child->state = state;
        /* count the proc as having "terminated" */
        jdata->num_terminated++;
        /* leave the error report in this case to the
         * state machine, which will receive notice
         * when all local procs have attempted to start
         * so that we send a consolidated error report
         * back to the HNP
         */
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away
         */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return;
        }

        child->state = state;
        /* now pack the child's info */
        if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* remove the child from our local array as it is no longer alive */
        opal_pointer_array_set_item(orte_local_children, i, NULL);
        /* Decrement the number of local procs */
        jdata->num_local_procs--;

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&child->name),
                             jdata->num_local_procs));
        
        /* release the child object */
        OBJ_RELEASE(child);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        return;
    }

 REPORT_STATE:
    if (ORTE_PROC_STATE_REGISTERED == state) {
        /* see if everyone in this job has registered */
        if (all_children_registered(proc->jobid)) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */

            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s errmgr:default_orted: sending contact info to HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            alert = OBJ_NEW(opal_buffer_t);
            /* pack init routes command */
            cmd = ORTE_PLM_INIT_ROUTES_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* pack all the local child vpids */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (ptr->name.jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ptr->name.vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        return;
                    }
                }
            }
            /* pack an invalid marker */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* add in contact info for all procs in the job */
            if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);  /* alert was OBJ_NEW'd, so release the object rather than destruct the pointer */
                return;
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM, 0,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        }        
        return;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
            ORTE_ERROR_LOG(rc);
            return;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        
        /* remove all of this job's children from the global list - do not lock
         * the thread as we are already locked
         */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (jdata->jobid == ptr->name.jobid) {
                opal_pointer_array_set_item(orte_local_children, i, NULL);
                OBJ_RELEASE(ptr);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jdata->jobid);

        /* remove this job from our local job data since it is complete */
        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
        OBJ_RELEASE(jdata);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        return;
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
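/*
 * Hedged sketch: pack_state_for_proc() is called above but not included in
 * this excerpt.  Judging from how the alerts are assembled (command, then
 * jobid, then per-proc data), it presumably marshals the child's fields in
 * a fixed order - vpid, pid, state, exit code - roughly as below; the exact
 * field set is an assumption.
 */
static int pack_state_for_proc(opal_buffer_t *alert, orte_proc_t *child)
{
    int rc;

    /* pack the vpid so the HNP knows which proc this report covers */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name.vpid, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* pack its local pid */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* pack its state */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* pack its exit code */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    return ORTE_SUCCESS;
}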
static void job_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_job_state_t jobstate;
    int rc;
    orte_plm_cmd_flag_t cmd;
    opal_buffer_t *alert;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return;
    }

    /* if the jdata is NULL, then we abort as this
     * is reporting an unrecoverable error
     */
    if (NULL == caddy->jdata) {
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
        OBJ_RELEASE(caddy);
        return;
    }

    /* update the state */
    jdata = caddy->jdata;
    jobstate = caddy->job_state;
    jdata->state = jobstate;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_orted: job %s reported error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

    switch (jobstate) {
    case ORTE_JOB_STATE_FAILED_TO_START:
        failed_start(jdata);
        break;
    case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
        /* update all procs in job */
        update_local_children(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
        /* order all local procs for this job to be killed */
        killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
        break;
    case ORTE_JOB_STATE_COMM_FAILED:
        /* kill all local procs */
        killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
        /* order termination */
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
        break;
    case ORTE_JOB_STATE_HEARTBEAT_FAILED:
        /* let the HNP handle this */
        goto cleanup;
        break;

    default:
        break;
    }
    alert = OBJ_NEW(opal_buffer_t);
    /* pack update state command */
    cmd = ORTE_PLM_UPDATE_PROC_STATE;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* pack the job info */
    if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* send it */
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                          ORTE_RML_TAG_PLM, 0,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
void orte_iof_base_write_handler(int fd, short event, void *cbdata)
{
    orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata;
    orte_iof_write_event_t *wev = sink->wev;
    opal_list_item_t *item;
    orte_iof_write_output_t *output;
    int num_written;
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s write:handler writing data to %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         wev->fd));

    while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
        output = (orte_iof_write_output_t*)item;
        if (0 == output->numbytes) {
            /* indicates we are to close this stream */
            OBJ_RELEASE(sink);
            return;
        }
        num_written = write(wev->fd, output->data, output->numbytes);
        if (num_written < 0) {
            if (EAGAIN == errno || EINTR == errno) {
                /* push this item back on the front of the list */
                opal_list_prepend(&wev->outputs, item);
                /* if the list is getting too large, abort */
                if (orte_iof_base.output_limit < opal_list_get_size(&wev->outputs)) {
                    opal_output(0, "IO Forwarding is running too far behind - something is blocking us from writing");
                    ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    goto ABORT;
                }
                /* leave the write event running so it will call us again
                 * when the fd is ready.
                 */
                return;
            }
            /* otherwise, something bad happened so all we can do is abort
             * this attempt
             */
            OBJ_RELEASE(output);
            goto ABORT;
        } else if (num_written < output->numbytes) {
            /* incomplete write - adjust data to avoid duplicate output */
            memmove(output->data, &output->data[num_written], output->numbytes - num_written);
            /* push this item back on the front of the list */
            opal_list_prepend(&wev->outputs, item);
            /* if the list is getting too large, abort */
            if (orte_iof_base.output_limit < opal_list_get_size(&wev->outputs)) {
                opal_output(0, "IO Forwarding is running too far behind - something is blocking us from writing");
                ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto ABORT;
            }
            /* leave the write event running so it will call us again
             * when the fd is ready
             */
            return;
        }
        OBJ_RELEASE(output);
    }
ABORT:
    opal_event_del(wev->ev);
    wev->pending = false;

}
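/*
 * Hedged sketch: the producer side that feeds this handler is not part of
 * this excerpt.  Because the handler above interprets a queued output with
 * numbytes == 0 as an order to close the stream, a producer could signal
 * EOF roughly as below; the exact construction of orte_iof_write_output_t
 * and the event re-arm are assumptions modeled on the handler's usage.
 */
static void example_signal_stream_close(orte_iof_write_event_t *wev)
{
    orte_iof_write_output_t *output;

    /* a zero-length output is the close marker the handler looks for */
    output = OBJ_NEW(orte_iof_write_output_t);
    output->numbytes = 0;
    opal_list_append(&wev->outputs, &output->super);

    /* ensure the write event is armed so the handler actually runs */
    if (!wev->pending) {
        opal_event_add(wev->ev, 0);
        wev->pending = true;
    }
}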
/*
 * Function for selecting one component from all those that are
 * available.
 */
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
    orte_job_t *jdata;
    orte_job_map_t *map;
    int rc;
    bool did_map;
    opal_list_item_t *item;
    orte_rmaps_base_selected_module_t *mod;
    orte_job_t *parent;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* convenience */
    jdata = caddy->jdata;

    /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT
     * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN
     * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
     */
    
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
     * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
     * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
     * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
     * NULL MAP FIELD
     * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
     * DIDN'T SET IT
     */        
    if (NULL == jdata->map) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: creating new map for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        /* create a map object where we will store the results */
        map = OBJ_NEW(orte_job_map_t);
        if (NULL == map) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* load it with the system defaults */
        map->mapping = orte_rmaps_base.mapping;
        map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
        map->binding = opal_hwloc_binding_policy;
#endif
        if (NULL != orte_rmaps_base.ppr) {
            map->ppr = strdup(orte_rmaps_base.ppr);
        }
        map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        map->display_map = orte_rmaps_base.display_map;
        /* assign the map object to this job */
        jdata->map = map;
    } else {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: setting mapping policies for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));

        if (!jdata->map->display_map) {
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        /* set the default mapping policy IFF it wasn't provided */
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping);
        }
        if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping));
        }
        /* ditto for rank and bind policies */
        if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
            ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking);
        }
#if OPAL_HAVE_HWLOC
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            jdata->map->binding = opal_hwloc_binding_policy;
        }
#endif
    }

#if OPAL_HAVE_HWLOC
    /* if we are not going to launch, then we need to set any
     * undefined topologies to match our own so the mapper
     * can operate
     */
    if (orte_do_not_launch) {
        orte_node_t *node;
        hwloc_topology_t t0;
        int i;
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        t0 = node->topology;
        for (i=1; i < orte_node_pool->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                continue;
            }
            if (NULL == node->topology) {
                node->topology = t0;
            }
        }
    }
#endif

    /* cycle thru the available mappers until one agrees to map
     * the job
     */
    did_map = false;
    for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
         item != opal_list_get_end(&orte_rmaps_base.selected_modules);
         item = opal_list_get_next(item)) {
        mod = (orte_rmaps_base_selected_module_t*)item;
        if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
            did_map = true;
            break;
        }
        /* mappers return "next option" if they didn't attempt to
         * map the job. anything else is a true error.
         */
        if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
            ORTE_ERROR_LOG(rc);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }
    /* if we get here without doing the map, or with zero procs in
     * the map, then that's an error
     */
    if (!did_map || 0 == jdata->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    
#if OPAL_HAVE_HWLOC
    /* compute and save bindings */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
#endif
    
    /* if it is a dynamic spawn, save the bookmark on the parent's job too */
    if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
        if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) {
            parent->bookmark = jdata->bookmark;
        }
    }

    /* if we wanted to display the map, now is the time to do it - ignore
     * daemon job
     */
    if (jdata->map->display_map) {
        char *output;
        int i, j;
        orte_node_t *node;
        orte_proc_t *proc;

        if (orte_display_diffable_output) {
            /* intended solely to test mapping methods, this output
             * can become quite long when testing at scale. Rather
             * than enduring all the malloc/free's required to
             * create an arbitrary-length string, custom-generate
             * the output a line at a time here
             */
            /* display just the procs in a diffable format */
            opal_output(orte_clean_output, "<map>");
            fflush(stderr);
            /* loop through nodes */
            for (i=0; i < jdata->map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
                    continue;
                }
                opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? "UNKNOWN" : node->name);
                fflush(stderr);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
#if OPAL_HAVE_HWLOC
                    {
                        char locale[64];

                        /* guard against an unset locale so we never print an
                         * uninitialized buffer */
                        locale[0] = '\0';
                        if (NULL != proc->locale) {
                            hwloc_bitmap_list_snprintf(locale, sizeof(locale), proc->locale->cpuset);
                        }
                        opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s[%s:%u]>",
                                    ORTE_VPID_PRINT(proc->name.vpid),  (long)proc->app_idx,
                                    (unsigned long)proc->local_rank,
                                    (unsigned long)proc->node_rank, locale,
                                    (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap,
                                    opal_hwloc_base_print_level(jdata->map->bind_level), proc->bind_idx);
                    }
#else
                    opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>",
                                ORTE_VPID_PRINT(proc->name.vpid),  (long)proc->app_idx,
                                (unsigned long)proc->local_rank,
                                (unsigned long)proc->node_rank);
#endif
                    fflush(stderr);
                }
                opal_output(orte_clean_output, "\t</host>");
                fflush(stderr);
            }
#if OPAL_HAVE_HWLOC
            {
                opal_hwloc_locality_t locality;
                orte_proc_t *p0;

                /* test locality - for the first node, print the locality of each proc relative to the first one */
                node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
                /* guard against an empty map so we never dereference NULL */
                p0 = (NULL == node) ? NULL
                    : (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
                opal_output(orte_clean_output, "\t<locality>");
                for (j=1; NULL != p0 && j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    locality = opal_hwloc_base_get_relative_locality(node->topology,
                                                                     jdata->map->bind_level,
                                                                     p0->bind_idx,
                                                                     jdata->map->bind_level,
                                                                     proc->bind_idx);
                    opal_output(orte_clean_output, "\t\t<bind_level=%s rank=%s bind_idx=%u rank=%s bind_idx=%u locality=%s>",
                                opal_hwloc_base_print_level(jdata->map->bind_level),
                                ORTE_VPID_PRINT(p0->name.vpid),
                                p0->bind_idx, ORTE_VPID_PRINT(proc->name.vpid),
                                proc->bind_idx, opal_hwloc_base_print_locality(locality));
                }
                opal_output(orte_clean_output, "\t</locality>\n</map>");
                fflush(stderr);
            }
#else
            opal_output(orte_clean_output, "\n</map>");
            fflush(stderr);
#endif
        } else {
            opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "%s\n", output);
                fflush(orte_xml_fp);
            } else {
                opal_output(orte_clean_output, "%s", output);
            }
            free(output);
        }
    }
    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
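/*
 * For reference, the diffable map output generated above has the following
 * shape (values here are purely illustrative, not from a real run; the
 * locale/binding fields appear only when OPAL_HAVE_HWLOC is set, and the
 * locality string format is left as a placeholder):
 *
 * <map>
 *     <host name=nodeA>
 *         <process rank=0 app_idx=0 local_rank=0 node_rank=0 locale=0-3 binding=0-1[CORE:0]>
 *         <process rank=1 app_idx=0 local_rank=1 node_rank=1 locale=4-7 binding=2-3[CORE:1]>
 *     </host>
 *     <locality>
 *         <bind_level=CORE rank=0 bind_idx=0 rank=1 bind_idx=1 locality=...>
 *     </locality>
 * </map>
 */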
/* process incoming messages in order of receipt */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_jobid_t job;
    orte_job_t *jdata, *parent;
    opal_buffer_t *answer;
    orte_vpid_t vpid;
    orte_proc_t *proc;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    int32_t rc=ORTE_SUCCESS, ret;
    orte_app_context_t *app, *child_app;
    orte_process_name_t name;
    pid_t pid;
    bool running;
        
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:receive processing msg",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
        
    switch (command) {
    case ORTE_PLM_LAUNCH_JOB_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:receive job launch command from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));
                
        /* unpack the job object */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
            
        /* record the sender so we know who to respond to */
        jdata->originator.jobid = sender->jobid;
        jdata->originator.vpid = sender->vpid;

        /* get the parent's job object */
        if (NULL == (parent = orte_get_job_data_object(sender->jobid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            goto ANSWER_LAUNCH;
        }
        /* if the prefix was set in the parent's job, we need to transfer
         * that prefix to the child's app_context so any further launch of
         * orteds can find the correct binary. There always has to be at
         * least one app_context in both parent and child, so we don't
         * need to check that here. However, be sure not to overwrite
         * the prefix if the user already provided it!
         */
        app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
        child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
        if (NULL != app->prefix_dir &&
            NULL == child_app->prefix_dir) {
            child_app->prefix_dir = strdup(app->prefix_dir);
        }
                    
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:receive adding hosts",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* process any add-hostfile and add-host options that were provided */
        if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        if (NULL == parent->bookmark) {
            /* find the sender's node in the job map */
            if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
                /* set the bookmark so the child starts from that place - this means
                 * that the first child process could be co-located with the proc
                 * that called comm_spawn, assuming slots remain on that node. Otherwise,
                 * the procs will start on the next available node
                 */
                jdata->bookmark = proc->node;
            }
        } else {
            jdata->bookmark = parent->bookmark;
        }

        /* launch it */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:receive calling spawn",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
        break;
    ANSWER_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:receive - error on launch: %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));

        /* setup the response */
        answer = OBJ_NEW(opal_buffer_t);
        
        /* pack the error code to be returned */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
        }
                
        /* send the response back to the sender */
        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_PLM_PROXY, 0,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;
                
    case ORTE_PLM_UPDATE_PROC_STATE:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:receive update proc state command from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));
        count = 1;
        while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
                    
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:receive got update_proc_state for job %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job)));
                    
            name.jobid = job;
            running = false;
            /* get the job object */
            if (NULL == (jdata = orte_get_job_data_object(job))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                goto CLEANUP;
            }
            count = 1;
            while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID))) {
                if (ORTE_VPID_INVALID == vpid) {
                    /* flag indicates that this job is complete - move on */
                    break;
                }
                name.vpid = vpid;
                /* unpack the pid */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &count, OPAL_PID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                /* unpack the state */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &count, ORTE_PROC_STATE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                if (ORTE_PROC_STATE_RUNNING == state) {
                    running = true;
                }
                /* unpack the exit code */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                        
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));

                /* get the proc data object */
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
                    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                    ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    /* ORTE_TERMINATE only activates the shutdown state
                     * machine - bail out here so we don't dereference
                     * a NULL proc below
                     */
                    goto CLEANUP;
                }
                proc->state = state;
                proc->pid = pid;
                proc->exit_code = exit_code;
                ORTE_ACTIVATE_PROC_STATE(&name, state);
            }
            if (running) {
                jdata->num_daemons_reported++;
                if (orte_report_launch_progress) {
                    if (0 == jdata->num_daemons_reported % 100 ||
                        jdata->num_daemons_reported == orte_process_info.num_procs) {
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
                    }
                }
            }
            /* prepare for next job */
            count = 1;
        }
        if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        break;
                
    case ORTE_PLM_INIT_ROUTES_CMD:
        count=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }
        name.jobid = job;
        /* get the job object */
        if (NULL == (jdata = orte_get_job_data_object(job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            goto CLEANUP;
        }
        count=1;
        while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) {
            if (ORTE_VPID_INVALID == vpid) {
                break;
            }
            name.vpid = vpid;
            ORTE_ACTIVATE_PROC_STATE(&name, ORTE_PROC_STATE_REGISTERED);
            count=1;
        }
        /* pass the remainder of the buffer to the active module's
         * init_routes API
         */
        if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, buffer))) {
            ORTE_ERROR_LOG(rc);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        }
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        break;
    }
        
 CLEANUP:
    /* see if an error occurred - if so, wakeup the HNP so we can exit */
    if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
        jdata = NULL;
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:receive done processing commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
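/*
 * For reference, a minimal sketch of the sending side of the
 * ORTE_PLM_UPDATE_PROC_STATE message handled above. It simply mirrors the
 * unpack sequence in the receive loop; the function name is hypothetical
 * and this is not the actual daemon implementation:
 */
static int pack_proc_state_update(opal_buffer_t *buf, orte_job_t *jdata)
{
    orte_plm_cmd_flag_t cmd = ORTE_PLM_UPDATE_PROC_STATE;
    orte_vpid_t flag = ORTE_VPID_INVALID;
    orte_proc_t *proc;
    int32_t i;
    int rc;

    /* the command flag is unpacked first by orte_plm_base_recv */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &cmd, 1, ORTE_PLM_CMD))) {
        return rc;
    }
    /* then the jobid the update applies to */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &jdata->jobid, 1, ORTE_JOBID))) {
        return rc;
    }
    /* then one (vpid, pid, state, exit_code) tuple per proc */
    for (i = 0; i < jdata->procs->size; i++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
            continue;
        }
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &proc->name.vpid, 1, ORTE_VPID)) ||
            ORTE_SUCCESS != (rc = opal_dss.pack(buf, &proc->pid, 1, OPAL_PID)) ||
            ORTE_SUCCESS != (rc = opal_dss.pack(buf, &proc->state, 1, ORTE_PROC_STATE)) ||
            ORTE_SUCCESS != (rc = opal_dss.pack(buf, &proc->exit_code, 1, ORTE_EXIT_CODE))) {
            return rc;
        }
    }
    /* terminate the per-proc list with the VPID_INVALID sentinel
     * the receiver uses to detect end-of-job
     */
    return opal_dss.pack(buf, &flag, 1, ORTE_VPID);
}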
static void recv_data(int fd, short args, void *cbdata)
{
    bool found;
    int i, rc;
    orte_node_t *nd, *nd2;
    opal_list_t nds, ndtmp;
    opal_list_item_t *item, *itm;
    char recv_msg[8192];
    int nbytes, idx, sjob;
    char **alloc, *nodelist, *tpn;
    local_jobtracker_t *ptr, *jtrk;
    local_apptracker_t *aptrk;
    orte_app_context_t *app;
    orte_jobid_t jobid;
    orte_job_t *jdata;

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation - data recvd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* read the data from the socket and put it in the
     * nodes field of op
     */
    memset(recv_msg, 0, sizeof(recv_msg));
    nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation msg: %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg);

    /* check if we got something */
    if (0 >= nbytes || 0 == strlen(recv_msg) || NULL != strstr(recv_msg, "failure")) {
        /* show an error here - basically, a "nothing was available"
         * message
         */
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true,
                       (0 == strlen(recv_msg)) ? "NO MSG" : recv_msg);
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        return;
    }

    /* break the message into its component parts, separated by colons */
    alloc = opal_argv_split(recv_msg, ':');

    /* the first section contains the ORTE jobid for this allocation */
    if (NULL == (tpn = strchr(alloc[0], '=')) ||
        ORTE_SUCCESS != orte_util_convert_string_to_jobid(&jobid, tpn+1) ||
        NULL == (jdata = orte_get_job_data_object(jobid))) {
        /* malformed message or unknown job - cannot proceed */
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, recv_msg);
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        opal_argv_free(alloc);
        return;
    }
    jtrk = NULL;
    /* find the associated tracking object */
    for (item = opal_list_get_first(&jobs);
            item != opal_list_get_end(&jobs);
            item = opal_list_get_next(item)) {
        ptr = (local_jobtracker_t*)item;
        if (ptr->jobid == jobid) {
            jtrk = ptr;
            break;
        }
    }
    if (NULL == jtrk) {
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER");
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        opal_argv_free(alloc);
        return;
    }

    /* stop the timeout event */
    opal_event_del(&jtrk->timeout_ev);

    /* cycle across all the remaining parts - each is the allocation for
     * an app in this job
     */
    OBJ_CONSTRUCT(&nds, opal_list_t);
    OBJ_CONSTRUCT(&ndtmp, opal_list_t);
    idx = -1;
    sjob = -1;
    nodelist = NULL;
    for (i=1; NULL != alloc[i]; i++) {
        if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            return;
        }
        if (idx < 0 || NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            return;
        }
        /* track the Slurm jobid */
        if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) {
            aptrk = OBJ_NEW(local_apptracker_t);
            opal_pointer_array_set_item(&jtrk->apps, idx, aptrk);
        }
        aptrk->sjob = sjob;
        /* release the current dash_host as that contained the *desired* allocation */
        opal_argv_free(app->dash_host);
        app->dash_host = NULL;
        /* since the nodelist/tpn may contain regular expressions, parse them */
        if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) {
            ORTE_ERROR_LOG(rc);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            return;
        }
        /* transfer the discovered nodes to our node list, and construct
         * the new dash_host entry to match what was allocated
         */
        while (NULL != (item = opal_list_remove_first(&ndtmp))) {
            nd = (orte_node_t*)item;
            opal_argv_append_nosize(&app->dash_host, nd->name);
            /* check for duplicates */
            found = false;
            for (itm = opal_list_get_first(&nds);
                    itm != opal_list_get_end(&nds);
                    itm = opal_list_get_next(itm)) {
                nd2 = (orte_node_t*)itm;
                if (0 == strcmp(nd->name, nd2->name)) {
                    found = true;
                    nd2->slots += nd->slots;
                    OBJ_RELEASE(item);
                    break;
                }
            }
            if (!found) {
                /* append the new node to our list */
                opal_list_append(&nds, item);
            }
        }
        /* cleanup */
        free(nodelist);
        free(tpn);
    }
    /* cleanup */
    opal_argv_free(alloc);
    OBJ_DESTRUCT(&ndtmp);

    if (opal_list_is_empty(&nds)) {
        /* if we get here, then we were able to contact slurm,
         * which means we are in an actively managed cluster.
         * However, slurm indicated that nothing is currently
         * available that meets our requirements. This is a fatal
         * situation - we do NOT have the option of running on
         * user-specified hosts as the cluster is managed.
         */
        OBJ_DESTRUCT(&nds);
        orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    /* store the found nodes */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&nds);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }
    OBJ_DESTRUCT(&nds);

    /* default to no-oversubscribe-allowed for managed systems */
    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
    }
    /* flag that the allocation is managed */
    orte_managed_allocation = true;
    /* move the job along */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);
    /* all done */
    return;
}
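/*
 * For reference, the allocation message parsed above is colon-delimited:
 * the first section carries the ORTE jobid as "<key>=<jobid-string>", and
 * each remaining section describes one app's allocation, which
 * parse_alloc_msg() (not shown here) decomposes into an app index, a Slurm
 * job id, a nodelist, and a tasks-per-node value. The nodelist/tpn fields
 * may contain Slurm-style regular expressions, which
 * orte_ras_slurm_discover() expands into individual node entries.
 */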
/*
 * This function is responsible for:
 * - constructing the remote absolute path for the specified file/dir
 * - verifying that the file/dir exists
 * - determining whether the path refers to a file or a directory, or
 *   reporting it as unknown if it cannot be found
 */
static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender,
                                                   opal_buffer_t* buffer)
{
    opal_buffer_t answer;
    orte_std_cntr_t count;
    char *filename = NULL;
    char *tmp_name = NULL;
    char cwd[OPAL_PATH_MAX];
    int file_type = ORTE_FILEM_TYPE_UNKNOWN;
    struct stat file_status;
    int rc;

    /*
     * Unpack the data
     */
    OBJ_CONSTRUCT(&answer, opal_buffer_t);

    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &filename, &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto CLEANUP;
    }

    /*
     * Determine the absolute path of the file
     */
    if ('/' != filename[0] && NULL != getcwd(cwd, sizeof(cwd))) {
        /* not an absolute path - prepend the current working directory */
        asprintf(&tmp_name, "%s/%s", cwd, filename);
    } else {
        /* already absolute, or the cwd is unavailable - use the name as given */
        tmp_name = strdup(filename);
    }

    opal_output_verbose(10, orte_filem_base_output,
                        "filem:base: process_get_remote_path_cmd: %s -> %s: Filename Requested (%s) translated to (%s)",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(sender),
                        filename, tmp_name);

    /*
     * Determine if the file/dir exists at that absolute path
     * Determine if the file/dir is a file or a directory
     */
    if (0 != (rc = stat(tmp_name, &file_status))) {
        /* could not stat the path - type remains unknown */
        file_type = ORTE_FILEM_TYPE_UNKNOWN;
    } else if (S_ISDIR(file_status.st_mode)) {
        file_type = ORTE_FILEM_TYPE_DIR;
    } else if (S_ISREG(file_status.st_mode)) {
        file_type = ORTE_FILEM_TYPE_FILE;
    }

    /*
     * Pack up the response
     * Send back the reference type
     * - ORTE_FILEM_TYPE_FILE    = File
     * - ORTE_FILEM_TYPE_DIR     = Directory
     * - ORTE_FILEM_TYPE_UNKNOWN = Could not be determined, or does not exist
     */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto CLEANUP;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto CLEANUP;
    }

    if (0 > (rc = orte_rml.send_buffer(sender, &answer, ORTE_RML_TAG_FILEM_BASE_RESP, 0))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

 CLEANUP:
    if (NULL != filename) {
        free(filename);
        filename = NULL;
    }
    if (NULL != tmp_name) {
        free(tmp_name);
        tmp_name = NULL;
    }

    OBJ_DESTRUCT(&answer);
}
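/*
 * A requester-side sketch for the answer packed above (illustrative only;
 * the helper name is hypothetical and not part of the filem API). The
 * reply carries the resolved absolute path followed by the type code:
 */
static int unpack_remote_path_answer(opal_buffer_t *answer,
                                     char **remote_path, int *file_type)
{
    orte_std_cntr_t count = 1;
    int rc;

    /* first the translated absolute path */
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(answer, remote_path, &count, OPAL_STRING))) {
        return rc;
    }
    /* then ORTE_FILEM_TYPE_FILE, ORTE_FILEM_TYPE_DIR, or ORTE_FILEM_TYPE_UNKNOWN */
    count = 1;
    return opal_dss.unpack(answer, file_type, &count, OPAL_INT);
}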