Beispiel #1
0
/*
 * Restart a job: reset the job object, map it again, pack the launch
 * data and send it to the daemons on the system channel.
 *
 * Has the (fd, args, cbdata) callback signature; fd and args are not
 * used by this function.
 *
 * @param fd      unused
 * @param args    unused
 * @param cbdata  orte_errmgr_caddy_t whose jdata field is the job to
 *                restart; the caddy is released on every exit path
 */
static void launch_restart(int fd, short args, void *cbdata)
{
    orte_errmgr_caddy_t *cd = (orte_errmgr_caddy_t*)cbdata;
    int rc;
    opal_buffer_t *bfr;
    uint16_t jfam;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s RESTARTING JOB %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(cd->jdata->jobid)));

    /* reset the job */
    orte_plm_base_reset_job(cd->jdata);

    /* the resilient mapper will automatically avoid restarting the
     * proc on its former node
     */

    /* map the job again */
    if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(cd->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    bfr = OBJ_NEW(opal_buffer_t);
    /* indicate the target DVM - a command buffer without the job
     * family would be malformed, so do not send it if the pack fails
     */
    jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(bfr, &jfam, 1, OPAL_UINT16))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(bfr);
        goto cleanup;
    }

    /* get the launch data */
    if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(bfr, cd->jdata->jobid))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(bfr);
        goto cleanup;
    }
    /* send it to the daemons */
    if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL,
                                                 NULL, ORCM_PNP_TAG_COMMAND,
                                                 NULL, 0, bfr, cbfunc, NULL))) {
        /* NOTE(review): bfr is not released here - presumably cbfunc
         * owns the buffer once output_nb is called; confirm whether it
         * is also invoked when output_nb reports failure
         */
        ORTE_ERROR_LOG(rc);
    }

 cleanup:
    OBJ_RELEASE(cd);
}
Beispiel #2
0
static void errmgr_autor_recover_processes(int fd, short event, void *cbdata)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_item_t *item = NULL;
    errmgr_autor_wp_item_t *wp_item = NULL;
    orte_std_cntr_t i_proc;
    orte_proc_t *proc = NULL;
    orte_sstore_base_global_snapshot_info_t *snapshot = NULL;
    char * tmp_str = NULL;

    autor_mask_faults = true;
    ERRMGR_AUTOR_CLEAR_TIMERS();
    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_START);

    /*
     * Display the processes that are to be recovered
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor):recover() "
                         "------- Display known failed processes in the job %s -------",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(current_global_jobdata->jobid)));

    opal_list_sort(procs_pending_recovery, autor_procs_sort_compare_fn);
    display_procs();

    /*
     * Find the latest checkpoint
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor):recover() "
                         "------- Find the latest checkpoint for the job %s -------",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(current_global_jobdata->jobid)));

    snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t);
    if( ORTE_SUCCESS != (ret = orte_sstore.request_global_snapshot_data(&orte_sstore_handle_last_stable, snapshot)) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_SETUP);

    /*
     * Safely terminate the entire job
     */
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Safely terminate the job %s -------",
                        ORTE_JOBID_PRINT(current_global_jobdata->jobid));

    for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) {
        proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc);
        if( NULL == proc ) {
            continue;
        }
        if( proc->state < ORTE_PROC_STATE_UNTERMINATED ) {
            proc->state = ORTE_PROC_STATE_MIGRATING;
        }
        if( current_global_jobdata->stdin_target == proc->name.vpid ) {
            orte_iof.close(&(proc->name), ORTE_IOF_STDIN);
        }
    }

    orte_plm.terminate_procs(current_global_jobdata->procs);

    /*
     * Wait for the job to terminate all processes
     */
    while(!check_if_terminated(current_global_jobdata->procs) ) {
        opal_progress();
    }

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM);

    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Done waiting for termination of job %s -------",
                        ORTE_JOBID_PRINT(current_global_jobdata->jobid));
    current_global_jobdata->num_terminated = current_global_jobdata->num_procs;
    orte_plm_base_reset_job(current_global_jobdata);

    /*
     * Construct the app contexts to restart
     */
    OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                         "%s errmgr:hnp(autor):recover() "
                         "------- Rebuild job %s app context -------",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(current_global_jobdata->jobid)));
    for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) {
        proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc);
        if( NULL == proc ) {
            continue;
        }

        if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata,
                                                                                       proc,
                                                                                       &(snapshot->local_snapshots))) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle,
                             "\tAdjusted: \"%s\" [0x%d] [%s]\n",
                             ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name));
    }

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESETUP);

    /*
     * Spawn the restarted job
     */
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Respawning the job %s -------",
                        ORTE_JOBID_PRINT(current_global_jobdata->jobid));
    orte_snapc_base_has_recovered = false;
    autor_mask_faults = false; /* Failures pass this point are worth noting */
    orte_plm.spawn(current_global_jobdata);

    /*
     * Wait for all the processes to restart
     */
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Waiting for restart -------");
    while(!check_if_restarted(current_global_jobdata->procs) ) {
        opal_progress();
    }

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESTART);

    /*
     * All done
     */
    while( !orte_snapc_base_has_recovered ) {
        opal_progress();
    }

    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp(autor):recover() "
                        "------- Finished recovering job %s -------",
                        ORTE_JOBID_PRINT(current_global_jobdata->jobid));

    opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true);

    ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH);

 cleanup:
    while(NULL != (item = opal_list_remove_first(procs_pending_recovery))) {
        wp_item = (errmgr_autor_wp_item_t*)item;
        OBJ_RELEASE(wp_item);
    }

    if( NULL != tmp_str ) {
        free(tmp_str);
        tmp_str = NULL;
    }

    ERRMGR_AUTOR_DISPLAY_ALL_TIMERS();

    autor_timer_active = false;
    autor_mask_faults  = false;

    return;
}
Beispiel #3
0
/*
 * Error-manager state update entry point.
 *
 * The whole function runs with the ctl thread lock held; every exit
 * path releases it.
 *
 * Two modes, selected by proc:
 *
 *  - proc == NULL (job-level update): only ORTE_JOB_STATE_PROCS_MIGRATING
 *    is accepted. Every known job in that state is reset, mapped again,
 *    and its launch data is sent to the daemons.
 *
 *  - proc != NULL (proc-level update):
 *      ORTE_PROC_STATE_COMM_FAILED       ignored
 *      ORTE_PROC_STATE_HEARTBEAT_FAILED  the daemon is marked dead, the
 *          names of the procs on its node are sent on the error channel,
 *          and either the procs are recovered (if another usable daemon
 *          is alive) or removed from their jobs
 *      ORTE_PROC_STATE_RESTARTED         the procs that were on the
 *          daemon's node are reported on the error channel and removed
 *          from their jobs; they are not restarted here
 *      anything else                     reported as an error
 *
 * @param job        jobid; only used in the verbose trace
 * @param jobstate   job state; consulted only when proc is NULL
 * @param proc       name of the affected proc/daemon, or NULL for a
 *                   job-level update
 * @param state      proc state being reported
 * @param pid        pid; only used in the verbose trace
 * @param exit_code  exit code recorded on a daemon whose heartbeat failed
 *
 * @return ORTE_SUCCESS when the update was handled or ignored,
 *         ORTE_ERR_NOT_FOUND when a daemon, triplet, source or node
 *         lookup fails, the pack error code when packing a proc name
 *         fails, ORTE_ERROR for an unexpected job or proc state.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    int rc=ORTE_SUCCESS, i;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_proc_t *pptr, *daemon;
    opal_buffer_t *notify;
    orcm_triplet_t *trp;
    orcm_source_t *src;
    bool procs_recovered;
    orte_job_t *jdt;
    uint16_t jfam;
    bool send_msg;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:update_state for job %s proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* protect against threads */
    ORTE_ACQUIRE_THREAD(&ctl);

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    /***   UPDATE COMMAND FOR A JOB   ***/
    if (NULL == proc) {
        /* should only get this if a daemon restarted and we need
         * to check for procs waiting to migrate
         */
        if (ORTE_JOB_STATE_PROCS_MIGRATING != jobstate) {
            /* we should never get this situation */
            opal_output(0, "%s UNKNOWN JOB ERROR %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_job_state_to_str(jobstate));
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERROR;
        }
        /* cycle thru all known jobs looking for those with procs
         * awaiting resources to migrate
         */
        for (i=0; i < orte_job_data->size; i++) {
            if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
                continue;
            }
            if (ORTE_JOB_STATE_PROCS_MIGRATING != jdt->state) {
                continue;
            }
            /* reset the job */
            orte_plm_base_reset_job(jdt);

            /* map the job again */
            if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdt))) {
                ORTE_ERROR_LOG(rc);
                continue;
            }
            /* launch any procs that could be mapped - note that not
             * all procs that were waiting for migration may have
             * been successfully mapped, so this could in fact
             * result in no action by the daemons
             */
            notify = OBJ_NEW(opal_buffer_t);
            /* indicate the target DVM - without the job family the
             * command would be malformed, so skip this job if the
             * pack fails
             */
            jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &jfam, 1, OPAL_UINT16))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(notify);
                continue;
            }

            /* get the launch data */
            if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(notify, jdt->jobid))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                /* NOTE(review): this failure stops the scan of the
                 * remaining jobs yet reports success to the caller -
                 * confirm that is intended
                 */
                return ORTE_SUCCESS;
            }
            /* send it to the daemons */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL,
                                                         NULL, ORCM_PNP_TAG_COMMAND,
                                                         NULL, 0, notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }


    /**** DEAL WITH INDIVIDUAL PROCS ****/

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:sched got state %s for proc %s pid %d exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid, exit_code));

    /* if this was a failed comm or heartbeat */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* ignore this */
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        /* get the proc object for this daemon */
        if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        /* ensure that the heartbeat system knows to ignore this proc
         * from this point forward
         */
        daemon->beat = 0;
        /* if we have already heard about this proc, ignore repeats */
        if (ORTE_PROC_STATE_HEARTBEAT_FAILED == daemon->state) {
            /* already heard */
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_SUCCESS;
        }
#if 0
        /* delete the route */
        orte_routed.delete_route(proc);
        /* purge the oob */
        orte_rml.purge(proc);
#endif
        /* get the triplet/source and mark this source as "dead" -
         * presumably both lookups return with the object's ctl lock
         * held, which is why each is released below (confirm against
         * the orcm triplet API)
         */
        if (NULL == (trp = orcm_get_triplet_stringid("orcmd:0.1:alpha"))) {
            opal_output(0, "%s CANNOT FIND DAEMON TRIPLET",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        if (NULL == (src = orcm_get_source(trp, proc, false))) {
            opal_output(0, "%s DAEMON %s IS UNKNOWN SOURCE",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));
            ORTE_RELEASE_THREAD(&trp->ctl);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        src->alive = false;
        ORTE_RELEASE_THREAD(&src->ctl);
        ORTE_RELEASE_THREAD(&trp->ctl);

        /* notify all apps immediately */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        notify = OBJ_NEW(opal_buffer_t);
        send_msg = false;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                /* the buffer was never handed to output_nb, so it is
                 * still ours to release
                 */
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return rc;
            }
            /* reset the proc stats */
            OBJ_DESTRUCT(&pptr->stats);
            OBJ_CONSTRUCT(&pptr->stats, opal_pstats_t);
            /* since we added something, need to send msg */
            send_msg = true;
        }
        if (send_msg) {
            /* send it to all apps */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_ERRMGR, NULL, 0,
                                                         notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            OBJ_RELEASE(notify);
        }
        /* reset the node stats */
        OBJ_DESTRUCT(&node->stats);
        OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
        /* record that the daemon died */
        daemon->state = state;
        daemon->exit_code = exit_code;
        daemon->pid = 0;
        /* reset the daemon stats */
        OBJ_DESTRUCT(&daemon->stats);
        OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t);
        /* from here on, work with the node recorded on the daemon
         * object rather than the one looked up in the node pool
         */
        node = daemon->node;
        if (NULL == node) {
            opal_output(0, "%s Detected failure of daemon %s on unknown node",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));
            /* can't do anything further */
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_SUCCESS;
        } else {
            opal_output(0, "%s Detected failure of daemon %s on node %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        (NULL == node->name) ? "UNKNOWN" : node->name);
        }
        /* see if any usable daemons are left alive
         * NOTE(review): the scan starts at index 2 - presumably the
         * first two daemon slots are reserved and not usable for
         * hosting procs; confirm
         */
        procs_recovered = false;
        for (i=2; i < daemon_job->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, i))) {
                continue;
            }
            if (ORTE_PROC_STATE_UNTERMINATED < pptr->state) {
                continue;
            }
            /* at least one alive! recover procs from the failed one */
            recover_procs(proc);
            procs_recovered = true;
            break;
        }
        if (!procs_recovered) {
            daemon->node = NULL;
            node->state = ORTE_NODE_STATE_DOWN;
            node->daemon = NULL;
            /* mark all procs on this node as having terminated */
            for (i=0; i < node->procs->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                /* get the job data object for this process */
                if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) {
                    /* major problem */
                    opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&pptr->name), i,
                                orte_proc_state_to_str(pptr->state));
                    continue;
                }
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                     "%s REMOVING PROC %s FROM NODE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pptr->name), node->name));
                app->num_procs--;
                /* drop the job array's entry for this proc */
                opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL);
                OBJ_RELEASE(pptr);
                /* clean it off the node */
                opal_pointer_array_set_item(node->procs, i, NULL);
                node->num_procs--;
                /* maintain acctg - second release, one per array the
                 * proc was removed from
                 */
                OBJ_RELEASE(pptr);
                /* see if job is empty */
                jdt->num_terminated++;
                if (jdt->num_procs <= jdt->num_terminated) {
                    OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                         "%s REMOVING JOB %s FROM ACTIVE ARRAY",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_JOBID_PRINT(jdt->jobid)));
                    opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL);
                    OBJ_RELEASE(jdt);
                }
            }
        }
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_RESTARTED == state) {
        OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                             "%s RESTART OF DAEMON %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* get the proc object for this daemon */
        if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        /* if apps were on that node, notify all apps immediately that
         * those procs have failed
         */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        notify = OBJ_NEW(opal_buffer_t);
        send_msg = false;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                /* the buffer was never handed to output_nb, so it is
                 * still ours to release
                 */
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return rc;
            }
            /* since we added something, we need to send msg */
            send_msg = true;
            /* remove the proc from the app so that it will get
             * restarted when we re-activate the config
             */
            if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) {
                continue;
            }
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                 "%s REMOVING PROC %s FROM NODE %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pptr->name), node->name));
            app->num_procs--;
            /* drop the job array's entry for this proc */
            opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL);
            OBJ_RELEASE(pptr);
            /* clean it off the node */
            opal_pointer_array_set_item(node->procs, i, NULL);
            node->num_procs--;
            /* maintain acctg - second release, one per array the proc
             * was removed from
             */
            OBJ_RELEASE(pptr);
            /* see if job is empty */
            jdt->num_terminated++;
            if (jdt->num_procs <= jdt->num_terminated) {
                OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                     "%s REMOVING JOB %s FROM ACTIVE ARRAY",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(jdt->jobid)));
                opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL);
                OBJ_RELEASE(jdt);
            }
        }
        if (send_msg) {
            /* send it to all apps */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_ERRMGR, NULL, 0,
                                                         notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            OBJ_RELEASE(notify);
        }
        /* reset the node stats */
        OBJ_DESTRUCT(&node->stats);
        OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
        /* reset the daemon stats */
        OBJ_DESTRUCT(&daemon->stats);
        OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t);
        /* don't restart procs - we'll do that later after
         * we allow time for multiple daemons to restart
         */
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    /* to arrive here is an error */
    opal_output(0, "%s GOT UNRECOGNIZED STATE %s FOR PROC %s",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                orte_proc_state_to_str(state),
                ORTE_NAME_PRINT(proc));
    /* the lock taken at the top must be dropped on this path as well */
    ORTE_RELEASE_THREAD(&ctl);
    return ORTE_ERROR;

}