Example #1
0
/*
 * Append to 'coll' one orte_namelist_t entry per process this process
 * sends to directly.
 *
 * Only the HNP produces entries: application processes/tools are not
 * allowed to route, and daemons do not route in this component. For the
 * HNP, every entry of its own job's proc array from index 1 upward that
 * is in a state <= ORTE_PROC_STATE_UNTERMINATED and has a non-NULL
 * rml_uri is added.
 *
 * @param coll  list that receives the newly allocated orte_namelist_t items
 */
static void get_routing_list(opal_list_t *coll)
{
    orte_namelist_t *nm;
    int32_t i;
    orte_job_t *jdata;
    orte_proc_t *proc;
    
    /* if I am anything other than a daemon or the HNP, this
     * is a meaningless command as I am not allowed to route.
     * Both tests must fail for that to be true, hence "&&": with "||"
     * any process that is not simultaneously a daemon and the HNP
     * would return here.
     */
    if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
        return;
    }
    
    /* daemons don't route */
    if (ORTE_PROC_IS_DAEMON) {
        return;
    }
    /* HNP sends direct to each daemon */
    if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }
    /* start at index 1: index 0 is skipped (presumably the HNP's own
     * entry - TODO confirm)
     */
    for (i=1; i < jdata->procs->size; i++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
            continue;
        }
        if( proc->state <= ORTE_PROC_STATE_UNTERMINATED &&
            NULL != proc->rml_uri ) {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s get_routing_tree: Adding process %s state %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&(proc->name)),
                                 orte_proc_state_to_str(proc->state)));

            nm = OBJ_NEW(orte_namelist_t);
            nm->name.jobid = proc->name.jobid;
            nm->name.vpid = proc->name.vpid;
            opal_list_append(coll, &nm->super);
        } else {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s get_routing_tree: Skipped process %15s state %s (non functional daemon)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&(proc->name)),
                                 orte_proc_state_to_str(proc->state)));
        }
    }
}
Example #2
0
/*
 * Register a callback and priority for a process state.
 *
 * The state is appended to orte_proc_states unless an entry for the
 * same state is already present.
 *
 * @param state     process state being registered
 * @param cbfunc    callback stored with the state (may be NULL)
 * @param priority  priority stored with the state
 * @return ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM if the state is already defined
 */
int orte_state_base_add_proc_state(orte_proc_state_t state,
                                   orte_state_cbfunc_t cbfunc,
                                   int priority)
{
    opal_list_item_t *cursor;
    orte_state_t *entry;

    /* refuse to define the same state twice */
    cursor = opal_list_get_first(&orte_proc_states);
    while (cursor != opal_list_get_end(&orte_proc_states)) {
        entry = (orte_state_t*)cursor;
        if (state == entry->proc_state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "DUPLICATE STATE DEFINED: %s",
                                 orte_proc_state_to_str(state)));
            return ORTE_ERR_BAD_PARAM;
        }
        cursor = opal_list_get_next(cursor);
    }

    /* not seen before - create the entry and add it at the tail */
    entry = OBJ_NEW(orte_state_t);
    entry->proc_state = state;
    entry->cbfunc = cbfunc;
    entry->priority = priority;
    opal_list_append(&orte_proc_states, &(entry->super));

    return ORTE_SUCCESS;
}
Example #3
0
/*
 * Print the contents of one HNP's info in a colon-separated,
 * machine-parseable form on stdout.
 *
 * Output consists of one "mpirun:" summary line, optional "node:" lines
 * (only when orte_ps_globals.nodes is set), then for every job except
 * index 0 a "jobid:" line followed by one "process:" line per non-NULL
 * proc entry.
 *
 * @param hnpinfo  collected information for one mpirun/HNP
 * @return ORTE_SUCCESS always
 */
static int parseable_print(orte_ps_mpirun_info_t *hnpinfo)
{
    orte_job_t *job;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    char *appname;
    char *nodename;
    int jidx, k;

    /* the daemon job is not counted in the number of jobs reported */
    printf("mpirun:%lu:num nodes:%d:num jobs:%d\n",
           (unsigned long)hnpinfo->hnp->pid, hnpinfo->num_nodes, hnpinfo->num_jobs-1);

    if (orte_ps_globals.nodes) {
        for (k=0; k < hnpinfo->num_nodes; k++) {
            node = hnpinfo->nodes[k];
            printf("node:%s:state:%s:slots:%d:in use:%d\n",
                   node->name, pretty_node_state(node->state),
                   node->slots, node->slots_inuse);
        }
    }

    /* job index 0 is the daemon job, so begin at 1 */
    for (jidx=1; jidx < hnpinfo->num_jobs; jidx++) {
        job = hnpinfo->jobs[jidx];
        printf("jobid:%d:state:%s:slots:%d:num procs:%d\n",
               ORTE_LOCAL_JOBID(job->jobid),
               orte_job_state_to_str(job->state),
               job->total_slots_alloc,
               job->num_procs);

        /* one line per process that is present in the array */
        for (k=0; k < job->procs->size; k++) {
            proc = (orte_proc_t*)opal_pointer_array_get_item(job->procs, k);
            if (NULL == proc) {
                continue;
            }

            /* appname is heap-allocated on both paths so it can be freed below */
            app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
            appname = (NULL == app) ? strdup("NULL") : opal_basename(app->app);

            /* nodename stays NULL if the attribute is not set */
            nodename = NULL;
            orte_get_attribute(&proc->attributes, ORTE_PROC_NODENAME, (void**)&nodename, OPAL_STRING);

            printf("process:%s:rank:%s:pid:%lu:node:%s:state:%s\n",
                   appname, ORTE_VPID_PRINT(proc->name.vpid),
                   (unsigned long)proc->pid,
                   (NULL == nodename) ? "unknown" : nodename,
                   orte_proc_state_to_str(proc->state));

            free(appname);
            free(nodename);   /* free(NULL) is a no-op */
        }
    }

    return ORTE_SUCCESS;
}
Example #4
0
/*
 * Dump the registered process states to opal_output stream 0.
 *
 * Prints a header line, then one line per entry of orte_proc_states
 * showing the state name and whether a callback is attached.
 */
void orte_state_base_print_proc_state_machine(void)
{
    opal_list_item_t *cursor;
    orte_state_t *entry;
    const char *cb_label;

    opal_output(0, "ORTE_PROC_STATE_MACHINE:");

    cursor = opal_list_get_first(&orte_proc_states);
    while (cursor != opal_list_get_end(&orte_proc_states)) {
        entry = (orte_state_t*)cursor;
        if (NULL == entry->cbfunc) {
            cb_label = "NULL";
        } else {
            cb_label = "DEFINED";
        }
        opal_output(0, "\tState: %s cbfunc: %s",
                    orte_proc_state_to_str(entry->proc_state),
                    cb_label);
        cursor = opal_list_get_next(cursor);
    }
}
Example #5
0
/*
 * Error-manager state update handler for an application process.
 *
 * Only a communication failure is acted upon: if the failed connection
 * is not our own and the routed module reports the route as lost with an
 * error, the caller is told the situation is unrecoverable. Every other
 * state, and any update arriving while ORTE is finalizing, is accepted
 * silently.
 *
 * @param job        job the update refers to (used for logging only)
 * @param jobstate   reported job state (used for logging only)
 * @param proc       process the update refers to
 * @param state      reported process state
 * @param pid        unused
 * @param exit_code  reported exit code (used for logging only)
 * @return ORTE_SUCCESS, or ORTE_ERR_UNRECOVERABLE when route_lost fails
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_app: job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), exit_code));

    /* nothing to do while orte is shutting down */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    /* only communication failures need any handling here */
    if (ORTE_PROC_STATE_COMM_FAILED != state) {
        return ORTE_SUCCESS;
    }

    /* a failure on our own connection is ignored */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
        return ORTE_SUCCESS;
    }

    /* see if this was a lifeline */
    return (ORTE_SUCCESS == orte_routed.route_lost(proc)) ? ORTE_SUCCESS
                                                          : ORTE_ERR_UNRECOVERABLE;
}
/*
 * Event callback invoked for a process error.
 *
 * Unless ORTE is already finalizing, any reported process error causes
 * an abort with the default error exit code. The caddy passed in
 * 'cbdata' is released before returning.
 *
 * @param fd      unused
 * @param args    unused
 * @param cbdata  orte_state_caddy_t describing the process and its state
 */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_tool: proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&caddy->name),
                         orte_proc_state_to_str(caddy->proc_state)));

    /* while orte is shutting down we let it proceed; otherwise
     * every error requires an abort
     */
    if (!orte_finalizing) {
        orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
    }

    OBJ_RELEASE(caddy);
}
Example #7
0
/*
 * HNP "autor" error-manager state update handler.
 *
 * Updates are ignored while termination is in progress, when they refer
 * to this process itself, and when they come from a job with no apps
 * whose jobid differs from ours (treated as an external tool). Otherwise:
 *   - a job state of ORTE_JOB_STATE_RESTART triggers fault processing
 *     using the first non-NULL proc entry of the job,
 *   - a proc state of ABORTED_BY_SIG or COMM_FAILED triggers fault
 *     processing for 'proc_name',
 *   - a proc state of KILLED_BY_CMD is forwarded to
 *     orte_errmgr_hnp_update_proc when autor_mask_faults is set.
 *
 * @param job        job the update refers to (used when proc_name is NULL)
 * @param jobstate   reported job state
 * @param proc_name  process the update refers to, may be NULL
 * @param state      reported process state
 * @param pid        unused
 * @param exit_code  reported exit code
 * @return ORTE_SUCCESS; ORTE_ERROR if the job cannot be found;
 *         ORTE_ERR_NOT_FOUND if a restart is requested for a job with no
 *         procs; otherwise the error returned by fault processing
 */
int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job,
                                                orte_job_state_t jobstate,
                                                orte_process_name_t *proc_name,
                                                orte_proc_state_t state,
                                                pid_t pid,
                                                orte_exit_code_t exit_code)
{
    orte_proc_t *loc_proc = NULL;
    orte_job_t *jdata = NULL;
    int ret = ORTE_SUCCESS, exit_status = ORTE_SUCCESS;
    int32_t i;

    /*
     * if orte is trying to shutdown, just let it
     */
    if( mca_errmgr_hnp_component.term_in_progress ) {
        return ORTE_SUCCESS;
    }

    if( NULL != proc_name &&
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:hnp(autor): Update reported on self (%s), state %s. Skip...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc_name),
                             orte_proc_state_to_str(state) ));
        return ORTE_SUCCESS;
    }

    /*
     * Get the job data object for this process
     */
    if( NULL != proc_name ) { /* Get job from proc's jobid */
        jdata = orte_get_job_data_object(proc_name->jobid);
    } else { /* Get from the general job */
        jdata = orte_get_job_data_object(job);
    }
    if( NULL == jdata ) {
        opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_JOBID_PRINT(job),
                    (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) );
        ret = ORTE_ERROR;
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * If this is a tool, ignore
     */
    if( jdata->num_apps == 0 &&
        OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:hnp(autor): An external tool disconnected. Ignore...",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:hnp(autor): job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name),
                         orte_proc_state_to_str(state), exit_code));

    if( ORTE_JOB_STATE_RESTART == jobstate ) {
        /* pick the first proc entry that is actually present */
        for(i = 0; i < jdata->procs->size; ++i) {
            if (NULL != (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
                break;
            }
        }

        /* the array may be empty or hold only NULL slots; without a
         * proc there is no name to hand to the fault handler
         */
        if( NULL == loc_proc ) {
            ret = ORTE_ERR_NOT_FOUND;
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
    }
    else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state ||
             ORTE_PROC_STATE_COMM_FAILED    == state ) {
        if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, proc_name, state)) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
    }
    else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) {
        if( autor_mask_faults ) {
            mca_errmgr_hnp_component.ignore_current_update = true;
            orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code);
        }
    }

 cleanup:
    /* exit_status mirrors ret on every path that reaches this label */
    return exit_status;
}
Example #8
0
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    opal_list_item_t *item, *next;
    orte_odls_job_t *jobdat = NULL;
    orte_odls_child_t *child;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    orte_vpid_t null=ORTE_VPID_INVALID;
    orte_ns_cmp_bitmask_t mask;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                "errmgr:default_orted:update_state() %s) "
                "------- %s state updated for process %s",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                ((NULL == proc) ? "App. Process" : 
                 (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
                (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
        ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        return ORTE_SUCCESS;
    }

    /***   UPDATE COMMAND FOR A JOB   ***/
    if (NULL == proc) {
        /* this is an update for an entire job */
        if (ORTE_JOBID_INVALID == job) {
            /* whatever happened, we don't know what job
             * it happened to
             */
            orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error",
                           true, orte_job_state_to_str(jobstate));
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* pack the "invalid" jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
            return rc;
        }

        /* lookup the local jobdat for this job */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;

            /* is this the specified job? */
            if (jobdat->jobid == job) {
                break;
            }
        }
        if (NULL == jobdat) {
            return ORTE_ERR_NOT_FOUND;
        }

        switch (jobstate) {
        case ORTE_JOB_STATE_FAILED_TO_START:
            failed_start(jobdat, exit_code);
            break;
        case ORTE_JOB_STATE_RUNNING:
            /* update all local child states */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
            break;
        case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
            /* update all procs in job */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            /* order all local procs for this job to be killed */
            killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
        case ORTE_JOB_STATE_COMM_FAILED:
            /* kill all local procs */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* tell the caller we can't recover */
            return ORTE_ERR_UNRECOVERABLE;
            break;
        case ORTE_JOB_STATE_HEARTBEAT_FAILED:
            /* let the HNP handle this */
            return ORTE_SUCCESS;
            break;

        default:
            break;
        }
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the job info */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }
        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            return ORTE_SUCCESS;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - ignore */
            return ORTE_SUCCESS;
        }
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            /* kill our children */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* terminate - our routed children will see
             * us leave and automatically die
             */
            orte_quit();
        }
        /* if not, then indicate we can continue */
        return ORTE_SUCCESS;
    }

    /* lookup the local jobdat for this job */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;

        /* is this the specified job? */
        if (jobdat->jobid == proc->jobid) {
            break;
        }
    }
    if (NULL == jobdat) {
        /* must already be complete */
        return ORTE_SUCCESS;
    }

    /* if there are no local procs for this job, we can
     * ignore this call
     */
    if (0 == jobdat->num_local_procs) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                         "%s errmgr:default_orted got state %s for proc %s pid %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid));
 
    /***  UPDATE COMMAND FOR A SPECIFIC PROCESS ***/
    if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                    /* Decrement the number of local procs */
                    jobdat->num_local_procs--;
                    /* kill this proc */
                    killprocs(proc->jobid, proc->vpid);
                }
                return ORTE_SUCCESS;
            }
        }
    }

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        if (!orte_abort_non_zero_exit) {
            /* treat this as normal termination */
            goto REPORT_STATE;
        }
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away
         */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                }
                /* now pack the child's info */
                if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                /* remove the child from our local list as it is no longer alive */
                opal_list_remove_item(&orte_local_children, &child->super);
                /* Decrement the number of local procs */
                jobdat->num_local_procs--;

                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                     "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(child->name),
                                     jobdat->num_local_procs));
                
                /* release the child object */
                OBJ_RELEASE(child);
                /* done with loop */
                break;
            }
        }

        /* send it */
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

 REPORT_STATE:
    /* find this proc in the local children so we can update its state */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
            if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                child->state = state;
                if (0 < pid) {
                    child->pid = pid;
                }
                child->exit_code = exit_code;
            }
            /* done with loop */
            break;
        }
    }

    if (ORTE_PROC_STATE_REGISTERED == state) {
        /* see if everyone in this job has registered */
        if (all_children_registered(proc->jobid)) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */

            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s errmgr:default_orted: sending contact info to HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            alert = OBJ_NEW(opal_buffer_t);
            /* pack init routes command */
            cmd = ORTE_PLM_INIT_ROUTES_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack all the local child vpids and epochs */
            for (item = opal_list_get_first(&orte_local_children);
                 item != opal_list_get_end(&orte_local_children);
                 item = opal_list_get_next(item)) {
                child = (orte_odls_child_t*)item;
                if (child->name->jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto FINAL_CLEANUP;
                    }
                }
            }
            /* pack an invalid marker */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* add in contact info for all procs in the job */
            if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&alert);
                return rc;
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }        
        return rc;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        /* lookup the local jobdat for this job */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;

            /* is this the specified job? */
            if (jobdat->jobid == proc->jobid) {
                break;
            }
        }
        if (NULL == jobdat) {
            /* race condition - may not have been formed yet */
            return ORTE_SUCCESS;
        }

        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }

FINAL_CLEANUP:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jobdat->jobid)));
        
        /* remove all of this job's children from the global list - do not lock
         * the thread as we are already locked
         */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = next) {
            child = (orte_odls_child_t*)item;
            next = opal_list_get_next(item);

            if (jobdat->jobid == child->name->jobid) {
                opal_list_remove_item(&orte_local_children, &child->super);
                OBJ_RELEASE(child);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jobdat->jobid);

        /* remove this job from our local job data since it is complete */
        opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
        OBJ_RELEASE(jobdat);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }

        /* indicate that the job is complete */
        return rc;
    }
    return ORTE_SUCCESS;
}
Example #9
0
/*
 * Print a right-aligned table with one row per process of 'job'.
 *
 * A first pass over the procs computes the width of each column from
 * the longest value it has to hold; a second pass prints the header, a
 * dashed separator and the rows. Checkpoint columns are only present
 * when OPAL_ENABLE_FT_CR is 1. NULL slots in the proc array are skipped.
 *
 * @param job  job whose processes are printed
 * @return ORTE_SUCCESS always
 */
static int pretty_print_vpids(orte_job_t *job) {
    int len_o_proc_name = 0, 
        len_proc_name   = 0, 
        len_rank        = 0, 
        len_pid         = 0,
        len_state       = 0,
        len_node        = 0,
        len_ckpt_s      = 0,
        len_ckpt_r      = 0,
        len_ckpt_l      = 0; 
    int i, line_len;
    orte_vpid_t v;
    orte_proc_t *vpid;
    orte_app_context_t *app;
    char *o_proc_name;
#if OPAL_ENABLE_FT_CR == 1
    char *state_str = NULL;
#endif

    /*
     * Calculate segment lengths
     */
    len_o_proc_name = (int)strlen("ORTE Name");
    len_proc_name   = (int)strlen("Process Name");
    len_rank        = (int)strlen("Local Rank");
    len_pid         = 6;
    len_state       = 0;
    len_node        = 0;
#if OPAL_ENABLE_FT_CR == 1
    len_ckpt_s      = strlen("Ckpt State");
    len_ckpt_r      = strlen("Ckpt Ref");
    len_ckpt_l      = strlen("Ckpt Loc");
#else
    /* the checkpoint columns are not printed; -3 offsets the " | "
     * separator widths that line_len adds for them below
     */
    len_ckpt_s      = -3;
    len_ckpt_r      = -3;
    len_ckpt_l      = -3;
#endif

    for(v=0; v < job->num_procs; v++) {
        char *rankstr;
        vpid = (orte_proc_t*)job->procs->addr[v];
        if( NULL == vpid ) {
            /* empty slot in the proc array */
            continue;
        }
        
        /*
         * Find my app context
         */
        if( 0 >= (int)job->num_apps ) {
            if( 0 == vpid->name.vpid ) {
                if( (int)strlen("orterun") > len_proc_name)
                    len_proc_name = strlen("orterun");
            }
            else {
                if( (int)strlen("orted") > len_proc_name)
                    len_proc_name = strlen("orted");
            }
        }
        for( i = 0; i < (int)job->num_apps; ++i) {
            app = (orte_app_context_t*)job->apps->addr[i];
            if( NULL != app && app->idx == vpid->app_idx ) {
                if( (int)strlen(app->app) > len_proc_name) 
                    len_proc_name = strlen(app->app);
                break;
            }
        }
        
        o_proc_name = orte_util_print_name_args(&vpid->name);
        if ((int)strlen(o_proc_name) > len_o_proc_name)
            len_o_proc_name = strlen(o_proc_name);

        /* rankstr is only defined when asprintf succeeds */
        if (0 <= asprintf(&rankstr, "%u", (uint)vpid->local_rank)) {
            if ((int)strlen(rankstr) > len_rank)
                len_rank = strlen(rankstr);
            free(rankstr);
        }

        if( NULL != vpid->nodename && (int)strlen(vpid->nodename) > len_node) {
            len_node = strlen(vpid->nodename);
        } else if ((int)strlen("Unknown") > len_node) {
            len_node = strlen("Unknown");
        }

        if( (int)strlen(orte_proc_state_to_str(vpid->state)) > len_state)
            len_state = strlen(orte_proc_state_to_str(vpid->state));
        
#if OPAL_ENABLE_FT_CR == 1
        /* NOTE(review): ownership of the string written to state_str is
         * not visible here; if it is heap-allocated it is never freed -
         * confirm against orte_snapc_ckpt_state_str
         */
        orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state);
        if( (int)strlen(state_str) > len_ckpt_s)
            len_ckpt_s = strlen(state_str);
        
        if( NULL != vpid->ckpt_snapshot_ref &&
            (int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r) 
            len_ckpt_r = strlen(vpid->ckpt_snapshot_ref);
        
        if( NULL != vpid->ckpt_snapshot_loc &&
            (int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l) 
            len_ckpt_l = strlen(vpid->ckpt_snapshot_loc);
#endif
    }

    line_len = (len_o_proc_name + 3 +
                len_proc_name   + 3 +
                len_rank        + 3 +
                len_pid         + 3 +
                len_state       + 3 +
                len_node        + 3 +
                len_ckpt_s      + 3 +
                len_ckpt_r      + 3 +
                len_ckpt_l)
                + 2;

    /*
     * Print Header
     */
    printf("\t");
    printf("%*s | ", len_proc_name   , "Process Name");
    printf("%*s | ", len_o_proc_name , "ORTE Name");
    printf("%*s | ", len_rank        , "Local Rank");
    printf("%*s | ", len_pid         , "PID");
    printf("%*s | ", len_node        , "Node");
    printf("%*s | ", len_state       , "State");
#if OPAL_ENABLE_FT_CR == 1
    printf("%*s | ", len_ckpt_s      , "Ckpt State");
    printf("%*s | ", len_ckpt_r      , "Ckpt Ref");
    printf("%*s |",  len_ckpt_l      , "Ckpt Loc");
#endif
    printf("\n");
    
    printf("\t");
    pretty_print_dashed_line(line_len);
    
    /*
     * Print Info
     */
    for(v=0; v < job->num_procs; v++) {
        vpid = (orte_proc_t*)job->procs->addr[v];
        if( NULL == vpid ) {
            /* empty slot in the proc array */
            continue;
        }
        
        printf("\t");

        if( 0 >= (int)job->num_apps ) {
            if( 0 == vpid->name.vpid ) {
                printf("%*s | ", len_proc_name, "orterun");
            } else {
                printf("%*s | ", len_proc_name, "orted");
            }
        }
        for( i = 0; i < (int)job->num_apps; ++i) {
            app = (orte_app_context_t*)job->apps->addr[i];
            if( NULL != app && app->idx == vpid->app_idx ) {
                printf("%*s | ", len_proc_name, app->app);
                break;
            }
        }
        
        o_proc_name = orte_util_print_name_args(&vpid->name);

        printf("%*s | ",  len_o_proc_name, o_proc_name);
        printf("%*u | ",  len_rank       , (uint)vpid->local_rank);
        printf("%*d | ",  len_pid        , vpid->pid);
        printf("%*s | ",  len_node       , (NULL == vpid->nodename) ? "Unknown" : vpid->nodename);
        printf("%*s | ",  len_state      , orte_proc_state_to_str(vpid->state));
        
#if OPAL_ENABLE_FT_CR == 1
        /* recompute the checkpoint state for THIS row; otherwise the
         * value left over from the last proc of the sizing pass would be
         * printed on every line
         */
        orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state);
        printf("%*s | ",  len_ckpt_s, state_str);
        printf("%*s | ",  len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ? 
                                       "" : 
                                       vpid->ckpt_snapshot_ref));
        printf("%*s |",   len_ckpt_l, (NULL == vpid->ckpt_snapshot_loc ? 
                                       "" : 
                                       vpid->ckpt_snapshot_loc));
#endif
        printf("\n");
        
    }
    
    return ORTE_SUCCESS;
}
Example #10
0
/*
 * Proc-state callback for the staged HNP state machine.
 *
 * fd, args - unused here
 * cbdata   - an orte_state_caddy_t carrying the name of the proc and the
 *            state it has just reached; the caddy is released on every
 *            path before this function returns
 *
 * The proc's state and the job's launched/reported/terminated counters
 * are updated, and the matching job state is activated when a counter
 * reaches jdata->num_procs. A proc is processed as terminated either
 * when ORTE_PROC_STATE_TERMINATED is reported directly, or once both the
 * IOF-complete and waitpid-fired events have been recorded for it.
 */
static void track_procs(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;

    opal_output_verbose(2, orte_state_base_framework.framework_output,
                        "%s state:staged_hnp:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
    if (NULL == pdata) {
        /* the lookup can come back empty - do not dereference it */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
        }
        /* nothing more to do - make sure the caddy is released */
        goto cleanup;
    }

    /* if this is a registration, check to see if it came from
     * inside MPI_Init - if it did, that is not acceptable
     */
    if (ORTE_PROC_STATE_REGISTERED == state) {
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_AS_MPI) &&
            !ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_GANG_LAUNCHED)) {
            /* we can't support this - issue an error and abort */
            orte_show_help("help-state-staged-hnp.txt", "mpi-procs-not-supported", true);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
        }
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
        goto cleanup;
    }

    if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        pdata->state = state;
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            /* still waiting for waitpid to fire */
            goto cleanup;
        }
        /* both events have been seen - process as a termination */
        state = ORTE_PROC_STATE_TERMINATED;
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        pdata->state = state;
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            /* still waiting for the IOF to complete */
            goto cleanup;
        }
        /* both events have been seen - process as a termination */
        state = ORTE_PROC_STATE_TERMINATED;
    }

    /* if the proc terminated, see if any other procs are
     * waiting to run. We assume that the app_contexts are
     * in priority order, with the highest priority being
     * at position 0 in the app_context array for this job
     */
    if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = ORTE_PROC_STATE_TERMINATED;
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* Clean up the session directory as if we were the process
             * itself.  This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
        /* track job status */
        jdata->num_terminated++;
        if (jdata->num_terminated == jdata->num_procs) {
            /* no other procs are waiting, so end this job */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        } else if (jdata->num_mapped < jdata->num_procs) {
            /* schedule the job for re-mapping so that procs
             * waiting for resources can execute
             */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
        }
        /* otherwise, do nothing until more procs terminate */
    }

    /* single exit: every path, including states we do not act on,
     * releases the caddy exactly once
     */
 cleanup:
    OBJ_RELEASE(caddy);
}
Example #11
0
/*
 * Base proc-state tracking callback.
 *
 * fd, argc - unused here
 * cbdata   - an orte_state_caddy_t carrying the proc name and the state
 *            it has just reached; always released before returning
 *
 * Updates the proc's state and the job's launched/reported/terminated
 * counters, activating the corresponding job state when a counter
 * reaches jdata->num_procs. IOF-complete and waitpid-fired events each
 * set a flag on the proc; whichever arrives second activates
 * ORTE_PROC_STATE_TERMINATED for that proc.
 */
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;
    orte_proc_t *child;
    int i;

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
    if (NULL == pdata) {
        /* the lookup can come back empty - do not dereference it */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
            } else {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
            }
        }
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        pdata->state = state;
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        pdata->state = state;
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = state;
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* Clean up the session directory as if we were the process
             * itself.  This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
        }
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        if (orte_orteds_term_ordered &&
            0 == orte_routed.num_routes()) {
            /* scan with a separate cursor so pdata keeps pointing at
             * the proc this event is about
             */
            for (i=0; i < orte_local_children->size; i++) {
                child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i);
                if (NULL != child &&
                    ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:base all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
        /* track job status */
        jdata->num_terminated++;
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
static void track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;

    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                         "%s state:staged_orted:track_procs called for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    switch (state) {
    case ORTE_PROC_STATE_RUNNING:
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        /* we don't really care - nothing further to do */
        break;

    case ORTE_PROC_STATE_REGISTERED:
        /* update the proc state */
        pdata->state = state;
        /* if this proc registered as an MPI proc, and
         * MPI is not allowed, then that is an error
         */
        if (!jdata->gang_launched && pdata->mpi_proc) {
            /* abort the proc */
            /* notify the HNP of the error */
        }
        break;

    case ORTE_PROC_STATE_IOF_COMPLETE:
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        pdata->iof_complete = true;
        if (pdata->waitpid_recvd) {
            /* the proc has terminated */
            pdata->alive = false;
            pdata->state = ORTE_PROC_STATE_TERMINATED;
            /* retrieve any file maps posted by this process and forward them
             * to the HNP for collection
             */
            orte_dfs.get_file_map(proc, send_fms, pdata);
         }
        /* Release the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         * Do this after we handle termination in case the IOF needs
         * to check to see if all procs from the job are actually terminated
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        break;

    case ORTE_PROC_STATE_WAITPID_FIRED:
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        pdata->waitpid_recvd = true;
        if (pdata->iof_complete) {
            /* the proc has terminated */
            pdata->alive = false;
            pdata->state = ORTE_PROC_STATE_TERMINATED;
            /* retrieve any file maps posted by this process and forward them
             * to the HNP for collection
             */
            orte_dfs.get_file_map(proc, send_fms, pdata);
        }
        break;

    default:
        /* ignore */
        break;
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
Example #13
0
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;
    int i;
    char *rtmod;
    orte_process_name_t parent, target, *npptr;

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get our "lifeline" routed module */
    rtmod = orte_rml.get_routed(orte_mgmt_conduit);

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
            } else {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
            }
        }
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* tell the PMIx subsystem to cleanup this client */
            opal_pmix.server_deregister_client(proc, NULL, NULL);
            /* Clean up the session directory as if we were the process
             * itself.  This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
        }
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        if (orte_orteds_term_ordered &&
                0 == orte_routed.num_routes(rtmod)) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                        ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:base all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
        /* track job status */
        jdata->num_terminated++;
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            /* if they requested notification upon completion, provide it */
            if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
                /* notify_completion => notify the parent of the termination
                 * of this child job. So get the parent jobid info */
                npptr = &parent;
                if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) {
                    /* notify everyone who asked for it */
                    target.jobid = jdata->jobid;
                    target.vpid = ORTE_VPID_WILDCARD;
                    _send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD);
                } else {
                    target.jobid = jdata->jobid;
                    target.vpid = ORTE_VPID_WILDCARD;
                    _send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent);
                }
            }
        } else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
                   !orte_job_term_ordered) {
            /* if this was an abnormal term, notify the other procs of the termination */
            parent.jobid = jdata->jobid;
            parent.vpid = ORTE_VPID_WILDCARD;
            _send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent);
        }
    }

cleanup:
    OBJ_RELEASE(caddy);
}
Example #14
0
static void track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata, *pptr;
    opal_buffer_t *alert;
    int rc, i;
    orte_plm_cmd_flag_t cmd;
    char *rtmod;
    orte_std_cntr_t index;
    orte_job_map_t *map;
    orte_node_t *node;

    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                         "%s state:orted:track_procs called for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        /* don't update until we are told that all are done */
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_local_procs) {
            /* once everyone registers, notify the HNP */

            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted: notifying HNP all local registered",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            alert = OBJ_NEW(opal_buffer_t);
            /* pack registered command */
            cmd = ORTE_PLM_REGISTERED_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack all the local child vpids */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (pptr->name.jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                }
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                  ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        /* Release the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         * Do this after we handle termination in case the IOF needs
         * to check to see if all procs from the job are actually terminated
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDALL);
        }
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* if this proc has not already recorded as terminated, then
         * update the accounting here */
        if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            jdata->num_terminated++;
        }
        /* update the proc state */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED);
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = state;
        /* Clean up the session directory as if we were the process
         * itself.  This covers the case where the process died abnormally
         * and didn't cleanup its own session directory.
         */
        orte_session_dir_finalize(proc);
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        rtmod = orte_rml.get_routed(orte_mgmt_conduit);
        if (orte_orteds_term_ordered &&
            0 == orte_routed.num_routes(rtmod)) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                    ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                         "%s state:orted all routes gone but proc %s still alive",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&pdata->name)));
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* track job status */
        if (jdata->num_terminated == jdata->num_local_procs &&
            !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) {
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            alert = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the job info */
            if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
                ORTE_ERROR_LOG(rc);
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid)));
            if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                  ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
            /* mark that we sent it so we ensure we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            /* cleanup the procs as these are gone */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                /* if this child is part of the job... */
                if (pptr->name.jobid == jdata->jobid) {
                    /* clear the entry in the local children */
                    opal_pointer_array_set_item(orte_local_children, i, NULL);
                    OBJ_RELEASE(pptr);  // maintain accounting
                }
            }
            /* tell the IOF that the job is complete */
            if (NULL != orte_iof.complete) {
                orte_iof.complete(jdata);
            }

            /* tell the PMIx subsystem the job is complete */
            if (NULL != opal_pmix.server_deregister_nspace) {
                opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL);
            }

            /* release the resources */
            if (NULL != jdata->map) {
                map = jdata->map;
                for (index = 0; index < map->nodes->size; index++) {
                    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                        continue;
                    }
                    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                         "%s state:orted releasing procs from node %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         node->name));
                    for (i = 0; i < node->procs->size; i++) {
                        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                            continue;
                        }
                        if (pptr->name.jobid != jdata->jobid) {
                            /* skip procs from another job */
                            continue;
                        }
                        node->slots_inuse--;
                        node->num_procs--;
                        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                             "%s state:orted releasing proc %s from node %s",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             ORTE_NAME_PRINT(&pptr->name), node->name));
                        /* set the entry in the node array to NULL */
                        opal_pointer_array_set_item(node->procs, i, NULL);
                        /* release the proc once for the map entry */
                        OBJ_RELEASE(pptr);
                    }
                    /* set the node location to NULL */
                    opal_pointer_array_set_item(map->nodes, index, NULL);
                    /* maintain accounting */
                    OBJ_RELEASE(node);
                    /* flag that the node is no longer in a map */
                    ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
                }
                OBJ_RELEASE(map);
                jdata->map = NULL;
            }

            /* cleanup the job info */
            opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
            OBJ_RELEASE(jdata);
        }
    }

  cleanup:
    OBJ_RELEASE(caddy);
}
/*
 * Timer callback for the ft_tester sensor.
 *
 * On each invocation this optionally aborts the calling daemon and/or
 * sends SIGTERM to local children, each decision being made by comparing
 * a random draw against the component's configured failure probability.
 * The sampling timer is re-armed before returning, unless sampling has
 * been turned off (sample_ev is NULL).
 *
 * The fd, event and arg parameters are not used by this function.
 */
static void sample(int fd, short event, void *arg)
{
    orte_proc_t *victim;
    float dice;
    int idx;

    /* sampling has been turned off - nothing to do */
    if (NULL == sample_ev) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "%s sample:ft_tester considering killing something",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* a daemon with a non-zero daemon failure probability is a candidate itself */
    if (ORTE_PROC_IS_DAEMON && 0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s sample:ft_tester considering killing me!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* draw a value by scaling random() with INT32_MAX */
        dice = (double)random() / (double)INT32_MAX;
        if (dice < mca_sensor_ft_tester_component.daemon_fail_prob) {
            /* the draw came in under the threshold - take ourselves down */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                 "%s sample:ft_tester committing suicide",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            orte_errmgr.abort(1, NULL);
            return;
        }
    }

    /* walk the local children and give each eligible one its own draw */
    for (idx = 0; idx < orte_local_children->size; idx++) {
        victim = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);
        if (NULL == victim) {
            continue;
        }

        /* eligible only if alive, has a pid, and state is not past UNTERMINATED */
        if (!(victim->alive && 0 != victim->pid &&
              victim->state <= ORTE_PROC_STATE_UNTERMINATED)) {
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                 "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&victim->name),
                                 victim->alive ? "TRUE" : "FALSE",
                                 (unsigned long)victim->pid, orte_proc_state_to_str(victim->state)));
            continue;
        }

        /* per-child draw */
        dice = (double)random() / (double)INT32_MAX;
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s sample:ft_tester child: %s dice: %f prob %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&victim->name),
                             dice, mca_sensor_ft_tester_component.fail_prob));
        if (!(dice < mca_sensor_ft_tester_component.fail_prob)) {
            /* this child survives the round */
            continue;
        }

        /* this child was selected - terminate it */
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                             "%s sample:ft_tester killing %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&victim->name)));
        kill(victim->pid, SIGTERM);

        /* stop after the first kill unless multiple failures are allowed */
        if (!mca_sensor_ft_tester_component.multi_fail) {
            break;
        }
    }

    /* re-arm the timer for the next sample, if sampling is still active */
    if (NULL != sample_ev) {
        opal_event_evtimer_add(sample_ev, &sample_time);
    }
}
Example #16
0
/*
 * State-machine callback that tracks the state of a local process and
 * reports job-level milestones to the HNP.
 *
 * cbdata is an orte_state_caddy_t carrying the process name and the new
 * process state; it is released before returning on every path.  The fd
 * and argc parameters are not used.
 *
 * Handling per state:
 *   RUNNING        - record the state and count the proc as launched.
 *   REGISTERED     - record the state and count the proc as reported; once
 *                    num_reported equals num_local_procs, send a
 *                    REGISTERED command listing the local child vpids of
 *                    this job (each followed by an "as MPI" flag) to the HNP.
 *   IOF_COMPLETE   - set the IOF_COMPLETE flag; if WAITPID is already set
 *                    and the proc is not yet RECORDED, activate TERMINATED.
 *   WAITPID_FIRED  - set the WAITPID flag; if IOF_COMPLETE is already set
 *                    and the proc is not yet RECORDED, activate TERMINATED.
 *   TERMINATED     - update accounting, finalize the proc's session
 *                    directory and, once all local procs have terminated
 *                    and the job has not yet been flagged TERM_NOTIFIED,
 *                    send a state update for the job to the HNP.
 *
 * The message buffer is owned by this function until a send succeeds; on
 * any failure it is released at the cleanup label so it does not leak.
 */
static void track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata, *pptr;
    /* non-NULL only while this function still owns the buffer */
    opal_buffer_t *alert = NULL;
    int rc, i;
    orte_plm_cmd_flag_t cmd;
    int8_t flag;

    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                         "%s state:orcm:track_procs called for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
    if (NULL == pdata) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        /* don't update until we are told that all are done */
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_local_procs) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */

            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orcm: notifying HNP all local registered",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            alert = OBJ_NEW(opal_buffer_t);
            /* pack registered command */
            cmd = ORTE_PLM_REGISTERED_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack all the local child vpids */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (pptr->name.jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                    /* follow each vpid with a flag telling whether it is marked AS_MPI */
                    if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_AS_MPI)) {
                        flag = 1;
                    } else {
                        flag = 0;
                    }
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &flag, 1, OPAL_INT8))) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                }
            }
            /* send it */
            if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                              ORTE_RML_TAG_PLM,
                                                              orte_rml_send_callback, NULL))) {
                /* the send was refused, so the buffer is still ours and
                 * gets released at cleanup
                 */
                ORTE_ERROR_LOG(rc);
            } else {
                /* the buffer now belongs to the messaging layer
                 * (presumably released by orte_rml_send_callback - confirm)
                 */
                alert = NULL;
            }
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* if this proc has not already recorded as terminated, then
         * update the accounting here */
        if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            jdata->num_terminated++;
        }
        /* update the proc state */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED);
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = state;
        /* Clean up the session directory as if we were the process
         * itself.  This covers the case where the process died abnormally
         * and didn't cleanup its own session directory.
         */
        orte_session_dir_finalize(proc);
        /* track job status */
        if (jdata->num_terminated == jdata->num_local_procs &&
            !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) {
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            alert = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the job info - a failure here is logged but the
             * update is still sent, as in the original code
             */
            if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
                ORTE_ERROR_LOG(rc);
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orcm: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid)));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                /* buffer stays ours and is released at cleanup */
                ORTE_ERROR_LOG(rc);
            } else {
                /* buffer handed off to the messaging layer */
                alert = NULL;
            }
            /* mark that we sent it so we ensure we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
    }

 cleanup:
    /* release a buffer that was built but never successfully sent */
    if (NULL != alert) {
        OBJ_RELEASE(alert);
    }
    OBJ_RELEASE(caddy);
}
/*
 * Error-manager callback (default_orted) invoked when a process reports
 * an error or termination state.
 *
 * cbdata is an orte_state_caddy_t carrying the process name and state.
 * The caddy is released on EVERY exit path through the single cleanup
 * label; the fd and args parameters are not used.
 *
 * Summary of the handling, in order:
 *   - nothing is done while ORTE is finalizing or for heartbeat failures;
 *   - COMM_FAILED: ignored for our own name or for non-daemon jobs; if
 *     route_lost() reports failure, local procs are killed and the daemon
 *     terminates; otherwise the daemon terminates normally once no local
 *     child is alive and num_routes() returns zero;
 *   - SENSOR_BOUND_EXCEEDED: the proc is killed;
 *   - TERM_NON_ZERO: either treated as a normal termination (when
 *     orte_abort_non_zero_exit is false) or reported to the HNP;
 *   - FAILED_TO_START / FAILED_TO_LAUNCH: only accounting is updated;
 *   - any state beyond TERMINATED: reported to the HNP right away;
 *   - REGISTERED: once all children registered, contact info is sent;
 *   - otherwise, when no child of the job is alive, a job state update is
 *     sent and the job's local data is removed.
 *
 * A message buffer is owned by this function until a send succeeds; if
 * packing or sending fails it is released at the cleanup label.
 */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;

    orte_proc_t *child, *ptr;
    /* non-NULL only while this function still owns the buffer */
    opal_buffer_t *alert = NULL;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    orte_vpid_t null=ORTE_VPID_INVALID;
    orte_ns_cmp_bitmask_t mask=ORTE_NS_CMP_ALL;
    int i;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:default_orted:proc_errors process %s error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        goto cleanup;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            goto cleanup;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - ignore */
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s errmgr:default:orted daemon %s exited",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:orted daemon %s was a lifeline - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc)));
            /* kill our children */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* terminate - our routed children will see
             * us leave and automatically die
             */
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            goto cleanup;
        }
        /* are any of my children still alive */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                if (child->alive && child->state < ORTE_PROC_STATE_UNTERMINATED) {
                    goto cleanup;
                }
            }
        }
        /* if all my routes and children are gone, then terminate
           ourselves nicely (i.e., this is a normal termination) */
        if (0 == orte_routed.num_routes()) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:default:orted all routes gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_TERMINATE(0);
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:default:orted not exiting, num_routes() == %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (int)orte_routed.num_routes()));
        }
        /* if not, then we can continue */
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* must already be complete */
        goto cleanup;
    }

    /* if there are no local procs for this job, we can
     * ignore this call
     */
    if (0 == jdata->num_local_procs) {
        goto cleanup;
    }

    /* find this proc in the local children; on success 'i' is left at
     * the child's index, which the removal code below relies on
     */
    child = NULL;
    for (i=0; i < orte_local_children->size; i++) {
        if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
            continue;
        }
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &ptr->name, proc)) {
            child = ptr;
            break;
        }
    }
    if (NULL == child) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:default_orted got state %s for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc)));
 
    if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
        child->state = state;
        /* Decrement the number of local procs */
        jdata->num_local_procs--;
        /* kill this proc */
        killprocs(proc->jobid, proc->vpid);
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        if (!orte_abort_non_zero_exit) {
            /* leave the child in orte_local_children so we can
             * later send the state info after full job termination
             */
            child->state = state;
            child->waitpid_recvd = true;
            if (child->iof_complete) {
                /* the proc has terminated */
                child->alive = false;
                /* Clean up the session directory as if we were the process
                 * itself.  This covers the case where the process died abnormally
                 * and didn't cleanup its own session directory.
                 */
                orte_session_dir_finalize(&child->name);
                /* track job status */
                jdata->num_terminated++;
            }
            /* treat this as normal termination */
            goto REPORT_STATE;
        }
        /* report this as abnormal termination to the HNP */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        child->state = state;
        /* now pack the child's info */
        if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* remove the child from our local array as it is no longer alive */
        opal_pointer_array_set_item(orte_local_children, i, NULL);
        /* Decrement the number of local procs */
        jdata->num_local_procs--;

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&child->name),
                             jdata->num_local_procs));
        
        /* release the child object */
        OBJ_RELEASE(child);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            /* buffer stays ours and is released at cleanup */
            ORTE_ERROR_LOG(rc);
        } else {
            /* buffer handed off to the messaging layer */
            alert = NULL;
        }
        goto cleanup;
    }

    if (ORTE_PROC_STATE_FAILED_TO_START == state ||
        ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
        /* update the proc state */
        child->state = state;
        /* count the proc as having "terminated" */
        jdata->num_terminated++;
        /* leave the error report in this case to the
         * state machine, which will receive notice
         * when all local procs have attempted to start
         * so that we send a consolidated error report
         * back to the HNP
         */
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away
         */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        child->state = state;
        /* now pack the child's info */
        if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* remove the child from our local array as it is no longer alive */
        opal_pointer_array_set_item(orte_local_children, i, NULL);
        /* Decrement the number of local procs */
        jdata->num_local_procs--;

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&child->name),
                             jdata->num_local_procs));
        
        /* release the child object */
        OBJ_RELEASE(child);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            /* buffer stays ours and is released at cleanup */
            ORTE_ERROR_LOG(rc);
        } else {
            /* buffer handed off to the messaging layer */
            alert = NULL;
        }
        goto cleanup;
    }

 REPORT_STATE:
    if (ORTE_PROC_STATE_REGISTERED == state) {
        /* see if everyone in this job has registered */
        if (all_children_registered(proc->jobid)) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */

            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s errmgr:default_orted: sending contact info to HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            alert = OBJ_NEW(opal_buffer_t);
            /* pack init routes command */
            cmd = ORTE_PLM_INIT_ROUTES_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack all the local child vpids */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (ptr->name.jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ptr->name.vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                }
            }
            /* pack an invalid marker */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* add in contact info for all procs in the job */
            if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) {
                /* alert came from OBJ_NEW, so it must be released (at
                 * cleanup), not destructed in place
                 */
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM, 0,
                                                  orte_rml_send_callback, NULL))) {
                /* buffer stays ours and is released at cleanup */
                ORTE_ERROR_LOG(rc);
            } else {
                /* buffer handed off to the messaging layer */
                alert = NULL;
            }
        }        
        goto cleanup;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        
        /* remove all of this job's children from the global list - do not lock
         * the thread as we are already locked
         */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (jdata->jobid == ptr->name.jobid) {
                opal_pointer_array_set_item(orte_local_children, i, NULL);
                OBJ_RELEASE(ptr);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jdata->jobid);

        /* remove this job from our local job data since it is complete */
        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
        OBJ_RELEASE(jdata);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            /* buffer stays ours and is released at cleanup */
            ORTE_ERROR_LOG(rc);
        } else {
            /* buffer handed off to the messaging layer */
            alert = NULL;
        }
        goto cleanup;
    }

 cleanup:
    /* release a buffer that was built but never successfully sent */
    if (NULL != alert) {
        OBJ_RELEASE(alert);
    }
    OBJ_RELEASE(caddy);
}
Example #18
0
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_proc_t *pptr, *proct;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    int i;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_hnp: for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* if the orteds are terminating, check job complete */
        if (orte_orteds_term_ordered) {
            opal_output(0, "TERM ORDERED - CHECKING COMPLETE");
            goto cleanup;
        } else {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            goto cleanup;
        }
    }
    pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    /* we MUST handle a communication failure before doing anything else
     * as it requires some special care to avoid normal termination issues
     * for local application procs
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* is this to a daemon? */
        if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
            /* nope - ignore it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s Comm failure to non-daemon proc - ignoring it",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* if this is my own connection, ignore it */
        if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s Comm failure on my own connection - ignoring it",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* if we have ordered orteds to terminate or abort
         * is in progress, record it */
        if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s Comm failure: daemons terminating - recording daemon %s as gone",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
            /* remove from dependent routes, if it is one */
            orte_routed.route_lost(proc);
            /* if all my routes and local children are gone, then terminate ourselves */
            if (0 == orte_routed.num_routes()) {
                for (i=0; i < orte_local_children->size; i++) {
                    if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                            proct->alive && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
                        /* at least one is still alive */
                        goto cleanup;
                    }
                }
                /* call our appropriate exit procedure */
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                     "%s errmgr_hnp: all routes and children gone - ordering exit",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            }
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s Comm failure: daemon %s - aborting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
        /* record the first one to fail */
        if (!jdata->abort) {
            jdata->state = ORTE_JOB_STATE_COMM_FAILED;
            /* point to the lowest rank to cause the problem */
            jdata->aborted_proc = pptr;
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
        /* abort the system */
        default_hnp_abort(jdata);
        goto cleanup;
    }

    /* update the proc state - can get multiple reports on a proc
     * depending on circumstances, so ensure we only do this once
     */
    if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
        pptr->state = state;
        jdata->num_terminated++;
    }
    /* since we only come here if the proc terminated,
     * cleanup the local proc, if required
     */
    cleanup_local_proc(jdata, proc);

    /* ensure we record the failed proc properly so we can report
     * the error once we terminate
     */
    switch (state) {
    case ORTE_PROC_STATE_KILLED_BY_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s killed by cmd",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* we ordered this proc to die, so it isn't an abnormal termination
         * and we don't flag it as such
         */
        if (jdata->num_terminated >= jdata->num_procs) {
            /* this job has terminated */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
        /* don't abort the job as this isn't an abnormal termination */
        break;

    case ORTE_PROC_STATE_ABORTED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s aborted",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!jdata->abort) {
            jdata->state = ORTE_JOB_STATE_ABORTED;
            /* point to the first rank to cause the problem */
            jdata->aborted_proc = pptr;
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
        /* abnormal termination - abort */
        default_hnp_abort(jdata);
        break;

    case ORTE_PROC_STATE_ABORTED_BY_SIG:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s aborted by signal",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!jdata->abort) {
            jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
            /* point to the first rank to cause the problem */
            jdata->aborted_proc = pptr;
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
        /* abnormal termination - abort */
        default_hnp_abort(jdata);
        break;

    case ORTE_PROC_STATE_TERM_WO_SYNC:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s terminated without sync",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!jdata->abort) {
            jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
            /* point to the first rank to cause the problem */
            jdata->aborted_proc = pptr;
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* now treat a special case - if the proc exit'd without a required
             * sync, it may have done so with a zero exit code. We want to ensure
             * that the user realizes there was an error, so in this -one- case,
             * we overwrite the process' exit code with the default error code
             */
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
        }
        /* abnormal termination - abort */
        default_hnp_abort(jdata);
        break;

    case ORTE_PROC_STATE_FAILED_TO_START:
    case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state)));
        if (!jdata->abort) {
            if (ORTE_PROC_STATE_FAILED_TO_START) {
                jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
            } else {
                jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
            }
            /* point to the first rank to cause the problem */
            jdata->aborted_proc = pptr;
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
        /* abnormal termination - abort */
        default_hnp_abort(jdata);
        break;

    case ORTE_PROC_STATE_CALLED_ABORT:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s called abort",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!jdata->abort) {
            jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
            /* point to the first proc to cause the problem */
            jdata->aborted_proc = pptr;
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
        /* abnormal termination - abort */
        default_hnp_abort(jdata);
        break;

    case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s exceeded sensor boundary",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!jdata->abort) {
            jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED;
            /* point to the lowest rank to cause the problem */
            jdata->aborted_proc = pptr;
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
        /* abnormal termination - abort */
        default_hnp_abort(jdata);
        break;

    case ORTE_PROC_STATE_TERM_NON_ZERO:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s exited with non-zero status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             pptr->exit_code));
        ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        /* track the number of non-zero exits */
        jdata->num_non_zero_exit++;
        if (orte_abort_non_zero_exit) {
            if (!jdata->abort) {
                jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
                /* point to the first rank to cause the problem */
                jdata->aborted_proc = pptr;
                /* retain the object so it doesn't get free'd */
                OBJ_RETAIN(pptr);
                jdata->abort = true;
            }
            /* user requested we abort in this scenario */
            default_hnp_abort(jdata);
        } else {
            /* user requested we consider this normal termination */
            if (jdata->num_terminated >= jdata->num_procs) {
                /* this job has terminated */
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            }
        }
        break;

    case ORTE_PROC_STATE_HEARTBEAT_FAILED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s heartbeat failed",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!jdata->abort) {
            jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
            /* point to the first rank to cause the problem */
            jdata->aborted_proc = pptr;
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
        /* remove from dependent routes, if it is one */
        orte_routed.route_lost(proc);
        /* kill all jobs */
        default_hnp_abort(jdata);
        break;

    default:
        /* shouldn't get this, but terminate job if required */
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:hnp: proc %s default error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state)));
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
        break;
    }

cleanup:
    OBJ_RELEASE(caddy);
}
Example #19
0
/*
 * Handler that receives an orte_state_caddy_t through cbdata.
 *
 * Everything between "#if 0" and "#endif" below is compiled out, so as
 * built this function does exactly one thing: it releases the caddy.
 * The fd and args parameters are never read.
 *
 * The disabled code is kept for reference. If it were re-enabled it would:
 *  - release the caddy and return when orte_finalizing is set;
 *  - for ORTE_PROC_STATE_COMM_FAILED: ignore a report about our own name,
 *    call exit(1) when orte_routed.route_lost() does not return
 *    ORTE_SUCCESS, and otherwise (only when the reported name has jobid 0)
 *    pack a command/state/name message and send it to
 *    ORTE_PROC_MY_SCHEDULER;
 *  - for ORTE_PROC_STATE_LIFELINE_LOST: log and call exit(1).
 *
 * @param fd     unused
 * @param args   unused
 * @param cbdata pointer to the orte_state_caddy_t to be released
 */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
#if 0
    /* ---- disabled: none of the code in this region is compiled ---- */
    orte_ns_cmp_bitmask_t mask;
    opal_buffer_t *buf;
    orcm_rm_cmd_flag_t command = ORCM_NODESTATE_UPDATE_COMMAND;
    orcm_node_state_t state = ORCM_NODE_STATE_DOWN;
    int ret;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:orcm: proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&caddy->name),
                         orte_proc_state_to_str(caddy->proc_state)));
    
    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orcm: finalizing",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(caddy);
        return;
    }

    if (ORTE_PROC_STATE_COMM_FAILED == caddy->proc_state) {
        mask = ORTE_NS_CMP_ALL;
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, &caddy->name)) {
            OBJ_RELEASE(caddy);
            return;
        }
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(&caddy->name)) {
            OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:orcm: lost my lifeline",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* order an exit */
            ORTE_ERROR_LOG(ORTE_ERR_UNRECOVERABLE);
            OBJ_RELEASE(caddy);
            exit(1);
        } else {
            /* only notify for orcm daemon failures */
            if (0 == caddy->name.jobid) {
                OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:orcm: reporting child aggregator failure",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* inform the scheduler of the lost connection */
                buf = OBJ_NEW(opal_buffer_t);
                /* pack the node-state update command flag */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(buf, &command,1, ORCM_RM_CMD_T))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buf);
                    OBJ_RELEASE(caddy);
                    return;
                }
                /* pack the new node state (initialized to ORCM_NODE_STATE_DOWN above) */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(buf, &state, 1, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buf);
                    OBJ_RELEASE(caddy);
                    return;
                }
                /* pack the name of the proc that was reported */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(buf, &caddy->name, 1, ORTE_NAME))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buf);
                    OBJ_RELEASE(caddy);
                    return;
                }
                /* on a failed send the buffer is released here; on success
                 * it is not released by this function
                 */
                if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_SCHEDULER, buf,
                                                                   ORCM_RML_TAG_RM,
                                                                   orte_rml_send_callback, NULL))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buf);
                    OBJ_RELEASE(caddy);
                    return;
                }
            }
        }
    } else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orcm: lifeline lost",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* order an exit */
        ORTE_ERROR_LOG(ORTE_ERR_UNRECOVERABLE);
        OBJ_RELEASE(caddy);
        exit(1);
    }

#endif
    /* cleanup - the only statement that executes while the region above is disabled */
    OBJ_RELEASE(caddy);
}
Example #20
0
/*
 * Process a job-level or proc-level state update.
 *
 * The whole function runs under the "ctl" thread lock, which is acquired
 * at entry and released on every exit path.
 *
 * Job-level update (proc == NULL):
 *   Only ORTE_JOB_STATE_PROCS_MIGRATING is accepted. Every known job in
 *   that state is reset, re-mapped, and its launch data is sent to the
 *   daemons on ORCM_PNP_SYS_CHANNEL.
 *
 * Proc-level update (proc != NULL):
 *   - ORTE_PROC_STATE_COMM_FAILED is ignored.
 *   - ORTE_PROC_STATE_HEARTBEAT_FAILED marks the daemon's source as dead,
 *     notifies the apps of the procs on that daemon's node, records the
 *     daemon's failure, and then either calls recover_procs() (when a
 *     daemon at index >= 2 is not past ORTE_PROC_STATE_UNTERMINATED) or
 *     marks the node down and removes its procs from their jobs.
 *   - ORTE_PROC_STATE_RESTARTED notifies the apps of the procs that were
 *     on the daemon's node and removes those procs from their jobs.
 *   - Any other state is reported as unrecognized.
 *
 * @param job       jobid; only used in the verbose trace message
 * @param jobstate  job state; consulted only when proc is NULL
 * @param proc      name of the proc being reported, or NULL for a job update
 * @param state     reported proc state
 * @param pid       pid of the proc; only used in the verbose trace message
 * @param exit_code exit code, stored on the daemon object on heartbeat failure
 *
 * @return ORTE_SUCCESS when the update was handled or deliberately ignored;
 *         ORTE_ERROR for an unexpected job state or unrecognized proc state;
 *         ORTE_ERR_NOT_FOUND when a daemon, node, triplet or source lookup
 *         fails; the opal_dss.pack error code when building a notification
 *         fails.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    int rc=ORTE_SUCCESS, i;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_proc_t *pptr, *daemon;
    opal_buffer_t *notify;
    orcm_triplet_t *trp;
    orcm_source_t *src;
    bool procs_recovered;
    orte_job_t *jdt;
    uint16_t jfam;
    bool send_msg;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:update_state for job %s proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* protect against threads */
    ORTE_ACQUIRE_THREAD(&ctl);

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    /***   UPDATE COMMAND FOR A JOB   ***/
    if (NULL == proc) {
        /* should only get this if a daemon restarted and we need
         * to check for procs waiting to migrate
         */
        if (ORTE_JOB_STATE_PROCS_MIGRATING != jobstate) {
            /* we should never get this situation - report which
             * job state we were handed
             */
            opal_output(0, "%s UNKNOWN JOB ERROR %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_job_state_to_str(jobstate));
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERROR;
        }
        /* cycle thru all known jobs looking for those with procs
         * awaiting resources to migrate
         */
        for (i=0; i < orte_job_data->size; i++) {
            if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
                continue;
            }
            if (ORTE_JOB_STATE_PROCS_MIGRATING != jdt->state) {
                continue;
            }
            /* reset the job */
            orte_plm_base_reset_job(jdt);

            /* map the job again */
            if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdt))) {
                ORTE_ERROR_LOG(rc);
                continue;
            }
            /* launch any procs that could be mapped - note that not
             * all procs that were waiting for migration may have
             * been successfully mapped, so this could in fact
             * result in no action by the daemons
             */
            notify = OBJ_NEW(opal_buffer_t);
            /* indicate the target DVM - without it the message is
             * incomplete, so skip this job if the pack fails
             */
            jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &jfam, 1, OPAL_UINT16))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(notify);
                continue;
            }

            /* get the launch data */
            if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(notify, jdt->jobid))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return ORTE_SUCCESS;
            }
            /* send it to the daemons */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL,
                                                         NULL, ORCM_PNP_TAG_COMMAND,
                                                         NULL, 0, notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }


    /**** DEAL WITH INDIVIDUAL PROCS ****/

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:sched got state %s for proc %s pid %d exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid, exit_code));

    /* a failed comm is not acted upon here */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* ignore this */
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        /* get the proc object for this daemon */
        if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        /* ensure that the heartbeat system knows to ignore this proc
         * from this point forward
         */
        daemon->beat = 0;
        /* if we have already heard about this proc, ignore repeats */
        if (ORTE_PROC_STATE_HEARTBEAT_FAILED == daemon->state) {
            /* already heard */
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_SUCCESS;
        }
#if 0
        /* delete the route */
        orte_routed.delete_route(proc);
        /* purge the oob */
        orte_rml.purge(proc);
#endif
        /* get the triplet/source and mark this source as "dead".
         * NOTE(review): the releases of trp->ctl and src->ctl below imply
         * that the two lookup calls hand the objects back locked - confirm
         * against their definitions
         */
        if (NULL == (trp = orcm_get_triplet_stringid("orcmd:0.1:alpha"))) {
            opal_output(0, "%s CANNOT FIND DAEMON TRIPLET",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        if (NULL == (src = orcm_get_source(trp, proc, false))) {
            opal_output(0, "%s DAEMON %s IS UNKNOWN SOURCE",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));
            ORTE_RELEASE_THREAD(&trp->ctl);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        src->alive = false;
        ORTE_RELEASE_THREAD(&src->ctl);
        ORTE_RELEASE_THREAD(&trp->ctl);

        /* notify all apps immediately */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        notify = OBJ_NEW(opal_buffer_t);
        send_msg = false;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                /* the buffer was never handed off, so it is still ours to free */
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return rc;
            }
            /* reset the proc stats */
            OBJ_DESTRUCT(&pptr->stats);
            OBJ_CONSTRUCT(&pptr->stats, opal_pstats_t);
            /* since we added something, need to send msg */
            send_msg = true;
        }
        if (send_msg) {
            /* send it to all apps */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_ERRMGR, NULL, 0,
                                                         notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            OBJ_RELEASE(notify);
        }
        /* reset the node stats */
        OBJ_DESTRUCT(&node->stats);
        OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
        /* record that the daemon died */
        daemon->state = state;
        daemon->exit_code = exit_code;
        daemon->pid = 0;
        /* reset the daemon stats */
        OBJ_DESTRUCT(&daemon->stats);
        OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t);
        /* from here on work with the node the daemon object points at */
        node = daemon->node;
        if (NULL == node) {
            opal_output(0, "%s Detected failure of daemon %s on unknown node",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));
            /* can't do anything further */
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_SUCCESS;
        } else {
            opal_output(0, "%s Detected failure of daemon %s on node %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        (NULL == node->name) ? "UNKNOWN" : node->name);
        }
        /* see if any usable daemons are left alive - the scan
         * deliberately starts at index 2
         */
        procs_recovered = false;
        for (i=2; i < daemon_job->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, i))) {
                continue;
            }
            if (ORTE_PROC_STATE_UNTERMINATED < pptr->state) {
                continue;
            }
            /* at least one alive! recover procs from the failed one */
            recover_procs(proc);
            procs_recovered = true;
            break;
        }
        if (!procs_recovered) {
            daemon->node = NULL;
            node->state = ORTE_NODE_STATE_DOWN;
            node->daemon = NULL;
            /* mark all procs on this node as having terminated */
            for (i=0; i < node->procs->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                /* get the job data object for this process */
                if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) {
                    /* major problem */
                    opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&pptr->name), i,
                                orte_proc_state_to_str(pptr->state));
                    continue;
                }
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                     "%s REMOVING PROC %s FROM NODE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pptr->name), node->name));
                app->num_procs--;
                /* drop the job array's entry for the proc... */
                opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL);
                OBJ_RELEASE(pptr);
                /* ...and clean it off the node */
                opal_pointer_array_set_item(node->procs, i, NULL);
                node->num_procs--;
                /* maintain acctg */
                OBJ_RELEASE(pptr);
                /* see if job is empty */
                jdt->num_terminated++;
                if (jdt->num_procs <= jdt->num_terminated) {
                    OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                         "%s REMOVING JOB %s FROM ACTIVE ARRAY",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_JOBID_PRINT(jdt->jobid)));
                    opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL);
                    OBJ_RELEASE(jdt);
                }
            }
        }
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_RESTARTED == state) {
        OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                             "%s RESTART OF DAEMON %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* get the proc object for this daemon */
        if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        /* if apps were on that node, notify all apps immediately that
         * those procs have failed
         */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        notify = OBJ_NEW(opal_buffer_t);
        send_msg = false;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                /* the buffer was never handed off, so it is still ours to free */
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return rc;
            }
            /* since we added something, we need to send msg */
            send_msg = true;
            /* remove the proc from the app so that it will get
             * restarted when we re-activate the config
             */
            if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) {
                continue;
            }
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                 "%s REMOVING PROC %s FROM NODE %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pptr->name), node->name));
            app->num_procs--;
            /* drop the job array's entry for the proc... */
            opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL);
            OBJ_RELEASE(pptr);
            /* ...and clean it off the node */
            opal_pointer_array_set_item(node->procs, i, NULL);
            node->num_procs--;
            /* maintain acctg */
            OBJ_RELEASE(pptr);
            /* see if job is empty */
            jdt->num_terminated++;
            if (jdt->num_procs <= jdt->num_terminated) {
                OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                     "%s REMOVING JOB %s FROM ACTIVE ARRAY",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(jdt->jobid)));
                opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL);
                OBJ_RELEASE(jdt);
            }
        }
        if (send_msg) {
            /* send it to all apps */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_ERRMGR, NULL, 0,
                                                         notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            OBJ_RELEASE(notify);
        }
        /* reset the node stats */
        OBJ_DESTRUCT(&node->stats);
        OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
        /* reset the daemon stats */
        OBJ_DESTRUCT(&daemon->stats);
        OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t);
        /* don't restart procs - we'll do that later after
         * we allow time for multiple daemons to restart
         */
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    /* to arrive here is an error */
    opal_output(0, "%s GOT UNRECOGNIZED STATE %s FOR PROC %s",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                orte_proc_state_to_str(state),
                ORTE_NAME_PRINT(proc));
    /* drop the lock taken at entry, as every other exit path does */
    ORTE_RELEASE_THREAD(&ctl);
    return ORTE_ERROR;

}
Example #21
0
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
            } else {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
            }
        }
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        pdata->state = state;
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        pdata->iof_complete = true;
        if (pdata->waitpid_recvd) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        pdata->state = state;
        pdata->waitpid_recvd = true;
        if (pdata->iof_complete) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        pdata->alive = false;
        pdata->state = state;
	if (pdata->local_proc) {
            /* Clean up the session directory as if we were the process
             * itself.  This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
	}
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
	/* track job status */
	jdata->num_terminated++;
	if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
	}
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
Example #22
0
/*
 * Failure notifications come here.
 *
 * Processes a proc-state update received from a daemon. The buffer is
 * expected to hold a sequence of (name, pid, state, exit_code) records,
 * which are unpacked until the unpack call stops returning ORTE_SUCCESS.
 *
 * For each record the matching proc object is updated. A proc whose state
 * is beyond ORTE_PROC_STATE_UNTERMINATED is either:
 *   - removed from its job and from the sender's node if it was
 *     KILLED_BY_CMD (the job itself is removed once nothing runnable
 *     remains), or
 *   - used to flag its job as ORTE_JOB_STATE_RESTART.
 *
 * After unpacking, every job flagged for restart is scanned: procs below
 * their app's restart limit are marked ORTE_PROC_STATE_RESTART, procs at
 * the limit are marked ORTE_PROC_STATE_CANNOT_RESTART and removed from
 * their node. If at least one proc is to be restarted, a timer event is
 * armed that invokes launch_restart for the job after a delay that grows
 * with the number of prior failures (capped at 4 seconds).
 *
 * @param status  not used by this function
 * @param sender  daemon that sent the update; its vpid indexes orte_node_pool
 * @param tag     not used by this function
 * @param msg     not used by this function
 * @param count   not used by this function
 * @param buffer  packed proc-state records
 * @param cbdata  not used by this function
 */
static void remote_update(int status,
                          orte_process_name_t *sender,
                          orcm_pnp_tag_t tag,
                          struct iovec *msg,
                          int count,
                          opal_buffer_t *buffer,
                          void *cbdata)
{
    int rc, n, k, cnt;
    orte_process_name_t name;
    orte_job_t *jdata;
    orte_proc_t *proc, *pptr;
    orte_node_t *node;
    orte_app_context_t *app;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    pid_t pid;
    bool restart_reqd, job_done;
    struct timeval offset={0, 0};
    int32_t max_fails=0;
    orte_errmgr_caddy_t *cd;

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                         "%s errmgr:sched:receive proc state notification from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* get the node object for the sender */
    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, sender->vpid))) {
        opal_output(0, "%s CANNOT FIND NODE FOR DAEMON %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender));
        return;
    }

    /* unpack the names of the procs */
    restart_reqd = false;
    n=1;
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &name, &n, ORTE_NAME))) {

        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s GOT UPDATE FOR %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name)));

        /* unpack the pid of the proc */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &n, OPAL_PID))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* unpack the state of the proc */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &n, ORTE_PROC_STATE))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* unpack the exit_code of the proc */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &n, ORTE_EXIT_CODE))) {
            ORTE_ERROR_LOG(rc);
            return;
        }

        /* get the job object for this proc */
        if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
            /* BIG problem*/
            opal_output(0, "%s errmgr:sched JOB %s NOT FOUND",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(name.jobid));
            return;
        }

        /* get the proc object */
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name.vpid))) {
            /* unknown proc - race condition when killing a proc on cmd */
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s MISSING PROC %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&name)));
            /* n is still 1 here (set before the exit_code unpack), so the
             * next name unpack at the top of the loop is correctly primed
             */
            continue;
        }
        /* update data */
        proc->pid = pid;
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s CHANGING STATE OF PROC %s FROM %s TO %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name),
                             orte_proc_state_to_str(proc->state),
                             orte_proc_state_to_str(state)));
        proc->state = state;
        proc->exit_code = exit_code;
        /* if the proc has failed, mark the job for restart unless
         * it was killed by our own cmd
         */
        if (ORTE_PROC_STATE_UNTERMINATED < state) {
            /* reset the stats */
            OBJ_DESTRUCT(&proc->stats);
            OBJ_CONSTRUCT(&proc->stats, opal_pstats_t);
            if (ORTE_PROC_STATE_KILLED_BY_CMD == state) {
                /* this is a response to our killing a proc - remove it
                 * from the system
                 */
                opal_pointer_array_set_item(jdata->procs, name.vpid, NULL);
                jdata->num_procs--;
                /* clean it off of the node */
                for (k=0; k < node->procs->size; k++) {
                    if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
                        continue;
                    }
                    if (pptr->name.jobid == proc->name.jobid &&
                        pptr->name.vpid == proc->name.vpid) {
                        /* found it */
                        OPAL_OUTPUT_VERBOSE((7, orte_errmgr_base.output,
                                             "%s REMOVING ENTRY %d FOR PROC %s FROM NODE %s",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k,
                                             ORTE_NAME_PRINT(&proc->name),
                                             ORTE_VPID_PRINT(sender->vpid)));
                        opal_pointer_array_set_item(node->procs, k, NULL);
                        node->num_procs--;
                        /* maintain acctg - presumably drops the reference
                         * that the node's proc array was holding
                         */
                        OBJ_RELEASE(proc);
                        break;
                    }
                }
                /* release the object - presumably the reference that the
                 * job's proc array was holding
                 */
                OBJ_RELEASE(proc);
                /* if the job is now empty, or if the only procs remaining are stopped
                 * due to exceeding restart (and thus cannot run), remove it too
                 */
                if (0 == jdata->num_procs) {
                    opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
                    OBJ_RELEASE(jdata);
                } else {
                    job_done = true;
                    for (k=0; k < jdata->procs->size; k++) {
                        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, k))) {
                            continue;
                        }
                        OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                             "%s CHECKING PROC %s STATE %s",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             ORTE_NAME_PRINT(&pptr->name),
                                             orte_proc_state_to_str(pptr->state)));
                        /* any proc that is not parked in CANNOT_RESTART
                         * keeps the job around
                         */
                        if (pptr->state < ORTE_PROC_STATE_UNTERMINATED ||
                            ORTE_PROC_STATE_CANNOT_RESTART != pptr->state) {
                            job_done = false;
                            break;
                        }
                    }
                    if (job_done) {
                        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
                        OBJ_RELEASE(jdata);
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                     "%s FLAGGING JOB %s AS CANDIDATE FOR RESTART",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(jdata->jobid)));
                jdata->state = ORTE_JOB_STATE_RESTART;
                /* flag that at least one job requires restart */
                restart_reqd = true;
            }
        }
        /* prep for next round */
        n=1;
    }
    /* running off the end of the buffer is the expected way to leave the
     * loop - anything else is a real error
     */
    if (ORCM_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
    }

    /* if restart not reqd, nothing more to do */
    if (!restart_reqd) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s NO RESTARTS REQUIRED",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return;
    }

    /* cycle thru the array of jobs looking for those requiring restart
     * (index 0 is skipped by this loop)
     */
    for (n=1; n < orte_job_data->size; n++) {
        if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
            continue;
        }
        if (ORTE_JOB_STATE_RESTART != jdata->state) {
            continue;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s JOB %s CANDIDATE FOR RESTART",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        /* find the proc that needs restarting */
        restart_reqd = false;
        max_fails = 0;
        offset.tv_sec = 0;
        for (cnt=0; cnt < jdata->procs->size; cnt++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, cnt))) {
                continue;
            }
            if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
                ORTE_PROC_STATE_KILLED_BY_CMD != proc->state) {
                /* get the app for this proc */
                app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx);
                if (NULL == app) {
                    opal_output(0, "%s UNKNOWN APP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    continue;
                }

                /* check the number of restarts to see if the limit has been
                 * reached - a negative max_restarts means "no limit"
                 */
                if (app->max_restarts < 0 ||
                    proc->restarts < app->max_restarts) {
                    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                         "%s FLAGGING PROC %s FOR RESTART",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&proc->name)));
                    /* flag the proc for restart */
                    proc->state = ORTE_PROC_STATE_RESTART;
                    restart_reqd = true;
                    /* adjust accounting */
                    jdata->num_terminated++;
                    /* increment the restart counter since the proc will be restarted */
                    proc->restarts++;
                    /* track max failures */
                    if (max_fails < proc->restarts) {
                        max_fails = proc->restarts;
                    }
                } else {
                    /* limit reached - don't restart it */
                    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                         "%s PROC %s AT LIMIT - CANNOT RESTART",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&proc->name)));
                    /* leave the proc in the system so users can see that it
                     * reached the restart limit
                     */
                    proc->state = ORTE_PROC_STATE_CANNOT_RESTART;
                    proc->pid = 0;
                    /* increment his restarts this once so it shows as too high */
                    proc->restarts++;
                    /* adjust accounting */
                    jdata->num_procs--;
                    jdata->num_terminated++;
                    /* clean it off of the node */
                    if (NULL == (node = proc->node)) {
                        continue;
                    }
                    for (k=0; k < node->procs->size; k++) {
                        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
                            continue;
                        }
                        if (pptr == proc) {
                            /* found it */
                            opal_pointer_array_set_item(node->procs, k, NULL);
                            node->num_procs--;
                            /* detach from the node BEFORE dropping the
                             * reference so the object is never written to
                             * after it has been released
                             */
                            proc->node = NULL;
                            /* maintain acctg */
                            OBJ_RELEASE(proc);
                            break;
                        }
                    }
                }
            }
        }
        /* if no procs require restart, then move on to next job */
        if (!restart_reqd) {
            jdata->state = ORTE_JOB_STATE_RUNNING;  /* reset this */
            continue;
        }

        /* calculate a delay to avoid racy situation when a proc
         * is continuously failing due to, e.g., a bad command
         * syntax
         */
        if (1 < max_fails) {
            if (4 < max_fails) {
                /* cap the delay at 4 secs */
                offset.tv_sec = 4;
            } else {
                /* add a sec for each failure beyond the first */
                offset.tv_sec = max_fails - 1;
            }
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s DELAYING RESTART OF JOB %s FOR %d SECS",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid), (int)offset.tv_sec));
        /* hand the job to launch_restart via a timer event */
        cd = OBJ_NEW(orte_errmgr_caddy_t);
        cd->jdata = jdata;
        opal_event_evtimer_set(opal_event_base, &cd->ev, launch_restart, cd);
        opal_event_evtimer_add(&cd->ev, &offset);
    }
}
Example #23
0
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_proc_t *pptr, *proct;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    int i;
    int32_t i32, *i32ptr;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* could be a race condition */
        goto cleanup;
    }
    pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    /* we MUST handle a communication failure before doing anything else
     * as it requires some special care to avoid normal termination issues
     * for local application procs
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* is this to a daemon? */
        if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
            /* nope - ignore it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure to non-daemon proc - ignoring it",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* if this is my own connection, ignore it */
        if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure on my own connection - ignoring it",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* mark the daemon as gone */
        ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
        /* if we have ordered orteds to terminate or abort
         * is in progress, record it */
        if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure: daemons terminating - recording daemon %s as gone",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
            /* remove from dependent routes, if it is one */
            orte_routed.route_lost(proc);
            /* if all my routes and local children are gone, then terminate ourselves */
            if (0 == orte_routed.num_routes()) {
                for (i=0; i < orte_local_children->size; i++) {
                    if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                            ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
                        /* at least one is still alive */
                        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                             "%s Comm failure: at least one proc (%s) still alive",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             ORTE_NAME_PRINT(&proct->name)));
                        goto cleanup;
                    }
                }
                /* call our appropriate exit procedure */
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr_hnp: all routes and children gone - ordering exit",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            } else {
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                     "%s Comm failure: %d routes remain alive",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int)orte_routed.num_routes()));
            }
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s Comm failure: daemon %s - aborting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
        /* record the first one to fail */
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            /* output an error message so the user knows what happened */
            orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
            /* mark the daemon job as failed */
            jdata->state = ORTE_JOB_STATE_COMM_FAILED;
            /* point to the lowest rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            /* update our exit code */
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* just in case the exit code hadn't been set, do it here - this
             * won't override any reported exit code */
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
        }
        /* abort the system */
        default_hnp_abort(jdata);
        goto cleanup;
    }

    /* update the proc state - can get multiple reports on a proc
     * depending on circumstances, so ensure we only do this once
     */
    if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
        pptr->state = state;
    }

    /* if we were ordered to terminate, mark this proc as dead and see if
     * any of our routes or local  children remain alive - if not, then
     * terminate ourselves. */
    if (orte_orteds_term_ordered) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
                    goto keep_going;
                }
            }
        }
        /* if all my routes and children are gone, then terminate
           ourselves nicely (i.e., this is a normal termination) */
        if (0 == orte_routed.num_routes()) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default:hnp all routes gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
        }
    }

keep_going:
    /* ensure we record the failed proc properly so we can report
     * the error once we terminate
     */
    switch (state) {
    case ORTE_PROC_STATE_KILLED_BY_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s killed by cmd",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* we ordered this proc to die, so it isn't an abnormal termination
         * and we don't flag it as such
         */
        if (jdata->num_terminated >= jdata->num_procs) {
            /* this job has terminated */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
        /* don't abort the job as this isn't an abnormal termination */
        break;

    case ORTE_PROC_STATE_ABORTED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s aborted",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_ABORTED;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        break;

    case ORTE_PROC_STATE_ABORTED_BY_SIG:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s aborted by signal",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));

        ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        /* track the number of non-zero exits */
        i32 = 0;
        i32ptr = &i32;
        orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
        ++i32;
        orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
        if (orte_abort_non_zero_exit) {

            if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
                jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
                /* point to the first rank to cause the problem */
                orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
                /* retain the object so it doesn't get free'd */
                OBJ_RETAIN(pptr);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
                ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
                /* abnormal termination - abort, but only do it once
                 * to avoid creating a lot of confusion */
                default_hnp_abort(jdata);
            }
        } else {
            /* user requested we consider this normal termination */
            if (jdata->num_terminated >= jdata->num_procs) {
                /* this job has terminated */
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            }
        }
        break;

    case ORTE_PROC_STATE_TERM_WO_SYNC:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s terminated without sync",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* now treat a special case - if the proc exit'd without a required
             * sync, it may have done so with a zero exit code. We want to ensure
             * that the user realizes there was an error, so in this -one- case,
             * we overwrite the process' exit code with the default error code
             */
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        break;

    case ORTE_PROC_STATE_FAILED_TO_START:
    case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            if (ORTE_PROC_STATE_FAILED_TO_START) {
                jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
            } else {
                jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
            }
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        /* if this was a daemon, report it */
        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
            /* output a message indicating we failed to launch a daemon */
            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
        }
        break;

    case ORTE_PROC_STATE_CALLED_ABORT:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s called abort with exit code %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc), pptr->exit_code));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
            /* point to the first proc to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        break;

    case ORTE_PROC_STATE_TERM_NON_ZERO:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s exited with non-zero status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             pptr->exit_code));
        ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        /* track the number of non-zero exits */
        i32 = 0;
        i32ptr = &i32;
        orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
        ++i32;
        orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
        if (orte_abort_non_zero_exit) {
            if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
                jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
                /* point to the first rank to cause the problem */
                orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
                /* retain the object so it doesn't get free'd */
                OBJ_RETAIN(pptr);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
                /* abnormal termination - abort, but only do it once
                 * to avoid creating a lot of confusion */
                default_hnp_abort(jdata);
            }
        } else {
            /* user requested we consider this normal termination */
            if (jdata->num_terminated >= jdata->num_procs) {
                /* this job has terminated */
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            }
        }
        break;

    case ORTE_PROC_STATE_HEARTBEAT_FAILED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s heartbeat failed",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        /* remove from dependent routes, if it is one */
        orte_routed.route_lost(proc);
        break;

    case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: unable to send message to proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* if this proc is one of my daemons, then we are truly
         * hosed - so just exit out
         */
        if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            break;
        }
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        break;

    default:
        /* shouldn't get this, but terminate job if required */
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s default error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state)));
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
        break;
    }
    /* if the waitpid fired, be sure to let the state machine know */
    if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
        ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
    }

cleanup:
    OBJ_RELEASE(caddy);
}
Example #24
0
/*
 * Handle the loss of a daemon: account for the daemon as terminated,
 * mark its node as down, flag every proc recorded on that node for
 * restart, and arm a timer that relaunches each affected job.
 *
 * @param daemon  name of the lost daemon; its vpid is used as the index
 *                into daemon_job->procs
 *
 * The caller is expected to hold the thread lock, so no locking is
 * performed here.
 */
static void recover_procs(orte_process_name_t *daemon)
{
    orte_job_t *jdt;
    orte_proc_t *proc;
    orte_node_t *node=NULL;
    int i;
    struct timeval offset={0, 0};
    int32_t max_fails=0;
    orte_errmgr_caddy_t *cd;

    /* the thread is locked by the caller, so don't do anything here */

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s ATTEMPTING TO RECOVER PROCS FROM DAEMON %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(daemon)));

    /* if not already done, mark this daemon as down */
    proc = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, daemon->vpid);
    if (NULL != proc) {
        /* correctly track number of alive daemons */
        daemon_job->num_terminated++;
        orte_process_info.num_procs--;
        /* get the corresponding node */
        node = proc->node;
        /* detach the node BEFORE dropping our reference - the object
         * must not be touched once it has been released
         */
        proc->node = NULL;
        /* maintain accounting */
        OBJ_RELEASE(proc);
    }
    if (NULL == node) {
        /* either the daemon has already been removed or it carried no
         * node pointer. Finding the node would require searching the
         * node array, as it doesn't necessarily correspond to the
         * daemon's vpid - that search is not implemented, so give up
         */
        opal_output(0, "RECOVER PROCS - MISSING NODE");
        return;
    }

    /* mark the node as down so it won't be used in mapping
     * procs to be relaunched
     */
    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s MARKING NODE %s DOWN",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         node->name));

    node->state = ORTE_NODE_STATE_DOWN;
    node->daemon = NULL;

    /* mark all procs on this node as having terminated */
    for (i=0; i < node->procs->size; i++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
            continue;
        }
        /* get the job data object for this process */
        if (NULL == (jdt = orte_get_job_data_object(proc->name.jobid))) {
            /* major problem */
            opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&proc->name), i,
                        orte_proc_state_to_str(proc->state));
            continue;
        }
        /* since the proc failed for reasons other than its own, this restart
         * does not count against its total - so mark it for restart
         */
        proc->state = ORTE_PROC_STATE_RESTART;
        proc->pid = 0;
        jdt->state = ORTE_JOB_STATE_RESTART;
        /* remember the worst restart count seen - it drives the delay below */
        if (max_fails < proc->restarts) {
            max_fails = proc->restarts;
        }
        /* adjust the num terminated so that acctg works right */
        jdt->num_terminated++;
    }

    /* calculate a delay to avoid racy situation when a proc
     * is continuously failing due to, e.g., a bad command
     * syntax
     */
    if (1 < max_fails) {
        if (4 < max_fails) {
            /* cap the delay at 4 secs */
            offset.tv_sec = 4;
        } else {
            /* add a sec for each failure beyond the first */
            offset.tv_sec = max_fails - 1;
        }
    }

    /* now cycle thru the jobs and restart all those that were flagged */
    for (i=0; i < orte_job_data->size; i++) {
        if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        if (ORTE_JOB_STATE_RESTART == jdt->state) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s DELAYING RESTART OF JOB %s FOR %d SECS",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdt->jobid), (int)offset.tv_sec));
            /* the caddy carries the job to launch_restart when the timer fires */
            cd = OBJ_NEW(orte_errmgr_caddy_t);
            cd->jdata = jdt;
            opal_event_evtimer_set(opal_event_base, &cd->ev, launch_restart, cd);
            opal_event_evtimer_add(&cd->ev, &offset);
        }
    }
}
Example #25
0
/****    PROC STATE MACHINE    ****/
/*
 * Activate the handler registered for a proc state.
 *
 * Walks orte_proc_states looking for an entry whose proc_state equals
 * 'state'. If none is found, falls back to the ORTE_PROC_STATE_ERROR
 * entry (only when 'state' is greater than ORTE_PROC_STATE_ERROR) or
 * otherwise to the ORTE_PROC_STATE_ANY entry. The selected callback is
 * scheduled by creating a caddy carrying the proc name and state and
 * activating an event at the entry's priority.
 *
 * @param proc   name of the process whose state is being activated
 * @param state  the proc state to activate
 */
void orte_state_base_activate_proc_state(orte_process_name_t *proc,
                                         orte_proc_state_t state)
{
    opal_list_item_t *cursor;
    opal_list_item_t *wildcard = NULL;    /* last ORTE_PROC_STATE_ANY entry seen */
    opal_list_item_t *err_entry = NULL;   /* last ORTE_PROC_STATE_ERROR entry seen */
    orte_state_t *entry;
    orte_state_t *chosen = NULL;
    orte_state_caddy_t *cd;

    /* scan the registered states, stopping at the first exact match */
    cursor = opal_list_get_first(&orte_proc_states);
    while (cursor != opal_list_get_end(&orte_proc_states)) {
        entry = (orte_state_t*)cursor;
        if (ORTE_PROC_STATE_ANY == entry->proc_state) {
            wildcard = cursor;
        }
        if (ORTE_PROC_STATE_ERROR == entry->proc_state) {
            err_entry = cursor;
        }
        if (state == entry->proc_state) {
            chosen = entry;
            break;
        }
        cursor = opal_list_get_next(cursor);
    }

    if (NULL != chosen) {
        /* exact match */
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "%s ACTIVATING PROC %s STATE %s PRI %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state), chosen->priority));
        if (NULL == chosen->cbfunc) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "%s NULL CBFUNC FOR PROC %s STATE %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc),
                                 orte_proc_state_to_str(state)));
            return;
        }
    } else {
        /* the state wasn't registered - pick a default handler, if any */
        if (ORTE_PROC_STATE_ERROR < state && NULL != err_entry) {
            chosen = (orte_state_t*)err_entry;
        } else if (NULL != wildcard) {
            chosen = (orte_state_t*)wildcard;
        } else {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "INCREMENT: ANY STATE NOT FOUND"));
            return;
        }
        if (NULL == chosen->cbfunc) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "ACTIVATE: ANY STATE HANDLER NOT DEFINED"));
            return;
        }
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "%s ACTIVATING PROC %s STATE %s PRI %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state), chosen->priority));
    }

    /* hand the name and state to the callback via a caddy and fire the event */
    cd = OBJ_NEW(orte_state_caddy_t);
    cd->name = *proc;
    cd->proc_state = state;
    opal_event_set(orte_event_base, &cd->ev, -1, OPAL_EV_WRITE, chosen->cbfunc, cd);
    opal_event_set_priority(&cd->ev, chosen->priority);
    opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
}
Example #26
0
/*
 * Receive callback for xcast messages.
 *
 * Steps performed:
 *   1. copy the incoming payload untouched into a pass-through buffer
 *      (rly) that is forwarded to the recipients returned by the routed
 *      module;
 *   2. unpack the compression flag and, when set, uncompress the blob;
 *   3. unpack (and discard) the signature, then unpack the target tag;
 *   4. for ORTE_RML_TAG_DAEMON messages, peek at the daemon command and,
 *      for the add-procs/nidmap commands, consume the nidmap and wireup
 *      info before building the local "relay" buffer;
 *   5. send rly to every live recipient;
 *   6. post the local relay buffer to ourselves via ORTE_RML_POST_MESSAGE
 *      (skipped for ORTE_DAEMON_DVM_NIDMAP_CMD).
 *
 * @param status  not used by this function
 * @param sender  not used by this function
 * @param buffer  the received message
 * @param tg      not used by this function (the target tag is unpacked
 *                from the message itself)
 * @param cbdata  not used by this function
 */
static void xcast_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer, orte_rml_tag_t tg,
                       void* cbdata)
{
    opal_list_item_t *item;
    orte_namelist_t *nm;
    int ret, cnt;
    opal_buffer_t *relay=NULL, *rly;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_NULL_CMD;
    opal_buffer_t wireup, datbuf, *data;
    opal_byte_object_t *bo;
    int8_t flag;
    orte_job_t *jdata;
    orte_proc_t *rec;
    opal_list_t coll;
    orte_grpcomm_signature_t *sig;
    orte_rml_tag_t tag;
    char *rtmod, *nidmap;
    size_t inlen, cmplen;
    uint8_t *packed_data, *cmpdata;
    int32_t nvals, i;
    opal_value_t kv, *kval;
    orte_process_name_t dmn;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct:xcast:recv: with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)buffer->bytes_used));

    /* we need a passthru buffer to send to our children - we leave it
     * as compressed data */
    rly = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(rly, buffer);
    OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
    /* setup the relay list */
    OBJ_CONSTRUCT(&coll, opal_list_t);

    /* unpack the flag to see if this payload is compressed */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        return;
    }
    if (flag) {
        /* unpack the data size */
        cnt=1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &inlen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* unpack the unpacked data size */
        cnt=1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &cmplen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* allocate the space */
        /* NOTE(review): the malloc result is not checked, and inlen is
         * narrowed from size_t to int below - confirm both are acceptable
         * for the message sizes this path can receive */
        packed_data = (uint8_t*)malloc(inlen);
        /* unpack the data blob */
        cnt = inlen;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, packed_data, &cnt, OPAL_UINT8))) {
            ORTE_ERROR_LOG(ret);
            free(packed_data);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* decompress the data */
        if (orte_util_uncompress_block(&cmpdata, cmplen,
                                       packed_data, inlen)) {
            /* the data has been uncompressed */
            opal_dss.load(&datbuf, cmpdata, cmplen);
            data = &datbuf;
        } else {
            data = buffer;
        }
        free(packed_data);
    } else {
        data = buffer;
    }

    /* get the signature that we do not need */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }
    OBJ_RELEASE(sig);

    /* get the target tag */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &tag, &cnt, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }

    /* get our conduit's routed module name */
    rtmod = orte_rml.get_routed(orte_coll_conduit);

    /* if this is headed for the daemon command processor,
     * then we first need to check for add_local_procs
     * as that command includes some needed wireup info */
    if (ORTE_RML_TAG_DAEMON == tag) {
        /* peek at the command */
        cnt=1;
        if (ORTE_SUCCESS == (ret = opal_dss.unpack(data, &command, &cnt, ORTE_DAEMON_CMD))) {
            /* if it is an exit cmd, then flag that we are quitting so we will properly
             * handle connection losses from our downstream peers */
            if (ORTE_DAEMON_EXIT_CMD == command ||
                ORTE_DAEMON_HALT_VM_CMD == command) {
                orte_orteds_term_ordered = true;
                if (ORTE_DAEMON_HALT_VM_CMD == command) {
                    /* this is an abnormal termination */
                    orte_abnormal_term_ordered = true;
                }
                /* copy the msg for relay to ourselves */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                opal_dss.copy_payload(relay, data);
            } else if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
                       ORTE_DAEMON_DVM_NIDMAP_CMD == command ||
                       ORTE_DAEMON_DVM_ADD_PROCS == command) {
                /* setup our internal relay buffer */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* unpack the nidmap string - may be NULL */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &nidmap, &cnt, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (NULL != nidmap) {
                    if (ORTE_SUCCESS != (ret = orte_regx.nidmap_parse(nidmap))) {
                        ORTE_ERROR_LOG(ret);
                        /* don't leak the unpacked string on the error path */
                        free(nidmap);
                        goto relay;
                    }
                    free(nidmap);
                }
                /* see if they included info on node capabilities */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (0 != flag) {
                    /* update our local nidmap, if required - the decode function
                     * knows what to do
                     */
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:direct:xcast updating daemon nidmap",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

                    if (ORTE_SUCCESS != (ret = orte_regx.decode_daemon_nodemap(data))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }

                    if (!ORTE_PROC_IS_HNP) {
                        /* update the routing plan - the HNP already did
                         * it when it computed the VM, so don't waste time
                         * re-doing it here */
                        orte_routed.update_routing_plan(rtmod);
                    }
                    /* routing is now possible */
                    orte_routed_base.routing_enabled = true;

                    /* unpack the byte object */
                    cnt=1;
                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (0 < bo->size) {
                        /* load it into a buffer */
                        OBJ_CONSTRUCT(&wireup, opal_buffer_t);
                        opal_dss.load(&wireup, bo->bytes, bo->size);
                        /* decode it, pushing the info into our database */
                        if (opal_pmix.legacy_get()) {
                            /* legacy format: a sequence of (name, uri-string) pairs */
                            OBJ_CONSTRUCT(&kv, opal_value_t);
                            kv.key = OPAL_PMIX_PROC_URI;
                            kv.type = OPAL_STRING;
                            cnt=1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kv.data.string, &cnt, OPAL_STRING))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, &kv))) {
                                    ORTE_ERROR_LOG(ret);
                                    free(kv.data.string);
                                    break;
                                }
                                free(kv.data.string);
                                kv.data.string = NULL;
                            }
                            /* running off the end of the buffer is the normal exit */
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        } else {
                            /* current format: a sequence of (name, count, count x value) */
                            cnt=1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &nvals, &cnt, OPAL_INT32))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                for (i=0; i < nvals; i++) {
                                    cnt = 1;
                                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kval, &cnt, OPAL_VALUE))) {
                                        ORTE_ERROR_LOG(ret);
                                        break;
                                    }
                                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                                         "%s STORING MODEX DATA FOR PROC %s KEY %s",
                                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                         ORTE_NAME_PRINT(&dmn), kval->key));
                                    if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, kval))) {
                                        ORTE_ERROR_LOG(ret);
                                        OBJ_RELEASE(kval);
                                        break;
                                    }
                                    OBJ_RELEASE(kval);
                                }
                            }
                            /* running off the end of the buffer is the normal exit */
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        }
                        /* done with the wireup buffer - dump it */
                        OBJ_DESTRUCT(&wireup);
                    }
                    free(bo);
                }
                /* copy the remainder of the payload - we don't pass wiring info
                 * to the odls */
                opal_dss.copy_payload(relay, data);
            } else {
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* copy the msg for relay to ourselves */
                opal_dss.copy_payload(relay, data);
            }
        } else {
            /* NOTE: relay is still NULL on this path */
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }
    } else {
        /* copy the msg for relay to ourselves */
        relay = OBJ_NEW(opal_buffer_t);
        opal_dss.copy_payload(relay, data);
    }

  relay:
    if (!orte_do_not_launch) {
        /* get the list of next recipients from the routed module */
        orte_routed.get_routing_list(rtmod, &coll);

        /* if list is empty, no relay is required */
        if (opal_list_is_empty(&coll)) {
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay - recipient list is empty!",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto CLEANUP;
        }

        /* send the message to each recipient on list, deconstructing it as we go */
        while (NULL != (item = opal_list_remove_first(&coll))) {
            nm = (orte_namelist_t*)item;

            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)rly->bytes_used,
                                 ORTE_NAME_PRINT(&nm->name)));
            /* one reference per send attempt - dropped below if the send
             * is not made */
            OBJ_RETAIN(rly);
            /* check the state of the recipient - no point
             * sending to someone not alive
             */
            jdata = orte_get_job_data_object(nm->name.jobid);
            /* an unknown job is treated the same as an unknown proc
             * instead of dereferencing a NULL job object */
            if (NULL == jdata ||
                NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if ((ORTE_PROC_STATE_RUNNING < rec->state &&
                ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
                !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay: %s ",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name),
                                ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE) ? orte_proc_state_to_str(rec->state) : "NOT ALIVE");
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
                                                               &nm->name, rly, ORTE_RML_TAG_XCAST,
                                                               orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            OBJ_RELEASE(item);
        }
    }

 CLEANUP:
    /* cleanup */
    OPAL_LIST_DESTRUCT(&coll);
    OBJ_RELEASE(rly);  // retain accounting

    /* now pass the relay buffer to myself for processing - don't
     * inject it into the RML system via send as that will compete
     * with the relay messages down in the OOB. Instead, pass it
     * directly to the RML message processor.
     *
     * relay is NULL when the command could not be unpacked, so it
     * must be checked before it is dereferenced */
    if (NULL != relay) {
        if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
            ORTE_RML_POST_MESSAGE(ORTE_PROC_MY_NAME, tag, 1,
                                  relay->base_ptr, relay->bytes_used);
            /* detach the bytes so releasing the buffer leaves them alone -
             * presumably the posted message now owns them; TODO confirm */
            relay->base_ptr = NULL;
            relay->bytes_used = 0;
        }
        OBJ_RELEASE(relay);
    }
    OBJ_DESTRUCT(&datbuf);
}
/*
 * PROC
 *
 * Render an orte_proc_t as a newly allocated, human-readable string.
 *
 * Three formats are produced, selected by global flags:
 *   - orte_xml_output:          a single <process .../> XML element
 *   - !orte_devel_level_output: a one-line summary for users
 *   - otherwise:                the full developer-level dump
 *
 * @param output  receives the allocated string; it is set to NULL on
 *                entry and the caller must free() the result
 * @param prefix  string prepended to each output line; NULL is treated
 *                as a single space
 * @param src     the proc to print
 * @param type    not used by this function
 * @return ORTE_SUCCESS on every path
 */
int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type)
{
    char *tmp, *tmp2, *pfx2;

    /* set default result */
    *output = NULL;

    /* protect against NULL prefix */
    if (NULL == prefix) {
        asprintf(&pfx2, " ");
    } else {
        asprintf(&pfx2, "%s", prefix);
    }

    if (orte_xml_output) {
        /* need to create the output in XML format - the pid attribute
         * is omitted when no pid has been recorded */
        if (0 == src->pid) {
            asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", pfx2,
                     ORTE_VPID_PRINT(src->name.vpid), orte_proc_state_to_str(src->state));
        } else {
            asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\"/>\n", pfx2,
                     ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, orte_proc_state_to_str(src->state));
        }
        free(pfx2);
        return ORTE_SUCCESS;
    }

    if (!orte_devel_level_output) {
        /* just print a very simple output for users */
        asprintf(&tmp, "\n%sProcess OMPI jobid: %s App: %ld Process rank: %s", pfx2,
                 ORTE_JOBID_PRINT(src->name.jobid), (long)src->app_idx,
                 ORTE_VPID_PRINT(src->name.vpid));

        /* set the return */
        *output = tmp;
        free(pfx2);
        return ORTE_SUCCESS;
    }

    asprintf(&tmp, "\n%sData for proc: %s", pfx2, ORTE_NAME_PRINT(&src->name));

    asprintf(&tmp2, "%s\n%s\tPid: %ld\tLocal rank: %lu\tNode rank: %lu\tApp rank: %d", tmp, pfx2,
             (long)src->pid, (unsigned long)src->local_rank, (unsigned long)src->node_rank, src->app_rank);
    free(tmp);
    tmp = tmp2;

#if OPAL_HAVE_HWLOC
    {
        char *locale=NULL;
        char *bind = NULL;

        if (NULL != src->locale) {
            hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
        }
        if (NULL != src->bind_location) {
            hwloc_bitmap_list_asprintf(&bind, src->bind_location->cpuset);
        }
        /* every %s argument must be non-NULL: passing a NULL pointer
         * for %s is undefined behavior, so substitute a placeholder
         * for each field that may be unset */
        asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tBind location: %s\tBinding: %s", tmp, pfx2,
                 orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
                 (NULL == locale) ? "UNKNOWN" : locale,
                 (NULL == bind) ? "NULL" : bind,
                 (NULL == src->cpu_bitmap) ? "NULL" : src->cpu_bitmap);
        /* free(NULL) is a no-op, so no guards are needed */
        free(locale);
        free(bind);
    }
#else
    asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld", tmp, pfx2,
             orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx);
#endif
    free(tmp);

    /* set the return */
    *output = tmp2;

    free(pfx2);
    return ORTE_SUCCESS;
}
Example #28
0
/*
 * Handle a process error state on a daemon (orted).
 *
 * The signature (fd, args, cbdata) is that of an event callback; fd and
 * args are not used.  cbdata is an orte_state_caddy_t carrying the name
 * of the affected process and the state being reported.  The caddy is
 * released before returning on every path.
 *
 * Depending on the state this routine:
 *   - ignores the error (finalizing, heartbeat failure, unknown job,
 *     comm failure to ourselves, non-local proc);
 *   - kills the local children and terminates (lifeline lost / unable
 *     to send);
 *   - treats a comm failure to a non-daemon as a waitpid trigger;
 *   - reports the failure of a daemon or of a local proc to the HNP
 *     (at most once per job, tracked via ORTE_JOB_FAIL_NOTIFIED);
 *   - or, when the last local proc of the job has terminated, reports
 *     the job state to the HNP and drops the local job data.
 *
 * Any alert buffer that could not be fully packed or handed to the RML
 * is released here so that error paths do not leak it.
 */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;

    orte_proc_t *child, *ptr;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    int i;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted:proc_errors process %s error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors finalizing - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
        ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orted lifeline lost - exiting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* set our exit status */
        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
        /* kill our children */
        killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
        /* terminate - our routed children will see
         * us leave and automatically die
         */
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* must already be complete */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - we can't seem to trust that we will catch the waitpid
             * in this situation, so push this over to be handled as if
             * it were a waitpid trigger so we don't create a bunch of
             * duplicate code */
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto cleanup;
            }
            /* leave the exit code alone - process this as a waitpid */
            odls_base_default_wait_local_proc(child, NULL);
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default:orted daemon %s exited",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* if we are using static ports, then it is possible that the HNP
         * will not see this termination. So if the HNP didn't order us
         * to terminate, then we should ensure it knows */
        if (orte_static_ports && !orte_orteds_term_ordered) {
            /* send an alert to the HNP */
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                /* the alert will never be sent - don't leak it */
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* set the exit code to reflect the problem */
            child->exit_code = ORTE_ERR_COMM_FAILURE;
            /* pack only the data for this daemon - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }

            /* now pack the daemon's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting lost connection to daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc)));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            /* continue on */
            goto cleanup;
        }

        if (orte_orteds_term_ordered) {
            /* are any of my children still alive */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                             "%s errmgr:default:orted[%s(%d)] proc %s is alive",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             __FILE__, __LINE__,
                                             ORTE_NAME_PRINT(&child->name)));
                        goto cleanup;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            } else {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted not exiting, num_routes() == %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int)orte_routed.num_routes()));
            }
        }
        /* if not, then we can continue */
        goto cleanup;
    }

    if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }
    /* if this is not a local proc for this job, we can
     * ignore this call
     */
    if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors proc is not local - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted got state %s for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc)));

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        /* update the state */
        child->state = state;
        /* report this as abnormal termination to the HNP, unless we already have
         * done so for this job */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }

            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    if (ORTE_PROC_STATE_FAILED_TO_START == state ||
        ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
        /* update the proc state */
        child->state = state;
        /* count the proc as having "terminated" */
        jdata->num_terminated++;
        /* leave the error report in this case to the
         * state machine, which will receive notice
         * when all local procs have attempted to start
         * so that we send a consolidated error report
         * back to the HNP
         */
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if we were ordered to terminate, see if
         * any of our routes or local children remain alive - if not, then
         * terminate ourselves. */
        if (orte_orteds_term_ordered) {
            /* mark the child as no longer alive and update the counters, if necessary.
             * we have to do this here as we aren't going to send this to the state
             * machine, and we want to keep the bookkeeping accurate just in case */
            if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED);
                jdata->num_terminated++;
            }
            /* scan with a separate iterator: 'child' must keep pointing at
             * the proc that failed, because the keep_going path below
             * updates and reports that proc */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(ptr, ORTE_PROC_FLAG_ALIVE)) {
                        goto keep_going;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            }
            /* no need to alert the HNP - we are already on our way out */
            goto cleanup;
        }

    keep_going:
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away - but
         * only do this once!
         */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            child->state = state;
            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));

        /* remove all of this job's children from the global list */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (jdata->jobid == ptr->name.jobid) {
                opal_pointer_array_set_item(orte_local_children, i, NULL);
                OBJ_RELEASE(ptr);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jdata->jobid);

        /* remove this job from our local job data since it is complete */
        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
        OBJ_RELEASE(jdata);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
        }
        /* fall through so the caddy is released on this path as well */
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
Example #29
0
/*
 * Process incoming PLM messages in order of receipt.
 *
 * The first item unpacked from buffer is the command flag; the rest of
 * the buffer is interpreted according to that command:
 *
 *   ORTE_PLM_LAUNCH_JOB_CMD     unpack a job object, record the sender as
 *                               its originator, inherit prefix/bookmark
 *                               from the sender's job when available, and
 *                               spawn it.  If any step fails, the error
 *                               code is sent back to the sender on
 *                               ORTE_RML_TAG_PLM_PROXY.
 *   ORTE_PLM_UPDATE_PROC_STATE  unpack one or more (jobid, {vpid, pid,
 *                               state, exit_code}...) groups and activate
 *                               the reported proc state for each entry.
 *   ORTE_PLM_REGISTERED_CMD     unpack a jobid followed by vpids and mark
 *                               each of those procs as registered.
 *
 * status, tag and cbdata are not used here.  If processing ends with a
 * non-success rc on the HNP, a forced termination is requested.
 */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_jobid_t job;
    orte_job_t *jdata, *parent;
    opal_buffer_t *answer;
    orte_vpid_t vpid;
    orte_proc_t *proc;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    int32_t rc=ORTE_SUCCESS, ret;
    orte_app_context_t *app, *child_app;
    orte_process_name_t name;
    pid_t pid;
    bool running;
    int i;
    char **env;
    char *prefix_dir;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive processing msg",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    switch (command) {
    case ORTE_PLM_LAUNCH_JOB_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive job launch command from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));

        /* unpack the job object */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        /* record the sender so we know who to respond to */
        jdata->originator.jobid = sender->jobid;
        jdata->originator.vpid = sender->vpid;

        /* get the parent's job object */
        if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
            /* if the prefix was set in the parent's job, we need to transfer
             * that prefix to the child's app_context so any further launch of
             * orteds can find the correct binary. There always has to be at
             * least one app_context in both parent and child, so we don't
             * need to check that here. However, be sure not to overwrite
             * the prefix if the user already provided it!
             */
            app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
            child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
            prefix_dir = NULL;
            if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) &&
                !orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) {
                orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
            }
            if (NULL != prefix_dir) {
                free(prefix_dir);
            }
        }

        /* if the user asked to forward any envars, cycle through the app contexts
         * in the comm_spawn request and add them
         */
        if (NULL != orte_forwarded_envars) {
            for (i=0; i < jdata->apps->size; i++) {
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                    continue;
                }
                env = opal_environ_merge(orte_forwarded_envars, app->env);
                opal_argv_free(app->env);
                app->env = env;
            }
        }

        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive adding hosts",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* process any add-hostfile and add-host options that were provided */
        if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        if (NULL != parent) {
            if (NULL == parent->bookmark) {
                /* find the sender's node in the job map */
                if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
                    /* set the bookmark so the child starts from that place - this means
                     * that the first child process could be co-located with the proc
                     * that called comm_spawn, assuming slots remain on that node. Otherwise,
                     * the procs will start on the next available node
                     */
                    jdata->bookmark = proc->node;
                }
            } else {
                jdata->bookmark = parent->bookmark;
            }
        }

        /* launch it */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive calling spawn",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
        break;
    ANSWER_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive - error on launch: %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));

        /* setup the response */
        answer = OBJ_NEW(opal_buffer_t);

        /* pack the error code to be returned */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
        }

        /* send the response back to the sender */
        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_PLM_PROXY,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;

    case ORTE_PLM_UPDATE_PROC_STATE:
        opal_output_verbose(5, orte_plm_base_framework.framework_output,
                            "%s plm:base:receive update proc state command from %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(sender));
        count = 1;
        while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {

            opal_output_verbose(5, orte_plm_base_framework.framework_output,
                                "%s plm:base:receive got update_proc_state for job %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_JOBID_PRINT(job));

            name.jobid = job;
            running = false;
            /* get the job object */
            jdata = orte_get_job_data_object(job);
            count = 1;
            while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID))) {
                if (ORTE_VPID_INVALID == vpid) {
                    /* flag indicates that this job is complete - move on */
                    break;
                }
                name.vpid = vpid;
                /* unpack the pid */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &count, OPAL_PID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                /* unpack the state */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &count, ORTE_PROC_STATE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                if (ORTE_PROC_STATE_RUNNING == state) {
                    running = true;
                }
                /* unpack the exit code */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }

                OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                                     "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));

                if (NULL != jdata) {
                    /* get the proc data object */
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        /* there is no proc object to update - stop here
                         * rather than dereference the NULL pointer below.
                         * Termination has already been requested above. */
                        goto CLEANUP;
                    }
                    /* NEVER update the proc state before activating the state machine - let
                     * the state cbfunc update it as it may need to compare this
                     * state against the prior proc state */
                    proc->pid = pid;
                    proc->exit_code = exit_code;
                    ORTE_ACTIVATE_PROC_STATE(&name, state);
                }
            }
            /* record that we heard back from a daemon during app launch */
            if (running && NULL != jdata) {
                jdata->num_daemons_reported++;
                if (orte_report_launch_progress) {
                    if (0 == jdata->num_daemons_reported % 100 ||
                        jdata->num_daemons_reported == orte_process_info.num_procs) {
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
                    }
                }
            }
            /* prepare for next job */
            count = 1;
        }
        /* running off the end of the buffer is the normal way out of the
         * loop above; anything else is a genuine unpack error */
        if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        break;

    case ORTE_PLM_REGISTERED_CMD:
        count=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto DEPART;
        }
        name.jobid = job;
        /* get the job object */
        if (NULL == (jdata = orte_get_job_data_object(job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto DEPART;
        }
        count=1;
        while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) {
            name.vpid = vpid;
            ORTE_ACTIVATE_PROC_STATE(&name, ORTE_PROC_STATE_REGISTERED);
            count=1;
        }
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        break;
    }

 CLEANUP:
    if (ORTE_SUCCESS != rc) {
        goto DEPART;
    }

 DEPART:
    /* see if an error occurred - if so, wakeup the HNP so we can exit */
    if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
        jdata = NULL;
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive done processing commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
Example #30
0
static int pretty_print_vpids(orte_job_t *job) {
    int len_o_proc_name = 0,
        len_proc_name   = 0,
        len_rank        = 0,
        len_pid         = 0,
        len_state       = 0,
        len_node        = 0,
        len_ckpt_s      = 0,
        len_ckpt_r      = 0,
        len_ckpt_l      = 0;
    int i, line_len;
    orte_vpid_t v;
    orte_proc_t *vpid;
    orte_app_context_t *app;
    char *o_proc_name;
    char **nodename = NULL;

    if (0 == job->num_procs) {
        return ORTE_SUCCESS;
    }

    /*
     * Caculate segment lengths
     */
    len_o_proc_name = (int)strlen("ORTE Name");
    len_proc_name   = (int)strlen("Process Name");
    len_rank        = (int)strlen("Local Rank");
    len_pid         = 6;
    len_state       = 0;
    len_node        = 0;
    len_ckpt_s      = -3;
    len_ckpt_r      = -3;
    len_ckpt_l      = -3;

    nodename = (char **) malloc(job->num_procs * sizeof(char *));
    for(v=0; v < job->num_procs; v++) {
        char *rankstr;
        vpid = (orte_proc_t*)job->procs->addr[v];

        /*
         * Find my app context
         */
        if( 0 >= (int)job->num_apps ) {
            if( 0 == vpid->name.vpid ) {
                if( (int)strlen("orterun") > len_proc_name)
                    len_proc_name = strlen("orterun");
            }
            else {
                if( (int)strlen("orted") > len_proc_name)
                    len_proc_name = strlen("orted");
            }
        }
        for( i = 0; i < (int)job->num_apps; ++i) {
            app = (orte_app_context_t*)job->apps->addr[i];
            if( app->idx == vpid->app_idx ) {
                if( (int)strlen(app->app) > len_proc_name)
                    len_proc_name = strlen(app->app);
                break;
            }
        }

        o_proc_name = orte_util_print_name_args(&vpid->name);
        if ((int)strlen(o_proc_name) > len_o_proc_name)
            len_o_proc_name = strlen(o_proc_name);

        asprintf(&rankstr, "%u", (uint)vpid->local_rank);
        if ((int)strlen(rankstr) > len_rank)
            len_rank = strlen(rankstr);
        free(rankstr);

        nodename[v] = NULL;
        if( orte_get_attribute(&vpid->attributes, ORTE_PROC_NODENAME, (void**)&nodename[v], OPAL_STRING) &&
            (int)strlen(nodename[v]) > len_node) {
            len_node = strlen(nodename[v]);
        } else if ((int)strlen("Unknown") > len_node) {
            len_node = strlen("Unknown");
        }

        if( (int)strlen(orte_proc_state_to_str(vpid->state)) > len_state)
            len_state = strlen(orte_proc_state_to_str(vpid->state));

    }

    line_len = (len_o_proc_name + 3 +
                len_proc_name   + 3 +
                len_rank        + 3 +
                len_pid         + 3 +
                len_state       + 3 +
                len_node        + 3 +
                len_ckpt_s      + 3 +
                len_ckpt_r      + 3 +
                len_ckpt_l)
                + 2;

    /*
     * Print Header
     */
    printf("\t");
    printf("%*s | ", len_proc_name   , "Process Name");
    printf("%*s | ", len_o_proc_name , "ORTE Name");
    printf("%*s | ", len_rank        , "Local Rank");
    printf("%*s | ", len_pid         , "PID");
    printf("%*s | ", len_node        , "Node");
    printf("%*s | ", len_state       , "State");
    printf("\n");

    printf("\t");
    pretty_print_dashed_line(line_len);

    /*
     * Print Info
     */
    for(v=0; v < job->num_procs; v++) {
        vpid = (orte_proc_t*)job->procs->addr[v];

        printf("\t");

        if( 0 >= (int)job->num_apps ) {
            if( 0 == vpid->name.vpid ) {
                printf("%*s | ", len_proc_name, "orterun");
            } else {
                printf("%*s | ", len_proc_name, "orted");
            }
        }
        for( i = 0; i < (int)job->num_apps; ++i) {
            app = (orte_app_context_t*)job->apps->addr[i];
            if( app->idx == vpid->app_idx ) {
                printf("%*s | ", len_proc_name, app->app);
                break;
            }
        }

        o_proc_name = orte_util_print_name_args(&vpid->name);

        printf("%*s | ",  len_o_proc_name, o_proc_name);
        printf("%*u | ",  len_rank       , (uint)vpid->local_rank);
        printf("%*d | ",  len_pid        , vpid->pid);
        printf("%*s | ",  len_node       , (NULL == nodename[v]) ? "Unknown" : nodename[v]);
        printf("%*s | ",  len_state      , orte_proc_state_to_str(vpid->state));

        if (NULL != nodename[v]) {
            free(nodename[v]);
        }
        printf("\n");

    }
    if (NULL != nodename) {
        free(nodename);
    }
    return ORTE_SUCCESS;
}