/*
 * State-machine callback: invoked when a job transitions into an error
 * state.  Logs the event and pushes it to the notifier framework; no
 * recovery is attempted here.
 *
 * fd/args are unused libevent parameters; cbdata is an orte_state_caddy_t
 * that this function consumes (released before returning on all paths
 * except early shutdown).
 */
static void job_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_state_t jobstate = caddy->job_state;
    char *msg;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return;
    }

    /* if the jdata is NULL, then we abort as this
     * is reporting an unrecoverable error
     */
    if (NULL == caddy->jdata) {
        /* NOTE(review): the format says "jobid %s" but both leading %s
         * arguments print our own process name - presumably because no
         * job object is available on this path; confirm intent */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orcm: jobid %s reported error state %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_job_state_to_str(jobstate)));
        asprintf(&msg, "%s errmgr:orcm: jobid %s reported error state %s",
                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                 orte_job_state_to_str(jobstate));
        /* notify this - jdata is NULL here, so the notifier macro must
         * tolerate a NULL job pointer.  NOTE(review): msg is allocated by
         * asprintf and not freed here - assumed the notifier takes
         * ownership of the string; confirm */
        ORTE_NOTIFIER_INTERNAL_ERROR(caddy->jdata, jobstate, ORTE_NOTIFIER_CRIT, 1, msg);
        /* cleanup */
        /* ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);*/
        OBJ_RELEASE(caddy);
        return;
    }

    /* update the state - we know the job here, so report its jobid
     * and raise only a WARN-level (non-fatal) notification */
    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:orcm: job %s reported error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(caddy->jdata->jobid),
                         orte_job_state_to_str(jobstate)));
    asprintf(&msg, "%s errmgr:orcm: jobid %s reported error state %s",
             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
             ORTE_JOBID_PRINT(caddy->jdata->jobid),
             orte_job_state_to_str(jobstate));
    /* notify this - NOTE(review): same msg-ownership assumption as above */
    ORTE_NOTIFIER_INTERNAL_ERROR(caddy->jdata, jobstate, ORTE_NOTIFIER_WARN, 1, msg);
    /* cleanup */
    OBJ_RELEASE(caddy);
}
/*
 * Register a callback (with priority) for the given job state.
 * Returns ORTE_ERR_BAD_PARAM if the state was already registered,
 * ORTE_SUCCESS otherwise.  The new entry is appended to the global
 * orte_job_states list.
 */
int orte_state_base_add_job_state(orte_job_state_t state, orte_state_cbfunc_t cbfunc, int priority)
{
    opal_list_item_t *cur;
    orte_state_t *entry;

    /* refuse to register the same job state twice */
    for (cur = opal_list_get_first(&orte_job_states);
         cur != opal_list_get_end(&orte_job_states);
         cur = opal_list_get_next(cur)) {
        entry = (orte_state_t*)cur;
        if (state == entry->job_state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "DUPLICATE STATE DEFINED: %s",
                                 orte_job_state_to_str(state)));
            return ORTE_ERR_BAD_PARAM;
        }
    }

    /* not previously defined - build and append a new entry */
    entry = OBJ_NEW(orte_state_t);
    entry->job_state = state;
    entry->cbfunc = cbfunc;
    entry->priority = priority;
    opal_list_append(&orte_job_states, &(entry->super));

    return ORTE_SUCCESS;
}
/*
 * Emit machine-parseable (colon-delimited) information for the mpirun
 * instance described by hnpinfo: one summary line, optional per-node
 * lines, then per-job and per-process lines.  The daemon job (index 0)
 * is excluded from the job report.
 */
static int parseable_print(orte_ps_mpirun_info_t *hnpinfo)
{
    orte_job_t **jobarray;
    orte_node_t **nodearray;
    orte_proc_t *p;
    orte_app_context_t *ctx;
    char *appname;
    char *nodename;
    int jn, pn;

    /* don't include the daemon job in the number of jobs reported */
    printf("mpirun:%lu:num nodes:%d:num jobs:%d\n",
           (unsigned long)hnpinfo->hnp->pid, hnpinfo->num_nodes, hnpinfo->num_jobs-1);

    /* per-node summary, if requested */
    if (orte_ps_globals.nodes) {
        nodearray = hnpinfo->nodes;
        for (jn = 0; jn < hnpinfo->num_nodes; jn++) {
            printf("node:%s:state:%s:slots:%d:in use:%d\n",
                   nodearray[jn]->name, pretty_node_state(nodearray[jn]->state),
                   nodearray[jn]->slots, nodearray[jn]->slots_inuse);
        }
    }

    jobarray = hnpinfo->jobs;
    /* skip job=0 as that's the daemon job */
    for (jn = 1; jn < hnpinfo->num_jobs; jn++) {
        printf("jobid:%d:state:%s:slots:%d:num procs:%d\n",
               ORTE_LOCAL_JOBID(jobarray[jn]->jobid),
               orte_job_state_to_str(jobarray[jn]->state),
               jobarray[jn]->total_slots_alloc,
               jobarray[jn]->num_procs);
        /* one line per live proc in this job */
        for (pn = 0; pn < jobarray[jn]->procs->size; pn++) {
            p = (orte_proc_t*)opal_pointer_array_get_item(jobarray[jn]->procs, pn);
            if (NULL == p) {
                continue;
            }
            ctx = (orte_app_context_t*)opal_pointer_array_get_item(jobarray[jn]->apps, p->app_idx);
            /* both branches return heap memory we must free below */
            appname = (NULL == ctx) ? strdup("NULL") : opal_basename(ctx->app);
            nodename = NULL;
            orte_get_attribute(&p->attributes, ORTE_PROC_NODENAME, (void**)&nodename, OPAL_STRING);
            printf("process:%s:rank:%s:pid:%lu:node:%s:state:%s\n",
                   appname, ORTE_VPID_PRINT(p->name.vpid), (unsigned long)p->pid,
                   (NULL == nodename) ? "unknown" : nodename,
                   orte_proc_state_to_str(p->state));
            free(appname);
            free(nodename); /* free(NULL) is a no-op */
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Notifier callback: forward a job-state report to syslog, prefixed with
 * a timestamp and our process name.  req->jdata may be NULL (reported as
 * an invalid jobid) and req->msg may be NULL (reported as "<N/A>").
 */
static void myreport(orte_notifier_request_t *req)
{
    char tod[48];
    size_t len;

    opal_output_verbose(5, orte_notifier_base_framework.framework_output,
                        "notifier:syslog:myreport function called with severity %d state %s and messg %s",
                        (int)req->severity, orte_job_state_to_str(req->state),
                        req->msg);

    /* If there was a message, output it */
    (void)ctime_r(&req->t, tod);
    /* trim the newline: ctime_r() produces a string ending in "\n\0".
     * BUG FIX: the previous code wrote '\0' at tod[strlen(tod)], i.e. over
     * the existing terminator, which left the newline in the output.  We
     * must strip the character *before* the terminator. */
    len = strlen(tod);
    if (0 < len && '\n' == tod[len - 1]) {
        tod[len - 1] = '\0';
    }

    syslog(req->severity, "[%s]%s JOBID %s REPORTS STATE %s: %s",
           tod, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
           ORTE_JOBID_PRINT((NULL == req->jdata) ? ORTE_JOBID_INVALID : req->jdata->jobid),
           orte_job_state_to_str(req->state),
           (NULL == req->msg) ? "<N/A>" : req->msg);
}
void orte_state_base_print_job_state_machine(void) { opal_list_item_t *item; orte_state_t *st; opal_output(0, "ORTE_JOB_STATE_MACHINE:"); for (item = opal_list_get_first(&orte_job_states); item != opal_list_get_end(&orte_job_states); item = opal_list_get_next(item)) { st = (orte_state_t*)item; opal_output(0, "\tState: %s cbfunc: %s", orte_job_state_to_str(st->job_state), (NULL == st->cbfunc) ? "NULL" : "DEFINED"); } }
/*
 * errmgr:default_app state-update hook.  Logs the report; the only state
 * acted upon is COMM_FAILED: our own connection is ignored, and a lost
 * lifeline route is reported as unrecoverable.  Everything else returns
 * ORTE_SUCCESS.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    orte_ns_cmp_bitmask_t cmp_mask;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_app: job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), exit_code));

    /* nothing to do if we are already shutting down */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    /* only communication failures require action here */
    if (ORTE_PROC_STATE_COMM_FAILED != state) {
        return ORTE_SUCCESS;
    }

    cmp_mask = ORTE_NS_CMP_ALL;
    /* our own connection closing is not an error */
    if (OPAL_EQUAL == orte_util_compare_name_fields(cmp_mask, ORTE_PROC_MY_NAME, proc)) {
        return ORTE_SUCCESS;
    }
    /* losing a lifeline route is unrecoverable */
    if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
        return ORTE_ERR_UNRECOVERABLE;
    }
    return ORTE_SUCCESS;
}
static void job_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_job_state_t jobstate; orte_exit_code_t sts; orte_proc_t *aborted_proc; opal_buffer_t *answer; int32_t rc, ret; int room, *rmptr; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { return; } /* if the jdata is NULL, then we abort as this * is reporting an unrecoverable error */ if (NULL == caddy->jdata) { ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); OBJ_RELEASE(caddy); return; } /* update the state */ jdata = caddy->jdata; jobstate = caddy->job_state; jdata->state = jobstate; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_hnp: job %s reported state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), orte_job_state_to_str(jobstate))); if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate || ORTE_JOB_STATE_ALLOC_FAILED == jobstate || ORTE_JOB_STATE_MAP_FAILED == jobstate || ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) { if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) { /* this is the primary job */ orte_never_launched = true; } /* disable routing as we may not have performed the daemon * wireup - e.g., in a managed environment, all the daemons * "phone home", but don't actually wireup into the routed * network until they receive the launch message */ orte_routing_is_enabled = false; jdata->num_terminated = jdata->num_procs; ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED); /* if it was a dynamic spawn, then we better tell them this didn't work */ if (ORTE_JOBID_INVALID != jdata->originator.jobid) { rc = jobstate; answer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) { ORTE_ERROR_LOG(ret); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); 
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* pack the room number */ rmptr = &room; if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) { if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp sending dyn error release of job %s to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), ORTE_NAME_PRINT(&jdata->originator))); if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer, ORTE_RML_TAG_LAUNCH_RESP, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } } OBJ_RELEASE(caddy); return; } if (ORTE_JOB_STATE_FAILED_TO_START == jobstate || ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) { /* the job object for this job will have been NULL'd * in the array if the job was solely local. 
If it isn't * NULL, then we need to tell everyone else to die */ aborted_proc = NULL; if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) { sts = aborted_proc->exit_code; if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) { if (WIFSIGNALED(sts)) { /* died on signal */ #ifdef WCOREDUMP if (WCOREDUMP(sts)) { orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, WTERMSIG(sts)); sts = WTERMSIG(sts); } else { orte_show_help("help-plm-base.txt", "daemon-died-signal", true, WTERMSIG(sts)); sts = WTERMSIG(sts); } #else orte_show_help("help-plm-base.txt", "daemon-died-signal", true, WTERMSIG(sts)); sts = WTERMSIG(sts); #endif /* WCOREDUMP */ } else { orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, WEXITSTATUS(sts)); sts = WEXITSTATUS(sts); } } } /* if this is the daemon job, then we need to ensure we * output an error message indicating we couldn't launch the * daemons */ if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true); } } /* if the daemon job aborted and we haven't heard from everyone yet, * then this could well have been caused by a daemon not finding * a way back to us. In this case, output a message indicating a daemon * died without reporting. Otherwise, say nothing as we * likely already output an error message */ if (ORTE_JOB_STATE_ABORTED == jobstate && jdata->jobid == ORTE_PROC_MY_NAME->jobid && jdata->num_procs != jdata->num_reported) { orte_show_help("help-errmgr-base.txt", "failed-daemon", true); } /* abort the job */ ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT); /* set the global abnormal exit flag */ orte_abnormal_term_ordered = true; OBJ_RELEASE(caddy); }
/*
 * errmgr:default_orted state-update entry point.
 *
 * Dispatches on (proc == NULL) for whole-job updates vs. per-proc updates.
 * Job updates are relayed to the HNP via ORTE_PLM_UPDATE_PROC_STATE;
 * per-proc updates maintain the local child list, report aborts, forward
 * registration info, and on full job termination clean up local state.
 *
 * Control flow is label-driven (REPORT_STATE, FINAL_CLEANUP) and the
 * exact statement order matters - see the NOTE(review) comments below.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    opal_list_item_t *item, *next;
    orte_odls_job_t *jobdat = NULL;
    orte_odls_child_t *child;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    orte_vpid_t null=ORTE_VPID_INVALID; /* list-terminator marker packed after vpids */
    orte_ns_cmp_bitmask_t mask;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "errmgr:default_orted:update_state() %s) "
                         "------- %s state updated for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ((NULL == proc) ? "App. Process" :
                          (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
        ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        return ORTE_SUCCESS;
    }

    /*** UPDATE COMMAND FOR A JOB ***/
    if (NULL == proc) {
        /* this is an update for an entire job */
        if (ORTE_JOBID_INVALID == job) {
            /* whatever happened, we don't know what job
             * it happened to */
            orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error",
                           true, orte_job_state_to_str(jobstate));
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                /* NOTE(review): alert is not released on this error path - confirm leak */
                return rc;
            }
            /* pack the "invalid" jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                /* NOTE(review): alert is not released on this error path - confirm leak */
                return rc;
            }
            /* send it - on success the RML owns the buffer */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM, 0,
                                                  cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
            return rc;
        }
        /* lookup the local jobdat for this job.
         * NOTE(review): if the job is absent from a non-empty list, jobdat
         * is left pointing at the LAST entry, so the NULL check below only
         * catches an empty list - confirm this matches intent */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;
            /* is this the specified job? */
            if (jobdat->jobid == job) {
                break;
            }
        }
        if (NULL == jobdat) {
            return ORTE_ERR_NOT_FOUND;
        }
        switch (jobstate) {
        case ORTE_JOB_STATE_FAILED_TO_START:
            failed_start(jobdat, exit_code);
            break;
        case ORTE_JOB_STATE_RUNNING:
            /* update all local child states */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
            break;
        case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
            /* update all procs in job */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            /* order all local procs for this job to be killed */
            killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
            /* NOTE(review): there is no break here, so this case falls
             * through into COMM_FAILED - kills ALL local procs and returns
             * unrecoverable.  Looks like a missing break; confirm intent */
        case ORTE_JOB_STATE_COMM_FAILED:
            /* kill all local procs */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* tell the caller we can't recover */
            return ORTE_ERR_UNRECOVERABLE;
            break;
        case ORTE_JOB_STATE_HEARTBEAT_FAILED:
            /* let the HNP handle this */
            return ORTE_SUCCESS;
            break;
        default:
            break;
        }
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            /* NOTE(review): this goto jumps into the terminated-cleanup code
             * far below, which releases jobdat and this job's children -
             * confirm that is intended from this branch */
            goto FINAL_CLEANUP;
        }
        /* pack the job info */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }
        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            return ORTE_SUCCESS;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - ignore */
            return ORTE_SUCCESS;
        }
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            /* kill our children */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* terminate - our routed children will see
             * us leave and automatically die */
            orte_quit();
        }
        /* if not, then indicate we can continue */
        return ORTE_SUCCESS;
    }

    /* lookup the local jobdat for this job (same last-entry caveat as above) */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;
        /* is this the specified job? */
        if (jobdat->jobid == proc->jobid) {
            break;
        }
    }
    if (NULL == jobdat) {
        /* must already be complete */
        return ORTE_SUCCESS;
    }

    /* if there are no local procs for this job, we can
     * ignore this call */
    if (0 == jobdat->num_local_procs) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                         "%s errmgr:default_orted got state %s for proc %s pid %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid));

    /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/
    if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                    /* Decrement the number of local procs */
                    jobdat->num_local_procs--;
                    /* kill this proc */
                    killprocs(proc->jobid, proc->vpid);
                }
                return ORTE_SUCCESS;
            }
        }
    }

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        if (!orte_abort_non_zero_exit) {
            /* treat this as normal termination */
            goto REPORT_STATE;
        }
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            /* NOTE(review): jumps into the terminated-cleanup block below
             * (releases jobdat/children) - confirm intent */
            goto FINAL_CLEANUP;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            /* NOTE(review): alert is not released on this error path - confirm leak */
            return rc;
        }
        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                }
                /* now pack the child's info */
                if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                /* remove the child from our local list as it is no longer alive */
                opal_list_remove_item(&orte_local_children, &child->super);
                /* Decrement the number of local procs */
                jobdat->num_local_procs--;
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                     "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(child->name),
                                     jobdat->num_local_procs));
                /* release the child object */
                OBJ_RELEASE(child);
                /* done with loop */
                break;
            }
        }
        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

 REPORT_STATE:
    /* find this proc in the local children so we can update its state */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
            if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                child->state = state;
                if (0 < pid) {
                    child->pid = pid;
                }
                child->exit_code = exit_code;
            }
            /* done with loop */
            break;
        }
    }

    if (ORTE_PROC_STATE_REGISTERED == state) {
        /* see if everyone in this job has registered */
        if (all_children_registered(proc->jobid)) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s errmgr:default_orted: sending contact info to HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            alert = OBJ_NEW(opal_buffer_t);
            /* pack init routes command */
            cmd = ORTE_PLM_INIT_ROUTES_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack all the local child vpids and epochs */
            for (item = opal_list_get_first(&orte_local_children);
                 item != opal_list_get_end(&orte_local_children);
                 item = opal_list_get_next(item)) {
                child = (orte_odls_child_t*)item;
                if (child->name->jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto FINAL_CLEANUP;
                    }
                }
            }
            /* pack an invalid marker */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* add in contact info for all procs in the job */
            if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) {
                ORTE_ERROR_LOG(rc);
                /* NOTE(review): OBJ_DESTRUCT(&alert) destructs the pointer
                 * variable's address, but alert was OBJ_NEW'd - this looks
                 * like it should be OBJ_RELEASE(alert); confirm */
                OBJ_DESTRUCT(&alert);
                return rc;
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM, 0,
                                                  cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }
        return rc;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        /* lookup the local jobdat for this job (same last-entry caveat) */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;
            /* is this the specified job? */
            if (jobdat->jobid == proc->jobid) {
                break;
            }
        }
        if (NULL == jobdat) {
            /* race condition - may not have been formed yet */
            return ORTE_SUCCESS;
        }
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }

    FINAL_CLEANUP:
        /* NOTE(review): this label is also reached via goto from branches
         * above where 'alert' may not correspond to this command and jobdat
         * came from an earlier lookup - the cleanup below still runs */
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jobdat->jobid)));

        /* remove all of this job's children from the global list - do not lock
         * the thread as we are already locked
         */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = next) {
            child = (orte_odls_child_t*)item;
            next = opal_list_get_next(item);
            if (jobdat->jobid == child->name->jobid) {
                opal_list_remove_item(&orte_local_children, &child->super);
                OBJ_RELEASE(child);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jobdat->jobid);

        /* remove this job from our local job data since it is complete */
        opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
        OBJ_RELEASE(jobdat);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        /* indicate that the job is complete */
        return rc;
    }
    return ORTE_SUCCESS;
}
/*
 * Render a human-readable table for each job in the array: a header row,
 * a dashed separator, the job's data row, then its per-process table
 * (pretty_print_vpids).  The daemons' job (local jobid 0) is skipped
 * unless --daemons was requested.  Column widths are computed per job
 * from the wider of the header text and the data.
 */
static int pretty_print_jobs(orte_job_t **jobs, orte_std_cntr_t num_jobs)
{
    int len_jobid = 0, len_state = 0, len_slots = 0,
        len_vpid_r = 0, len_ckpt_s = 0, len_ckpt_r = 0, len_ckpt_l = 0;
    int line_len;
    orte_job_t *job;
    orte_std_cntr_t i;
    char *jobstr;
    /* local-jobid bits: a zero local jobid marks the daemons' job */
    orte_jobid_t mask=0x0000ffff;
#if OPAL_ENABLE_FT_CR == 1
    char * state_str = NULL;
#endif

    for(i=0; i < num_jobs; i++) {
        job = jobs[i];
        /* check the jobid to see if this is the daemons' job */
        if ((0 == (mask & job->jobid)) && !orte_ps_globals.daemons) {
            continue;
        }
        /* setup the printed name - do -not- free this! */
        jobstr = ORTE_JOBID_PRINT(job->jobid);

        /*
         * Caculate segment lengths
         */
        len_jobid = strlen(jobstr);;
        len_state = (int) (strlen(orte_job_state_to_str(job->state)) < strlen("State") ?
                           strlen("State") :
                           strlen(orte_job_state_to_str(job->state)));
        len_slots = 6;
        len_vpid_r = (int) strlen("Num Procs");
#if OPAL_ENABLE_FT_CR == 1
        /* NOTE(review): state_str appears to be allocated each iteration
         * and never freed - possible leak under FT_CR builds; confirm the
         * allocation semantics of orte_snapc_ckpt_state_str */
        orte_snapc_ckpt_state_str(&state_str, job->ckpt_state);
        len_ckpt_s = (int) (strlen(state_str) < strlen("Ckpt State") ?
                            strlen("Ckpt State") : strlen(state_str) );
        len_ckpt_r = (int) (NULL == job->ckpt_snapshot_ref ? strlen("Ckpt Ref") :
                            (strlen(job->ckpt_snapshot_ref) < strlen("Ckpt Ref") ?
                             strlen("Ckpt Ref") : strlen(job->ckpt_snapshot_ref) ) );
        len_ckpt_l = (int) (NULL == job->ckpt_snapshot_loc ? strlen("Ckpt Loc") :
                            (strlen(job->ckpt_snapshot_loc) < strlen("Ckpt Loc") ?
                             strlen("Ckpt Loc") : strlen(job->ckpt_snapshot_loc) ) );
#else
        /* -3 cancels the " | " separator widths for the disabled columns */
        len_ckpt_s = -3;
        len_ckpt_r = -3;
        len_ckpt_l = -3;
#endif

        line_len = (len_jobid + 3 +
                    len_state + 3 +
                    len_slots + 3 +
                    len_vpid_r + 3 +
                    len_ckpt_s + 3 +
                    len_ckpt_r + 3 +
                    len_ckpt_l)
                   + 2;

        /*
         * Print Header
         */
        printf("\n");
        printf("%*s | ", len_jobid , "JobID");
        printf("%*s | ", len_state , "State");
        printf("%*s | ", len_slots , "Slots");
        printf("%*s | ", len_vpid_r , "Num Procs");
#if OPAL_ENABLE_FT_CR == 1
        printf("%*s | ", len_ckpt_s , "Ckpt State");
        printf("%*s | ", len_ckpt_r , "Ckpt Ref");
        printf("%*s |",  len_ckpt_l , "Ckpt Loc");
#endif
        printf("\n");

        pretty_print_dashed_line(line_len);

        /*
         * Print Info
         */
        printf("%*s | ", len_jobid , ORTE_JOBID_PRINT(job->jobid));
        printf("%*s | ", len_state , orte_job_state_to_str(job->state));
        printf("%*d | ", len_slots , (uint)job->total_slots_alloc);
        printf("%*d | ", len_vpid_r, job->num_procs);
#if OPAL_ENABLE_FT_CR == 1
        printf("%*s | ", len_ckpt_s, state_str);
        printf("%*s | ", len_ckpt_r, (NULL == job->ckpt_snapshot_ref ?
                                      "" :
                                      job->ckpt_snapshot_ref) );
        printf("%*s |",  len_ckpt_l, (NULL == job->ckpt_snapshot_loc ?
                                      "" :
                                      job->ckpt_snapshot_loc) );
#endif
        printf("\n");

        pretty_print_vpids(job);

        printf("\n\n"); /* give a little room between job outputs */
    }

    return ORTE_SUCCESS;
}
static void job_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_job_state_t jobstate; opal_buffer_t *answer; int32_t rc, ret; int room, *rmptr; ORTE_ACQUIRE_OBJECT(caddy); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { return; } /* if the jdata is NULL, then we ignore it as this * is reporting an unrecoverable error */ if (NULL == caddy->jdata) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); OBJ_RELEASE(caddy); return; } /* update the state */ jdata = caddy->jdata; jobstate = caddy->job_state; jdata->state = jobstate; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:dvm: job %s reported state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), orte_job_state_to_str(jobstate))); if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { /* if the daemon job aborted and we haven't heard from everyone yet, * then this could well have been caused by a daemon not finding * a way back to us. In this case, output a message indicating a daemon * died without reporting. 
Otherwise, say nothing as we * likely already output an error message */ if (ORTE_JOB_STATE_ABORTED == jobstate && jdata->num_procs != jdata->num_reported) { orte_routing_is_enabled = false; orte_show_help("help-errmgr-base.txt", "failed-daemon", true); } /* there really isn't much else we can do since the problem * is in the DVM itself, so best just to terminate */ jdata->num_terminated = jdata->num_procs; /* activate the terminated state so we can exit */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); OBJ_RELEASE(caddy); return; } /* all other cases involve jobs submitted to the DVM - therefore, * we only inform the submitter of the problem, but do NOT terminate * the DVM itself */ rc = jobstate; answer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(caddy); return; } if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(caddy); return; } /* pack the room number */ rmptr = &room; if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) { if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(caddy); return; } } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:dvm sending notification of job %s failure to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), ORTE_NAME_PRINT(&jdata->originator))); if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, &jdata->originator, answer, ORTE_RML_TAG_LAUNCH_RESP, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); } /* ensure we terminate any processes left running in the DVM */ _terminate_job(jdata->jobid); /* cleanup */ OBJ_RELEASE(caddy); }
static void job_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_job_state_t jobstate; int rc; orte_plm_cmd_flag_t cmd; opal_buffer_t *alert; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { return; } /* if the jdata is NULL, then we abort as this * is reporting an unrecoverable error */ if (NULL == caddy->jdata) { ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); OBJ_RELEASE(caddy); return; } /* update the state */ jdata = caddy->jdata; jobstate = caddy->job_state; jdata->state = jobstate; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted: job %s reported error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), orte_job_state_to_str(jobstate))); switch (jobstate) { case ORTE_JOB_STATE_FAILED_TO_START: failed_start(jdata); break; case ORTE_JOB_STATE_COMM_FAILED: /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* order termination */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; break; case ORTE_JOB_STATE_HEARTBEAT_FAILED: /* let the HNP handle this */ goto cleanup; break; default: break; } alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); goto cleanup; } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } cleanup: OBJ_RELEASE(caddy); }
/*
 * JOB
 */
/*
 * Build a multi-line human-readable description of a job object into
 * *output (heap-allocated; caller frees).  prefix is prepended to each
 * line (a single space is used when prefix is NULL).  Returns
 * ORTE_SUCCESS, or an error code from opal_dss.print (in which case
 * *output remains NULL).
 *
 * FIX: the two error paths previously returned without freeing the
 * partially-built 'tmp' string and the 'pfx' prefix - both are now freed.
 * NOTE(review): asprintf return values are unchecked throughout, matching
 * the file's existing style; on allocation failure the pointers are
 * undefined - confirm acceptable.
 */
int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_type_t type)
{
    char *tmp, *tmp2, *tmp3, *pfx2, *pfx;
    int32_t i;
    int rc;
    orte_app_context_t *app;
    orte_proc_t *proc;

    /* set default result */
    *output = NULL;

    /* protect against NULL prefix */
    if (NULL == prefix) {
        asprintf(&pfx2, " ");
    } else {
        asprintf(&pfx2, "%s", prefix);
    }

    asprintf(&tmp, "\n%sData for job: %s\tRecovery: %s(%s)\n%s\tNum apps: %ld\tControls: %0x\tMPI allowed: %s\tStdin target: %s\tState: %s\tAbort: %s",
             pfx2, ORTE_JOBID_PRINT(src->jobid),
             (src->enable_recovery) ? "ENABLED" : "DISABLED",
             (src->recovery_defined) ? "DEFINED" : "DEFAULT",
             pfx2, (long)src->num_apps, src->controls,
             src->gang_launched ? "YES" : "NO",
             ORTE_VPID_PRINT(src->stdin_target),
             orte_job_state_to_str(src->state),
             src->abort ? "True" : "False");
    /* indented prefix for the nested app/map/proc sections */
    asprintf(&pfx, "%s\t", pfx2);
    free(pfx2);

    /* append each app context's description */
    for (i=0; i < src->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(src->apps, i))) {
            continue;
        }
        opal_dss.print(&tmp2, pfx, app, ORTE_APP_CONTEXT);
        asprintf(&tmp3, "%s\n%s", tmp, tmp2);
        free(tmp);
        free(tmp2);
        tmp = tmp3;
    }

    /* append the map, if one exists */
    if (NULL != src->map) {
        if (ORTE_SUCCESS != (rc = opal_dss.print(&tmp2, pfx, src->map, ORTE_JOB_MAP))) {
            ORTE_ERROR_LOG(rc);
            /* fix: release working buffers before bailing out */
            free(tmp);
            free(pfx);
            return rc;
        }
        asprintf(&tmp3, "%s%s", tmp, tmp2);
        free(tmp);
        free(tmp2);
        tmp = tmp3;
    } else {
        asprintf(&tmp2, "%s\n%sNo Map", tmp, pfx);
        free(tmp);
        tmp = tmp2;
    }

    asprintf(&tmp2, "%s\n%sNum procs: %ld\tOffset: %ld", tmp, pfx,
             (long)src->num_procs, (long)src->offset);
    free(tmp);
    tmp = tmp2;

    /* append each proc's description */
    for (i=0; i < src->procs->size; i++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(src->procs, i))) {
            continue;
        }
        if (ORTE_SUCCESS != (rc = opal_dss.print(&tmp2, pfx, proc, ORTE_PROC))) {
            ORTE_ERROR_LOG(rc);
            /* fix: release working buffers before bailing out */
            free(tmp);
            free(pfx);
            return rc;
        }
        asprintf(&tmp3, "%s%s", tmp, tmp2);
        free(tmp);
        free(tmp2);
        tmp = tmp3;
    }

    asprintf(&tmp2, "%s\n%s\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld",
             tmp, pfx, (long)src->num_launched, (long)src->num_reported,
             (long)src->num_terminated);
    free(tmp);
    tmp = tmp2;

    /* set the return */
    *output = tmp;
    free(pfx);

    return ORTE_SUCCESS;
}
static int update_state(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { int rc=ORTE_SUCCESS, i; orte_app_context_t *app; orte_node_t *node; orte_proc_t *pptr, *daemon, *pptr2; opal_buffer_t *notify; orcm_triplet_t *trp; orcm_source_t *src; bool procs_recovered; orte_job_t *jdt; uint16_t jfam; bool send_msg; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:update_state for job %s proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); /* protect against threads */ ORTE_ACQUIRE_THREAD(&ctl); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /*** UPDATE COMMAND FOR A JOB ***/ if (NULL == proc) { /* should only get this if a daemon restarted and we need * to check for procs waiting to migrate */ if (ORTE_JOB_STATE_PROCS_MIGRATING != jobstate) { /* we should never get this situation */ opal_output(0, "%s UNKNOWN JOB ERROR ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_job_state_to_str(jobstate)); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERROR; } /* cycle thru all known jobs looking for those with procs * awaiting resources to migrate */ for (i=0; i < orte_job_data->size; i++) { if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { continue; } if (ORTE_JOB_STATE_PROCS_MIGRATING != jdt->state) { continue; } /* reset the job */ orte_plm_base_reset_job(jdt); /* map the job again */ if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdt))) { ORTE_ERROR_LOG(rc); continue; } /* launch any procs that could be mapped - note that not * all procs that were waiting for migration may have * been successfully mapped, so this could in fact * result in no action by the daemons */ notify = OBJ_NEW(opal_buffer_t); /* indicate the target DVM */ jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); opal_dss.pack(notify, &jfam, 1, 
OPAL_UINT16); /* get the launch data */ if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(notify, jdt->jobid))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(notify); ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /* send it to the daemons */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL, ORCM_PNP_TAG_COMMAND, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /**** DEAL WITH INDIVIDUAL PROCS ****/ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:sched got state %s for proc %s pid %d exit_code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc), pid, exit_code)); /* if this was a failed comm or heartbeat */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* ignore this */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { /* get the proc object for this daemon */ if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } /* ensure that the heartbeat system knows to ignore this proc * from this point forward */ daemon->beat = 0; /* if we have already heard about this proc, ignore repeats */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == daemon->state) { /* already heard */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } #if 0 /* delete the route */ orte_routed.delete_route(proc); /* purge the oob */ orte_rml.purge(proc); #endif /* get the triplet/source and mark this source as "dead" */ if (NULL == (trp = orcm_get_triplet_stringid("orcmd:0.1:alpha"))) { opal_output(0, "%s CANNOT FIND DAEMON TRIPLET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } if (NULL == (src = orcm_get_source(trp, proc, false))) { opal_output(0, "%s DAEMON %s IS UNKNOWN SOURCE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); 
ORTE_RELEASE_THREAD(&trp->ctl); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } src->alive = false; ORTE_RELEASE_THREAD(&src->ctl); ORTE_RELEASE_THREAD(&trp->ctl); /* notify all apps immediately */ if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } notify = OBJ_NEW(opal_buffer_t); send_msg = false; for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); ORTE_RELEASE_THREAD(&ctl); return rc; } /* reset the proc stats */ OBJ_DESTRUCT(&pptr->stats); OBJ_CONSTRUCT(&pptr->stats, opal_pstats_t); /* since we added something, need to send msg */ send_msg = true; } if (send_msg) { /* send it to all apps */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL, ORCM_PNP_TAG_ERRMGR, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } else { OBJ_RELEASE(notify); } /* reset the node stats */ OBJ_DESTRUCT(&node->stats); OBJ_CONSTRUCT(&node->stats, opal_node_stats_t); /* record that the daemon died */ daemon->state = state; daemon->exit_code = exit_code; daemon->pid = 0; /* reset the daemon stats */ OBJ_DESTRUCT(&daemon->stats); OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t); node = daemon->node; if (NULL == node) { opal_output(0, "%s Detected failure of daemon %s on unknown node", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); /* can't do anything further */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } else { opal_output(0, "%s Detected failure of daemon %s on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), (NULL == node->name) ? 
"UNKNOWN" : node->name); } /* see if any usable daemons are left alive */ procs_recovered = false; for (i=2; i < daemon_job->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, i))) { continue; } if (ORTE_PROC_STATE_UNTERMINATED < pptr->state) { continue; } /* at least one alive! recover procs from the failed one */ recover_procs(proc); procs_recovered = true; break; } if (!procs_recovered) { daemon->node = NULL; node->state = ORTE_NODE_STATE_DOWN; node->daemon = NULL; /* mark all procs on this node as having terminated */ for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } /* get the job data object for this process */ if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) { /* major problem */ opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), i, orte_proc_state_to_str(pptr->state)); continue; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) { continue; } OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING PROC %s FROM NODE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); app->num_procs--; opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL); OBJ_RELEASE(pptr); /* clean it off the node */ opal_pointer_array_set_item(node->procs, i, NULL); node->num_procs--; /* maintain acctg */ OBJ_RELEASE(pptr); /* see if job is empty */ jdt->num_terminated++; if (jdt->num_procs <= jdt->num_terminated) { OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING JOB %s FROM ACTIVE ARRAY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdt->jobid))); opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL); OBJ_RELEASE(jdt); } } } ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } if (ORTE_PROC_STATE_RESTARTED == state) { 
OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s RESTART OF DAEMON %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* get the proc object for this daemon */ if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } /* if apps were on that node, notify all apps immediately that * those procs have failed */ if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } notify = OBJ_NEW(opal_buffer_t); send_msg = false; for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); ORTE_RELEASE_THREAD(&ctl); return rc; } /* since we added something, we need to send msg */ send_msg = true; /* remove the proc from the app so that it will get * restarted when we re-activate the config */ if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) { continue; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) { continue; } OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING PROC %s FROM NODE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); app->num_procs--; opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL); OBJ_RELEASE(pptr); /* clean it off the node */ opal_pointer_array_set_item(node->procs, i, NULL); node->num_procs--; /* maintain acctg */ OBJ_RELEASE(pptr); /* see if job is empty */ jdt->num_terminated++; if (jdt->num_procs <= jdt->num_terminated) { OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING JOB %s FROM ACTIVE ARRAY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdt->jobid))); 
opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL); OBJ_RELEASE(jdt); } } if (send_msg) { /* send it to all apps */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL, ORCM_PNP_TAG_ERRMGR, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } else { OBJ_RELEASE(notify); } /* reset the node stats */ OBJ_DESTRUCT(&node->stats); OBJ_CONSTRUCT(&node->stats, opal_node_stats_t); /* reset the daemon stats */ OBJ_DESTRUCT(&daemon->stats); OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t); /* don't restart procs - we'll do that later after * we allow time for multiple daemons to restart */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /* to arrive here is an error */ opal_output(0, "%s GOT UNRECOGNIZED STATE %s FOR PROC %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc)); return ORTE_ERROR; }
/*
 * State-machine callback for job error states (HNP side).
 *
 * @param fd/args  unused libevent parameters
 * @param cbdata   an orte_state_caddy_t carrying the job and the error state;
 *                 released here on every path
 *
 * Flow: record the new state on the job, set the abnormal-termination flag
 * if the failing job is our own daemon job, then either (a) mark everything
 * terminated for never-launched jobs, or (b) decode the aborted proc's exit
 * status for failed-to-start/launch jobs, and finally force system exit.
 */
static void job_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_job_state_t jobstate;
    orte_exit_code_t sts;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return;
    }

    /* if the jdata is NULL, then we abort as this
     * is reporting an unrecoverable error
     */
    if (NULL == caddy->jdata) {
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
        OBJ_RELEASE(caddy);
        return;
    }

    /* update the state */
    jdata = caddy->jdata;
    jobstate = caddy->job_state;
    jdata->state = jobstate;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_hnp: job %s reported state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

    /* set global flags */
    if (ORTE_PROC_MY_NAME->jobid == jdata->jobid && !orte_abnormal_term_ordered) {
        /* set the flag indicating that a daemon failed so we use the proper
         * methods for attempting to shutdown the rest of the system
         */
        orte_abnormal_term_ordered = true;
    }

    if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
        ORTE_JOB_STATE_ALLOC_FAILED == jobstate) {
        orte_never_launched = true;
        /* nothing ran, so account for all procs as terminated */
        jdata->num_terminated = jdata->num_procs;
        ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
        OBJ_RELEASE(caddy);
        return;
    }

    if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
        ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
        /* the job object for this job will have been NULL'd
         * in the array if the job was solely local. If it isn't
         * NULL, then we need to tell everyone else to die
         */
        if (NULL != jdata->aborted_proc) {
            sts = jdata->aborted_proc->exit_code;
            /* NOTE(review): if this is our own job, orte_abnormal_term_ordered
             * was already set above, so this branch then only runs when the
             * flag was clear on entry - confirm that is the intended gate
             */
            if (ORTE_PROC_MY_NAME->jobid == jdata->jobid && !orte_abnormal_term_ordered) {
                /* set the flag indicating that a daemon failed so we use the proper
                 * methods for attempting to shutdown the rest of the system
                 */
                orte_abnormal_term_ordered = true;
                if (WIFSIGNALED(sts)) { /* died on signal */
#ifdef WCOREDUMP
                    if (WCOREDUMP(sts)) {
                        orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
                                       WTERMSIG(sts));
                        sts = WTERMSIG(sts);
                    } else {
                        orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
                                       WTERMSIG(sts));
                        sts = WTERMSIG(sts);
                    }
#else
                    orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
                                   WTERMSIG(sts));
                    sts = WTERMSIG(sts);
#endif /* WCOREDUMP */
                } else {
                    orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
                                   WEXITSTATUS(sts));
                    sts = WEXITSTATUS(sts);
                }
            }
        }
    }

    /* abort the job */
    ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
    OBJ_RELEASE(caddy);
}
/*
 * Activate a job state: find the registered handler for the given state
 * (falling back to the ERROR handler for error states, then the ANY
 * handler), wrap the job + state in a caddy, and fire it as a libevent
 * event at the handler's priority.
 *
 * @param jdata  the job changing state; may be NULL for system-wide events
 * @param state  the job state being activated
 *
 * Fix vs. prior version: caddy->job_state is now set even when jdata is
 * NULL, so handlers that report the state for NULL-job activations (e.g.
 * job_errors) no longer read a default-constructed value.
 */
void orte_state_base_activate_job_state(orte_job_t *jdata, orte_job_state_t state)
{
    opal_list_item_t *itm, *any=NULL, *error=NULL;
    orte_state_t *s;
    orte_state_caddy_t *caddy;

    for (itm = opal_list_get_first(&orte_job_states);
         itm != opal_list_get_end(&orte_job_states);
         itm = opal_list_get_next(itm)) {
        s = (orte_state_t*)itm;
        if (s->job_state == ORTE_JOB_STATE_ANY) {
            /* save this place */
            any = itm;
        }
        if (s->job_state == ORTE_JOB_STATE_ERROR) {
            error = itm;
        }
        if (s->job_state == state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "%s ACTIVATING JOB %s STATE %s PRI %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
                                 orte_job_state_to_str(state), s->priority));
            if (NULL == s->cbfunc) {
                OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                     "%s NULL CBFUNC FOR JOB %s STATE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (NULL == jdata) ? "ALL" : ORTE_JOBID_PRINT(jdata->jobid),
                                     orte_job_state_to_str(state)));
                return;
            }
            caddy = OBJ_NEW(orte_state_caddy_t);
            /* always record the state so handlers see it even for NULL jobs */
            caddy->job_state = state;
            if (NULL != jdata) {
                caddy->jdata = jdata;
                OBJ_RETAIN(jdata);
            }
            opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
            opal_event_set_priority(&caddy->ev, s->priority);
            opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
            return;
        }
    }

    /* if we get here, then the state wasn't found, so execute
     * the default handler if it is defined
     */
    if (ORTE_JOB_STATE_ERROR < state && NULL != error) {
        s = (orte_state_t*)error;
    } else if (NULL != any) {
        s = (orte_state_t*)any;
    } else {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE NOT FOUND"));
        return;
    }
    if (NULL == s->cbfunc) {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE HANDLER NOT DEFINED"));
        return;
    }
    caddy = OBJ_NEW(orte_state_caddy_t);
    /* always record the state so handlers see it even for NULL jobs */
    caddy->job_state = state;
    if (NULL != jdata) {
        caddy->jdata = jdata;
        OBJ_RETAIN(jdata);
    }
    OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                         "%s ACTIVATING JOB %s STATE %s PRI %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(state), s->priority));
    opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
    opal_event_set_priority(&caddy->ev, s->priority);
    opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
}
void orte_state_base_check_all_complete(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata = caddy->jdata; orte_proc_t *proc; int i; orte_std_cntr_t j; orte_job_t *job; orte_node_t *node; orte_job_map_t *map; orte_std_cntr_t index; bool one_still_alive; orte_vpid_t lowest=0; int32_t i32, *i32ptr; opal_output_verbose(2, orte_state_base_framework.framework_output, "%s state:base:check_job_complete on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid)); if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { /* just check to see if the daemons are complete */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_complete - received NULL job, checking daemons", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto CHECK_DAEMONS; } else { /* mark the job as terminated, but don't override any * abnormal termination flags */ if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) { jdata->state = ORTE_JOB_STATE_TERMINATED; } } /* tell the IOF that the job is complete */ if (NULL != orte_iof.complete) { orte_iof.complete(jdata); } i32ptr = &i32; if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32) && !orte_abort_non_zero_exit) { if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { /* update the exit code */ ORTE_UPDATE_EXIT_STATUS(lowest); } /* warn user */ opal_output(orte_clean_output, "-------------------------------------------------------\n" "While %s job %s terminated normally, %d %s. Further examination may be required.\n" "-------------------------------------------------------", (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), i32, (1 == i32) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed declared job %s terminated with state %s - checking all jobs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), orte_job_state_to_str(jdata->state))); /* if this job is a continuously operating one, then don't do * anything further - just return here */ if (NULL != jdata && (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) || ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) { goto CHECK_ALIVE; } /* if the job that is being checked is the HNP, then we are * trying to terminate the orteds. In that situation, we * do -not- check all jobs - we simply notify the HNP * that the orteds are complete. Also check special case * if jdata is NULL - we want * to definitely declare the job done if the orteds * have completed, no matter what else may be happening. * This can happen if a ctrl-c hits in the "wrong" place * while launching */ CHECK_DAEMONS: if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { if (0 == orte_routed.num_routes()) { /* orteds are done! */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s orteds complete - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (NULL == jdata) { jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); OBJ_RELEASE(caddy); return; } OBJ_RELEASE(caddy); return; } /* Release the resources used by this job. Since some errmgrs may want * to continue using resources allocated to the job as part of their * fault recovery procedure, we only do this once the job is "complete". * Note that an aborted/killed job -is- flagged as complete and will * therefore have its resources released. 
We need to do this after * we call the errmgr so that any attempt to restart the job will * avoid doing so in the exact same place as the current job */ if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (proc->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(proc); } /* set the node location to NULL */ opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); /* flag that the node is no longer in a map */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; } CHECK_ALIVE: /* now check to see if all jobs are done - trigger notification of this jdata * object when we find it */ one_still_alive = false; for (j=1; j < orte_job_data->size; j++) { if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { /* since we are releasing jdata objects as we * go, we can no longer assume that the job_data * array is left justified */ continue; } /* if this is the job we are checking AND it normally terminated, * then activate the "notify_completed" state - this will release * the job state, but is provided so that 
the HNP main code can * take alternative actions if desired. If the state is killed_by_cmd, * then go ahead and release it. We cannot release it if it * abnormally terminated as mpirun needs the info so it can * report appropriately to the user * * NOTE: do not release the primary job (j=1) so we * can pretty-print completion message */ if (NULL != jdata && job->jobid == jdata->jobid) { if (jdata->state == ORTE_JOB_STATE_TERMINATED) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed state is terminated - activating notify", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED); one_still_alive = true; } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD || jdata->state == ORTE_JOB_STATE_NOTIFIED) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed state is killed or notified - cleaning up", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* release this object, ensuring that the * pointer array internal accounting * is maintained! */ if (1 < j) { if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { /* this was a debugger daemon. notify that a debugger has detached */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH); } opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ OBJ_RELEASE(jdata); } } continue; } /* if the job is flagged to not be monitored, skip it */ if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) { continue; } /* when checking for job termination, we must be sure to NOT check * our own job as it - rather obviously - has NOT terminated! 
*/ if (job->num_terminated < job->num_procs) { /* we have at least one job that is not done yet - we cannot * just return, though, as we need to ensure we cleanout the * job data for the job that just completed */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed job %s is not terminated (%d:%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid), job->num_terminated, job->num_procs)); one_still_alive = true; } else { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid), job->num_terminated, job->num_procs, (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) )); } } /* if a job is still alive, we just return */ if (one_still_alive) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed at least one job is not terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); OBJ_RELEASE(caddy); return; } /* if we get here, then all jobs are done, so terminate */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed all jobs terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* stop the job timeout event, if set */ if (NULL != orte_mpiexec_timeout) { OBJ_RELEASE(orte_mpiexec_timeout); orte_mpiexec_timeout = NULL; } /* set the exit status to 0 - this will only happen if it * wasn't already set by an error condition */ ORTE_UPDATE_EXIT_STATUS(0); /* order daemon termination - this tells us to cleanup * our local procs as well as telling remote daemons * to die */ orte_plm.terminate_orteds(); OBJ_RELEASE(caddy); }
int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { orte_proc_t *loc_proc = NULL; orte_job_t *jdata = NULL; int ret = ORTE_SUCCESS, exit_status = ORTE_SUCCESS; int32_t i; /* * if orte is trying to shutdown, just let it */ if( mca_errmgr_hnp_component.term_in_progress ) { return ORTE_SUCCESS; } if( NULL != proc_name && OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) { OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp(autor): Update reported on self (%s), state %s. Skip...", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc_name), orte_proc_state_to_str(state) )); return ORTE_SUCCESS; } /* * Get the job data object for this process */ if( NULL != proc_name ) { /* Get job from proc's jobid */ jdata = orte_get_job_data_object(proc_name->jobid); } else { /* Get from the general job */ jdata = orte_get_job_data_object(job); } if( NULL == jdata ) { opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); ret = ORTE_ERROR; ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * If this is a tool, ignore */ if( jdata->num_apps == 0 && OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp(autor): An external tool disconnected. Ignore...", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); exit_status = ORTE_SUCCESS; goto cleanup; } OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp(autor): job %s reported state %s" " for proc %s state %s exit_code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), orte_job_state_to_str(jobstate), (NULL == proc_name) ? 
"NULL" : ORTE_NAME_PRINT(proc_name), orte_proc_state_to_str(state), exit_code)); if( ORTE_JOB_STATE_RESTART == jobstate ) { for(i = 0; i < jdata->procs->size; ++i) { if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { continue; } break; } if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || ORTE_PROC_STATE_COMM_FAILED == state ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, proc_name, state)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { if( autor_mask_faults ) { mca_errmgr_hnp_component.ignore_current_update = true; orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code); } } cleanup: return ret; }