/*
 * Populate @coll with the set of peers this process routes to directly.
 * Only the HNP actually adds anything: it routes direct to every daemon
 * in its own job that is still functional (unterminated state and a
 * known RML contact URI).  Daemons and non-routing processes add nothing.
 */
static void get_routing_list(opal_list_t *coll)
{
    orte_namelist_t *nm;
    int32_t i;
    orte_job_t *jdata;
    orte_proc_t *proc;

    /* if I am anything other than a daemon or the HNP, this
     * is a meaningless command as I am not allowed to route.
     * BUG FIX: the original test used ||, which is true for every
     * process (no process is both a daemon and the HNP), so this
     * routine always returned without building the list. */
    if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) {
        return;
    }
    /* daemons don't route */
    if (ORTE_PROC_IS_DAEMON) {
        return;
    }
    /* HNP sends direct to each daemon */
    if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }
    /* start at index 1 - entry 0 is the HNP itself */
    for (i = 1; i < jdata->procs->size; i++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
            continue;
        }
        /* only include daemons we can actually reach */
        if (proc->state <= ORTE_PROC_STATE_UNTERMINATED &&
            NULL != proc->rml_uri) {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s get_routing_tree: Adding process %s state %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&(proc->name)),
                                 orte_proc_state_to_str(proc->state)));
            nm = OBJ_NEW(orte_namelist_t);
            nm->name.jobid = proc->name.jobid;
            nm->name.vpid = proc->name.vpid;
            opal_list_append(coll, &nm->super);
        } else {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s get_routing_tree: Skipped process %15s state %s (non functional daemon)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&(proc->name)),
                                 orte_proc_state_to_str(proc->state)));
        }
    }
}
/*
 * Register a callback for a process state in the state machine.
 * Refuses duplicate registrations for the same state.
 *
 * Returns ORTE_SUCCESS on success, ORTE_ERR_BAD_PARAM if the state
 * already has an entry.
 */
int orte_state_base_add_proc_state(orte_proc_state_t state,
                                   orte_state_cbfunc_t cbfunc,
                                   int priority)
{
    opal_list_item_t *it;
    orte_state_t *entry;

    /* the state must not already be registered */
    for (it = opal_list_get_first(&orte_proc_states);
         it != opal_list_get_end(&orte_proc_states);
         it = opal_list_get_next(it)) {
        entry = (orte_state_t*)it;
        if (state == entry->proc_state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "DUPLICATE STATE DEFINED: %s",
                                 orte_proc_state_to_str(state)));
            return ORTE_ERR_BAD_PARAM;
        }
    }

    /* create and append the new entry */
    entry = OBJ_NEW(orte_state_t);
    entry->proc_state = state;
    entry->cbfunc = cbfunc;
    entry->priority = priority;
    opal_list_append(&orte_proc_states, &(entry->super));

    return ORTE_SUCCESS;
}
/*
 * Emit a machine-parseable (colon-delimited) dump of the mpirun snapshot:
 * one summary line, optional per-node lines, then per-job and per-proc
 * lines.  Job 0 (the daemon job) is excluded from the job report.
 *
 * Always returns ORTE_SUCCESS.
 */
static int parseable_print(orte_ps_mpirun_info_t *hnpinfo)
{
    orte_job_t **joblist;
    orte_proc_t *procent;
    orte_app_context_t *appctx;
    char *appname;
    char *nodename;
    int jobidx, procidx;

    /* don't include the daemon job in the number of jobs reported */
    printf("mpirun:%lu:num nodes:%d:num jobs:%d\n",
           (unsigned long)hnpinfo->hnp->pid,
           hnpinfo->num_nodes,
           hnpinfo->num_jobs-1);

    /* one line per node, if requested */
    if (orte_ps_globals.nodes) {
        orte_node_t **nodelist = hnpinfo->nodes;
        int nodeidx;
        for (nodeidx = 0; nodeidx < hnpinfo->num_nodes; nodeidx++) {
            printf("node:%s:state:%s:slots:%d:in use:%d\n",
                   nodelist[nodeidx]->name,
                   pretty_node_state(nodelist[nodeidx]->state),
                   nodelist[nodeidx]->slots,
                   nodelist[nodeidx]->slots_inuse);
        }
    }

    joblist = hnpinfo->jobs;
    /* skip job=0 as that's the daemon job */
    for (jobidx = 1; jobidx < hnpinfo->num_jobs; jobidx++) {
        orte_job_t *job = joblist[jobidx];

        printf("jobid:%d:state:%s:slots:%d:num procs:%d\n",
               ORTE_LOCAL_JOBID(job->jobid),
               orte_job_state_to_str(job->state),
               job->total_slots_alloc,
               job->num_procs);

        /* print the proc info */
        for (procidx = 0; procidx < job->procs->size; procidx++) {
            procent = (orte_proc_t*)opal_pointer_array_get_item(job->procs, procidx);
            if (NULL == procent) {
                continue;
            }
            appctx = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, procent->app_idx);
            /* appname is always heap-allocated so it can be freed below */
            appname = (NULL == appctx) ? strdup("NULL") : opal_basename(appctx->app);

            nodename = NULL;
            orte_get_attribute(&procent->attributes, ORTE_PROC_NODENAME,
                               (void**)&nodename, OPAL_STRING);

            printf("process:%s:rank:%s:pid:%lu:node:%s:state:%s\n",
                   appname,
                   ORTE_VPID_PRINT(procent->name.vpid),
                   (unsigned long)procent->pid,
                   (NULL == nodename) ? "unknown" : nodename,
                   orte_proc_state_to_str(procent->state));

            free(appname);
            if (NULL != nodename) {
                free(nodename);
            }
        }
    }
    return ORTE_SUCCESS;
}
void orte_state_base_print_proc_state_machine(void) { opal_list_item_t *item; orte_state_t *st; opal_output(0, "ORTE_PROC_STATE_MACHINE:"); for (item = opal_list_get_first(&orte_proc_states); item != opal_list_get_end(&orte_proc_states); item = opal_list_get_next(item)) { st = (orte_state_t*)item; opal_output(0, "\tState: %s cbfunc: %s", orte_proc_state_to_str(st->proc_state), (NULL == st->cbfunc) ? "NULL" : "DEFINED"); } }
/*
 * errmgr default_app state-update hook.  Logs the report and, for a
 * COMM_FAILED event that is not our own connection, asks the routed
 * framework whether the lost route was our lifeline; if so we cannot
 * recover.  All other reports are accepted silently.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_UNRECOVERABLE if the lifeline
 * connection was lost.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    orte_ns_cmp_bitmask_t fields;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_app: job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), exit_code));

    /* if orte is trying to shutdown, just let it */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    /* only comm failures require any action here */
    if (ORTE_PROC_STATE_COMM_FAILED != state) {
        return ORTE_SUCCESS;
    }

    /* if it is our own connection, ignore it */
    fields = ORTE_NS_CMP_ALL;
    if (OPAL_EQUAL == orte_util_compare_name_fields(fields, ORTE_PROC_MY_NAME, proc)) {
        return ORTE_SUCCESS;
    }

    /* see if this was a lifeline */
    if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
        return ORTE_ERR_UNRECOVERABLE;
    }

    return ORTE_SUCCESS;
}
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_tool: proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&caddy->name), orte_proc_state_to_str(caddy->proc_state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { OBJ_RELEASE(caddy); return; } /* all errors require abort */ orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL); OBJ_RELEASE(caddy); }
int orte_errmgr_hnp_autor_global_update_state(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc_name, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { orte_proc_t *loc_proc = NULL; orte_job_t *jdata = NULL; int ret = ORTE_SUCCESS, exit_status = ORTE_SUCCESS; int32_t i; /* * if orte is trying to shutdown, just let it */ if( mca_errmgr_hnp_component.term_in_progress ) { return ORTE_SUCCESS; } if( NULL != proc_name && OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) { OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp(autor): Update reported on self (%s), state %s. Skip...", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc_name), orte_proc_state_to_str(state) )); return ORTE_SUCCESS; } /* * Get the job data object for this process */ if( NULL != proc_name ) { /* Get job from proc's jobid */ jdata = orte_get_job_data_object(proc_name->jobid); } else { /* Get from the general job */ jdata = orte_get_job_data_object(job); } if( NULL == jdata ) { opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); ret = ORTE_ERROR; ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * If this is a tool, ignore */ if( jdata->num_apps == 0 && OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp(autor): An external tool disconnected. Ignore...", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); exit_status = ORTE_SUCCESS; goto cleanup; } OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp(autor): job %s reported state %s" " for proc %s state %s exit_code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), orte_job_state_to_str(jobstate), (NULL == proc_name) ? 
"NULL" : ORTE_NAME_PRINT(proc_name), orte_proc_state_to_str(state), exit_code)); if( ORTE_JOB_STATE_RESTART == jobstate ) { for(i = 0; i < jdata->procs->size; ++i) { if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { continue; } break; } if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || ORTE_PROC_STATE_COMM_FAILED == state ) { if( ORTE_SUCCESS != (ret = orte_errmgr_hnp_autor_global_process_fault(jdata, proc_name, state)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { if( autor_mask_faults ) { mca_errmgr_hnp_component.ignore_current_update = true; orte_errmgr_hnp_update_proc(jdata, proc_name, state, 0, exit_code); } } cleanup: return ret; }
/*
 * errmgr default_orted state-update handler.
 *
 * Handles two shapes of report:
 *   - proc == NULL: a whole-job update.  Local children are updated/killed
 *     as required and an ORTE_PLM_UPDATE_PROC_STATE alert is sent to the HNP.
 *   - proc != NULL: a single-process update.  Abnormal terminations are
 *     reported to the HNP immediately; normal terminations are accumulated
 *     until no live children remain, at which point the whole job's state
 *     is reported and local bookkeeping (children list, jobdat, session
 *     directory) is torn down.
 *
 * Returns ORTE_SUCCESS, an ORTE error code from packing/sending, or
 * ORTE_ERR_UNRECOVERABLE for a job-level comm failure.
 *
 * NOTE(review): control flow is delicate here - several error paths
 * `goto FINAL_CLEANUP`, a label that lives INSIDE the terminal
 * `if (!any_live_children(...))` block near the bottom (legal in C, but
 * those jumps run the job-teardown code with whatever jobdat currently
 * holds).  Modify with care.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    opal_list_item_t *item, *next;
    orte_odls_job_t *jobdat = NULL;
    orte_odls_child_t *child;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    orte_vpid_t null=ORTE_VPID_INVALID;  /* sentinel vpid used as an end-of-list marker */
    orte_ns_cmp_bitmask_t mask;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "errmgr:default_orted:update_state() %s) "
                         "------- %s state updated for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ((NULL == proc) ? "App. Process" :
                          (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
        ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        return ORTE_SUCCESS;
    }

    /*** UPDATE COMMAND FOR A JOB ***/
    if (NULL == proc) {
        /* this is an update for an entire job */
        if (ORTE_JOBID_INVALID == job) {
            /* whatever happened, we don't know what job
             * it happened to - alert the HNP with the invalid jobid */
            orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error",
                           true, orte_job_state_to_str(jobstate));
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* pack the "invalid" jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
            return rc;
        }

        /* lookup the local jobdat for this job.
         * NOTE(review): if the list is non-empty but no entry matches,
         * jobdat is left pointing at the LAST list item, not NULL -
         * the NULL check below only catches the empty-list case.
         * Confirm whether a non-matching lookup is possible here. */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;
            /* is this the specified job? */
            if (jobdat->jobid == job) {
                break;
            }
        }
        if (NULL == jobdat) {
            return ORTE_ERR_NOT_FOUND;
        }

        switch (jobstate) {
        case ORTE_JOB_STATE_FAILED_TO_START:
            failed_start(jobdat, exit_code);
            break;
        case ORTE_JOB_STATE_RUNNING:
            /* update all local child states */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
            break;
        case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
            /* update all procs in job */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            /* order all local procs for this job to be killed */
            killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
            /* NOTE(review): no break here - execution falls through into
             * the COMM_FAILED case, killing ALL local procs and returning
             * ORTE_ERR_UNRECOVERABLE.  Confirm this fallthrough is intended. */
        case ORTE_JOB_STATE_COMM_FAILED:
            /* kill all local procs */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* tell the caller we can't recover */
            return ORTE_ERR_UNRECOVERABLE;
            break;
        case ORTE_JOB_STATE_HEARTBEAT_FAILED:
            /* let the HNP handle this */
            return ORTE_SUCCESS;
            break;
        default:
            break;
        }

        /* alert the HNP of the job-level change */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the job info */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }
        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            return ORTE_SUCCESS;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - ignore */
            return ORTE_SUCCESS;
        }
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            /* kill our children */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* terminate - our routed children will see
             * us leave and automatically die */
            orte_quit();
        }
        /* if not, then indicate we can continue */
        return ORTE_SUCCESS;
    }

    /* lookup the local jobdat for this job */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;
        /* is this the specified job? */
        if (jobdat->jobid == proc->jobid) {
            break;
        }
    }
    if (NULL == jobdat) {
        /* must already be complete */
        return ORTE_SUCCESS;
    }

    /* if there are no local procs for this job, we can
     * ignore this call */
    if (0 == jobdat->num_local_procs) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                         "%s errmgr:default_orted got state %s for proc %s pid %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid));

    /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/
    if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                /* only act if the child hasn't already terminated */
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                    /* Decrement the number of local procs */
                    jobdat->num_local_procs--;
                    /* kill this proc */
                    killprocs(proc->jobid, proc->vpid);
                }
                return ORTE_SUCCESS;
            }
        }
    }

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        if (!orte_abort_non_zero_exit) {
            /* treat this as normal termination */
            goto REPORT_STATE;
        }
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                }
                /* now pack the child's info */
                if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                /* remove the child from our local list as it is no longer alive */
                opal_list_remove_item(&orte_local_children, &child->super);
                /* Decrement the number of local procs */
                jobdat->num_local_procs--;
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                     "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(child->name),
                                     jobdat->num_local_procs));
                /* release the child object */
                OBJ_RELEASE(child);
                /* done with loop */
                break;
            }
        }
        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

 REPORT_STATE:
    /* find this proc in the local children so we can update its state */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
            if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                child->state = state;
                if (0 < pid) {
                    child->pid = pid;
                }
                child->exit_code = exit_code;
            }
            /* done with loop */
            break;
        }
    }

    if (ORTE_PROC_STATE_REGISTERED == state) {
        /* see if everyone in this job has registered */
        if (all_children_registered(proc->jobid)) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s errmgr:default_orted: sending contact info to HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            alert = OBJ_NEW(opal_buffer_t);
            /* pack init routes command */
            cmd = ORTE_PLM_INIT_ROUTES_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack all the local child vpids and epochs */
            for (item = opal_list_get_first(&orte_local_children);
                 item != opal_list_get_end(&orte_local_children);
                 item = opal_list_get_next(item)) {
                child = (orte_odls_child_t*)item;
                if (child->name->jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto FINAL_CLEANUP;
                    }
                }
            }
            /* pack an invalid marker to terminate the vpid list */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* add in contact info for all procs in the job */
            if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) {
                ORTE_ERROR_LOG(rc);
                /* NOTE(review): alert is a pointer; OBJ_DESTRUCT(&alert) destructs
                 * the pointer variable's address rather than OBJ_RELEASE(alert) -
                 * confirm this is the intended cleanup */
                OBJ_DESTRUCT(&alert);
                return rc;
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }
        return rc;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        /* lookup the local jobdat for this job */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;
            /* is this the specified job? */
            if (jobdat->jobid == proc->jobid) {
                break;
            }
        }
        if (NULL == jobdat) {
            /* race condition - may not have been formed yet */
            return ORTE_SUCCESS;
        }
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }

    FINAL_CLEANUP:
        /* tear down all local state for the completed job and send the
         * alert.  Reached both by normal fallthrough and by the error
         * gotos above (see function-header NOTE). */
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jobdat->jobid)));

        /* remove all of this job's children from the global list - do not lock
         * the thread as we are already locked */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = next) {
            child = (orte_odls_child_t*)item;
            next = opal_list_get_next(item);
            if (jobdat->jobid == child->name->jobid) {
                opal_list_remove_item(&orte_local_children, &child->super);
                OBJ_RELEASE(child);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jobdat->jobid);

        /* remove this job from our local job data since it is complete */
        opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
        OBJ_RELEASE(jobdat);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        /* indicate that the job is complete */
        return rc;
    }
    return ORTE_SUCCESS;
}
/*
 * Pretty-print a table of every proc in @job: process name, ORTE name,
 * local rank, pid, node, state, and (when FT/CR support is compiled in)
 * checkpoint state/ref/location.  Two passes: the first sizes each column,
 * the second prints the rows.
 *
 * Always returns ORTE_SUCCESS.
 */
static int pretty_print_vpids(orte_job_t *job) {
    int len_o_proc_name = 0,
        len_proc_name = 0,
        len_rank = 0,
        len_pid = 0,
        len_state = 0,
        len_node = 0,
        len_ckpt_s = 0,
        len_ckpt_r = 0,
        len_ckpt_l = 0;
    int i, line_len;
    orte_vpid_t v;
    orte_proc_t *vpid;
    orte_app_context_t *app;
    char *o_proc_name;
#if OPAL_ENABLE_FT_CR == 1
    char *state_str = NULL;
#endif

    /*
     * Calculate segment lengths
     */
    len_o_proc_name = (int)strlen("ORTE Name");
    len_proc_name = (int)strlen("Process Name");
    len_rank = (int)strlen("Local Rank");
    len_pid = 6;   /* minimum width of the PID column */
    len_state = 0;
    len_node = 0;
#if OPAL_ENABLE_FT_CR == 1
    len_ckpt_s = strlen("Ckpt State");
    len_ckpt_r = strlen("Ckpt Ref");
    len_ckpt_l = strlen("Ckpt Loc");
#else
    /* -3 cancels the " | " separator width in the line_len sum below */
    len_ckpt_s = -3;
    len_ckpt_r = -3;
    len_ckpt_l = -3;
#endif

    for(v=0; v < job->num_procs; v++) {
        char *rankstr;
        vpid = (orte_proc_t*)job->procs->addr[v];

        /*
         * Find my app context
         */
        if( 0 >= (int)job->num_apps ) {
            /* no app contexts: this is the daemon job (orterun/orted) */
            if( 0 == vpid->name.vpid ) {
                if( (int)strlen("orterun") > len_proc_name)
                    len_proc_name = strlen("orterun");
            } else {
                if( (int)strlen("orted") > len_proc_name)
                    len_proc_name = strlen("orted");
            }
        }
        for( i = 0; i < (int)job->num_apps; ++i) {
            app = (orte_app_context_t*)job->apps->addr[i];
            if( app->idx == vpid->app_idx ) {
                if( (int)strlen(app->app) > len_proc_name)
                    len_proc_name = strlen(app->app);
                break;
            }
        }

        o_proc_name = orte_util_print_name_args(&vpid->name);
        if ((int)strlen(o_proc_name) > len_o_proc_name)
            len_o_proc_name = strlen(o_proc_name);

        asprintf(&rankstr, "%u", (uint)vpid->local_rank);
        if ((int)strlen(rankstr) > len_rank)
            len_rank = strlen(rankstr);
        free(rankstr);

        if( NULL != vpid->nodename && (int)strlen(vpid->nodename) > len_node) {
            len_node = strlen(vpid->nodename);
        } else if ((int)strlen("Unknown") > len_node) {
            len_node = strlen("Unknown");
        }

        if( (int)strlen(orte_proc_state_to_str(vpid->state)) > len_state)
            len_state = strlen(orte_proc_state_to_str(vpid->state));

#if OPAL_ENABLE_FT_CR == 1
        orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state);
        if( (int)strlen(state_str) > len_ckpt_s)
            len_ckpt_s = strlen(state_str);
        /* BUG FIX: state_str is returned via the char** out-param and was
         * leaked on every sizing iteration */
        free(state_str);
        state_str = NULL;

        if( NULL != vpid->ckpt_snapshot_ref &&
            (int)strlen(vpid->ckpt_snapshot_ref) > len_ckpt_r)
            len_ckpt_r = strlen(vpid->ckpt_snapshot_ref);

        if( NULL != vpid->ckpt_snapshot_loc &&
            (int)strlen(vpid->ckpt_snapshot_loc) > len_ckpt_l)
            len_ckpt_l = strlen(vpid->ckpt_snapshot_loc);
#endif
    }

    line_len = (len_o_proc_name + 3 +
                len_proc_name + 3 +
                len_rank + 3 +
                len_pid + 3 +
                len_state + 3 +
                len_node + 3 +
                len_ckpt_s + 3 +
                len_ckpt_r + 3 +
                len_ckpt_l) + 2;

    /*
     * Print Header
     */
    printf("\t");
    printf("%*s | ", len_proc_name , "Process Name");
    printf("%*s | ", len_o_proc_name , "ORTE Name");
    printf("%*s | ", len_rank , "Local Rank");
    printf("%*s | ", len_pid , "PID");
    printf("%*s | ", len_node , "Node");
    printf("%*s | ", len_state , "State");
#if OPAL_ENABLE_FT_CR == 1
    printf("%*s | ", len_ckpt_s , "Ckpt State");
    printf("%*s | ", len_ckpt_r , "Ckpt Ref");
    printf("%*s |",  len_ckpt_l , "Ckpt Loc");
#endif
    printf("\n");

    printf("\t");
    pretty_print_dashed_line(line_len);

    /*
     * Print Info
     */
    for(v=0; v < job->num_procs; v++) {
        vpid = (orte_proc_t*)job->procs->addr[v];

        printf("\t");
        if( 0 >= (int)job->num_apps ) {
            if( 0 == vpid->name.vpid ) {
                printf("%*s | ", len_proc_name, "orterun");
            } else {
                printf("%*s | ", len_proc_name, "orted");
            }
        }
        for( i = 0; i < (int)job->num_apps; ++i) {
            app = (orte_app_context_t*)job->apps->addr[i];
            if( app->idx == vpid->app_idx ) {
                printf("%*s | ", len_proc_name, app->app);
                break;
            }
        }

        o_proc_name = orte_util_print_name_args(&vpid->name);

        printf("%*s | ", len_o_proc_name, o_proc_name);
        printf("%*u | ", len_rank , (uint)vpid->local_rank);
        printf("%*d | ", len_pid , vpid->pid);
        printf("%*s | ", len_node , (NULL == vpid->nodename) ? "Unknown" : vpid->nodename);
        printf("%*s | ", len_state , orte_proc_state_to_str(vpid->state));

#if OPAL_ENABLE_FT_CR == 1
        /* BUG FIX: recompute the checkpoint state for THIS proc - the
         * original printed the stale value left over from the last
         * iteration of the sizing loop for every row */
        orte_snapc_ckpt_state_str(&state_str, vpid->ckpt_state);
        printf("%*s | ", len_ckpt_s, state_str);
        free(state_str);
        state_str = NULL;
        printf("%*s | ", len_ckpt_r, (NULL == vpid->ckpt_snapshot_ref ?
                                      "" : vpid->ckpt_snapshot_ref));
        printf("%*s |",  len_ckpt_l, (NULL == vpid->ckpt_snapshot_loc ?
                                      "" : vpid->ckpt_snapshot_loc));
#endif
        printf("\n");
    }

    return ORTE_SUCCESS;
}
static void track_procs(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; opal_output_verbose(2, orte_state_base_framework.framework_output, "%s state:staged_hnp:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } /* if this is a registration, check to see if it came from * inside MPI_Init - if it did, that is not acceptable */ if (ORTE_PROC_STATE_REGISTERED == state) { if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_AS_MPI) && !ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_GANG_LAUNCHED)) { /* we can't support this - issue an error and abort */ orte_show_help("help-state-staged-hnp.txt", "mpi-procs-not-supported", true); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT); } /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } OBJ_RELEASE(caddy); return; } if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) { goto terminated; } OBJ_RELEASE(caddy); return; } if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) { goto terminated; } OBJ_RELEASE(caddy); return; } /* if the proc terminated, see if any other procs are * waiting to run. We assume that the app_contexts are * in priority order, with the highest priority being * at position 0 in the app_context array for this job */ if (ORTE_PROC_STATE_TERMINATED == state) { terminated: /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = ORTE_PROC_STATE_TERMINATED; if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { /* no other procs are waiting, so end this job */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } else if (jdata->num_mapped < jdata->num_procs) { /* schedule the job for re-mapping so that procs * waiting for resources can execute */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); } /* otherwise, do nothing until more procs terminate */ OBJ_RELEASE(caddy); return; } }
/*
 * Base proc-state tracker shared by state components.  Advances per-proc
 * state and drives job-level transitions as procs report RUNNING /
 * REGISTERED / IOF_COMPLETE / WAITPID_FIRED / TERMINATED.  On a daemon
 * that has been ordered to terminate and has no routes left, triggers
 * DAEMONS_TERMINATED once no local children remain alive.
 * Releases the caddy on every path via the cleanup label.
 */
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;
    int i;

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    /* NOTE(review): pdata is dereferenced below without a NULL check -
     * confirm a track_procs event can never arrive for a vpid with no
     * entry in jdata->procs */
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            /* debugger daemons get a dedicated transition */
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
            } else {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
            }
        }
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        pdata->state = state;
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        /* a proc is terminated only once both IOF and waitpid have fired */
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        pdata->state = state;
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = state;
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* Clean up the session directory as if we were the process
             * itself. This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
        }
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        if (orte_orteds_term_ordered &&
            0 == orte_routed.num_routes()) {
            /* CAUTION: this scan reuses pdata; if any child is still alive
             * we goto cleanup before pdata is used again, so the target
             * proc's pointer is never needed after this point on that path */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                    ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:base all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
        /* track job status */
        jdata->num_terminated++;
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
/*
 * State-machine event callback (staged-orted flavor): update local
 * bookkeeping for a single process state transition reported in the
 * caddy.  The fd/argc parameters come from the event library and are
 * unused.  Ownership: this callback owns the caddy and releases it on
 * every exit path via the cleanup label.
 */
static void track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;   /* released at cleanup */
    orte_process_name_t *proc = &caddy->name;                  /* proc whose state changed */
    orte_proc_state_t state = caddy->proc_state;               /* the new state */
    orte_job_t *jdata;
    orte_proc_t *pdata;

    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                         "%s state:staged_orted:track_procs called for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    /* NOTE(review): pdata is dereferenced below without a NULL check -
     * assumes the proc is always present in jdata->procs; confirm with
     * the activation sites */
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    switch (state) {
    case ORTE_PROC_STATE_RUNNING:
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        /* we don't really care - nothing further to do */
        break;

    case ORTE_PROC_STATE_REGISTERED:
        /* update the proc state */
        pdata->state = state;
        /* if this proc registered as an MPI proc, and
         * MPI is not allowed, then that is an error */
        if (!jdata->gang_launched && pdata->mpi_proc) {
            /* abort the proc */
            /* notify the HNP of the error */
            /* NOTE(review): both actions above are unimplemented - the
             * error condition is detected but silently ignored here */
        }
        break;

    case ORTE_PROC_STATE_IOF_COMPLETE:
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs */
        pdata->iof_complete = true;
        if (pdata->waitpid_recvd) {
            /* both IOF and waitpid have fired - the proc has terminated */
            pdata->alive = false;
            pdata->state = ORTE_PROC_STATE_TERMINATED;
            /* retrieve any file maps posted by this process and forward them
             * to the HNP for collection */
            orte_dfs.get_file_map(proc, send_fms, pdata);
        }
        /* Release the stdin IOF file descriptor for this child, if one
         * was defined.  File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process.
         * Do this after we handle termination in case the IOF needs
         * to check to see if all procs from the job are actually terminated */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        break;

    case ORTE_PROC_STATE_WAITPID_FIRED:
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs */
        pdata->waitpid_recvd = true;
        if (pdata->iof_complete) {
            /* both waitpid and IOF have fired - the proc has terminated */
            pdata->alive = false;
            pdata->state = ORTE_PROC_STATE_TERMINATED;
            /* retrieve any file maps posted by this process and forward them
             * to the HNP for collection */
            orte_dfs.get_file_map(proc, send_fms, pdata);
        }
        break;

    default:
        /* all other states are ignored by this module */
        break;
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
/*
 * Base state-machine callback for tracking process state transitions.
 * Updates per-job launch/report/termination counters, activates the
 * corresponding job-level states when everyone has reached a milestone,
 * and - on termination - handles local cleanup, possible daemon exit,
 * and completion/abort notifications.  Owns and releases the caddy.
 */
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;   /* released at cleanup */
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;
    int i;
    char *rtmod;
    orte_process_name_t parent, target, *npptr;

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get our "lifeline" routed module */
    rtmod = orte_rml.get_routed(orte_mgmt_conduit);

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    /* NOTE(review): pdata is dereferenced below without a NULL check -
     * assumes the proc is always present in jdata->procs */
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state - never move a proc backwards out of
         * a terminal state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            /* everyone is running - advance the job state */
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
            } else {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
            }
        }
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state unless already terminated */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state unless already terminated */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined.  File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            /* waitpid already fired - the proc is fully terminated */
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state unless already terminated */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            /* IOF already complete - the proc is fully terminated */
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* tell the PMIx subsystem to cleanup this client */
            opal_pmix.server_deregister_client(proc, NULL, NULL);
            /* Clean up the session directory as if we were the process
             * itself.  This covers the case where the process died abnormally
             * and didn't cleanup its own session directory. */
            orte_session_dir_finalize(proc);
        }
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job) */
        if (orte_orteds_term_ordered &&
            0 == orte_routed.num_routes(rtmod)) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                    ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:base all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
        /* track job status */
        jdata->num_terminated++;
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            /* if they requested notification upon completion, provide it */
            if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
                /* notify_completion => notify the parent of the termination
                 * of this child job.  So get the parent jobid info */
                npptr = &parent;
                if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) {
                    /* no parent recorded - notify everyone who asked for it */
                    target.jobid = jdata->jobid;
                    target.vpid = ORTE_VPID_WILDCARD;
                    _send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD);
                } else {
                    /* notify only the parent */
                    target.jobid = jdata->jobid;
                    target.vpid = ORTE_VPID_WILDCARD;
                    _send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent);
                }
            }
        } else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
                   !orte_job_term_ordered) {
            /* if this was an abnormal term, notify the other procs of the termination */
            parent.jobid = jdata->jobid;
            parent.vpid = ORTE_VPID_WILDCARD;
            _send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent);
        }
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; char *rtmod; orte_std_cntr_t index; orte_job_map_t *map; orte_node_t *node; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, notify the HNP */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, 
&pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); /* Release the stdin IOF file descriptor for this child, if one * was defined. File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process * Do this after we handle termination in case the IOF needs * to check to see if all procs from the job are actually terminated */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDALL); } if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. 
This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ rtmod = orte_rml.get_routed(orte_mgmt_conduit); if (orte_orteds_term_ordered && 0 == orte_routed.num_routes(rtmod)) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted all routes gone but proc %s still alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pdata->name))); goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent 
it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); /* cleanup the procs as these are gone */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } /* if this child is part of the job... */ if (pptr->name.jobid == jdata->jobid) { /* clear the entry in the local children */ opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(pptr); // maintain accounting } } /* tell the IOF that the job is complete */ if (NULL != orte_iof.complete) { orte_iof.complete(jdata); } /* tell the PMIx subsystem the job is complete */ if (NULL != opal_pmix.server_deregister_nspace) { opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL); } /* release the resources */ if (NULL != jdata->map) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:orted releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (pptr->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:orted releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(pptr); } /* set the node location to NULL */ opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); /* flag that the node is 
no longer in a map */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; } /* cleanup the job info */ opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL); OBJ_RELEASE(jdata); } } cleanup: OBJ_RELEASE(caddy); }
static void sample(int fd, short event, void *arg) { float prob; orte_proc_t *child; int i; /* if we are not sampling any more, then just return */ if (NULL == sample_ev) { return; } OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester considering killing something", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* are we including ourselves? */ if (ORTE_PROC_IS_DAEMON && 0 < mca_sensor_ft_tester_component.daemon_fail_prob) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester considering killing me!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* roll the dice */ prob = (double)random() / (double)INT32_MAX; if (prob < mca_sensor_ft_tester_component.daemon_fail_prob) { /* commit suicide */ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester committing suicide", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); orte_errmgr.abort(1, NULL); return; } } /* see if we should kill a child */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (!child->alive || 0 == child->pid || ORTE_PROC_STATE_UNTERMINATED < child->state) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), child->alive ? "TRUE" : "FALSE", (unsigned long)child->pid, orte_proc_state_to_str(child->state))); continue; } /* roll the dice */ prob = (double)random() / (double)INT32_MAX; OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester child: %s dice: %f prob %f", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), prob, mca_sensor_ft_tester_component.fail_prob)); if (prob < mca_sensor_ft_tester_component.fail_prob) { /* you shall die... 
*/ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sample:ft_tester killing %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name))); kill(child->pid, SIGTERM); /* are we allowing multiple deaths */ if (!mca_sensor_ft_tester_component.multi_fail) { break; } } } /* restart the timer */ if (NULL != sample_ev) { opal_event_evtimer_add(sample_ev, &sample_time); } }
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; int8_t flag; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (NULL == pdata) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { 
continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_AS_MPI)) { flag = 1; } else { flag = 0; } if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &flag, 1, OPAL_INT8))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } } cleanup: OBJ_RELEASE(caddy); }
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_proc_t *pptr; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_proc_t *child, *ptr; opal_buffer_t *alert; orte_plm_cmd_flag_t cmd; int rc=ORTE_SUCCESS; orte_vpid_t null=ORTE_VPID_INVALID; orte_ns_cmp_bitmask_t mask=ORTE_NS_CMP_ALL; int i; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default_orted:proc_errors process %s error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { goto cleanup; } /* if this was a failed comm, then see if it was to our * lifeline */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { goto cleanup; } /* was it a daemon? 
*/ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - ignore */ goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default:orted daemon %s exited", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* see if this was a lifeline */ if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:orted daemon %s was a lifeline - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* are any of my children still alive */ for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (child->alive && child->state < ORTE_PROC_STATE_UNTERMINATED) { goto cleanup; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_TERMINATE(0); } else { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default:orted not exiting, num_routes() == %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } /* if not, then we can continue */ goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* must already be complete */ goto cleanup; } pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); /* if there are no local procs for this job, we can * ignore this call */ if (0 == jdata->num_local_procs) { goto cleanup; } /* find this proc in the local children */ child = NULL; for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = 
(orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &ptr->name, proc)) { child = ptr; break; } } if (NULL == child) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default_orted got state %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc))); if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { child->state = state; /* Decrement the number of local procs */ jdata->num_local_procs--; /* kill this proc */ killprocs(proc->jobid, proc->vpid); goto cleanup; } if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { if (!orte_abort_non_zero_exit) { /* leave the child in orte_local_children so we can * later send the state info after full job termination */ child->state = state; child->waitpid_recvd = true; if (child->iof_complete) { /* the proc has terminated */ child->alive = false; /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(&child->name); /* track job status */ jdata->num_terminated++; } /* treat this as normal termination */ goto REPORT_STATE; } /* report this as abnormal termination to the HNP */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } child->state = state; /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* remove the child from our local array as it is no longer alive */ opal_pointer_array_set_item(orte_local_children, i, NULL); /* Decrement the number of local procs */ jdata->num_local_procs--; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); /* release the child object */ OBJ_RELEASE(child); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } return; } if (ORTE_PROC_STATE_FAILED_TO_START == state || ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) { /* update the proc state */ child->state = state; /* count the proc as having "terminated" */ jdata->num_terminated++; /* leave the error report in this case to the * state machine, which will receive notice * when all local procs have attempted to start * so that we send a consolidated error report * back to the HNP */ goto cleanup; } if (ORTE_PROC_STATE_TERMINATED < state) { /* if the job hasn't completed and the state is abnormally * 
terminated, then we need to alert the HNP right away */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } child->state = state; /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* remove the child from our local array as it is no longer alive */ opal_pointer_array_set_item(orte_local_children, i, NULL); /* Decrement the number of local procs */ jdata->num_local_procs--; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); /* release the child object */ OBJ_RELEASE(child); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } return; } REPORT_STATE: if (ORTE_PROC_STATE_REGISTERED == state) { /* see if everyone in this job has registered */ if (all_children_registered(proc->jobid)) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted: sending contact info to HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack init routes command */ cmd = ORTE_PLM_INIT_ROUTES_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { 
ORTE_ERROR_LOG(rc); return; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (ptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return; } } } /* pack an invalid marker */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return; } /* add in contact info for all procs in the job */ if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&alert); return; } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } } return; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack the data for the job */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* remove all of this job's children from the global list - do not lock * the thread as we are already locked */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (jdata->jobid == ptr->name.jobid) { opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(ptr); } } /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is 
complete */ opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); OBJ_RELEASE(jdata); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } return; } cleanup: OBJ_RELEASE(caddy); }
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_proc_t *pptr, *proct; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; int i; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:default_hnp: for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* if the orteds are terminating, check job complete */ if (orte_orteds_term_ordered) { opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); goto cleanup; } else { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } } pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); /* we MUST handle a communication failure before doing anything else * as it requires some special care to avoid normal termination issues * for local application procs */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* is this to a daemon? 
*/ if (ORTE_PROC_MY_NAME->jobid != proc->jobid) { /* nope - ignore it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s Comm failure to non-daemon proc - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this is my own connection, ignore it */ if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s Comm failure on my own connection - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if we have ordered orteds to terminate or abort * is in progress, record it */ if (orte_orteds_term_ordered || orte_abnormal_term_ordered) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s Comm failure: daemons terminating - recording daemon %s as gone", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); /* if all my routes and local children are gone, then terminate ourselves */ if (0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && proct->alive && proct->state < ORTE_PROC_STATE_UNTERMINATED) { /* at least one is still alive */ goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr_hnp: all routes and children gone - ordering exit", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } goto cleanup; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s Comm failure: daemon %s - aborting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* record the first one to fail */ if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* abort the 
system */ default_hnp_abort(jdata); goto cleanup; } /* update the proc state - can get multiple reports on a proc * depending on circumstances, so ensure we only do this once */ if (pptr->state < ORTE_PROC_STATE_TERMINATED) { pptr->state = state; jdata->num_terminated++; } /* since we only come here if the proc terminated, * cleanup the local proc, if required */ cleanup_local_proc(jdata, proc); /* ensure we record the failed proc properly so we can report * the error once we terminate */ switch (state) { case ORTE_PROC_STATE_KILLED_BY_CMD: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s killed by cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* we ordered this proc to die, so it isn't an abnormal termination * and we don't flag it as such */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } /* don't abort the job as this isn't an abnormal termination */ break; case ORTE_PROC_STATE_ABORTED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s aborted", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_ABORTED; /* point to the first rank to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* abnormal termination - abort */ default_hnp_abort(jdata); break; case ORTE_PROC_STATE_ABORTED_BY_SIG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s aborted by signal", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; /* point to the first rank to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* 
abnormal termination - abort */ default_hnp_abort(jdata); break; case ORTE_PROC_STATE_TERM_WO_SYNC: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s terminated without sync", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; /* point to the first rank to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* now treat a special case - if the proc exit'd without a required * sync, it may have done so with a zero exit code. We want to ensure * that the user realizes there was an error, so in this -one- case, * we overwrite the process' exit code with the default error code */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); } /* abnormal termination - abort */ default_hnp_abort(jdata); break; case ORTE_PROC_STATE_FAILED_TO_START: case ORTE_PROC_STATE_FAILED_TO_LAUNCH: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); if (!jdata->abort) { if (ORTE_PROC_STATE_FAILED_TO_START) { jdata->state = ORTE_JOB_STATE_FAILED_TO_START; } else { jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH; } /* point to the first rank to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* abnormal termination - abort */ default_hnp_abort(jdata); break; case ORTE_PROC_STATE_CALLED_ABORT: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s called abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_CALLED_ABORT; /* point to the first proc to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get 
free'd */ OBJ_RETAIN(pptr); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* abnormal termination - abort */ default_hnp_abort(jdata); break; case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s exceeded sensor boundary", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; /* point to the lowest rank to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* abnormal termination - abort */ default_hnp_abort(jdata); break; case ORTE_PROC_STATE_TERM_NON_ZERO: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s exited with non-zero status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), pptr->exit_code)); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* track the number of non-zero exits */ jdata->num_non_zero_exit++; if (orte_abort_non_zero_exit) { if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; /* point to the first rank to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); jdata->abort = true; } /* user requested we abort in this scenario */ default_hnp_abort(jdata); } else { /* user requested we consider this normal termination */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } break; case ORTE_PROC_STATE_HEARTBEAT_FAILED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s heartbeat failed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; /* point to the first rank to cause the problem */ jdata->aborted_proc = pptr; /* retain the object so it doesn't get free'd */ 
OBJ_RETAIN(pptr); jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); /* kill all jobs */ default_hnp_abort(jdata); break; default: /* shouldn't get this, but terminate job if required */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp: proc %s default error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } break; } cleanup: OBJ_RELEASE(caddy); }
/*
 * Event-loop callback (orcm errmgr): proc-error handling stub.
 *
 * The entire handling path is currently compiled out via "#if 0"; the
 * only live behavior is releasing the state caddy.  The disabled code
 * is preserved below as a reference for re-enabling comm-failure and
 * lifeline-lost handling.
 */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
#if 0
    orte_ns_cmp_bitmask_t mask;
    opal_buffer_t *buf;
    orcm_rm_cmd_flag_t command = ORCM_NODESTATE_UPDATE_COMMAND;
    orcm_node_state_t state = ORCM_NODE_STATE_DOWN;
    int ret;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:orcm: proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&caddy->name),
                         orte_proc_state_to_str(caddy->proc_state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orcm: finalizing",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(caddy);
        return;
    }

    if (ORTE_PROC_STATE_COMM_FAILED == caddy->proc_state) {
        mask = ORTE_NS_CMP_ALL;
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, &caddy->name)) {
            OBJ_RELEASE(caddy);
            return;
        }
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(&caddy->name)) {
            OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:orcm: lost my lifeline",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* order an exit */
            ORTE_ERROR_LOG(ORTE_ERR_UNRECOVERABLE);
            OBJ_RELEASE(caddy);
            exit(1);
        } else {
            /* only notify for orcm daemon failures */
            if (0 == caddy->name.jobid) {
                OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:orcm: reporting child aggregator failure",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* inform the scheduler of the lost connection */
                buf = OBJ_NEW(opal_buffer_t);
                /* pack the alloc command flag */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(buf, &command,1, ORCM_RM_CMD_T))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buf);
                    OBJ_RELEASE(caddy);
                    return;
                }
                /* pack the node-down state */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(buf, &state, 1, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buf);
                    OBJ_RELEASE(caddy);
                    return;
                }
                /* pack the name of the failed proc */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(buf, &caddy->name, 1, ORTE_NAME))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buf);
                    OBJ_RELEASE(caddy);
                    return;
                }
                /* send the update to the scheduler */
                if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_SCHEDULER, buf,
                                                                   ORCM_RML_TAG_RM,
                                                                   orte_rml_send_callback, NULL))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buf);
                    OBJ_RELEASE(caddy);
                    return;
                }
            }
        }
    } else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orcm: lifeline lost",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* order an exit */
        ORTE_ERROR_LOG(ORTE_ERR_UNRECOVERABLE);
        OBJ_RELEASE(caddy);
        exit(1);
    }
#endif
    /* cleanup */
    OBJ_RELEASE(caddy);
}
static int update_state(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { int rc=ORTE_SUCCESS, i; orte_app_context_t *app; orte_node_t *node; orte_proc_t *pptr, *daemon, *pptr2; opal_buffer_t *notify; orcm_triplet_t *trp; orcm_source_t *src; bool procs_recovered; orte_job_t *jdt; uint16_t jfam; bool send_msg; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:update_state for job %s proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); /* protect against threads */ ORTE_ACQUIRE_THREAD(&ctl); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /*** UPDATE COMMAND FOR A JOB ***/ if (NULL == proc) { /* should only get this if a daemon restarted and we need * to check for procs waiting to migrate */ if (ORTE_JOB_STATE_PROCS_MIGRATING != jobstate) { /* we should never get this situation */ opal_output(0, "%s UNKNOWN JOB ERROR ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_job_state_to_str(jobstate)); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERROR; } /* cycle thru all known jobs looking for those with procs * awaiting resources to migrate */ for (i=0; i < orte_job_data->size; i++) { if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { continue; } if (ORTE_JOB_STATE_PROCS_MIGRATING != jdt->state) { continue; } /* reset the job */ orte_plm_base_reset_job(jdt); /* map the job again */ if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdt))) { ORTE_ERROR_LOG(rc); continue; } /* launch any procs that could be mapped - note that not * all procs that were waiting for migration may have * been successfully mapped, so this could in fact * result in no action by the daemons */ notify = OBJ_NEW(opal_buffer_t); /* indicate the target DVM */ jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); opal_dss.pack(notify, &jfam, 1, 
OPAL_UINT16); /* get the launch data */ if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(notify, jdt->jobid))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(notify); ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /* send it to the daemons */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL, ORCM_PNP_TAG_COMMAND, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /**** DEAL WITH INDIVIDUAL PROCS ****/ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:sched got state %s for proc %s pid %d exit_code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc), pid, exit_code)); /* if this was a failed comm or heartbeat */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* ignore this */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { /* get the proc object for this daemon */ if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } /* ensure that the heartbeat system knows to ignore this proc * from this point forward */ daemon->beat = 0; /* if we have already heard about this proc, ignore repeats */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == daemon->state) { /* already heard */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } #if 0 /* delete the route */ orte_routed.delete_route(proc); /* purge the oob */ orte_rml.purge(proc); #endif /* get the triplet/source and mark this source as "dead" */ if (NULL == (trp = orcm_get_triplet_stringid("orcmd:0.1:alpha"))) { opal_output(0, "%s CANNOT FIND DAEMON TRIPLET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } if (NULL == (src = orcm_get_source(trp, proc, false))) { opal_output(0, "%s DAEMON %s IS UNKNOWN SOURCE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); 
ORTE_RELEASE_THREAD(&trp->ctl); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } src->alive = false; ORTE_RELEASE_THREAD(&src->ctl); ORTE_RELEASE_THREAD(&trp->ctl); /* notify all apps immediately */ if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } notify = OBJ_NEW(opal_buffer_t); send_msg = false; for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); ORTE_RELEASE_THREAD(&ctl); return rc; } /* reset the proc stats */ OBJ_DESTRUCT(&pptr->stats); OBJ_CONSTRUCT(&pptr->stats, opal_pstats_t); /* since we added something, need to send msg */ send_msg = true; } if (send_msg) { /* send it to all apps */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL, ORCM_PNP_TAG_ERRMGR, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } else { OBJ_RELEASE(notify); } /* reset the node stats */ OBJ_DESTRUCT(&node->stats); OBJ_CONSTRUCT(&node->stats, opal_node_stats_t); /* record that the daemon died */ daemon->state = state; daemon->exit_code = exit_code; daemon->pid = 0; /* reset the daemon stats */ OBJ_DESTRUCT(&daemon->stats); OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t); node = daemon->node; if (NULL == node) { opal_output(0, "%s Detected failure of daemon %s on unknown node", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); /* can't do anything further */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } else { opal_output(0, "%s Detected failure of daemon %s on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), (NULL == node->name) ? 
"UNKNOWN" : node->name); } /* see if any usable daemons are left alive */ procs_recovered = false; for (i=2; i < daemon_job->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, i))) { continue; } if (ORTE_PROC_STATE_UNTERMINATED < pptr->state) { continue; } /* at least one alive! recover procs from the failed one */ recover_procs(proc); procs_recovered = true; break; } if (!procs_recovered) { daemon->node = NULL; node->state = ORTE_NODE_STATE_DOWN; node->daemon = NULL; /* mark all procs on this node as having terminated */ for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } /* get the job data object for this process */ if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) { /* major problem */ opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), i, orte_proc_state_to_str(pptr->state)); continue; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) { continue; } OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING PROC %s FROM NODE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); app->num_procs--; opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL); OBJ_RELEASE(pptr); /* clean it off the node */ opal_pointer_array_set_item(node->procs, i, NULL); node->num_procs--; /* maintain acctg */ OBJ_RELEASE(pptr); /* see if job is empty */ jdt->num_terminated++; if (jdt->num_procs <= jdt->num_terminated) { OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING JOB %s FROM ACTIVE ARRAY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdt->jobid))); opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL); OBJ_RELEASE(jdt); } } } ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } if (ORTE_PROC_STATE_RESTARTED == state) { 
OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s RESTART OF DAEMON %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* get the proc object for this daemon */ if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } /* if apps were on that node, notify all apps immediately that * those procs have failed */ if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } notify = OBJ_NEW(opal_buffer_t); send_msg = false; for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); ORTE_RELEASE_THREAD(&ctl); return rc; } /* since we added something, we need to send msg */ send_msg = true; /* remove the proc from the app so that it will get * restarted when we re-activate the config */ if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) { continue; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) { continue; } OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING PROC %s FROM NODE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); app->num_procs--; opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL); OBJ_RELEASE(pptr); /* clean it off the node */ opal_pointer_array_set_item(node->procs, i, NULL); node->num_procs--; /* maintain acctg */ OBJ_RELEASE(pptr); /* see if job is empty */ jdt->num_terminated++; if (jdt->num_procs <= jdt->num_terminated) { OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING JOB %s FROM ACTIVE ARRAY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdt->jobid))); 
opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL); OBJ_RELEASE(jdt); } } if (send_msg) { /* send it to all apps */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL, ORCM_PNP_TAG_ERRMGR, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } else { OBJ_RELEASE(notify); } /* reset the node stats */ OBJ_DESTRUCT(&node->stats); OBJ_CONSTRUCT(&node->stats, opal_node_stats_t); /* reset the daemon stats */ OBJ_DESTRUCT(&daemon->stats); OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t); /* don't restart procs - we'll do that later after * we allow time for multiple daemons to restart */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /* to arrive here is an error */ opal_output(0, "%s GOT UNRECOGNIZED STATE %s FOR PROC %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc)); return ORTE_ERROR; }
void orte_state_base_track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); } else { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } pdata->iof_complete = true; if (pdata->waitpid_recvd) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; pdata->waitpid_recvd = true; if (pdata->iof_complete) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* update the proc state */ pdata->alive = false; pdata->state = state; if (pdata->local_proc) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } cleanup: OBJ_RELEASE(caddy); }
/* failure notifications come here */
/*
 * PNP receive callback on the scheduler: a daemon reports state changes
 * for one or more of its procs (name, pid, state, exit code per entry).
 * Failed procs are either removed (killed-by-cmd) or their job is
 * flagged for restart; flagged jobs are then re-scanned and each failed
 * proc is marked RESTART or CANNOT_RESTART against app->max_restarts,
 * with an event-timer delay before the actual restart.
 *
 * status/tag/msg/count/cbdata are unused here; the signature is fixed
 * by the orcm_pnp callback contract.
 */
static void remote_update(int status,
                          orte_process_name_t *sender,
                          orcm_pnp_tag_t tag,
                          struct iovec *msg, int count,
                          opal_buffer_t *buffer,
                          void *cbdata)
{
    int rc, n, k, cnt;
    orte_process_name_t name;
    uint8_t flag;                 /* unused */
    orte_job_t *jdata;
    orte_proc_t *proc, *pptr;
    orte_node_t *node;
    orte_app_context_t *app;
    opal_buffer_t *bfr;           /* unused */
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    pid_t pid;
    /* NOTE(review): job_released is tested below but never set true
     * anywhere - it looks like dead code from an earlier revision */
    bool restart_reqd, job_released, job_done;
    uint16_t jfam;                /* unused */
    struct timeval offset={0, 0};
    int32_t max_fails=0;
    orte_errmgr_caddy_t *cd;

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                         "%s errmgr:sched:receive proc state notification from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* get the node object for the sender */
    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, sender->vpid))) {
        opal_output(0, "%s CANNOT FIND NODE FOR DAEMON %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(sender));
        return;
    }

    /* unpack the names of the procs */
    restart_reqd = false;
    n=1;
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &name, &n, ORTE_NAME))) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s GOT UPDATE FOR %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name)));
        /* unpack the pid of the proc */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &n, OPAL_PID))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* unpack the state of the proc */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &n, ORTE_PROC_STATE))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* unpack the exit_code of the proc */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &n, ORTE_EXIT_CODE))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* get the job object for this proc */
        if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
            /* BIG problem*/
            opal_output(0, "%s errmgr:sched JOB %s NOT FOUND",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(name.jobid));
            return;
        }
        /* get the proc object */
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name.vpid))) {
            /* unknown proc - race condition when killing a proc on cmd */
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s MISSING PROC %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&name)));
            continue;
        }
        /* update data */
        proc->pid = pid;
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s CHANGING STATE OF PROC %s FROM %s TO %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name),
                             orte_proc_state_to_str(proc->state),
                             orte_proc_state_to_str(state)));
        proc->state = state;
        proc->exit_code = exit_code;
        /* if the proc has failed, mark the job for restart unless
         * it was killed by our own cmd */
        if (ORTE_PROC_STATE_UNTERMINATED < state) {
            /* reset the stats */
            OBJ_DESTRUCT(&proc->stats);
            OBJ_CONSTRUCT(&proc->stats, opal_pstats_t);
            if (ORTE_PROC_STATE_KILLED_BY_CMD == state) {
                /* this is a response to our killing a proc - remove it
                 * from the system */
                opal_pointer_array_set_item(jdata->procs, name.vpid, NULL);
                jdata->num_procs--;
                /* clean it off of the node */
                for (k=0; k < node->procs->size; k++) {
                    if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
                        continue;
                    }
                    if (pptr->name.jobid == proc->name.jobid &&
                        pptr->name.vpid == proc->name.vpid) {
                        /* found it */
                        OPAL_OUTPUT_VERBOSE((7, orte_errmgr_base.output,
                                             "%s REMOVING ENTRY %d FOR PROC %s FROM NODE %s",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k,
                                             ORTE_NAME_PRINT(&proc->name),
                                             ORTE_VPID_PRINT(sender->vpid)));
                        opal_pointer_array_set_item(node->procs, k, NULL);
                        node->num_procs--;
                        /* maintain acctg */
                        /* NOTE(review): presumably one release per array
                         * reference (node list here, job list below) -
                         * confirm against the ORTE retain/release
                         * convention for pointer-array membership */
                        OBJ_RELEASE(proc);
                        break;
                    }
                }
                /* release the object */
                OBJ_RELEASE(proc);
                /* if the job is now empty, or if the only procs remaining are stopped
                 * due to exceeding restart (and thus cannot run), remove it too */
                if (0 == jdata->num_procs) {
                    opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
                    OBJ_RELEASE(jdata);
                } else {
                    /* job is done only if every remaining proc is
                     * terminated AND flagged CANNOT_RESTART */
                    job_done = true;
                    for (k=0; k < jdata->procs->size; k++) {
                        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, k))) {
                            continue;
                        }
                        OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                             "%s CHECKING PROC %s STATE %s",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             ORTE_NAME_PRINT(&pptr->name),
                                             orte_proc_state_to_str(pptr->state)));
                        if (pptr->state < ORTE_PROC_STATE_UNTERMINATED ||
                            ORTE_PROC_STATE_CANNOT_RESTART != pptr->state) {
                            job_done = false;
                            break;
                        }
                    }
                    if (job_done) {
                        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
                        OBJ_RELEASE(jdata);
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                     "%s FLAGGING JOB %s AS CANDIDATE FOR RESTART",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(jdata->jobid)));
                jdata->state = ORTE_JOB_STATE_RESTART;
                /* flag that at least one job requires restart */
                restart_reqd = true;
            }
        }
        /* prep for next round */
        n=1;
    }
    /* end-of-buffer is the normal loop exit; anything else is an error */
    if (ORCM_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
    }

    /* if restart not reqd, nothing more to do */
    if (!restart_reqd) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s NO RESTARTS REQUIRED",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return;
    }

    /* cycle thru the array of jobs looking for those requiring restart */
    for (n=1; n < orte_job_data->size; n++) {
        if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
            continue;
        }
        if (ORTE_JOB_STATE_RESTART != jdata->state) {
            continue;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s JOB %s CANDIDATE FOR RESTART",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        /* find the proc that needs restarting */
        restart_reqd = false;
        job_released = false;
        max_fails = 0;
        offset.tv_sec = 0;
        for (cnt=0; cnt < jdata->procs->size; cnt++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, cnt))) {
                continue;
            }
            /* only consider procs that failed (not killed by our cmd) */
            if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
                ORTE_PROC_STATE_KILLED_BY_CMD != proc->state) {
                /* get the app for this proc */
                app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx);
                if (NULL == app) {
                    opal_output(0, "%s UNKNOWN APP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    continue;
                }
                /* check the number of restarts to see if the limit has been reached */
                if (app->max_restarts < 0 ||
                    proc->restarts < app->max_restarts) {
                    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                         "%s FLAGGING PROC %s FOR RESTART",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&proc->name)));
                    /* flag the proc for restart */
                    proc->state = ORTE_PROC_STATE_RESTART;
                    restart_reqd = true;
                    /* adjust accounting */
                    jdata->num_terminated++;
                    /* increment the restart counter since the proc will be restarted */
                    proc->restarts++;
                    /* track max failures */
                    if (max_fails < proc->restarts) {
                        max_fails = proc->restarts;
                    }
                } else {
                    /* limit reached - don't restart it */
                    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                         "%s PROC %s AT LIMIT - CANNOT RESTART",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&proc->name)));
                    /* leave the proc in the system so users can see that it
                     * reached the restart limit */
                    proc->state = ORTE_PROC_STATE_CANNOT_RESTART;
                    proc->pid = 0;
                    /* increment his restarts this once so it shows as too high */
                    proc->restarts++;
                    /* adjust accounting */
                    jdata->num_procs--;
                    jdata->num_terminated++;
                    /* clean it off of the node */
                    if (NULL == (node = proc->node)) {
                        continue;
                    }
                    for (k=0; k < node->procs->size; k++) {
                        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
                            continue;
                        }
                        if (pptr == proc) {
                            /* found it */
                            opal_pointer_array_set_item(node->procs, k, NULL);
                            node->num_procs--;
                            /* maintain acctg */
                            /* NOTE(review): proc->node is written AFTER
                             * OBJ_RELEASE(proc); safe only if the job
                             * array's reference keeps the object alive -
                             * verify, otherwise this is use-after-free */
                            OBJ_RELEASE(proc);
                            proc->node = NULL;
                            break;
                        }
                    }
                }
            }
        }
        /* if the job was released, then move on */
        if (job_released) {
            continue;
        }
        /* if no procs require restart, then move on to next job */
        if (!restart_reqd) {
            jdata->state = ORTE_JOB_STATE_RUNNING;  /* reset this */
            continue;
        }
        /* calculate a delay to avoid racy situation when a proc
         * is continuously failing due to, e.g., a bad command
         * syntax */
        if (1 < max_fails) {
            if (4 < max_fails) {
                /* cap the delay at 4 secs */
                offset.tv_sec = 4;
            } else {
                /* add a sec for each failure beyond the first */
                offset.tv_sec = max_fails - 1;
            }
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s DELAYING RESTART OF JOB %s FOR %d SECS",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid),
                             (int)offset.tv_sec));
        /* hand the job to a timer event so the restart fires after the delay */
        cd = OBJ_NEW(orte_errmgr_caddy_t);
        cd->jdata = jdata;
        opal_event_evtimer_set(opal_event_base, &cd->ev, launch_restart, cd);
        opal_event_evtimer_add(&cd->ev, &offset);
    }
}
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_proc_t *pptr, *proct; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; int i; int32_t i32, *i32ptr; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_hnp: for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* could be a race condition */ goto cleanup; } pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); /* we MUST handle a communication failure before doing anything else * as it requires some special care to avoid normal termination issues * for local application procs */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* is this to a daemon? 
*/ if (ORTE_PROC_MY_NAME->jobid != proc->jobid) { /* nope - ignore it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure to non-daemon proc - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this is my own connection, ignore it */ if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure on my own connection - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* mark the daemon as gone */ ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE); /* if we have ordered orteds to terminate or abort * is in progress, record it */ if (orte_orteds_term_ordered || orte_abnormal_term_ordered) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: daemons terminating - recording daemon %s as gone", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); /* if all my routes and local children are gone, then terminate ourselves */ if (0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) { /* at least one is still alive */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: at least one proc (%s) still alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proct->name))); goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr_hnp: all routes and children gone - ordering exit", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: %d routes remain 
alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } goto cleanup; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: daemon %s - aborting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* record the first one to fail */ if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { /* output an error message so the user knows what happened */ orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name); /* mark the daemon job as failed */ jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); /* update our exit code */ ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* just in case the exit code hadn't been set, do it here - this * won't override any reported exit code */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE); } /* abort the system */ default_hnp_abort(jdata); goto cleanup; } /* update the proc state - can get multiple reports on a proc * depending on circumstances, so ensure we only do this once */ if (pptr->state < ORTE_PROC_STATE_TERMINATED) { pptr->state = state; } /* if we were ordered to terminate, mark this proc as dead and see if * any of our routes or local children remain alive - if not, then * terminate ourselves. 
*/ if (orte_orteds_term_ordered) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) { goto keep_going; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:hnp all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } } keep_going: /* ensure we record the failed proc properly so we can report * the error once we terminate */ switch (state) { case ORTE_PROC_STATE_KILLED_BY_CMD: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s killed by cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* we ordered this proc to die, so it isn't an abnormal termination * and we don't flag it as such */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } /* don't abort the job as this isn't an abnormal termination */ break; case ORTE_PROC_STATE_ABORTED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s aborted", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case 
ORTE_PROC_STATE_ABORTED_BY_SIG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s aborted by signal", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* track the number of non-zero exits */ i32 = 0; i32ptr = &i32; orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32); ++i32; orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32); if (orte_abort_non_zero_exit) { if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } } else { /* user requested we consider this normal termination */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } break; case ORTE_PROC_STATE_TERM_WO_SYNC: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s terminated without sync", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* now treat a special case - if the proc exit'd without a required * sync, it may have done 
so with a zero exit code. We want to ensure * that the user realizes there was an error, so in this -one- case, * we overwrite the process' exit code with the default error code */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case ORTE_PROC_STATE_FAILED_TO_START: case ORTE_PROC_STATE_FAILED_TO_LAUNCH: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { if (ORTE_PROC_STATE_FAILED_TO_START) { jdata->state = ORTE_JOB_STATE_FAILED_TO_START; } else { jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH; } /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } /* if this was a daemon, report it */ if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { /* output a message indicating we failed to launch a daemon */ orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true); } break; case ORTE_PROC_STATE_CALLED_ABORT: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s called abort with exit code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), pptr->exit_code)); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_CALLED_ABORT; /* point to the first proc to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it 
doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case ORTE_PROC_STATE_TERM_NON_ZERO: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s exited with non-zero status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), pptr->exit_code)); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* track the number of non-zero exits */ i32 = 0; i32ptr = &i32; orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32); ++i32; orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32); if (orte_abort_non_zero_exit) { if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } } else { /* user requested we consider this normal termination */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } break; case ORTE_PROC_STATE_HEARTBEAT_FAILED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s heartbeat failed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); 
/* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); break; case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: unable to send message to proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* if this proc is one of my daemons, then we are truly * hosed - so just exit out */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); break; } if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; default: /* shouldn't get this, but terminate job if required */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s default error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } break; } /* if the waitpid fired, be sure to let the state machine know */ if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) { ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED); } cleanup: OBJ_RELEASE(caddy); }
/*
 * Recover the application procs that were running on a failed daemon's node.
 *
 * @param daemon  name of the daemon that died
 *
 * Marks the daemon's node DOWN so it is excluded from the relaunch mapping,
 * flags every proc on that node for restart (these restarts do NOT count
 * against each proc's restart limit since the failure wasn't their fault),
 * then schedules a delayed restart of each affected job.
 *
 * Fixes vs. prior revision:
 *  - use-after-free: `proc->node` was cleared AFTER OBJ_RELEASE(proc), i.e.
 *    written through a possibly-freed pointer - the field is now cleared
 *    before the release.
 *  - removed unused locals (rc, bfr, jfam).
 */
static void recover_procs(orte_process_name_t *daemon)
{
    orte_job_t *jdt;
    orte_proc_t *proc;
    orte_node_t *node=NULL;
    int i;
    struct timeval offset={0, 0};
    int32_t max_fails=0;
    orte_errmgr_caddy_t *cd;

    /* the thread is locked by the caller, so don't do anything here */

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s ATTEMPTING TO RECOVER PROCS FROM DAEMON %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(daemon)));

    /* if not already done, mark this daemon as down */
    if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, daemon->vpid))) {
        /* correctly track number of alive daemons */
        daemon_job->num_terminated++;
        orte_process_info.num_procs--;
        /* get the corresponding node */
        node = proc->node;
        /* maintain accounting - clear the back-pointer BEFORE releasing the
         * object so we never write through a freed pointer */
        proc->node = NULL;
        OBJ_RELEASE(proc);
    } else {
        /* if it has already been removed, then we need to find the node it was on.
         * this doesn't necessarily correspond to the daemon's vpid, so we have
         * to search the array */
        opal_output(0, "RECOVER PROCS - MISSING NODE");
        return;
    }

    /* mark the node as down so it won't be used in mapping
     * procs to be relaunched */
    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s MARKING NODE %s DOWN",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         node->name));
    node->state = ORTE_NODE_STATE_DOWN;
    node->daemon = NULL;
    max_fails = 0;

    /* mark all procs on this node as having terminated */
    for (i=0; i < node->procs->size; i++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
            continue;
        }
        /* get the job data object for this process */
        if (NULL == (jdt = orte_get_job_data_object(proc->name.jobid))) {
            /* major problem */
            opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&proc->name), i,
                        orte_proc_state_to_str(proc->state));
            continue;
        }
        /* since the proc failed for reasons other than its own, this restart
         * does not count against its total - so mark it for restart */
        proc->state = ORTE_PROC_STATE_RESTART;
        proc->pid = 0;
        jdt->state = ORTE_JOB_STATE_RESTART;
        if (max_fails < proc->restarts) {
            max_fails = proc->restarts;
        }
        /* adjust the num terminated so that acctg works right */
        jdt->num_terminated++;
    }

    /* calculate a delay to avoid racy situation when a proc
     * is continuously failing due to, e.g., a bad command
     * syntax */
    if (1 < max_fails) {
        if (4 < max_fails) {
            /* cap the delay at 4 secs */
            offset.tv_sec = 4;
        } else {
            /* add a sec for each failure beyond the first */
            offset.tv_sec = max_fails - 1;
        }
    }

    /* now cycle thru the jobs and restart all those that were flagged */
    for (i=0; i < orte_job_data->size; i++) {
        if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
            continue;
        }
        if (ORTE_JOB_STATE_RESTART == jdt->state) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s DELAYING RESTART OF JOB %s FOR %d SECS",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdt->jobid),
                                 (int)offset.tv_sec));
            cd = OBJ_NEW(orte_errmgr_caddy_t);
            cd->jdata = jdt;
            opal_event_evtimer_set(opal_event_base, &cd->ev, launch_restart, cd);
            opal_event_evtimer_add(&cd->ev, &offset);
        }
    }
}
/**** PROC STATE MACHINE ****/

/*
 * Activate a proc state: find the registered handler for `state` and fire
 * it as a libevent event at the handler's priority.
 *
 * @param proc   name of the proc whose state changed (copied into the caddy)
 * @param state  the proc state being activated
 *
 * If no exact handler is registered, fall back to the ERROR handler for
 * error states (state > ORTE_PROC_STATE_ERROR), otherwise to the ANY
 * handler. A NULL cbfunc on a matched state means "deliberately ignore".
 *
 * Fix vs. prior revision: the "state not found" diagnostic said
 * "INCREMENT: ..." - a copy-paste from another routine - and now reads
 * "ACTIVATE: ..." consistent with the sibling message below.
 */
void orte_state_base_activate_proc_state(orte_process_name_t *proc,
                                         orte_proc_state_t state)
{
    opal_list_item_t *itm, *any=NULL, *error=NULL;
    orte_state_t *s;
    orte_state_caddy_t *caddy;

    for (itm = opal_list_get_first(&orte_proc_states);
         itm != opal_list_get_end(&orte_proc_states);
         itm = opal_list_get_next(itm)) {
        s = (orte_state_t*)itm;
        if (s->proc_state == ORTE_PROC_STATE_ANY) {
            /* save this place */
            any = itm;
        }
        if (s->proc_state == ORTE_PROC_STATE_ERROR) {
            error = itm;
        }
        if (s->proc_state == state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "%s ACTIVATING PROC %s STATE %s PRI %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc),
                                 orte_proc_state_to_str(state), s->priority));
            if (NULL == s->cbfunc) {
                OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                     "%s NULL CBFUNC FOR PROC %s STATE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(proc),
                                     orte_proc_state_to_str(state)));
                return;
            }
            caddy = OBJ_NEW(orte_state_caddy_t);
            caddy->name = *proc;
            caddy->proc_state = state;
            opal_event_set(orte_event_base, &caddy->ev, -1,
                           OPAL_EV_WRITE, s->cbfunc, caddy);
            opal_event_set_priority(&caddy->ev, s->priority);
            opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
            return;
        }
    }

    /* if we get here, then the state wasn't found, so execute
     * the default handler if it is defined */
    if (ORTE_PROC_STATE_ERROR < state && NULL != error) {
        s = (orte_state_t*)error;
    } else if (NULL != any) {
        s = (orte_state_t*)any;
    } else {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE NOT FOUND"));
        return;
    }
    if (NULL == s->cbfunc) {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE HANDLER NOT DEFINED"));
        return;
    }
    caddy = OBJ_NEW(orte_state_caddy_t);
    caddy->name = *proc;
    caddy->proc_state = state;
    OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                         "%s ACTIVATING PROC %s STATE %s PRI %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), s->priority));
    opal_event_set(orte_event_base, &caddy->ev, -1,
                   OPAL_EV_WRITE, s->cbfunc, caddy);
    opal_event_set_priority(&caddy->ev, s->priority);
    opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
}
/*
 * RML receive callback for xcast messages.
 *
 * @param status  RML receive status (unused here)
 * @param sender  name of the sending proc (unused here)
 * @param buffer  the received (possibly compressed) xcast payload
 * @param tg      tag the message arrived on (unused; the target tag is
 *                unpacked from the payload)
 * @param cbdata  unused
 *
 * Relays the still-compressed payload to our routing children, then
 * decompresses/unpacks the local copy. For daemon-command payloads, peeks
 * at the command to handle special cases (exit ordering, wireup/nidmap
 * processing for add_local_procs) before passing the remainder to the
 * local RML message processor.
 *
 * Fixes vs. prior revision:
 *  - NULL deref at cleanup: ORTE_RML_POST_MESSAGE dereferenced `relay`
 *    unconditionally; on early error paths (goto CLEANUP before a relay
 *    buffer is created) `relay` is still NULL. Now guarded.
 *  - unchecked malloc of the compressed-data staging buffer.
 *  - NULL `jdata` guard in the relay loop (the job object lookup can fail
 *    during teardown, and its result was dereferenced immediately).
 */
static void xcast_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer,
                       orte_rml_tag_t tg, void* cbdata)
{
    opal_list_item_t *item;
    orte_namelist_t *nm;
    int ret, cnt;
    opal_buffer_t *relay=NULL, *rly;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_NULL_CMD;
    opal_buffer_t wireup, datbuf, *data;
    opal_byte_object_t *bo;
    int8_t flag;
    orte_job_t *jdata;
    orte_proc_t *rec;
    opal_list_t coll;
    orte_grpcomm_signature_t *sig;
    orte_rml_tag_t tag;
    char *rtmod, *nidmap;
    size_t inlen, cmplen;
    uint8_t *packed_data, *cmpdata;
    int32_t nvals, i;
    opal_value_t kv, *kval;
    orte_process_name_t dmn;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct:xcast:recv: with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)buffer->bytes_used));

    /* we need a passthru buffer to send to our children - we leave it
     * as compressed data */
    rly = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(rly, buffer);
    OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
    /* setup the relay list */
    OBJ_CONSTRUCT(&coll, opal_list_t);

    /* unpack the flag to see if this payload is compressed */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        return;
    }
    if (flag) {
        /* unpack the data size */
        cnt=1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &inlen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* unpack the unpacked data size */
        cnt=1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &cmplen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* allocate the space - check the result before unpacking into it */
        packed_data = (uint8_t*)malloc(inlen);
        if (NULL == packed_data) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            ORTE_FORCED_TERMINATE(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* unpack the data blob */
        cnt = inlen;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, packed_data, &cnt, OPAL_UINT8))) {
            ORTE_ERROR_LOG(ret);
            free(packed_data);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* decompress the data */
        if (orte_util_uncompress_block(&cmpdata, cmplen, packed_data, inlen)) {
            /* the data has been uncompressed */
            opal_dss.load(&datbuf, cmpdata, cmplen);
            data = &datbuf;
        } else {
            data = buffer;
        }
        free(packed_data);
    } else {
        data = buffer;
    }

    /* get the signature that we do not need */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }
    OBJ_RELEASE(sig);

    /* get the target tag */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &tag, &cnt, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }

    /* get our conduit's routed module name */
    rtmod = orte_rml.get_routed(orte_coll_conduit);

    /* if this is headed for the daemon command processor,
     * then we first need to check for add_local_procs
     * as that command includes some needed wireup info */
    if (ORTE_RML_TAG_DAEMON == tag) {
        /* peek at the command */
        cnt=1;
        if (ORTE_SUCCESS == (ret = opal_dss.unpack(data, &command, &cnt, ORTE_DAEMON_CMD))) {
            /* if it is an exit cmd, then flag that we are quitting so we will properly
             * handle connection losses from our downstream peers */
            if (ORTE_DAEMON_EXIT_CMD == command ||
                ORTE_DAEMON_HALT_VM_CMD == command) {
                orte_orteds_term_ordered = true;
                if (ORTE_DAEMON_HALT_VM_CMD == command) {
                    /* this is an abnormal termination */
                    orte_abnormal_term_ordered = true;
                }
                /* copy the msg for relay to ourselves */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                opal_dss.copy_payload(relay, data);
            } else if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
                       ORTE_DAEMON_DVM_NIDMAP_CMD == command ||
                       ORTE_DAEMON_DVM_ADD_PROCS == command) {
                /* setup our internal relay buffer */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* unpack the nidmap string - may be NULL */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &nidmap, &cnt, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (NULL != nidmap) {
                    if (ORTE_SUCCESS != (ret = orte_regx.nidmap_parse(nidmap))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    free(nidmap);
                }
                /* see if they included info on node capabilities */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (0 != flag) {
                    /* update our local nidmap, if required - the decode function
                     * knows what to do */
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:direct:xcast updating daemon nidmap",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    if (ORTE_SUCCESS != (ret = orte_regx.decode_daemon_nodemap(data))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (!ORTE_PROC_IS_HNP) {
                        /* update the routing plan - the HNP already did
                         * it when it computed the VM, so don't waste time
                         * re-doing it here */
                        orte_routed.update_routing_plan(rtmod);
                    }
                    /* routing is now possible */
                    orte_routed_base.routing_enabled = true;
                    /* unpack the byte object */
                    cnt=1;
                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (0 < bo->size) {
                        /* load it into a buffer */
                        OBJ_CONSTRUCT(&wireup, opal_buffer_t);
                        opal_dss.load(&wireup, bo->bytes, bo->size);
                        /* decode it, pushing the info into our database */
                        if (opal_pmix.legacy_get()) {
                            OBJ_CONSTRUCT(&kv, opal_value_t);
                            kv.key = OPAL_PMIX_PROC_URI;
                            kv.type = OPAL_STRING;
                            cnt=1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kv.data.string, &cnt, OPAL_STRING))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, &kv))) {
                                    ORTE_ERROR_LOG(ret);
                                    free(kv.data.string);
                                    break;
                                }
                                free(kv.data.string);
                                kv.data.string = NULL;
                            }
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        } else {
                            cnt=1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &nvals, &cnt, OPAL_INT32))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                for (i=0; i < nvals; i++) {
                                    cnt = 1;
                                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kval, &cnt, OPAL_VALUE))) {
                                        ORTE_ERROR_LOG(ret);
                                        break;
                                    }
                                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                                         "%s STORING MODEX DATA FOR PROC %s KEY %s",
                                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                         ORTE_NAME_PRINT(&dmn), kval->key));
                                    if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, kval))) {
                                        ORTE_ERROR_LOG(ret);
                                        OBJ_RELEASE(kval);
                                        break;
                                    }
                                    OBJ_RELEASE(kval);
                                }
                            }
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        }
                        /* done with the wireup buffer - dump it */
                        OBJ_DESTRUCT(&wireup);
                    }
                    free(bo);
                }
                /* copy the remainder of the payload - we don't pass wiring info
                 * to the odls */
                opal_dss.copy_payload(relay, data);
            } else {
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* copy the msg for relay to ourselves */
                opal_dss.copy_payload(relay, data);
            }
        } else {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }
    } else {
        /* copy the msg for relay to ourselves */
        relay = OBJ_NEW(opal_buffer_t);
        opal_dss.copy_payload(relay, data);
    }

  relay:
    if (!orte_do_not_launch) {
        /* get the list of next recipients from the routed module */
        orte_routed.get_routing_list(rtmod, &coll);
        /* if list is empty, no relay is required */
        if (opal_list_is_empty(&coll)) {
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay - recipient list is empty!",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto CLEANUP;
        }
        /* send the message to each recipient on list, deconstructing it as we go */
        while (NULL != (item = opal_list_remove_first(&coll))) {
            nm = (orte_namelist_t*)item;
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)rly->bytes_used,
                                 ORTE_NAME_PRINT(&nm->name)));
            OBJ_RETAIN(rly);
            /* check the state of the recipient - no point
             * sending to someone not alive */
            jdata = orte_get_job_data_object(nm->name.jobid);
            /* guard the lookup - during teardown the job object may already
             * be gone, and the old code dereferenced it unconditionally */
            if (NULL == jdata ||
                NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&nm->name));
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if ((ORTE_PROC_STATE_RUNNING < rec->state &&
                 ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
                !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay: %s ",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&nm->name),
                                ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE) ? orte_proc_state_to_str(rec->state) : "NOT ALIVE");
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
                                                               &nm->name, rly, ORTE_RML_TAG_XCAST,
                                                               orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            OBJ_RELEASE(item);
        }
    }

  CLEANUP:
    /* cleanup */
    OPAL_LIST_DESTRUCT(&coll);
    OBJ_RELEASE(rly);  // retain accounting

    /* now pass the relay buffer to myself for processing - don't
     * inject it into the RML system via send as that will compete
     * with the relay messages down in the OOB. Instead, pass it
     * directly to the RML message processor. NOTE: relay is NULL when
     * we arrived here via an early error goto - guard the deref */
    if (NULL != relay && ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
        ORTE_RML_POST_MESSAGE(ORTE_PROC_MY_NAME, tag, 1,
                              relay->base_ptr, relay->bytes_used);
        relay->base_ptr = NULL;
        relay->bytes_used = 0;
    }
    if (NULL != relay) {
        OBJ_RELEASE(relay);
    }
    OBJ_DESTRUCT(&datbuf);
}
/*
 * PROC
 */
/*
 * Pretty-print an orte_proc_t into a newly-allocated string.
 *
 * @param output  receives a malloc'd string (caller frees); set NULL on entry
 * @param prefix  per-line indent prefix; NULL means a default of one tab
 * @param src     the proc object to print
 * @param type    opal data type tag (unused in the body)
 * @return ORTE_SUCCESS
 *
 * Output format depends on global flags: XML (orte_xml_output), a terse
 * one-liner (!orte_devel_level_output), or the full devel dump including
 * locality/binding info when built with hwloc.
 *
 * Fix vs. prior revision: `bind` could be NULL (hwloc string never
 * produced) yet was passed straight to a %s conversion - undefined
 * behavior. It now gets the same "UNKNOWN" fallback as `locale`.
 */
int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_type_t type)
{
    char *tmp, *tmp2, *pfx2;

    /* set default result */
    *output = NULL;

    /* protect against NULL prefix */
    if (NULL == prefix) {
        asprintf(&pfx2, " ");
    } else {
        asprintf(&pfx2, "%s", prefix);
    }

    if (orte_xml_output) {
        /* need to create the output in XML format */
        if (0 == src->pid) {
            asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", pfx2,
                     ORTE_VPID_PRINT(src->name.vpid), orte_proc_state_to_str(src->state));
        } else {
            asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\"/>\n", pfx2,
                     ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, orte_proc_state_to_str(src->state));
        }
        free(pfx2);
        return ORTE_SUCCESS;
    }

    if (!orte_devel_level_output) {
        /* just print a very simple output for users */
        asprintf(&tmp, "\n%sProcess OMPI jobid: %s App: %ld Process rank: %s", pfx2,
                 ORTE_JOBID_PRINT(src->name.jobid), (long)src->app_idx,
                 ORTE_VPID_PRINT(src->name.vpid));
        /* set the return */
        *output = tmp;
        free(pfx2);
        return ORTE_SUCCESS;
    }

    asprintf(&tmp, "\n%sData for proc: %s", pfx2, ORTE_NAME_PRINT(&src->name));

    asprintf(&tmp2, "%s\n%s\tPid: %ld\tLocal rank: %lu\tNode rank: %lu\tApp rank: %d", tmp, pfx2,
             (long)src->pid, (unsigned long)src->local_rank, (unsigned long)src->node_rank, src->app_rank);
    free(tmp);
    tmp = tmp2;

#if OPAL_HAVE_HWLOC
    {
        char *locale=NULL;
        char *bind = NULL;

        if (NULL != src->locale) {
            hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
        }
        if (NULL != src->bind_location) {
            hwloc_bitmap_list_asprintf(&bind, src->bind_location->cpuset);
        }
        /* guard both hwloc strings - passing NULL to %s is undefined */
        asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tBind location: %s\tBinding: %s",
                 tmp, pfx2, orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
                 (NULL == locale) ? "UNKNOWN" : locale,
                 (NULL == bind) ? "UNKNOWN" : bind,
                 (NULL == src->cpu_bitmap) ? "NULL" : src->cpu_bitmap);
        if (NULL != locale) {
            free(locale);
        }
        if (NULL != bind) {
            free(bind);
        }
    }
#else
    asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld",
             tmp, pfx2, orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx);
#endif
    free(tmp);

    /* set the return */
    *output = tmp2;

    free(pfx2);
    return ORTE_SUCCESS;
}
/* Event-loop (state machine) callback run on an orted when a local proc or a
 * peer daemon enters an error state.  Decides, per state, whether to ignore
 * the event, treat it as a waitpid trigger, alert the HNP, or terminate this
 * daemon.  fd/args are unused libevent parameters; cbdata is an
 * orte_state_caddy_t carrying the affected proc's name and new state.  The
 * caddy is released at 'cleanup' on the normal exit paths. */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_proc_t *child, *ptr;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    int i;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted:proc_errors process %s error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors finalizing - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline */
    if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
        ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orted lifeline lost - exiting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* set our exit status */
        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
        /* kill our children */
        killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
        /* terminate - our routed children will see
         * us leave and automatically die */
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* must already be complete */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - we can't seem to trust that we will catch the waitpid
             * in this situation, so push this over to be handled as if
             * it were a waitpid trigger so we don't create a bunch of
             * duplicate code */
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto cleanup;
            }
            /* leave the exit code alone - process this as a waitpid */
            odls_base_default_wait_local_proc(child, NULL);
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default:orted daemon %s exited",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* if we are using static ports, then it is possible that the HNP
         * will not see this termination. So if the HNP didn't order us
         * to terminate, then we should ensure it knows */
        if (orte_static_ports && !orte_orteds_term_ordered) {
            /* send an alert to the HNP */
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                /* NOTE(review): this early return skips OBJ_RELEASE(caddy)
                 * and leaks 'alert' - confirm whether 'goto cleanup' plus an
                 * OBJ_RELEASE(alert) was intended (same on the returns below) */
                return;
            }
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto cleanup;
            }
            /* set the exit code to reflect the problem */
            child->exit_code = ORTE_ERR_COMM_FAILURE;
            /* pack only the data for this daemon - have to start with the jobid
             * so the receiver can unpack it correctly */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* now pack the daemon's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting lost connection to daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc)));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            /* continue on */
            goto cleanup;
        }

        if (orte_orteds_term_ordered) {
            /* are any of my children still alive */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                             "%s errmgr:default:orted[%s(%d)] proc %s is alive",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             __FILE__, __LINE__,
                                             ORTE_NAME_PRINT(&child->name)));
                        goto cleanup;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            } else {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted not exiting, num_routes() == %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int)orte_routed.num_routes()));
            }
        }
        /* if not, then we can continue */
        goto cleanup;
    }

    /* all remaining states concern a local child - look it up */
    if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }

    /* if this is not a local proc for this job, we can
     * ignore this call */
    if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors proc is not local - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted got state %s for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc)));

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        /* update the state */
        child->state = state;
        /* report this as abnormal termination to the HNP, unless we already have
         * done so for this job */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    if (ORTE_PROC_STATE_FAILED_TO_START == state ||
        ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
        /* update the proc state */
        child->state = state;
        /* count the proc as having "terminated" */
        jdata->num_terminated++;
        /* leave the error report in this case to the
         * state machine, which will receive notice
         * when all local procs have attempted to start
         * so that we send a consolidated error report
         * back to the HNP */
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if we were ordered to terminate, see if
         * any of our routes or local children remain alive - if not, then
         * terminate ourselves. */
        if (orte_orteds_term_ordered) {
            /* mark the child as no longer alive and update the counters, if necessary.
             * we have to do this here as we aren't going to send this to the state
             * machine, and we want to keep the bookkeeping accurate just in case */
            if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED);
                jdata->num_terminated++;
            }
            /* note: 'child' is reused as the loop cursor here, so it no
             * longer refers to the erroring proc after this loop */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        goto keep_going;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            }
            /* no need to alert the HNP - we are already on our way out */
            goto cleanup;
        }

  keep_going:
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away - but
         * only do this once! */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            child->state = state;
            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));

        /* remove all of this job's children from the global list */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (jdata->jobid == ptr->name.jobid) {
                opal_pointer_array_set_item(orte_local_children, i, NULL);
                OBJ_RELEASE(ptr);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jdata->jobid);

        /* remove this job from our local job data since it is complete */
        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
        OBJ_RELEASE(jdata);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        /* NOTE(review): this return bypasses OBJ_RELEASE(caddy) - confirm
         * whether falling through to cleanup was intended */
        return;
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
/* process incoming messages in order of receipt */
/*
 * RML callback for the PLM tag.  Handles three commands:
 *   ORTE_PLM_LAUNCH_JOB_CMD      - comm_spawn request: unpack the job,
 *                                  inherit prefix/envars/bookmark from the
 *                                  parent job, add hosts, and spawn it
 *                                  (errors are answered on the proxy tag)
 *   ORTE_PLM_UPDATE_PROC_STATE   - per-proc pid/state/exit-code updates from
 *                                  a daemon; activates the proc state machine
 *   ORTE_PLM_REGISTERED_CMD      - vpids that have registered; activates
 *                                  ORTE_PROC_STATE_REGISTERED for each
 * On any error while we are the HNP, forces daemon termination.
 * status/tag/cbdata are unused RML callback parameters.
 */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_jobid_t job;
    orte_job_t *jdata, *parent;
    opal_buffer_t *answer;
    orte_vpid_t vpid;
    orte_proc_t *proc;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    int32_t rc=ORTE_SUCCESS, ret;
    orte_app_context_t *app, *child_app;
    orte_process_name_t name;
    pid_t pid;
    bool running;
    int i;
    char **env;
    char *prefix_dir;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive processing msg",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    switch (command) {
    case ORTE_PLM_LAUNCH_JOB_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive job launch command from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));

        /* unpack the job object */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        /* record the sender so we know who to respond to */
        jdata->originator.jobid = sender->jobid;
        jdata->originator.vpid = sender->vpid;

        /* get the parent's job object */
        if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
            /* if the prefix was set in the parent's job, we need to transfer
             * that prefix to the child's app_context so any further launch of
             * orteds can find the correct binary. There always has to be at
             * least one app_context in both parent and child, so we don't
             * need to check that here. However, be sure not to overwrite
             * the prefix if the user already provided it! */
            app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
            child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
            prefix_dir = NULL;
            if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) &&
                !orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) {
                orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
            }
            if (NULL != prefix_dir) {
                free(prefix_dir);
            }
        }

        /* if the user asked to forward any envars, cycle through the app contexts
         * in the comm_spawn request and add them */
        if (NULL != orte_forwarded_envars) {
            for (i=0; i < jdata->apps->size; i++) {
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                    continue;
                }
                env = opal_environ_merge(orte_forwarded_envars, app->env);
                opal_argv_free(app->env);
                app->env = env;
            }
        }

        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive adding hosts",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* process any add-hostfile and add-host options that were provided */
        if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        if (NULL != parent) {
            if (NULL == parent->bookmark) {
                /* find the sender's node in the job map */
                if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
                    /* set the bookmark so the child starts from that place - this means
                     * that the first child process could be co-located with the proc
                     * that called comm_spawn, assuming slots remain on that node. Otherwise,
                     * the procs will start on the next available node */
                    jdata->bookmark = proc->node;
                }
            } else {
                jdata->bookmark = parent->bookmark;
            }
        }

        /* launch it */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive calling spawn",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
        break;

    ANSWER_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive - error on launch: %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));

        /* setup the response */
        answer = OBJ_NEW(opal_buffer_t);

        /* pack the error code to be returned */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
        }

        /* send the response back to the sender */
        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_PLM_PROXY,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;

    case ORTE_PLM_UPDATE_PROC_STATE:
        opal_output_verbose(5, orte_plm_base_framework.framework_output,
                            "%s plm:base:receive update proc state command from %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(sender));
        count = 1;
        /* outer loop: one jobid per daemon report; inner loop: per-vpid
         * (pid, state, exit_code) triples until an invalid vpid sentinel */
        while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            opal_output_verbose(5, orte_plm_base_framework.framework_output,
                                "%s plm:base:receive got update_proc_state for job %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_JOBID_PRINT(job));

            name.jobid = job;
            running = false;
            /* get the job object */
            jdata = orte_get_job_data_object(job);
            count = 1;
            while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID))) {
                if (ORTE_VPID_INVALID == vpid) {
                    /* flag indicates that this job is complete - move on */
                    break;
                }
                name.vpid = vpid;
                /* unpack the pid */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &count, OPAL_PID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                /* unpack the state */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &count, ORTE_PROC_STATE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                if (ORTE_PROC_STATE_RUNNING == state) {
                    running = true;
                }
                /* unpack the exit code */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }

                OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                                     "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));

                if (NULL != jdata) {
                    /* get the proc data object */
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        /* FIX: the original fell through and dereferenced the
                         * NULL proc below - bail out instead */
                        goto CLEANUP;
                    }
                    /* NEVER update the proc state before activating the state machine - let
                     * the state cbfunc update it as it may need to compare this
                     * state against the prior proc state */
                    proc->pid = pid;
                    proc->exit_code = exit_code;
                    ORTE_ACTIVATE_PROC_STATE(&name, state);
                }
            }
            /* record that we heard back from a daemon during app launch */
            if (running && NULL != jdata) {
                jdata->num_daemons_reported++;
                if (orte_report_launch_progress) {
                    if (0 == jdata->num_daemons_reported % 100 ||
                        jdata->num_daemons_reported == orte_process_info.num_procs) {
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
                    }
                }
            }
            /* prepare for next job */
            count = 1;
        }
        /* reaching the end of the buffer is the normal loop exit */
        if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        break;

    case ORTE_PLM_REGISTERED_CMD:
        count=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto DEPART;
        }
        name.jobid = job;
        /* get the job object */
        if (NULL == (jdata = orte_get_job_data_object(job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto DEPART;
        }
        count=1;
        while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) {
            name.vpid = vpid;
            ORTE_ACTIVATE_PROC_STATE(&name, ORTE_PROC_STATE_REGISTERED);
            count=1;
        }
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        break;
    }

 CLEANUP:
    if (ORTE_SUCCESS != rc) {
        goto DEPART;
    }

 DEPART:
    /* see if an error occurred - if so, wakeup the HNP so we can exit */
    if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
        jdata = NULL;
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive done processing commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
/*
 * Print a table of the procs in a job, one row per process: process name,
 * ORTE name, local rank, pid, node, and state.  Column widths are computed
 * in a first pass over the procs, then the header and rows are printed.
 * Returns ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if the scratch
 * nodename array cannot be allocated.
 */
static int pretty_print_vpids(orte_job_t *job) {
    int len_o_proc_name = 0,
        len_proc_name   = 0,
        len_rank        = 0,
        len_pid         = 0,
        len_state       = 0,
        len_node        = 0,
        len_ckpt_s      = 0,
        len_ckpt_r      = 0,
        len_ckpt_l      = 0;
    int i, line_len;
    orte_vpid_t v;
    orte_proc_t *vpid;
    orte_app_context_t *app;
    char *o_proc_name;
    char **nodename = NULL;

    if (0 == job->num_procs) {
        return ORTE_SUCCESS;
    }

    /*
     * Calculate segment lengths
     */
    len_o_proc_name = (int)strlen("ORTE Name");
    len_proc_name   = (int)strlen("Process Name");
    len_rank        = (int)strlen("Local Rank");
    len_pid         = 6;
    len_state       = 0;
    len_node        = 0;
    /* the checkpoint columns are not printed; their -3 widths cancel the
     * corresponding " | " separators in the line_len computation below */
    len_ckpt_s      = -3;
    len_ckpt_r      = -3;
    len_ckpt_l      = -3;

    nodename = (char **) malloc(job->num_procs * sizeof(char *));
    if (NULL == nodename) {
        /* FIX: the original never checked the allocation and would
         * dereference NULL below on OOM */
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    for (v = 0; v < job->num_procs; v++) {
        char *rankstr;

        nodename[v] = NULL;
        vpid = (orte_proc_t*)job->procs->addr[v];
        if (NULL == vpid) {
            /* defensively skip empty slots rather than crashing */
            continue;
        }

        /*
         * Find my app context
         */
        if (0 >= (int)job->num_apps) {
            /* no app contexts: this is the daemon job - rank 0 is orterun */
            if (0 == vpid->name.vpid) {
                if ((int)strlen("orterun") > len_proc_name) {
                    len_proc_name = strlen("orterun");
                }
            } else {
                if ((int)strlen("orted") > len_proc_name) {
                    len_proc_name = strlen("orted");
                }
            }
        }
        for (i = 0; i < (int)job->num_apps; ++i) {
            app = (orte_app_context_t*)job->apps->addr[i];
            if (app->idx == vpid->app_idx) {
                if ((int)strlen(app->app) > len_proc_name) {
                    len_proc_name = strlen(app->app);
                }
                break;
            }
        }

        o_proc_name = orte_util_print_name_args(&vpid->name);
        if ((int)strlen(o_proc_name) > len_o_proc_name) {
            len_o_proc_name = strlen(o_proc_name);
        }

        asprintf(&rankstr, "%u", (uint)vpid->local_rank);
        if ((int)strlen(rankstr) > len_rank) {
            len_rank = strlen(rankstr);
        }
        free(rankstr);

        /* look up the node name once and cache it for the print pass;
         * "Unknown" is printed (and sized) when the attribute is absent */
        if (orte_get_attribute(&vpid->attributes, ORTE_PROC_NODENAME, (void**)&nodename[v], OPAL_STRING) &&
            (int)strlen(nodename[v]) > len_node) {
            len_node = strlen(nodename[v]);
        } else if ((int)strlen("Unknown") > len_node) {
            len_node = strlen("Unknown");
        }

        if ((int)strlen(orte_proc_state_to_str(vpid->state)) > len_state) {
            len_state = strlen(orte_proc_state_to_str(vpid->state));
        }
    }

    line_len = (len_o_proc_name + 3 +
                len_proc_name   + 3 +
                len_rank        + 3 +
                len_pid         + 3 +
                len_state       + 3 +
                len_node        + 3 +
                len_ckpt_s      + 3 +
                len_ckpt_r      + 3 +
                len_ckpt_l) + 2;

    /*
     * Print Header
     */
    printf("\t");
    printf("%*s | ", len_proc_name,   "Process Name");
    printf("%*s | ", len_o_proc_name, "ORTE Name");
    printf("%*s | ", len_rank,        "Local Rank");
    printf("%*s | ", len_pid,         "PID");
    printf("%*s | ", len_node,        "Node");
    printf("%*s | ", len_state,       "State");
    printf("\n");

    printf("\t");
    pretty_print_dashed_line(line_len);

    /*
     * Print Info
     */
    for (v = 0; v < job->num_procs; v++) {
        vpid = (orte_proc_t*)job->procs->addr[v];
        if (NULL == vpid) {
            continue;
        }

        printf("\t");

        if (0 >= (int)job->num_apps) {
            if (0 == vpid->name.vpid) {
                printf("%*s | ", len_proc_name, "orterun");
            } else {
                printf("%*s | ", len_proc_name, "orted");
            }
        }
        for (i = 0; i < (int)job->num_apps; ++i) {
            app = (orte_app_context_t*)job->apps->addr[i];
            if (app->idx == vpid->app_idx) {
                printf("%*s | ", len_proc_name, app->app);
                break;
            }
        }

        o_proc_name = orte_util_print_name_args(&vpid->name);

        printf("%*s | ", len_o_proc_name, o_proc_name);
        printf("%*u | ", len_rank,  (uint)vpid->local_rank);
        printf("%*d | ", len_pid,   vpid->pid);
        printf("%*s | ", len_node,  (NULL == nodename[v]) ? "Unknown" : nodename[v]);
        printf("%*s | ", len_state, orte_proc_state_to_str(vpid->state));

        if (NULL != nodename[v]) {
            free(nodename[v]);
        }
        printf("\n");
    }

    /* free(NULL) is a no-op, so no guard is needed */
    free(nodename);

    return ORTE_SUCCESS;
}