static void launch_restart(int fd, short args, void *cbdata) { orte_errmgr_caddy_t *cd = (orte_errmgr_caddy_t*)cbdata; int rc; opal_buffer_t *bfr; uint16_t jfam; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s RESTARTING JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(cd->jdata->jobid))); /* reset the job */ orte_plm_base_reset_job(cd->jdata); /* the resilient mapper will automatically avoid restarting the * proc on its former node */ /* map the job again */ if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(cd->jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } bfr = OBJ_NEW(opal_buffer_t); /* indicate the target DVM */ jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); opal_dss.pack(bfr, &jfam, 1, OPAL_UINT16); /* get the launch data */ if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(bfr, cd->jdata->jobid))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(bfr); goto cleanup; } /* send it to the daemons */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL, ORCM_PNP_TAG_COMMAND, NULL, 0, bfr, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } cleanup: OBJ_RELEASE(cd); }
static void errmgr_autor_recover_processes(int fd, short event, void *cbdata) { int ret, exit_status = ORTE_SUCCESS; opal_list_item_t *item = NULL; errmgr_autor_wp_item_t *wp_item = NULL; orte_std_cntr_t i_proc; orte_proc_t *proc = NULL; orte_sstore_base_global_snapshot_info_t *snapshot = NULL; char * tmp_str = NULL; autor_mask_faults = true; ERRMGR_AUTOR_CLEAR_TIMERS(); ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_START); /* * Display the processes that are to be recovered */ OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, "%s errmgr:hnp(autor):recover() " "------- Display known failed processes in the job %s -------", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(current_global_jobdata->jobid))); opal_list_sort(procs_pending_recovery, autor_procs_sort_compare_fn); display_procs(); /* * Find the latest checkpoint */ OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, "%s errmgr:hnp(autor):recover() " "------- Find the latest checkpoint for the job %s -------", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(current_global_jobdata->jobid))); snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t); if( ORTE_SUCCESS != (ret = orte_sstore.request_global_snapshot_data(&orte_sstore_handle_last_stable, snapshot)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_SETUP); /* * Safely terminate the entire job */ opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, "errmgr:hnp(autor):recover() " "------- Safely terminate the job %s -------", ORTE_JOBID_PRINT(current_global_jobdata->jobid)); for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); if( NULL == proc ) { continue; } if( proc->state < ORTE_PROC_STATE_UNTERMINATED ) { proc->state = ORTE_PROC_STATE_MIGRATING; } if( current_global_jobdata->stdin_target == proc->name.vpid ) { orte_iof.close(&(proc->name), ORTE_IOF_STDIN); } } orte_plm.terminate_procs(current_global_jobdata->procs); /* * Wait for the job to terminate all processes */ while(!check_if_terminated(current_global_jobdata->procs) ) { opal_progress(); } ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM); opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, "errmgr:hnp(autor):recover() " "------- Done waiting for termination of job %s -------", ORTE_JOBID_PRINT(current_global_jobdata->jobid)); current_global_jobdata->num_terminated = current_global_jobdata->num_procs; orte_plm_base_reset_job(current_global_jobdata); /* * Construct the app contexts to restart */ OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, "%s errmgr:hnp(autor):recover() " "------- Rebuild job %s app context -------", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(current_global_jobdata->jobid))); for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); if( NULL == proc ) { continue; } if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, proc, &(snapshot->local_snapshots))) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnp_component.super.output_handle, "\tAdjusted: \"%s\" [0x%d] [%s]\n", ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); } ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESETUP); /* * Spawn the restarted job */ opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, "errmgr:hnp(autor):recover() " "------- Respawning the job %s -------", ORTE_JOBID_PRINT(current_global_jobdata->jobid)); orte_snapc_base_has_recovered = false; autor_mask_faults = false; /* Failures pass this point are worth noting */ orte_plm.spawn(current_global_jobdata); /* * Wait for all the processes to restart */ opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, "errmgr:hnp(autor):recover() " "------- Waiting for restart -------"); while(!check_if_restarted(current_global_jobdata->procs) ) { opal_progress(); } ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESTART); /* * All done */ while( !orte_snapc_base_has_recovered ) { opal_progress(); } opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle, "errmgr:hnp(autor):recover() " "------- Finished recovering job %s -------", ORTE_JOBID_PRINT(current_global_jobdata->jobid)); opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true); ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH); cleanup: while(NULL != (item = opal_list_remove_first(procs_pending_recovery))) { wp_item = (errmgr_autor_wp_item_t*)item; OBJ_RELEASE(wp_item); } if( NULL != tmp_str ) { free(tmp_str); tmp_str = NULL; } ERRMGR_AUTOR_DISPLAY_ALL_TIMERS(); autor_timer_active = false; autor_mask_faults = false; return; }
static int update_state(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { int rc=ORTE_SUCCESS, i; orte_app_context_t *app; orte_node_t *node; orte_proc_t *pptr, *daemon, *pptr2; opal_buffer_t *notify; orcm_triplet_t *trp; orcm_source_t *src; bool procs_recovered; orte_job_t *jdt; uint16_t jfam; bool send_msg; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:update_state for job %s proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); /* protect against threads */ ORTE_ACQUIRE_THREAD(&ctl); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /*** UPDATE COMMAND FOR A JOB ***/ if (NULL == proc) { /* should only get this if a daemon restarted and we need * to check for procs waiting to migrate */ if (ORTE_JOB_STATE_PROCS_MIGRATING != jobstate) { /* we should never get this situation */ opal_output(0, "%s UNKNOWN JOB ERROR ", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_job_state_to_str(jobstate)); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERROR; } /* cycle thru all known jobs looking for those with procs * awaiting resources to migrate */ for (i=0; i < orte_job_data->size; i++) { if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { continue; } if (ORTE_JOB_STATE_PROCS_MIGRATING != jdt->state) { continue; } /* reset the job */ orte_plm_base_reset_job(jdt); /* map the job again */ if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdt))) { ORTE_ERROR_LOG(rc); continue; } /* launch any procs that could be mapped - note that not * all procs that were waiting for migration may have * been successfully mapped, so this could in fact * result in no action by the daemons */ notify = OBJ_NEW(opal_buffer_t); /* indicate the target DVM */ jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); opal_dss.pack(notify, &jfam, 1, OPAL_UINT16); /* get the launch data */ if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(notify, jdt->jobid))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(notify); ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /* send it to the daemons */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL, ORCM_PNP_TAG_COMMAND, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /**** DEAL WITH INDIVIDUAL PROCS ****/ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:sched got state %s for proc %s pid %d exit_code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc), pid, exit_code)); /* if this was a failed comm or heartbeat */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* ignore this */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { /* get the proc object for this daemon */ if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } /* ensure that the heartbeat system knows to ignore this proc * from this point forward */ daemon->beat = 0; /* if we have already heard about this proc, ignore repeats */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == daemon->state) { /* already heard */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } #if 0 /* delete the route */ orte_routed.delete_route(proc); /* purge the oob */ orte_rml.purge(proc); #endif /* get the triplet/source and mark this source as "dead" */ if (NULL == (trp = orcm_get_triplet_stringid("orcmd:0.1:alpha"))) { opal_output(0, "%s CANNOT FIND DAEMON TRIPLET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } if (NULL == (src = orcm_get_source(trp, proc, false))) { opal_output(0, "%s DAEMON %s IS UNKNOWN SOURCE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); ORTE_RELEASE_THREAD(&trp->ctl); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } src->alive = false; ORTE_RELEASE_THREAD(&src->ctl); ORTE_RELEASE_THREAD(&trp->ctl); /* notify all apps immediately */ if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } notify = OBJ_NEW(opal_buffer_t); send_msg = false; for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); ORTE_RELEASE_THREAD(&ctl); return rc; } /* reset the proc stats */ OBJ_DESTRUCT(&pptr->stats); OBJ_CONSTRUCT(&pptr->stats, opal_pstats_t); /* since we added something, need to send msg */ send_msg = true; } if (send_msg) { /* send it to all apps */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL, ORCM_PNP_TAG_ERRMGR, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } else { OBJ_RELEASE(notify); } /* reset the node stats */ OBJ_DESTRUCT(&node->stats); OBJ_CONSTRUCT(&node->stats, opal_node_stats_t); /* record that the daemon died */ daemon->state = state; daemon->exit_code = exit_code; daemon->pid = 0; /* reset the daemon stats */ OBJ_DESTRUCT(&daemon->stats); OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t); node = daemon->node; if (NULL == node) { opal_output(0, "%s Detected failure of daemon %s on unknown node", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); /* can't do anything further */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } else { opal_output(0, "%s Detected failure of daemon %s on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), (NULL == node->name) ? "UNKNOWN" : node->name); } /* see if any usable daemons are left alive */ procs_recovered = false; for (i=2; i < daemon_job->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, i))) { continue; } if (ORTE_PROC_STATE_UNTERMINATED < pptr->state) { continue; } /* at least one alive! recover procs from the failed one */ recover_procs(proc); procs_recovered = true; break; } if (!procs_recovered) { daemon->node = NULL; node->state = ORTE_NODE_STATE_DOWN; node->daemon = NULL; /* mark all procs on this node as having terminated */ for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } /* get the job data object for this process */ if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) { /* major problem */ opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), i, orte_proc_state_to_str(pptr->state)); continue; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) { continue; } OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING PROC %s FROM NODE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); app->num_procs--; opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL); OBJ_RELEASE(pptr); /* clean it off the node */ opal_pointer_array_set_item(node->procs, i, NULL); node->num_procs--; /* maintain acctg */ OBJ_RELEASE(pptr); /* see if job is empty */ jdt->num_terminated++; if (jdt->num_procs <= jdt->num_terminated) { OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING JOB %s FROM ACTIVE ARRAY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdt->jobid))); opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL); OBJ_RELEASE(jdt); } } } ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } if (ORTE_PROC_STATE_RESTARTED == state) { OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s RESTART OF DAEMON %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* get the proc object for this daemon */ if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } /* if apps were on that node, notify all apps immediately that * those procs have failed */ if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_RELEASE_THREAD(&ctl); return ORTE_ERR_NOT_FOUND; } notify = OBJ_NEW(opal_buffer_t); send_msg = false; for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); ORTE_RELEASE_THREAD(&ctl); return rc; } /* since we added something, we need to send msg */ send_msg = true; /* remove the proc from the app so that it will get * restarted when we re-activate the config */ if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) { continue; } if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) { continue; } OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING PROC %s FROM NODE %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); app->num_procs--; opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL); OBJ_RELEASE(pptr); /* clean it off the node */ opal_pointer_array_set_item(node->procs, i, NULL); node->num_procs--; /* maintain acctg */ OBJ_RELEASE(pptr); /* see if job is empty */ jdt->num_terminated++; if (jdt->num_procs <= jdt->num_terminated) { OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output, "%s REMOVING JOB %s FROM ACTIVE ARRAY", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdt->jobid))); opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL); OBJ_RELEASE(jdt); } } if (send_msg) { /* send it to all apps */ if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL, ORCM_PNP_TAG_ERRMGR, NULL, 0, notify, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } } else { OBJ_RELEASE(notify); } /* reset the node stats */ OBJ_DESTRUCT(&node->stats); OBJ_CONSTRUCT(&node->stats, opal_node_stats_t); /* reset the daemon stats */ OBJ_DESTRUCT(&daemon->stats); OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t); /* don't restart procs - we'll do that later after * we allow time for multiple daemons to restart */ ORTE_RELEASE_THREAD(&ctl); return ORTE_SUCCESS; } /* to arrive here is an error */ opal_output(0, "%s GOT UNRECOGNIZED STATE %s FOR PROC %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc)); return ORTE_ERROR; }