/*
 * Finalize this ESS module.
 *
 * When app_init_complete is set, the matching base finalize routine is
 * run (orted variant for daemons, app variant otherwise).  The nidmap
 * teardown and, when built with hwloc support, the topology release
 * always run.
 *
 * Returns ORTE_SUCCESS, or the error code from the base finalize routine
 * that was invoked.
 */
static int rte_finalize(void)
{
    int rc = ORTE_SUCCESS;

    if (!app_init_complete) {
        /* app_init_complete is not set: only the unconditional teardown runs */
        goto teardown;
    }

    if (ORTE_PROC_IS_DAEMON) {
        /* daemons finish through the default orted procedure */
        rc = orte_ess_base_orted_finalize();
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
        goto teardown;
    }

    /* everyone else finishes through the default app procedure */
    rc = orte_ess_base_app_finalize();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    /* drop the envars that we pushed into environ so that
     * structure is left intact
     */
    unsetenv("OMPI_MCA_grpcomm");
    unsetenv("OMPI_MCA_routed");
    unsetenv("OMPI_MCA_orte_precondition_transports");

teardown:
    /* tear down my nidmap and jobmap arrays - this function
     * protects itself from being called before things were
     * initialized
     */
    orte_util_nidmap_finalize();

#if OPAL_HAVE_HWLOC
    /* release the topology, if one is held, and clear the global */
    if (NULL != opal_hwloc_topology) {
        opal_hwloc_base_free_topology(opal_hwloc_topology);
        opal_hwloc_topology = NULL;
    }
#endif

    return rc;
}
/*
 * Finalize this ESS module using the default app procedure.
 *
 * Returns whatever orte_ess_base_app_finalize() returned; a failure is
 * logged but does not prevent the remaining cleanup from running.
 */
static int rte_finalize(void)
{
    int rc;

    /* finish through the default procedure */
    rc = orte_ess_base_app_finalize();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    /* drop the envars that we pushed into environ so that
     * structure is left intact
     */
    unsetenv("OMPI_MCA_grpcomm");
    unsetenv("OMPI_MCA_routed");

    /* tear down my nidmap and jobmap arrays - this function
     * protects itself from being called before things were
     * initialized
     */
    orte_util_nidmap_finalize();

    return rc;
}
static int rte_ft_event(int state) { int ret, exit_status = ORTE_SUCCESS; orte_proc_type_t svtype; orte_grpcomm_collective_t coll; OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); coll.id = orte_process_info.peer_init_barrier; /******** Checkpoint Prep ********/ if(OPAL_CRS_CHECKPOINT == state) { /* * Notify SnapC */ if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Notify Routed */ if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Notify RML -> OOB */ if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } /******** Continue Recovery ********/ else if (OPAL_CRS_CONTINUE == state ) { OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:env ft_event(%2d) - %s is Continuing", state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* * Notify RML -> OOB */ if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Notify Routed */ if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Notify SnapC */ if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if( orte_cr_continue_like_restart ) { /* * Barrier to make all processes have been successfully restarted before * we try to remove some restart only files. 
*/ if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { opal_output(0, "ess:env: ft_event(%2d): Failed in orte_grpcomm.barrier (%d)", state, ret); exit_status = ret; goto cleanup; } ORTE_WAIT_FOR_COMPLETION(coll.active); if( orte_cr_flush_restart_files ) { OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:env ft_event(%2d): %s " "Cleanup restart files...", state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); opal_crs_base_cleanup_flush(); } } } /******** Restart Recovery ********/ else if (OPAL_CRS_RESTART == state ) { OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:env ft_event(%2d) - %s is Restarting", state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* * This should follow the ess init() function */ /* * Clear nidmap and jmap */ orte_util_nidmap_finalize(); /* * - Reset Contact information */ if( ORTE_SUCCESS != (ret = env_set_name() ) ) { exit_status = ret; } /* * Notify RML -> OOB */ if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Restart the routed framework * JJH: Lie to the finalize function so it does not try to contact the daemon. 
*/ svtype = orte_process_info.proc_type; orte_process_info.proc_type = ORTE_PROC_TOOL; if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } orte_process_info.proc_type = svtype; if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Group Comm - Clean out stale data */ orte_grpcomm.finalize(); if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if (ORTE_SUCCESS != (ret = orte_db.remove(NULL, NULL))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Restart the PLM - Does nothing at the moment, but included for completeness */ if (ORTE_SUCCESS != (ret = orte_plm.finalize())) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * RML - Enable communications */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Notify Routed */ if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* if one was provided, build my nidmap */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * Barrier to make all processes have been successfully restarted before * we try to remove some restart only files. 
*/ if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { opal_output(0, "ess:env ft_event(%2d): Failed in orte_grpcomm.barrier (%d)", state, ret); exit_status = ret; goto cleanup; } ORTE_WAIT_FOR_COMPLETION(coll.active); if( orte_cr_flush_restart_files ) { OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:env ft_event(%2d): %s " "Cleanup restart files...", state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); opal_crs_base_cleanup_flush(); } /* * Session directory re-init */ if (orte_create_session_dirs) { if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, /* Batch ID -- Not used */ ORTE_PROC_MY_NAME))) { exit_status = ret; } opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); } /* * Notify SnapC */ if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } else if (OPAL_CRS_TERM == state ) { /* Nothing */ } else { /* Error state = Nothing */ } cleanup: OBJ_DESTRUCT(&coll); return exit_status; }
static int rte_ft_event(int state) { int ret, exit_status = ORTE_SUCCESS; orte_proc_type_t svtype; /******** Checkpoint Prep ********/ if(OPAL_CRS_CHECKPOINT == state) { /* * Notify SnapC */ if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) { exit_status = ret; goto cleanup; } /* * Notify Routed */ if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) { exit_status = ret; goto cleanup; } /* * Notify RML -> OOB */ if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) { exit_status = ret; goto cleanup; } } /******** Continue Recovery ********/ else if (OPAL_CRS_CONTINUE == state ) { /* * Notify RML -> OOB */ if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) { exit_status = ret; goto cleanup; } /* * Notify Routed */ if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) { exit_status = ret; goto cleanup; } /* * Notify SnapC */ if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) { exit_status = ret; goto cleanup; } } /******** Restart Recovery ********/ else if (OPAL_CRS_RESTART == state ) { /* * This should follow the ess init() function */ /* * Clear nidmap and jmap */ orte_util_nidmap_finalize(); /* * - Reset Contact information */ if( ORTE_SUCCESS != (ret = slave_set_name() ) ) { exit_status = ret; } /* * Notify RML -> OOB */ if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) { exit_status = ret; goto cleanup; } /* * Restart the routed framework * JJH: Lie to the finalize function so it does not try to contact the daemon. 
*/ svtype = orte_process_info.proc_type; orte_process_info.proc_type = ORTE_PROC_TOOL; if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) { exit_status = ret; goto cleanup; } orte_process_info.proc_type = svtype; if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) { exit_status = ret; goto cleanup; } /* * Group Comm - Clean out stale data */ orte_grpcomm.finalize(); if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) { exit_status = ret; goto cleanup; } if (ORTE_SUCCESS != (ret = orte_grpcomm.purge_proc_attrs())) { exit_status = ret; goto cleanup; } /* * Restart the PLM - Does nothing at the moment, but included for completeness */ if (ORTE_SUCCESS != (ret = orte_plm.finalize())) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * RML - Enable communications */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { exit_status = ret; goto cleanup; } /* * Session directory re-init */ if (orte_create_session_dirs) { if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, /* Batch ID -- Not used */ ORTE_PROC_MY_NAME))) { exit_status = ret; } opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); } /* * Notify Routed */ if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) { exit_status = ret; goto cleanup; } /* * Notify SnapC */ if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) { exit_status = ret; goto cleanup; } /* * Send new PID to HNP/daemon * The checkpointer could have used a proxy program to boot us * so the pid that the orted got from fork() may not be the * PID of this application. 
* - Note: BLCR does this because it tries to preseve the PID * of the program across checkpointes */ if( ORTE_SUCCESS != (ret = ess_slave_ft_event_update_process_info(orte_process_info.my_name, getpid())) ) { exit_status = ret; goto cleanup; } /* if one was provided, build my nidmap */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } else if (OPAL_CRS_TERM == state ) { /* Nothing */ } else { /* Error state = Nothing */ } cleanup: return exit_status; }