int main(int argc, char *argv[]) { int ret, exit_status = OPAL_SUCCESS; int child_pid; int prev_pid = 0; int idx; opal_crs_base_snapshot_t *snapshot = NULL; char * tmp_env_var = NULL; bool select = false; /*************** * Initialize ***************/ if (OPAL_SUCCESS != (ret = initialize(argc, argv))) { exit_status = ret; goto cleanup; } /* * Check for existence of the file, or program in the case of self */ if( OPAL_SUCCESS != (ret = check_file() )) { opal_show_help("help-opal-restart.txt", "invalid_filename", true, opal_restart_globals.snapshot_ref); exit_status = ret; goto cleanup; } /* Re-enable the selection of the CRS component, so we can choose the right one */ idx = mca_base_var_find(NULL, "crs", "base", "do_not_select"); if (0 > idx) { opal_output(opal_restart_globals.output, "MCA variable opal_crs_base_do_not_select not found\n"); exit_status = OPAL_ERROR; goto cleanup; } ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL); if (OPAL_SUCCESS != ret) { exit_status = ret; goto cleanup; } /* * Make sure we are using the correct checkpointer */ if(NULL == expected_crs_comp) { char * full_metadata_path = NULL; FILE * metadata = NULL; opal_asprintf(&full_metadata_path, "%s/%s/%s", opal_restart_globals.snapshot_loc, opal_restart_globals.snapshot_ref, opal_restart_globals.snapshot_metadata); if( NULL == (metadata = fopen(full_metadata_path, "r")) ) { opal_show_help("help-opal-restart.txt", "invalid_metadata", true, opal_restart_globals.snapshot_metadata, full_metadata_path); exit_status = OPAL_ERROR; goto cleanup; } if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata, &expected_crs_comp, &prev_pid)) ) { opal_show_help("help-opal-restart.txt", "invalid_metadata", true, opal_restart_globals.snapshot_metadata, full_metadata_path); exit_status = ret; goto cleanup; } free(full_metadata_path); full_metadata_path = NULL; fclose(metadata); metadata = NULL; } opal_output_verbose(10, opal_restart_globals.output, "Restart Expects checkpointer: (%s)", expected_crs_comp); (void) mca_base_var_env_name("crs", &tmp_env_var); opal_setenv(tmp_env_var, expected_crs_comp, true, &environ); free(tmp_env_var); tmp_env_var = NULL; /* Select this component or don't continue. * If the selection of this component fails, then we can't * restart on this node because it doesn't have the proper checkpointer * available. */ if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) { opal_show_help("help-opal-restart.txt", "comp_select_failure", true, "crs", ret); exit_status = ret; goto cleanup; } if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) { opal_show_help("help-opal-restart.txt", "comp_select_failure", true, expected_crs_comp, ret); exit_status = ret; goto cleanup; } /* * Make sure we have selected the proper component */ if(NULL == expected_crs_comp || 0 != strncmp(expected_crs_comp, opal_crs_base_selected_component.base_version.mca_component_name, strlen(expected_crs_comp)) ) { opal_show_help("help-opal-restart.txt", "comp_select_mismatch", true, expected_crs_comp, opal_crs_base_selected_component.base_version.mca_component_name, ret); exit_status = ret; goto cleanup; } /****************************** * Restart in this process ******************************/ opal_output_verbose(10, opal_restart_globals.output, "Restarting from file (%s)\n", opal_restart_globals.snapshot_ref); snapshot = OBJ_NEW(opal_crs_base_snapshot_t); snapshot->cold_start = true; opal_asprintf(&(snapshot->snapshot_directory), "%s/%s", opal_restart_globals.snapshot_loc, opal_restart_globals.snapshot_ref); opal_asprintf(&(snapshot->metadata_filename), "%s/%s", snapshot->snapshot_directory, opal_restart_globals.snapshot_metadata); /* Since some checkpoint/restart systems don't pass along env vars to the * restarted app, we need to take care of that. * * Included here is the creation of any files or directories that need to be * created before the process is restarted. */ if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) { exit_status = ret; goto cleanup; } /* * Do the actual restart */ ret = opal_crs.crs_restart(snapshot, false, &child_pid); if (OPAL_SUCCESS != ret) { opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true, opal_restart_globals.snapshot_ref, ret, opal_crs_base_selected_component.base_version.mca_component_name); exit_status = ret; goto cleanup; } /* Should never get here, since crs_restart calls exec */ /*************** * Cleanup ***************/ cleanup: if (OPAL_SUCCESS != (ret = finalize())) { return ret; } if(NULL != snapshot ) OBJ_DESTRUCT(snapshot); return exit_status; }
/* * Open all MCA components so that they can register their MCA * parameters. Take a shotgun approach here and indiscriminately open * all components -- don't be selective. To this end, we need to clear * out the environment of all OMPI_MCA_<type> variables to ensure * that the open algorithms don't try to only open one component. */ void orte_info_open_components(void) { int i; char *env, *str; char *target, *save, *type; char **env_save=NULL; bool need_close_components = false; orte_info_component_map_t *map; if (opened_components) { return; } /* init the map */ OBJ_CONSTRUCT(&component_map, opal_pointer_array_t); opal_pointer_array_init(&component_map, 256, INT_MAX, 128); /* Clear out the environment. Use strdup() to orphan the resulting * strings because items are placed in the environment by reference, * not by value. */ for (i = 0; i < mca_types.size; ++i) { if (NULL == (type = (char*)opal_pointer_array_get_item(&mca_types, i))) { continue; } asprintf(&env, "OMPI_MCA_%s", type); if (NULL != (save = getenv(env))) { /* save this param so it can later be restored */ asprintf(&str, "%s=%s", env, save); opal_argv_append_nosize(&env_save, str); free(str); /* can't manipulate it directly, so make a copy first */ asprintf(&target, "%s=", env); putenv(target); free(target); } } /* some components require the event library be active, so activate it */ if (OPAL_SUCCESS != opal_event_base_open()) { str = "opal_event_base_open"; goto error; } /* Open the DSS */ if (ORTE_SUCCESS != opal_dss_open()) { str = "Unable to initialize the DSS"; goto error; } /* Open up the MCA */ if (OPAL_SUCCESS != mca_base_open()) { str = "mca_base_open failed"; goto error; } /* Register the OPAL layer's MCA parameters */ if (OPAL_SUCCESS != opal_register_params()) { str = "opal_register_params failed"; goto error; } /* Register the ORTE layer's MCA parameters */ if (ORTE_SUCCESS != orte_register_params()) { str = "orte_register_params failed"; goto error; } /* Initialize the opal_output system */ if (!opal_output_init()) { str = "opal_output_init failed"; goto error; } /* Find / open all components */ map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("base"); opal_pointer_array_add(&component_map, map); /* set default error message from here forward */ str = "A component framework failed to open properly."; /* OPAL frameworks */ if (OPAL_SUCCESS != opal_backtrace_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("backtrace"); map->components = &opal_backtrace_base_components_opened; opal_pointer_array_add(&component_map, map); if (OPAL_SUCCESS != opal_memory_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("memory"); map->components = &opal_memory_base_components_opened; opal_pointer_array_add(&component_map, map); /* the event framework is already open - just get its components */ map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("event"); map->components = &opal_event_components; opal_pointer_array_add(&component_map, map); if (OPAL_SUCCESS != opal_memchecker_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("memchecker"); map->components = &opal_memchecker_base_components_opened; opal_pointer_array_add(&component_map, map); if (OPAL_SUCCESS != opal_shmem_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("shmem"); map->components = &opal_shmem_base_components_opened; opal_pointer_array_add(&component_map, map); #if OPAL_HAVE_HWLOC if (OPAL_SUCCESS != opal_hwloc_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("hwloc"); map->components = &opal_hwloc_base_components; opal_pointer_array_add(&component_map, map); #endif if (OPAL_SUCCESS != opal_timer_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("timer"); map->components = &opal_timer_base_components_opened; opal_pointer_array_add(&component_map, map); #if OPAL_ENABLE_FT_CR == 1 if (OPAL_SUCCESS != opal_crs_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("crs"); map->components = &opal_crs_base_components_available; opal_pointer_array_add(&component_map, map); #endif if (OPAL_SUCCESS != opal_if_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("if"); map->components = &opal_if_components; opal_pointer_array_add(&component_map, map); /* OPAL's installdirs base open has already been called as part of * opal_init_util() back in main(). */ map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("installdirs"); map->components = &opal_installdirs_components; opal_pointer_array_add(&component_map, map); /* ORTE frameworks * Set orte_process_info.proc_type to HNP to force all frameworks to * open components */ orte_process_info.proc_type = ORTE_PROC_HNP; if (ORTE_SUCCESS != orte_state_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("state"); map->components = &orte_state_base_components_available; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_errmgr_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("errmgr"); map->components = &orte_errmgr_base_components_available; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_grpcomm_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("grpcomm"); map->components = &orte_grpcomm_base.components_available; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_db_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("db"); map->components = &orte_db_base.available_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_ess_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("ess"); map->components = &orte_ess_base_components_available; opal_pointer_array_add(&component_map, map); #if !ORTE_DISABLE_FULL_SUPPORT if (ORTE_SUCCESS != mca_oob_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("oob"); map->components = &mca_oob_base_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_odls_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("odls"); map->components = &orte_odls_base.available_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_iof_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("iof"); map->components = &orte_iof_base.iof_components_opened; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_ras_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("ras"); map->components = &orte_ras_base.ras_opened; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_rmaps_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("rmaps"); map->components = &orte_rmaps_base.available_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_rml_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("rml"); map->components = &orte_rml_base_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_routed_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("routed"); map->components = &orte_routed_base_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_plm_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("plm"); map->components = &orte_plm_base.available_components; opal_pointer_array_add(&component_map, map); #if OPAL_ENABLE_FT_CR == 1 if (ORTE_SUCCESS != orte_snapc_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("snapc"); map->components = &orte_snapc_base_components_available; opal_pointer_array_add(&component_map, map); #endif if (ORTE_SUCCESS != orte_sensor_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("sensor"); map->components = &mca_sensor_base_components_available; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_filem_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("filem"); map->components = &orte_filem_base_components_available; opal_pointer_array_add(&component_map, map); #endif /* flag that we need to close components */ need_close_components = true; /* Restore the environment to what it was before we started so that * if users setenv OMPI_MCA_<framework name> to some value, they'll * see that value when it is shown via --param output. */ if (NULL != env_save) { for (i = 0; i < opal_argv_count(env_save); ++i) { putenv(env_save[i]); } } /* All done */ opened_components = true; return; error: fprintf(stderr, "%s\n", str); fprintf(stderr, "orte-info will likely not display all configuration information\n"); if (need_close_components) { opened_components = true; orte_info_close_components(); } }
int opal_cr_init(void ) { int ret, exit_status = OPAL_SUCCESS; opal_cr_coord_callback_fn_t prev_coord_func; int val; if( ++opal_cr_initalized != 1 ) { if( opal_cr_initalized < 1 ) { exit_status = OPAL_ERROR; goto cleanup; } exit_status = OPAL_SUCCESS; goto cleanup; } /* * Some startup MCA parameters */ ret = mca_base_param_reg_int_name("opal_cr", "verbose", "Verbose output level for the runtime OPAL Checkpoint/Restart functionality", false, false, 0, &val); if(0 != val) { opal_cr_output = opal_output_open(NULL); } else { opal_cr_output = -1; } opal_output_set_verbosity(opal_cr_output, val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Verbose Level: %d", val); mca_base_param_reg_int_name("ft", "cr_enabled", "Enable fault tolerance for this program", false, false, 0, &val); opal_cr_set_enabled(OPAL_INT_TO_BOOL(val)); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Enabled: %d", val); mca_base_param_reg_int_name("opal_cr", "enable_timer", "Enable Checkpoint timer (Default: Disabled)", false, false, 0, &val); opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val); mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier", "Enable Checkpoint timer Barrier (Default: Disabled)", false, false, 0, &val); if( opal_cr_timing_enabled ) { opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val); } else { opal_cr_timing_barrier_enabled = false; } mca_base_param_reg_int_name("opal_cr", "timer_target_rank", "Target Rank for the timer (Default: 0)", false, false, 0, &val); opal_cr_timing_target_rank = val; #if OPAL_ENABLE_FT_THREAD == 1 mca_base_param_reg_int_name("opal_cr", "use_thread", "Use an async thread to checkpoint this program (Default: Disabled)", false, false, 0, &val); opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Use thread: %d", val); mca_base_param_reg_int_name("opal_cr", "thread_sleep_check", "Time to sleep between checking for a checkpoint (Default: 0)", false, false, 0, &val); opal_cr_thread_sleep_check = val; mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait", "Time to sleep waiting for process to exit MPI library (Default: 0)", false, false, 0, &val); opal_cr_thread_sleep_wait = val; opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT thread sleep: check = %d, wait = %d", opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait); #endif mca_base_param_reg_int_name("opal_cr", "is_tool", "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.", false, false, 0, &val); opal_cr_is_tool = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Is a tool program: %d", val); #ifndef __WINDOWS__ mca_base_param_reg_int_name("opal_cr", "signal", "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program", false, false, SIGUSR1, &opal_cr_entry_point_signal); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal: %d", opal_cr_entry_point_signal); mca_base_param_reg_int_name("opal_cr", "debug_sigpipe", "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)", false, false, 0, &val); opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Debug SIGPIPE: %d (%s)", val, (opal_cr_debug_sigpipe ? "True" : "False")); #if OPAL_ENABLE_FT_THREAD == 1 /* If we have a thread, then attach the SIGPIPE signal handler there since * it is most likely to be the one that needs it. */ if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #else if( opal_cr_debug_sigpipe ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #endif #else opal_cr_is_tool = true; /* no support for CR on Windows yet */ #endif /* __WINDOWS__ */ mca_base_param_reg_string_name("opal_cr", "tmp_dir", "Temporary directory to place rendezvous files for a checkpoint", false, false, "/tmp", &opal_cr_pipe_dir); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Temp Directory: %s", opal_cr_pipe_dir); if( !opal_cr_is_tool ) { /* Register the OPAL interlevel coordination callback */ opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func); opal_cr_stall_check = false; opal_cr_currently_stalled = false; } /* End opal_cr_is_tool = true */ /* * If fault tolerance was not compiled in then * we need to make sure that the listener thread is active to tell * the tools that this is not a checkpointable job. * We don't need the CRS framework to be initalized. */ #if OPAL_ENABLE_FT == 1 /* * Open the checkpoint / restart service components */ if (OPAL_SUCCESS != (ret = opal_crs_base_open())) { opal_output(opal_cr_output, "opal_cr: init: opal_crs_base_open Failed to open. (%d)\n", ret); exit_status = ret; goto cleanup; } if (OPAL_SUCCESS != (ret = opal_crs_base_select())) { opal_output(opal_cr_output, "opal_cr: init: opal_crs_base_select Failed. (%d)\n", ret); exit_status = ret; goto cleanup; } #endif #if OPAL_ENABLE_FT_THREAD == 1 if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) { opal_output_verbose(10, opal_cr_output, "opal_cr: init: starting the thread\n"); opal_set_using_threads(true); /* * Start the thread */ OBJ_CONSTRUCT(&opal_cr_thread, opal_thread_t); OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t); opal_cr_thread_is_done = false; opal_cr_thread_is_active = false; opal_cr_thread_in_library = false; opal_cr_thread_num_in_library = 0; opal_cr_thread.t_run = opal_cr_thread_fn; opal_cr_thread.t_arg = NULL; opal_thread_start(&opal_cr_thread); } /* End opal_cr_is_tool = true */ else { opal_output_verbose(10, opal_cr_output, "opal_cr: init: *Not* Using C/R thread\n"); } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ cleanup: return exit_status; }
int opal_cr_init(void ) { int ret, exit_status = OPAL_SUCCESS; opal_cr_coord_callback_fn_t prev_coord_func; int val, t; if( ++opal_cr_initalized != 1 ) { if( opal_cr_initalized < 1 ) { exit_status = OPAL_ERROR; goto cleanup; } exit_status = OPAL_SUCCESS; goto cleanup; } /* * Some startup MCA parameters */ ret = mca_base_param_reg_int_name("opal_cr", "verbose", "Verbose output level for the runtime OPAL Checkpoint/Restart functionality", false, false, 0, &val); if(0 != val) { opal_cr_output = opal_output_open(NULL); } else { opal_cr_output = -1; } opal_output_set_verbosity(opal_cr_output, val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Verbose Level: %d", val); mca_base_param_reg_int_name("ft", "cr_enabled", "Enable fault tolerance for this program", false, false, 0, &val); opal_cr_set_enabled(OPAL_INT_TO_BOOL(val)); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Enabled: %d", val); mca_base_param_reg_int_name("opal_cr", "enable_timer", "Enable Checkpoint timer (Default: Disabled)", false, false, 0, &val); opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val); mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier", "Enable Checkpoint timer Barrier (Default: Disabled)", false, false, 0, &val); if( opal_cr_timing_enabled ) { opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val); } else { opal_cr_timing_barrier_enabled = false; } mca_base_param_reg_int_name("opal_cr", "timer_target_rank", "Target Rank for the timer (Default: 0)", false, false, 0, &val); opal_cr_timing_target_rank = val; #if OPAL_ENABLE_FT_THREAD == 1 mca_base_param_reg_int_name("opal_cr", "use_thread", "Use an async thread to checkpoint this program (Default: Disabled)", false, false, 0, &val); opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Use thread: %d", val); mca_base_param_reg_int_name("opal_cr", "thread_sleep_check", "Time to sleep between checking for a checkpoint (Default: 0)", false, false, 0, &val); opal_cr_thread_sleep_check = val; mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait", "Time to sleep waiting for process to exit MPI library (Default: 1000)", false, false, 1000, &val); opal_cr_thread_sleep_wait = val; opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT thread sleep: check = %d, wait = %d", opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait); #endif mca_base_param_reg_int_name("opal_cr", "is_tool", "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.", false, false, 0, &val); opal_cr_is_tool = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Is a tool program: %d", val); #if OPAL_ENABLE_CRDEBUG == 1 mca_base_param_reg_int_name("opal_cr", "enable_crdebug", "Enable checkpoint/restart debugging", false, false, 0, &val); MPIR_debug_with_checkpoint = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s]\n", (MPIR_debug_with_checkpoint ? "True": "False")); #endif #ifndef __WINDOWS__ mca_base_param_reg_int_name("opal_cr", "signal", "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program", false, false, SIGUSR1, &opal_cr_entry_point_signal); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal: %d", opal_cr_entry_point_signal); mca_base_param_reg_int_name("opal_cr", "debug_sigpipe", "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)", false, false, 0, &val); opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Debug SIGPIPE: %d (%s)", val, (opal_cr_debug_sigpipe ? "True" : "False")); #if OPAL_ENABLE_FT_THREAD == 1 /* If we have a thread, then attach the SIGPIPE signal handler there since * it is most likely to be the one that needs it. */ if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #else if( opal_cr_debug_sigpipe ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #endif #else opal_cr_is_tool = true; /* no support for CR on Windows yet */ #endif /* __WINDOWS__ */ #if OPAL_ENABLE_CRDEBUG == 1 opal_cr_debug_num_free_threads = 3; opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads ); for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) { opal_cr_debug_free_threads[t] = NULL; } mca_base_param_reg_int_name("opal_cr", "crdebug_signal", "Checkpoint/Restart signal used to hold threads when debugging", false, false, SIGTSTP, &opal_cr_debug_signal); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal (Debug): %d", opal_cr_debug_signal); if( SIG_ERR == signal(opal_cr_debug_signal, MPIR_checkpoint_debugger_signal_handler) ) { opal_output(opal_cr_output, "opal_cr: init: Failed to register C/R debug signal (%d)", opal_cr_debug_signal); } #else /* Silence a compiler warning */ t = 0; #endif mca_base_param_reg_string_name("opal_cr", "tmp_dir", "Temporary directory to place rendezvous files for a checkpoint", false, false, opal_tmp_directory(), &opal_cr_pipe_dir); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Temp Directory: %s", opal_cr_pipe_dir); if( !opal_cr_is_tool ) { /* Register the OPAL interlevel coordination callback */ opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func); opal_cr_stall_check = false; opal_cr_currently_stalled = false; } /* End opal_cr_is_tool = true */ /* * If fault tolerance was not compiled in then * we need to make sure that the listener thread is active to tell * the tools that this is not a checkpointable job. * We don't need the CRS framework to be initalized. */ #if OPAL_ENABLE_FT_CR == 1 /* * Open the checkpoint / restart service components */ if (OPAL_SUCCESS != (ret = opal_crs_base_open())) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_open", ret ); exit_status = ret; goto cleanup; } if (OPAL_SUCCESS != (ret = opal_crs_base_select())) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_select", ret ); exit_status = ret; goto cleanup; } #endif #if OPAL_ENABLE_FT_THREAD == 1 if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) { opal_output_verbose(10, opal_cr_output, "opal_cr: init: starting the thread\n"); /* JJH: We really do need this line below since it enables * actual locks for threads. However currently the * upper layers will deadlock if it is enabled. * So hack around the problem for now, while working * on a complete solution. See ticket #2741 for more * details. * opal_set_using_threads(true); */ /* * Start the thread */ OBJ_CONSTRUCT(&opal_cr_thread, opal_thread_t); OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t); opal_cr_thread_is_done = false; opal_cr_thread_is_active = false; opal_cr_thread_in_library = false; opal_cr_thread_num_in_library = 0; opal_cr_thread.t_run = opal_cr_thread_fn; opal_cr_thread.t_arg = NULL; opal_thread_start(&opal_cr_thread); } /* End opal_cr_is_tool = true */ else { opal_output_verbose(10, opal_cr_output, "opal_cr: init: *Not* Using C/R thread\n"); } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ cleanup: return exit_status; }