Example #1
0
int
main(int argc, char *argv[])
{
    int ret, exit_status = OPAL_SUCCESS;
    int child_pid;
    int prev_pid = 0;
    int idx;
    opal_crs_base_snapshot_t *snapshot = NULL;
    char * tmp_env_var = NULL;
    bool select = false;

    /***************
     * Initialize
     ***************/
    if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Check for existence of the file, or program in the case of self
     */
    if( OPAL_SUCCESS != (ret = check_file() )) {
        opal_show_help("help-opal-restart.txt", "invalid_filename", true,
                       opal_restart_globals.snapshot_ref);
        exit_status = ret;
        goto cleanup;
    }

    /* Re-enable the selection of the CRS component, so we can choose the right one */
    idx = mca_base_var_find(NULL, "crs", "base", "do_not_select");

    if (0 > idx) {
        opal_output(opal_restart_globals.output,
                    "MCA variable opal_crs_base_do_not_select not found\n");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
    if (OPAL_SUCCESS != ret) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Make sure we are using the correct checkpointer
     */
    if(NULL == expected_crs_comp) {
        char * full_metadata_path = NULL;
        FILE * metadata = NULL;

        opal_asprintf(&full_metadata_path, "%s/%s/%s",
                 opal_restart_globals.snapshot_loc,
                 opal_restart_globals.snapshot_ref,
                 opal_restart_globals.snapshot_metadata);
        if( NULL == (metadata = fopen(full_metadata_path, "r")) ) {
            opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                           opal_restart_globals.snapshot_metadata,
                           full_metadata_path);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata,
                                                                            &expected_crs_comp,
                                                                            &prev_pid)) ) {
            opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                           opal_restart_globals.snapshot_metadata,
                           full_metadata_path);
            exit_status = ret;
            goto cleanup;
        }

        free(full_metadata_path);
        full_metadata_path = NULL;

        fclose(metadata);
        metadata = NULL;
    }

    opal_output_verbose(10, opal_restart_globals.output,
                        "Restart Expects checkpointer: (%s)",
                        expected_crs_comp);

    (void) mca_base_var_env_name("crs", &tmp_env_var);
    opal_setenv(tmp_env_var,
                expected_crs_comp,
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /* Select this component or don't continue.
     * If the selection of this component fails, then we can't
     * restart on this node because it doesn't have the proper checkpointer
     * available.
     */
    if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
                       "crs", ret);
        exit_status = ret;
        goto cleanup;
    }

    if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
                       expected_crs_comp, ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Make sure we have selected the proper component
     */
    if(NULL == expected_crs_comp ||
       0 != strncmp(expected_crs_comp,
                    opal_crs_base_selected_component.base_version.mca_component_name,
                    strlen(expected_crs_comp)) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_mismatch",
                       true,
                       expected_crs_comp,
                       opal_crs_base_selected_component.base_version.mca_component_name,
                       ret);
        exit_status = ret;
        goto cleanup;
    }

    /******************************
     * Restart in this process
     ******************************/
    opal_output_verbose(10, opal_restart_globals.output,
                        "Restarting from file (%s)\n",
                        opal_restart_globals.snapshot_ref);

    snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
    snapshot->cold_start         = true;
    opal_asprintf(&(snapshot->snapshot_directory), "%s/%s",
             opal_restart_globals.snapshot_loc,
             opal_restart_globals.snapshot_ref);
    opal_asprintf(&(snapshot->metadata_filename), "%s/%s",
             snapshot->snapshot_directory,
             opal_restart_globals.snapshot_metadata);

    /* Since some checkpoint/restart systems don't pass along env vars to the
     * restarted app, we need to take care of that.
     *
     * Included here is the creation of any files or directories that need to be
     * created before the process is restarted.
     */
    if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Do the actual restart
     */
    ret = opal_crs.crs_restart(snapshot,
                               false,
                               &child_pid);

    if (OPAL_SUCCESS != ret) {
        opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true,
                       opal_restart_globals.snapshot_ref,
                       ret,
                       opal_crs_base_selected_component.base_version.mca_component_name);
        exit_status = ret;
        goto cleanup;
    }
    /* Should never get here, since crs_restart calls exec */

    /***************
     * Cleanup
     ***************/
 cleanup:
    if (OPAL_SUCCESS != (ret = finalize())) {
        return ret;
    }

    if(NULL != snapshot )
        OBJ_DESTRUCT(snapshot);

    return exit_status;
}
Example #2
0
/*
 * Open all MCA components so that they can register their MCA
 * parameters.  Take a shotgun approach here and indiscriminately open
 * all components -- don't be selective.  To this end, we need to clear
 * out the environment of all OMPI_MCA_<type> variables to ensure
 * that the open algorithms don't try to only open one component.
 */
void orte_info_open_components(void)
{
    int i;
    char *env, *str;
    char *target, *save, *type;
    char **env_save=NULL;
    bool need_close_components = false;
    orte_info_component_map_t *map;
    
    if (opened_components) {
        return;
    }
    
    /* init the map */
    OBJ_CONSTRUCT(&component_map, opal_pointer_array_t);
    opal_pointer_array_init(&component_map, 256, INT_MAX, 128);
    
    /* Clear out the environment.  Use strdup() to orphan the resulting
     * strings because items are placed in the environment by reference,
     * not by value.
     */
    
    for (i = 0; i < mca_types.size; ++i) {
        if (NULL == (type = (char*)opal_pointer_array_get_item(&mca_types, i))) {
            continue;
        }
        asprintf(&env, "OMPI_MCA_%s", type);
        if (NULL != (save = getenv(env))) {
            /* save this param so it can later be restored */
            asprintf(&str, "%s=%s", env, save);
            opal_argv_append_nosize(&env_save, str);
            free(str);
            /* can't manipulate it directly, so make a copy first */
            asprintf(&target, "%s=", env);
            putenv(target);
            free(target);
        }
    }
    
    /* some components require the event library be active, so activate it */
    if (OPAL_SUCCESS != opal_event_base_open()) {
        str = "opal_event_base_open";
        goto error;
    }
    
    /* Open the DSS */
    
    if (ORTE_SUCCESS != opal_dss_open()) {
        str = "Unable to initialize the DSS";
        goto error;
    }
    
    /* Open up the MCA */
    
    if (OPAL_SUCCESS != mca_base_open()) {
        str = "mca_base_open failed";
        goto error;
    }
    
    /* Register the OPAL layer's MCA parameters */
    
    if (OPAL_SUCCESS != opal_register_params()) {
        str = "opal_register_params failed";
        goto error;
    }
    
    /* Register the ORTE layer's MCA parameters */
    
    if (ORTE_SUCCESS != orte_register_params()) {
        str = "orte_register_params failed";
        goto error;
    }
    
    /* Initialize the opal_output system */
    if (!opal_output_init()) {
        str = "opal_output_init failed";
        goto error;
    }
    
    /* Find / open all components */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("base");
    opal_pointer_array_add(&component_map, map);
    
    /* set default error message from here forward */
    str = "A component framework failed to open properly.";
    
    /* OPAL frameworks */
    
    if (OPAL_SUCCESS != opal_backtrace_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("backtrace");
    map->components = &opal_backtrace_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_memory_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("memory");
    map->components = &opal_memory_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    /* the event framework is already open - just get its components */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("event");
    map->components = &opal_event_components;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_memchecker_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("memchecker");
    map->components = &opal_memchecker_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_shmem_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("shmem");
    map->components = &opal_shmem_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
#if OPAL_HAVE_HWLOC
    if (OPAL_SUCCESS != opal_hwloc_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("hwloc");
    map->components = &opal_hwloc_base_components;
    opal_pointer_array_add(&component_map, map);
#endif

    if (OPAL_SUCCESS != opal_timer_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("timer");
    map->components = &opal_timer_base_components_opened;
    opal_pointer_array_add(&component_map, map);

#if OPAL_ENABLE_FT_CR == 1
    if (OPAL_SUCCESS != opal_crs_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("crs");
    map->components = &opal_crs_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif

    if (OPAL_SUCCESS != opal_if_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("if");
    map->components = &opal_if_components;
    opal_pointer_array_add(&component_map, map);

    
    /* OPAL's installdirs base open has already been called as part of
     * opal_init_util() back in main().
     */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("installdirs");
    map->components = &opal_installdirs_components;
    opal_pointer_array_add(&component_map, map);
    
    /* ORTE frameworks
     * Set orte_process_info.proc_type to HNP to force all frameworks to
     * open components
     */
    orte_process_info.proc_type = ORTE_PROC_HNP;
    
    if (ORTE_SUCCESS != orte_state_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("state");
    map->components = &orte_state_base_components_available;
    opal_pointer_array_add(&component_map, map);

    if (ORTE_SUCCESS != orte_errmgr_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("errmgr");
    map->components = &orte_errmgr_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_grpcomm_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("grpcomm");
    map->components = &orte_grpcomm_base.components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_db_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("db");
    map->components = &orte_db_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_ess_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("ess");
    map->components = &orte_ess_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
#if !ORTE_DISABLE_FULL_SUPPORT
    if (ORTE_SUCCESS != mca_oob_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("oob");
    map->components = &mca_oob_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_odls_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("odls");
    map->components = &orte_odls_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_iof_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("iof");
    map->components = &orte_iof_base.iof_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_ras_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("ras");
    map->components = &orte_ras_base.ras_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_rmaps_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("rmaps");
    map->components = &orte_rmaps_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_rml_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("rml");
    map->components = &orte_rml_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_routed_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("routed");
    map->components = &orte_routed_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_plm_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("plm");
    map->components = &orte_plm_base.available_components;
    opal_pointer_array_add(&component_map, map);

#if OPAL_ENABLE_FT_CR == 1
    if (ORTE_SUCCESS != orte_snapc_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("snapc");
    map->components = &orte_snapc_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif

    if (ORTE_SUCCESS != orte_sensor_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("sensor");
    map->components = &mca_sensor_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_filem_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("filem");
    map->components = &orte_filem_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif
    
    /* flag that we need to close components */
    need_close_components = true;

    /* Restore the environment to what it was before we started so that
     * if users setenv OMPI_MCA_<framework name> to some value, they'll
     * see that value when it is shown via --param output.
     */
    
    if (NULL != env_save) {
        for (i = 0; i < opal_argv_count(env_save); ++i) {
            putenv(env_save[i]);
        }
    }
    
    /* All done */
    
    opened_components = true;
    return;
    
error:
    fprintf(stderr, "%s\n", str);
    fprintf(stderr, "orte-info will likely not display all configuration information\n");
    if (need_close_components) {
        opened_components = true;
        orte_info_close_components();
    }
}
Example #3
0
int opal_cr_init(void )
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_cr_coord_callback_fn_t prev_coord_func;
    int val;

    if( ++opal_cr_initalized != 1 ) {
        if( opal_cr_initalized < 1 ) {
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        exit_status = OPAL_SUCCESS;
        goto cleanup;
    }

    /*
     * Some startup MCA parameters
     */
    ret = mca_base_param_reg_int_name("opal_cr", "verbose",
                                      "Verbose output level for the runtime OPAL Checkpoint/Restart functionality",
                                      false, false,
                                      0,
                                      &val);
    if(0 != val) {
        opal_cr_output = opal_output_open(NULL);
    } else {
        opal_cr_output = -1;
    }
    opal_output_set_verbosity(opal_cr_output, val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Verbose Level: %d",
                        val);

    mca_base_param_reg_int_name("ft", "cr_enabled",
                                "Enable fault tolerance for this program",
                                false, false,
                                0, &val);
    opal_cr_set_enabled(OPAL_INT_TO_BOOL(val));

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT Enabled: %d",
                        val);

    mca_base_param_reg_int_name("opal_cr", "enable_timer",
                                "Enable Checkpoint timer (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier",
                                "Enable Checkpoint timer Barrier (Default: Disabled)",
                                false, false,
                                0, &val);
    if( opal_cr_timing_enabled ) {
        opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val);
    } else {
        opal_cr_timing_barrier_enabled = false;
    }

    mca_base_param_reg_int_name("opal_cr", "timer_target_rank",
                                "Target Rank for the timer (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_timing_target_rank = val;

#if OPAL_ENABLE_FT_THREAD == 1
    mca_base_param_reg_int_name("opal_cr", "use_thread",
                                "Use an async thread to checkpoint this program (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT Use thread: %d",
                        val);

    mca_base_param_reg_int_name("opal_cr", "thread_sleep_check",
                                "Time to sleep between checking for a checkpoint (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_thread_sleep_check = val;

    mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait",
                                "Time to sleep waiting for process to exit MPI library (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_thread_sleep_wait = val;

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT thread sleep: check = %d, wait = %d",
                        opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait);
#endif

    mca_base_param_reg_int_name("opal_cr", "is_tool",
                                "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
                                false, false,
                                0,
                                &val);
    opal_cr_is_tool = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Is a tool program: %d",
                        val);
#ifndef __WINDOWS__
    mca_base_param_reg_int_name("opal_cr", "signal",
                                "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program",
                                false, false,
                                SIGUSR1,
                                &opal_cr_entry_point_signal);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Checkpoint Signal: %d",
                        opal_cr_entry_point_signal);

    mca_base_param_reg_int_name("opal_cr", "debug_sigpipe",
                                "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Debug SIGPIPE: %d (%s)",
                        val, (opal_cr_debug_sigpipe ? "True" : "False"));

#if OPAL_ENABLE_FT_THREAD == 1
    /* If we have a thread, then attach the SIGPIPE signal handler there since
     * it is most likely to be the one that needs it.
     */
    if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) {
        if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
            ;
        }
    }
#else
    if( opal_cr_debug_sigpipe ) {
        if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
            ;
        }
    }
#endif

#else
    opal_cr_is_tool = true;  /* no support for CR on Windows yet */ 
#endif  /* __WINDOWS__ */

    mca_base_param_reg_string_name("opal_cr", "tmp_dir",
                                   "Temporary directory to place rendezvous files for a checkpoint",
                                   false, false,
                                   "/tmp",
                                   &opal_cr_pipe_dir);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Temp Directory: %s",
                        opal_cr_pipe_dir);

    if( !opal_cr_is_tool ) {
        /* Register the OPAL interlevel coordination callback */
        opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func);

        opal_cr_stall_check = false;
        opal_cr_currently_stalled = false;

    } /* End opal_cr_is_tool = true */

    /* 
     * If fault tolerance was not compiled in then
     * we need to make sure that the listener thread is active to tell
     * the tools that this is not a checkpointable job.
     * We don't need the CRS framework to be initalized.
     */
#if OPAL_ENABLE_FT    == 1
    /*
     * Open the checkpoint / restart service components
     */
    if (OPAL_SUCCESS != (ret = opal_crs_base_open())) {
        opal_output(opal_cr_output,
                    "opal_cr: init: opal_crs_base_open Failed to open. (%d)\n", ret);
        exit_status = ret;
        goto cleanup;
    }
    
    if (OPAL_SUCCESS != (ret = opal_crs_base_select())) {
        opal_output(opal_cr_output,
                    "opal_cr: init: opal_crs_base_select Failed. (%d)\n", ret);
        exit_status = ret;
        goto cleanup;
    }
#endif

#if OPAL_ENABLE_FT_THREAD == 1
    if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: init: starting the thread\n");

        opal_set_using_threads(true);
        /*
         * Start the thread
         */
        OBJ_CONSTRUCT(&opal_cr_thread,     opal_thread_t);
        OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t);

        opal_cr_thread_is_done    = false;
        opal_cr_thread_is_active  = false;
        opal_cr_thread_in_library = false;
        opal_cr_thread_num_in_library = 0;

        opal_cr_thread.t_run = opal_cr_thread_fn;
        opal_cr_thread.t_arg = NULL;
        opal_thread_start(&opal_cr_thread);

    } /* End opal_cr_is_tool = true */
    else {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: init: *Not* Using C/R thread\n");
    }
#endif /* OPAL_ENABLE_FT_THREAD == 1 */

 cleanup:
    return exit_status;
}
Example #4
0
int opal_cr_init(void )
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_cr_coord_callback_fn_t prev_coord_func;
    int val, t;

    if( ++opal_cr_initalized != 1 ) {
        if( opal_cr_initalized < 1 ) {
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        exit_status = OPAL_SUCCESS;
        goto cleanup;
    }

    /*
     * Some startup MCA parameters
     */
    ret = mca_base_param_reg_int_name("opal_cr", "verbose",
                                      "Verbose output level for the runtime OPAL Checkpoint/Restart functionality",
                                      false, false,
                                      0,
                                      &val);
    if(0 != val) {
        opal_cr_output = opal_output_open(NULL);
    } else {
        opal_cr_output = -1;
    }
    opal_output_set_verbosity(opal_cr_output, val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Verbose Level: %d",
                        val);

    mca_base_param_reg_int_name("ft", "cr_enabled",
                                "Enable fault tolerance for this program",
                                false, false,
                                0, &val);
    opal_cr_set_enabled(OPAL_INT_TO_BOOL(val));

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT Enabled: %d",
                        val);

    mca_base_param_reg_int_name("opal_cr", "enable_timer",
                                "Enable Checkpoint timer (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier",
                                "Enable Checkpoint timer Barrier (Default: Disabled)",
                                false, false,
                                0, &val);
    if( opal_cr_timing_enabled ) {
        opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val);
    } else {
        opal_cr_timing_barrier_enabled = false;
    }

    mca_base_param_reg_int_name("opal_cr", "timer_target_rank",
                                "Target Rank for the timer (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_timing_target_rank = val;

#if OPAL_ENABLE_FT_THREAD == 1
    mca_base_param_reg_int_name("opal_cr", "use_thread",
                                "Use an async thread to checkpoint this program (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT Use thread: %d",
                        val);

    mca_base_param_reg_int_name("opal_cr", "thread_sleep_check",
                                "Time to sleep between checking for a checkpoint (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_thread_sleep_check = val;

    mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait",
                                "Time to sleep waiting for process to exit MPI library (Default: 1000)",
                                false, false,
                                1000, &val);
    opal_cr_thread_sleep_wait = val;

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT thread sleep: check = %d, wait = %d",
                        opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait);
#endif

    mca_base_param_reg_int_name("opal_cr", "is_tool",
                                "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
                                false, false,
                                0,
                                &val);
    opal_cr_is_tool = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Is a tool program: %d",
                        val);
#if OPAL_ENABLE_CRDEBUG == 1
    mca_base_param_reg_int_name("opal_cr", "enable_crdebug",
                                "Enable checkpoint/restart debugging",
                                false, false,
                                0,
                                &val);
    MPIR_debug_with_checkpoint = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: C/R Debugging Enabled [%s]\n",
                        (MPIR_debug_with_checkpoint ? "True": "False"));
#endif

#ifndef __WINDOWS__
    mca_base_param_reg_int_name("opal_cr", "signal",
                                "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program",
                                false, false,
                                SIGUSR1,
                                &opal_cr_entry_point_signal);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Checkpoint Signal: %d",
                        opal_cr_entry_point_signal);

    mca_base_param_reg_int_name("opal_cr", "debug_sigpipe",
                                "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Debug SIGPIPE: %d (%s)",
                        val, (opal_cr_debug_sigpipe ? "True" : "False"));

#if OPAL_ENABLE_FT_THREAD == 1
    /* If we have a thread, then attach the SIGPIPE signal handler there since
     * it is most likely to be the one that needs it.
     */
    if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) {
        if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
            ;
        }
    }
#else
    if( opal_cr_debug_sigpipe ) {
        if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
            ;
        }
    }
#endif

#else
    opal_cr_is_tool = true;  /* no support for CR on Windows yet */ 
#endif  /* __WINDOWS__ */

#if OPAL_ENABLE_CRDEBUG == 1
    opal_cr_debug_num_free_threads = 3;
    opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads );
    for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) {
        opal_cr_debug_free_threads[t] = NULL;
    }
 
    mca_base_param_reg_int_name("opal_cr", "crdebug_signal",
                                "Checkpoint/Restart signal used to hold threads when debugging",
                                false, false,
                                SIGTSTP,
                                &opal_cr_debug_signal);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Checkpoint Signal (Debug): %d",
                        opal_cr_debug_signal);
    if( SIG_ERR == signal(opal_cr_debug_signal, MPIR_checkpoint_debugger_signal_handler) ) {
        opal_output(opal_cr_output,
                    "opal_cr: init: Failed to register C/R debug signal (%d)",
                    opal_cr_debug_signal);
    }
#else
    /* Silence a compiler warning */
    t = 0;
#endif

    mca_base_param_reg_string_name("opal_cr", "tmp_dir",
                                   "Temporary directory to place rendezvous files for a checkpoint",
                                   false, false,
                                   opal_tmp_directory(),
                                   &opal_cr_pipe_dir);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Temp Directory: %s",
                        opal_cr_pipe_dir);

    if( !opal_cr_is_tool ) {
        /* Register the OPAL interlevel coordination callback */
        opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func);

        opal_cr_stall_check = false;
        opal_cr_currently_stalled = false;

    } /* End opal_cr_is_tool = true */

    /* 
     * If fault tolerance was not compiled in then
     * we need to make sure that the listener thread is active to tell
     * the tools that this is not a checkpointable job.
     * We don't need the CRS framework to be initalized.
     */
#if OPAL_ENABLE_FT_CR    == 1
    /*
     * Open the checkpoint / restart service components
     */
    if (OPAL_SUCCESS != (ret = opal_crs_base_open())) {
        opal_show_help( "help-opal-runtime.txt",
                        "opal_cr_init:no-crs", true,
                        "opal_crs_base_open", ret );
        exit_status = ret;
        goto cleanup;
    }
    
    if (OPAL_SUCCESS != (ret = opal_crs_base_select())) {
        opal_show_help( "help-opal-runtime.txt",
                        "opal_cr_init:no-crs", true,
                        "opal_crs_base_select", ret );
        exit_status = ret;
        goto cleanup;
    }
#endif

#if OPAL_ENABLE_FT_THREAD == 1
    if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: init: starting the thread\n");

        /* JJH: We really do need this line below since it enables
         *      actual locks for threads. However currently the
         *      upper layers will deadlock if it is enabled.
         *      So hack around the problem for now, while working
         *      on a complete solution. See ticket #2741 for more
         *      details.
         * opal_set_using_threads(true);
         */

        /*
         * Start the thread
         */
        OBJ_CONSTRUCT(&opal_cr_thread,     opal_thread_t);
        OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t);

        opal_cr_thread_is_done    = false;
        opal_cr_thread_is_active  = false;
        opal_cr_thread_in_library = false;
        opal_cr_thread_num_in_library = 0;

        opal_cr_thread.t_run = opal_cr_thread_fn;
        opal_cr_thread.t_arg = NULL;
        opal_thread_start(&opal_cr_thread);

    } /* End opal_cr_is_tool = true */
    else {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: init: *Not* Using C/R thread\n");
    }
#endif /* OPAL_ENABLE_FT_THREAD == 1 */

 cleanup:
    return exit_status;
}