int opal_cr_refresh_environ(int prev_pid) { char *file_name; #if OPAL_ENABLE_CRDEBUG == 1 char *tmp; #endif struct stat file_status; if( 0 >= prev_pid ) { prev_pid = getpid(); } /* * Make sure the file exists. If it doesn't then this means 2 things: * 1) We have already executed this function, and * 2) The file has been deleted on the previous round. */ asprintf(&file_name, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid); if (NULL == file_name) { return OPAL_ERR_OUT_OF_RESOURCE; } if(0 != stat(file_name, &file_status) ){ free(file_name); return OPAL_SUCCESS; } #if OPAL_ENABLE_CRDEBUG == 1 mca_base_var_env_name ("opal_cr_enable_crdebug", &tmp); opal_unsetenv(tmp, &environ); free (tmp); #endif extract_env_vars(prev_pid, file_name); #if OPAL_ENABLE_CRDEBUG == 1 MPIR_debug_with_checkpoint = 0; (void) mca_base_var_register ("opal", "opal", "cr", "enable_crdebug", "Enable checkpoint/restart debugging", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &MPIR_debug_with_checkpoint); opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s] (refresh)\n", (MPIR_debug_with_checkpoint ? "True": "False")); #endif free(file_name); return OPAL_SUCCESS; }
int opal_cr_refresh_environ(int prev_pid) { int val; char *file_name = NULL; struct stat file_status; if( 0 >= prev_pid ) { prev_pid = getpid(); } /* * Make sure the file exists. If it doesn't then this means 2 things: * 1) We have already executed this function, and * 2) The file has been deleted on the previous round. */ asprintf(&file_name, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid); if(0 != stat(file_name, &file_status) ){ return OPAL_SUCCESS; } #if OPAL_ENABLE_CRDEBUG == 1 opal_unsetenv(mca_base_param_env_var("opal_cr_enable_crdebug"), &environ); #endif extract_env_vars(prev_pid, file_name); #if OPAL_ENABLE_CRDEBUG == 1 mca_base_param_reg_int_name("opal_cr", "enable_crdebug", "Enable checkpoint/restart debugging", false, false, 0, &val); MPIR_debug_with_checkpoint = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s] (refresh)\n", (MPIR_debug_with_checkpoint ? "True": "False")); #else val = 0; /* Silence Compiler warning */ #endif if( NULL != file_name ){ free(file_name); file_name = NULL; } return OPAL_SUCCESS; }
/*
 * Ensure orte_process_info.tmpdir_base is populated.
 *
 * When the field is still NULL it is filled with a copy of
 * opal_tmp_directory(). Returns ORTE_SUCCESS, or logs and returns
 * ORTE_ERR_OUT_OF_RESOURCE when the copy cannot be allocated.
 */
static int _setup_tmpdir_base(void)
{
    int status;

    /* Already configured: leave the existing value untouched. */
    if (NULL != orte_process_info.tmpdir_base) {
        return ORTE_SUCCESS;
    }

    orte_process_info.tmpdir_base = strdup(opal_tmp_directory());
    if (NULL != orte_process_info.tmpdir_base) {
        return ORTE_SUCCESS;
    }

    status = ORTE_ERR_OUT_OF_RESOURCE;
    ORTE_ERROR_LOG(status);
    return status;
}
/* * Setup the output stream infrastructure */ bool opal_output_init(void) { int i; char hostname[OPAL_MAXHOSTNAMELEN]; char *str; if (initialized) { return true; } str = getenv("OPAL_OUTPUT_STDERR_FD"); if (NULL != str) { default_stderr_fd = atoi(str); } str = getenv("OPAL_OUTPUT_REDIRECT"); if (NULL != str) { if (0 == strcasecmp(str, "syslog")) { opal_output_redirected_to_syslog = true; } } str = getenv("OPAL_OUTPUT_SYSLOG_PRI"); if (NULL != str) { if (0 == strcasecmp(str, "info")) { opal_output_redirected_syslog_pri = LOG_INFO; } else if (0 == strcasecmp(str, "error")) { opal_output_redirected_syslog_pri = LOG_ERR; } else if (0 == strcasecmp(str, "warn")) { opal_output_redirected_syslog_pri = LOG_WARNING; } else { opal_output_redirected_syslog_pri = LOG_ERR; } } else { opal_output_redirected_syslog_pri = LOG_ERR; } str = getenv("OPAL_OUTPUT_SYSLOG_IDENT"); if (NULL != str) { redirect_syslog_ident = strdup(str); } OBJ_CONSTRUCT(&verbose, opal_output_stream_t); if (opal_output_redirected_to_syslog) { verbose.lds_want_syslog = true; verbose.lds_syslog_priority = opal_output_redirected_syslog_pri; if (NULL != str) { verbose.lds_syslog_ident = strdup(redirect_syslog_ident); } verbose.lds_want_stderr = false; verbose.lds_want_stdout = false; } else { str = getenv("OPAL_OUTPUT_INTERNAL_TO_STDOUT"); if (NULL != str && str[0] == '1') { verbose.lds_want_stdout = true; } else { verbose.lds_want_stderr = true; } } gethostname(hostname, sizeof(hostname)); asprintf(&verbose.lds_prefix, "[%s:%05d] ", hostname, getpid()); for (i = 0; i < OPAL_OUTPUT_MAX_STREAMS; ++i) { info[i].ldi_used = false; info[i].ldi_enabled = false; info[i].ldi_syslog = opal_output_redirected_to_syslog; info[i].ldi_file = false; info[i].ldi_file_suffix = NULL; info[i].ldi_file_want_append = false; info[i].ldi_fd = -1; info[i].ldi_file_num_lines_lost = 0; } /* Initialize the mutex that protects the output */ OBJ_CONSTRUCT(&mutex, opal_mutex_t); initialized = true; /* Set some defaults */ 
asprintf(&output_prefix, "output-pid%d-", getpid()); output_dir = strdup(opal_tmp_directory()); /* Open the default verbose stream */ verbose_stream = opal_output_open(&verbose); return true; }
/*
 * Construct the fullpath to the session directory
 *
 * @param fulldirpath      If non-NULL, any string it currently points to is
 *                         freed and it receives a newly allocated
 *                         "<prefix>/<session path>" (caller frees).
 * @param return_prefix    In/out. If it points to a non-NULL string that
 *                         string is used as the prefix; otherwise the prefix
 *                         is taken from orte_process_info.tmpdir_base or
 *                         opal_tmp_directory() and a copy is stored back.
 * @param return_frontend  If non-NULL, receives a copy of the top-level
 *                         session directory name (caller frees).
 * @param hostid           Host name override, or NULL to use
 *                         orte_process_info.nodename.
 * @param batchid          Batch id, or NULL for "0".
 * @param proc             Process name selecting how deep the path goes,
 *                         or NULL for just the top-level directory.
 *
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM when no host name is available,
 *         ORTE_ERR_OUT_OF_RESOURCE on allocation/lookup failure,
 *         ORTE_ERR_FATAL when the prefix is in a prohibited location,
 *         or ORTE_ERROR when the path cannot be assembled.
 */
int orte_session_dir_get_name(char **fulldirpath,
                              char **return_prefix,  /* This will come back as the valid tmp dir */
                              char **return_frontend,
                              char *hostid,
                              char *batchid,
                              orte_process_name_t *proc)
{
    char *hostname  = NULL,
        *batchname = NULL,
        *sessions  = NULL,
        *user      = NULL,
        *prefix    = NULL,
        *frontend  = NULL,
        *jobfam    = NULL,
        *job       = NULL,
        *vpidstr   = NULL;
    bool prefix_provided = false;
    int exit_status = ORTE_SUCCESS;
    size_t len;
    int uid;
    struct passwd *pwdent;

    /* Ensure that system info is set */
    orte_proc_info();

    /* get the name of the user */
    uid = getuid();
#ifdef HAVE_GETPWUID
    pwdent = getpwuid(uid);
#else
    pwdent = NULL;
#endif
    if (NULL != pwdent) {
        user = strdup(pwdent->pw_name);
    } else {
        /* Nothing has been allocated yet, so a direct return is safe. */
        orte_show_help("help-orte-runtime.txt",
                       "orte:session:dir:nopwname", true);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /*
     * set the 'hostname'
     */
    if (NULL != hostid) { /* User specified version */
        hostname = strdup(hostid);
    } else {              /* check if it is set elsewhere */
        if (NULL != orte_process_info.nodename) {
            hostname = strdup(orte_process_info.nodename);
        } else {
            /* Couldn't find it, so fail */
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            exit_status = ORTE_ERR_BAD_PARAM;
            goto cleanup;
        }
    }

    /*
     * set the 'batchid'
     */
    if (NULL != batchid) {
        batchname = strdup(batchid);
    } else {
        batchname = strdup("0");
    }

    /*
     * get the front part of the session directory
     * Will look something like:
     *    openmpi-sessions-USERNAME@HOSTNAME_BATCHID
     */
    if (NULL != orte_process_info.top_session_dir) {
        frontend = strdup(orte_process_info.top_session_dir);
    } else { /* If not set then construct it */
        if (0 > asprintf(&frontend, "openmpi-sessions-%s@%s_%s",
                         user, hostname, batchname)) {
            /* contents are indeterminate after a failed asprintf() */
            frontend = NULL;
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
    }

    /*
     * Construct the session directory
     */
    /* If we were given a valid vpid then we can construct it fully into:
     *   openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
     */
    if (NULL != proc) {
        if (ORTE_VPID_INVALID != proc->vpid) {
            if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
                jobfam = NULL;
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }

            if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
                job = NULL;
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }

            if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->vpid)) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }

            sessions = opal_os_path( false, frontend, jobfam, job, vpidstr, NULL );
            if (NULL == sessions) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                exit_status = ORTE_ERROR;
                goto cleanup;
            }
        }
        /* If we were given a valid jobid then we can construct it partially into:
         *   openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
         */
        else if (ORTE_JOBID_INVALID != proc->jobid) {
            if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
                jobfam = NULL;
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }

            if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
                job = NULL;
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }

            sessions = opal_os_path( false, frontend, jobfam, job, NULL );
            if (NULL == sessions) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                exit_status = ORTE_ERROR;
                goto cleanup;
            }
        } /* if both are invalid */
        else {
            sessions = strdup(frontend); /* must dup this to avoid double-free later */
        }
    } /* If we were not given a proc at all, then we just set it to frontend */
    else {
        sessions = strdup(frontend); /* must dup this to avoid double-free later */
    }

    /*
     * If the user specified an invalid prefix, or no prefix at all
     * we need to keep looking
     */
    if (NULL != fulldirpath && NULL != *fulldirpath) {
        free(*fulldirpath);
        *fulldirpath = NULL;
    }

    if (NULL != return_prefix && NULL != *return_prefix) {
        /* use the user specified one, if available */
        prefix = strdup(*return_prefix);
        prefix_provided = true;
    }
    /* Try to find a proper alternative prefix */
    else if (NULL != orte_process_info.tmpdir_base) { /* stored value */
        prefix = strdup(orte_process_info.tmpdir_base);
    } else { /* General Environment var */
        prefix = strdup(opal_tmp_directory());
    }

    /* strlen() below must not see NULL */
    if (NULL == prefix) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        exit_status = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* check for a trailing path separator; an empty prefix has none, and
     * indexing prefix[len-1] with len == 0 would read out of bounds */
    len = strlen(prefix);
    if (0 < len && OPAL_PATH_SEP[0] == prefix[len-1]) {
        prefix[len-1] = '\0';
    }

    /* BEFORE doing anything else, check to see if this prefix is
     * allowed by the system
     */
    if (NULL != orte_prohibited_session_dirs) {
        char **list;
        int i, count;
        bool prohibited = false;

        /* break the string into tokens - it should be
         * separated by ','
         */
        list = opal_argv_split(orte_prohibited_session_dirs, ',');
        count = opal_argv_count(list);
        /* cycle through the list */
        for (i = 0; i < count; i++) {
            /* check if prefix matches */
            if (0 == strncmp(prefix, list[i], strlen(list[i]))) {
                /* this is a prohibited location */
                orte_show_help("help-orte-runtime.txt",
                               "orte:session:dir:prohibited",
                               true, prefix, orte_prohibited_session_dirs);
                prohibited = true;
                break;
            }
        }
        opal_argv_free(list);  /* done with this */

        if (prohibited) {
            /* leave through cleanup so no local string is leaked */
            exit_status = ORTE_ERR_FATAL;
            goto cleanup;
        }
    }

    /*
     * Construct the absolute final path, if requested
     */
    if (NULL != fulldirpath) {
        *fulldirpath = opal_os_path(false, prefix, sessions, NULL);
    }

    /*
     * Return the frontend and prefix, if user requested we do so
     */
    if (NULL != return_frontend) {
        *return_frontend = strdup(frontend);
    }
    if (!prefix_provided && NULL != return_prefix) {
        *return_prefix = strdup(prefix);
    }

 cleanup:
    free(hostname);
    free(batchname);
    free(sessions);
    free(user);
    free(prefix);
    free(frontend);
    free(jobfam);
    free(job);
    free(vpidstr);

    return exit_status;
}
/*
 * Prepare the file system state for a restarted process.
 *
 * 1) Dumps the OMPI_* environment variables of this process into
 *    "<tmpdir>/<OPAL_CR_BASE_ENV_NAME>-<prev_pid>".
 * 2) Opens the snapshot metadata file and runs "mkdir -p" for every
 *    CRS_METADATA_MKDIR entry and "touch" for every CRS_METADATA_TOUCH entry.
 *
 * The metadata stream is closed again before returning.
 *
 * @param prev_pid  PID used in the environment file name; must be >= 0.
 * @param snapshot  Snapshot whose metadata_filename is read; must not be NULL.
 *
 * @return OPAL_SUCCESS, OPAL_ERROR on invalid arguments, allocation failure
 *         or unreadable metadata, or the negative system() result when a
 *         command could not be launched.
 *
 * NOTE(review): file and directory names are interpolated unquoted into
 * shell command lines; names containing shell metacharacters would be
 * interpreted by the shell.
 */
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char *command = NULL;
    char *proc_file = NULL;
    char **loc_touch = NULL;
    char **loc_mkdir = NULL;
    int argc, i;

    /* cleanup dereferences snapshot, so reject NULL before going there */
    if (NULL == snapshot) {
        return OPAL_ERROR;
    }

    if (0 > prev_pid) {
        opal_output(opal_restart_globals.output,
                    "Invalid PID (%d)\n",
                    prev_pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * This is needed so we can pass the previous environment to the restarted
     * application process.
     */
    if (0 > opal_asprintf(&proc_file, "%s/%s-%d", opal_tmp_directory(),
                          OPAL_CR_BASE_ENV_NAME, prev_pid)) {
        proc_file = NULL;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    /* A NULL command must never reach system(): system(NULL) only probes
     * for a shell and returns non-zero, which would look like success. */
    if (0 > opal_asprintf(&command, "env | grep OMPI_ > %s", proc_file)) {
        command = NULL;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_output_verbose(5, opal_restart_globals.output,
                        "post_env_vars: Execute: <%s>", command);

    ret = system(command);
    if (0 > ret) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Any directories that need to be created
     */
    if (NULL == (snapshot->metadata = fopen(snapshot->metadata_filename, "r"))) {
        opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                       opal_restart_globals.snapshot_metadata,
                       snapshot->metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &loc_mkdir);
    argc = opal_argv_count(loc_mkdir);
    for (i = 0; i < argc; ++i) {
        free(command);
        command = NULL;
        if (0 > opal_asprintf(&command, "mkdir -p %s", loc_mkdir[i])) {
            command = NULL;
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", command);

        ret = system(command);
        if (0 > ret) {
            exit_status = ret;
            goto cleanup;
        }
    }
    if (0 < argc) {
        system("sync ; sync");
    }

    /*
     * Any files that need to exist
     */
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &loc_touch);
    argc = opal_argv_count(loc_touch);
    for (i = 0; i < argc; ++i) {
        free(command);
        command = NULL;
        if (0 > opal_asprintf(&command, "touch %s", loc_touch[i])) {
            command = NULL;
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", command);

        ret = system(command);
        if (0 > ret) {
            exit_status = ret;
            goto cleanup;
        }
    }
    if (0 < argc) {
        system("sync ; sync");
    }

 cleanup:
    free(command);
    command = NULL;
    free(proc_file);
    proc_file = NULL;
    if (NULL != loc_mkdir) {
        opal_argv_free(loc_mkdir);
        loc_mkdir = NULL;
    }
    if (NULL != loc_touch) {
        opal_argv_free(loc_touch);
        loc_touch = NULL;
    }

    if (NULL != snapshot->metadata) {
        fclose(snapshot->metadata);
        snapshot->metadata = NULL;
    }

    return exit_status;
}
static int opal_cr_register (void) { int ret; #if OPAL_ENABLE_CRDEBUG == 1 int t; #endif /* * Some startup MCA parameters */ ret = mca_base_var_register ("opal", "opal", "cr", "verbose", "Verbose output level for the runtime OPAL Checkpoint/Restart functionality", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &opal_cr_verbose); if (0 > ret) { return ret; } opal_cr_is_enabled = false; (void) mca_base_var_register("opal", "ft", "cr", "enabled", "Enable fault tolerance for this program", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_is_enabled); opal_cr_timing_enabled = false; (void) mca_base_var_register ("opal", "opal", "cr", "enable_timer", "Enable Checkpoint timer (Default: Disabled)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_timing_enabled); opal_cr_timing_barrier_enabled = false; (void) mca_base_var_register ("opal", "opal", "cr", "enable_timer_barrier", "Enable Checkpoint timer Barrier. Must have opal_cr_enable_timer set. (Default: Disabled)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, opal_cr_timing_enabled ? 
MCA_BASE_VAR_FLAG_SETTABLE : 0, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_timing_barrier_enabled); opal_cr_timing_barrier_enabled = opal_cr_timing_barrier_enabled && opal_cr_timing_enabled; (void) mca_base_var_register ("opal", "opal", "cr", "timer_target_rank", "Target Rank for the timer (Default: 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_timing_target_rank); #if OPAL_ENABLE_FT_THREAD == 1 opal_cr_thread_use_if_avail = false; (void) mca_base_var_register ("opal", "opal", "cr", "use_thread", "Use an async thread to checkpoint this program (Default: Disabled)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_thread_use_if_avail); opal_cr_thread_sleep_check = 0; (void) mca_base_var_register ("opal", "opal", "cr", "thread_sleep_check", "Time to sleep between checking for a checkpoint (Default: 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_thread_sleep_check); opal_cr_thread_sleep_wait = 100; (void) mca_base_var_register ("opal", "opal", "cr", "thread_sleep_wait", "Time to sleep waiting for process to exit MPI library (Default: 1000)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_thread_sleep_wait); #endif opal_cr_is_tool = false; (void) mca_base_var_register ("opal", "opal", "cr", "is_tool", "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_is_tool); #ifndef __WINDOWS__ opal_cr_entry_point_signal = SIGUSR1; (void) mca_base_var_register ("opal", "opal", "cr", "signal", "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, 
OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_entry_point_signal); opal_cr_debug_sigpipe = false; (void) mca_base_var_register ("opal", "opal", "cr", "debug_sigpipe", "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_debug_sigpipe); #else opal_cr_is_tool = true; /* no support for CR on Windows yet */ #endif /* __WINDOWS__ */ #if OPAL_ENABLE_CRDEBUG == 1 MPIR_debug_with_checkpoint = 0; (void) mca_base_var_register ("opal", "opal", "cr", "enable_crdebug", "Enable checkpoint/restart debugging", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &MPIR_debug_with_checkpoint); opal_cr_debug_num_free_threads = 3; opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads ); for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) { opal_cr_debug_free_threads[t] = NULL; } opal_cr_debug_signal = SIGTSTP; (void) mca_base_var_register ("opal", "opal", "cr", "crdebug_signal", "Checkpoint/Restart signal used to hold threads when debugging", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_debug_signal); #endif opal_cr_pipe_dir = (char *) opal_tmp_directory(); (void) mca_base_var_register ("opal", "opal", "cr", "tmp_dir", "Temporary directory to place rendezvous files for a checkpoint", MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &opal_cr_pipe_dir); return OPAL_SUCCESS; }
static int sstore_stage_register(void) { mca_base_component_t *component = &mca_sstore_stage_component.super.base_version; int ret; /* * The local directory to use when staging checkpoints back to central storage */ orte_sstore_stage_local_snapshot_dir = strdup (opal_tmp_directory()); ret = mca_base_component_var_register(component, "local_snapshot_dir", "The temporary base directory to use when storing local snapshots before they are moved.", MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_INTERNAL, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_sstore_stage_local_snapshot_dir); if (0 > ret) { return ret; } ret = mca_base_var_register_synonym(ret, "orte", "crs", "base", "snapshot_dir", MCA_BASE_VAR_SYN_FLAG_DEPRECATED); if (0 > ret) { return ret; } /* * If the global storage is just on a different file system, then we pass * this hint on to FileM. */ orte_sstore_stage_global_is_shared = false; ret = mca_base_component_var_register(component, "global_is_shared", "If the global_snapshot_dir is on a shared file system all nodes can access, " "then the checkpoint files can be copied more efficiently when FileM is used." " [Default = disabled]", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_sstore_stage_global_is_shared); if (0 > ret) { return ret; } ret = mca_base_var_register_synonym(ret, "orte", "snapc", "base", "global_shared", MCA_BASE_VAR_SYN_FLAG_DEPRECATED); if (0 > ret) { return ret; } /* * Debugging option to skip the filem step * Warning: Will not produce a usable global snapshot */ orte_sstore_stage_skip_filem = false; ret = mca_base_component_var_register(component, "skip_filem", "Not for general use! For debugging only! " "Pretend to move files. 
[Default = disabled]", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_sstore_stage_skip_filem); if (0 > ret) { return ret; } ret = mca_base_var_register_synonym(ret, "orte", "snapc", "base","skip_filem", MCA_BASE_VAR_SYN_FLAG_DEPRECATED); if (0 > ret) { return ret; } /* * Maintain a local cache of checkpoints taken, so that automatic recovery * does not require a transfer from central storage. */ orte_sstore_stage_enabled_caching = false; ret = mca_base_component_var_register(component, "caching", "Maintain a node local cache of last checkpoint. [Default = disabled]", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_sstore_stage_enabled_caching); if (0 > ret) { return ret; } /* * Compress checkpoints before/after transfer */ orte_sstore_stage_enabled_compression = false; ret = mca_base_component_var_register(component, "compress", "Compress local snapshots. [Default = disabled]", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_sstore_stage_enabled_compression); if (0 > ret) { return ret; } /* * Number of seconds to delay the start of compression when sync'ing */ orte_sstore_stage_compress_delay = 0; ret = mca_base_component_var_register(component, "compress_delay", "Seconds to delay the start of compression on sync() " " [Default = 0]", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_sstore_stage_compress_delay); if (0 > ret) { return ret; } /* * A progress meter */ orte_sstore_stage_progress_meter = 0; ret = mca_base_component_var_register(component, "progress_meter", "Display Progress every X percentage done. 
[Default = 0/off]", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_sstore_stage_progress_meter); if (0 > ret) { return ret; } orte_sstore_stage_progress_meter = (orte_sstore_stage_progress_meter % 101); /* * Priority */ mca_sstore_stage_component.super.priority = 10; ret = mca_base_component_var_register(component, "priority", "Priority of the SSTORE stage component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_sstore_stage_component.super.priority); if (0 > ret) { return ret; } /* * Verbose Level */ ret = mca_base_component_var_register(component, "verbose", "Verbose level for the SSTORE stage component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_sstore_stage_component.super.verbose); if (0 > ret) { return ret; } return ORTE_SUCCESS; }
int opal_cr_init(void ) { int ret, exit_status = OPAL_SUCCESS; opal_cr_coord_callback_fn_t prev_coord_func; int val, t; if( ++opal_cr_initalized != 1 ) { if( opal_cr_initalized < 1 ) { exit_status = OPAL_ERROR; goto cleanup; } exit_status = OPAL_SUCCESS; goto cleanup; } /* * Some startup MCA parameters */ ret = mca_base_param_reg_int_name("opal_cr", "verbose", "Verbose output level for the runtime OPAL Checkpoint/Restart functionality", false, false, 0, &val); if(0 != val) { opal_cr_output = opal_output_open(NULL); } else { opal_cr_output = -1; } opal_output_set_verbosity(opal_cr_output, val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Verbose Level: %d", val); mca_base_param_reg_int_name("ft", "cr_enabled", "Enable fault tolerance for this program", false, false, 0, &val); opal_cr_set_enabled(OPAL_INT_TO_BOOL(val)); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Enabled: %d", val); mca_base_param_reg_int_name("opal_cr", "enable_timer", "Enable Checkpoint timer (Default: Disabled)", false, false, 0, &val); opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val); mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier", "Enable Checkpoint timer Barrier (Default: Disabled)", false, false, 0, &val); if( opal_cr_timing_enabled ) { opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val); } else { opal_cr_timing_barrier_enabled = false; } mca_base_param_reg_int_name("opal_cr", "timer_target_rank", "Target Rank for the timer (Default: 0)", false, false, 0, &val); opal_cr_timing_target_rank = val; #if OPAL_ENABLE_FT_THREAD == 1 mca_base_param_reg_int_name("opal_cr", "use_thread", "Use an async thread to checkpoint this program (Default: Disabled)", false, false, 0, &val); opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Use thread: %d", val); mca_base_param_reg_int_name("opal_cr", "thread_sleep_check", "Time to sleep between checking for a checkpoint (Default: 0)", false, false, 0, &val); 
opal_cr_thread_sleep_check = val; mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait", "Time to sleep waiting for process to exit MPI library (Default: 1000)", false, false, 1000, &val); opal_cr_thread_sleep_wait = val; opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT thread sleep: check = %d, wait = %d", opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait); #endif mca_base_param_reg_int_name("opal_cr", "is_tool", "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.", false, false, 0, &val); opal_cr_is_tool = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Is a tool program: %d", val); #if OPAL_ENABLE_CRDEBUG == 1 mca_base_param_reg_int_name("opal_cr", "enable_crdebug", "Enable checkpoint/restart debugging", false, false, 0, &val); MPIR_debug_with_checkpoint = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s]\n", (MPIR_debug_with_checkpoint ? "True": "False")); #endif #ifndef __WINDOWS__ mca_base_param_reg_int_name("opal_cr", "signal", "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program", false, false, SIGUSR1, &opal_cr_entry_point_signal); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal: %d", opal_cr_entry_point_signal); mca_base_param_reg_int_name("opal_cr", "debug_sigpipe", "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)", false, false, 0, &val); opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Debug SIGPIPE: %d (%s)", val, (opal_cr_debug_sigpipe ? "True" : "False")); #if OPAL_ENABLE_FT_THREAD == 1 /* If we have a thread, then attach the SIGPIPE signal handler there since * it is most likely to be the one that needs it. 
*/ if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #else if( opal_cr_debug_sigpipe ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #endif #else opal_cr_is_tool = true; /* no support for CR on Windows yet */ #endif /* __WINDOWS__ */ #if OPAL_ENABLE_CRDEBUG == 1 opal_cr_debug_num_free_threads = 3; opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads ); for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) { opal_cr_debug_free_threads[t] = NULL; } mca_base_param_reg_int_name("opal_cr", "crdebug_signal", "Checkpoint/Restart signal used to hold threads when debugging", false, false, SIGTSTP, &opal_cr_debug_signal); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal (Debug): %d", opal_cr_debug_signal); if( SIG_ERR == signal(opal_cr_debug_signal, MPIR_checkpoint_debugger_signal_handler) ) { opal_output(opal_cr_output, "opal_cr: init: Failed to register C/R debug signal (%d)", opal_cr_debug_signal); } #else /* Silence a compiler warning */ t = 0; #endif mca_base_param_reg_string_name("opal_cr", "tmp_dir", "Temporary directory to place rendezvous files for a checkpoint", false, false, opal_tmp_directory(), &opal_cr_pipe_dir); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Temp Directory: %s", opal_cr_pipe_dir); if( !opal_cr_is_tool ) { /* Register the OPAL interlevel coordination callback */ opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func); opal_cr_stall_check = false; opal_cr_currently_stalled = false; } /* End opal_cr_is_tool = true */ /* * If fault tolerance was not compiled in then * we need to make sure that the listener thread is active to tell * the tools that this is not a checkpointable job. * We don't need the CRS framework to be initalized. 
*/ #if OPAL_ENABLE_FT_CR == 1 /* * Open the checkpoint / restart service components */ if (OPAL_SUCCESS != (ret = opal_crs_base_open())) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_open", ret ); exit_status = ret; goto cleanup; } if (OPAL_SUCCESS != (ret = opal_crs_base_select())) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_select", ret ); exit_status = ret; goto cleanup; } #endif #if OPAL_ENABLE_FT_THREAD == 1 if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) { opal_output_verbose(10, opal_cr_output, "opal_cr: init: starting the thread\n"); /* JJH: We really do need this line below since it enables * actual locks for threads. However currently the * upper layers will deadlock if it is enabled. * So hack around the problem for now, while working * on a complete solution. See ticket #2741 for more * details. * opal_set_using_threads(true); */ /* * Start the thread */ OBJ_CONSTRUCT(&opal_cr_thread, opal_thread_t); OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t); opal_cr_thread_is_done = false; opal_cr_thread_is_active = false; opal_cr_thread_in_library = false; opal_cr_thread_num_in_library = 0; opal_cr_thread.t_run = opal_cr_thread_fn; opal_cr_thread.t_arg = NULL; opal_thread_start(&opal_cr_thread); } /* End opal_cr_is_tool = true */ else { opal_output_verbose(10, opal_cr_output, "opal_cr: init: *Not* Using C/R thread\n"); } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ cleanup: return exit_status; }