/*
 * Fill in a BLCR snapshot reference from the metadata stored with the
 * local snapshot ("cold start").
 *
 * Steps:
 *   1. Extract the component name and previous PID recorded in the metadata.
 *   2. Record the component name on the snapshot and verify it matches this
 *      component.
 *   3. Read the CRS_METADATA_CONTEXT token and build
 *      snapshot->context_filename as "<local_location>/<token>".
 *   4. Clear snapshot->super.cold_start.
 *
 * @param snapshot  Snapshot reference to populate.
 *
 * @return OPAL_SUCCESS on success, the extract helper's error code if the
 *         metadata cannot be extracted, OPAL_ERROR on any other failure.
 */
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char *component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: cold_start(%s)",
                        snapshot->super.reference_name);

    /*
     * Find the snapshot directory, read the metadata file
     *
     * NOTE(review): local_location is logged with %s below, i.e. it is a
     * string; confirm the metadata helpers linked with this component take a
     * path rather than a FILE *.
     */
    ret = opal_crs_base_extract_expected_component(snapshot->super.local_location,
                                                   &component_name, &prev_pid);
    if (OPAL_SUCCESS == ret && NULL == component_name) {
        /* "Success" without a name is unusable; never hand NULL to strdup(). */
        ret = OPAL_ERROR;
    }
    if (OPAL_SUCCESS != ret) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.local_location, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* The snapshot keeps its own copy; the local one is released in cleanup. */
    snapshot->super.component_name = strdup(component_name);
    if (NULL == snapshot->super.component_name) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Unable to copy the component name (%s)",
                    component_name);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Compare the component strings to make sure this is our snapshot before
     * going further.
     * NOTE(review): only strlen(component_name) characters are compared, so
     * a recorded name that is a prefix of ours (including "") is accepted.
     */
    if (0 != strncmp(mca_crs_blcr_component.super.base_version.mca_component_name,
                     component_name, strlen(component_name))) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name,
                    mca_crs_blcr_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Context Filename
     */
    opal_crs_base_metadata_read_token(snapshot->super.local_location,
                                      CRS_METADATA_CONTEXT, &tmp_argv);
    if (NULL == tmp_argv || NULL == tmp_argv[0]) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.local_location);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if (0 > asprintf(&snapshot->context_filename, "%s/%s",
                     snapshot->super.local_location, tmp_argv[0])) {
        /* asprintf() leaves the target undefined on failure; do not keep it. */
        snapshot->context_filename = NULL;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if (NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }
    /*
     * Assumes the extract helper returns a malloc-family allocation owned by
     * the caller (the strdup()-based implementation does) -- confirm.
     */
    free(component_name);
    component_name = NULL;

    return exit_status;
}
int opal_crs_base_extract_expected_component(FILE *metadata, char ** component_name, int *prev_pid) { int exit_status = OPAL_SUCCESS; char **pid_argv = NULL; char **name_argv = NULL; /* Dummy check */ if( NULL == metadata ) { exit_status = OPAL_ERROR; goto cleanup; } opal_crs_base_metadata_read_token(metadata, CRS_METADATA_PID, &pid_argv); if( NULL != pid_argv && NULL != pid_argv[0] ) { *prev_pid = atoi(pid_argv[0]); } else { opal_output(0, "Error: expected_component: PID information unavailable!"); exit_status = OPAL_ERROR; goto cleanup; } opal_crs_base_metadata_read_token(metadata, CRS_METADATA_COMP, &name_argv); if( NULL != name_argv && NULL != name_argv[0] ) { *component_name = strdup(name_argv[0]); } else { opal_output(0, "Error: expected_component: Component Name information unavailable!"); exit_status = OPAL_ERROR; goto cleanup; } cleanup: if( NULL != pid_argv ) { opal_argv_free(pid_argv); pid_argv = NULL; } if( NULL != name_argv ) { opal_argv_free(name_argv); name_argv = NULL; } return exit_status; }
/*
 * Fill in a SELF snapshot reference from its metadata file ("cold start").
 *
 * Steps:
 *   1. Open the metadata file for reading if the snapshot has no open
 *      stream yet. The stream is left open on return.
 *   2. Extract the component name and previous PID recorded in the metadata.
 *   3. Record the component name on the snapshot and verify it matches this
 *      component.
 *   4. Read the CRS_METADATA_CONTEXT token and store a copy of it in
 *      snapshot->cmd_line.
 *   5. Clear snapshot->super.cold_start.
 *
 * @param snapshot  Snapshot reference to populate.
 *
 * @return OPAL_SUCCESS on success, the extract helper's error code if the
 *         metadata cannot be extracted, OPAL_ERROR on any other failure.
 */
static int self_cold_start(opal_crs_self_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char *component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: cold_start()");

    /*
     * Find the snapshot directory, read the metadata file.
     *
     * The stream is only read here, so it must be opened in a readable mode:
     * an "a" stream is write-only and every token lookup on it would fail.
     */
    if (NULL == snapshot->super.metadata) {
        snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "r");
        if (NULL == snapshot->super.metadata) {
            opal_output(mca_crs_self_component.super.output_handle,
                        "crs:self: self_cold_start: Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

    ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                   &component_name, &prev_pid);
    if (OPAL_SUCCESS == ret && NULL == component_name) {
        /* "Success" without a name is unusable; never hand NULL to strdup(). */
        ret = OPAL_ERROR;
    }
    if (OPAL_SUCCESS != ret) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* The snapshot keeps its own copy; the local one is released in cleanup. */
    snapshot->super.component_name = strdup(component_name);
    if (NULL == snapshot->super.component_name) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Unable to copy the component name (%s)",
                    component_name);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Compare the strings to make sure this is our snapshot before going
     * further.
     * NOTE(review): only strlen(component_name) characters are compared, so
     * a recorded name that is a prefix of ours (including "") is accepted.
     */
    if (0 != strncmp(mca_crs_self_component.super.base_version.mca_component_name,
                     component_name, strlen(component_name))) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name,
                    mca_crs_self_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Restart command
     * JJH: Command lines limited to 256 chars.
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata,
                                      CRS_METADATA_CONTEXT, &tmp_argv);
    if (NULL == tmp_argv || NULL == tmp_argv[0]) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if (0 > asprintf(&snapshot->cmd_line, "%s", tmp_argv[0])) {
        /* asprintf() leaves the target undefined on failure; do not keep it. */
        snapshot->cmd_line = NULL;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if (NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }
    /* The extract helper returns a heap copy owned by the caller. */
    free(component_name);
    component_name = NULL;

    return exit_status;
}
/*
 * Restart for the "none" CRS component: re-execute the command line
 * recorded in the snapshot metadata.
 *
 * The first CRS_METADATA_CONTEXT token is split on spaces and passed to
 * execvp(). On a successful exec this function does not return.
 *
 * @param base_snapshot  Snapshot whose metadata file holds the command line.
 *                       If no metadata stream is open, the file is opened
 *                       for reading. The stream is closed and reset to NULL
 *                       on every path that gets past the open.
 * @param spawn_child    Must be false; spawning a child is not implemented.
 * @param child_pid      [out] Always set to the calling process's PID.
 *
 * @return OPAL_SUCCESS if the token list is empty (nothing to exec);
 *         OPAL_ERROR if the metadata cannot be opened, the token cannot be
 *         read, or the command line cannot be split;
 *         OPAL_ERR_NOT_IMPLEMENTED if spawn_child is true;
 *         execvp()'s return value if the exec fails.
 */
int opal_crs_none_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    int exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char **cr_argv = NULL;
    int status;

    *child_pid = getpid();

    /*
     * The metadata is only read here, so open it readable: an "a" stream is
     * write-only and the token lookup below could never succeed on it.
     */
    if (NULL == base_snapshot->metadata) {
        base_snapshot->metadata = fopen(base_snapshot->metadata_filename, "r");
        if (NULL == base_snapshot->metadata) {
            opal_output(0,
                        "crs:none: none_restart: Error: Unable to open the file (%s)",
                        base_snapshot->metadata_filename);
            return OPAL_ERROR;
        }
    }

    opal_crs_base_metadata_read_token(base_snapshot->metadata, CRS_METADATA_CONTEXT, &tmp_argv);

    if (NULL == tmp_argv) {
        opal_output(opal_crs_base_framework.framework_output,
                    "crs:none: none_restart: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, base_snapshot->metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if (opal_argv_count(tmp_argv) <= 0) {
        opal_output_verbose(10, opal_crs_base_framework.framework_output,
                            "crs:none: none_restart: No command line to exec, so just returning");
        exit_status = OPAL_SUCCESS;
        goto cleanup;
    }

    if (NULL == (cr_argv = opal_argv_split(tmp_argv[0], ' '))) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if (!spawn_child) {
        opal_output_verbose(10, opal_crs_base_framework.framework_output,
                            "crs:none: none_restart: exec :(%s, %s):",
                            cr_argv[0], tmp_argv[0]);

        /* Only returns if the exec failed. */
        status = execvp(cr_argv[0], cr_argv);

        if (status < 0) {
            opal_output(opal_crs_base_framework.framework_output,
                        "crs:none: none_restart: Child failed to execute :(%d):",
                        status);
        }
        opal_output(opal_crs_base_framework.framework_output,
                    "crs:none: none_restart: execvp returned %d",
                    status);
        exit_status = status;
        goto cleanup;
    } else {
        opal_output(opal_crs_base_framework.framework_output,
                    "crs:none: none_restart: Spawn not implemented");
        exit_status = OPAL_ERR_NOT_IMPLEMENTED;
        goto cleanup;
    }

 cleanup:
    if (NULL != cr_argv) {
        opal_argv_free(cr_argv);
        cr_argv = NULL;
    }
    if (NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    /*
     * Every path reaching this label has an open stream. Reset the handle
     * after closing so a later call reopens the file instead of using a
     * closed FILE *.
     */
    fclose(base_snapshot->metadata);
    base_snapshot->metadata = NULL;

    return exit_status;
}
/*
 * Prepare the environment and filesystem for a restarted process.
 *
 *   - Dumps every environment entry matching "OMPI_" into the file
 *     "<opal_tmp_directory()>/<OPAL_CR_BASE_ENV_NAME>-<prev_pid>".
 *   - Opens the snapshot metadata for reading and, for each
 *     CRS_METADATA_MKDIR token, runs "mkdir -p <token>".
 *   - For each CRS_METADATA_TOUCH token, runs "touch <token>".
 *   - Runs "sync ; sync" after each group that had at least one entry.
 *
 * Only a negative system() return aborts the sequence; a non-zero exit
 * status from the command itself is not treated as an error.
 *
 * NOTE(review): the commands are assembled by string formatting and run
 * through the shell, so tokens are subject to shell interpretation --
 * confirm the metadata contents are trusted.
 *
 * @param prev_pid  PID of the checkpointed process; must not be negative.
 * @param snapshot  Snapshot whose metadata file is consulted. Its metadata
 *                  stream, if open when the function finishes, is closed and
 *                  reset to NULL.
 *
 * @return OPAL_SUCCESS on success, OPAL_ERROR for a negative PID or an
 *         unreadable metadata file, or the negative value returned by
 *         system().
 */
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot)
{
    int rc;
    int result = OPAL_SUCCESS;
    char *shell_cmd = NULL;
    char *env_file = NULL;
    char **touch_list = NULL;
    char **mkdir_list = NULL;
    int count;
    int idx;

    if (prev_pid < 0) {
        opal_output(opal_restart_globals.output,
                    "Invalid PID (%d)\n",
                    prev_pid);
        result = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Save the current OMPI_ environment so it can be handed to the
     * restarted application process.
     */
    opal_asprintf(&env_file, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
    opal_asprintf(&shell_cmd, "env | grep OMPI_ > %s", env_file);

    opal_output_verbose(5, opal_restart_globals.output,
                        "post_env_vars: Execute: <%s>", shell_cmd);
    rc = system(shell_cmd);
    if (rc < 0) {
        result = rc;
        goto cleanup;
    }

    /*
     * Directories recorded in the metadata that must exist
     */
    snapshot->metadata = fopen(snapshot->metadata_filename, "r");
    if (NULL == snapshot->metadata) {
        opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                       opal_restart_globals.snapshot_metadata,
                       snapshot->metadata_filename);
        result = OPAL_ERROR;
        goto cleanup;
    }

    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &mkdir_list);
    count = opal_argv_count(mkdir_list);

    idx = 0;
    while (idx < count) {
        /* Drop the previous command string before building the next one. */
        free(shell_cmd);
        shell_cmd = NULL;

        opal_asprintf(&shell_cmd, "mkdir -p %s", mkdir_list[idx]);
        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", shell_cmd);
        rc = system(shell_cmd);
        if (rc < 0) {
            result = rc;
            goto cleanup;
        }
        ++idx;
    }
    if (count > 0) {
        system("sync ; sync");
    }

    /*
     * Files recorded in the metadata that must exist
     */
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &touch_list);
    count = opal_argv_count(touch_list);

    idx = 0;
    while (idx < count) {
        free(shell_cmd);
        shell_cmd = NULL;

        opal_asprintf(&shell_cmd, "touch %s", touch_list[idx]);
        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", shell_cmd);
        rc = system(shell_cmd);
        if (rc < 0) {
            result = rc;
            goto cleanup;
        }
        ++idx;
    }
    if (count > 0) {
        system("sync ; sync");
    }

 cleanup:
    free(shell_cmd);
    shell_cmd = NULL;

    free(env_file);
    env_file = NULL;

    if (NULL != mkdir_list) {
        opal_argv_free(mkdir_list);
        mkdir_list = NULL;
    }
    if (NULL != touch_list) {
        opal_argv_free(touch_list);
        touch_list = NULL;
    }

    if (NULL != snapshot->metadata) {
        fclose(snapshot->metadata);
        snapshot->metadata = NULL;
    }

    return result;
}