/**
 * Prepare a BLCR snapshot that is being restarted "cold", i.e. with only the
 * on-disk snapshot available.
 *
 * Reads the expected CRS component name from the snapshot's local location,
 * records a copy of it in snapshot->super.component_name, verifies that it is
 * this (blcr) component, then builds snapshot->context_filename from the
 * CRS_METADATA_CONTEXT token. On success the cold_start flag is cleared.
 *
 * @param snapshot  Snapshot to fill in; super.local_location and
 *                  super.reference_name are read.
 *
 * @return OPAL_SUCCESS on success, the error returned by the metadata
 *         extraction helper if that fails, OPAL_ERROR otherwise.
 */
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char *component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: cold_start(%s)",
                        snapshot->super.reference_name);

    /*
     * Find the snapshot directory, read the metadata file
     */
    ret = opal_crs_base_extract_expected_component(snapshot->super.local_location,
                                                   &component_name, &prev_pid);
    if (OPAL_SUCCESS != ret) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.local_location, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* Guard against a "successful" extraction that produced no name:
     * strdup()/strcmp() on NULL would be undefined behavior. */
    if (NULL == component_name) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: No component name found in the local snapshot (%s).",
                    snapshot->super.local_location);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* NOTE(review): component_name itself is never released here; if the
     * extraction helper hands ownership to the caller this is a leak --
     * confirm against the helper's contract. */
    snapshot->super.component_name = strdup(component_name);
    if (NULL == snapshot->super.component_name) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to copy the component name (%s).",
                    component_name);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Compare the component strings to make sure this is our snapshot before
     * going further. An exact comparison is required: a length-limited compare
     * bounded by strlen(component_name) would accept an empty or truncated
     * name as a match. */
    if (0 != strcmp(mca_crs_blcr_component.super.base_version.mca_component_name,
                    component_name)) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name,
                    mca_crs_blcr_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Context Filename
     */
    opal_crs_base_metadata_read_token(snapshot->super.local_location,
                                      CRS_METADATA_CONTEXT, &tmp_argv);
    /* An empty argv (first entry NULL) is as useless as no argv at all. */
    if (NULL == tmp_argv || NULL == tmp_argv[0]) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.local_location);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if (0 > asprintf(&snapshot->context_filename, "%s/%s",
                     snapshot->super.local_location, tmp_argv[0])) {
        /* On failure asprintf() leaves the pointer undefined; make it safe. */
        snapshot->context_filename = NULL;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to build the context filename for the local checkpoint in %s",
                    snapshot->super.local_location);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if (NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    return exit_status;
}
/**
 * Prepare a "self" snapshot that is being restarted cold, i.e. with only the
 * on-disk snapshot available.
 *
 * Opens the snapshot metadata file if it is not already open, reads the
 * expected CRS component name from it, records a copy in
 * snapshot->super.component_name, verifies that it is this (self) component,
 * then stores the CRS_METADATA_CONTEXT token in snapshot->cmd_line. On success
 * the cold_start flag is cleared.
 *
 * @param snapshot  Snapshot to fill in; super.metadata, super.metadata_filename
 *                  and super.snapshot_directory are read.
 *
 * @return OPAL_SUCCESS on success, the error returned by the metadata
 *         extraction helper if that fails, OPAL_ERROR otherwise.
 */
static int self_cold_start(opal_crs_self_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char *component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: cold_start()");

    /*
     * Find the snapshot directory, read the metadata file
     */
    if (NULL == snapshot->super.metadata) {
        /* The stream is read below, so a write-only "a" stream cannot work.
         * "a+" keeps the previous create/append behavior for any later writer
         * and additionally permits reading. */
        snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a+");
        if (NULL == snapshot->super.metadata) {
            opal_output(mca_crs_self_component.super.output_handle,
                        "crs:self: self_cold_start: Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        /* The initial read position of an "a+" stream differs between
         * platforms; start reading from the beginning of the file. */
        rewind(snapshot->super.metadata);
    }

    ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                   &component_name, &prev_pid);
    if (OPAL_SUCCESS != ret) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* Guard against a "successful" extraction that produced no name:
     * strdup()/strcmp() on NULL would be undefined behavior. */
    if (NULL == component_name) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: No component name found in the local snapshot (%s).",
                    snapshot->super.metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* NOTE(review): component_name itself is never released here; if the
     * extraction helper hands ownership to the caller this is a leak --
     * confirm against the helper's contract. */
    snapshot->super.component_name = strdup(component_name);
    if (NULL == snapshot->super.component_name) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to copy the component name (%s).",
                    component_name);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Compare the strings to make sure this is our snapshot before going
     * further. An exact comparison is required: a length-limited compare
     * bounded by strlen(component_name) would accept an empty or truncated
     * name as a match. */
    if (0 != strcmp(mca_crs_self_component.super.base_version.mca_component_name,
                    component_name)) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name,
                    mca_crs_self_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Restart command
     * JJH: Command lines limited to 256 chars.
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata,
                                      CRS_METADATA_CONTEXT, &tmp_argv);
    /* An empty argv (first entry NULL) is as useless as no argv at all. */
    if (NULL == tmp_argv || NULL == tmp_argv[0]) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if (0 > asprintf(&snapshot->cmd_line, "%s", tmp_argv[0])) {
        /* On failure asprintf() leaves the pointer undefined; make it safe. */
        snapshot->cmd_line = NULL;
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to copy the restart command from the local checkpoint in %s",
                    snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if (NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    return exit_status;
}
int main(int argc, char *argv[]) { int ret, exit_status = OPAL_SUCCESS; int child_pid; int prev_pid = 0; int idx; opal_crs_base_snapshot_t *snapshot = NULL; char * tmp_env_var = NULL; bool select = false; /*************** * Initialize ***************/ if (OPAL_SUCCESS != (ret = initialize(argc, argv))) { exit_status = ret; goto cleanup; } /* * Check for existence of the file, or program in the case of self */ if( OPAL_SUCCESS != (ret = check_file() )) { opal_show_help("help-opal-restart.txt", "invalid_filename", true, opal_restart_globals.snapshot_ref); exit_status = ret; goto cleanup; } /* Re-enable the selection of the CRS component, so we can choose the right one */ idx = mca_base_var_find(NULL, "crs", "base", "do_not_select"); if (0 > idx) { opal_output(opal_restart_globals.output, "MCA variable opal_crs_base_do_not_select not found\n"); exit_status = OPAL_ERROR; goto cleanup; } ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL); if (OPAL_SUCCESS != ret) { exit_status = ret; goto cleanup; } /* * Make sure we are using the correct checkpointer */ if(NULL == expected_crs_comp) { char * full_metadata_path = NULL; FILE * metadata = NULL; opal_asprintf(&full_metadata_path, "%s/%s/%s", opal_restart_globals.snapshot_loc, opal_restart_globals.snapshot_ref, opal_restart_globals.snapshot_metadata); if( NULL == (metadata = fopen(full_metadata_path, "r")) ) { opal_show_help("help-opal-restart.txt", "invalid_metadata", true, opal_restart_globals.snapshot_metadata, full_metadata_path); exit_status = OPAL_ERROR; goto cleanup; } if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata, &expected_crs_comp, &prev_pid)) ) { opal_show_help("help-opal-restart.txt", "invalid_metadata", true, opal_restart_globals.snapshot_metadata, full_metadata_path); exit_status = ret; goto cleanup; } free(full_metadata_path); full_metadata_path = NULL; fclose(metadata); metadata = NULL; } opal_output_verbose(10, opal_restart_globals.output, 
"Restart Expects checkpointer: (%s)", expected_crs_comp); (void) mca_base_var_env_name("crs", &tmp_env_var); opal_setenv(tmp_env_var, expected_crs_comp, true, &environ); free(tmp_env_var); tmp_env_var = NULL; /* Select this component or don't continue. * If the selection of this component fails, then we can't * restart on this node because it doesn't have the proper checkpointer * available. */ if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) { opal_show_help("help-opal-restart.txt", "comp_select_failure", true, "crs", ret); exit_status = ret; goto cleanup; } if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) { opal_show_help("help-opal-restart.txt", "comp_select_failure", true, expected_crs_comp, ret); exit_status = ret; goto cleanup; } /* * Make sure we have selected the proper component */ if(NULL == expected_crs_comp || 0 != strncmp(expected_crs_comp, opal_crs_base_selected_component.base_version.mca_component_name, strlen(expected_crs_comp)) ) { opal_show_help("help-opal-restart.txt", "comp_select_mismatch", true, expected_crs_comp, opal_crs_base_selected_component.base_version.mca_component_name, ret); exit_status = ret; goto cleanup; } /****************************** * Restart in this process ******************************/ opal_output_verbose(10, opal_restart_globals.output, "Restarting from file (%s)\n", opal_restart_globals.snapshot_ref); snapshot = OBJ_NEW(opal_crs_base_snapshot_t); snapshot->cold_start = true; opal_asprintf(&(snapshot->snapshot_directory), "%s/%s", opal_restart_globals.snapshot_loc, opal_restart_globals.snapshot_ref); opal_asprintf(&(snapshot->metadata_filename), "%s/%s", snapshot->snapshot_directory, opal_restart_globals.snapshot_metadata); /* Since some checkpoint/restart systems don't pass along env vars to the * restarted app, we need to take care of that. * * Included here is the creation of any files or directories that need to be * created before the process is restarted. 
*/ if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) { exit_status = ret; goto cleanup; } /* * Do the actual restart */ ret = opal_crs.crs_restart(snapshot, false, &child_pid); if (OPAL_SUCCESS != ret) { opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true, opal_restart_globals.snapshot_ref, ret, opal_crs_base_selected_component.base_version.mca_component_name); exit_status = ret; goto cleanup; } /* Should never get here, since crs_restart calls exec */ /*************** * Cleanup ***************/ cleanup: if (OPAL_SUCCESS != (ret = finalize())) { return ret; } if(NULL != snapshot ) OBJ_DESTRUCT(snapshot); return exit_status; }