int opal_cr_refresh_environ(int prev_pid) { char *file_name; #if OPAL_ENABLE_CRDEBUG == 1 char *tmp; #endif struct stat file_status; if( 0 >= prev_pid ) { prev_pid = getpid(); } /* * Make sure the file exists. If it doesn't then this means 2 things: * 1) We have already executed this function, and * 2) The file has been deleted on the previous round. */ asprintf(&file_name, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid); if (NULL == file_name) { return OPAL_ERR_OUT_OF_RESOURCE; } if(0 != stat(file_name, &file_status) ){ free(file_name); return OPAL_SUCCESS; } #if OPAL_ENABLE_CRDEBUG == 1 mca_base_var_env_name ("opal_cr_enable_crdebug", &tmp); opal_unsetenv(tmp, &environ); free (tmp); #endif extract_env_vars(prev_pid, file_name); #if OPAL_ENABLE_CRDEBUG == 1 MPIR_debug_with_checkpoint = 0; (void) mca_base_var_register ("opal", "opal", "cr", "enable_crdebug", "Enable checkpoint/restart debugging", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ, &MPIR_debug_with_checkpoint); opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s] (refresh)\n", (MPIR_debug_with_checkpoint ? "True": "False")); #endif free(file_name); return OPAL_SUCCESS; }
int opal_cr_refresh_environ(int prev_pid) { int val; char *file_name = NULL; struct stat file_status; if( 0 >= prev_pid ) { prev_pid = getpid(); } /* * Make sure the file exists. If it doesn't then this means 2 things: * 1) We have already executed this function, and * 2) The file has been deleted on the previous round. */ asprintf(&file_name, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid); if(0 != stat(file_name, &file_status) ){ return OPAL_SUCCESS; } #if OPAL_ENABLE_CRDEBUG == 1 opal_unsetenv(mca_base_param_env_var("opal_cr_enable_crdebug"), &environ); #endif extract_env_vars(prev_pid, file_name); #if OPAL_ENABLE_CRDEBUG == 1 mca_base_param_reg_int_name("opal_cr", "enable_crdebug", "Enable checkpoint/restart debugging", false, false, 0, &val); MPIR_debug_with_checkpoint = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s] (refresh)\n", (MPIR_debug_with_checkpoint ? "True": "False")); #else val = 0; /* Silence Compiler warning */ #endif if( NULL != file_name ){ free(file_name); file_name = NULL; } return OPAL_SUCCESS; }
/******************************* * Notification Routines *******************************/ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, int *state) { int ret, exit_status = OPAL_SUCCESS; int prev_pid = 0; prev_pid = getpid(); /* * Use the registered coordination routine */ if(OPAL_SUCCESS != (ret = cur_coord_callback(OPAL_CRS_CHECKPOINT)) ) { if ( OPAL_EXISTS != ret ) { opal_output(opal_cr_output, "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n", OPAL_CRS_CHECKPOINT, ret); } exit_status = ret; goto cleanup; } /* * Take the checkpoint */ OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0); if(OPAL_SUCCESS != (ret = opal_crs.crs_checkpoint(pid, snapshot, (opal_crs_state_type_t *)state))) { opal_output(opal_cr_output, "opal_cr: inc_core: Error: The checkpoint failed. %d\n", ret); exit_status = ret; /* Don't return here since we want to restart the OPAL level stuff */ } if(*state == OPAL_CRS_CONTINUE) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1); if(term) { *state = OPAL_CRS_TERM; opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM; } else { opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE; } } else { term = false; } /* * If restarting read environment stuff that opal-restart left us. */ if(*state == OPAL_CRS_RESTART) { extract_env_vars(prev_pid); opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE; } /* * Use the registered coordination routine */ if(OPAL_SUCCESS != (ret = cur_coord_callback(*state)) ) { if ( OPAL_EXISTS != ret ) { opal_output(opal_cr_output, "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n", *state, ret); } exit_status = ret; goto cleanup; } cleanup: return exit_status; }