/*
 * Handle a migration status update delivered to this tool.
 *
 * Unpacks one OPAL_INT (the migration state) from 'buffer' and publishes it
 * in orte_migrate_ckpt_status.  Help text is shown for the "not
 * checkpointable", "already in progress" and "error" states.  Any other
 * state except FINISH is pretty-printed when status output was requested.
 *
 * sender - name of the process that sent the update (not used here)
 * buffer - packed update message
 *
 * A message that cannot be unpacked is handled as a migration error: the
 * state is set to ORTE_ERRMGR_MIGRATE_STATE_ERROR and the generic error help
 * is shown.  Previously the unpack result was dropped, so the published
 * state never changed and main()'s wait loop (which only leaves on FINISH,
 * ERROR or ERR_INPROGRESS) kept polling without any diagnostic.
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    int ret;
    orte_std_cntr_t count = 1;
    int ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;

    (void) sender;

    /*
     * Receive the data:
     * - ckpt_state
     */
    ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT);
    if (ORTE_SUCCESS != ret) {
        /* Undecodable update: surface it as an error instead of ignoring it */
        orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_ERROR;
        opal_show_help("help-orte-migrate.txt", "err-other", true,
                       orte_migrate_globals.pid);
        goto cleanup;
    }
    orte_migrate_ckpt_status = ckpt_status;

    /*
     * If the job is not able to be migrated, then return.
     *
     * NOTE(review): this constant comes from the SNAPC namespace while every
     * other check here uses ORTE_ERRMGR_MIGRATE_STATE_*; confirm the waiting
     * code treats this state as terminal.
     */
    if (ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_migrate_ckpt_status) {
        opal_show_help("help-orte-migrate.txt", "non-ckptable", true,
                       orte_migrate_globals.pid);
        goto cleanup;
    }

    /*
     * If a migration is already in progress, then we must tell the user to
     * try again later.
     */
    if (ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS == orte_migrate_ckpt_status) {
        opal_show_help("help-orte-migrate.txt", "err-inprogress", true,
                       orte_migrate_globals.pid);
        goto cleanup;
    }

    /*
     * If there was an error, display a message and exit
     */
    if (ORTE_ERRMGR_MIGRATE_STATE_ERROR == orte_migrate_ckpt_status) {
        opal_show_help("help-orte-migrate.txt", "err-other", true,
                       orte_migrate_globals.pid);
        goto cleanup;
    }

    /*
     * If we are to display the status progression.  The FINISH state is
     * skipped here; main() prints it once its wait loop ends.
     */
    if (orte_migrate_globals.status &&
        ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status) {
        pretty_print_status();
    }

 cleanup:
    return;
}
/*
 * Handle a checkpoint status update delivered to this tool.
 *
 * Message layout, in unpack order:
 *   - checkpoint state (OPAL_INT), always present
 *   - global snapshot handle (OPAL_STRING), only for FINISHED / ERROR
 *   - sequence number (OPAL_INT), only for FINISHED / ERROR
 *
 * The state is published in orte_checkpoint_globals.ckpt_status; the handle
 * and sequence number go to global_snapshot_handle / global_sequence_num.
 *
 * sender - name of the process that sent the update (not used here)
 * buffer - packed update message
 *
 * Any field that fails to unpack turns the update into
 * ORTE_SNAPC_CKPT_STATE_ERROR.  Before, the failure code was stored in a
 * local that nothing read: a failed state unpack left main() polling
 * forever, and a failed handle/sequence unpack after FINISHED let main()
 * go on to print a snapshot reference that was never decoded.  With the
 * ERROR state main()'s wait loop ends and it reports the failure.
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    int ret;
    orte_std_cntr_t count;
    int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;

    (void) sender;

    /* - ckpt_state */
    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT))) {
        goto unpack_failed;
    }
    orte_checkpoint_globals.ckpt_status = ckpt_status;

    if (ORTE_SNAPC_CKPT_STATE_FINISHED == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR    == orte_checkpoint_globals.ckpt_status) {
        /* - global snapshot handle */
        count = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_snapshot_handle, &count, OPAL_STRING))) {
            goto unpack_failed;
        }

        /* - sequence number */
        count = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT))) {
            goto unpack_failed;
        }
    }

    /*
     * If the job is not able to be checkpointed, then return
     */
    if (ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
        orte_show_help("help-orte-checkpoint.txt", "non-ckptable", true,
                       orte_checkpoint_globals.pid);
        goto cleanup;
    }

    /*
     * If we are to display the status progression.  FINISHED is skipped
     * here; main() prints it after its wait loop.
     */
    if (orte_checkpoint_globals.status &&
        ORTE_SNAPC_CKPT_STATE_FINISHED != orte_checkpoint_globals.ckpt_status) {
        pretty_print_status();
    }

 cleanup:
    return;

 unpack_failed:
    /* Overrides a state that may already have been published above */
    orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_ERROR;
    return;
}
int main(int argc, char *argv[]) { int ret, exit_status = ORTE_SUCCESS; /*************** * Initialize ***************/ if (ORTE_SUCCESS != (ret = ckpt_init(argc, argv))) { exit_status = ret; goto cleanup; } /*************************** * Find the HNP that we want to connect to, if it exists ***************************/ if (ORTE_SUCCESS != (ret = find_hnp())) { /* Error printed by called function */ exit_status = ret; goto cleanup; } /******************************* * Checkpoint the requested PID *******************************/ if( orte_checkpoint_globals.verbose ) { opal_output_verbose(10, orte_checkpoint_globals.output, "orte_checkpoint: Checkpointing..."); if (0 < orte_checkpoint_globals.pid) { opal_output_verbose(10, orte_checkpoint_globals.output, "\t PID %d", orte_checkpoint_globals.pid); } else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp){ opal_output_verbose(10, orte_checkpoint_globals.output, "\t Mpirun (%s)", ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp)); } opal_output_verbose(10, orte_checkpoint_globals.output, "\t Connected to Mpirun %s", ORTE_NAME_PRINT(&orterun_hnp->name)); if(orte_checkpoint_globals.term) { opal_output_verbose(10, orte_checkpoint_globals.output, "\t Terminating after checkpoint\n"); } } if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint( orte_checkpoint_globals.term)) ) { orte_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, orte_checkpoint_globals.pid, ret); exit_status = ret; goto cleanup; } /* * Wait for the checkpoint to complete */ if(!orte_checkpoint_globals.nowait) { while( ORTE_SNAPC_CKPT_STATE_FINISHED != orte_checkpoint_globals.ckpt_status && ORTE_SNAPC_CKPT_STATE_NO_CKPT != orte_checkpoint_globals.ckpt_status && ORTE_SNAPC_CKPT_STATE_ERROR != orte_checkpoint_globals.ckpt_status ) { opal_progress(); } } if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status ) { exit_status = ORTE_ERROR; goto cleanup; } if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status 
) { orte_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, orte_checkpoint_globals.pid, ORTE_ERROR); exit_status = ORTE_ERROR; goto cleanup; } if( orte_checkpoint_globals.status ) { orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_FINISHED; pretty_print_status(); } if(!orte_checkpoint_globals.nowait) { pretty_print_reference(); } cleanup: /*************** * Cleanup ***************/ if (ORTE_SUCCESS != (ret = ckpt_finalize())) { return ret; } return exit_status; }
int main(int argc, char *argv[]) { int ret, exit_status = ORTE_SUCCESS; /*************** * Initialize ***************/ if (ORTE_SUCCESS != (ret = tool_init(argc, argv))) { exit_status = ret; goto cleanup; } /*************************** * Find the HNP that we want to connect to, if it exists ***************************/ if( orte_migrate_globals.verbose ) { opal_output_verbose(10, orte_migrate_globals.output, "orte_migrate: Finding HNP..."); } if (ORTE_SUCCESS != (ret = find_hnp())) { opal_show_help("help-orte-migrate.txt", "invalid_pid", true, orte_migrate_globals.pid); exit_status = ret; goto cleanup; } /******************************* * Send migration information to HNP *******************************/ if( orte_migrate_globals.verbose ) { opal_output_verbose(10, orte_migrate_globals.output, "orte_migrate: Sending info to HNP..."); } if (ORTE_SUCCESS != (ret = notify_hnp())) { opal_output(0, "HNP with PID %d Not found!", orte_migrate_globals.pid); exit_status = ret; goto cleanup; } /******************************* * Wait for migration to complete *******************************/ while( ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status && ORTE_ERRMGR_MIGRATE_STATE_ERROR != orte_migrate_ckpt_status && ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS != orte_migrate_ckpt_status) { opal_progress(); } if( orte_migrate_globals.status ) { orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH; pretty_print_status(); } cleanup: /*************** * Cleanup ***************/ if (ORTE_SUCCESS != (ret = tool_finalize())) { return ret; } return exit_status; }
/*
 * Handle a checkpoint status update delivered to this tool.
 *
 * Message layout, in unpack order:
 *   - checkpoint state (OPAL_INT), always present
 *   - global snapshot handle (OPAL_STRING) and sequence number (OPAL_INT),
 *     only for the RECOVERED, ESTABLISHED, STOPPED and ERROR states
 *
 * The state is published in orte_checkpoint_globals.ckpt_status.
 * is_checkpoint_finished is raised on NO_CKPT, ERROR, STOPPED, and once both
 * RECOVERED and ESTABLISHED have been seen (in either order).
 *
 * sender - name of the process that sent the update (not used here)
 * buffer - packed update message
 *
 * A field that fails to unpack is handled like a reported ERROR state: the
 * published state becomes ORTE_SNAPC_CKPT_STATE_ERROR, the failure help is
 * shown with the unpack return code, and is_checkpoint_finished is raised.
 * Before, these were the only error exits that set neither, and the failure
 * code was stored in a local nothing read.  (The code polling
 * is_checkpoint_finished is outside this block - presumably it would
 * otherwise keep waiting; confirm against the caller.)
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    int ret;
    int reported_code = ORTE_ERROR;   /* code shown in the failure help */
    orte_std_cntr_t count;
    int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;

    (void) sender;

    /* - ckpt_state */
    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT))) {
        goto unpack_failed;
    }
    orte_checkpoint_globals.ckpt_status = ckpt_status;

    if (ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_STOPPED     == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR       == orte_checkpoint_globals.ckpt_status) {
        /* - global snapshot handle */
        count = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_snapshot_handle, &count, OPAL_STRING))) {
            goto unpack_failed;
        }

        /* - sequence number */
        count = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT))) {
            goto unpack_failed;
        }
    }

    /*
     * If the job is not able to be checkpointed, then return
     */
    if (ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "non-ckptable", true,
                       orte_checkpoint_globals.pid);
        is_checkpoint_finished = true;
        goto cleanup;
    }

    if (ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status) {
        goto failed;
    }

    /* Status progression */
    if (orte_checkpoint_globals.status) {
        pretty_print_status();
    }

    if (ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status) {
        is_checkpoint_finished = true;
        goto cleanup;
    }

    /*
     * Normal termination check: finished once the second of the
     * RECOVERED / ESTABLISHED pair arrives; the first one is only recorded.
     */
    if ((ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status &&
         is_checkpoint_established) ||
        (ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status &&
         is_checkpoint_recovered)) {
        is_checkpoint_finished = true;
    } else if (ORTE_SNAPC_CKPT_STATE_RECOVERED == orte_checkpoint_globals.ckpt_status) {
        is_checkpoint_recovered = true;
    } else if (ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status) {
        is_checkpoint_established = true;
    }

 cleanup:
    return;

 unpack_failed:
    /* Overrides a state that may already have been published above */
    orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_ERROR;
    reported_code = ret;
    /* fall through to the common failure report */

 failed:
    opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                   orte_checkpoint_globals.pid, reported_code);
    is_checkpoint_finished = true;
    return;
}