Exemplo n.º 1
0
/*
 * Handle a migration status update message.
 *
 * Unpacks a single OPAL_INT status value from 'buffer', stores it in
 * orte_migrate_ckpt_status and reports terminal conditions to the user:
 *   - job cannot be checkpointed     -> "non-ckptable" help text
 *   - a migration is already running -> "err-inprogress" help text
 *   - generic error                  -> "err-other" help text
 * For any other state the status line is printed when status display is
 * enabled, except for the FINISH state, which is not printed here.
 *
 * If the status value cannot be unpacked, the update is recorded as
 * ORTE_ERRMGR_MIGRATE_STATE_ERROR and reported with the generic error text
 * instead of being dropped silently, so that code polling
 * orte_migrate_ckpt_status observes a terminal state.
 *
 * sender - name of the process that sent the update (not used here)
 * buffer - message payload holding the status value
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    orte_std_cntr_t count = 1;
    int ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_NONE;

    /*
     * Receive the data:
     * - ckpt_state
     */
    count = 1;
    if ( ORTE_SUCCESS != opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT) ) {
        /* Malformed update: surface it as an error rather than ignoring it */
        orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_ERROR;
        opal_show_help("help-orte-migrate.txt", "err-other",
                       true,
                       orte_migrate_globals.pid);
        goto cleanup;
    }
    orte_migrate_ckpt_status = ckpt_status;

    /*
     * If the job is not able to be migrated, then return
     *
     * NOTE(review): this test uses a SNAPC checkpoint-state constant while
     * the remaining tests use ERRMGR migrate-state constants - confirm that
     * the mix is intentional.
     */
    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_migrate_ckpt_status) {
        opal_show_help("help-orte-migrate.txt", "non-ckptable",
                       true,
                       orte_migrate_globals.pid);
        goto cleanup;
    }

    /*
     * If a migration is already in progress, then we must tell the user to
     * try again later.
     */
    if( ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS == orte_migrate_ckpt_status) {
        opal_show_help("help-orte-migrate.txt", "err-inprogress",
                       true,
                       orte_migrate_globals.pid);
        goto cleanup;
    }

    /*
     * If there was an error, display a message and exit
     */
    if( ORTE_ERRMGR_MIGRATE_STATE_ERROR == orte_migrate_ckpt_status ) {
        opal_show_help("help-orte-migrate.txt", "err-other",
                       true,
                       orte_migrate_globals.pid);
        goto cleanup;
    }

    /*
     * If we are to display the status progression (the FINISH state is
     * deliberately not printed from this handler)
     */
    if( orte_migrate_globals.status ) {
        if(ORTE_ERRMGR_MIGRATE_STATE_FINISH != orte_migrate_ckpt_status) {
            pretty_print_status();
        }
    }

 cleanup:
    return;
}
Exemplo n.º 2
0
/*
 * Handle a checkpoint status update message.
 *
 * Message layout, in unpack order:
 *   - checkpoint state (OPAL_INT), always present
 *   - global snapshot handle (OPAL_STRING) and sequence number (OPAL_INT),
 *     unpacked only when the state is FINISHED or ERROR
 *
 * The state is stored in orte_checkpoint_globals.ckpt_status; the handle and
 * sequence number go to global_snapshot_handle / global_sequence_num.
 * A NO_CKPT state produces the "non-ckptable" help text. Otherwise, when
 * status display is enabled, every state except FINISHED is printed.
 *
 * Any unpack failure forces the recorded state to
 * ORTE_SNAPC_CKPT_STATE_ERROR. Previously such failures were dropped
 * silently, leaving either a stale non-terminal state or a FINISHED state
 * whose handle / sequence number had not been received.
 *
 * sender - name of the process that sent the update (not used here)
 * buffer - message payload to unpack
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    orte_std_cntr_t count = 1;
    int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;

    /*
     * Receive the data:
     * - ckpt_state
     * - global snapshot handle (upon finish only)
     * - sequence number        (upon finish only)
     */
    count = 1;
    if ( ORTE_SUCCESS != opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT) ) {
        goto unpack_failed;
    }
    orte_checkpoint_globals.ckpt_status = ckpt_status;

    if( ORTE_SNAPC_CKPT_STATE_FINISHED == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR    == orte_checkpoint_globals.ckpt_status ) {
        count = 1;
        if ( ORTE_SUCCESS != opal_dss.unpack(buffer, &global_snapshot_handle, &count, OPAL_STRING) ) {
            goto unpack_failed;
        }
        count = 1;
        if ( ORTE_SUCCESS != opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT) ) {
            goto unpack_failed;
        }
    }

    /*
     * If the job is not able to be checkpointed, then return
     */
    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
        orte_show_help("help-orte-checkpoint.txt", "non-ckptable", 
                       true,
                       orte_checkpoint_globals.pid);
        return;
    }

    /*
     * If we are to display the status progression (FINISHED is deliberately
     * not printed from this handler)
     */
    if( orte_checkpoint_globals.status ) {
        if(ORTE_SNAPC_CKPT_STATE_FINISHED != orte_checkpoint_globals.ckpt_status) {
            pretty_print_status();
        }
    }

    return;

 unpack_failed:
    /* Incomplete or malformed update: never leave it looking like success */
    orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_ERROR;
}
Exemplo n.º 3
0
/*
 * Entry point of the checkpoint tool.
 *
 * Sequence: initialize the tool, locate the HNP, optionally log what is
 * about to be checkpointed, send the checkpoint request and, unless
 * 'nowait' is set, drive the progress engine until the checkpoint status
 * becomes FINISHED, NO_CKPT or ERROR.
 *
 * Returns the error reported by ckpt_finalize() when finalization fails;
 * otherwise ORTE_SUCCESS or the code of the first step that failed.
 */
int
main(int argc, char *argv[])
{
    int exit_status = ORTE_SUCCESS;
    int rc;

    /* Tool start-up */
    rc = ckpt_init(argc, argv);
    if (ORTE_SUCCESS != rc) {
        exit_status = rc;
        goto cleanup;
    }

    /* Locate the HNP to talk to; find_hnp() prints its own errors */
    rc = find_hnp();
    if (ORTE_SUCCESS != rc) {
        exit_status = rc;
        goto cleanup;
    }

    /* Verbose description of the request that is about to be sent */
    if (orte_checkpoint_globals.verbose) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "orte_checkpoint: Checkpointing...");

        if (orte_checkpoint_globals.pid > 0) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t PID %d",
                                orte_checkpoint_globals.pid);
        }
        else if (orte_checkpoint_globals.req_hnp != ORTE_JOBID_INVALID) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t Mpirun (%s)",
                                ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
        }

        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Connected to Mpirun %s",
                            ORTE_NAME_PRINT(&orterun_hnp->name));

        if (orte_checkpoint_globals.term) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t Terminating after checkpoint\n");
        }
    }

    /* Send the checkpoint request */
    rc = notify_process_for_checkpoint(orte_checkpoint_globals.term);
    if (ORTE_SUCCESS != rc) {
        orte_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, rc);
        exit_status = rc;
        goto cleanup;
    }

    /* Unless told not to wait, spin on progress until a terminal state */
    if (!orte_checkpoint_globals.nowait) {
        for (;;) {
            if (ORTE_SNAPC_CKPT_STATE_FINISHED == orte_checkpoint_globals.ckpt_status ||
                ORTE_SNAPC_CKPT_STATE_NO_CKPT  == orte_checkpoint_globals.ckpt_status ||
                ORTE_SNAPC_CKPT_STATE_ERROR    == orte_checkpoint_globals.ckpt_status) {
                break;
            }
            opal_progress();
        }
    }

    /* Failure outcomes: only the ERROR state produces a message here */
    if (ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    else if (ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status) {
        orte_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /* Final status line: the state is forced to FINISHED before printing */
    if (orte_checkpoint_globals.status) {
        orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_FINISHED;
        pretty_print_status();
    }

    /* The snapshot reference is printed only when we waited for the result */
    if (!orte_checkpoint_globals.nowait) {
        pretty_print_reference();
    }

 cleanup:
    /* Finalization always runs; its failure overrides any earlier status */
    rc = ckpt_finalize();
    return (ORTE_SUCCESS != rc) ? rc : exit_status;
}
Exemplo n.º 4
0
int
main(int argc, char *argv[])
{
    int ret, exit_status = ORTE_SUCCESS;

    /***************
     * Initialize
     ***************/
    if (ORTE_SUCCESS != (ret = tool_init(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /***************************
     * Find the HNP that we want to connect to, if it exists
     ***************************/
    if( orte_migrate_globals.verbose ) {
        opal_output_verbose(10, orte_migrate_globals.output,
                            "orte_migrate: Finding HNP...");
    }
    if (ORTE_SUCCESS != (ret = find_hnp())) {
        opal_show_help("help-orte-migrate.txt", "invalid_pid",
                       true, orte_migrate_globals.pid);
        exit_status = ret;
        goto cleanup;
    }

    /*******************************
     * Send migration information to HNP
     *******************************/
    if( orte_migrate_globals.verbose ) {
        opal_output_verbose(10, orte_migrate_globals.output,
                            "orte_migrate: Sending info to HNP...");
    }
    if (ORTE_SUCCESS != (ret = notify_hnp())) {
        opal_output(0,
                    "HNP with PID %d Not found!",
                    orte_migrate_globals.pid);
        exit_status = ret;
        goto cleanup;
    }

    /*******************************
     * Wait for migration to complete
     *******************************/
    while( ORTE_ERRMGR_MIGRATE_STATE_FINISH         != orte_migrate_ckpt_status &&
           ORTE_ERRMGR_MIGRATE_STATE_ERROR          != orte_migrate_ckpt_status &&
           ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS != orte_migrate_ckpt_status) {
        opal_progress();
    }

    if( orte_migrate_globals.status ) {
        orte_migrate_ckpt_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH;
        pretty_print_status();
    }

 cleanup:
    /***************
     * Cleanup
     ***************/
    if (ORTE_SUCCESS != (ret = tool_finalize())) {
        return ret;
    }

    return exit_status;
}
Exemplo n.º 5
0
/*
 * Handle a checkpoint status update message.
 *
 * Message layout, in unpack order:
 *   - checkpoint state (OPAL_INT), always present
 *   - global snapshot handle (OPAL_STRING) and sequence number (OPAL_INT),
 *     unpacked only when the state is RECOVERED, ESTABLISHED, STOPPED or
 *     ERROR
 *
 * The state is stored in orte_checkpoint_globals.ckpt_status. The flag
 * is_checkpoint_finished is set when:
 *   - the job cannot be checkpointed (NO_CKPT) or the state is ERROR
 *     (both after showing the matching help text),
 *   - the state is STOPPED,
 *   - both RECOVERED and ESTABLISHED have been seen, in either order
 *     (tracked with is_checkpoint_recovered / is_checkpoint_established),
 *   - the message could not be unpacked.
 *
 * The last case used to return silently without touching any state, unlike
 * every other terminal path in this function; it is now reported as a
 * checkpoint failure with the unpack return code.
 *
 * sender - name of the process that sent the update (not used here)
 * buffer - message payload to unpack
 */
static void process_ckpt_update_cmd(orte_process_name_t* sender,
                                    opal_buffer_t* buffer)
{
    int ret;
    orte_std_cntr_t count = 1;
    int ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;

    /*
     * Receive the data:
     * - ckpt_state
     * - global snapshot handle (upon finish only)
     * - sequence number        (upon finish only)
     */
    count = 1;
    if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &ckpt_status, &count, OPAL_INT)) ) {
        goto unpack_failed;
    }
    orte_checkpoint_globals.ckpt_status = ckpt_status;

    if( ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_STOPPED     == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR       == orte_checkpoint_globals.ckpt_status ) {
        /*
         * NOTE(review): this branch can run for more than one update; if
         * unpack stores a newly allocated string each time, the previous
         * handle is overwritten without being released - confirm.
         */
        count = 1;
        if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_snapshot_handle, &count, OPAL_STRING)) ) {
            goto unpack_failed;
        }
        count = 1;
        if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &global_sequence_num, &count, OPAL_INT)) ) {
            goto unpack_failed;
        }
    }

    /*
     * If the job is not able to be checkpointed, then return
     */
    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "non-ckptable", 
                       true,
                       orte_checkpoint_globals.pid);
        is_checkpoint_finished = true;
        return;
    }

    if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status) {
        opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, ORTE_ERROR);
        is_checkpoint_finished = true;
        return;
    }

    /* Status progression */
    if( orte_checkpoint_globals.status ) {
        pretty_print_status();
    }

    if( ORTE_SNAPC_CKPT_STATE_STOPPED == orte_checkpoint_globals.ckpt_status) {
        is_checkpoint_finished = true;
        return;
    }

    /*
     * Normal termination check: finished once the second of the
     * RECOVERED / ESTABLISHED pair arrives; otherwise remember which one
     * has been seen.
     */
    if( (ORTE_SNAPC_CKPT_STATE_RECOVERED   == orte_checkpoint_globals.ckpt_status && is_checkpoint_established) ||
        (ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status && is_checkpoint_recovered) ){
        is_checkpoint_finished = true;
    }
    else if( ORTE_SNAPC_CKPT_STATE_RECOVERED  == orte_checkpoint_globals.ckpt_status ) {
        is_checkpoint_recovered = true;
    }
    else if(ORTE_SNAPC_CKPT_STATE_ESTABLISHED == orte_checkpoint_globals.ckpt_status ) {
        is_checkpoint_established = true;
    }

    return;

 unpack_failed:
    /* Incomplete or malformed update: report it and end the operation */
    orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_ERROR;
    opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                   orte_checkpoint_globals.pid, ret);
    is_checkpoint_finished = true;
}