Esempio n. 1
0
File: ompi_cr.c Progetto: IanYXXL/A1
/*
 * Coordination hook run before the "continue" phase.
 *
 * When OPAL_ENABLE_FT_CR == 1:
 *   - if orte_cr_continue_like_restart is set, OPAL_CRS_CONTINUE is handed
 *     to the PML's ft_event entry point and its error code (if any) is
 *     returned to the caller;
 *   - otherwise only OPAL_CR_SET_TIMER is invoked for the P2PBR1, P2P3,
 *     P2PBR2 and CRCP1 timers (the two *BR* timers only when
 *     opal_cr_timing_barrier_enabled is set).
 *
 * When OPAL_ENABLE_FT_CR != 1 the function is a no-op.
 *
 * Returns OMPI_SUCCESS, or the non-success code reported by pml_ft_event().
 */
static int ompi_cr_coord_pre_continue(void) {
#if OPAL_ENABLE_FT_CR == 1
    int rc;

    /*
     * Can not really do much until ORTE is up and running,
     * so defer action until the post_continue function.
     */
    opal_output_verbose(10, ompi_cr_output,
                        "ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");

    if( !orte_cr_continue_like_restart ) {
        /* Plain continue: nothing to coordinate, just stamp the timers. */
        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
        }
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
        }
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);

        return OMPI_SUCCESS;
    }

    /* Mimic ompi_cr_coord_pre_restart(); */
    rc = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE);
    if( OMPI_SUCCESS != rc ) {
        return rc;
    }

    return OMPI_SUCCESS;
#else
    return OMPI_SUCCESS;
#endif
}
Esempio n. 2
0
/*
 * Take the checkpoint through the CRS module and update the global
 * checkpointing status from the state it reports.
 *
 * pid, snapshot, options
 *          Forwarded unchanged to opal_crs.crs_checkpoint().
 *          options->term is read when the reported state is
 *          OPAL_CRS_CONTINUE and is cleared for every other state.
 * state    Passed to crs_checkpoint() (cast to opal_crs_state_type_t *)
 *          and read back afterwards; rewritten to OPAL_CRS_TERM when the
 *          state is OPAL_CRS_CONTINUE and options->term is set.
 *
 * Returns OPAL_SUCCESS, or the error code from crs_checkpoint().  A failed
 * checkpoint is logged but the state handling below still runs.
 */
int opal_cr_inc_core_ckpt(pid_t pid,
                          opal_crs_base_snapshot_t *snapshot,
                          opal_crs_base_ckpt_options_t *options,
                          int *state)
{
    int rc;

    OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0);
    rc = opal_crs.crs_checkpoint(pid, snapshot, options,
                                 (opal_crs_state_type_t *)state);
    if (OPAL_SUCCESS != rc) {
        opal_output(opal_cr_output,
                    "opal_cr: inc_core: Error: The checkpoint failed. %d\n", rc);
    }

    if (OPAL_CRS_CONTINUE != *state) {
        /* Termination only applies when the process continues. */
        options->term = false;
    } else {
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1);

        if (options->term) {
            *state = OPAL_CRS_TERM;
            opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
        } else {
            opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE;
        }
    }

    /*
     * On restart, read the environment information that opal-restart
     * left for us.
     */
    if (OPAL_CRS_RESTART == *state) {
        opal_cr_refresh_environ(core_prev_pid);
        opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE;
    }

    /* rc is OPAL_SUCCESS unless crs_checkpoint() failed above. */
    return rc;
}
Esempio n. 3
0
/*
 * Release every entry of a proc array obtained from ompi_proc_all() and
 * free the array itself.  A NULL array is accepted and ignored.
 */
static void mca_pml_ob1_ft_release_procs(ompi_proc_t **procs, size_t num_procs)
{
    size_t i;

    if (NULL == procs) {
        return;
    }
    for (i = 0; i < num_procs; ++i) {
        OBJ_RELEASE(procs[i]);
    }
    free(procs);
}

/*
 * Fault-tolerance event handler of the ob1 PML.
 *
 * state  One of the OPAL_CRS_* values.  The function does work before and
 *        after forwarding the event to the BML (mca_bml.bml_ft_event):
 *
 *        OPAL_CRS_CHECKPOINT   timers, plus an optional barrier before the
 *                              BML call when opal_cr_timing_barrier_enabled.
 *        OPAL_CRS_CONTINUE     first_continue_pass is toggled on every call;
 *                              timers/barriers run only when it is false
 *                              after the toggle (i.e. on every second
 *                              CONTINUE call).  If additionally
 *                              orte_cr_continue_like_restart is set, the
 *                              same proc refresh / modex / add_procs /
 *                              barrier sequence as for RESTART is executed.
 *        OPAL_CRS_RESTART      proc list is fetched and refreshed before the
 *                              BML call; afterwards the modex is re-run, the
 *                              procs are re-added to the PML and a barrier
 *                              is executed.
 *        anything else         only the BML is notified.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when an object or the proc
 * list cannot be obtained, or the error code of the failing step.
 *
 * The proc array returned by ompi_proc_all() is released on every exit path,
 * including the error paths after the BML call.
 */
int mca_pml_ob1_ft_event( int state )
{
    static bool first_continue_pass = false;
    ompi_proc_t** procs = NULL;
    size_t num_procs = 0;
    int ret;
    ompi_rte_collective_t *coll, *modex;

    coll = OBJ_NEW(ompi_rte_collective_t);
    if (NULL == coll) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    coll->id = ompi_process_info.peer_init_barrier;

    if(OPAL_CRS_CHECKPOINT == state) {
        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
            ompi_rte_barrier(coll);
            OMPI_WAIT_FOR_COMPLETION(coll->active);
        }

        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
    }
    else if(OPAL_CRS_CONTINUE == state) {
        first_continue_pass = !first_continue_pass;

        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
                ompi_rte_barrier(coll);
                OMPI_WAIT_FOR_COMPLETION(coll->active);
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
        }

        if( orte_cr_continue_like_restart && !first_continue_pass ) {
            /*
             * Get a list of processes
             */
            procs = ompi_proc_all(&num_procs);
            if(NULL == procs) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto clean;
            }

            /*
             * Refresh the proc structure, and publish our proc info in the modex.
             * NOTE: Do *not* call ompi_proc_finalize as there are many places in
             *       the code that point to indv. procs in this structure. For our
             *       needs here we only need to fix up the modex, bml and pml
             *       references.
             */
            if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
                opal_output(0,
                            "pml:ob1: ft_event(Restart): proc_refresh Failed %d",
                            ret);
                /* procs is released at the clean label */
                goto clean;
            }
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state ) {
        /*
         * Get a list of processes
         */
        procs = ompi_proc_all(&num_procs);
        if(NULL == procs) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto clean;
        }

        /*
         * Clean out the modex information since it is invalid now.
         *    ompi_rte_purge_proc_attrs();
         * This happens at the ORTE level, so doing it again here will cause
         * some issues with socket caching.
         */


        /*
         * Refresh the proc structure, and publish our proc info in the modex.
         * NOTE: Do *not* call ompi_proc_finalize as there are many places in
         *       the code that point to indv. procs in this structure. For our
         *       needs here we only need to fix up the modex, bml and pml
         *       references.
         */
        if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
            opal_output(0,
                        "pml:ob1: ft_event(Restart): proc_refresh Failed %d",
                        ret);
            /* procs is released at the clean label */
            goto clean;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    /* Call the BML
     * BML is expected to call ft_event in
     * - BTL(s)
     * - MPool(s)
     *
     * NOTE(review): a BML failure is only logged; ret is overwritten below,
     * so it is never reported to the caller.  Kept as in the original.
     */
    if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) {
        opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n",
                    ret);
    }

    if(OPAL_CRS_CHECKPOINT == state) {
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1);

        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0);
            /* JJH Cannot barrier here due to progress engine -- ompi_rte_barrier();*/
        }
    }
    else if(OPAL_CRS_CONTINUE == state) {
        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
                ompi_rte_barrier(coll);
                OMPI_WAIT_FOR_COMPLETION(coll->active);
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
        }

        if( orte_cr_continue_like_restart && !first_continue_pass ) {
            /*
             * Exchange the modex information once again.
             * BTLs will have republished their modex information.
             */
            modex = OBJ_NEW(ompi_rte_collective_t);
            if (NULL == modex) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto clean;
            }
            modex->id = ompi_process_info.peer_modex;
            if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
                opal_output(0,
                            "pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
                            ret);
                OBJ_RELEASE(modex);
                goto clean;
            }
            OMPI_WAIT_FOR_COMPLETION(modex->active);
            OBJ_RELEASE(modex);

            /*
             * Startup the PML stack now that the modex is running again
             * Add the new procs (BTLs redo modex recv's)
             */
            if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
                opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
                goto clean;
            }

            /* Is this barrier necessary ? JJH */
            if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
                opal_output(0, "pml:ob1: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
                goto clean;
            }
            OMPI_WAIT_FOR_COMPLETION(coll->active);

            /* Done with the proc list; clear it so clean does not release twice. */
            mca_pml_ob1_ft_release_procs(procs, num_procs);
            procs = NULL;
        }
        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
                ompi_rte_barrier(coll);
                OMPI_WAIT_FOR_COMPLETION(coll->active);
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state  ) {
        /*
         * Exchange the modex information once again.
         * BTLs will have republished their modex information.
         */
        modex = OBJ_NEW(ompi_rte_collective_t);
        if (NULL == modex) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto clean;
        }
        modex->id = ompi_process_info.peer_modex;
        if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
            opal_output(0,
                        "pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
                        ret);
            OBJ_RELEASE(modex);
            goto clean;
        }
        OMPI_WAIT_FOR_COMPLETION(modex->active);
        OBJ_RELEASE(modex);

        /*
         * Startup the PML stack now that the modex is running again
         * Add the new procs (BTLs redo modex recv's)
         */
        if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
            opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
            goto clean;
        }

        /* Is this barrier necessary ? JJH */
        if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
            opal_output(0, "pml:ob1: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
            goto clean;
        }
        OMPI_WAIT_FOR_COMPLETION(coll->active);

        /* Done with the proc list; clear it so clean does not release twice. */
        mca_pml_ob1_ft_release_procs(procs, num_procs);
        procs = NULL;
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    ret = OMPI_SUCCESS;

clean:
    /* Error paths arrive here with procs still held; success paths with NULL. */
    mca_pml_ob1_ft_release_procs(procs, num_procs);
    OBJ_RELEASE(coll);
    return ret;
}
Esempio n. 4
0
/*
 * Recovery half of the INC: bring the checkpointing status in line with
 * `state`, then run the user-level callbacks around the registered
 * coordination routine.
 *
 * state  OPAL_CRS_CONTINUE or OPAL_CRS_RESTART select the matching user
 *        callback state; any other value is reported to the user callbacks
 *        as OPAL_CR_INC_STATE_ERROR.  The value is passed unchanged to
 *        cur_coord_callback().
 *
 * Returns OPAL_SUCCESS, or the first non-success code returned by
 * trigger_user_inc_callback() / cur_coord_callback().
 */
int opal_cr_inc_core_recover(int state)
{
    int rc;
    opal_cr_user_inc_callback_state_t cb_state;
    bool status_already_set;

    /*
     * The checkpointing status is only updated here if it has not already
     * been moved to one of the post-checkpoint values.
     */
    status_already_set =
        (OPAL_CR_STATUS_TERM         == opal_cr_checkpointing_state) ||
        (OPAL_CR_STATUS_CONTINUE     == opal_cr_checkpointing_state) ||
        (OPAL_CR_STATUS_RESTART_PRE  == opal_cr_checkpointing_state) ||
        (OPAL_CR_STATUS_RESTART_POST == opal_cr_checkpointing_state);

    if (!status_already_set) {
        if (OPAL_CRS_CONTINUE == state) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1);
            opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE;
        }
        else if (OPAL_CRS_RESTART == state) {
            /* On restart, read the environment opal-restart left us. */
            opal_cr_refresh_environ(core_prev_pid);
            opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE;
        }
    }

    /*
     * Map the CRS state onto the state handed to the user level INC.
     */
    cb_state = (OPAL_CRS_CONTINUE == state) ? OPAL_CR_INC_STATE_CONTINUE
             : (OPAL_CRS_RESTART  == state) ? OPAL_CR_INC_STATE_RESTART
             :                                OPAL_CR_INC_STATE_ERROR;

    /* User callback: after CRS, before MPI coordination. */
    rc = trigger_user_inc_callback(OPAL_CR_INC_POST_CRS_PRE_MPI, cb_state);
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

    /* Registered coordination routine; OPAL_EXISTS is returned silently. */
    rc = cur_coord_callback(state);
    if (OPAL_SUCCESS != rc) {
        if (OPAL_EXISTS != rc) {
            opal_output(opal_cr_output,
                        "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n",
                        state, rc);
        }
        return rc;
    }

    /* User callback: after CRS, after MPI coordination. */
    rc = trigger_user_inc_callback(OPAL_CR_INC_POST_CRS_POST_MPI, cb_state);
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    opal_cr_debug_clear_current_ckpt_thread();
#endif

    return OPAL_SUCCESS;
}
Esempio n. 5
0
/*******************************
 * Notification Routines
 *******************************/
/*
 * Run one full INC cycle: coordinate for checkpoint, take the checkpoint,
 * update the checkpointing status, then coordinate again with the
 * resulting state.
 *
 * pid, snapshot  Forwarded unchanged to opal_crs.crs_checkpoint().
 * term           When true and the checkpoint reports OPAL_CRS_CONTINUE,
 *                *state is rewritten to OPAL_CRS_TERM.
 * state          Passed to crs_checkpoint() (cast to
 *                opal_crs_state_type_t *), read back afterwards and handed
 *                to the second cur_coord_callback() call.
 *
 * Returns OPAL_SUCCESS, the error from the first coordination call (in
 * which case no checkpoint is attempted), the error from the second
 * coordination call, or -- if that one succeeds -- the error from
 * crs_checkpoint().
 */
int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, int *state)
{
    int rc;
    int exit_status = OPAL_SUCCESS;
    /* Captured before the checkpoint; passed to extract_env_vars() on restart. */
    const int prev_pid = getpid();

    /*
     * Coordinate for the checkpoint.  OPAL_EXISTS is passed back without
     * logging.
     */
    rc = cur_coord_callback(OPAL_CRS_CHECKPOINT);
    if (OPAL_SUCCESS != rc) {
        if (OPAL_EXISTS != rc) {
            opal_output(opal_cr_output,
                        "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n",
                        OPAL_CRS_CHECKPOINT, rc);
        }
        return rc;
    }

    /*
     * Take the checkpoint.  A failure is remembered but we keep going,
     * since the OPAL level stuff still has to be restarted.
     */
    OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0);
    rc = opal_crs.crs_checkpoint(pid, snapshot, (opal_crs_state_type_t *)state);
    if (OPAL_SUCCESS != rc) {
        opal_output(opal_cr_output,
                    "opal_cr: inc_core: Error: The checkpoint failed. %d\n", rc);
        exit_status = rc;
    }

    if (OPAL_CRS_CONTINUE != *state) {
        /* term is a by-value parameter, so this is not visible to the caller. */
        term = false;
    } else {
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1);

        if (term) {
            *state = OPAL_CRS_TERM;
            opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
        } else {
            opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE;
        }
    }

    /*
     * On restart, read the environment information that opal-restart
     * left for us.
     */
    if (OPAL_CRS_RESTART == *state) {
        extract_env_vars(prev_pid);
        opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE;
    }

    /*
     * Coordinate again with the state we ended up in.
     */
    rc = cur_coord_callback(*state);
    if (OPAL_SUCCESS != rc) {
        if (OPAL_EXISTS != rc) {
            opal_output(opal_cr_output,
                        "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n",
                        *state, rc);
        }
        exit_status = rc;
    }

    return exit_status;
}