Exemplo n.º 1
0
/*
 * Reload the environment saved for a checkpointed process.
 *
 * Builds the path "<opal_tmp_directory()>/<OPAL_CR_BASE_ENV_NAME>-<pid>" and,
 * if that file exists, hands it to extract_env_vars().  When C/R debugging
 * support is compiled in, the "opal_cr_enable_crdebug" variable is removed
 * from the environment before the file is processed and re-registered
 * afterwards.
 *
 * prev_pid: PID used to name the file; a value <= 0 selects getpid().
 *
 * Returns OPAL_ERR_OUT_OF_RESOURCE if the file name cannot be allocated,
 * OPAL_SUCCESS otherwise (including when the file does not exist).
 */
int opal_cr_refresh_environ(int prev_pid) {
    char *file_name = NULL;
#if OPAL_ENABLE_CRDEBUG == 1
    char *tmp = NULL;
#endif
    struct stat file_status;

    if( 0 >= prev_pid ) {
        prev_pid = getpid();
    }

    /*
     * asprintf() reports failure through its return value; the pointer
     * contents are not guaranteed to be NULL on failure, so test both.
     */
    if (0 > asprintf(&file_name, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid) ||
        NULL == file_name) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /*
     * Make sure the file exists. If it doesn't then this means 2 things:
     *  1) We have already executed this function, and
     *  2) The file has been deleted on the previous round.
     */
    if(0 != stat(file_name, &file_status) ){
        free(file_name);
        return OPAL_SUCCESS;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    /*
     * Only touch tmp when the name lookup succeeded; otherwise it may not
     * point at a valid allocation.
     */
    if (OPAL_SUCCESS == mca_base_var_env_name ("opal_cr_enable_crdebug", &tmp) &&
        NULL != tmp) {
        opal_unsetenv(tmp, &environ);
        free (tmp);
    }
#endif

    extract_env_vars(prev_pid, file_name);

#if OPAL_ENABLE_CRDEBUG == 1
    /* Reset to the default before registering the variable again. */
    MPIR_debug_with_checkpoint = 0;
    (void) mca_base_var_register ("opal", "opal", "cr", "enable_crdebug",
                                  "Enable checkpoint/restart debugging",
                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                  OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_ALL_EQ,
                                  &MPIR_debug_with_checkpoint);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: C/R Debugging Enabled [%s] (refresh)\n",
                        (MPIR_debug_with_checkpoint ? "True": "False"));
#endif

    free(file_name);

    return OPAL_SUCCESS;
}
Exemplo n.º 2
0
int opal_cr_refresh_environ(int prev_pid) {
    int val;
    char *file_name = NULL;
    struct stat file_status;

    if( 0 >= prev_pid ) {
        prev_pid = getpid();
    }

    /*
     * Make sure the file exists. If it doesn't then this means 2 things:
     *  1) We have already executed this function, and
     *  2) The file has been deleted on the previous round.
     */
    asprintf(&file_name, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
    if(0 != stat(file_name, &file_status) ){
        return OPAL_SUCCESS;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    opal_unsetenv(mca_base_param_env_var("opal_cr_enable_crdebug"), &environ);
#endif

    extract_env_vars(prev_pid, file_name);

#if OPAL_ENABLE_CRDEBUG == 1
    mca_base_param_reg_int_name("opal_cr", "enable_crdebug",
                                "Enable checkpoint/restart debugging",
                                false, false,
                                0,
                                &val);
    MPIR_debug_with_checkpoint = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: C/R Debugging Enabled [%s] (refresh)\n",
                        (MPIR_debug_with_checkpoint ? "True": "False"));
#else
    val = 0; /* Silence Compiler warning */
#endif

    if( NULL != file_name ){
        free(file_name);
        file_name = NULL;
    }

    return OPAL_SUCCESS;
}
Exemplo n.º 3
0
/*******************************
 * Notification Routines
 *******************************/
/*
 * Run one checkpoint cycle:
 *  1) invoke the registered coordination callback with OPAL_CRS_CHECKPOINT,
 *  2) ask the CRS component to take the checkpoint, which fills in *state,
 *  3) update opal_cr_checkpointing_state according to *state and 'term',
 *  4) invoke the coordination callback again with the resulting *state.
 *
 * Returns OPAL_SUCCESS, or the code of the last step that failed.  A failure
 * in step 1 aborts immediately; a failure in step 2 is remembered but the
 * remaining steps still run.
 */
int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, int *state)
{
    int rc;
    int result = OPAL_SUCCESS;
    /* Captured before the checkpoint is taken; used on the restart path. */
    const int pid_before = getpid();

    /* Pre-checkpoint coordination: bail out right away if it fails. */
    rc = cur_coord_callback(OPAL_CRS_CHECKPOINT);
    if (OPAL_SUCCESS != rc) {
        if (OPAL_EXISTS != rc) {
            opal_output(opal_cr_output,
                        "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n",
                        OPAL_CRS_CHECKPOINT, rc);
        }
        return rc;
    }

    /* Take the checkpoint. */
    OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0);
    rc = opal_crs.crs_checkpoint(pid, snapshot, (opal_crs_state_type_t *)state);
    if (OPAL_SUCCESS != rc) {
        opal_output(opal_cr_output,
                    "opal_cr: inc_core: Error: The checkpoint failed. %d\n", rc);
        /* Keep going: the OPAL level still has to be brought back up. */
        result = rc;
    }

    if (OPAL_CRS_CONTINUE != *state) {
        term = false;
    } else {
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1);

        if (term) {
            /* Caller asked to terminate after a successful checkpoint. */
            *state = OPAL_CRS_TERM;
            opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
        } else {
            opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE;
        }
    }

    /* On restart, pick up the environment left behind by opal-restart. */
    if (OPAL_CRS_RESTART == *state) {
        extract_env_vars(pid_before);
        opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE;
    }

    /* Post-checkpoint coordination with whatever state we ended up in. */
    rc = cur_coord_callback(*state);
    if (OPAL_SUCCESS != rc) {
        if (OPAL_EXISTS != rc) {
            opal_output(opal_cr_output,
                        "opal_cr: inc_core: Error: cur_coord_callback(%d) failed! %d\n",
                        *state, rc);
        }
        result = rc;
    }

    return result;
}