예제 #1
0
/*
 * Prepare a snapshot that is being used for the first time ("cold start").
 *
 * Extracts the component name and previous PID recorded for the snapshot at
 * snapshot->super.local_location, checks that the recorded component name
 * matches this component, and builds snapshot->context_filename as
 * "<local_location>/<first CRS_METADATA_CONTEXT token>". The cold_start flag
 * is cleared only when every step succeeded.
 *
 * Returns OPAL_SUCCESS on success; otherwise the error code returned by the
 * metadata extraction, or OPAL_ERROR.
 */
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char * component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: cold_start(%s)", snapshot->super.reference_name);

    /*
     * Find the snapshot directory, read the metadata file
     */
    if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location, 
                                                                        &component_name, &prev_pid) ) ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.local_location, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* Guard against a "successful" extraction that produced no name:
     * strdup()/strlen() below must never see a NULL pointer. */
    if( NULL == component_name ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* The snapshot keeps its own copy; our temporary is released in cleanup. */
    snapshot->super.component_name = strdup(component_name);
    if( NULL == snapshot->super.component_name ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Compare the component strings to make sure this is our snapshot before going further.
     * NOTE(review): the comparison length is strlen(component_name), so a
     * recorded name that is only a prefix of ours (or empty) is accepted --
     * confirm whether an exact match was intended. */
    if ( 0 != strncmp(mca_crs_blcr_component.super.base_version.mca_component_name,
                      component_name, strlen(component_name)) ) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n", 
                    component_name, mca_crs_blcr_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Context Filename
     */
    opal_crs_base_metadata_read_token(snapshot->super.local_location, CRS_METADATA_CONTEXT, &tmp_argv);
    if( NULL == tmp_argv || NULL == tmp_argv[0] ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: blcr_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.local_location);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    if( 0 > asprintf(&snapshot->context_filename, "%s/%s", snapshot->super.local_location, tmp_argv[0]) ) {
        /* On failure the target pointer's contents are unspecified; make it safe. */
        snapshot->context_filename = NULL;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if(NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    /* Release the name handed back by the extraction call (previously leaked). */
    free(component_name);
    component_name = NULL;

    return exit_status;
}
예제 #2
0
/*
 * Read the previous PID and the component name recorded in a snapshot's
 * metadata stream.
 *
 * metadata        open metadata stream to read the tokens from
 * component_name  out: newly allocated copy (strdup) of the first
 *                 CRS_METADATA_COMP token; the caller must free() it
 * prev_pid        out: first CRS_METADATA_PID token converted with atoi()
 *
 * Returns OPAL_SUCCESS when both values were obtained, OPAL_ERROR when an
 * argument is NULL, a token is missing, or the name could not be copied.
 * On failure *component_name is not assigned by this function.
 */
int opal_crs_base_extract_expected_component(FILE *metadata, char ** component_name, int *prev_pid)
{
    int exit_status = OPAL_SUCCESS;
    char **pid_argv = NULL;
    char **name_argv = NULL;
    char *name_copy = NULL;

    /* Dummy check */
    if( NULL == metadata || NULL == component_name || NULL == prev_pid ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_crs_base_metadata_read_token(metadata, CRS_METADATA_PID, &pid_argv);
    if( NULL != pid_argv && NULL != pid_argv[0] ) {
        /* atoi() yields 0 for non-numeric text; callers only reject negative PIDs. */
        *prev_pid = atoi(pid_argv[0]);
    } else {
        opal_output(0, "Error: expected_component: PID information unavailable!");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_crs_base_metadata_read_token(metadata, CRS_METADATA_COMP, &name_argv);
    if( NULL == name_argv || NULL == name_argv[0] ) {
        opal_output(0, "Error: expected_component: Component Name information unavailable!");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Do not report success with a NULL name: callers pass the result
     * straight to strdup()/strlen(). */
    name_copy = strdup(name_argv[0]);
    if( NULL == name_copy ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    *component_name = name_copy;

 cleanup:
    if( NULL != pid_argv ) {
        opal_argv_free(pid_argv);
        pid_argv = NULL;
    }

    if( NULL != name_argv ) {
        opal_argv_free(name_argv);
        name_argv = NULL;
    }

    return exit_status;
}
예제 #3
0
/*
 * Prepare a snapshot that is being used for the first time ("cold start").
 *
 * Opens the snapshot's metadata stream if it is not already open, extracts
 * the recorded component name and previous PID, checks that the recorded
 * component name matches this component, and stores a copy of the first
 * CRS_METADATA_CONTEXT token in snapshot->cmd_line. The cold_start flag is
 * cleared only when every step succeeded. The metadata stream is left open
 * in snapshot->super.metadata.
 *
 * Returns OPAL_SUCCESS on success; otherwise the error code returned by the
 * metadata extraction, or OPAL_ERROR.
 */
static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
    int ret, exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char * component_name = NULL;
    int prev_pid;

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: cold_start()");

    /*
     * Find the snapshot directory, read the metadata file
     */
    if( NULL == snapshot->super.metadata ) {
        /* Mode "a" yields a write-only stream, so every read below would
         * fail. "a+" keeps the create/append semantics for anyone who later
         * writes through this handle, and additionally permits reading. */
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a+")) ) {
            opal_output(mca_crs_self_component.super.output_handle,
                        "crs:self: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        /* The initial read position of an append-mode stream is
         * implementation-defined; start reading from the beginning. */
        rewind(snapshot->super.metadata);
    }
    if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.metadata,
                                                                        &component_name, &prev_pid) ) ) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to extract the metadata from the local snapshot (%s). Returned %d.",
                    snapshot->super.metadata_filename, ret);
        exit_status = ret;
        goto cleanup;
    }

    /* Guard against a "successful" extraction that produced no name:
     * strdup()/strlen() below must never see a NULL pointer. */
    if( NULL == component_name ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* The snapshot keeps its own copy; our temporary is released in cleanup. */
    snapshot->super.component_name = strdup(component_name);
    if( NULL == snapshot->super.component_name ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /* Compare the strings to make sure this is our snapshot before going further.
     * NOTE(review): the comparison length is strlen(component_name), so a
     * recorded name that is only a prefix of ours (or empty) is accepted --
     * confirm whether an exact match was intended. */
    if ( 0 != strncmp(mca_crs_self_component.super.base_version.mca_component_name,
                      component_name, strlen(component_name)) ) {
        exit_status = OPAL_ERROR;
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: This snapshot (%s) is not intended for us (%s)\n",
                    component_name, mca_crs_self_component.super.base_version.mca_component_name);
        goto cleanup;
    }

    /*
     * Restart command
     * JJH: Command lines limited to 256 chars.
     */
    opal_crs_base_metadata_read_token(snapshot->super.metadata, CRS_METADATA_CONTEXT, &tmp_argv);
    if( NULL == tmp_argv || NULL == tmp_argv[0] ) {
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: self_cold_start: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, snapshot->super.snapshot_directory);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    if( 0 > asprintf(&snapshot->cmd_line, "%s", tmp_argv[0]) ) {
        /* On failure the target pointer's contents are unspecified; make it safe. */
        snapshot->cmd_line = NULL;
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Reset the cold_start flag
     */
    snapshot->super.cold_start = false;

 cleanup:
    if(NULL != tmp_argv) {
        opal_argv_free(tmp_argv);
        tmp_argv = NULL;
    }

    /* Release the name handed back by the extraction call (previously leaked). */
    free(component_name);
    component_name = NULL;

    return exit_status;

}
예제 #4
0
/*
 * Restart by exec'ing the command line recorded in the snapshot metadata.
 *
 * base_snapshot  snapshot whose metadata stream (opened here if necessary)
 *                holds the CRS_METADATA_CONTEXT token
 * spawn_child    must be false; spawning is not implemented
 * child_pid      out: always set to the calling process' PID
 *
 * The first CRS_METADATA_CONTEXT token is split on spaces and passed to
 * execvp(), which does not return on success. The metadata stream is closed
 * and base_snapshot->metadata reset to NULL before any return that follows a
 * successful open.
 *
 * Returns OPAL_SUCCESS when there is no command line to execute, OPAL_ERROR
 * when the metadata cannot be opened, read or split,
 * OPAL_ERR_NOT_IMPLEMENTED when spawn_child is true, or the execvp() result
 * when the exec fails.
 */
int opal_crs_none_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
{
    int exit_status = OPAL_SUCCESS;
    char **tmp_argv = NULL;
    char **cr_argv = NULL;
    int status;

    *child_pid = getpid();

    if( NULL == base_snapshot->metadata ) {
        /* The stream is only read and is closed again below, so open it
         * read-only; mode "a" produced a write-only stream that could not be
         * read (and created an empty file when none existed). */
        if (NULL == (base_snapshot->metadata = fopen(base_snapshot->metadata_filename, "r")) ) {
            opal_output(0,
                        "crs:none: checkpoint(): Error: Unable to open the file (%s)",
                        base_snapshot->metadata_filename);
            return OPAL_ERROR;
        }
    }

    opal_crs_base_metadata_read_token(base_snapshot->metadata, CRS_METADATA_CONTEXT, &tmp_argv);

    if( NULL == tmp_argv ) {
        opal_output(opal_crs_base_framework.framework_output,
                    "crs:none: none_restart: Error: Failed to read the %s token from the local checkpoint in %s",
                    CRS_METADATA_CONTEXT, base_snapshot->metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if( opal_argv_count(tmp_argv) <= 0 ) {
        opal_output_verbose(10, opal_crs_base_framework.framework_output,
                            "crs:none: none_restart: No command line to exec, so just returning");
        exit_status = OPAL_SUCCESS;
        goto cleanup;
    }

    if ( NULL == (cr_argv = opal_argv_split(tmp_argv[0], ' ')) ) {
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    if( spawn_child ) {
        opal_output(opal_crs_base_framework.framework_output,
                   "crs:none: none_restart: Spawn not implemented");
        exit_status = OPAL_ERR_NOT_IMPLEMENTED;
        goto cleanup;
    }

    opal_output_verbose(10, opal_crs_base_framework.framework_output,
                        "crs:none: none_restart: exec :(%s, %s):",
                        cr_argv[0], tmp_argv[0]);

    /* Only returns if the exec itself failed. */
    status = execvp(cr_argv[0], cr_argv);

    if(status < 0) {
        opal_output(opal_crs_base_framework.framework_output,
                    "crs:none: none_restart: Child failed to execute :(%d):", status);
    }
    opal_output(opal_crs_base_framework.framework_output,
                "crs:none: none_restart: execvp returned %d", status);
    exit_status = status;

 cleanup:
    if (cr_argv) {
        opal_argv_free (cr_argv);
        cr_argv = NULL;
    }

    /* The token list was previously leaked on every path. */
    if (tmp_argv) {
        opal_argv_free (tmp_argv);
        tmp_argv = NULL;
    }

    /* Clear the handle so the snapshot never carries a closed FILE pointer. */
    if( NULL != base_snapshot->metadata ) {
        fclose(base_snapshot->metadata);
        base_snapshot->metadata = NULL;
    }

    return exit_status;
}
예제 #5
0
파일: opal-restart.c 프로젝트: bosilca/ompi
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char *command = NULL;
    char *proc_file = NULL;
    char **loc_touch = NULL;
    char **loc_mkdir = NULL;
    int argc, i;

    if( 0 > prev_pid ) {
        opal_output(opal_restart_globals.output,
                    "Invalid PID (%d)\n",
                    prev_pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * This is needed so we can pass the previous environment to the restarted
     * application process.
     */
    opal_asprintf(&proc_file, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
    opal_asprintf(&command, "env | grep OMPI_ > %s", proc_file);

    opal_output_verbose(5, opal_restart_globals.output,
                        "post_env_vars: Execute: <%s>", command);

    ret = system(command);
    if( 0 > ret) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Any directories that need to be created
     */
    if( NULL == (snapshot->metadata = fopen(snapshot->metadata_filename, "r")) ) {
        opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                       opal_restart_globals.snapshot_metadata,
                       snapshot->metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &loc_mkdir);
    argc = opal_argv_count(loc_mkdir);
    for( i = 0; i < argc; ++i ) {
        if( NULL != command ) {
            free(command);
            command = NULL;
        }
        opal_asprintf(&command, "mkdir -p %s", loc_mkdir[i]);

        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", command);

        ret = system(command);
        if( 0 > ret) {
            exit_status = ret;
            goto cleanup;
        }
    }
    if( 0 < argc ) {
        system("sync ; sync");
    }

    /*
     * Any files that need to exist
     */
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &loc_touch);
    argc = opal_argv_count(loc_touch);
    for( i = 0; i < argc; ++i ) {
        if( NULL != command ) {
            free(command);
            command = NULL;
        }
        opal_asprintf(&command, "touch %s", loc_touch[i]);

        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", command);

        ret = system(command);
        if( 0 > ret) {
            exit_status = ret;
            goto cleanup;
        }
    }
    if( 0 < argc ) {
        system("sync ; sync");
    }

 cleanup:
    if( NULL != command) {
        free(command);
        command = NULL;
    }
    if( NULL != proc_file) {
        free(proc_file);
        proc_file = NULL;
    }
    if( NULL != loc_mkdir ) {
        opal_argv_free(loc_mkdir);
        loc_mkdir = NULL;
    }
    if( NULL != loc_touch ) {
        opal_argv_free(loc_touch);
        loc_touch = NULL;
    }

    if( NULL != snapshot->metadata ) {
        fclose(snapshot->metadata);
        snapshot->metadata = NULL;
    }

    return exit_status;
}