Example #1
0
/**
 * Fill in a global snapshot description for a given sstore handle.
 *
 * @param handle    Handle to look up; when NULL, orte_sstore_handle_last_stable
 *                  is used instead.
 * @param snapshot  Snapshot object to populate. Its ss_handle, seq_num,
 *                  reference, basedir and metadata_filename fields are
 *                  overwritten (the three strings are strdup() copies), and
 *                  its local_snapshots list is sorted on success.
 *
 * @return ORTE_SUCCESS, or the error code returned by the metadata
 *         extraction routine that was used.
 */
int orte_sstore_central_global_request_global_snapshot_data(orte_sstore_base_handle_t *handle,
                                                            orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int rc;
    orte_sstore_central_global_snapshot_info_t *info = NULL;
    /* A NULL handle means "whatever the last stable checkpoint was" */
    orte_sstore_base_handle_t wanted =
        (NULL != handle) ? *handle : orte_sstore_handle_last_stable;

    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(global): request_global_snapshot_data()"));

    info = find_handle_info(wanted);
    snapshot->ss_handle = wanted;

    /*
     * Copy the bookkeeping we already hold for this handle.
     * NOTE(review): info is dereferenced without a NULL check -- confirm that
     * find_handle_info() cannot fail for the handles that reach this point.
     */
    snapshot->seq_num           = info->seq_num;
    snapshot->reference         = strdup(info->ref_name);
    snapshot->basedir           = strdup(info->base_location);
    snapshot->metadata_filename = strdup(info->metadata_filename);

    /*
     * The checkpoint currently in progress is described by the local cache;
     * every other checkpoint is read back from its metadata file.
     */
    if( orte_sstore_handle_current == snapshot->ss_handle ) {
        rc = orte_sstore_central_extract_global_metadata(info, snapshot);
    } else {
        rc = orte_sstore_base_extract_global_metadata(snapshot);
    }

    if( ORTE_SUCCESS != rc ) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    opal_list_sort(&snapshot->local_snapshots, central_snapshot_sort_compare_fn);

    return ORTE_SUCCESS;
}
Example #2
0
/**
 * Print a summary of the checkpoint sequences recorded for a global snapshot.
 *
 * The number of sequences and the comma-separated list of sequence numbers
 * are fetched through orte_sstore.get_attr(). For each listed sequence the
 * global metadata is extracted into @p snapshot and the sequence number,
 * begin/end timestamps (when set) and process count are written to
 * orte_restart_globals.output. Per-process details are written through
 * opal_output_verbose() at level 10.
 *
 * When orte_restart_globals.seq_number is non-negative only the matching
 * sequence is reported; otherwise the total count is printed first and every
 * sequence is reported.
 *
 * @param snapshot  Snapshot to describe. snapshot->seq_num is overwritten for
 *                  each sequence visited, and the object is passed to
 *                  orte_sstore_base_extract_global_metadata().
 *
 * @return ORTE_SUCCESS, or the error code returned by
 *         orte_sstore_base_extract_global_metadata().
 */
int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    int num_seqs = 0, processes, i;
    char **snapshot_ref_seqs = NULL;
    opal_list_item_t* item = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    char *tmp_str = NULL;

    /*
     * Find all sequence numbers.
     * The attribute value is only parsed when one was actually returned:
     * atoi(NULL) is undefined behaviour, so a missing attribute is treated
     * as "zero sequences".
     */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_NUM_SEQ,
                         &tmp_str);
    if( NULL != tmp_str ) {
        num_seqs = atoi(tmp_str);
        free(tmp_str);
        tmp_str = NULL;
    }

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_ALL_SEQ,
                         &tmp_str);
    if( NULL != tmp_str ) {
        snapshot_ref_seqs = opal_argv_split(tmp_str, ',');
        free(tmp_str);
        tmp_str = NULL;
    }

    if( 0 > orte_restart_globals.seq_number ) {
        opal_output(orte_restart_globals.output,
                    "Sequences: %d\n",
                    num_seqs);
    }

    for(i=0; i < num_seqs; ++i) {
        /*
         * The advertised count and the list come from two separate
         * attributes, so do not trust the count alone: stop at the end of
         * the list (opal_argv_split() yields a NULL-terminated array) or if
         * no list was produced at all.
         */
        if( NULL == snapshot_ref_seqs || NULL == snapshot_ref_seqs[i] ) {
            break;
        }

        snapshot->seq_num = atoi(snapshot_ref_seqs[i]);

        /* A specific sequence was requested: skip all the others */
        if( 0 <= orte_restart_globals.seq_number &&
            snapshot->seq_num != orte_restart_globals.seq_number ) {
            continue;
        }

        if( ORTE_SUCCESS != (ret = orte_sstore_base_extract_global_metadata( snapshot ) ) ) {
            exit_status = ret;
            goto cleanup;
        }

        opal_output(orte_restart_globals.output,
                    "Seq: %d\n",
                    snapshot->seq_num);

        if (NULL != snapshot->start_time ) {
            opal_output(orte_restart_globals.output,
                        "\tBegin Timestamp: %s\n",
                        snapshot->start_time);
        }
        if (NULL != snapshot->end_time ) {
            opal_output(orte_restart_globals.output,
                        "\tEnd Timestamp  : %s\n",
                        snapshot->end_time);
        }

        processes = opal_list_get_size(&snapshot->local_snapshots);
        opal_output(orte_restart_globals.output,
                    "\tProcesses: %d\n",
                    processes);

        for(item  = opal_list_get_first(&snapshot->local_snapshots);
            item != opal_list_get_end(&snapshot->local_snapshots);
            item  = opal_list_get_next(item) ) {
            vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;

            opal_output_verbose(10, orte_restart_globals.output,
                                "\t\tProcess: %u.%u \t CRS: %s \t Compress: %s (%s)",
                                vpid_snapshot->process_name.jobid,
                                vpid_snapshot->process_name.vpid,
                                vpid_snapshot->crs_comp,
                                vpid_snapshot->compress_comp,
                                vpid_snapshot->compress_postfix);
        }
    }

 cleanup:
    /*
     * NOTE(review): snapshot_ref_seqs is never released in this function;
     * it presumably should be handed to the argv free routine that pairs
     * with opal_argv_split() -- confirm and add.
     */
    return exit_status;
}
Example #3
0
/**
 * Prepare a snapshot object (and a handle) for restarting from a checkpoint.
 *
 * Records @p snapshot as the tool's current global snapshot, fills in its
 * reference, basedir and metadata_filename fields, verifies that both the
 * snapshot directory and the selected sequence directory exist, and then
 * extracts the global metadata into @p snapshot.
 *
 * @param handle    Output: set to 1 on success.
 * @param basedir   Directory holding the snapshot; when NULL,
 *                  orte_sstore_base_global_snapshot_dir is used.
 * @param ref       Snapshot reference name (sub-directory of basedir).
 * @param seq       Sequence number to restart from; a negative value asks for
 *                  the largest sequence number found by
 *                  orte_sstore_base_find_largest_seq_num().
 * @param snapshot  Snapshot object to populate. It is retained and stored in
 *                  tool_global_snapshot; any previously stored snapshot is
 *                  released.
 *
 * @return ORTE_SUCCESS on success, ORTE_ERROR on any failure (allocation
 *         failure, missing directory, or metadata that cannot be read).
 */
int orte_sstore_base_tool_request_restart_handle(orte_sstore_base_handle_t *handle,
                                                 char *basedir, char *ref, int seq,
                                                 orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    char * tmp_str = NULL;

    /*
     * Take the new reference before dropping the old one: if the caller
     * passes the snapshot that is already being tracked, releasing first
     * could drop its last reference and leave us holding a dead object.
     */
    OBJ_RETAIN(snapshot);
    if( NULL != tool_global_snapshot ) {
        OBJ_RELEASE(tool_global_snapshot);
    }
    tool_global_snapshot = snapshot;

    snapshot->reference = strdup(ref);
    snapshot->basedir   = strdup((NULL == basedir) ? orte_sstore_base_global_snapshot_dir
                                                   : basedir);
    if( NULL == snapshot->reference || NULL == snapshot->basedir ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /* asprintf() leaves its output pointer undefined on failure, so reset it */
    if( 0 > asprintf(&(snapshot->metadata_filename),
                     "%s/%s/%s",
                     snapshot->basedir,
                     snapshot->reference,
                     orte_sstore_base_global_metadata_filename) ) {
        snapshot->metadata_filename = NULL;
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Check the checkpoint location
     */
    if( 0 > asprintf(&tmp_str, "%s/%s",
                     snapshot->basedir,
                     snapshot->reference) ) {
        tmp_str = NULL;
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    if (0 >  (ret = access(tmp_str, F_OK)) ) {
        opal_output(0, ("Error: The snapshot requested does not exist!\n"
                        "Check the path (%s)!"),
                    tmp_str);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    free(tmp_str);
    tmp_str = NULL;

    /*
     * If we were asked to find the largest seq num
     */
    if( seq < 0 ) {
        if( ORTE_SUCCESS != (ret = orte_sstore_base_find_largest_seq_num(snapshot, &seq)) ) {
            opal_output(0, ("Error: Failed to find a valid sequence number in snapshot metadata!\n"
                            "Check the metadata file (%s)!"),
                        snapshot->metadata_filename);
            exit_status = ORTE_ERROR;
            goto cleanup;
        }
    }
    snapshot->seq_num = seq;

    /*
     * Check the checkpoint sequence location
     */
    if( 0 > asprintf(&tmp_str, "%s/%s/%d",
                     snapshot->basedir,
                     snapshot->reference,
                     snapshot->seq_num) ) {
        tmp_str = NULL;
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    if (0 >  (ret = access(tmp_str, F_OK)) ) {
        opal_output(0, ("Error: The snapshot sequence requested does not exist!\n"
                        "Check the path (%s)!"),
                    tmp_str);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    free(tmp_str);
    tmp_str = NULL;

    /*
     * Build the list of processes attached to the snapshot.
     * tmp_str has already been freed here, so report the metadata file path
     * kept in the snapshot rather than a NULL string.
     */
    if( ORTE_SUCCESS != (ret = orte_sstore_base_extract_global_metadata(snapshot)) ) {
        opal_output(0, "Error: Failed to extract process information! Check the metadata file (%s)!",
                    snapshot->metadata_filename);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Save some basic information: this function always reports handle 1
     */
    snapshot->ss_handle = 1;
    *handle = 1;

 cleanup:
    free(tmp_str);
    tmp_str = NULL;

    return exit_status;
}