예제 #1
0
static int opal_crs_blcr_checkpoint_cmd(pid_t pid, char * local_dir, char **fname, char **cmd)
{
    char **cr_argv = NULL;
    int argc = 0, ret;
    char * pid_str;
    int exit_status = OPAL_SUCCESS;
    char * loc_fname = NULL;

    blcr_get_checkpoint_filename(fname, pid);

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint_cmd(%d)", pid);

    asprintf(&loc_fname, "%s/%s", local_dir, *fname);

    /*
     * Build the command
     */
    if (OPAL_SUCCESS != (ret = opal_argv_append(&argc, &cr_argv, strdup(blcr_checkpoint_cmd)))) {
        exit_status = ret;
        goto cleanup;
    }

    if (OPAL_SUCCESS != (ret = opal_argv_append(&argc, &cr_argv, strdup("--pid")))) {
        exit_status = ret;
        goto cleanup;
    }

    asprintf(&pid_str, "%d", pid);
    if (OPAL_SUCCESS != (ret = opal_argv_append(&argc, &cr_argv, strdup(pid_str)))) {
        exit_status = ret;
        goto cleanup;
    }

    if (OPAL_SUCCESS != (ret = opal_argv_append(&argc, &cr_argv, strdup("--file")))) {
        exit_status = ret;
        goto cleanup;
    }

    if (OPAL_SUCCESS != (ret = opal_argv_append(&argc, &cr_argv, strdup(loc_fname)))) {
        exit_status = ret;
        goto cleanup;
    }

 cleanup:
    if(exit_status != OPAL_SUCCESS)
        *cmd = NULL;
    else 
        *cmd = opal_argv_join(cr_argv, ' ');
    
    if(NULL != pid_str) 
        free(pid_str);
    if( NULL != cr_argv)
        opal_argv_free(cr_argv);
    if(NULL != loc_fname) 
        free(loc_fname);

    return exit_status;
}
예제 #2
0
/**
 * Checkpoint process `pid` with BLCR.
 *
 * A new BLCR snapshot object is created and its reference name and
 * local/remote locations are copied from `base_snapshot`. The component name
 * is written to the snapshot metadata. If `pid` is this process (and the
 * BLCR request API is compiled in) the checkpoint is requested directly via
 * cr_request_checkpoint() or cr_request_file(); otherwise
 * blcr_checkpoint_peer() is used.
 *
 * @param pid            Process to checkpoint.
 * @param base_snapshot  Snapshot description supplying the reference name and
 *                       the local/remote locations.
 * @param state          Out: blcr_current_state after the checkpoint, or
 *                       OPAL_CRS_ERROR when the checkpoint itself fails.
 *
 * @return OPAL_SUCCESS on success, otherwise an error code.
 */
int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,  opal_crs_state_type_t *state)
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    cr_checkpoint_args_t cr_args;
    static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
#endif
    /* Both are released in cleanup so that no error path can leak them. */
    char *loc_fname = NULL;
    int fd = -1;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(%d, ---)", pid);

    /* Replace whatever the freshly constructed snapshot holds. */
    free(snapshot->super.reference_name);
    snapshot->super.reference_name = strdup(base_snapshot->reference_name);

    free(snapshot->super.local_location);
    snapshot->super.local_location  = strdup(base_snapshot->local_location);

    free(snapshot->super.remote_location);
    snapshot->super.remote_location  = strdup(base_snapshot->remote_location);

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
    if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, snapshot->super.component_name) ) ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to write component name to the directory for (%s).",
                    snapshot->super.reference_name);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * If we are checkpointing ourselves, do so directly:
     * use cr_request_checkpoint() if available, and cr_request_file() if not
     */
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
    if( pid == my_pid ) {
        blcr_get_checkpoint_filename(&(snapshot->context_filename), pid);

        /* On failure asprintf() leaves the pointer undefined, so reset it. */
        if( 0 > asprintf(&loc_fname, "%s/%s", snapshot->super.local_location, snapshot->context_filename) ) {
            loc_fname = NULL;
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to build the checkpoint filename for pid (%d)",
                        pid);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint SELF <%s>",
                            loc_fname);

#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
        fd = open(loc_fname,
                  O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
                  S_IRUSR | S_IWUSR);
        if( fd < 0 ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
                        loc_fname, pid);
            /* 'ret' still holds OPAL_SUCCESS from the metadata write above,
             * so an explicit error code is needed here. */
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        cr_initialize_checkpoint_args_t(&cr_args);
        cr_args.cr_scope = CR_SCOPE_PROC;
        cr_args.cr_fd    = fd;

        ret = cr_request_checkpoint(&cr_args, &cr_handle);
        if( ret < 0 ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
                        pid, loc_fname);
            exit_status = ret;
            goto cleanup; /* fd is closed in cleanup */
        }

        /* Wait for checkpoint to finish */
        do {
            ret = cr_poll_checkpoint(&cr_handle, NULL);
            if( ret < 0 ) {
                /* Check if restarting. This is not an error. */
                if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
                    ret = 0;
                    break;
                }
                /* If the call was interrupted by a signal, retry the call */
                else if (errno == EINTR) {
                    ;
                }
                /* Otherwise this is a real error that we need to deal with */
                else {
                    *state = OPAL_CRS_ERROR;
                    opal_output(mca_crs_blcr_component.super.output_handle,
                                "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
                                pid, loc_fname, ret);
                    exit_status = ret;
                    goto cleanup; /* fd is closed in cleanup */
                }
            }
        } while( ret < 0 );

        /* Close the file before the metadata is updated */
        close(fd);
        fd = -1;
#else
        /* Request a checkpoint be taken of the current process.
         * Since we are not guaranteed to finish the checkpoint before this
         * returns, we also need to wait for it.
         */
        cr_request_file(loc_fname);
        
        /* Wait for checkpoint to finish */
        do {
            usleep(1000); /* JJH Do we really want to sleep? */
        } while(CR_STATE_IDLE != cr_status());
#endif

        *state = blcr_current_state;
    }
    /*
     * Checkpointing another process
     */
    else 
#endif
    {
        ret = blcr_checkpoint_peer(pid, snapshot->super.local_location, &(snapshot->context_filename));

        if(OPAL_SUCCESS != ret) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d)",
                        pid);
            exit_status = ret;
            goto cleanup;
        }

        *state = blcr_current_state;
    }
    
    if(*state == OPAL_CRS_CONTINUE) {
        /*
         * Update the metadata file
         */
        if( OPAL_SUCCESS != (ret = blcr_update_snapshot_metadata(snapshot)) ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to update metadata for snapshot (%s).", 
                        snapshot->super.reference_name);
            exit_status = ret;
            goto cleanup;
        }
    }

    /*
     * Return to the caller
     *
     * NOTE(review): this only reassigns the local copy of the parameter, so
     * the caller never sees the new snapshot and the object created with
     * OBJ_NEW above is not released in this function. Handing it back would
     * need an interface change - confirm the intended ownership.
     */
    base_snapshot = &(snapshot->super);

 cleanup:
    if( fd >= 0 ) {
        close(fd);
    }
    free(loc_fname);

    return exit_status;
}
예제 #3
0
/**
 * Checkpoint the calling process with BLCR.
 *
 * Only self-checkpointing is supported: a `pid` other than my_pid is rejected
 * with OPAL_ERROR. The component name and the context filename are appended
 * to the snapshot's metadata file, then the checkpoint is requested via
 * cr_request_checkpoint() or cr_request_file(), depending on which one is
 * compiled in. The context is written to
 * <snapshot_directory>/<context_filename>, or to /dev/null when
 * opal_crs_blcr_dev_null is set.
 *
 * @param pid            Process to checkpoint; must equal my_pid.
 * @param base_snapshot  Snapshot to fill in; used as an
 *                       opal_crs_blcr_snapshot_t.
 * @param options        Checkpoint options; when `stop` is set the
 *                       checkpoint request carries SIGSTOP.
 * @param state          Out: blcr_current_state after the checkpoint, or
 *                       OPAL_CRS_ERROR when the checkpoint itself fails.
 *
 * @return OPAL_SUCCESS on success, otherwise an error code.
 */
int opal_crs_blcr_checkpoint(pid_t pid,
                             opal_crs_base_snapshot_t *base_snapshot,
                             opal_crs_base_ckpt_options_t *options,
                             opal_crs_state_type_t *state)
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_crs_blcr_snapshot_t *snapshot = NULL;
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    cr_checkpoint_args_t cr_args;
    static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
#endif
    /* Both are released in cleanup so that no error path can leak them. */
    int fd = -1;
    char *loc_fname = NULL;

    if( pid != my_pid ) {
        opal_output(0, "crs:blcr: checkpoint(%d, ---): Checkpointing of peers not allowed!", pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(%d, ---)", pid);

    snapshot = (opal_crs_blcr_snapshot_t *)base_snapshot;

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
    blcr_get_checkpoint_filename(&(snapshot->context_filename), pid);

    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP,    snapshot->super.component_name);
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename);

    fclose(snapshot->super.metadata );
    snapshot->super.metadata = NULL;

    /*
     * Work out where the context file goes.
     * On failure asprintf() leaves the pointer undefined, so reset it.
     */
    if( opal_crs_blcr_dev_null ) {
        loc_fname = strdup("/dev/null");
    } else if( 0 > asprintf(&loc_fname, "%s/%s", snapshot->super.snapshot_directory, snapshot->context_filename) ) {
        loc_fname = NULL;
    }
    if( NULL == loc_fname ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to build the checkpoint filename for pid (%d)",
                    pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    /* Make sure to identify the checkpointing thread, so that it is not
     * prevented from requesting the checkpoint after the debugger detaches
     */
    opal_cr_debug_set_current_ckpt_thread_self();
    checkpoint_thread_id = opal_thread_get_self();
    blcr_crdebug_refreshed_env = false;

    /* If checkpoint/restart enabled debugging then mark the detachment place */
    if( MPIR_debug_with_checkpoint ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint(): Detaching debugger...");
        MPIR_checkpoint_debugger_detach();
    }
#endif

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint SELF <%s>",
                        loc_fname);

    /*
     * Checkpoint ourselves:
     * use cr_request_checkpoint() if available, and cr_request_file() if not
     */
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    fd = open(loc_fname,
              O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
              S_IRUSR | S_IWUSR);
    if( fd < 0 ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
                    loc_fname, pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    cr_initialize_checkpoint_args_t(&cr_args);
    cr_args.cr_scope = CR_SCOPE_PROC;
    cr_args.cr_fd    = fd;
    if( options->stop ) {
        cr_args.cr_signal = SIGSTOP;
    }

    ret = cr_request_checkpoint(&cr_args, &cr_handle);
    if( ret < 0 ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
                    pid, loc_fname);
        exit_status = ret;
        goto cleanup; /* fd is closed in cleanup */
    }

    /* Wait for checkpoint to finish */
    do {
        ret = cr_poll_checkpoint(&cr_handle, NULL);
        if( ret < 0 ) {
            /* Check if restarting. This is not an error. */
            if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
                ret = 0;
                break;
            }
            /* If the call was interrupted by a signal, retry the call */
            else if (errno == EINTR) {
                ;
            }
            /* Otherwise this is a real error that we need to deal with */
            else {
                *state = OPAL_CRS_ERROR;
                opal_output(mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
                            pid, loc_fname, ret);
                exit_status = ret;
                goto cleanup; /* fd is closed in cleanup */
            }
        }
    } while( ret < 0 );

    /* Close the file */
    close(fd);
    fd = -1;
#else
    /* Request a checkpoint be taken of the current process.
     * Since we are not guaranteed to finish the checkpoint before this
     * returns, we also need to wait for it.
     */
    cr_request_file(loc_fname);

    /* Wait for checkpoint to finish */
    do {
        usleep(1000); /* JJH Do we really want to sleep? */
    } while(CR_STATE_IDLE != cr_status());
#endif
#endif

    *state = blcr_current_state;

 cleanup:
    if( fd >= 0 ) {
        close(fd);
    }
    free(loc_fname);

    /* snapshot is still NULL when the request was rejected up front. */
    if( NULL != snapshot && NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata );
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}