/**
 * Take a BLCR checkpoint of the process identified by pid.
 *
 * A fresh BLCR snapshot object is created and its reference name and
 * local/remote locations are copied from base_snapshot. The component
 * name is written to the snapshot metadata, and then either this process
 * checkpoints itself (when pid == my_pid and BLCR request support was
 * compiled in) or blcr_checkpoint_peer() is asked to checkpoint the peer.
 *
 * @param pid            Process to checkpoint.
 * @param base_snapshot  Snapshot describing where the checkpoint goes
 *                       (reference_name, local_location, remote_location
 *                       are read from it).
 * @param state          Out: set to blcr_current_state after the
 *                       checkpoint attempt, or OPAL_CRS_ERROR on failure.
 *
 * @return OPAL_SUCCESS on success; otherwise the error code of the failing
 *         call, or OPAL_ERROR when the checkpoint file could not be named
 *         or opened.
 *
 * NOTE(review): the snapshot allocated with OBJ_NEW is never released or
 * handed back to the caller; see the note at the end of the function.
 */
int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot, opal_crs_state_type_t *state)
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
    /* Owned by this function; both are released in the cleanup section so
     * that no error path can leak them. */
    char *loc_fname = NULL;
    int fd = -1;
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    cr_checkpoint_args_t cr_args;
    static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
#endif

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(%d, ---)", pid);

    /* Replace whatever the constructor put in the snapshot with copies of
     * the caller's values (free(NULL) is a no-op). */
    free(snapshot->super.reference_name);
    snapshot->super.reference_name = strdup(base_snapshot->reference_name);

    free(snapshot->super.local_location);
    snapshot->super.local_location = strdup(base_snapshot->local_location);

    free(snapshot->super.remote_location);
    snapshot->super.remote_location = strdup(base_snapshot->remote_location);

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
    if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, snapshot->super.component_name) ) ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to write component name to the directory for (%s).",
                    snapshot->super.reference_name);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * If we can checkpoint ourselves, do so:
     * use cr_request_checkpoint() if available, and cr_request_file() if not
     */
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
    if( pid == my_pid ) {
        blcr_get_checkpoint_filename(&(snapshot->context_filename), pid);
        if( asprintf(&loc_fname, "%s/%s", snapshot->super.local_location, snapshot->context_filename) < 0 ) {
            /* The buffer contents are undefined when asprintf fails */
            loc_fname = NULL;
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to build the checkpoint file name for pid (%d)",
                        pid);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint SELF <%s>",
                            loc_fname);

#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
        fd = open(loc_fname,
                  O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
                  S_IRUSR | S_IWUSR);
        if( fd < 0 ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
                        loc_fname, pid);
            /* 'ret' still holds OPAL_SUCCESS from the metadata write, so
             * it must not be used as the error code here. */
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        cr_initialize_checkpoint_args_t(&cr_args);
        cr_args.cr_scope = CR_SCOPE_PROC;
        cr_args.cr_fd    = fd;

        ret = cr_request_checkpoint(&cr_args, &cr_handle);
        if( ret < 0 ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
                        pid, loc_fname);
            exit_status = ret;
            goto cleanup; /* fd is closed in cleanup */
        }

        /* Wait for checkpoint to finish */
        do {
            ret = cr_poll_checkpoint(&cr_handle, NULL);
            if( ret < 0 ) {
                /* Check if restarting. This is not an error. */
                if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
                    ret = 0;
                    break;
                }
                /* If Call was interrupted by a signal, retry the call */
                else if (errno == EINTR) {
                    ;
                }
                /* Otherwise this is a real error that we need to deal with */
                else {
                    *state = OPAL_CRS_ERROR;
                    opal_output(mca_crs_blcr_component.super.output_handle,
                                "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
                                pid, loc_fname, ret);
                    exit_status = ret;
                    goto cleanup; /* fd is closed in cleanup */
                }
            }
        } while( ret < 0 );

        /* Close the file */
        close(fd);
        fd = -1;
#else
        /* Request a checkpoint be taken of the current process.
         * Since we are not guaranteed to finish the checkpoint before this
         * returns, we also need to wait for it.
         */
        cr_request_file(loc_fname);

        /* Wait for checkpoint to finish */
        do {
            usleep(1000); /* JJH Do we really want to sleep? */
        } while(CR_STATE_IDLE != cr_status());
#endif

        *state = blcr_current_state;
        free(loc_fname);
        loc_fname = NULL;
    }
    /*
     * Checkpointing another process
     */
    else
#endif
    {
        ret = blcr_checkpoint_peer(pid, snapshot->super.local_location, &(snapshot->context_filename));

        if(OPAL_SUCCESS != ret) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d)",
                        pid);
            exit_status = ret;
            goto cleanup;
        }

        *state = blcr_current_state;
    }

    if(*state == OPAL_CRS_CONTINUE) {
        /*
         * Update the metadata file
         */
        if( OPAL_SUCCESS != (ret = blcr_update_snapshot_metadata(snapshot)) ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
                        snapshot->super.reference_name);
            exit_status = ret;
            goto cleanup;
        }
    }

    /*
     * Return to the caller
     *
     * NOTE(review): base_snapshot is passed by value, so this assignment
     * only changes the local copy of the pointer; the caller never sees
     * the new snapshot. Handing it back would need an interface change
     * (e.g. a pointer-to-pointer parameter) - confirm the intent.
     */
    base_snapshot = &(snapshot->super);

 cleanup:
    if( fd >= 0 ) {
        close(fd);
    }
    free(loc_fname);

    return exit_status;
}
/**
 * Take a BLCR checkpoint of the calling process.
 *
 * Only self-checkpointing is supported: a pid other than my_pid is
 * rejected. The component name and context file name are appended to the
 * snapshot's metadata file, then the checkpoint is written either to
 * "<snapshot_directory>/<context_filename>" or to /dev/null when
 * opal_crs_blcr_dev_null is set.
 *
 * @param pid            Process to checkpoint; must equal my_pid.
 * @param base_snapshot  Snapshot to fill in; it is cast to
 *                       opal_crs_blcr_snapshot_t, so it must really be a
 *                       BLCR snapshot.
 * @param options        Checkpoint options; when options->stop is set the
 *                       checkpoint is requested with SIGSTOP as its signal.
 *                       A NULL pointer is treated as "no options".
 * @param state          Out: set to blcr_current_state after the
 *                       checkpoint, or OPAL_CRS_ERROR when the checkpoint
 *                       file or the checkpoint request fails.
 *
 * @return OPAL_SUCCESS on success; OPAL_ERROR or the (negative) BLCR
 *         return code on failure.
 */
int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot, opal_crs_base_ckpt_options_t *options, opal_crs_state_type_t *state)
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_crs_blcr_snapshot_t *snapshot = NULL;
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    cr_checkpoint_args_t cr_args;
    static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
#endif
    /* Both are released in the cleanup section so no error path leaks them;
     * -1 marks "no descriptor open" (0 would be a valid descriptor). */
    int fd = -1;
    char *loc_fname = NULL;

    if( pid != my_pid ) {
        opal_output(0, "crs:blcr: checkpoint(%d, ---): Checkpointing of peers not allowed!", pid);
        exit_status = OPAL_ERROR;
        goto cleanup; /* snapshot is still NULL here; cleanup must cope */
    }

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(%d, ---)", pid);

    snapshot = (opal_crs_blcr_snapshot_t *)base_snapshot;

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
    blcr_get_checkpoint_filename(&(snapshot->context_filename), pid);

    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename);

    fclose(snapshot->super.metadata );
    snapshot->super.metadata = NULL;

    /*
     * If we can checkpoint ourselves, do so:
     * use cr_request_checkpoint() if available, and cr_request_file() if not
     */
    if( opal_crs_blcr_dev_null ) {
        loc_fname = strdup("/dev/null");
    } else {
        if( asprintf(&loc_fname, "%s/%s", snapshot->super.snapshot_directory, snapshot->context_filename) < 0 ) {
            /* The buffer contents are undefined when asprintf fails */
            loc_fname = NULL;
        }
    }
    if( NULL == loc_fname ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to build the checkpoint file name for pid (%d)",
                    pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    /* Make sure to identify the checkpointing thread, so that it is not
     * prevented from requesting the checkpoint after the debugger detaches
     */
    opal_cr_debug_set_current_ckpt_thread_self();
    checkpoint_thread_id = opal_thread_get_self();
    blcr_crdebug_refreshed_env = false;

    /* If checkpoint/restart enabled debugging then mark detachment place */
    if( MPIR_debug_with_checkpoint ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint(): Detaching debugger...");
        MPIR_checkpoint_debugger_detach();
    }
#endif

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint SELF <%s>",
                        loc_fname);

#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    fd = open(loc_fname,
              O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
              S_IRUSR | S_IWUSR);
    if( fd < 0 ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
                    loc_fname, pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    cr_initialize_checkpoint_args_t(&cr_args);
    cr_args.cr_scope = CR_SCOPE_PROC;
    cr_args.cr_fd    = fd;
    if( NULL != options && options->stop ) {
        cr_args.cr_signal = SIGSTOP;
    }

    ret = cr_request_checkpoint(&cr_args, &cr_handle);
    if( ret < 0 ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
                    pid, loc_fname);
        exit_status = ret;
        goto cleanup; /* fd is closed in cleanup */
    }

    /* Wait for checkpoint to finish */
    do {
        ret = cr_poll_checkpoint(&cr_handle, NULL);
        if( ret < 0 ) {
            /* Check if restarting. This is not an error. */
            if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
                ret = 0;
                break;
            }
            /* If Call was interrupted by a signal, retry the call */
            else if (errno == EINTR) {
                ;
            }
            /* Otherwise this is a real error that we need to deal with */
            else {
                *state = OPAL_CRS_ERROR;
                opal_output(mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
                            pid, loc_fname, ret);
                exit_status = ret;
                goto cleanup; /* fd is closed in cleanup */
            }
        }
    } while( ret < 0 );

    /* Close the file */
    close(fd);
    fd = -1;
#else
    /* Request a checkpoint be taken of the current process.
     * Since we are not guaranteed to finish the checkpoint before this
     * returns, we also need to wait for it.
     */
    cr_request_file(loc_fname);

    /* Wait for checkpoint to finish */
    do {
        usleep(1000); /* JJH Do we really want to sleep? */
    } while(CR_STATE_IDLE != cr_status());
#endif
#endif

    *state = blcr_current_state;

 cleanup:
    if( fd >= 0 ) {
        close(fd);
    }
    free(loc_fname);

    /* snapshot is NULL when the peer-checkpoint request was rejected */
    if( NULL != snapshot && NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata );
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}