/* Write process state to the provided context file */ extern int gasneti_checkpoint_write(int fd) { cr_checkpoint_handle_t cr_handle; int rc; { /* Request the checkpoint */ cr_checkpoint_args_t cr_args; cr_initialize_checkpoint_args_t(&cr_args); cr_args.cr_scope = CR_SCOPE_PROC; cr_args.cr_flags = CR_CHKPT_ASYNC_ERR; /* defers reporting of most errors to "reap" */ cr_args.cr_target = 0; /* self */ cr_args.cr_fd = fd; rc = cr_request_checkpoint(&cr_args, &cr_handle); /* BLCR-TODO: error checking for cr_request_checkpoint() */ } do { /* This loop is necessary because checkpointing self causes EINTR */ rc = cr_wait_checkpoint(&cr_handle, NULL); /* BLCR-TODO: error checking for cr_wait_checkpoint() */ } while ((rc < 0) && (errno == EINTR)); rc = cr_reap_checkpoint(&cr_handle); if (rc >= 0) { /* Continue case */ return 0; } else if (errno == CR_ERESTARTED) { /* Restart case */ return 1; } else { /* Error case */ return -1; } }
/*
 * Checkpoint process 'pid' with BLCR.
 *
 * A new BLCR snapshot object is created and seeded with copies of the
 * reference name and local/remote locations of 'base_snapshot'; the component
 * name is recorded through opal_crs_base_metadata_write_token().
 *
 * When 'pid' is the calling process (and the BLCR request API is available)
 * the process checkpoints itself into "<local_location>/<context_filename>",
 * using cr_request_checkpoint()/cr_poll_checkpoint() if available and
 * cr_request_file() otherwise.  Any other pid is handed to
 * blcr_checkpoint_peer().
 *
 * On output '*state' is blcr_current_state after a successful checkpoint, or
 * OPAL_CRS_ERROR on the failure paths that set it.  When the state is
 * OPAL_CRS_CONTINUE the snapshot metadata is updated as well.
 *
 * Returns OPAL_SUCCESS, or an error code (OPAL_ERROR, the code of the failing
 * helper, or the negative value returned by the failing BLCR call).
 */
int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot, opal_crs_state_type_t *state)
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
    char *loc_fname = NULL;   /* full path of the context file; released in cleanup */
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    cr_checkpoint_args_t cr_args;
    static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
    int fd = -1;              /* context file descriptor; -1 while not open */
#endif

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(%d, ---)", pid);

    /* Replace whatever the freshly constructed snapshot holds by copies of
     * the caller's values (free(NULL) is a no-op). */
    free(snapshot->super.reference_name);
    snapshot->super.reference_name = strdup(base_snapshot->reference_name);

    free(snapshot->super.local_location);
    snapshot->super.local_location = strdup(base_snapshot->local_location);

    free(snapshot->super.remote_location);
    snapshot->super.remote_location = strdup(base_snapshot->remote_location);

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
    if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, snapshot->super.component_name) ) ) {
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to write component name to the directory for (%s).",
                    snapshot->super.reference_name);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * If we can checkpoint ourselves do so:
     * use cr_request_checkpoint() if available, and cr_request_file() if not
     */
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
    if( pid == my_pid ) {
        blcr_get_checkpoint_filename(&(snapshot->context_filename), pid);
        if( 0 > asprintf(&loc_fname, "%s/%s", snapshot->super.local_location, snapshot->context_filename) ) {
            /* The pointer is not defined after a failed asprintf() */
            loc_fname = NULL;
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to build the checkpoint file name for pid (%d)",
                        pid);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint SELF <%s>", loc_fname);

#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
        fd = open(loc_fname,
                  O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
                  S_IRUSR | S_IWUSR);
        if( fd < 0 ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
                        loc_fname, pid);
            /* 'ret' still holds OPAL_SUCCESS from the metadata write above,
             * so it must not be used as the error code here. */
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        cr_initialize_checkpoint_args_t(&cr_args);
        cr_args.cr_scope = CR_SCOPE_PROC;
        cr_args.cr_fd    = fd;

        ret = cr_request_checkpoint(&cr_args, &cr_handle);
        if( ret < 0 ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
                        pid, loc_fname);
            exit_status = ret;
            goto cleanup;   /* cleanup closes fd */
        }

        /* Wait for checkpoint to finish */
        do {
            ret = cr_poll_checkpoint(&cr_handle, NULL);
            if( ret < 0 ) {
                /* Check if restarting. This is not an error. */
                if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
                    ret = 0;
                    break;
                }
                /* If Call was interrupted by a signal, retry the call */
                else if (errno == EINTR) {
                    ;
                }
                /* Otherwise this is a real error that we need to deal with */
                else {
                    *state = OPAL_CRS_ERROR;
                    opal_output(mca_crs_blcr_component.super.output_handle,
                                "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
                                pid, loc_fname, ret);
                    exit_status = ret;
                    goto cleanup;   /* cleanup closes fd */
                }
            }
        } while( ret < 0 );

        /* Close the file */
        close(fd);
        fd = -1;
#else
        /* Request a checkpoint be taken of the current process.
         * Since we are not guaranteed to finish the checkpoint before this
         * returns, we also need to wait for it.
         */
        cr_request_file(loc_fname);

        /* Wait for checkpoint to finish */
        do {
            usleep(1000); /* JJH Do we really want to sleep? */
        } while(CR_STATE_IDLE != cr_status());
#endif

        *state = blcr_current_state;
        free(loc_fname);
        loc_fname = NULL;
    }
    /*
     * Checkpointing another process
     */
    else
#endif
    {
        ret = blcr_checkpoint_peer(pid, snapshot->super.local_location, &(snapshot->context_filename));

        if(OPAL_SUCCESS != ret) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d)",
                        pid);
            exit_status = ret;
            goto cleanup;
        }

        *state = blcr_current_state;
    }

    if(*state == OPAL_CRS_CONTINUE) {
        /*
         * Update the metadata file
         */
        if( OPAL_SUCCESS != (ret = blcr_update_snapshot_metadata(snapshot)) ) {
            *state = OPAL_CRS_ERROR;
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
                        snapshot->super.reference_name);
            exit_status = ret;
            goto cleanup;
        }
    }

    /*
     * Return to the caller
     *
     * NOTE(review): 'base_snapshot' is passed by value, so this assignment
     * only changes the local copy of the pointer; the caller never sees the
     * new snapshot object, which is also never released here.  Kept as-is
     * because changing it would alter the function's contract.
     */
    base_snapshot = &(snapshot->super);

 cleanup:
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    /* Still open only when an error interrupted the self-checkpoint */
    if( fd >= 0 ) {
        close(fd);
    }
#endif
    free(loc_fname);

    return exit_status;
}
/* hypre_SMGResidual: computes the residual r = b - A*x for the structured matrix A and the structured vectors x, b, r, using the base stride, base points and compute package stored in residual_vdata. Timing is started on residual_data->time_index. The work is done in two passes over compute_i: pass 0 calls hypre_InitializeIndtComputations() on the data of x, copies b into r over the base points and selects the "Indt" boxes of the compute package; pass 1 calls hypre_FinalizeIndtComputations() on the communication handle and selects the "Dept" boxes. After the switch, each pass accumulates r -= A*x over the selected boxes, one stencil entry (si) at a time. The hypre_BoxLoop3For form of that update is disabled with "#if 0"; the active branch is its expanded per-block form, whose innermost loop nest is delegated to OUT__1__6755__() (the in-line comment says this portion is preprocessed for the ROSE outliner). When BLCR_CHECKPOINTING is enabled and g_checkpoint_flag is still 0, a BLCR checkpoint of the calling process is requested into the file "dump.yy" with cr_signal set to SIGKILL, and the flag is incremented so this happens once. NOTE(review): in this view the definition is not closed (the braces of the enclosing loops and of the function body are missing) and the text is collapsed onto a few physical lines, so "//" comments and "#" directives share lines with code; the code below is left exactly as found -- confirm against the original file before relying on it. */
int hypre_SMGResidual( void *residual_vdata, hypre_StructMatrix *A, hypre_StructVector *x, hypre_StructVector *b, hypre_StructVector *r ) { int ierr = 0; hypre_SMGResidualData *residual_data = (hypre_SMGResidualData *)residual_vdata; hypre_IndexRef base_stride = (residual_data -> base_stride); hypre_BoxArray *base_points = (residual_data -> base_points); hypre_ComputePkg *compute_pkg = (residual_data -> compute_pkg); hypre_CommHandle *comm_handle; hypre_BoxArrayArray *compute_box_aa; hypre_BoxArray *compute_box_a; hypre_Box *compute_box; hypre_Box *A_data_box; hypre_Box *x_data_box; hypre_Box *b_data_box; hypre_Box *r_data_box; int Ai; int xi; int bi; int ri; double *Ap; double *xp; double *bp; double *rp; hypre_Index loop_size; hypre_IndexRef start; hypre_StructStencil *stencil; hypre_Index *stencil_shape; int stencil_size; int compute_i, i, j, si; int loopi, loopj, loopk; hypre_BeginTiming(residual_data -> time_index); /*----------------------------------------------------------------------- * Compute residual r = b - Ax *-----------------------------------------------------------------------*/ stencil = hypre_StructMatrixStencil(A); stencil_shape = hypre_StructStencilShape(stencil); stencil_size = hypre_StructStencilSize(stencil); for (compute_i = 0; compute_i < 2; compute_i++) { switch(compute_i) { case 0: { xp = hypre_StructVectorData(x); hypre_InitializeIndtComputations(compute_pkg, xp, &comm_handle); compute_box_aa = hypre_ComputePkgIndtBoxes(compute_pkg); /*---------------------------------------- * Copy b into r *----------------------------------------*/ compute_box_a = base_points; hypre_ForBoxI(i, compute_box_a) { compute_box = hypre_BoxArrayBox(compute_box_a, i); start = hypre_BoxIMin(compute_box); b_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(b), i); r_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(r), i); bp = hypre_StructVectorBoxData(b, i); rp = hypre_StructVectorBoxData(r, i); hypre_BoxGetStrideSize(compute_box, base_stride, 
loop_size); hypre_BoxLoop2Begin(loop_size, b_data_box, start, base_stride, bi, r_data_box, start, base_stride, ri); #define HYPRE_BOX_SMP_PRIVATE loopk,loopi,loopj,bi,ri #include "hypre_box_smp_forloop.h" hypre_BoxLoop2For(loopi, loopj, loopk, bi, ri) { rp[ri] = bp[bi]; } hypre_BoxLoop2End(bi, ri); } } break; case 1: { hypre_FinalizeIndtComputations(comm_handle); compute_box_aa = hypre_ComputePkgDeptBoxes(compute_pkg); } break; } /*-------------------------------------------------------------------- * Compute r -= A*x *--------------------------------------------------------------------*/ hypre_ForBoxArrayI(i, compute_box_aa) { compute_box_a = hypre_BoxArrayArrayBoxArray(compute_box_aa, i); A_data_box = hypre_BoxArrayBox(hypre_StructMatrixDataSpace(A), i); x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i); r_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(r), i); rp = hypre_StructVectorBoxData(r, i); hypre_ForBoxI(j, compute_box_a) { compute_box = hypre_BoxArrayBox(compute_box_a, j); start = hypre_BoxIMin(compute_box); for (si = 0; si < stencil_size; si++) { Ap = hypre_StructMatrixBoxData(A, i, si); xp = hypre_StructVectorBoxData(x, i) + hypre_BoxOffsetDistance(x_data_box, stencil_shape[si]); hypre_BoxGetStrideSize(compute_box, base_stride, loop_size); hypre_BoxLoop3Begin(loop_size, A_data_box, start, base_stride, Ai, x_data_box, start, base_stride, xi, r_data_box, start, base_stride, ri); #if 0 /* The following portion is preprocessed to be handled by ROSE outliner */ #define HYPRE_BOX_SMP_PRIVATE loopk,loopi,loopj,Ai,xi,ri #include "hypre_box_smp_forloop.h" hypre_BoxLoop3For(loopi, loopj, loopk, Ai, xi, ri) { rp[ri] -= Ap[Ai] * xp[xi]; } hypre_BoxLoop3End(Ai, xi, ri); #else for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) { loopi = 0; loopj = 0; loopk = 0; hypre__nx = hypre__mx; hypre__ny = hypre__my; hypre__nz = hypre__mz; if (hypre__num_blocks > 1) { if (hypre__dir == 0) { loopi = hypre__block * hypre__div + 
(((hypre__mod) < (hypre__block)) ? (hypre__mod) : (hypre__block)); hypre__nx = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0); } else if (hypre__dir == 1) { loopj = hypre__block * hypre__div + (((hypre__mod) < (hypre__block)) ? (hypre__mod) : (hypre__block)); hypre__ny = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0); } else if (hypre__dir == 2) { loopk = hypre__block * hypre__div + (((hypre__mod) < (hypre__block)) ? (hypre__mod) : (hypre__block)); hypre__nz = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0); } }; Ai = hypre__i1start + loopi * hypre__sx1 + loopj * hypre__sy1 + loopk * hypre__sz1; xi = hypre__i2start + loopi * hypre__sx2 + loopj * hypre__sy2 + loopk * hypre__sz2; ri = hypre__i3start + loopi * hypre__sx3 + loopj * hypre__sy3 + loopk * hypre__sz3; //begin of the loop #if 0 // for (loopk = 0; loopk < hypre__nz; loopk++) { for (loopj = 0; loopj < hypre__ny; loopj++) { for (loopi = 0; loopi < hypre__nx; loopi++) { { rp[ri] -= Ap[Ai] * xp[xi]; } Ai += hypre__sx1; xi += hypre__sx2; ri += hypre__sx3; } Ai += hypre__sy1 - hypre__nx * hypre__sx1; xi += hypre__sy2 - hypre__nx * hypre__sx2; ri += hypre__sy3 - hypre__nx * hypre__sx3; } Ai += hypre__sz1 - hypre__ny * hypre__sy1; xi += hypre__sz2 - hypre__ny * hypre__sy2; ri += hypre__sz3 - hypre__ny * hypre__sy3; } // end of the loop #else #if BLCR_CHECKPOINTING // Only checkpoint it at the first occurrance. 
if (g_checkpoint_flag == 0) { int err; cr_checkpoint_args_t cr_args; cr_checkpoint_handle_t cr_handle; cr_initialize_checkpoint_args_t(&cr_args); cr_args.cr_scope = CR_SCOPE_PROC;// a process cr_args.cr_target = 0; //self cr_args.cr_signal = SIGKILL; // kill after checkpointing cr_args.cr_fd = open("dump.yy", O_WRONLY|O_CREAT|O_LARGEFILE, 0400); if (cr_args.cr_fd < 0) { printf("Error: cannot open file for checkpoiting context\n"); abort(); } g_checkpoint_flag ++; printf("Checkpoiting: starting here ..\n"); err = cr_request_checkpoint(&cr_args, &cr_handle); if (err < 0) { printf("cannot request checkpoining! err=%d\n",err); abort(); } // block until the request is served cr_enter_cs(cr); cr_leave_cs(cr); printf("Checkpoiting: restarting here ..\n"); } #endif OUT__1__6755__(&Ai,&xi,&ri,&Ap,&xp,&rp,&loopi,&loopj,&loopk,&hypre__sx1,&hypre__sy1,&hypre__sz1,&hypre__sx2,&hypre__sy2,&hypre__sz2,&hypre__sx3,&hypre__sy3,&hypre__sz3,&hypre__nx,&hypre__ny,&hypre__nz); #endif } };
HYD_status HYDT_ckpoint_blcr_checkpoint(const char *prefix, int pgid, int id, int ckpt_num) { HYD_status status = HYD_SUCCESS; int ret; int fd; cr_checkpoint_args_t my_args; cr_checkpoint_handle_t my_handle; char filename[256]; HYDU_FUNC_ENTER(); /* build the checkpoint filename */ MPL_snprintf(filename, sizeof(filename), "%s/context-num%d-%d-%d", prefix, ckpt_num, pgid, id); /* remove existing checkpoint file, if any */ (void) unlink(filename); /* open the checkpoint file */ fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC /* | O_LARGEFILE */ , 0600); HYDU_ERR_CHKANDJUMP(status, fd < 0, HYD_INTERNAL_ERROR, "open failed: %s\n", strerror(errno)); cr_initialize_checkpoint_args_t(&my_args); my_args.cr_fd = fd; my_args.cr_scope = CR_SCOPE_TREE; /* issue the request */ ret = cr_request_checkpoint(&my_args, &my_handle); if (ret < 0) { HYDU_ERR_CHKANDJUMP(status, errno == CR_ENOSUPPORT, HYD_INTERNAL_ERROR, "Checkpointing failed. Make sure BLCR kernel module is loaded. %s\n", strerror(errno)); HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "cr_request_checkpoint failed, %s\n", strerror(errno)); } /* wait for the request to complete */ while (1) { ret = cr_poll_checkpoint(&my_handle, NULL); if (ret < 0) { if ((ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED)) { HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "trying to restart in a checkpoint\n"); } else if (errno == EINTR) { /* poll was interrupted by a signal -- retry */ } else { HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "cr_poll_checkpoint failed: %s\n", strerror(errno)); } } else if (ret == 0) { HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "cr_poll_checkpoint returned 0 unexpectedly\n"); } else { break; } } ret = close(my_args.cr_fd); HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "close failed, %s\n", strerror(errno)); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
// Request a checkpoint of the local process // The return value is // - negative in case of error // - zero when successfully resuming after the checkpoint // - positive when restarting from the checkpoint static int request_checkpoint( const char* filename ) { cr_checkpoint_args_t cr_file_args; cr_checkpoint_handle_t cr_handle; int cr_fd = -1; int return_code = 0; // Check current state CR_state_lock(); if ( cr_state != CR_READY ) { switch( cr_state ) { case CR_REQUEST_CHECKPOINT: case CR_CHECKPOINT: { PRINT_ERROR("Error: Already checkpointing... (cr_state=%d)\n", cr_state); return_code = -10; break; } default: { PRINT_ERROR("Error: Not ready to checkpoint... (cr_state=%d)\n", cr_state); return_code = -11; break; } } CR_state_unlock(); goto error; } else { // All is ok, proceed to checkpoint request CR_state_transition_nolock( CR_REQUEST_CHECKPOINT ); } CR_state_unlock(); cr_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0600); if ( cr_fd < 0 ) { PRINT_ERROR_ERRNO("Failed to open checkpoint file '%s'", errno, filename); return_code = -1; goto error; } int ret = cr_initialize_checkpoint_args_t(&cr_file_args); if (ret < 0) { PRINT_ERROR("BLCR call cr_initialize_checkpoint_args_t() failed\n"); return_code = -2; goto error; } cr_file_args.cr_scope = CR_SCOPE_PROC; cr_file_args.cr_target = getpid(); cr_file_args.cr_fd = cr_fd; cr_file_args.cr_signal = 0; cr_file_args.cr_timeout = 0; cr_file_args.cr_flags &= ~CR_CHKPT_DUMP_ALL; // Save None // Request a checkpoint PRINT_DEBUG( DEBUG_FT_verbose, "cr_request_checkpoint() with file '%s'\n", filename ); ret = cr_request_checkpoint(&cr_file_args, &cr_handle); PRINT_DEBUG( DEBUG_FT_verbose>1, "cr_request_checkpoint() returned %d\n", ret ); if (ret < 0) { PRINT_ERROR("BLCR call cr_request_checkpoint() failed with error %d: %s\n", errno, cr_strerror(errno)); return_code = -3; goto error; } // Wait for the end of the checkpoint, and retry while interrupted PRINT_DEBUG( DEBUG_FT_verbose, "cr_poll_checkpoint()\n" ); do { ret = 
cr_poll_checkpoint(&cr_handle, NULL); } while (ret == CR_POLL_CHKPT_ERR_PRE && errno == EINTR); PRINT_DEBUG( DEBUG_FT_verbose>1, "cr_poll_checkpoint() returned %d\n", ret ); // Check the result of the checkpoint if (ret == CR_POLL_CHKPT_ERR_POST && errno == CR_ERESTARTED) { // We are restarting, ignore this error code // The checkpoint file is not opened at restart cr_fd = -1; // Positive value means restart return_code = 1; return return_code; } else if (ret < 0) { // Checkpoint failed PRINT_ERROR("BLCR call cr_poll_checkpoint() failed with error %d: %s\n", errno, cr_strerror(errno)); // Negative value for failure return_code = -4; goto error; } else if (ret == 0) { // 0 means that the checkpoint is in progress // It should never happen because we don't specify any timeout when calling cr_poll_checkpoint() ASSERT_MSG( 0==1, "Internal error\n"); } // Close the checkpoint file ASSERT_MSG( cr_fd>=0, "Internal error\n"); ret = close(cr_fd); cr_fd = -1; PRINT_DEBUG( DEBUG_FT_verbose, "close() returned %d\n", ret ); if (ret < 0) { PRINT_ERROR_ERRNO("Failed to close file '%s'", errno, filename); return_code = -5; goto error; } // If we are here, it means that everything went good ASSERT_MSG( return_code==0, "Internal error\n"); return return_code; error: // An error happened, cleanup and return properly if ( cr_fd >= 0 ) { close( cr_fd ); cr_fd = -1; } // If the request failed, ie not the checkpoint itself // Restore the CR_READY state CR_state_lock(); if ( cr_state == CR_REQUEST_CHECKPOINT ) { CR_state_transition_nolock( CR_READY ); } CR_state_unlock(); return return_code; }
/*
 * Checkpoint the calling process with BLCR.
 *
 * Only self-checkpointing is supported: a 'pid' other than my_pid is refused
 * with OPAL_ERROR.  'base_snapshot' is used as the BLCR snapshot object; its
 * component name and context file name are set and appended to the snapshot
 * metadata file.  The context is written to
 * "<snapshot_directory>/<context_filename>", or to "/dev/null" when
 * opal_crs_blcr_dev_null is set.  If 'options->stop' is set, the checkpoint
 * request carries SIGSTOP as its signal.
 *
 * On output '*state' is blcr_current_state after a successful checkpoint, or
 * OPAL_CRS_ERROR on the failure paths that set it.
 *
 * Returns OPAL_SUCCESS, OPAL_ERROR, or the negative value returned by the
 * failing BLCR call.
 */
int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot, opal_crs_base_ckpt_options_t *options, opal_crs_state_type_t *state)
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_crs_blcr_snapshot_t *snapshot = NULL;
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    cr_checkpoint_args_t cr_args;
    static cr_checkpoint_handle_t cr_handle = (cr_checkpoint_handle_t)(-1);
#endif
    int fd = -1;              /* context file descriptor; -1 while not open */
    char *loc_fname = NULL;   /* context file path; released in cleanup */

    if( pid != my_pid ) {
        opal_output(0, "crs:blcr: checkpoint(%d, ---): Checkpointing of peers not allowed!", pid);
        exit_status = OPAL_ERROR;
        goto cleanup;   /* 'snapshot' is still NULL here */
    }

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(%d, ---)", pid);

    snapshot = (opal_crs_blcr_snapshot_t *)base_snapshot;

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
    blcr_get_checkpoint_filename(&(snapshot->context_filename), pid);

    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_CONTEXT, snapshot->context_filename);

    fclose(snapshot->super.metadata );
    snapshot->super.metadata = NULL;

    /*
     * If we can checkpoint ourselves do so:
     * use cr_request_checkpoint() if available, and cr_request_file() if not
     */
    if( opal_crs_blcr_dev_null ) {
        loc_fname = strdup("/dev/null");
    } else if( 0 > asprintf(&loc_fname, "%s/%s", snapshot->super.snapshot_directory, snapshot->context_filename) ) {
        /* The pointer is not defined after a failed asprintf() */
        loc_fname = NULL;
    }
    if( NULL == loc_fname ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to allocate the checkpoint file name for pid (%d)",
                    pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    /* Make sure to identify the checkpointing thread, so that it is not
     * prevented from requesting the checkpoint after the debugger detaches
     */
    opal_cr_debug_set_current_ckpt_thread_self();
    checkpoint_thread_id = opal_thread_get_self();
    blcr_crdebug_refreshed_env = false;

    /* If checkpoint/restart enabled debugging then mark detachment place */
    if( MPIR_debug_with_checkpoint ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint(): Detaching debugger...");
        MPIR_checkpoint_debugger_detach();
    }
#endif

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: checkpoint SELF <%s>", loc_fname);

#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1 || CRS_BLCR_HAVE_CR_REQUEST == 1
#if CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT == 1
    fd = open(loc_fname,
              O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE,
              S_IRUSR | S_IWUSR);
    if( fd < 0 ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to open checkpoint file (%s) for pid (%d)",
                    loc_fname, pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    cr_initialize_checkpoint_args_t(&cr_args);
    cr_args.cr_scope = CR_SCOPE_PROC;
    cr_args.cr_fd    = fd;
    if( options->stop ) {
        cr_args.cr_signal = SIGSTOP;
    }

    ret = cr_request_checkpoint(&cr_args, &cr_handle);
    if( ret < 0 ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_blcr_component.super.output_handle,
                    "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s)",
                    pid, loc_fname);
        exit_status = ret;
        goto cleanup;   /* cleanup closes fd */
    }

    /* Wait for checkpoint to finish */
    do {
        ret = cr_poll_checkpoint(&cr_handle, NULL);
        if( ret < 0 ) {
            /* Check if restarting. This is not an error. */
            if( (ret == CR_POLL_CHKPT_ERR_POST) && (errno == CR_ERESTARTED) ) {
                ret = 0;
                break;
            }
            /* If Call was interrupted by a signal, retry the call */
            else if (errno == EINTR) {
                ;
            }
            /* Otherwise this is a real error that we need to deal with */
            else {
                *state = OPAL_CRS_ERROR;
                opal_output(mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d) to file (%s) - poll failed with (%d)",
                            pid, loc_fname, ret);
                exit_status = ret;
                goto cleanup;   /* cleanup closes fd */
            }
        }
    } while( ret < 0 );

    /* Close the file */
    close(fd);
    fd = -1;
#else
    /* Request a checkpoint be taken of the current process.
     * Since we are not guaranteed to finish the checkpoint before this
     * returns, we also need to wait for it.
     */
    cr_request_file(loc_fname);

    /* Wait for checkpoint to finish */
    do {
        usleep(1000); /* JJH Do we really want to sleep? */
    } while(CR_STATE_IDLE != cr_status());
#endif
#endif

    *state = blcr_current_state;

 cleanup:
    /* Still open only when an error interrupted the checkpoint */
    if( fd >= 0 ) {
        close(fd);
    }
    free(loc_fname);

    /* 'snapshot' is NULL when the request was refused before it was set */
    if( NULL != snapshot && NULL != snapshot->super.metadata ) {
        fclose(snapshot->super.metadata );
        snapshot->super.metadata = NULL;
    }

    return exit_status;
}