static int opal_crs_blcr_thread_callback(void *arg) { const struct cr_checkpoint_info *ckpt_info = cr_get_checkpoint_info(); int ret; opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: thread_callback()"); OPAL_THREAD_LOCK(&blcr_lock); blcr_current_state = OPAL_CRS_CHECKPOINT; /* * Allow the checkpoint to be taken, if we requested it */ #if CRS_BLCR_HAVE_INFO_REQUESTER == 1 if( ckpt_info->requester != my_pid ) { ret = cr_checkpoint(CR_CHECKPOINT_OMIT); blcr_current_state = OPAL_CRS_RUNNING; opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: thread_callback(); WARNING: An external agent attempted to checkpoint this process " "when it did not expect to be checkpointed. Skipping this checkpoint request." " [%d != %d].", ckpt_info->requester, my_pid); return 0; } else #endif { ret = cr_checkpoint(0); } /* * Restarting */ if ( 0 < ret ) { opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: thread_callback: Restarting."); blcr_current_state = OPAL_CRS_RESTART; } /* * Continuing */ else { opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: thread_callback: Continue."); blcr_current_state = OPAL_CRS_CONTINUE; } OPAL_THREAD_UNLOCK(&blcr_lock); opal_condition_signal(&blcr_cond); return 0; }
/*
 * Checkpoint callback that asks for this process to be omitted from the
 * checkpoint.
 *
 * arg: unused.
 * Returns 0 when cr_checkpoint() answers -CR_EOMITTED (the expected reply to
 * an omit request) and -1 for any other result.
 */
static int my_callback(void *arg)
{
    const int status = cr_checkpoint(CR_CHECKPOINT_OMIT);

    /* This is the expected return for an omitted checkpoint. */
    if (status == -CR_EOMITTED) {
        return 0;
    }

    /*
     * Everything else is reported as a failure:
     *   -CR_ETEMPFAIL: one of the processes indicated that it couldn't take
     *                  the checkpoint now (try again later);
     *   -CR_EPERMFAIL: one of the processes indicated a permanent failure;
     *   any other value: something bad happened.
     */
    return -1;
}
static int ckpt_cb(void *arg) { int rc, ret; const struct cr_restart_info* ri; if (MPIDI_Process.my_pg_rank == 0) { MPIDI_nem_ckpt_start_checkpoint = TRUE; /* poke the progress engine in case we're waiting in a blocking recv */ MPIDI_CH3_Progress_signal_completion(); } do { ret = sem_wait(&ckpt_sem); } while (ret == -1 && errno == EINTR); CHECK_ERR(ret, "sem_wait"); if (MPID_nem_netmod_func->ckpt_precheck) { int mpi_errno; mpi_errno = MPID_nem_netmod_func->ckpt_precheck(); CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_precheck failed"); } rc = cr_checkpoint(0); if (rc < 0) { ckpt_result = CKPT_ERROR; } else if (rc) { ckpt_result = CKPT_RESTART; ri = cr_get_restart_info(); CHECK_ERR(!ri, "cr_get_restart_info"); ret = restore_env(ri->requester, MPIDI_Process.my_pg_rank); CHECK_ERR(ret, "restore_env"); ret = restore_stdinouterr(MPIDI_Process.my_pg_rank); CHECK_ERR(ret, "restore_stdinouterr"); ret = reinit_pmi(); CHECK_ERR(ret, "reinit_pmi"); if (MPID_nem_netmod_func->ckpt_restart) { int mpi_errno; mpi_errno = MPID_nem_netmod_func->ckpt_restart(); CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_restart failed"); } } else { ckpt_result = CKPT_CONTINUE; if (MPID_nem_netmod_func->ckpt_continue) { int mpi_errno; mpi_errno = MPID_nem_netmod_func->ckpt_continue(); CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_continue failed"); } } do { ret = sem_post(&cont_sem); } while (ret == -1 && errno == EINTR); CHECK_ERR(ret, "sem_post"); return 0; }
/*
 * Checkpoint callback: flushes stdout and stderr, then asks BLCR to omit
 * this process from the checkpoint.
 *
 * arg: unused.
 * Always returns 0; the results of the flushes and of cr_checkpoint() are
 * not inspected.
 */
static int demux_callback(void *arg)
{
    /* Push out buffered output before handing control to BLCR. */
    (void) fflush(stdout);
    (void) fflush(stderr);

    (void) cr_checkpoint(CR_CHECKPOINT_OMIT);

    return 0;
}
static int cr_callback(void *unused) { int rc; char *step_image_dir = NULL; rc = CR_CHECKPOINT_READY; if (step_launched) { step_image_dir = get_step_image_dir(1); if (step_image_dir == NULL) { error ("failed to get step image directory"); rc = CR_CHECKPOINT_PERM_FAILURE; } else if (slurm_checkpoint_tasks(jobid, stepid, time(NULL), /* timestamp */ step_image_dir, 60, /* wait */ nodelist) != SLURM_SUCCESS) { error ("failed to checkpoint step tasks"); rc = CR_CHECKPOINT_PERM_FAILURE; } xfree(step_image_dir); } rc = cr_checkpoint(rc); /* dump */ if (rc < 0) { fatal("checkpoint failed: %s", cr_strerror(errno)); } else if (rc == 0) { /* continue, nothing to do */ } else { /* restarted */ if (srun_pid) { /* srun forked */ if (step_launched) { step_image_dir = get_step_image_dir(0); if (step_image_dir == NULL) { fatal("failed to get step image directory"); } update_env("SLURM_RESTART_DIR", step_image_dir); xfree(step_image_dir); } if (fork_exec_srun()) { fatal("failed fork/exec srun"); } } /* XXX: step_launched => listen_fd valid */ step_launched = 0; debug2("step not launched."); pthread_cond_broadcast(&step_launch_cond); } return 0; }
/*
 * BLCR signal-context checkpoint callback.
 *
 * Lets the checkpoint proceed when this process requested it. When
 * CRS_BLCR_HAVE_INFO_REQUESTER == 1 and the requester is some other pid,
 * the checkpoint is omitted instead.
 *
 * arg: unused.
 * Always returns 0; the result of cr_checkpoint() is not inspected.
 */
static int opal_crs_blcr_signal_callback(void *arg)
{
    const struct cr_checkpoint_info *ckpt_info = cr_get_checkpoint_info();
    int flags = 0;

    /* Only read when CRS_BLCR_HAVE_INFO_REQUESTER == 1. */
    (void) arg;
    (void) ckpt_info;

    /*
     * Allow the checkpoint to be taken, if we requested it
     */
#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
    if (ckpt_info->requester != my_pid) {
        flags = CR_CHECKPOINT_OMIT;
    }
#endif

    (void) cr_checkpoint(flags);

    return 0;
}