/* * get_step_image_dir - get the dir to store step task images * IN cr: checkpoint/restart * RET image dir on success, NULL on error * * NOTE: only can be called in callbak */ static char * get_step_image_dir(int cr) { const struct cr_checkpoint_info *ckpt_info; const struct cr_restart_info *rstrt_info; const char *dest; char *rchar, *dir; if (cr) { /* checkpoint */ ckpt_info = cr_get_checkpoint_info(); if (!ckpt_info) { error("failed to get checkpoint info: %s", cr_strerror(errno)); return NULL; } dest = ckpt_info->dest; } else { /* retart */ rstrt_info = cr_get_restart_info(); if (!rstrt_info) { error("failed to get restart info: %s", cr_strerror(errno)); return NULL; } dest = rstrt_info->src; } rchar = strrchr(dest, '/'); if (rchar) { dir = xstrndup(dest, rchar - dest + 1); } xstrfmtcat(dir, "%u.%u", jobid, stepid); return dir; }
static int ckpt_cb(void *arg) { int rc, ret; const struct cr_restart_info* ri; if (MPIDI_Process.my_pg_rank == 0) { MPIDI_nem_ckpt_start_checkpoint = TRUE; /* poke the progress engine in case we're waiting in a blocking recv */ MPIDI_CH3_Progress_signal_completion(); } do { ret = sem_wait(&ckpt_sem); } while (ret == -1 && errno == EINTR); CHECK_ERR(ret, "sem_wait"); if (MPID_nem_netmod_func->ckpt_precheck) { int mpi_errno; mpi_errno = MPID_nem_netmod_func->ckpt_precheck(); CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_precheck failed"); } rc = cr_checkpoint(0); if (rc < 0) { ckpt_result = CKPT_ERROR; } else if (rc) { ckpt_result = CKPT_RESTART; ri = cr_get_restart_info(); CHECK_ERR(!ri, "cr_get_restart_info"); ret = restore_env(ri->requester, MPIDI_Process.my_pg_rank); CHECK_ERR(ret, "restore_env"); ret = restore_stdinouterr(MPIDI_Process.my_pg_rank); CHECK_ERR(ret, "restore_stdinouterr"); ret = reinit_pmi(); CHECK_ERR(ret, "reinit_pmi"); if (MPID_nem_netmod_func->ckpt_restart) { int mpi_errno; mpi_errno = MPID_nem_netmod_func->ckpt_restart(); CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_restart failed"); } } else { ckpt_result = CKPT_CONTINUE; if (MPID_nem_netmod_func->ckpt_continue) { int mpi_errno; mpi_errno = MPID_nem_netmod_func->ckpt_continue(); CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_continue failed"); } } do { ret = sem_post(&cont_sem); } while (ret == -1 && errno == EINTR); CHECK_ERR(ret, "sem_post"); return 0; }