Exemplo n.º 1
0
/*
 * get_step_image_dir - get the dir to store step task images
 * IN cr: non-zero for checkpoint, zero for restart
 * RET image dir on success, NULL on error
 *
 * The result is the directory part (up to and including the last '/') of
 * the checkpoint destination or restart source path reported by the
 * checkpoint library, with "<jobid>.<stepid>" appended. If that path
 * contains no '/', the result is just "<jobid>.<stepid>".
 *
 * The returned string is built by xstrndup()/xstrfmtcat(); presumably the
 * caller owns it and must release it with xfree() - confirm against callers.
 *
 * NOTE: can only be called in a callback
 */
static char *
get_step_image_dir(int cr)
{
	const struct cr_checkpoint_info *ckpt_info;
	const struct cr_restart_info *rstrt_info;
	const char *dest;
	char *rchar;
	/*
	 * Must start out NULL: when the path has no '/', nothing is copied
	 * into dir before xstrfmtcat(), which is expected to allocate a new
	 * string when handed a NULL one. Leaving it uninitialized would pass
	 * an indeterminate pointer to xstrfmtcat() and back to the caller.
	 */
	char *dir = NULL;

	if (cr) {		/* checkpoint */
		ckpt_info = cr_get_checkpoint_info();
		if (!ckpt_info) {
			error("failed to get checkpoint info: %s",
			      cr_strerror(errno));
			return NULL;
		}
		dest = ckpt_info->dest;
	} else {		/* restart */
		rstrt_info = cr_get_restart_info();
		if (!rstrt_info) {
			error("failed to get restart info: %s",
			      cr_strerror(errno));
			return NULL;
		}
		dest = rstrt_info->src;
	}

	/* keep everything up to and including the last '/' */
	rchar = strrchr(dest, '/');
	if (rchar) {
		dir = xstrndup(dest, rchar - dest + 1);
	}
	xstrfmtcat(dir, "%u.%u", jobid, stepid);

	return dir;
}
Exemplo n.º 2
0
/*
 * ckpt_cb - checkpoint callback.
 *
 * Sequence:
 *   1. On process-group rank 0, raise MPIDI_nem_ckpt_start_checkpoint and
 *      signal the progress engine.
 *   2. Wait on ckpt_sem (retrying when interrupted by a signal).
 *   3. Run the netmod's optional ckpt_precheck hook.
 *   4. Call cr_checkpoint(0) and record the outcome in ckpt_result:
 *        == 0 : CKPT_CONTINUE, then the optional ckpt_continue hook;
 *        >  0 : CKPT_RESTART, then restore the environment, stdin/out/err
 *               and PMI, followed by the optional ckpt_restart hook;
 *        <  0 : CKPT_ERROR.
 *   5. Post cont_sem.
 *
 * arg is not used.
 *
 * Returns 0 when the end of the function is reached. CHECK_ERR and
 * CHECK_ERR_MPI are defined outside this block; presumably they report the
 * failure and return early when their first argument is non-zero - confirm
 * against the macro definitions. Their arguments are left exactly as written
 * in case the macros stringify them.
 */
static int ckpt_cb(void *arg)
{
    int rc, ret;
    int mpi_errno;
    const struct cr_restart_info* ri;

    if (MPIDI_Process.my_pg_rank == 0) {
        MPIDI_nem_ckpt_start_checkpoint = TRUE;
        /* wake the progress engine: it may be parked in a blocking recv */
        MPIDI_CH3_Progress_signal_completion();
    }

    /* retry the wait for as long as it is interrupted by a signal */
    for (;;) {
        ret = sem_wait(&ckpt_sem);
        if (ret != -1 || errno != EINTR)
            break;
    }
    CHECK_ERR(ret, "sem_wait");

    if (MPID_nem_netmod_func->ckpt_precheck) {
        mpi_errno = MPID_nem_netmod_func->ckpt_precheck();
        CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_precheck failed");
    }

    rc = cr_checkpoint(0);
    if (rc == 0) {
        /* zero result: continue branch */
        ckpt_result = CKPT_CONTINUE;

        if (MPID_nem_netmod_func->ckpt_continue) {
            mpi_errno = MPID_nem_netmod_func->ckpt_continue();
            CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_continue failed");
        }
    } else if (rc > 0) {
        /* positive result: restart branch */
        ckpt_result = CKPT_RESTART;

        ri = cr_get_restart_info();
        CHECK_ERR(!ri, "cr_get_restart_info");

        ret = restore_env(ri->requester, MPIDI_Process.my_pg_rank);
        CHECK_ERR(ret, "restore_env");

        ret = restore_stdinouterr(MPIDI_Process.my_pg_rank);
        CHECK_ERR(ret, "restore_stdinouterr");

        ret = reinit_pmi();
        CHECK_ERR(ret, "reinit_pmi");

        if (MPID_nem_netmod_func->ckpt_restart) {
            mpi_errno = MPID_nem_netmod_func->ckpt_restart();
            CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_restart failed");
        }
    } else {
        /* negative result: cr_checkpoint() failed */
        ckpt_result = CKPT_ERROR;
    }

    /* post cont_sem, with the same interrupted-call retry as above */
    for (;;) {
        ret = sem_post(&cont_sem);
        if (ret != -1 || errno != EINTR)
            break;
    }
    CHECK_ERR(ret, "sem_post");

    return 0;
}