Пример #1
0
static int opal_crs_blcr_thread_callback(void *arg) {
    const struct cr_checkpoint_info *ckpt_info = cr_get_checkpoint_info();
    int ret;
    
    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: thread_callback()");

    OPAL_THREAD_LOCK(&blcr_lock);
    blcr_current_state = OPAL_CRS_CHECKPOINT;

    /*
     * Allow the checkpoint to be taken, if we requested it
     */
#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
    if( ckpt_info->requester != my_pid ) {
        ret = cr_checkpoint(CR_CHECKPOINT_OMIT);
        blcr_current_state = OPAL_CRS_RUNNING;
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback(); WARNING: An external agent attempted to checkpoint this process "
                            "when it did not expect to be checkpointed. Skipping this checkpoint request."
                            " [%d != %d].", ckpt_info->requester, my_pid);
        return 0;
    }
    else
#endif
    {
        ret = cr_checkpoint(0);
    }
    
    /*
     * Restarting
     */
    if ( 0 < ret ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback: Restarting.");
        blcr_current_state = OPAL_CRS_RESTART;
    }
    /*
     * Continuing
     */
    else {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback: Continue.");
        blcr_current_state = OPAL_CRS_CONTINUE;
    }

    OPAL_THREAD_UNLOCK(&blcr_lock);
    opal_condition_signal(&blcr_cond);

    return 0;
}
Пример #2
0
static int my_callback(void *arg)
{
    int rc;

    rc = cr_checkpoint(CR_CHECKPOINT_OMIT);

    switch (rc) {
    case -CR_ETEMPFAIL:
        /* One of the processes indicated that it couldn't take the checkpoint now.  Try again later. */
        return -1;
        break;
    case -CR_EPERMFAIL:
        /* One of the processes indicated a permanent failure */
        return -1;
        break;
    case -CR_EOMITTED:
        /* This is the expected return */
        break;
    default:
        /* Something bad happened */
        return -1;
    }


    return 0;
}
Пример #3
0
static int ckpt_cb(void *arg)
{
    int rc, ret;
    const struct cr_restart_info* ri;

    if (MPIDI_Process.my_pg_rank == 0) {
        MPIDI_nem_ckpt_start_checkpoint = TRUE;
        /* poke the progress engine in case we're waiting in a blocking recv */
        MPIDI_CH3_Progress_signal_completion();
    }

    do {
        ret = sem_wait(&ckpt_sem);
    } while (ret == -1 && errno == EINTR);
    CHECK_ERR(ret, "sem_wait");

    if (MPID_nem_netmod_func->ckpt_precheck) {
        int mpi_errno;
        mpi_errno = MPID_nem_netmod_func->ckpt_precheck();
        CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_precheck failed");
    }

    rc = cr_checkpoint(0);
    if (rc < 0) {
        ckpt_result = CKPT_ERROR;
    } else if (rc) {

        ckpt_result = CKPT_RESTART;
        ri = cr_get_restart_info();
        CHECK_ERR(!ri, "cr_get_restart_info");
        ret = restore_env(ri->requester, MPIDI_Process.my_pg_rank);
        CHECK_ERR(ret, "restore_env");
        ret = restore_stdinouterr(MPIDI_Process.my_pg_rank);
        CHECK_ERR(ret, "restore_stdinouterr");
        ret = reinit_pmi();
        CHECK_ERR(ret, "reinit_pmi");

        if (MPID_nem_netmod_func->ckpt_restart) {
            int mpi_errno;
            mpi_errno = MPID_nem_netmod_func->ckpt_restart();
            CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_restart failed");
        }
    } else {

        ckpt_result = CKPT_CONTINUE;

        if (MPID_nem_netmod_func->ckpt_continue) {
            int mpi_errno;
            mpi_errno = MPID_nem_netmod_func->ckpt_continue();
            CHECK_ERR_MPI(mpi_errno, mpi_errno, "ckpt_continue failed");
        }
    }
    
    do {
        ret = sem_post(&cont_sem);
    } while (ret == -1 && errno == EINTR);
    CHECK_ERR(ret, "sem_post");

    return 0;
}
Пример #4
0
static int demux_callback(void* arg)
{
    fflush(stdout);
    fflush(stderr);

    cr_checkpoint(CR_CHECKPOINT_OMIT);

    return 0;
}
Пример #5
0
static int
cr_callback(void *unused)
{
	int rc;
	char *step_image_dir = NULL;

	rc = CR_CHECKPOINT_READY;
	if (step_launched) {
		step_image_dir = get_step_image_dir(1);
		if (step_image_dir == NULL) {
			error ("failed to get step image directory");
			rc = CR_CHECKPOINT_PERM_FAILURE;
		} else if (slurm_checkpoint_tasks(jobid,
						  stepid,
						  time(NULL), /* timestamp */
						  step_image_dir,
						  60, /* wait */
						  nodelist) != SLURM_SUCCESS) {
			error ("failed to checkpoint step tasks");
			rc = CR_CHECKPOINT_PERM_FAILURE;
		}
		xfree(step_image_dir);
	}
	rc = cr_checkpoint(rc);	/* dump */

	if (rc < 0) {
		fatal("checkpoint failed: %s", cr_strerror(errno));
	} else if (rc == 0) {
		/* continue, nothing to do */
	} else {
		/* restarted */
		if (srun_pid) { /* srun forked */
			if (step_launched) {
				step_image_dir = get_step_image_dir(0);
				if (step_image_dir == NULL) {
					fatal("failed to get step image directory");
				}
				update_env("SLURM_RESTART_DIR", step_image_dir);
				xfree(step_image_dir);
			}

			if (fork_exec_srun()) {
				fatal("failed fork/exec srun");
			}
		}

		/* XXX: step_launched => listen_fd valid */
		step_launched = 0;

		debug2("step not launched.");

		pthread_cond_broadcast(&step_launch_cond);
	}

	return 0;
}
Пример #6
0
static int opal_crs_blcr_signal_callback(void *arg) {
    const struct cr_checkpoint_info *ckpt_info = cr_get_checkpoint_info();
    int ret;

    /*
     * Allow the checkpoint to be taken, if we requested it
     */
#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
    if( ckpt_info->requester != my_pid ) {
        ret = cr_checkpoint(CR_CHECKPOINT_OMIT);
        return 0;
    }
    else
#endif
    {
        ret = cr_checkpoint(0);
    }

    return 0;
}