示例#1
0
/*
 * get_step_image_dir - get the dir to store step task images
 * IN cr: non-zero for checkpoint, zero for restart
 * RET image dir on success, NULL on error
 *
 * The result is the checkpoint destination (or restart source) path up to
 * and including its last '/', with "<jobid>.<stepid>" appended. The string
 * is built by xstrndup()/xstrfmtcat(); presumably the caller releases it
 * with xfree() -- confirm against the callers.
 *
 * NOTE: can only be called from within a callback
 */
static char *
get_step_image_dir(int cr)
{
	const struct cr_checkpoint_info *ckpt_info;
	const struct cr_restart_info *rstrt_info;
	const char *dest;
	char *rchar, *dir = NULL;

	if (cr) {		/* checkpoint */
		ckpt_info = cr_get_checkpoint_info();
		if (!ckpt_info) {
			error("failed to get checkpoint info: %s",
			      cr_strerror(errno));
			return NULL;
		}
		dest = ckpt_info->dest;
	} else {		/* restart */
		rstrt_info = cr_get_restart_info();
		if (!rstrt_info) {
			error("failed to get restart info: %s",
			      cr_strerror(errno));
			return NULL;
		}
		dest = rstrt_info->src;
	}

	/*
	 * The image path must contain a directory component. Without one
	 * there is nothing to base the step directory on, and "dir" must not
	 * be handed to xstrfmtcat() unset.
	 */
	rchar = dest ? strrchr(dest, '/') : NULL;
	if (!rchar) {
		error("wrong image dest %s", dest ? dest : "(null)");
		return NULL;
	}

	/* keep everything up to and including the last '/' */
	dir = xstrndup(dest, rchar - dest + 1);
	xstrfmtcat(dir, "%u.%u", jobid, stepid);

	return dir;
}
/*
 * BLCR thread-context checkpoint callback.
 *
 * Under blcr_lock, marks the CRS state as OPAL_CRS_CHECKPOINT, lets BLCR take
 * the checkpoint, and records whether this process is restarting or
 * continuing. The lock is then released and blcr_cond is signaled.
 *
 * When CRS_BLCR_HAVE_INFO_REQUESTER is 1 and the checkpoint was requested by
 * a process other than my_pid, the checkpoint is omitted, the state is set
 * to OPAL_CRS_RUNNING and the function returns without signaling blcr_cond.
 *
 * arg: unused.
 * Returns 0 in all cases.
 */
static int opal_crs_blcr_thread_callback(void *arg) {
    const struct cr_checkpoint_info *ckpt_info = cr_get_checkpoint_info();
    int ret;
    
    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: thread_callback()");

    OPAL_THREAD_LOCK(&blcr_lock);
    blcr_current_state = OPAL_CRS_CHECKPOINT;

    /*
     * Allow the checkpoint to be taken, if we requested it
     */
#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
    if( ckpt_info->requester != my_pid ) {
        ret = cr_checkpoint(CR_CHECKPOINT_OMIT);
        blcr_current_state = OPAL_CRS_RUNNING;
        /* Release the lock taken above before leaving early; otherwise the
         * next locker of blcr_lock would block forever. */
        OPAL_THREAD_UNLOCK(&blcr_lock);
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback(); WARNING: An external agent attempted to checkpoint this process "
                            "when it did not expect to be checkpointed. Skipping this checkpoint request."
                            " [%d != %d].", ckpt_info->requester, my_pid);
        return 0;
    }
    else
#endif
    {
        ret = cr_checkpoint(0);
    }
    
    /*
     * Restarting: a positive return from cr_checkpoint()
     */
    if ( 0 < ret ) {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback: Restarting.");
        blcr_current_state = OPAL_CRS_RESTART;
    }
    /*
     * Continuing: any other return value (zero or negative)
     */
    else {
        opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                            "crs:blcr: thread_callback: Continue.");
        blcr_current_state = OPAL_CRS_CONTINUE;
    }

    OPAL_THREAD_UNLOCK(&blcr_lock);
    opal_condition_signal(&blcr_cond);

    return 0;
}
/*
 * BLCR signal-context checkpoint callback.
 *
 * Calls cr_checkpoint() exactly once: with CR_CHECKPOINT_OMIT when
 * CRS_BLCR_HAVE_INFO_REQUESTER is 1 and the request did not come from
 * my_pid, and with 0 otherwise. The result of cr_checkpoint() is ignored.
 *
 * arg: unused.
 * Returns 0 in all cases.
 */
static int opal_crs_blcr_signal_callback(void *arg) {
    const struct cr_checkpoint_info *info = cr_get_checkpoint_info();
    int flags = 0;

#if CRS_BLCR_HAVE_INFO_REQUESTER == 1
    /* Only take part in checkpoints that this process asked for. */
    if (info->requester != my_pid) {
        flags = CR_CHECKPOINT_OMIT;
    }
#endif

    (void) cr_checkpoint(flags);

    return 0;
}