示例#1
0
/*
 * get_step_image_dir - get the dir to store step task images
 * IN cr: non-zero when called for a checkpoint, zero when called for a restart
 * RET image dir on success (xmalloc'ed string, the callers release it with
 *     xfree()), NULL on error
 *
 * The result is the directory part of the checkpoint destination (or of the
 * restart source), up to and including the last '/', followed by
 * "<jobid>.<stepid>". If the path contains no '/', the result is just
 * "<jobid>.<stepid>".
 *
 * NOTE: can only be called from within the checkpoint callback
 */
static char *
get_step_image_dir(int cr)
{
	const struct cr_checkpoint_info *ckpt_info;
	const struct cr_restart_info *rstrt_info;
	const char *dest;
	const char *rchar;
	/* Must start out NULL: xstrfmtcat() appends to whatever dir points at,
	 * and dir is not assigned below when the path has no '/'. */
	char *dir = NULL;

	if (cr) {		/* checkpoint */
		ckpt_info = cr_get_checkpoint_info();
		if (!ckpt_info) {
			error("failed to get checkpoint info: %s",
			      cr_strerror(errno));
			return NULL;
		}
		dest = ckpt_info->dest;
	} else {		/* restart */
		rstrt_info = cr_get_restart_info();
		if (!rstrt_info) {
			error("failed to get restart info: %s",
			      cr_strerror(errno));
			return NULL;
		}
		dest = rstrt_info->src;
	}

	/* NOTE(review): it is not visible here whether the library can report
	 * a NULL path; guard anyway, since strrchr(NULL, ...) is undefined. */
	if (!dest) {
		error("no %s path available",
		      cr ? "checkpoint destination" : "restart source");
		return NULL;
	}

	rchar = strrchr(dest, '/');
	if (rchar) {
		/* keep everything up to and including the last '/' */
		dir = xstrndup(dest, rchar - dest + 1);
	}
	xstrfmtcat(dir, "%u.%u", jobid, stepid);

	return dir;
}
示例#2
0
static int
cr_callback(void *unused)
{
	int rc;
	char *step_image_dir = NULL;

	rc = CR_CHECKPOINT_READY;
	if (step_launched) {
		step_image_dir = get_step_image_dir(1);
		if (step_image_dir == NULL) {
			error ("failed to get step image directory");
			rc = CR_CHECKPOINT_PERM_FAILURE;
		} else if (slurm_checkpoint_tasks(jobid,
						  stepid,
						  time(NULL), /* timestamp */
						  step_image_dir,
						  60, /* wait */
						  nodelist) != SLURM_SUCCESS) {
			error ("failed to checkpoint step tasks");
			rc = CR_CHECKPOINT_PERM_FAILURE;
		}
		xfree(step_image_dir);
	}
	rc = cr_checkpoint(rc);	/* dump */

	if (rc < 0) {
		fatal("checkpoint failed: %s", cr_strerror(errno));
	} else if (rc == 0) {
		/* continue, nothing to do */
	} else {
		/* restarted */
		if (srun_pid) { /* srun forked */
			if (step_launched) {
				step_image_dir = get_step_image_dir(0);
				if (step_image_dir == NULL) {
					fatal("failed to get step image directory");
				}
				update_env("SLURM_RESTART_DIR", step_image_dir);
				xfree(step_image_dir);
			}

			if (fork_exec_srun()) {
				fatal("failed fork/exec srun");
			}
		}

		/* XXX: step_launched => listen_fd valid */
		step_launched = 0;

		debug2("step not launched.");

		pthread_cond_broadcast(&step_launch_cond);
	}

	return 0;
}
示例#3
0
/*
 * main - program entry point
 * IN argc, argv: command line; handed to init_srun_argv()
 * RET 0 (the service loop below does not terminate normally)
 *
 * Initializes logging and the checkpoint library, registers cr_callback(),
 * installs the signal handlers, starts srun through fork_exec_srun(), and
 * then loops: whenever no step is launched it waits for srun to connect,
 * reads the step information from the connection and marks the step as
 * launched.
 */
int
main(int argc, char **argv)
{
	int debug_level, sig, srun_fd;
	struct sigaction sa;
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	struct sockaddr_un ca;
	socklen_t ca_len;

	atexit(remove_listen_socket);

	/* copied from srun */
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(argv[0]), logopt, 0, NULL);

	if (init_srun_argv(argc, argv)) {
		fatal("failed to initialize arguments for running srun");
	}

	if ((cr_id = cr_init()) < 0) {
		fatal("failed to initialize libcr: %s", cr_strerror(errno));
	}
	(void)cr_register_callback(cr_callback, NULL, CR_THREAD_CONTEXT);

	/* forward signals. copied from cr_restart */
	sa.sa_sigaction = signal_child;
	sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	/* 0 is not a valid signal number, so start at 1 */
	for (sig = 1;  sig < _NSIG; sig ++) {
		if (sig == SIGSTOP ||
		    sig == SIGKILL ||
		    sig == SIGCHLD)
			continue;
		/* best effort: some signal numbers cannot be caught */
		sigaction(sig, &sa, NULL);
	}
	sa.sa_sigaction = on_child_exit;
	sa.sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP;
	sigaction(SIGCHLD, &sa, NULL);

	cr_enter_cs(cr_id); /* BEGIN CS: avoid race condition of whether srun is forked */
	if ( fork_exec_srun() ) {
		fatal("failed fork/exec/wait srun");
	}
	cr_leave_cs(cr_id); /* END CS */

	while (1) {
		pthread_mutex_lock(&step_launch_mutex);
		while (step_launched) {
			/* just avoid busy waiting */
			pthread_cond_wait(&step_launch_cond,
					  &step_launch_mutex);
		}
		pthread_mutex_unlock(&step_launch_mutex);

		if (_wait_for_srun_connect() < 0)
			continue;

		cr_enter_cs(cr_id); /* BEGIN CS: checkpoint(callback) will be delayed */

		/* The length argument of accept() is value-result: it is
		 * overwritten with the size of the peer address, so it has
		 * to be reset before every call. */
		ca_len = sizeof(ca);
		srun_fd = accept(listen_fd, (struct sockaddr*)&ca, &ca_len);
		if (srun_fd < 0) {
			/* restarted before enter CS. socket will not be restored */
			if (errno == EBADF) {
				cr_leave_cs(cr_id);
				continue;
			} else {
				fatal("failed to accept socket: %m");
			}
		}

		_read_info_from_srun(srun_fd);
		close(srun_fd);

		step_launched = 1;
		debug2("step launched");

		cr_leave_cs(cr_id); /* END CS */
	}

	return 0;
}
示例#4
0
// Request a checkpoint of the local process
// The return value is
// - negative in case of error
// - zero when successfully resuming after the checkpoint
// - positive when restarting from the checkpoint
static int request_checkpoint( const char* filename ) 
{
    cr_checkpoint_args_t cr_file_args;
    cr_checkpoint_handle_t cr_handle;
    int cr_fd = -1;
    int return_code = 0;

    // Check current state
    CR_state_lock();
    if ( cr_state != CR_READY ) {
        switch( cr_state ) {
            case CR_REQUEST_CHECKPOINT:
            case CR_CHECKPOINT:
            {
                PRINT_ERROR("Error: Already checkpointing... (cr_state=%d)\n", cr_state);
                return_code = -10;
                break;
            }
            default:
            {
                PRINT_ERROR("Error: Not ready to checkpoint... (cr_state=%d)\n", cr_state);
                return_code = -11;
                break;
            }
        }
        CR_state_unlock();
        goto error;
    } else {
        // All is ok, proceed to checkpoint request
        CR_state_transition_nolock( CR_REQUEST_CHECKPOINT );
    }
    CR_state_unlock();


    cr_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0600);
    if ( cr_fd < 0 ) {
        PRINT_ERROR_ERRNO("Failed to open checkpoint file '%s'", errno, filename);
        return_code = -1;
        goto error;
    }

    int ret = cr_initialize_checkpoint_args_t(&cr_file_args);
    if (ret < 0) {
        PRINT_ERROR("BLCR call cr_initialize_checkpoint_args_t() failed\n");
        return_code = -2;
        goto error;
    }

    cr_file_args.cr_scope = CR_SCOPE_PROC;
    cr_file_args.cr_target = getpid();
    cr_file_args.cr_fd = cr_fd;
    cr_file_args.cr_signal = 0;
    cr_file_args.cr_timeout = 0;
    cr_file_args.cr_flags &= ~CR_CHKPT_DUMP_ALL;    // Save None

    // Request a checkpoint
    PRINT_DEBUG( DEBUG_FT_verbose, "cr_request_checkpoint() with file '%s'\n", filename );
    ret = cr_request_checkpoint(&cr_file_args, &cr_handle);
    PRINT_DEBUG( DEBUG_FT_verbose>1, "cr_request_checkpoint() returned %d\n", ret );
    if (ret < 0) {
        PRINT_ERROR("BLCR call cr_request_checkpoint() failed with error %d: %s\n", errno, cr_strerror(errno));
        return_code = -3;
        goto error;
    }

    // Wait for the end of the checkpoint, and retry while interrupted
    PRINT_DEBUG( DEBUG_FT_verbose, "cr_poll_checkpoint()\n" );
    do {
        ret = cr_poll_checkpoint(&cr_handle, NULL);
    } while (ret == CR_POLL_CHKPT_ERR_PRE && errno == EINTR);
    PRINT_DEBUG( DEBUG_FT_verbose>1, "cr_poll_checkpoint() returned %d\n", ret );

    // Check the result of the checkpoint
    if (ret == CR_POLL_CHKPT_ERR_POST && errno == CR_ERESTARTED) { 
        // We are restarting, ignore this error code

        // The checkpoint file is not opened at restart
        cr_fd = -1;

        // Positive value means restart
        return_code = 1;
        return return_code;
    } else if (ret < 0) {
        // Checkpoint failed
        PRINT_ERROR("BLCR call cr_poll_checkpoint() failed with error %d: %s\n", errno, cr_strerror(errno));

        // Negative value for failure
        return_code = -4;
        goto error;
    } else if (ret == 0) {
        // 0 means that the checkpoint is in progress
        // It should never happen because we don't specify any timeout when calling cr_poll_checkpoint()
        ASSERT_MSG( 0==1, "Internal error\n");
    }

    // Close the checkpoint file
    ASSERT_MSG( cr_fd>=0, "Internal error\n");
    ret = close(cr_fd);
    cr_fd = -1;
    PRINT_DEBUG( DEBUG_FT_verbose, "close() returned %d\n", ret );
    if (ret < 0) {
        PRINT_ERROR_ERRNO("Failed to close file '%s'", errno, filename);
        return_code = -5;
        goto error;
    }

    // If we are here, it means that everything went good
    ASSERT_MSG( return_code==0, "Internal error\n");
    return return_code;

error:
    // An error happened, cleanup and return properly
    if ( cr_fd >= 0 ) {
        close( cr_fd );
        cr_fd = -1;
    }

    // If the request failed, ie not the checkpoint itself
    // Restore the CR_READY state
    CR_state_lock();
    if ( cr_state == CR_REQUEST_CHECKPOINT ) {
        CR_state_transition_nolock( CR_READY );
    }
    CR_state_unlock();

    return return_code;
}
示例#5
0
// In CR_initialize(), put only code that must be called once
// because CR_initialize() won't be called at restart
// Code that needs to be run after each restart should go in CR_thread_start() or CR_Loop()
//
// Initializes the state mutex, moves the state to CR_INIT, initializes BLCR
// and registers CR_Callback, sets the default checkpoint file name and builds
// the session id from the current local time and the process id.
//
// The return value is
// - zero on success
// - -1 if pthread_mutex_init() failed
// - -2 if cr_init() failed
// - -3 if cr_register_callback() failed
// - -4 if time() failed
// - -5 if localtime() failed
int CR_initialize()
{
    time_t tm;
    struct tm *stm;

    int rv = pthread_mutex_init( &cr_state_mutex, NULL );
    if ( rv != 0 ) {
        // pthread functions return the error number and do not set errno
        PRINT_ERROR_ERRNO( "pthread_mutex_init() failed", rv );
        return -1;
    }

    CR_state_transition( CR_INIT );

    cr_client_id_t cr_id = cr_init();
    if (cr_id < 0) {
        PRINT_ERROR("BLCR call cr_init() failed\n");
        return -2;
    }

    if (cr_register_callback(CR_Callback, (void *) NULL, CR_THREAD_CONTEXT) == -1) {
        PRINT_ERROR("BLCR call cr_register_callback() failed with error %d: %s\n", errno, cr_strerror(errno));
        return -3;
    }

    // strncpy() does not terminate the destination when the source fills it
    // completely, so terminate explicitly (this relies on ckpt_filename
    // holding at least CR_MAX_FILENAME bytes, as the strncpy() bound already does)
    strncpy(ckpt_filename, DEFAULT_CHECKPOINT_FILENAME, CR_MAX_FILENAME);
    ckpt_filename[CR_MAX_FILENAME - 1] = '\0';

    tm = time(NULL);
    if (tm == (time_t) -1) {
        PRINT_ERROR("time() failed\n");
        return -4;
    }

    stm = localtime(&tm);
    if (!stm) {
        PRINT_ERROR("localtime() failed\n");
        return -5;
    }

    // Session id: day of year, hour, minute, second and pid, concatenated
    // pid_t is cast because its width is not guaranteed to match %d
    snprintf(sessionid, CR_SESSION_MAX, "%d%d%d%d%d", stm->tm_yday, stm->tm_hour, stm->tm_min, stm->tm_sec, (int) getpid());
    sessionid[CR_SESSION_MAX - 1] = '\0';

    return 0;
}