/* * get_step_image_dir - get the dir to store step task images * IN cr: checkpoint/restart * RET image dir on success, NULL on error * * NOTE: only can be called in callbak */ static char * get_step_image_dir(int cr) { const struct cr_checkpoint_info *ckpt_info; const struct cr_restart_info *rstrt_info; const char *dest; char *rchar, *dir; if (cr) { /* checkpoint */ ckpt_info = cr_get_checkpoint_info(); if (!ckpt_info) { error("failed to get checkpoint info: %s", cr_strerror(errno)); return NULL; } dest = ckpt_info->dest; } else { /* retart */ rstrt_info = cr_get_restart_info(); if (!rstrt_info) { error("failed to get restart info: %s", cr_strerror(errno)); return NULL; } dest = rstrt_info->src; } rchar = strrchr(dest, '/'); if (rchar) { dir = xstrndup(dest, rchar - dest + 1); } xstrfmtcat(dir, "%u.%u", jobid, stepid); return dir; }
static int cr_callback(void *unused) { int rc; char *step_image_dir = NULL; rc = CR_CHECKPOINT_READY; if (step_launched) { step_image_dir = get_step_image_dir(1); if (step_image_dir == NULL) { error ("failed to get step image directory"); rc = CR_CHECKPOINT_PERM_FAILURE; } else if (slurm_checkpoint_tasks(jobid, stepid, time(NULL), /* timestamp */ step_image_dir, 60, /* wait */ nodelist) != SLURM_SUCCESS) { error ("failed to checkpoint step tasks"); rc = CR_CHECKPOINT_PERM_FAILURE; } xfree(step_image_dir); } rc = cr_checkpoint(rc); /* dump */ if (rc < 0) { fatal("checkpoint failed: %s", cr_strerror(errno)); } else if (rc == 0) { /* continue, nothing to do */ } else { /* restarted */ if (srun_pid) { /* srun forked */ if (step_launched) { step_image_dir = get_step_image_dir(0); if (step_image_dir == NULL) { fatal("failed to get step image directory"); } update_env("SLURM_RESTART_DIR", step_image_dir); xfree(step_image_dir); } if (fork_exec_srun()) { fatal("failed fork/exec srun"); } } /* XXX: step_launched => listen_fd valid */ step_launched = 0; debug2("step not launched."); pthread_cond_broadcast(&step_launch_cond); } return 0; }
int main(int argc, char **argv) { int debug_level, sig, srun_fd; struct sigaction sa; log_options_t logopt = LOG_OPTS_STDERR_ONLY; struct sockaddr_un ca; unsigned int ca_len = sizeof(ca); atexit(remove_listen_socket); /* copied from srun */ debug_level = _slurm_debug_env_val(); logopt.stderr_level += debug_level; log_init(xbasename(argv[0]), logopt, 0, NULL); if (init_srun_argv(argc, argv)) { fatal("failed to initialize arguments for running srun"); } if ((cr_id = cr_init()) < 0) { fatal("failed to initialize libcr: %s", cr_strerror(errno)); } (void)cr_register_callback(cr_callback, NULL, CR_THREAD_CONTEXT); /* forward signals. copied from cr_restart */ sa.sa_sigaction = signal_child; sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO; sigemptyset(&sa.sa_mask); for (sig = 0; sig < _NSIG; sig ++) { if (sig == SIGSTOP || sig == SIGKILL || sig == SIGCHLD) continue; sigaction(sig, &sa, NULL); } sa.sa_sigaction = on_child_exit; sa.sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP; sigaction(SIGCHLD, &sa, NULL); cr_enter_cs(cr_id); /* BEGIN CS: avoid race condition of whether srun is forked */ if ( fork_exec_srun() ) { fatal("failed fork/exec/wait srun"); } cr_leave_cs(cr_id); /* END CS */ while (1) { pthread_mutex_lock(&step_launch_mutex); while (step_launched) { /* just avoid busy waiting */ pthread_cond_wait(&step_launch_cond, &step_launch_mutex); } pthread_mutex_unlock(&step_launch_mutex); if (_wait_for_srun_connect() < 0) continue; cr_enter_cs(cr_id); /* BEGIN CS: checkpoint(callback) will be delayed */ srun_fd = accept(listen_fd, (struct sockaddr*)&ca, &ca_len); if (srun_fd < 0) { /* restarted before enter CS. socket will not be restored */ if (errno == EBADF) { cr_leave_cs(cr_id); continue; } else { fatal("failed to accept socket: %m"); } } _read_info_from_srun(srun_fd); close(srun_fd); step_launched = 1; debug2("step launched"); cr_leave_cs(cr_id); /* END CS */ } return 0; }
// Request a checkpoint of the local process // The return value is // - negative in case of error // - zero when successfully resuming after the checkpoint // - positive when restarting from the checkpoint static int request_checkpoint( const char* filename ) { cr_checkpoint_args_t cr_file_args; cr_checkpoint_handle_t cr_handle; int cr_fd = -1; int return_code = 0; // Check current state CR_state_lock(); if ( cr_state != CR_READY ) { switch( cr_state ) { case CR_REQUEST_CHECKPOINT: case CR_CHECKPOINT: { PRINT_ERROR("Error: Already checkpointing... (cr_state=%d)\n", cr_state); return_code = -10; break; } default: { PRINT_ERROR("Error: Not ready to checkpoint... (cr_state=%d)\n", cr_state); return_code = -11; break; } } CR_state_unlock(); goto error; } else { // All is ok, proceed to checkpoint request CR_state_transition_nolock( CR_REQUEST_CHECKPOINT ); } CR_state_unlock(); cr_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0600); if ( cr_fd < 0 ) { PRINT_ERROR_ERRNO("Failed to open checkpoint file '%s'", errno, filename); return_code = -1; goto error; } int ret = cr_initialize_checkpoint_args_t(&cr_file_args); if (ret < 0) { PRINT_ERROR("BLCR call cr_initialize_checkpoint_args_t() failed\n"); return_code = -2; goto error; } cr_file_args.cr_scope = CR_SCOPE_PROC; cr_file_args.cr_target = getpid(); cr_file_args.cr_fd = cr_fd; cr_file_args.cr_signal = 0; cr_file_args.cr_timeout = 0; cr_file_args.cr_flags &= ~CR_CHKPT_DUMP_ALL; // Save None // Request a checkpoint PRINT_DEBUG( DEBUG_FT_verbose, "cr_request_checkpoint() with file '%s'\n", filename ); ret = cr_request_checkpoint(&cr_file_args, &cr_handle); PRINT_DEBUG( DEBUG_FT_verbose>1, "cr_request_checkpoint() returned %d\n", ret ); if (ret < 0) { PRINT_ERROR("BLCR call cr_request_checkpoint() failed with error %d: %s\n", errno, cr_strerror(errno)); return_code = -3; goto error; } // Wait for the end of the checkpoint, and retry while interrupted PRINT_DEBUG( DEBUG_FT_verbose, "cr_poll_checkpoint()\n" ); do { ret = cr_poll_checkpoint(&cr_handle, NULL); } while (ret == CR_POLL_CHKPT_ERR_PRE && errno == EINTR); PRINT_DEBUG( DEBUG_FT_verbose>1, "cr_poll_checkpoint() returned %d\n", ret ); // Check the result of the checkpoint if (ret == CR_POLL_CHKPT_ERR_POST && errno == CR_ERESTARTED) { // We are restarting, ignore this error code // The checkpoint file is not opened at restart cr_fd = -1; // Positive value means restart return_code = 1; return return_code; } else if (ret < 0) { // Checkpoint failed PRINT_ERROR("BLCR call cr_poll_checkpoint() failed with error %d: %s\n", errno, cr_strerror(errno)); // Negative value for failure return_code = -4; goto error; } else if (ret == 0) { // 0 means that the checkpoint is in progress // It should never happen because we don't specify any timeout when calling cr_poll_checkpoint() ASSERT_MSG( 0==1, "Internal error\n"); } // Close the checkpoint file ASSERT_MSG( cr_fd>=0, "Internal error\n"); ret = close(cr_fd); cr_fd = -1; PRINT_DEBUG( DEBUG_FT_verbose, "close() returned %d\n", ret ); if (ret < 0) { PRINT_ERROR_ERRNO("Failed to close file '%s'", errno, filename); return_code = -5; goto error; } // If we are here, it means that everything went good ASSERT_MSG( return_code==0, "Internal error\n"); return return_code; error: // An error happened, cleanup and return properly if ( cr_fd >= 0 ) { close( cr_fd ); cr_fd = -1; } // If the request failed, ie not the checkpoint itself // Restore the CR_READY state CR_state_lock(); if ( cr_state == CR_REQUEST_CHECKPOINT ) { CR_state_transition_nolock( CR_READY ); } CR_state_unlock(); return return_code; }
// In CR_initialize(), put only code that must be called once // because CR_initialize() won't be called at restart // Code that needs to be run after each restart should go in CR_thread_start() or CR_Loop() int CR_initialize() { time_t tm; struct tm *stm; int rv = pthread_mutex_init( &cr_state_mutex, NULL ); if ( rv != 0 ) { PRINT_ERROR_ERRNO( "pthread_mutex_init() failed", errno ); return -1; } CR_state_transition( CR_INIT ); cr_client_id_t cr_id = cr_init(); if (cr_id < 0) { PRINT_ERROR("BLCR call cr_init() failed\n"); return -2; } if (cr_register_callback(CR_Callback, (void *) NULL, CR_THREAD_CONTEXT) == -1) { PRINT_ERROR("BLCR call cr_register_callback() failed with error %d: %s\n", errno, cr_strerror(errno)); return -3; } strncpy(ckpt_filename, DEFAULT_CHECKPOINT_FILENAME, CR_MAX_FILENAME); tm = time(NULL); if ((time_t) tm == -1) { PRINT_ERROR("time() failed\n"); return -4; } stm = localtime(&tm); if (!stm) { PRINT_ERROR("localtime() failed\n"); return -5; } snprintf(sessionid, CR_SESSION_MAX, "%d%d%d%d%d", stm->tm_yday, stm->tm_hour, stm->tm_min, stm->tm_sec, getpid()); sessionid[CR_SESSION_MAX - 1] = '\0'; return 0; }