int opal_crs_blcr_module_init(void) { void *crs_blcr_thread_callback_arg = NULL; void *crs_blcr_signal_callback_arg = NULL; opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: module_init()"); my_pid = getpid(); if( !opal_cr_is_tool ) { /* * Initialize BLCR */ client_id = cr_init(); if (0 > client_id) { opal_output(mca_crs_blcr_component.super.output_handle, "Error: crs:blcr: module_init: cr_init failed (%d)\n", client_id); return OPAL_ERROR; } } blcr_restart_cmd = strdup("cr_restart"); blcr_checkpoint_cmd = strdup("cr_checkpoint"); if( !opal_cr_is_tool ) { /* We need to make the lock and condition variable before * starting the thread, since the thread uses these vars. */ OBJ_CONSTRUCT(&blcr_lock, opal_mutex_t); OBJ_CONSTRUCT(&blcr_cond, opal_condition_t); /* * Register the thread handler */ cr_thread_callback_id = cr_register_callback(opal_crs_blcr_thread_callback, crs_blcr_thread_callback_arg, CR_THREAD_CONTEXT); /* * Register the signal handler * - even though we do not use it */ cr_signal_callback_id = cr_register_callback(opal_crs_blcr_signal_callback, crs_blcr_signal_callback_arg, CR_SIGNAL_CONTEXT); } /* * Now that we are done with init, set the state to running */ blcr_current_state = OPAL_CRS_RUNNING; opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: module_init() --> Finished [%d]", opal_cr_is_tool); return OPAL_SUCCESS; }
int MPIDI_nem_ckpt_init(void) { int mpi_errno = MPI_SUCCESS; cr_callback_id_t cb_id; cr_client_id_t client_id; int ret; MPIDI_STATE_DECL(MPID_STATE_MPIDI_NEM_CKPT_INIT); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_NEM_CKPT_INIT); if (!MPIR_CVAR_NEMESIS_ENABLE_CKPOINT) goto fn_exit; client_id = cr_init(); MPIU_ERR_CHKANDJUMP(client_id < 0 && errno == ENOSYS, mpi_errno, MPI_ERR_OTHER, "**blcr_mod"); cb_id = cr_register_callback(ckpt_cb, NULL, CR_THREAD_CONTEXT); MPIU_ERR_CHKANDJUMP1(cb_id == -1, mpi_errno, MPI_ERR_OTHER, "**intern", "**intern %s", MPIU_Strerror(errno)); checkpointing = FALSE; current_wave = 0; ret = sem_init(&ckpt_sem, 0, 0); MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**sem_init", "**sem_init %s", MPIU_Strerror(errno)); ret = sem_init(&cont_sem, 0, 0); MPIU_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**sem_init", "**sem_init %s", MPIU_Strerror(errno)); fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_NEM_CKPT_INIT); return mpi_errno; fn_fail: goto fn_exit; }
// In CR_initialize(), put only code that must be called once // because CR_initialize() won't be called at restart // Code that needs to be run after each restart should go in CR_thread_start() or CR_Loop() int CR_initialize() { time_t tm; struct tm *stm; int rv = pthread_mutex_init( &cr_state_mutex, NULL ); if ( rv != 0 ) { PRINT_ERROR_ERRNO( "pthread_mutex_init() failed", errno ); return -1; } CR_state_transition( CR_INIT ); cr_client_id_t cr_id = cr_init(); if (cr_id < 0) { PRINT_ERROR("BLCR call cr_init() failed\n"); return -2; } if (cr_register_callback(CR_Callback, (void *) NULL, CR_THREAD_CONTEXT) == -1) { PRINT_ERROR("BLCR call cr_register_callback() failed with error %d: %s\n", errno, cr_strerror(errno)); return -3; } strncpy(ckpt_filename, DEFAULT_CHECKPOINT_FILENAME, CR_MAX_FILENAME); tm = time(NULL); if ((time_t) tm == -1) { PRINT_ERROR("time() failed\n"); return -4; } stm = localtime(&tm); if (!stm) { PRINT_ERROR("localtime() failed\n"); return -5; } snprintf(sessionid, CR_SESSION_MAX, "%d%d%d%d%d", stm->tm_yday, stm->tm_hour, stm->tm_min, stm->tm_sec, getpid()); sessionid[CR_SESSION_MAX - 1] = '\0'; return 0; }
/*
 * Initialize BLCR for the checkpointing layer: call cr_init() and register
 * my_callback as a signal-context callback.
 *
 * NOTE(review): both failure paths jump to fn_fail without changing
 * `status`, so this function currently returns HYD_SUCCESS even when
 * cr_init() or cr_register_callback() fails.  Confirm whether callers rely
 * on that before turning it into an error return.
 */
HYD_status HYDT_ckpoint_blcr_init(void)
{
    HYD_status status = HYD_SUCCESS;
    /* The address of rc is handed to cr_register_callback() as the callback
     * argument and may be used after this function returns, so it must not
     * live on this function's stack. */
    static int rc;
    cr_client_id_t client_id;
    cr_callback_id_t callback_id;

    HYDU_FUNC_ENTER();

    client_id = cr_init();
    if (client_id < 0)
        goto fn_fail;

    callback_id = cr_register_callback(my_callback, &rc, CR_SIGNAL_CONTEXT);
    if (callback_id < 0)
        goto fn_fail;

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
int main(int argc, char **argv) { int debug_level, sig, srun_fd; struct sigaction sa; log_options_t logopt = LOG_OPTS_STDERR_ONLY; struct sockaddr_un ca; unsigned int ca_len = sizeof(ca); atexit(remove_listen_socket); /* copied from srun */ debug_level = _slurm_debug_env_val(); logopt.stderr_level += debug_level; log_init(xbasename(argv[0]), logopt, 0, NULL); if (init_srun_argv(argc, argv)) { fatal("failed to initialize arguments for running srun"); } if ((cr_id = cr_init()) < 0) { fatal("failed to initialize libcr: %s", cr_strerror(errno)); } (void)cr_register_callback(cr_callback, NULL, CR_THREAD_CONTEXT); /* forward signals. copied from cr_restart */ sa.sa_sigaction = signal_child; sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO; sigemptyset(&sa.sa_mask); for (sig = 0; sig < _NSIG; sig ++) { if (sig == SIGSTOP || sig == SIGKILL || sig == SIGCHLD) continue; sigaction(sig, &sa, NULL); } sa.sa_sigaction = on_child_exit; sa.sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP; sigaction(SIGCHLD, &sa, NULL); cr_enter_cs(cr_id); /* BEGIN CS: avoid race condition of whether srun is forked */ if ( fork_exec_srun() ) { fatal("failed fork/exec/wait srun"); } cr_leave_cs(cr_id); /* END CS */ while (1) { pthread_mutex_lock(&step_launch_mutex); while (step_launched) { /* just avoid busy waiting */ pthread_cond_wait(&step_launch_cond, &step_launch_mutex); } pthread_mutex_unlock(&step_launch_mutex); if (_wait_for_srun_connect() < 0) continue; cr_enter_cs(cr_id); /* BEGIN CS: checkpoint(callback) will be delayed */ srun_fd = accept(listen_fd, (struct sockaddr*)&ca, &ca_len); if (srun_fd < 0) { /* restarted before enter CS. socket will not be restored */ if (errno == EBADF) { cr_leave_cs(cr_id); continue; } else { fatal("failed to accept socket: %m"); } } _read_info_from_srun(srun_fd); close(srun_fd); step_launched = 1; debug2("step launched"); cr_leave_cs(cr_id); /* END CS */ } return 0; }
int main( int argc, char *argv[]) { struct timeval timeout; int i; int maxfd; int main_sock_out = 3; int main_sock_err = 4; int n; int newsock; pid_t parent; fd_set selset; struct routem *routem; #ifdef ENABLE_BLCR if (cr_init() < 0) { perror("Failed to initialize BLCR."); exit(5); } (void)cr_register_callback(demux_callback, NULL, CR_THREAD_CONTEXT); #endif /* ENABLE_BLCR */ parent = getppid(); /* disable cookie search - PW - mpiexec patch */ /* cookie = getenv("PBS_JOBCOOKIE"); if (cookie == 0) { fprintf(stderr, "%s: no PBS_JOBCOOKIE found in the env\n", argv[0]); exit(3); } #ifdef DEBUG printf("Cookie found in environment: %s\n", cookie); #endif */ if((maxfd = sysconf(_SC_OPEN_MAX)) < 0) { perror("unexpected return from sysconf."); exit(5); } routem = (struct routem *)calloc(maxfd, sizeof(struct routem)); if (routem == NULL) { perror("cannot alloc memory"); exit(5); } for (i = 0; i < maxfd; ++i) { routem[i].r_where = invalid; routem[i].r_nl = 1; } routem[main_sock_out].r_where = new_out; routem[main_sock_err].r_where = new_err; FD_ZERO(&readset); FD_SET(main_sock_out, &readset); FD_SET(main_sock_err, &readset); if (listen(main_sock_out, TORQUE_LISTENQUEUE) < 0) { perror("listen on out"); exit(5); } if (listen(main_sock_err, TORQUE_LISTENQUEUE) < 0) { perror("listen on err"); exit(5); } while (1) { selset = readset; timeout.tv_usec = 0; timeout.tv_sec = 10; n = select(FD_SETSIZE, &selset, (fd_set *)0, (fd_set *)0, &timeout); if (n == -1) { if (errno == EINTR) { n = 0; } else { fprintf(stderr, "%s: select failed\n", argv[0]); exit(1); } } else if (n == 0) { /* NOTE: on TRU64, init process does not have pid==1 */ if (getppid() != parent) { #ifdef DEBUG fprintf(stderr, "%s: Parent has gone, and so will I\n", argv[0]); #endif /* DEBUG */ break; } } /* END else if (n == 0) */ for (i = 0; (n != 0) && (i < maxfd); ++i) { if (FD_ISSET(i, &selset)) { /* this socket has data */ n--; switch ((routem + i)->r_where) { case new_out: case new_err: newsock = accept(i, 0, 0); 
(routem + newsock)->r_where = (routem + i)->r_where == new_out ? old_out : old_err; FD_SET(newsock, &readset); break; case old_out: case old_err: readit(i, routem + i); break; default: fprintf(stderr, "%s: internal error\n", argv[0]); exit(2); /*NOTREACHED*/ break; } } } } /* END while(1) */ return(0); } /* END main() */
int opal_crs_blcr_module_init(void) { void *crs_blcr_thread_callback_arg = NULL; void *crs_blcr_signal_callback_arg = NULL; opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: module_init()"); blcr_restart_cmd = strdup("cr_restart"); blcr_checkpoint_cmd = strdup("cr_checkpoint"); my_pid = getpid(); if( !opal_cr_is_tool ) { /* We need to make the lock and condition variable before * starting the thread, since the thread uses these vars. */ OBJ_CONSTRUCT(&blcr_lock, opal_mutex_t); OBJ_CONSTRUCT(&blcr_cond, opal_condition_t); /* * Initialize BLCR */ client_id = cr_init(); if (0 > client_id) { opal_output(mca_crs_blcr_component.super.output_handle, "Error: crs:blcr: module_init: cr_init failed (%d)\n", client_id); return OPAL_ERROR; } } #if OPAL_ENABLE_CRDEBUG == 1 blcr_crdebug_refreshed_env = false; #endif blcr_restart_cmd = strdup("cr_restart"); blcr_checkpoint_cmd = strdup("cr_checkpoint"); if( !opal_cr_is_tool ) { /* * Register the thread handler */ cr_thread_callback_id = cr_register_callback(opal_crs_blcr_thread_callback, crs_blcr_thread_callback_arg, CR_THREAD_CONTEXT); /* * Register the signal handler * - even though we do not use it */ cr_signal_callback_id = cr_register_callback(opal_crs_blcr_signal_callback, crs_blcr_signal_callback_arg, CR_SIGNAL_CONTEXT); #if OPAL_ENABLE_CRDEBUG == 1 /* * Checkpoint/restart enabled debugging hooks * "NO_CALLBACKS" -> non-MPI threads * "SIGNAL_CONTEXT" -> MPI threads * "THREAD_CONTEXT" -> BLCR threads */ cr_register_hook(CR_HOOK_CONT_NO_CALLBACKS, MPIR_checkpoint_debugger_crs_hook); cr_register_hook(CR_HOOK_CONT_SIGNAL_CONTEXT, MPIR_checkpoint_debugger_crs_hook); cr_register_hook(CR_HOOK_RSTRT_NO_CALLBACKS, MPIR_checkpoint_debugger_crs_hook); cr_register_hook(CR_HOOK_RSTRT_SIGNAL_CONTEXT, MPIR_checkpoint_debugger_crs_hook); #endif } /* * Now that we are done with init, set the state to running */ blcr_current_state = OPAL_CRS_RUNNING; opal_output_verbose(10, 
mca_crs_blcr_component.super.output_handle, "crs:blcr: module_init() --> Finished [%d]", opal_cr_is_tool); return OPAL_SUCCESS; }