int opal_crs_blcr_module_init(void)
{
    void *crs_blcr_thread_callback_arg = NULL;
    void *crs_blcr_signal_callback_arg = NULL;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: module_init()");

    my_pid = getpid();

    if( !opal_cr_is_tool ) {
        /*
         * Initialize BLCR
         */
        client_id = cr_init();
        if (0 > client_id) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "Error: crs:blcr: module_init: cr_init failed (%d)\n", client_id);
            return OPAL_ERROR;
        }
    }

    blcr_restart_cmd    = strdup("cr_restart");
    blcr_checkpoint_cmd = strdup("cr_checkpoint");
    
    if( !opal_cr_is_tool ) {
        /* We need to make the lock and condition variable before
         * starting the thread, since the thread uses these vars.
         */
        OBJ_CONSTRUCT(&blcr_lock, opal_mutex_t);
        OBJ_CONSTRUCT(&blcr_cond, opal_condition_t);
        
        /*
         * Register the thread handler
         */
        cr_thread_callback_id = cr_register_callback(opal_crs_blcr_thread_callback,
                                                     crs_blcr_thread_callback_arg,
                                                     CR_THREAD_CONTEXT);
        /*
         * Register the signal handler
         *  - even though we do not use it
         */
        cr_signal_callback_id = cr_register_callback(opal_crs_blcr_signal_callback,
                                                     crs_blcr_signal_callback_arg,
                                                     CR_SIGNAL_CONTEXT);
    }

    /*
     * Now that we are done with init, set the state to running
     */
    blcr_current_state = OPAL_CRS_RUNNING;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: module_init() --> Finished [%d]",
                        opal_cr_is_tool);
    
    return OPAL_SUCCESS;
}
示例#2
0
/*
 * Set up BLCR-based checkpointing for Nemesis.
 *
 * When MPIR_CVAR_NEMESIS_ENABLE_CKPOINT is false nothing is initialized
 * and MPI_SUCCESS is returned.  Otherwise BLCR is initialized, ckpt_cb is
 * registered as a thread-context callback, the checkpoint bookkeeping
 * (checkpointing, current_wave) is reset and the two semaphores ckpt_sem
 * and cont_sem are created with an initial count of 0.
 *
 * Returns mpi_errno; failures are routed through the MPIU_ERR_CHKANDJUMP*
 * macros (presumably setting mpi_errno and jumping to fn_fail -- their
 * definitions are not visible here).
 */
int MPIDI_nem_ckpt_init(void)
{
    int mpi_errno = MPI_SUCCESS;
    cr_client_id_t blcr_client;
    cr_callback_id_t callback_id;
    int rc;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_NEM_CKPT_INIT);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_NEM_CKPT_INIT);

    if (MPIR_CVAR_NEMESIS_ENABLE_CKPOINT) {
        /* Only the ENOSYS failure of cr_init() is treated as an error here. */
        blcr_client = cr_init();
        MPIU_ERR_CHKANDJUMP(blcr_client < 0 && errno == ENOSYS, mpi_errno, MPI_ERR_OTHER, "**blcr_mod");

        callback_id = cr_register_callback(ckpt_cb, NULL, CR_THREAD_CONTEXT);
        MPIU_ERR_CHKANDJUMP1(callback_id == -1, mpi_errno, MPI_ERR_OTHER, "**intern", "**intern %s", MPIU_Strerror(errno));

        /* No checkpoint is in progress yet. */
        checkpointing = FALSE;
        current_wave = 0;

        /* Both semaphores are process-private and start at zero. */
        rc = sem_init(&ckpt_sem, 0, 0);
        MPIU_ERR_CHKANDJUMP1(rc, mpi_errno, MPI_ERR_OTHER, "**sem_init", "**sem_init %s", MPIU_Strerror(errno));
        rc = sem_init(&cont_sem, 0, 0);
        MPIU_ERR_CHKANDJUMP1(rc, mpi_errno, MPI_ERR_OTHER, "**sem_init", "**sem_init %s", MPIU_Strerror(errno));
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_NEM_CKPT_INIT);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
示例#3
0
/*
 * One-time initialization of the checkpoint/restart layer.
 *
 * Put only code that must be called once in CR_initialize(), because
 * CR_initialize() won't be called at restart.  Code that needs to be run
 * after each restart should go in CR_thread_start() or CR_Loop().
 *
 * Steps: create cr_state_mutex, move to the CR_INIT state, initialize BLCR,
 * register CR_Callback as a thread-context callback, set the default
 * checkpoint file name and build the session id from the local time and pid.
 *
 * Returns 0 on success, or a negative value identifying the failed step:
 *   -1 pthread_mutex_init(), -2 cr_init(), -3 cr_register_callback(),
 *   -4 time(), -5 localtime().
 */
int CR_initialize(void)
{
    int rv = pthread_mutex_init(&cr_state_mutex, NULL);
    if (rv != 0) {
        /* pthread functions return the error number; errno is not set. */
        PRINT_ERROR_ERRNO("pthread_mutex_init() failed", rv);
        return -1;
    }

    CR_state_transition(CR_INIT);

    cr_client_id_t cr_id = cr_init();
    if (cr_id < 0) {
        PRINT_ERROR("BLCR call cr_init() failed\n");
        return -2;
    }

    if (cr_register_callback(CR_Callback, (void *) NULL, CR_THREAD_CONTEXT) == -1) {
        PRINT_ERROR("BLCR call cr_register_callback() failed with error %d: %s\n", errno, cr_strerror(errno));
        return -3;
    }

    /* strncpy() does not terminate the destination when the source fills it. */
    strncpy(ckpt_filename, DEFAULT_CHECKPOINT_FILENAME, CR_MAX_FILENAME);
    ckpt_filename[CR_MAX_FILENAME - 1] = '\0';

    time_t now = time(NULL);
    if (now == (time_t) -1) {
        PRINT_ERROR("time() failed\n");
        return -4;
    }

    struct tm *stm = localtime(&now);
    if (!stm) {
        PRINT_ERROR("localtime() failed\n");
        return -5;
    }

    /* Session id: day-of-year, hour, minute, second and pid concatenated. */
    snprintf(sessionid, CR_SESSION_MAX, "%d%d%d%d%d", stm->tm_yday, stm->tm_hour, stm->tm_min, stm->tm_sec, (int) getpid());
    sessionid[CR_SESSION_MAX - 1] = '\0';

    return 0;
}
示例#4
0
文件: ckpoint_blcr.c 项目: adk9/hydra
/*
 * Initialize BLCR for this process and register my_callback as a
 * signal-context checkpoint callback.
 *
 * Returns the HYD_status held in `status` (see the review note at fn_fail).
 */
HYD_status HYDT_ckpoint_blcr_init(void)
{
    /*
     * Argument handed to my_callback through BLCR.  The registration outlives
     * this function, so the storage must too: the address of an automatic
     * variable would dangle as soon as we return.
     */
    static int callback_arg;

    HYD_status status = HYD_SUCCESS;
    cr_client_id_t client_id;
    cr_callback_id_t callback_id;

    HYDU_FUNC_ENTER();

    client_id = (int) cr_init();
    if (client_id < 0)
        goto fn_fail;

    callback_id = cr_register_callback(my_callback, &callback_arg, CR_SIGNAL_CONTEXT);
    if (callback_id < 0)
        goto fn_fail;

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    /*
     * NOTE(review): status is never modified, so a failed cr_init() or
     * cr_register_callback() is still reported to the caller as HYD_SUCCESS.
     * Confirm whether callers rely on this before returning an error here.
     */
    goto fn_exit;
}
示例#5
0
/*
 * Entry point of the BLCR-aware srun wrapper.
 *
 * Initializes logging and the srun argument vector, attaches to BLCR and
 * registers cr_callback, installs signal_child as the handler for all
 * forwardable signals and on_child_exit for SIGCHLD, then forks/execs srun
 * inside a BLCR critical section.  Afterwards it loops forever: it sleeps
 * while a step is marked as launched, waits for srun to connect on
 * listen_fd, and reads the step information from the accepted connection.
 *
 * Setup failures are reported through fatal() (presumably it does not
 * return -- its definition is not visible here).
 */
int
main(int argc, char **argv)
{
	int debug_level, sig, srun_fd;
	struct sigaction sa;
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	struct sockaddr_un ca;
	socklen_t ca_len;

	atexit(remove_listen_socket);

	/* copied from srun */
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(argv[0]), logopt, 0, NULL);

	if (init_srun_argv(argc, argv)) {
		fatal("failed to initialize arguments for running srun");
	}

	if ((cr_id = cr_init()) < 0) {
		fatal("failed to initialize libcr: %s", cr_strerror(errno));
	}
	(void)cr_register_callback(cr_callback, NULL, CR_THREAD_CONTEXT);

	/* forward signals. copied from cr_restart */
	sa.sa_sigaction = signal_child;
	sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	/* signal numbers start at 1; 0 is not a valid signal for sigaction() */
	for (sig = 1; sig < _NSIG; sig++) {
		if (sig == SIGSTOP ||
		    sig == SIGKILL ||
		    sig == SIGCHLD)
			continue;
		/* best effort: signals that cannot be caught are simply skipped */
		sigaction(sig, &sa, NULL);
	}
	sa.sa_sigaction = on_child_exit;
	sa.sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP;
	sigaction(SIGCHLD, &sa, NULL);

	cr_enter_cs(cr_id); /* BEGIN CS: avoid race condition of whether srun is forked */
	if ( fork_exec_srun() ) {
		fatal("failed fork/exec/wait srun");
	}
	cr_leave_cs(cr_id); /* END CS */

	while (1) {
		pthread_mutex_lock(&step_launch_mutex);
		while (step_launched) {
			/* just avoid busy waiting */
			pthread_cond_wait(&step_launch_cond,
					  &step_launch_mutex);
		}
		pthread_mutex_unlock(&step_launch_mutex);

		if (_wait_for_srun_connect() < 0)
			continue;

		cr_enter_cs(cr_id); /* BEGIN CS: checkpoint(callback) will be delayed */

		/*
		 * accept() overwrites ca_len with the size of the peer address,
		 * so it has to be reset to the buffer size before every call.
		 */
		ca_len = sizeof(ca);
		srun_fd = accept(listen_fd, (struct sockaddr*)&ca, &ca_len);
		if (srun_fd < 0) {
			/* restarted before enter CS. socket will not be restored */
			if (errno == EBADF) {
				cr_leave_cs(cr_id);
				continue;
			} else {
				fatal("failed to accept socket: %m");
			}
		}

		_read_info_from_srun(srun_fd);
		close(srun_fd);

		step_launched = 1;
		debug2("step launched");

		cr_leave_cs(cr_id); /* END CS */
	}

	return 0;
}
示例#6
0
文件: pbs_demux.c 项目: gbeane/torque
/*
 * Output demultiplexer entry point.
 *
 * Descriptors 3 and 4 are used as the listening sockets for the "out" and
 * "err" streams (presumably set up by the parent before exec -- confirm).
 * Each connection accepted on one of them is recorded in the routem[] table
 * as old_out / old_err and subsequently serviced by readit().
 *
 * select() is called with a 10 second timeout; on a timeout the parent pid
 * is compared with the one seen at start-up and the loop ends once it has
 * changed.
 *
 * Exit status: 0 when the parent has gone, 5 on a start-up failure,
 * 1 when select() fails, 2 on an unexpected routem[] entry.
 */
int main(

    int   argc,
    char *argv[])

{
    struct timeval timeout;
    int i;
    int maxfd;
    int main_sock_out = 3;
    int main_sock_err = 4;
    int n;
    int newsock;
    pid_t parent;
    fd_set selset;

    struct routem *routem;

#ifdef ENABLE_BLCR
    if (cr_init() < 0)
    {
        perror("Failed to initialize BLCR.");
        exit(5);
    }

    (void)cr_register_callback(demux_callback, NULL, CR_THREAD_CONTEXT);
#endif /* ENABLE_BLCR */

    parent = getppid();

    /* disable cookie search - PW - mpiexec patch */

    /*
    cookie = getenv("PBS_JOBCOOKIE");

    if (cookie == 0)
      {
      fprintf(stderr, "%s: no PBS_JOBCOOKIE found in the env\n",
        argv[0]);

      exit(3);
      }

    #ifdef DEBUG
    printf("Cookie found in environment: %s\n",
      cookie);
    #endif
    */

    if((maxfd = sysconf(_SC_OPEN_MAX)) < 0)
    {
        perror("unexpected return from sysconf.");

        exit(5);
    }

    /* one routing entry per possible descriptor, indexed by descriptor */
    routem = (struct routem *)calloc(maxfd, sizeof(struct routem));

    if (routem == NULL)
    {
        perror("cannot alloc memory");

        exit(5);
    }

    for (i = 0; i < maxfd; ++i)
    {
        routem[i].r_where = invalid;
        routem[i].r_nl    = 1;
    }

    routem[main_sock_out].r_where = new_out;
    routem[main_sock_err].r_where = new_err;

    FD_ZERO(&readset);
    FD_SET(main_sock_out, &readset);
    FD_SET(main_sock_err, &readset);

    if (listen(main_sock_out, TORQUE_LISTENQUEUE) < 0)
    {
        perror("listen on out");

        exit(5);
    }

    if (listen(main_sock_err, TORQUE_LISTENQUEUE) < 0)
    {
        perror("listen on err");

        exit(5);
    }

    while (1)
    {
        /* select() modifies its set, so work on a copy of the master set */
        selset = readset;
        timeout.tv_usec = 0;
        timeout.tv_sec  = 10;

        n = select(FD_SETSIZE, &selset, (fd_set *)0, (fd_set *)0, &timeout);

        if (n == -1)
        {
            if (errno == EINTR)
            {
                /* interrupted: treat as "nothing ready" and retry */
                n = 0;
            }
            else
            {
                fprintf(stderr, "%s: select failed\n",
                        argv[0]);

                exit(1);
            }
        }
        else if (n == 0)
        {
            /* NOTE:  on TRU64, init process does not have pid==1 */

            if (getppid() != parent)
            {
#ifdef DEBUG
                fprintf(stderr, "%s: Parent has gone, and so will I\n",
                        argv[0]);
#endif /* DEBUG */

                break;
            }
        }    /* END else if (n == 0) */

        /* n counts the ready descriptors still to be found */
        for (i = 0; (n != 0) && (i < maxfd); ++i)
        {
            if (FD_ISSET(i, &selset))
            {
                /* this socket has data */
                n--;

                switch ((routem + i)->r_where)
                {

                case new_out:

                case new_err:

                    newsock = accept(i, 0, 0);

                    if (newsock < 0)
                    {
                        /*
                         * Nothing was accepted.  Using -1 as an index into
                         * routem[] or in FD_SET() would write out of bounds.
                         */
                        perror("accept");

                        break;
                    }

                    if ((newsock >= maxfd) || (newsock >= FD_SETSIZE))
                    {
                        /* cannot be tracked in routem[] / readset: drop it */
                        fprintf(stderr, "%s: descriptor %d out of range, connection dropped\n",
                                argv[0],
                                newsock);

                        close(newsock);

                        break;
                    }

                    (routem + newsock)->r_where = (routem + i)->r_where == new_out ?
                                                  old_out :
                                                  old_err;

                    FD_SET(newsock, &readset);

                    break;

                case old_out:

                case old_err:

                    readit(i, routem + i);

                    break;

                default:

                    fprintf(stderr, "%s: internal error\n",
                            argv[0]);

                    exit(2);

                    /*NOTREACHED*/

                    break;
                }
            }
        }
    }    /* END while(1) */

    return(0);
}  /* END main() */
示例#7
0
/*
 * Initialize the BLCR CRS module.
 *
 * Sets the default restart/checkpoint command names, records the current
 * pid and marks the module as running.  When opal_cr_is_tool is false it
 * also constructs the module lock and condition variable, initializes BLCR
 * via cr_init() and registers the thread-context and signal-context
 * callbacks (plus the debugger hooks when OPAL_ENABLE_CRDEBUG == 1).
 *
 * Returns OPAL_SUCCESS, or OPAL_ERROR if cr_init() reports a negative
 * client id.
 */
int opal_crs_blcr_module_init(void)
{
    void *crs_blcr_thread_callback_arg = NULL;
    void *crs_blcr_signal_callback_arg = NULL;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: module_init()");

    /*
     * Default names of the BLCR command-line utilities.  They are set exactly
     * once: assigning them a second time later in this function would leak
     * these copies.
     */
    blcr_restart_cmd    = strdup("cr_restart");
    blcr_checkpoint_cmd = strdup("cr_checkpoint");

    my_pid = getpid();

    if( !opal_cr_is_tool ) {
        /* We need to make the lock and condition variable before
         * starting the thread, since the thread uses these vars.
         */
        OBJ_CONSTRUCT(&blcr_lock, opal_mutex_t);
        OBJ_CONSTRUCT(&blcr_cond, opal_condition_t);

        /*
         * Initialize BLCR
         */
        client_id = cr_init();
        if (0 > client_id) {
            opal_output(mca_crs_blcr_component.super.output_handle,
                        "Error: crs:blcr: module_init: cr_init failed (%d)\n", client_id);
            return OPAL_ERROR;
        }
    }

#if OPAL_ENABLE_CRDEBUG == 1
    blcr_crdebug_refreshed_env = false;
#endif

    if( !opal_cr_is_tool ) {
        /*
         * Register the thread handler
         */
        cr_thread_callback_id = cr_register_callback(opal_crs_blcr_thread_callback,
                                                     crs_blcr_thread_callback_arg,
                                                     CR_THREAD_CONTEXT);
        /*
         * Register the signal handler
         *  - even though we do not use it
         */
        cr_signal_callback_id = cr_register_callback(opal_crs_blcr_signal_callback,
                                                     crs_blcr_signal_callback_arg,
                                                     CR_SIGNAL_CONTEXT);

#if OPAL_ENABLE_CRDEBUG == 1
        /*
         * Checkpoint/restart enabled debugging hooks
         *  "NO_CALLBACKS"   -> non-MPI threads
         *  "SIGNAL_CONTEXT" -> MPI threads
         *  "THREAD_CONTEXT" -> BLCR threads
         */
        cr_register_hook(CR_HOOK_CONT_NO_CALLBACKS,   MPIR_checkpoint_debugger_crs_hook);
        cr_register_hook(CR_HOOK_CONT_SIGNAL_CONTEXT, MPIR_checkpoint_debugger_crs_hook);

        cr_register_hook(CR_HOOK_RSTRT_NO_CALLBACKS,   MPIR_checkpoint_debugger_crs_hook);
        cr_register_hook(CR_HOOK_RSTRT_SIGNAL_CONTEXT, MPIR_checkpoint_debugger_crs_hook);
#endif
    }

    /*
     * Now that we are done with init, set the state to running
     */
    blcr_current_state = OPAL_CRS_RUNNING;

    opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: module_init() --> Finished [%d]",
                        opal_cr_is_tool);

    return OPAL_SUCCESS;
}