/*
 * Child-side half of a local launch: prepare the calling process and
 * replace it with the application via execve().
 *
 * Steps, in order:
 *   - optionally move into a new process group (orte_forward_job_control)
 *   - mark write_fd close-on-exec
 *   - if child is non-NULL, set up IOF and apply the orte_rtc controls;
 *     otherwise, unless the job has ORTE_JOB_FLAG_FORWARD_OUTPUT set,
 *     point stdin/stdout/stderr and opts.p_internal[1] at /dev/null
 *   - apply the system resource limits and remove the controlling
 *     variable from the environment
 *   - close every descriptor except 0-2, opts.p_internal[1] and write_fd
 *   - restore default signal dispositions and unblock signals
 *   - execve(context->app, ...)
 *
 * @param context       app context; context->app is the executable and
 *                      context->argv (may be NULL) its argument vector
 * @param child         process object being launched, or NULL
 * @param environ_copy  environment handed to execve(); may be modified here
 * @param jobdat        job the process belongs to
 * @param write_fd      pipe up to the parent, handed to
 *                      send_error_show_help() when a step fails
 * @param opts          IOF configuration; opts.p_internal[1] is kept open
 *
 * Not expected to return: every path ends in a successful execve() or in
 * send_error_show_help(), which is treated as non-returning here.
 */
static int do_child(orte_app_context_t* context,
                    orte_proc_t *child,
                    char **environ_copy,
                    orte_job_t *jobdat, int write_fd,
                    orte_iof_base_io_conf_t opts)
{
    int i, rc;
    sigset_t sigs;
    long fd, fdmax = sysconf(_SC_OPEN_MAX);
    char *param, *msg;
    char *default_argv[2];
    char **argv;

    if (orte_forward_job_control) {
        /* Set a new process group for this child, so that a
           SIGSTOP can be sent to it without being sent to the
           orted. */
        setpgid(0, 0);
    }

    /* Setup the pipe to be close-on-exec */
    opal_fd_set_cloexec(write_fd);

    if (NULL != child) {
        /* setup stdout/stderr so that any error messages that we
           may print out will get displayed back at orterun.

           NOTE: Definitely do this AFTER we check contexts so
           that any error message from those two functions doesn't
           come out to the user. IF we didn't do it in this order,
           THEN a user who gives us a bad executable name or
           working directory would get N error messages, where
           N=num_procs. This would be very annoying for large
           jobs, so instead we set things up so that orterun
           always outputs a nice, single message indicating what
           happened
        */
        if (ORTE_SUCCESS != (rc = orte_iof_base_setup_child(&opts,
                                                            &environ_copy))) {
            ORTE_ERROR_LOG(rc);
            send_error_show_help(write_fd, 1,
                                 "help-orte-odls-default.txt",
                                 "iof setup failed",
                                 orte_process_info.nodename, context->app);
            /* Does not return */
        }

        /* now set any child-level controls such as binding */
        orte_rtc.set(jobdat, child, &environ_copy, write_fd);

    } else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* tie stdin/out/err/internal to /dev/null.  Open it read-write:
           stdout, stderr and the internal pipe are written to, and a
           write() on a read-only descriptor fails with EBADF instead of
           discarding the data. */
        int fdnull;
        for (i = 0; i < 3; i++) {
            fdnull = open("/dev/null", O_RDWR);
            if (fdnull < 0 || fdnull == i) {
                /* Either the open failed (nothing to duplicate), or fd i
                   was closed and open() reused its number -- it is then
                   already tied to /dev/null and closing it would undo
                   that. */
                continue;
            }
            if (fdnull > i && i != write_fd) {
                dup2(fdnull, i);
            }
            close(fdnull);
        }
        fdnull = open("/dev/null", O_RDWR);
        if (0 <= fdnull && fdnull != opts.p_internal[1]) {
            if (fdnull > opts.p_internal[1]) {
                dup2(fdnull, opts.p_internal[1]);
            }
            close(fdnull);
        }
    }

    /* if the user requested it, set the system resource limits */
    if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
        send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                             "set limit",
                             orte_process_info.nodename, context->app,
                             __FILE__, __LINE__, msg);
    }
    /* ensure we only do this once */
    (void) mca_base_var_env_name("opal_set_max_sys_limits", &param);
    opal_unsetenv(param, &environ_copy);
    free(param);

    /* close all open file descriptors w/ exception of stdin/stdout/stderr,
       the pipe used for the IOF INTERNAL messages, and the pipe up to
       the parent. */
    if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
        /* close *all* file descriptors -- slow */
        for (fd = 3; fd < fdmax; fd++) {
            if (fd != opts.p_internal[1] && fd != write_fd) {
                close(fd);
            }
        }
    }

    /* If no argument vector was supplied, run the app with just its own
       name as argv[0].  The vector lives on this stack frame rather than
       the heap: this function ends in execve() or in a non-returning
       error report, so nothing outlives the frame, and there is no
       allocation that could fail right before the exec. */
    argv = context->argv;
    if (NULL == argv) {
        default_argv[0] = context->app;
        default_argv[1] = NULL;
        argv = default_argv;
    }

    /* Set signal handlers back to the default.  Do this close to
       the execve() because the event library may (and likely will)
       reset them.  If we don't do this, the event library may
       have left some set that, at least on some OS's, don't get
       reset via fork() or exec().  Hence, the launched process
       could be unkillable (for example). */

    set_handler_default(SIGTERM);
    set_handler_default(SIGINT);
    set_handler_default(SIGHUP);
    set_handler_default(SIGPIPE);
    set_handler_default(SIGCHLD);

    /* Unblock all signals, for many of the same reasons that we
       set the default handlers, above.  This is noticeable on
       Linux where the event library blocks SIGTERM, but we don't
       want that blocked by the launched process.  The first call
       only fetches the current mask ("how" is ignored when the new
       set is NULL); the second unblocks everything in it. */
    sigprocmask(SIG_BLOCK, NULL, &sigs);
    sigprocmask(SIG_UNBLOCK, &sigs, NULL);

    /* Exec the new executable */

    if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
        int jout;
        opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app);
        for (jout=0; NULL != argv[jout]; jout++) {
            opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
        }
        for (jout=0; NULL != environ_copy[jout]; jout++) {
            opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
        }
    }

    execve(context->app, argv, environ_copy);
    send_error_show_help(write_fd, 1,
                         "help-orte-odls-default.txt", "execve error",
                         orte_process_info.nodename, context->app, strerror(errno));
    /* Does not return */
}
Exemple #2
0
/*
 * Child-side half of a local launch: prepare the calling process and
 * replace it with the application via execve().
 *
 * Steps, in order:
 *   - mark write_fd close-on-exec
 *   - if cd->child is non-NULL, set up IOF and apply the orte_rtc
 *     controls; otherwise, unless the job has
 *     ORTE_JOB_FLAG_FORWARD_OUTPUT set, point stdin/stdout/stderr and
 *     cd->opts.p_internal[1] at /dev/null
 *   - close the remaining open descriptors via
 *     close_open_file_descriptors(), reporting "close fds" on failure
 *   - reinstall signal handlers via set_handler_alps() and unblock signals
 *   - execve(cd->app->app, ...)
 *
 * @param cd        spawn caddy; this function reads cd->child, cd->jdata,
 *                  cd->opts, cd->env, cd->app->app and cd->argv
 *                  (cd->argv may be NULL)
 * @param write_fd  descriptor handed to send_error_show_help() when a
 *                  step fails
 *
 * Not expected to return: every path ends in a successful execve() or in
 * send_error_show_help(), which is treated as non-returning here.
 */
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
{
    int i, rc;
    sigset_t sigs;
    char *default_argv[2];
    char **argv;

    /* Setup the pipe to be close-on-exec */
    opal_fd_set_cloexec(write_fd);

    if (NULL != cd->child) {
        /* setup stdout/stderr so that any error messages that we
           may print out will get displayed back at orterun.

           NOTE: Definitely do this AFTER we check contexts so
           that any error message from those two functions doesn't
           come out to the user. IF we didn't do it in this order,
           THEN a user who gives us a bad executable name or
           working directory would get N error messages, where
           N=num_procs. This would be very annoying for large
           jobs, so instead we set things up so that orterun
           always outputs a nice, single message indicating what
           happened
        */
        if (ORTE_SUCCESS != (rc = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
            ORTE_ERROR_LOG(rc);
            send_error_show_help(write_fd, 1,
                                 "help-orte-odls-alps.txt",
                                 "iof setup failed",
                                 orte_process_info.nodename, cd->app->app);
            /* Does not return */
        }

        /* now set any child-level controls such as binding */
        orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);

    } else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* tie stdin/out/err/internal to /dev/null.  Open it read-write:
           stdout, stderr and the internal pipe are written to, and a
           write() on a read-only descriptor fails with EBADF instead of
           discarding the data. */
        int fdnull;
        for (i = 0; i < 3; i++) {
            fdnull = open("/dev/null", O_RDWR);
            if (fdnull < 0 || fdnull == i) {
                /* Either the open failed (nothing to duplicate), or fd i
                   was closed and open() reused its number -- it is then
                   already tied to /dev/null and closing it would undo
                   that. */
                continue;
            }
            if (fdnull > i && i != write_fd) {
                dup2(fdnull, i);
            }
            close(fdnull);
        }
        fdnull = open("/dev/null", O_RDWR);
        if (0 <= fdnull && fdnull != cd->opts.p_internal[1]) {
            if (fdnull > cd->opts.p_internal[1]) {
                dup2(fdnull, cd->opts.p_internal[1]);
            }
            close(fdnull);
        }
    }

    if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
        send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
                             "close fds",
                             orte_process_info.nodename, cd->app->app,
                             __FILE__, __LINE__);
    }

    /* If no argument vector was supplied, run the app with just its own
       name as argv[0].  The vector lives on this stack frame rather than
       the heap: this function ends in execve() or in a non-returning
       error report, so nothing outlives the frame, and there is no
       allocation that could fail right before the exec. */
    argv = cd->argv;
    if (NULL == argv) {
        default_argv[0] = cd->app->app;
        default_argv[1] = NULL;
        argv = default_argv;
    }

    /* Reinstall the signal handlers.  Do this close to the execve()
       because the event library may (and likely will) reset them.
       If we don't do this, the event library may have left some set
       that, at least on some OS's, don't get reset via fork() or
       exec().  Hence, the launched process could be unkillable (for
       example). */

    set_handler_alps(SIGTERM);
    set_handler_alps(SIGINT);
    set_handler_alps(SIGHUP);
    set_handler_alps(SIGPIPE);
    set_handler_alps(SIGCHLD);

    /* Unblock all signals, for many of the same reasons that we
       reinstall the handlers, above.  This is noticeable on
       Linux where the event library blocks SIGTERM, but we don't
       want that blocked by the launched process.  The first call
       only fetches the current mask ("how" is ignored when the new
       set is NULL); the second unblocks everything in it. */
    sigprocmask(SIG_BLOCK, NULL, &sigs);
    sigprocmask(SIG_UNBLOCK, &sigs, NULL);

    /* Exec the new executable */

    if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
        int jout;
        opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
        for (jout=0; NULL != argv[jout]; jout++) {
            opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
        }
        for (jout=0; NULL != cd->env[jout]; jout++) {
            opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
        }
    }

    execve(cd->app->app, argv, cd->env);
    send_error_show_help(write_fd, 1,
                         "help-orte-odls-alps.txt", "execve error",
                         orte_process_info.nodename, cd->app->app, strerror(errno));
    /* Does not return */
}