Ejemplo n.º 1
0
static int do_child(orte_app_context_t* context,
                    orte_proc_t *child,
                    char **environ_copy,
                    orte_job_t *jobdat, int write_fd,
                    orte_iof_base_io_conf_t opts)
{
    int i, rc;
    sigset_t sigs;
    char *param, *msg;

    /* Setup the pipe to be close-on-exec */
    opal_fd_set_cloexec(write_fd);

    if (NULL != child) {
        /* setup stdout/stderr so that any error messages that we
           may print out will get displayed back at orterun.

           NOTE: Definitely do this AFTER we check contexts so
           that any error message from those two functions doesn't
           come out to the user. IF we didn't do it in this order,
           THEN a user who gives us a bad executable name or
           working directory would get N error messages, where
           N=num_procs. This would be very annoying for large
           jobs, so instead we set things up so that orterun
           always outputs a nice, single message indicating what
           happened
        */
        if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
                                                           &environ_copy))) {
            ORTE_ERROR_LOG(i);
            send_error_show_help(write_fd, 1,
                                 "help-orte-odls-alps.txt",
                                 "iof setup failed",
                                 orte_process_info.nodename, context->app);
            /* Does not return */
        }

        /* now set any child-level controls such as binding */
        orte_rtc.set(jobdat, child, &environ_copy, write_fd);

    } else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* tie stdin/out/err/internal to /dev/null */
        int fdnull;
        for (i=0; i < 3; i++) {
            fdnull = open("/dev/null", O_RDONLY, 0);
            if (fdnull > i && i != write_fd) {
                dup2(fdnull, i);
            }
            close(fdnull);
        }
        fdnull = open("/dev/null", O_RDONLY, 0);
        if (fdnull > opts.p_internal[1]) {
            dup2(fdnull, opts.p_internal[1]);
        }
        close(fdnull);
    }

    /* if the user requested it, set the system resource limits */
    if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
        send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
                             "set limit",
                             orte_process_info.nodename, context->app,
                             __FILE__, __LINE__, msg);
    }
    /* ensure we only do this once */
    (void) mca_base_var_env_name("opal_set_max_sys_limits", &param);
    opal_unsetenv(param, &environ_copy);
    free(param);

    if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
        send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
                             "close fds",
                             orte_process_info.nodename, context->app,
                             __FILE__, __LINE__);
    }


    if (context->argv == NULL) {
        context->argv = malloc(sizeof(char*)*2);
        context->argv[0] = strdup(context->app);
        context->argv[1] = NULL;
    }

    /* Set signal handlers back to the default.  Do this close to
       the exev() because the event library may (and likely will)
       reset them.  If we don't do this, the event library may
       have left some set that, at least on some OS's, don't get
       reset via fork() or exec().  Hence, the launched process
       could be unkillable (for example). */

    set_handler_alps(SIGTERM);
    set_handler_alps(SIGINT);
    set_handler_alps(SIGHUP);
    set_handler_alps(SIGPIPE);
    set_handler_alps(SIGCHLD);

    /* Unblock all signals, for many of the same reasons that we
       set the default handlers, above.  This is noticable on
       Linux where the event library blocks SIGTERM, but we don't
       want that blocked by the launched process. */
    sigprocmask(0, 0, &sigs);
    sigprocmask(SIG_UNBLOCK, &sigs, 0);

    /* Exec the new executable */

    if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
        int jout;
        opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app);
        for (jout=0; NULL != context->argv[jout]; jout++) {
            opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, context->argv[jout]);
        }
        for (jout=0; NULL != environ_copy[jout]; jout++) {
            opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
        }
    }

    execve(context->app, context->argv, environ_copy);
    send_error_show_help(write_fd, 1,
                         "help-orte-odls-alps.txt", "execve error",
                         orte_process_info.nodename, context->app, strerror(errno));
    /* Does not return */
}
Ejemplo n.º 2
0
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
{
    int i;
    sigset_t sigs;

    /* Setup the pipe to be close-on-exec */
    opal_fd_set_cloexec(write_fd);

    if (NULL != cd->child) {
        /* setup stdout/stderr so that any error messages that we
           may print out will get displayed back at orterun.

           NOTE: Definitely do this AFTER we check contexts so
           that any error message from those two functions doesn't
           come out to the user. IF we didn't do it in this order,
           THEN a user who gives us a bad executable name or
           working directory would get N error messages, where
           N=num_procs. This would be very annoying for large
           jobs, so instead we set things up so that orterun
           always outputs a nice, single message indicating what
           happened
        */

	if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
	    if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
		ORTE_ERROR_LOG(i);
		send_error_show_help(write_fd, 1,
				     "help-orte-odls-alps.txt",
				     "iof setup failed",
				     orte_process_info.nodename, cd->app->app);
		/* Does not return */
	    }
	}


        /* now set any child-level controls such as binding */
        orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);

    } else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* tie stdin/out/err/internal to /dev/null */
        int fdnull;
        for (i=0; i < 3; i++) {
            fdnull = open("/dev/null", O_RDONLY, 0);
            if (fdnull > i && i != write_fd) {
                dup2(fdnull, i);
            }
            close(fdnull);
        }
#if OPAL_PMIX_V1
        fdnull = open("/dev/null", O_RDONLY, 0);
        if (fdnull > cd->opts.p_internal[1]) {
            dup2(fdnull, cd->opts.p_internal[1]);
        }
        close(fdnull);
#endif
    }

    if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
        send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
                             "close fds",
                             orte_process_info.nodename, cd->app->app,
                             __FILE__, __LINE__);
    }


    if (cd->argv == NULL) {
        cd->argv = malloc(sizeof(char*)*2);
        cd->argv[0] = strdup(cd->app->app);
        cd->argv[1] = NULL;
    }

    /* Set signal handlers back to the default.  Do this close to
       the exev() because the event library may (and likely will)
       reset them.  If we don't do this, the event library may
       have left some set that, at least on some OS's, don't get
       reset via fork() or exec().  Hence, the launched process
       could be unkillable (for example). */

    set_handler_alps(SIGTERM);
    set_handler_alps(SIGINT);
    set_handler_alps(SIGHUP);
    set_handler_alps(SIGPIPE);
    set_handler_alps(SIGCHLD);

    /* Unblock all signals, for many of the same reasons that we
       set the default handlers, above.  This is noticable on
       Linux where the event library blocks SIGTERM, but we don't
       want that blocked by the launched process. */
    sigprocmask(0, 0, &sigs);
    sigprocmask(SIG_UNBLOCK, &sigs, 0);

    /* take us to the correct wdir */
    if (NULL != cd->wdir) {
        if (0 != chdir(cd->wdir)) {
            send_error_show_help(write_fd, 1,
                                 "help-orterun.txt",
                                 "orterun:wdir-not-found",
                                 "orted",
                                 cd->wdir,
                                 orte_process_info.nodename,
                                 (NULL == cd->child) ? 0 : cd->child->app_rank);
            /* Does not return */
        }
    }

    /* Exec the new executable */

    if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
        int jout;
        opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
        for (jout=0; NULL != cd->argv[jout]; jout++) {
            opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->argv[jout]);
        }
        for (jout=0; NULL != cd->env[jout]; jout++) {
            opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
        }
    }

    execve(cd->cmd, cd->argv, cd->env);
    send_error_show_help(write_fd, 1,
                         "help-orte-odls-alps.txt", "execve error",
                         orte_process_info.nodename, cd->app->app, strerror(errno));
    /* Does not return */
}