/*
 * Finish configuring the launched process and exec the application.
 *
 * The body sets up the process group, stdio, resource limits, file
 * descriptors and signal state of the *current* process and then calls
 * execve(), so it is meant to run in the child side of the launch.
 * Failures are reported via send_error_show_help() on write_fd; as noted
 * at the call sites below that helper does not return, so no visible path
 * through this function hands control back to the caller.
 *
 * context       application being launched (executable path and argv)
 * child         process object; when NULL no IOF/binding setup is done
 * environ_copy  environment passed to execve(); edited in place here
 * jobdat        job the process belongs to
 * write_fd      pipe up to the parent; marked close-on-exec
 * opts          IOF pipe configuration for this process
 */
static int do_child(orte_app_context_t* context,
                    orte_proc_t *child,
                    char **environ_copy,
                    orte_job_t *jobdat, int write_fd,
                    orte_iof_base_io_conf_t opts)
{
    int i;
    sigset_t sigs;
    long fd, fdmax = sysconf(_SC_OPEN_MAX);
    /* start from NULL so a callee that fails before storing a result
       cannot leave us freeing or printing an indeterminate pointer */
    char *param = NULL, *msg = NULL;
    char **argv;
    char *default_argv[2];

    if (orte_forward_job_control) {
        /* Set a new process group for this child, so that a
           SIGSTOP can be sent to it without being sent to the
           orted. */
        setpgid(0, 0);
    }

    /* Setup the pipe to be close-on-exec */
    opal_fd_set_cloexec(write_fd);

    if (NULL != child) {
        /* setup stdout/stderr so that any error messages that we
           may print out will get displayed back at orterun.

           NOTE: Definitely do this AFTER we check contexts so
           that any error message from those two functions doesn't
           come out to the user. IF we didn't do it in this order,
           THEN a user who gives us a bad executable name or
           working directory would get N error messages, where
           N=num_procs. This would be very annoying for large
           jobs, so instead we set things up so that orterun
           always outputs a nice, single message indicating what
           happened
        */
        if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts, &environ_copy))) {
            ORTE_ERROR_LOG(i);
            send_error_show_help(write_fd, 1,
                                 "help-orte-odls-default.txt",
                                 "iof setup failed",
                                 orte_process_info.nodename, context->app);
            /* Does not return */
        }

        /* now set any child-level controls such as binding */
        orte_rtc.set(jobdat, child, &environ_copy, write_fd);

    } else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* tie stdin/out/err/internal to /dev/null */
        /* NOTE(review): /dev/null is opened read-only, so writes to the
           redirected stdout/stderr will fail rather than be discarded --
           confirm this is intended. */
        int fdnull;
        for (i = 0; i < 3; i++) {
            fdnull = open("/dev/null", O_RDONLY, 0);
            if (fdnull < 0) {
                /* nothing was opened: nothing to dup and nothing to close */
                continue;
            }
            if (fdnull > i && i != write_fd) {
                dup2(fdnull, i);
            }
            /* If slot i was closed, open() landed /dev/null directly in it;
               closing it again would leave the standard descriptor closed. */
            if (fdnull != i) {
                close(fdnull);
            }
        }
        fdnull = open("/dev/null", O_RDONLY, 0);
        if (fdnull >= 0) {
            if (fdnull > opts.p_internal[1]) {
                dup2(fdnull, opts.p_internal[1]);
            }
            close(fdnull);
        }
    }

    /* if the user requested it, set the system resource limits */
    if (OPAL_SUCCESS != opal_util_init_sys_limits(&msg)) {
        send_error_show_help(write_fd, 1, "help-orte-odls-default.txt",
                             "set limit",
                             orte_process_info.nodename, context->app,
                             __FILE__, __LINE__, msg);
    }
    /* ensure we only do this once */
    (void) mca_base_var_env_name("opal_set_max_sys_limits", &param);
    if (NULL != param) {
        opal_unsetenv(param, &environ_copy);
        free(param);
    }

    /* close all open file descriptors w/ exception of stdin/stdout/stderr,
       the pipe used for the IOF INTERNAL messages, and the pipe up to
       the parent. */
    if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
        // close *all* file descriptors -- slow
        for (fd = 3; fd < fdmax; fd++) {
            if (fd != opts.p_internal[1] && fd != write_fd) {
                close(fd);
            }
        }
    }

    /* If no argv was supplied, launch with argv = { app, NULL }.  The
       vector lives on our stack: every path from here ends in execve()
       or in send_error_show_help(), so it never outlives this frame,
       and we avoid heap allocation (and its unchecked failure) right
       before the exec. */
    argv = context->argv;
    if (NULL == argv) {
        default_argv[0] = context->app;
        default_argv[1] = NULL;
        argv = default_argv;
    }

    /* Set signal handlers back to the default.  Do this close to
       the execve() because the event library may (and likely will)
       reset them.  If we don't do this, the event library may
       have left some set that, at least on some OS's, don't get
       reset via fork() or exec().  Hence, the launched process
       could be unkillable (for example). */

    set_handler_default(SIGTERM);
    set_handler_default(SIGINT);
    set_handler_default(SIGHUP);
    set_handler_default(SIGPIPE);
    set_handler_default(SIGCHLD);

    /* Unblock all signals, for many of the same reasons that we
       set the default handlers, above.  This is noticeable on
       Linux where the event library blocks SIGTERM, but we don't
       want that blocked by the launched process. */
    sigprocmask(0, 0, &sigs);            /* fetch the currently blocked set */
    sigprocmask(SIG_UNBLOCK, &sigs, 0);  /* ...and unblock exactly that set */

    /* Exec the new executable */
    if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
        int jout;
        opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app);
        for (jout = 0; NULL != argv[jout]; jout++) {
            opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
        }
        for (jout = 0; NULL != environ_copy[jout]; jout++) {
            opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
        }
    }

    execve(context->app, argv, environ_copy);
    /* only reached if execve() failed */
    send_error_show_help(write_fd, 1,
                         "help-orte-odls-default.txt", "execve error",
                         orte_process_info.nodename, context->app, strerror(errno));
    /* Does not return */
}
/*
 * Finish configuring the launched process and exec the application.
 *
 * The body sets up stdio, file descriptors and signal state of the
 * *current* process and then calls execve(), so it is meant to run in the
 * child side of the launch.  Failures are reported via
 * send_error_show_help() on write_fd; as noted at the call sites below
 * that helper does not return, so no visible path through this function
 * hands control back to the caller.
 *
 * cd        spawn description; this function reads cd->child, cd->jdata,
 *           cd->opts, cd->app->app, cd->argv and cd->env (cd->env may be
 *           modified by the IOF/RTC setup calls it is passed to)
 * write_fd  pipe up to the parent; marked close-on-exec
 */
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
{
    int i;
    sigset_t sigs;
    char **argv;
    char *default_argv[2];

    /* Setup the pipe to be close-on-exec */
    opal_fd_set_cloexec(write_fd);

    if (NULL != cd->child) {
        /* setup stdout/stderr so that any error messages that we
           may print out will get displayed back at orterun.

           NOTE: Definitely do this AFTER we check contexts so
           that any error message from those two functions doesn't
           come out to the user. IF we didn't do it in this order,
           THEN a user who gives us a bad executable name or
           working directory would get N error messages, where
           N=num_procs. This would be very annoying for large
           jobs, so instead we set things up so that orterun
           always outputs a nice, single message indicating what
           happened
        */
        if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
            ORTE_ERROR_LOG(i);
            send_error_show_help(write_fd, 1,
                                 "help-orte-odls-alps.txt",
                                 "iof setup failed",
                                 orte_process_info.nodename, cd->app->app);
            /* Does not return */
        }

        /* now set any child-level controls such as binding */
        orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);

    } else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* tie stdin/out/err/internal to /dev/null */
        /* NOTE(review): /dev/null is opened read-only, so writes to the
           redirected stdout/stderr will fail rather than be discarded --
           confirm this is intended. */
        int fdnull;
        for (i = 0; i < 3; i++) {
            fdnull = open("/dev/null", O_RDONLY, 0);
            if (fdnull < 0) {
                /* nothing was opened: nothing to dup and nothing to close */
                continue;
            }
            if (fdnull > i && i != write_fd) {
                dup2(fdnull, i);
            }
            /* If slot i was closed, open() landed /dev/null directly in it;
               closing it again would leave the standard descriptor closed. */
            if (fdnull != i) {
                close(fdnull);
            }
        }
        fdnull = open("/dev/null", O_RDONLY, 0);
        if (fdnull >= 0) {
            if (fdnull > cd->opts.p_internal[1]) {
                dup2(fdnull, cd->opts.p_internal[1]);
            }
            close(fdnull);
        }
    }

    /* close the descriptors the launched process should not inherit;
       unlike a best-effort sweep, a failure here is reported as fatal */
    if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
        send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
                             "close fds",
                             orte_process_info.nodename, cd->app->app,
                             __FILE__, __LINE__);
    }

    /* If no argv was supplied, launch with argv = { app, NULL }.  The
       vector lives on our stack: every path from here ends in execve()
       or in send_error_show_help(), so it never outlives this frame,
       and we avoid heap allocation (and its unchecked failure) right
       before the exec. */
    argv = cd->argv;
    if (NULL == argv) {
        default_argv[0] = cd->app->app;
        default_argv[1] = NULL;
        argv = default_argv;
    }

    /* Set signal handlers back to the default.  Do this close to
       the execve() because the event library may (and likely will)
       reset them.  If we don't do this, the event library may
       have left some set that, at least on some OS's, don't get
       reset via fork() or exec().  Hence, the launched process
       could be unkillable (for example). */

    set_handler_alps(SIGTERM);
    set_handler_alps(SIGINT);
    set_handler_alps(SIGHUP);
    set_handler_alps(SIGPIPE);
    set_handler_alps(SIGCHLD);

    /* Unblock all signals, for many of the same reasons that we
       set the default handlers, above.  This is noticeable on
       Linux where the event library blocks SIGTERM, but we don't
       want that blocked by the launched process. */
    sigprocmask(0, 0, &sigs);            /* fetch the currently blocked set */
    sigprocmask(SIG_UNBLOCK, &sigs, 0);  /* ...and unblock exactly that set */

    /* Exec the new executable */
    if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
        int jout;
        opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
        for (jout = 0; NULL != argv[jout]; jout++) {
            opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
        }
        for (jout = 0; NULL != cd->env[jout]; jout++) {
            opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
        }
    }

    execve(cd->app->app, argv, cd->env);
    /* only reached if execve() failed */
    send_error_show_help(write_fd, 1,
                         "help-orte-odls-alps.txt", "execve error",
                         orte_process_info.nodename, cd->app->app, strerror(errno));
    /* Does not return */
}