/**
 * cockpit_unix_fd_close_all:
 * @from: minimum FD to close, or -1
 * @except: an FD to leave open, or -1
 *
 * Close all open file descriptors starting from @from
 * and skipping @except. The work is done by walking the
 * descriptors with fdwalk() and handing each one to closefd().
 *
 * Will set errno if a failure happens.
 *
 * Returns: zero if successful, -1 if not
 */
int
cockpit_unix_fd_close_all (int from,
                           int except)
{
  /* Members after the two given here are zero-initialised. */
  CloseAll walk_args = { from, except };
  int result;

  result = fdwalk (closefd, &walk_args);
  return result;
}
/**
 * cockpit_unix_fd_close_until:
 * @from: minimum FD to close, or -1
 * @except: an FD to leave open, or -1
 * @until: stop closing fds when this number is hit.
 *
 * Close all open file descriptors starting from @from
 * and skipping @except up to but not including @until.
 * The work is done by walking the descriptors with fdwalk()
 * and handing each one to closefd().
 *
 * Will set errno if a failure happens.
 *
 * Returns: zero if successful, -1 if not
 */
int
cockpit_unix_fd_close_until (int from,
                             int except,
                             int until)
{
  CloseAll walk_args = { from, except, until };
  int result;

  result = fdwalk (closefd, &walk_args);
  return result;
}
/*
 * call-seq:
 *   IO.fdwalk(lowfd){ |fh| ... }
 *
 * Iterates over each open file descriptor and yields back a File object.
 * The per-descriptor work is delegated to close_func, which receives a
 * pointer to the converted lowfd as its callback data.
 *
 * Not supported on all platforms.
 *
 * Returns the receiver (klass).
 */
static VALUE io_fdwalk(int argc, VALUE* argv, VALUE klass){
  VALUE low_fd_arg;
  VALUE block_arg;
  int first_fd;

  /* Exactly one mandatory argument, plus the block. */
  rb_scan_args(argc, argv, "1&", &low_fd_arg, &block_arg);

  first_fd = NUM2INT(low_fd_arg);
  fdwalk(close_func, &first_fd);

  return klass;
}
/*
 * fork_session:
 * @pw: account whose gid/uid the child switches to
 * @func: callback run in the child once privileges are dropped
 *
 * Forks (the pid is stored in the file-level `child` variable). The child
 * switches to the gid and uid from @pw, verifies that real and effective
 * IDs agree, closes every descriptor handed to closefd() by fdwalk()
 * (called with a starting value of 3) and then exits with the value
 * returned by @func. Any setup failure in the child exits with code 42.
 *
 * The parent closes its own descriptors 0 and 1 and waits for the child.
 *
 * Returns: the wait status filled in by waitpid(), or (1 << 8) when
 * either fork() or waitpid() fails.
 */
static int
fork_session (struct passwd *pw,
              int (*func) (void))
{
  int status;
  int from;

  fflush (stderr);

  child = fork ();
  if (child < 0)
    {
      warn ("can't fork");
      return 1 << 8;
    }

  if (child == 0)
    {
      /* Group first: once the uid is dropped we may no longer be allowed to. */
      if (setgid (pw->pw_gid) < 0)
        {
          warn ("setgid() failed");
          _exit (42);
        }

      if (setuid (pw->pw_uid) < 0)
        {
          warn ("setuid() failed");
          _exit (42);
        }

      /*
       * A mismatch in EITHER the user or the group ID means some
       * privilege is still held, so both must be checked independently.
       */
      if (getuid () != geteuid () ||
          getgid () != getegid ())
        {
          warnx ("couldn't drop privileges");
          _exit (42);
        }

      debug ("dropped privileges");

      from = 3;
      if (fdwalk (closefd, &from) < 0)
        {
          warnx ("couldn't close all file descriptors");
          _exit (42);
        }

      _exit (func ());
    }

  close (0);
  close (1);

  /* Never hand back an unset status when the wait itself fails. */
  if (waitpid (child, &status, 0) < 0)
    {
      warn ("couldn't wait for child");
      return 1 << 8;
    }

  return status;
}
/*
 * Close all open file descriptors greater than or equal to lowfd.
 * A negative lowfd is treated as 0.
 */
void
closefrom(int lowfd)
{
	int first;
	int next;

	if (lowfd < 0)
		first = 0;
	else
		first = lowfd;

	/*
	 * Close the first descriptor right away as a hedge against
	 * failing to open the /proc file descriptor directory due to
	 * all file descriptors being currently used up.
	 */
	(void) close(first);

	/* The walk then only has to deal with what lies above it. */
	next = first + 1;
	(void) fdwalk(void_close, &next);
}
static void close_all_fds() { #if HAVE_FDWALK fdwalk(fdwalker_close, NULL); #elif defined(linux) || defined(__linux) || defined(__linux__) char path[PATH_MAX]; DIR *root; struct dirent *de, *entry; int size = 0; snprintf(path, sizeof(path), "/proc/%d/fd", getpid()); #ifdef _PC_NAME_MAX size = pathconf(path, _PC_NAME_MAX); #endif size = MAX(size, PATH_MAX + 128); de = alloca(size); close(3); /* hoping opendir uses 3 */ root = opendir(path); if(!root) return; while(portable_readdir_r(root, de, &entry) == 0 && entry != NULL) { if(entry->d_name[0] >= '1' && entry->d_name[0] <= '9') { int tgt; tgt = atoi(entry->d_name); if(tgt != 3) close(tgt); } } close(3); #elif defined(__MACH__) && defined(__APPLE__) struct proc_fdinfo files[1024*16]; int rv, i = 0; rv = proc_pidinfo(getpid(), PROC_PIDLISTFDS, 0, files, sizeof(files)); if(rv > 0 && (rv % sizeof(files[0])) == 0) { rv /= sizeof(files[0]); for(i=0;i<rv;i++) { (void) close(files[i].proc_fd); } } #else struct rlimit rl; int i, reasonable_max; getrlimit(RLIMIT_NOFILE, &rl); reasonable_max = MIN(1<<14, rl.rlim_max); for (i = 0; i < reasonable_max; i++) (void) close(i); #endif }
/* * We cannot use libc's daemon() because the door we create is associated with * the process ID. If we create the door before the call to daemon(), it will * be associated with the parent and it's incorrect. On the other hand if we * create the door later, after the call to daemon(), parent process exits * early and gives a false notion to SMF that 'ipmgmtd' is up and running, * which is incorrect. So, we have our own daemon() equivalent. */ static boolean_t ipmgmt_daemonize(void) { pid_t pid; int rv; if (pipe(pfds) < 0) { (void) fprintf(stderr, "%s: pipe() failed: %s\n", progname, strerror(errno)); exit(EXIT_FAILURE); } if ((pid = fork()) == -1) { (void) fprintf(stderr, "%s: fork() failed: %s\n", progname, strerror(errno)); exit(EXIT_FAILURE); } else if (pid > 0) { /* Parent */ (void) close(pfds[1]); /* * Parent should not exit early, it should wait for the child * to return Success/Failure. If the parent exits early, then * SMF will think 'ipmgmtd' is up and would start all the * depended services. * * If the child process exits unexpectedly, read() returns -1. */ if (read(pfds[0], &rv, sizeof (int)) != sizeof (int)) { (void) kill(pid, SIGKILL); rv = EXIT_FAILURE; } (void) close(pfds[0]); exit(rv); } /* Child */ (void) close(pfds[0]); (void) setsid(); /* close all files except pfds[1] */ (void) fdwalk(closefunc, NULL); (void) chdir("/"); openlog(progname, LOG_PID, LOG_DAEMON); return (B_TRUE); }
static boolean_t dlmgmt_daemonize(void) { pid_t pid; int rv; if (pipe(pfds) < 0) { (void) fprintf(stderr, "%s: pipe() failed: %s\n", progname, strerror(errno)); exit(EXIT_FAILURE); } if ((pid = fork()) == -1) { (void) fprintf(stderr, "%s: fork() failed: %s\n", progname, strerror(errno)); exit(EXIT_FAILURE); } else if (pid > 0) { /* Parent */ (void) close(pfds[1]); /* * Read the child process's return value from the pfds. * If the child process exits unexpected, read() returns -1. */ if (read(pfds[0], &rv, sizeof (int)) != sizeof (int)) { (void) kill(pid, SIGKILL); rv = EXIT_FAILURE; } (void) close(pfds[0]); exit(rv); } /* Child */ (void) close(pfds[0]); (void) setsid(); /* * Close all files except pfds[1]. */ (void) fdwalk(closefunc, NULL); (void) chdir("/"); openlog(progname, LOG_PID, LOG_DAEMON); return (B_TRUE); }
/*
 * Become a daemon.
 *
 * Uses the classic double fork: the first fork followed by setsid()
 * disconnects us from the current controlling terminal and makes us a
 * session group leader; the second fork (without setsid) guarantees that
 * the surviving process is not the session group leader and so can never
 * reacquire a controlling terminal.
 *
 * The intermediate process prints the pid of the final daemon on stdout
 * before exiting.
 */
static void
daemonize(void)
{
	pid_t cpid;

	cpid = fork();
	if (cpid == (pid_t)-1) {
		early_error("fork 1");
	}
	if (cpid != 0) {
		/* Original process: reap the intermediate child, then leave. */
		(void) wait(NULL);
		_exit(0);
	}

	if (setsid() == (pid_t)-1) {
		early_error("setsid");
	}

	cpid = fork();
	if (cpid == (pid_t)-1) {
		early_error("fork 2");
	}
	if (cpid != 0) {
		/* Intermediate process: report the daemon's pid and exit. */
		(void) printf("%d\n", (int)cpid);
		(void) fflush(stdout);
		_exit(0);
	}

	/* Only the final daemon process gets this far. */
	(void) chdir("/");
	(void) umask(0);
	(void) fdwalk(fdcloser, NULL);
	reopen_log();
}
/*
 * Walk the open file descriptors with set_cloexec, passing the integer 3
 * (packed into a pointer) as the callback data.
 *
 * NOTE(review): presumably 3 is the lowest descriptor that set_cloexec
 * marks close-on-exec, leaving stdio alone -- confirm against set_cloexec.
 */
static void
pre_exec_close_fds(void)
{
  void *first_fd = GINT_TO_POINTER(3);

  fdwalk (set_cloexec, first_fd);
}
/* The monitor stays around for as long as the initial process in the app
 * does, and exits when that one exits, propagating the exit status. Pid 1
 * in the sandbox detects that exit and reports the status to the monitor
 * through an eventfd. The exit of the sandbox pid 1 itself is tracked via
 * a signalfd for SIGCHLD, in which case we exit with the status carried by
 * the signal. This is to catch e.g. problems during setup.
 *
 * event_fd may be -1, in which case only the signalfd is watched.
 * This function does not return.
 */
static void
monitor_child (int event_fd)
{
  struct signalfd_siginfo siginfo;
  struct pollfd watched[2];
  sigset_t chld_mask;
  uint64_t reported;
  ssize_t nread;
  int sig_fd;
  int n_watched;
  int rc;
  int dont_close[] = { event_fd, -1 };

  /* Close all extra fds in the monitoring process.
     Any passed in fds have been passed on to the child anyway. */
  fdwalk (proc_fd, close_extra_fds, dont_close);

  sigemptyset (&chld_mask);
  sigaddset (&chld_mask, SIGCHLD);

  sig_fd = signalfd (-1, &chld_mask, SFD_CLOEXEC | SFD_NONBLOCK);
  if (sig_fd == -1)
    die_with_error ("Can't create signalfd");

  /* Slot 0 is always the signalfd; slot 1 is the optional eventfd. */
  watched[0].fd = sig_fd;
  watched[0].events = POLLIN;
  n_watched = 1;

  if (event_fd != -1)
    {
      watched[1].fd = event_fd;
      watched[1].events = POLLIN;
      n_watched++;
    }

  for (;;)
    {
      watched[1].revents = 0;
      watched[0].revents = 0;

      rc = poll (watched, n_watched, -1);
      if (rc == -1 && errno != EINTR)
        die_with_error ("poll");

      /* Always read from the eventfd first, if pid 2 died then pid 1 often
       * dies too, and we could race, reporting that first and we'd lose
       * the real exit status. */
      if (event_fd != -1)
        {
          nread = read (event_fd, &reported, 8);
          if (nread == 8)
            exit ((int)reported - 1);   /* the counter carries status + 1 */
          if (nread == -1 && errno != EINTR && errno != EAGAIN)
            die_with_error ("read eventfd");
        }

      nread = read (sig_fd, &siginfo, sizeof (struct signalfd_siginfo));
      if (nread == -1 && errno != EINTR && errno != EAGAIN)
        {
          die_with_error ("read signalfd");
        }
      else if (nread == sizeof (struct signalfd_siginfo))
        {
          if (siginfo.ssi_signo != SIGCHLD)
            die ("Read unexpected signal\n");
          exit (siginfo.ssi_status);
        }
    }
}
/* Prepare the descriptor table of a child process.
 *
 * Every open descriptor is handed to do_prepare_open_fd together with the
 * subprocess 'p'. That callback is expected to close all fd's except the
 * ones we are using, and to clear the close-on-exec flag for socketpair
 * fd's we are passing to the child.
 *
 * Returns whatever fdwalk() returns.
 */
static int preparefd_child (struct subprocess *p)
{
    void *walk_arg = p;
    int rc;

    rc = fdwalk (do_prepare_open_fd, walk_arg);
    return rc;
}
/*
 * Walk the open file descriptors with close_lowfd, passing lowfd (packed
 * into the callback-data pointer) as the threshold.
 *
 * NOTE(review): presumably close_lowfd closes every descriptor that is
 * greater than or equal to lowfd -- confirm against close_lowfd.
 */
void
closefrom (int lowfd)
{
    /*
     * Widen to long before converting to a pointer: casting an int
     * straight to void * is an integer-to-pointer conversion of a
     * different size on LP64 platforms. The value seen by the callback
     * is unchanged.
     */
    fdwalk (close_lowfd, (void *)(long)lowfd);
}
int _z_zone_exec(int *r_status, char **r_results, char *a_inputFile, char *a_path, char *a_argv[], const char *a_zoneName, int *a_fds) { struct sigaction nact; struct sigaction oact; char *buffer; char *thisZoneName; int bufferIndex; int bufferSize; int exit_no; int ipipe[2] = {0, 0}; int lerrno; int n; int status; int stdinfile = -1; int tmpl_fd; pid_t child_pid; pid_t result_pid; void (*funcSighup)(); void (*funcSigint)(); /* entry assertions */ assert(a_path != (char *)NULL); assert(*a_path != '\0'); assert(a_argv != (char **)NULL); assert(a_argv[0] != (char *)NULL); assert(*a_argv[0] != '\0'); assert(a_zoneName != (char *)NULL); /* * if requested to execute in current zone name, directly execute */ thisZoneName = z_get_zonename(); status = (strcmp(a_zoneName, thisZoneName) == 0); /* entry debugging info */ _z_echoDebug(DBG_ZONE_EXEC_CMD_ENTER, a_path, a_zoneName, thisZoneName); (void) free(thisZoneName); for (n = 0; a_argv[n]; n++) { _z_echoDebug(DBG_ARG, n, a_argv[n]); } /* if this zone, just exec the command directly */ if (status != 0) { return (z_ExecCmdArray(r_status, r_results, a_inputFile, a_path, a_argv)); } /* reset return results buffer pointer */ if (r_results != (char **)NULL) { *r_results = (char *)NULL; } *r_status = -1; /* -1 : failure to exec process */ /* if zones are not implemented, return TRUE */ if (!z_zones_are_implemented()) { return (-6); /* -6 : zones are not supported */ } if ((tmpl_fd = _zexec_init_template()) == -1) { _z_program_error(ERR_CANNOT_CREATE_CONTRACT, strerror(errno)); return (-2); /* -2 : cannot create greenline contract */ } /* * See if input file exists */ if (a_inputFile != (char *)NULL) { stdinfile = open(a_inputFile, O_RDONLY); } else { stdinfile = open("/dev/null", O_RDONLY); /* stdin = /dev/null */ } if (stdinfile < 0) { return (-4); /* -4 : could not open stdin source file */ } /* * Create a pipe to be used to capture the command output */ if (pipe(ipipe) != 0) { (void) close(stdinfile); return (-1); } bufferSize 
= PIPE_BUFFER_INCREMENT; bufferIndex = 0; buffer = calloc(1, bufferSize); if (buffer == (char *)NULL) { (void) close(stdinfile); return (-1); } /* flush standard i/o before creating new process */ (void) fflush(stderr); (void) fflush(stdout); /* * hold SIGINT/SIGHUP signals and reset signal received counter; * after the fork1() the parent and child need to setup their respective * interrupt handling and release the hold on the signals */ (void) sighold(SIGINT); (void) sighold(SIGHUP); _z_global_data._z_SigReceived = 0; /* no signals received */ /* * fork off a new process to execute command in; * fork1() is used instead of vfork() so the child process can * perform operations that would modify the parent process if * vfork() were used */ child_pid = fork1(); if (child_pid < 0) { /* * ************************************************************* * fork failed! * ************************************************************* */ (void) ct_tmpl_clear(tmpl_fd); (void) close(tmpl_fd); (void) free(buffer); _z_program_error(ERR_FORK, strerror(errno)); /* release hold on signals */ (void) sigrelse(SIGHUP); (void) sigrelse(SIGINT); return (-3); /* -3 : fork() failed */ } if (child_pid == 0) { int i; /* * ************************************************************* * This is the forked (child) process * ************************************************************* */ (void) ct_tmpl_clear(tmpl_fd); (void) close(tmpl_fd); /* reset any signals to default */ for (i = 0; i < NSIG; i++) { (void) sigset(i, SIG_DFL); } /* assign stdin, stdout, stderr as appropriate */ (void) dup2(stdinfile, STDIN_FILENO); (void) close(ipipe[0]); /* close out pipe reader side */ (void) dup2(ipipe[1], STDOUT_FILENO); (void) dup2(ipipe[1], STDERR_FILENO); /* * close all file descriptors not in the a_fds list */ (void) fdwalk(&_z_close_file_descriptors, (void *)a_fds); /* release all held signals */ (void) sigrelse(SIGHUP); (void) sigrelse(SIGINT); /* execute command in the specified non-global zone */ 
_exit(_zexec(a_zoneName, a_path, a_argv)); } /* * ********************************************************************* * This is the forking (parent) process * ********************************************************************* */ /* register child process i.d. so signal handlers can pass signal on */ _z_global_data._z_ChildProcessId = child_pid; /* * setup signal handlers for SIGINT and SIGHUP and release hold */ /* hook SIGINT to _z_sig_trap() */ nact.sa_handler = _z_sig_trap; nact.sa_flags = SA_RESTART; (void) sigemptyset(&nact.sa_mask); if (sigaction(SIGINT, &nact, &oact) < 0) { funcSigint = SIG_DFL; } else { funcSigint = oact.sa_handler; } /* hook SIGHUP to _z_sig_trap() */ nact.sa_handler = _z_sig_trap; nact.sa_flags = SA_RESTART; (void) sigemptyset(&nact.sa_mask); if (sigaction(SIGHUP, &nact, &oact) < 0) { funcSighup = SIG_DFL; } else { funcSighup = oact.sa_handler; } /* release hold on signals */ (void) sigrelse(SIGHUP); (void) sigrelse(SIGINT); (void) ct_tmpl_clear(tmpl_fd); (void) close(tmpl_fd); (void) close(stdinfile); (void) close(ipipe[1]); /* Close write side of pipe */ /* * Spin reading data from the child into the buffer - when the read eofs * the child has exited */ for (;;) { ssize_t bytesRead; /* read as much child data as there is available buffer space */ bytesRead = read(ipipe[0], buffer + bufferIndex, bufferSize - bufferIndex); /* break out of read loop if end-of-file encountered */ if (bytesRead == 0) { break; } /* if error, continue if recoverable, else break out of loop */ if (bytesRead == -1) { /* try again: EAGAIN - insufficient resources */ if (errno == EAGAIN) { continue; } /* try again: EINTR - interrupted system call */ if (errno == EINTR) { continue; } /* break out of loop - error not recoverable */ break; } /* at least 1 byte read: expand buffer if at end */ bufferIndex += bytesRead; if (bufferIndex >= bufferSize) { buffer = realloc(buffer, bufferSize += PIPE_BUFFER_INCREMENT); (void) memset(buffer + bufferIndex, 0, bufferSize 
- bufferIndex); } } (void) close(ipipe[0]); /* Close read side of pipe */ /* * wait for the process to exit, reap child exit status */ for (;;) { result_pid = waitpid(child_pid, &status, 0L); lerrno = (result_pid == -1 ? errno : 0); /* break loop if child process status reaped */ if (result_pid != -1) { break; } /* break loop if not interrupted out of waitpid */ if (errno != EINTR) { break; } } /* reset child process i.d. so signal handlers do not pass signals on */ _z_global_data._z_ChildProcessId = -1; /* * If the child process terminated due to a call to exit(), then * set results equal to the 8-bit exit status of the child process; * otherwise, set the exit status to "-1" indicating that the child * exited via a signal. */ if (WIFEXITED(status)) { *r_status = WEXITSTATUS(status); if ((_z_global_data._z_SigReceived != 0) && (*r_status == 0)) { *r_status = 1; } } else { *r_status = -1; /* -1 : failure to exec process */ } /* determine proper exit code */ if (result_pid == -1) { exit_no = -5; /* -5 : error from 'waitpid' other than EINTR */ } else if (_z_global_data._z_SigReceived != 0) { exit_no = -7; /* -7 : interrupt received */ } else { exit_no = 0; } /* return appropriate output */ if (!*buffer) { /* No contents in output buffer - discard */ free(buffer); } else if (r_results == (char **)NULL) { /* Not requested to return results - discard */ free(buffer); } else { /* have output and request to return: pass to calling method */ *r_results = buffer; } /* * reset signal handlers */ /* reset SIGINT */ nact.sa_handler = funcSigint; nact.sa_flags = SA_RESTART; (void) sigemptyset(&nact.sa_mask); (void) sigaction(SIGINT, &nact, (struct sigaction *)NULL); /* reset SIGHUP */ nact.sa_handler = funcSighup; nact.sa_flags = SA_RESTART; (void) sigemptyset(&nact.sa_mask); (void) sigaction(SIGHUP, &nact, (struct sigaction *)NULL); /* * if signal received during command execution, interrupt * this process now. 
*/ if (_z_global_data._z_SigReceived != 0) { (void) kill(getpid(), SIGINT); } /* set errno and return */ errno = lerrno; return (exit_no); }
/* * Mail a message on standard input to the people indicated * in the passed header. (Internal interface). */ void mail1(struct header *hp, int use_to, char *orig_to) { pid_t p, pid; int i, s, gotcha; char **namelist, *deliver; struct name *to, *np; FILE *mtf, *fp; int remote = rflag != NOSTR || rmail; char **t; char *deadletter; char recfile[PATHSIZE]; /* * Collect user's mail from standard input. * Get the result as mtf. */ pid = (pid_t)-1; if ((mtf = collect(hp)) == NULL) return; hp->h_seq = 1; if (hp->h_subject == NOSTR) hp->h_subject = sflag; if (fsize(mtf) == 0 && hp->h_subject == NOSTR) { printf(gettext("No message !?!\n")); goto out; } if (intty) { printf(gettext("EOT\n")); flush(); } /* * If we need to use the To: line to determine the record * file, save a copy of it before it's sorted below. */ if (use_to && orig_to == NOSTR && hp->h_to != NOSTR) orig_to = strcpy((char *)salloc(strlen(hp->h_to)+1), hp->h_to); else if (orig_to == NOSTR) orig_to = ""; /* * Now, take the user names from the combined * to and cc lists and do all the alias * processing. */ senderr = 0; to = cat(extract(hp->h_bcc, GBCC), cat(extract(hp->h_to, GTO), extract(hp->h_cc, GCC))); to = translate(outpre(elide(usermap(to)))); if (!senderr) mapf(to, myname); mechk(to); for (gotcha = 0, np = to; np != NIL; np = np->n_flink) if ((np->n_type & GDEL) == 0) gotcha++; hp->h_to = detract(to, GTO); hp->h_cc = detract(to, GCC); hp->h_bcc = detract(to, GBCC); if ((mtf = infix(hp, mtf)) == NULL) { fprintf(stderr, gettext(". . . message lost, sorry.\n")); return; } rewind(mtf); if (askme && isatty(0)) { char ans[64]; puthead(hp, stdout, GTO|GCC|GBCC, 0); printf(gettext("Send? ")); printf("[yes] "); if (fgets(ans, sizeof(ans), stdin) && ans[0] && (tolower(ans[0]) != 'y' && ans[0] != '\n')) goto dead; } if (senderr) goto dead; /* * Look through the recipient list for names with /'s * in them which we write to as files directly. 
*/ i = outof(to, mtf); rewind(mtf); if (!gotcha && !i) { printf(gettext("No recipients specified\n")); goto dead; } if (senderr) goto dead; getrecf(orig_to, recfile, use_to, sizeof (recfile)); if (recfile != NOSTR && *recfile) savemail(safeexpand(recfile), hp, mtf); if (!gotcha) goto out; namelist = unpack(to); if (debug) { fprintf(stderr, "Recipients of message:\n"); for (t = namelist; *t != NOSTR; t++) fprintf(stderr, " \"%s\"", *t); fprintf(stderr, "\n"); return; } /* * Wait, to absorb a potential zombie, then * fork, set up the temporary mail file as standard * input for "mail" and exec with the user list we generated * far above. Return the process id to caller in case he * wants to await the completion of mail. */ #ifdef VMUNIX while (wait3((int *)0, WNOHANG, (struct rusage *)0) > 0) ; #else #ifdef preSVr4 wait((int *)0); #else while (waitpid((pid_t)-1, (int *)0, WNOHANG) > 0) ; #endif #endif rewind(mtf); pid = fork(); if (pid == (pid_t)-1) { perror("fork"); dead: deadletter = Getf("DEAD"); if (fp = fopen(deadletter, value("appenddeadletter") == NOSTR ? "w" : "a")) { chmod(deadletter, DEADPERM); puthead(hp, fp, GMASK|GCLEN, fsize(mtf) - textpos); fseek(mtf, textpos, 0); lcwrite(deadletter, mtf, fp, value("appenddeadletter") != NOSTR); fclose(fp); } else perror(deadletter); goto out; } if (pid == 0) { sigchild(); #ifdef SIGTSTP if (remote == 0) { sigset(SIGTSTP, SIG_IGN); sigset(SIGTTIN, SIG_IGN); sigset(SIGTTOU, SIG_IGN); } #endif sigset(SIGHUP, SIG_IGN); sigset(SIGINT, SIG_IGN); sigset(SIGQUIT, SIG_IGN); s = fileno(mtf); (void) fdwalk(closefd_walk, &s); close(0); dup(s); close(s); #ifdef CC submit(getpid()); #endif /* CC */ if ((deliver = value("sendmail")) == NOSTR) #ifdef SENDMAIL deliver = SENDMAIL; #else deliver = MAIL; #endif execvp(safeexpand(deliver), namelist); perror(deliver); exit(1); } if (value("sendwait")!=NOSTR) remote++; out: if (remote) { while ((p = wait(&s)) != pid && p != (pid_t)-1) ; if (s != 0) senderr++; pid = 0; } fclose(mtf); return; }
/*
 * Walk every open file descriptor and hand it to do_mark_open_fd.
 * No callback data is passed.
 */
void
notice_open_fds (void)
{
    void *no_data = NULL;

    fdwalk (do_mark_open_fd, no_data);
}
int main (int argc, char **argv) { mode_t old_umask; cleanup_free char *base_path = NULL; int clone_flags; char *old_cwd = NULL; pid_t pid; int event_fd = -1; const char *new_cwd; uid_t ns_uid; gid_t ns_gid; /* Get the (optional) capabilities we need, drop root */ acquire_caps (); /* Never gain any more privs during exec */ if (prctl (PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) die_with_error ("prctl(PR_SET_NO_NEW_CAPS) failed"); /* The initial code is run with high permissions (i.e. CAP_SYS_ADMIN), so take lots of care. */ argv0 = argv[0]; if (isatty (1)) host_tty_dev = ttyname (1); argv++; argc--; if (argc == 0) usage (EXIT_FAILURE); parse_args (&argc, &argv); /* We have to do this if we weren't installed setuid, so let's just DWIM */ if (!is_privileged) opt_unshare_user = TRUE; if (argc == 0) usage (EXIT_FAILURE); __debug__(("Creating root mount point\n")); uid = getuid (); if (opt_sandbox_uid == -1) opt_sandbox_uid = uid; gid = getgid (); if (opt_sandbox_gid == -1) opt_sandbox_gid = gid; if (!opt_unshare_user && opt_sandbox_uid != uid) die ("Specifying --uid requires --unshare-user"); if (!opt_unshare_user && opt_sandbox_gid != gid) die ("Specifying --gid requires --unshare-user"); /* We need to read stuff from proc during the pivot_root dance, etc. Lets keep a fd to it open */ proc_fd = open ("/proc", O_RDONLY | O_PATH); if (proc_fd == -1) die_with_error ("Can't open /proc"); /* We need *some* mountpoint where we can mount the root tmpfs. We first try in /run, and if that fails, try in /tmp. 
*/ base_path = xasprintf ("/run/user/%d/.bubblewrap", uid); if (mkdir (base_path, 0755) && errno != EEXIST) { free (base_path); base_path = xasprintf ("/tmp/.bubblewrap-%d", uid); if (mkdir (base_path, 0755) && errno != EEXIST) die_with_error ("Creating root mountpoint failed"); } __debug__(("creating new namespace\n")); if (opt_unshare_pid) { event_fd = eventfd (0, EFD_CLOEXEC | EFD_NONBLOCK); if (event_fd == -1) die_with_error ("eventfd()"); } /* We block sigchild here so that we can use signalfd in the monitor. */ block_sigchild (); clone_flags = SIGCHLD | CLONE_NEWNS; if (opt_unshare_user) clone_flags |= CLONE_NEWUSER; if (opt_unshare_pid) clone_flags |= CLONE_NEWPID; if (opt_unshare_net) clone_flags |= CLONE_NEWNET; if (opt_unshare_ipc) clone_flags |= CLONE_NEWIPC; if (opt_unshare_uts) clone_flags |= CLONE_NEWUTS; pid = raw_clone (clone_flags, NULL); if (pid == -1) { if (opt_unshare_user) { if (errno == EINVAL) die ("Creating new namespace failed, likely because the kernel does not support user namespaces. bwrap must be installed setuid on such systems."); else if (errno == EPERM && !is_privileged) die ("No permissions to creating new namespace, likely because the kernel does not allow non-privileged user namespaces. On e.g. debian this can be enabled with 'sysctl kernel.unprivileged_userns_clone=1'."); } die_with_error ("Creating new namespace failed"); } if (pid != 0) { /* Initial launched process, wait for exec:ed command to exit */ /* We don't need any caps in the launcher, drop them immediately. */ drop_caps (); monitor_child (event_fd); exit (0); /* Should not be reached, but better safe... */ } if (opt_unshare_net && loopback_setup () != 0) die ("Can't create loopback device"); ns_uid = opt_sandbox_uid; ns_gid = opt_sandbox_gid; if (opt_unshare_user) { if (opt_needs_devpts) { /* This is a bit hacky, but we need to first map the real uid/gid to 0, otherwise we can't mount the devpts filesystem because root is not mapped. 
Later we will create another child user namespace and map back to the real uid */ ns_uid = 0; ns_gid = 0; } write_uid_gid_map (ns_uid, uid, ns_gid, gid, TRUE); } old_umask = umask (0); /* Mark everything as slave, so that we still * receive mounts from the real root, but don't * propagate mounts to the real root. */ if (mount (NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) die_with_error ("Failed to make / slave"); /* Create a tmpfs which we will use as / in the namespace */ if (mount ("", base_path, "tmpfs", MS_NODEV|MS_NOSUID, NULL) != 0) die_with_error ("Failed to mount tmpfs"); old_cwd = get_current_dir_name (); /* Chdir to the new root tmpfs mount. This will be the CWD during the entire setup. Access old or new root via "oldroot" and "newroot". */ if (chdir (base_path) != 0) die_with_error ("chdir base_path"); /* We create a subdir "$base_path/newroot" for the new root, that * way we can pivot_root to base_path, and put the old root at * "$base_path/oldroot". This avoids problems accessing the oldroot * dir if the user requested to bind mount something over / */ if (mkdir ("newroot", 0755)) die_with_error ("Creating newroot failed"); if (mkdir ("oldroot", 0755)) die_with_error ("Creating oldroot failed"); if (pivot_root (base_path, "oldroot")) die_with_error ("pivot_root"); if (chdir ("/") != 0) die_with_error ("chdir / (base path)"); if (is_privileged) { pid_t child; int privsep_sockets[2]; if (socketpair (AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0, privsep_sockets) != 0) die_with_error ("Can't create privsep socket"); child = fork (); if (child == -1) die_with_error ("Can't fork unprivileged helper"); if (child == 0) { /* Unprivileged setup process */ drop_caps (); close (privsep_sockets[0]); setup_newroot (opt_unshare_pid, privsep_sockets[1]); exit (0); } else { uint32_t buffer[2048]; /* 8k, but is int32 to guarantee nice alignment */ uint32_t op, flags; const char *arg1, *arg2; cleanup_fd int unpriv_socket = -1; unpriv_socket = privsep_sockets[0]; close 
(privsep_sockets[1]); do { op = read_priv_sec_op (unpriv_socket, buffer, sizeof (buffer), &flags, &arg1, &arg2); privileged_op (-1, op, flags, arg1, arg2); if (write (unpriv_socket, buffer, 1) != 1) die ("Can't write to op_socket"); } while (op != PRIV_SEP_OP_DONE); /* Continue post setup */ } } else setup_newroot (opt_unshare_pid, -1); /* The old root better be rprivate or we will send unmount events to the parent namespace */ if (mount ("oldroot", "oldroot", NULL, MS_REC|MS_PRIVATE, NULL) != 0) die_with_error ("Failed to make old root rprivate"); if (umount2 ("oldroot", MNT_DETACH)) die_with_error ("unmount old root"); if (opt_unshare_user && (ns_uid != opt_sandbox_uid || ns_gid != opt_sandbox_gid)) { /* Now that devpts is mounted and we've no need for mount permissions we can create a new userspace and map our uid 1:1 */ if (unshare (CLONE_NEWUSER)) die_with_error ("unshare user ns"); write_uid_gid_map (opt_sandbox_uid, ns_uid, opt_sandbox_gid, ns_gid, FALSE); } /* Now make /newroot the real root */ if (chdir ("/newroot") != 0) die_with_error ("chdir newroot"); if (chroot ("/newroot") != 0) die_with_error ("chroot /newroot"); if (chdir ("/") != 0) die_with_error ("chdir /"); /* Now we have everything we need CAP_SYS_ADMIN for, so drop it */ drop_caps (); if (opt_seccomp_fd != -1) { cleanup_free char *seccomp_data = NULL; size_t seccomp_len; struct sock_fprog prog; seccomp_data = load_file_data (opt_seccomp_fd, &seccomp_len); if (seccomp_data == NULL) die_with_error ("Can't read seccomp data"); if (seccomp_len % 8 != 0) die ("Invalide seccomp data, must be multiple of 8"); prog.len = seccomp_len / 8; prog.filter = (struct sock_filter *)seccomp_data; close (opt_seccomp_fd); if (prctl (PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) die_with_error ("prctl(PR_SET_SECCOMP)"); } umask (old_umask); new_cwd = "/"; if (opt_chdir_path) { if (chdir (opt_chdir_path)) die_with_error ("Can't chdir to %s", opt_chdir_path); new_cwd = opt_chdir_path; } else if (chdir (old_cwd) 
== 0) { /* If the old cwd is mapped in the sandbox, go there */ new_cwd = old_cwd; } else { /* If the old cwd is not mapped, go to home */ const char *home = getenv ("HOME"); if (home != NULL && chdir (home) == 0) new_cwd = home; } xsetenv ("PWD", new_cwd, 1); free (old_cwd); __debug__(("forking for child\n")); if (opt_unshare_pid || lock_files != NULL || opt_sync_fd != -1) { /* We have to have a pid 1 in the pid namespace, because * otherwise we'll get a bunch of zombies as nothing reaps * them. Alternatively if we're using sync_fd or lock_files we * need some process to own these. */ pid = fork (); if (pid == -1) die_with_error("Can't fork for pid 1"); if (pid != 0) { /* Close fds in pid 1, except stdio and optionally event_fd (for syncing pid 2 lifetime with monitor_child) and opt_sync_fd (for syncing sandbox lifetime with outside process). Any other fds will been passed on to the child though. */ { int dont_close[3]; int j = 0; if (event_fd != -1) dont_close[j++] = event_fd; if (opt_sync_fd != -1) dont_close[j++] = opt_sync_fd; dont_close[j++] = -1; fdwalk (proc_fd, close_extra_fds, dont_close); } return do_init (event_fd, pid); } } __debug__(("launch executable %s\n", argv[0])); if (proc_fd != -1) close (proc_fd); if (opt_sync_fd != -1) close (opt_sync_fd); /* We want sigchild in the child */ unblock_sigchild (); if (label_exec (opt_exec_label) == -1) die_with_error ("label_exec %s", argv[0]); if (execvp (argv[0], argv) == -1) die_with_error ("execvp %s", argv[0]); return 0; }
/*
 * Walk every open file descriptor and hand it to do_close.
 * No callback data is passed; which descriptors are actually closed is
 * decided by do_close.
 */
void
close_most_fds (void)
{
    void *no_data = NULL;

    fdwalk (do_close, no_data);
}