int main(int argc, char *argv[]) { static const char *unlink_paths[] = { "dev/shm", "dev/ptmx", NULL }; static const dir_op_t dirs[] = { dir("dev", 0755), dir("dev/net", 0755), dir("dev/shm", 0755), dir("etc", 0755), dir("proc", 0755), dir("sys", 0755), dir("tmp", 01777), dir("dev/pts", 0755), }; static const char *devnodes[] = { "/dev/null", "/dev/zero", "/dev/full", "/dev/random", "/dev/urandom", "/dev/tty", "/dev/net/tun", "/dev/console", NULL }; static const mount_point mount_table[] = { { "/proc", "/proc", "bind", NULL, MS_BIND|MS_REC }, { "/sys", "/sys", "bind", NULL, MS_BIND|MS_REC }, { "/dev/shm", "/dev/shm", "bind", NULL, MS_BIND }, { "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND }, }; const char *root; int rootfd; char to[4096]; int i; exit_if(argc < 2, "Usage: %s /path/to/root", argv[0]); root = argv[1]; /* Make stage2's root a mount point. Chrooting an application in a * directory which is not a mount point is not nice because the * application would not be able to remount "/" it as private mount. * This allows Docker to run inside rkt. * The recursive flag is to preserve volumes mounted previously by * systemd-nspawn via "rkt run -volume". * */ pexit_if(mount(root, root, "bind", MS_BIND | MS_REC, NULL) == -1, "Make / a mount point failed"); rootfd = open(root, O_DIRECTORY | O_CLOEXEC); pexit_if(rootfd < 0, "Failed to open directory \"%s\"", root); /* Some images have annoying symlinks that are resolved as dangling * links before the chroot in stage1. E.g. "/dev/shm" -> "/run/shm" * Just remove the symlinks. 
*/ for (i = 0; unlink_paths[i]; i++) { pexit_if(unlinkat(rootfd, unlink_paths[i], 0) != 0 && errno != ENOENT && errno != EISDIR, "Failed to unlink \"%s\"", unlink_paths[i]) } /* Create the directories */ umask(0); for (i = 0; i < nelems(dirs); i++) { const dir_op_t *d = &dirs[i]; pexit_if(mkdirat(rootfd, d->name, d->mode) == -1 && errno != EEXIST, "Failed to create directory \"%s/%s\"", root, d->name); } exit_if(!ensure_etc_hosts_exists(root, rootfd), "Failed to ensure \"%s/etc/hosts\" exists", root); close(rootfd); /* systemd-nspawn already creates few /dev entries in the container * namespace: copy_devnodes() * http://cgit.freedesktop.org/systemd/systemd/tree/src/nspawn/nspawn.c?h=v219#n1345 * * But they are not visible by the apps because they are "protected" by * the chroot. * * Bind mount them individually over the chroot border. * * Do NOT bind mount the whole directory /dev because it would shadow * potential individual bind mount by stage0 ("rkt run --volume..."). * * Do NOT use mknod, it would not work for /dev/console because it is * a bind mount to a pts and pts device nodes only work when they live * on a devpts filesystem. */ for (i = 0; devnodes[i]; i++) { const char *from = devnodes[i]; int fd; /* If the file does not exist, skip it. It might be because * the kernel does not provide it (e.g. kernel compiled without * CONFIG_TUN) or because systemd-nspawn does not provide it * (/dev/net/tun is not available with systemd-nspawn < v217 */ if (access(from, F_OK) != 0) continue; exit_if(snprintf(to, sizeof(to), "%s%s", root, from) >= sizeof(to), "Path too long: \"%s\"", to); /* The mode does not matter: it will be bind-mounted over. 
*/ fd = open(to, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY, 0644); if (fd != -1) close(fd); pexit_if(mount(from, to, "bind", MS_BIND, NULL) == -1, "Mounting \"%s\" on \"%s\" failed", from, to); } /* Bind mount directories */ for (i = 0; i < nelems(mount_table); i++) { const mount_point *mnt = &mount_table[i]; exit_if(snprintf(to, sizeof(to), "%s/%s", root, mnt->target) >= sizeof(to), "Path too long: \"%s\"", to); pexit_if(mount(mnt->source, to, mnt->type, mnt->flags, mnt->options) == -1, "Mounting \"%s\" on \"%s\" failed", mnt->source, to); } /* /dev/ptmx -> /dev/pts/ptmx */ exit_if(snprintf(to, sizeof(to), "%s/dev/ptmx", root) >= sizeof(to), "Path too long: \"%s\"", to); pexit_if(symlink("/dev/pts/ptmx", to) == -1, "Failed to create /dev/ptmx symlink"); return EXIT_SUCCESS; }
static void diag(const char *exe) { static const uint8_t elf[] = {0x7f, 'E', 'L', 'F'}; static const uint8_t shebang[] = {'#','!'}; static int diag_depth; struct stat st; const uint8_t *mm; const char *itrp = NULL; map_file(exe, PROT_READ, MAP_SHARED, &st, (void **)&mm); exit_if(!((S_IXUSR|S_IXGRP|S_IXOTH) & st.st_mode), "\"%s\" is not executable", exe) if(st.st_size >= sizeof(shebang) && !memcmp(mm, shebang, sizeof(shebang))) { const uint8_t *nl; int maxlen = MIN(PATH_MAX, st.st_size - sizeof(shebang)); /* TODO(vc): EOF-terminated shebang lines are technically possible */ exit_if(!(nl = memchr(&mm[sizeof(shebang)], '\n', maxlen)), "Shebang line too long"); pexit_if(!(itrp = strndup((char *)&mm[sizeof(shebang)], (nl - mm) - 2)), "Failed to dup interpreter path"); } else if(st.st_size >= sizeof(elf) && !memcmp(mm, elf, sizeof(elf))) { uint64_t (*lget)(const uint8_t *) = NULL; uint32_t (*iget)(const uint8_t *) = NULL; uint16_t (*sget)(const uint8_t *) = NULL; const void *phoff = NULL, *phesz = NULL, *phecnt = NULL; const uint8_t *ph = NULL; int i, phreloff, phrelsz; exit_if(mm[ELF_VERSION] != 1, "Unsupported ELF version: %hhx", mm[ELF_VERSION]); /* determine which accessors to use and where */ if(mm[ELF_BITS] == ELF_BITS_32) { if(mm[ELF_ENDIAN] == ELF_ENDIAN_LITL) { lget = le32_lget; sget = le_sget; iget = le_iget; } else if(mm[ELF_ENDIAN] == ELF_ENDIAN_BIG) { lget = be32_lget; sget = be_sget; iget = be_iget; } phoff = &mm[ELF32_PHT_OFF]; phesz = &mm[ELF32_PHTE_SIZE]; phecnt = &mm[ELF32_PHTE_CNT]; phreloff = ELF32_PHE_OFF; phrelsz = ELF32_PHE_SIZE; } else if(mm[ELF_BITS] == ELF_BITS_64) { if(mm[ELF_ENDIAN] == ELF_ENDIAN_LITL) { lget = le64_lget; sget = le_sget; iget = le_iget; } else if(mm[ELF_ENDIAN] == ELF_ENDIAN_BIG) { lget = be64_lget; sget = be_sget; iget = be_iget; } phoff = &mm[ELF64_PHT_OFF]; phesz = &mm[ELF64_PHTE_SIZE]; phecnt = &mm[ELF64_PHTE_CNT]; phreloff = ELF64_PHE_OFF; phrelsz = ELF64_PHE_SIZE; } exit_if(!lget, "Unsupported ELF format"); if(!phoff) /* 
program header may be absent, don't make it an error */ return; /* TODO(vc): sanity checks on values before using them */ for(ph = &mm[lget(phoff)], i = 0; i < sget(phecnt); i++, ph += sget(phesz)) { if(iget(ph) == ELF_PT_INTERP) { itrp = strndup((char *)&mm[lget(&ph[phreloff])], lget(&ph[phrelsz])); break; } } } else { exit_if(1, "Unsupported file type"); } exit_if(!itrp, "Unable to determine interpreter for \"%s\"", exe); exit_if(*itrp != '/', "Path must be absolute: \"%s\"", itrp); exit_if(++diag_depth > MAX_DIAG_DEPTH, "Excessive interpreter recursion, giving up"); diag(itrp); }
/*
 * Enter the pod that process `pid` belongs to and run a command in it.
 *
 * Usage: <prog> pid imageid cmd [args...]
 *
 * The process joins the ipc, uts, net, pid and mnt namespaces of `pid`,
 * chroots into that process' root, then forks. The child executes
 * /diagexec with the image's rootfs and environment file followed by the
 * forwarded command line. The parent waits for the child, mirrors its
 * stop/continue transitions, and finally exits with the child's exit
 * status or re-raises the signal that terminated it.
 *
 * openpidfd(), exit_if() and pexit_if() are defined elsewhere.
 */
int main(int argc, char *argv[])
{
	int	fd;
	int	pid;
	pid_t	child;
	int	status = 0;
	int	root_fd;

	exit_if(argc < 4,
		"Usage: %s pid imageid cmd [args...]", argv[0]);

	pid = atoi(argv[1]);
	root_fd = openpidfd(pid, "root");

/* Join one namespace of `pid`; the descriptor is not needed afterwards. */
#define ns(_typ, _nam)							\
	fd = openpidfd(pid, _nam);					\
	pexit_if(setns(fd, _typ), "Unable to enter " _nam " namespace");	\
	close(fd);

#if 0
	/* TODO(vc): Nspawn isn't employing CLONE_NEWUSER, disabled for now */
	ns(CLONE_NEWUSER, "ns/user");
#endif
	ns(CLONE_NEWIPC, "ns/ipc");
	ns(CLONE_NEWUTS, "ns/uts");
	ns(CLONE_NEWNET, "ns/net");
	ns(CLONE_NEWPID, "ns/pid");
	ns(CLONE_NEWNS, "ns/mnt");

	pexit_if(fchdir(root_fd) < 0,
		 "Unable to chdir to pod root");
	pexit_if(chroot(".") < 0,
		 "Unable to chroot");
	pexit_if(close(root_fd) == -1,
		 "Unable to close root_fd");

	/* Fork is required to realize consequence of CLONE_NEWPID */
	pexit_if(((child = fork()) == -1),
		 "Unable to fork");

/* some stuff make the argv->args copy less cryptic */
#define ENTER_ARGV_FWD_OFFSET		3
#define DIAGEXEC_ARGV_FWD_OFFSET	6
#define args_fwd_idx(_idx) \
	(((_idx) - ENTER_ARGV_FWD_OFFSET) + DIAGEXEC_ARGV_FWD_OFFSET)

	if(child == 0) {
		char	root[PATH_MAX];
		char	env[PATH_MAX];
		char	*args[args_fwd_idx(argc) + 1 /* NULL terminator */];
		int	i;
		int	n;

		/* Child goes on to execute /diagexec */

		/*
		 * snprintf() returns the length the full string would have
		 * had, so any value >= the buffer size means truncation.
		 */
		n = snprintf(root, sizeof(root),
			     "/opt/stage2/%s/rootfs", argv[2]);
		exit_if(n < 0 || (size_t)n >= sizeof(root),
			"Root path overflow");

		n = snprintf(env, sizeof(env),
			     "/rkt/env/%s", argv[2]);
		exit_if(n < 0 || (size_t)n >= sizeof(env),
			"Env path overflow");

		args[0] = "/diagexec";
		args[1] = root;
		args[2] = "/";	/* TODO(vc): plumb this into app.WorkingDirectory */
		args[3] = env;
		args[4] = "0";	/* uid */
		args[5] = "0";	/* gid */
		for(i = ENTER_ARGV_FWD_OFFSET; i < argc; i++) {
			args[args_fwd_idx(i)] = argv[i];
		}
		args[args_fwd_idx(i)] = NULL;

		pexit_if(execv(args[0], args) == -1,
			 "Exec failed");
	}

	/* Wait for child, nsenter-like */
	for(;;) {
		pid_t	waited = waitpid(child, &status, WUNTRACED);

		/* Without a reaped child `status` would be meaningless. */
		pexit_if(waited == -1,
			 "Unable to wait for child");

		/* Compare with the forked child, not the pod pid from argv. */
		if(waited == child && WIFSTOPPED(status)) {
			kill(getpid(), SIGSTOP);
			/* the above stops us, upon receiving SIGCONT we'll
			 * continue here and inform our child */
			kill(child, SIGCONT);
		} else {
			break;
		}
	}

	if(WIFEXITED(status)) {
		exit(WEXITSTATUS(status));
	} else if(WIFSIGNALED(status)) {
		kill(getpid(), WTERMSIG(status));
	}

	return EXIT_FAILURE;
}