Example #1
0
int main(int argc, char *argv[])
{
	/* Entries below the new root that are unlinked before the
	 * directories get created (NULL-terminated). */
	static const char *unlink_paths[] = {
		"dev/shm",
		"dev/ptmx",
		NULL
	};
	/* Directories, relative to the new root, created when missing. */
	static const dir_op_t dirs[] = {
		dir("dev",	0755),
		dir("dev/net",	0755),
		dir("dev/shm",	0755),
		dir("etc",	0755),
		dir("proc",	0755),
		dir("sys",	0755),
		dir("tmp",	01777),
		dir("dev/pts",	0755),
	};
	/* Device nodes bind mounted one at a time into the new root
	 * (NULL-terminated). */
	static const char *devnodes[] = {
		"/dev/null",
		"/dev/zero",
		"/dev/full",
		"/dev/random",
		"/dev/urandom",
		"/dev/tty",
		"/dev/net/tun",
		"/dev/console",
		NULL
	};
	/* Directories bind mounted into the new root. */
	static const mount_point mount_table[] = {
		{ "/proc", "/proc", "bind", NULL, MS_BIND|MS_REC },
		{ "/sys", "/sys", "bind", NULL, MS_BIND|MS_REC },
		{ "/dev/shm", "/dev/shm", "bind", NULL, MS_BIND },
		{ "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND },
	};
	const mount_point *mnt;
	const dir_op_t *d;
	const char **path;
	const char *root;
	char to[4096];
	int rootfd;
	int len;

	exit_if(argc < 2,
		"Usage: %s /path/to/root", argv[0]);

	root = argv[1];

	/* Turn stage2's root into a mount point of its own. An application
	 * chrooted into a directory that is not a mount point cannot remount
	 * "/" as a private mount, which is what Docker needs in order to run
	 * inside rkt.
	 * MS_REC keeps the volumes systemd-nspawn mounted earlier on behalf
	 * of "rkt run -volume".
	 */
	pexit_if(mount(root, root, "bind", MS_BIND | MS_REC, NULL) == -1,
			"Make / a mount point failed");

	pexit_if((rootfd = open(root, O_DIRECTORY | O_CLOEXEC)) < 0,
		"Failed to open directory \"%s\"", root);

	/* Some images ship symlinks which, seen from stage1 before the
	 * chroot, dangle (e.g. "/dev/shm" -> "/run/shm"). Drop them; a
	 * missing entry or a real directory is left alone.
	 */
	for (path = unlink_paths; *path != NULL; path++) {
		if (unlinkat(rootfd, *path, 0) == 0)
			continue;
		pexit_if(errno != ENOENT && errno != EISDIR,
			 "Failed to unlink \"%s\"", *path);
	}

	/* Create the directories; the modes in the table are applied
	 * verbatim because the umask is cleared first. */
	umask(0);
	for (d = dirs; d < dirs + nelems(dirs); d++) {
		int rc = mkdirat(rootfd, d->name, d->mode);

		pexit_if(rc == -1 && errno != EEXIST,
			"Failed to create directory \"%s/%s\"", root, d->name);
	}

	exit_if(!ensure_etc_hosts_exists(root, rootfd),
		"Failed to ensure \"%s/etc/hosts\" exists", root);

	close(rootfd);

	/* systemd-nspawn already creates a few /dev entries in the container
	 * namespace: copy_devnodes()
	 * http://cgit.freedesktop.org/systemd/systemd/tree/src/nspawn/nspawn.c?h=v219#n1345
	 *
	 * The apps cannot see them though, because the chroot hides them, so
	 * each one is bind mounted across the chroot border individually.
	 *
	 * /dev is NOT bind mounted as a whole: that would shadow individual
	 * bind mounts set up by stage0 ("rkt run --volume...").
	 *
	 * mknod is NOT used either: /dev/console is a bind mount to a pts,
	 * and pts device nodes only work on a devpts filesystem.
	 */
	for (path = devnodes; *path != NULL; path++) {
		const char *from = *path;
		int fd;

		/* A missing node is skipped: the kernel may not provide it
		 * (e.g. built without CONFIG_TUN) or systemd-nspawn may not
		 * (/dev/net/tun is not available with systemd-nspawn < v217).
		 */
		if (access(from, F_OK) != 0)
			continue;

		len = snprintf(to, sizeof(to), "%s%s", root, from);
		exit_if((size_t)len >= sizeof(to),
			"Path too long: \"%s\"", to);

		/* Create the mount target. Its mode is irrelevant since it is
		 * bind mounted over right away; a failure here surfaces in
		 * the mount() below.
		 */
		fd = open(to, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY, 0644);
		if (fd != -1)
			close(fd);

		pexit_if(mount(from, to, "bind", MS_BIND, NULL) == -1,
				"Mounting \"%s\" on \"%s\" failed", from, to);
	}

	/* Bind mount directories */
	for (mnt = mount_table; mnt < mount_table + nelems(mount_table); mnt++) {
		len = snprintf(to, sizeof(to), "%s/%s", root, mnt->target);
		exit_if((size_t)len >= sizeof(to),
			"Path too long: \"%s\"", to);

		pexit_if(mount(mnt->source, to, mnt->type,
			       mnt->flags, mnt->options) == -1,
				"Mounting \"%s\" on \"%s\" failed", mnt->source, to);
	}

	/* /dev/ptmx -> /dev/pts/ptmx */
	len = snprintf(to, sizeof(to), "%s/dev/ptmx", root);
	exit_if((size_t)len >= sizeof(to),
		"Path too long: \"%s\"", to);
	pexit_if(symlink("/dev/pts/ptmx", to) == -1,
		"Failed to create /dev/ptmx symlink");

	return EXIT_SUCCESS;
}
Example #2
0
static void diag(const char *exe)
{
	static const uint8_t	elf[] = {0x7f, 'E', 'L', 'F'};
	static const uint8_t	shebang[] = {'#','!'};
	static int		diag_depth;
	struct stat		st;
	const uint8_t		*mm;
	const char		*itrp = NULL;

	map_file(exe, PROT_READ, MAP_SHARED, &st, (void **)&mm);
	exit_if(!((S_IXUSR|S_IXGRP|S_IXOTH) & st.st_mode),
		"\"%s\" is not executable", exe)

	if(st.st_size >= sizeof(shebang) &&
	   !memcmp(mm, shebang, sizeof(shebang))) {
		const uint8_t	*nl;
		int		maxlen = MIN(PATH_MAX, st.st_size - sizeof(shebang));
		/* TODO(vc): EOF-terminated shebang lines are technically possible */
		exit_if(!(nl = memchr(&mm[sizeof(shebang)], '\n', maxlen)),
			"Shebang line too long");
		pexit_if(!(itrp = strndup((char *)&mm[sizeof(shebang)], (nl - mm) - 2)),
			"Failed to dup interpreter path");
	} else if(st.st_size >= sizeof(elf) &&
		  !memcmp(mm, elf, sizeof(elf))) {
		uint64_t	(*lget)(const uint8_t *) = NULL;
		uint32_t	(*iget)(const uint8_t *) = NULL;
		uint16_t	(*sget)(const uint8_t *) = NULL;
		const void	*phoff = NULL, *phesz = NULL, *phecnt = NULL;
		const uint8_t	*ph = NULL;
		int		i, phreloff, phrelsz;

		exit_if(mm[ELF_VERSION] != 1,
			"Unsupported ELF version: %hhx", mm[ELF_VERSION]);

		/* determine which accessors to use and where */
		if(mm[ELF_BITS] == ELF_BITS_32) {
			if(mm[ELF_ENDIAN] == ELF_ENDIAN_LITL) {
				lget = le32_lget;
				sget = le_sget;
				iget = le_iget;
			} else if(mm[ELF_ENDIAN] == ELF_ENDIAN_BIG) {
				lget = be32_lget;
				sget = be_sget;
				iget = be_iget;
			}
			phoff = &mm[ELF32_PHT_OFF];
			phesz = &mm[ELF32_PHTE_SIZE];
			phecnt = &mm[ELF32_PHTE_CNT];
			phreloff = ELF32_PHE_OFF;
			phrelsz = ELF32_PHE_SIZE;
		} else if(mm[ELF_BITS] == ELF_BITS_64) {
			if(mm[ELF_ENDIAN] == ELF_ENDIAN_LITL) {
				lget = le64_lget;
				sget = le_sget;
				iget = le_iget;
			} else if(mm[ELF_ENDIAN] == ELF_ENDIAN_BIG) {
				lget = be64_lget;
				sget = be_sget;
				iget = be_iget;
			}
			phoff = &mm[ELF64_PHT_OFF];
			phesz = &mm[ELF64_PHTE_SIZE];
			phecnt = &mm[ELF64_PHTE_CNT];
			phreloff = ELF64_PHE_OFF;
			phrelsz = ELF64_PHE_SIZE;
		}

		exit_if(!lget, "Unsupported ELF format");

		if(!phoff) /* program header may be absent, don't make it an error */
			return;

		/* TODO(vc): sanity checks on values before using them */
		for(ph = &mm[lget(phoff)], i = 0; i < sget(phecnt); i++, ph += sget(phesz)) {
			if(iget(ph) == ELF_PT_INTERP) {
				itrp = strndup((char *)&mm[lget(&ph[phreloff])], lget(&ph[phrelsz]));
				break;
			}
		}
	} else {
		exit_if(1, "Unsupported file type");
	}

	exit_if(!itrp, "Unable to determine interpreter for \"%s\"", exe);
	exit_if(*itrp != '/', "Path must be absolute: \"%s\"", itrp);
	exit_if(++diag_depth > MAX_DIAG_DEPTH,
		"Excessive interpreter recursion, giving up");
	diag(itrp);
}
Example #3
0
/*
 * Run a command inside the pod that process "pid" belongs to.
 *
 * Usage: <prog> pid imageid cmd [args...]
 *
 * Joins the ipc, uts, net, pid and mnt namespaces of "pid", chroots into the
 * root of "pid", then forks. The child executes
 *   /diagexec /opt/stage2/<imageid>/rootfs / /rkt/env/<imageid> 0 0 cmd [args...]
 * while the parent waits for it, nsenter-like: when the child is stopped the
 * parent stops itself too and resumes the child once it has been continued.
 *
 * Returns the exit status of the child when it exited. When the child was
 * killed by a signal, that signal is sent to this process as well and
 * EXIT_FAILURE is returned if it survives it.
 */
int main(int argc, char *argv[])
{
	int	fd;
	int	pid;
	pid_t	child;
	int	status = 0;
	int	root_fd;

	exit_if(argc < 4,
		"Usage: %s pid imageid cmd [args...]", argv[0]);

	pid = atoi(argv[1]);
	/* Grabbed before switching namespaces; used for the chroot below */
	root_fd = openpidfd(pid, "root");

/* Join one namespace of pid. The descriptor is only needed for the setns()
 * call itself, so it is closed right away. */
#define ns(_typ, _nam)							\
	do {								\
		fd = openpidfd(pid, _nam);				\
		pexit_if(setns(fd, _typ),				\
			"Unable to enter " _nam " namespace");		\
		pexit_if(close(fd) == -1,				\
			"Unable to close " _nam " namespace fd");	\
	} while(0)

#if 0
	/* TODO(vc): Nspawn isn't employing CLONE_NEWUSER, disabled for now */
	ns(CLONE_NEWUSER, "ns/user");
#endif
	ns(CLONE_NEWIPC,  "ns/ipc");
	ns(CLONE_NEWUTS,  "ns/uts");
	ns(CLONE_NEWNET,  "ns/net");
	ns(CLONE_NEWPID,  "ns/pid");
	ns(CLONE_NEWNS,	  "ns/mnt");

	pexit_if(fchdir(root_fd) < 0,
		"Unable to chdir to pod root");
	pexit_if(chroot(".") < 0,
		"Unable to chroot");
	pexit_if(close(root_fd) == -1,
		"Unable to close root_fd");

	/* Fork is required to realize consequence of CLONE_NEWPID */
	pexit_if(((child = fork()) == -1),
		"Unable to fork");

/* some stuff make the argv->args copy less cryptic */
#define ENTER_ARGV_FWD_OFFSET		3
#define DIAGEXEC_ARGV_FWD_OFFSET	6
#define args_fwd_idx(_idx) \
	(((_idx) - ENTER_ARGV_FWD_OFFSET) + DIAGEXEC_ARGV_FWD_OFFSET)

	if(child == 0) {
		char		root[PATH_MAX];
		char		env[PATH_MAX];
		char		*args[args_fwd_idx(argc) + 1 /* NULL terminator */];
		int		i;

		/* Child goes on to execute /diagexec */

		/* snprintf() returns the length the complete string would
		 * have, so any value >= the buffer size means truncation. */
		exit_if(snprintf(root, sizeof(root),
				 "/opt/stage2/%s/rootfs", argv[2]) >= sizeof(root),
			"Root path overflow");

		exit_if(snprintf(env, sizeof(env),
				 "/rkt/env/%s", argv[2]) >= sizeof(env),
			"Env path overflow");

		args[0] = "/diagexec";
		args[1] = root;
		args[2] = "/";	/* TODO(vc): plumb this into app.WorkingDirectory */
		args[3] = env;
		args[4] = "0"; /* uid */
		args[5] = "0"; /* gid */
		for(i = ENTER_ARGV_FWD_OFFSET; i < argc; i++) {
			args[args_fwd_idx(i)] = argv[i];
		}
		args[args_fwd_idx(i)] = NULL;

		pexit_if(execv(args[0], args) == -1,
			"Exec failed");
	}

	/* Wait for child, nsenter-like */
	for(;;) {
		pid_t	waited = waitpid(child, &status, WUNTRACED);

		/* status holds nothing meaningful when the wait itself fails */
		pexit_if(waited == -1,
			"Unable to wait for child");

		/* waitpid() reports the pid of the process whose state
		 * changed, i.e. our child, not the pid we entered */
		if(waited == child && WIFSTOPPED(status)) {
			kill(getpid(), SIGSTOP);
			/* the above stops us, upon receiving SIGCONT we'll
			 * continue here and inform our child */
			kill(child, SIGCONT);
		} else {
			break;
		}
	}

	if(WIFEXITED(status)) {
		exit(WEXITSTATUS(status));
	} else if(WIFSIGNALED(status)) {
		/* die the same way the child did */
		kill(getpid(), WTERMSIG(status));
	}

	return EXIT_FAILURE;
}