Example #1
/**
 * Create a writable mimic directory based on a reference directory.
 *
 * The mimic directory is a tmpfs populated with bind mounts to the (possibly
 * read-only) directories in the reference directory. While all the read-only
 * content stays read-only, the actual mimic directory is writable so
 * additional content can be placed there.
 *
 * Flags are forwarded to sc_quirk_mkdir_bind().
 **/
static void sc_quirk_create_writable_mimic(const char *mimic_dir,
					   const char *ref_dir, unsigned flags)
{
	debug("creating writable mimic directory %s based on %s", mimic_dir,
	      ref_dir);
	sc_quirk_setup_tmpfs(mimic_dir);

	// Now copy the ownership and permissions of the mimicked directory
	struct stat stat_buf;
	if (stat(ref_dir, &stat_buf) < 0) {
		die("cannot stat %s", ref_dir);
	}
	if (chown(mimic_dir, stat_buf.st_uid, stat_buf.st_gid) < 0) {
		die("cannot chown for %s", mimic_dir);
	}
	if (chmod(mimic_dir, stat_buf.st_mode) < 0) {
		die("cannot chmod for %s", mimic_dir);
	}

	debug("bind-mounting all the files from the reference directory");
	DIR *dirp SC_CLEANUP(sc_cleanup_closedir) = NULL;
	dirp = opendir(ref_dir);
	if (dirp == NULL) {
		die("cannot open reference directory %s", ref_dir);
	}
	struct dirent *entryp = NULL;
	do {
		char src_name[PATH_MAX * 2] = { 0 };
		char dest_name[PATH_MAX * 2] = { 0 };
		// Set errno to zero; if readdir fails it will not only return NULL but
		// also set errno to a non-zero value. This is how we can differentiate
		// end-of-directory from an actual error.
		errno = 0;
		entryp = readdir(dirp);
		if (entryp == NULL && errno != 0) {
			die("cannot read another directory entry");
		}
		if (entryp == NULL) {
			break;
		}
		if (strcmp(entryp->d_name, ".") == 0
		    || strcmp(entryp->d_name, "..") == 0) {
			continue;
		}
		if (entryp->d_type != DT_DIR && entryp->d_type != DT_REG) {
			die("unsupported entry type of file %s (%d)",
			    entryp->d_name, entryp->d_type);
		}
		sc_must_snprintf(src_name, sizeof src_name, "%s/%s", ref_dir,
				 entryp->d_name);
		sc_must_snprintf(dest_name, sizeof dest_name, "%s/%s",
				 mimic_dir, entryp->d_name);
		sc_quirk_mkdir_bind(src_name, dest_name, flags);
	} while (entryp != NULL);
}
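/*
 * A minimal sketch (not part of this codebase) of what the mimic boils down
 * to at the mount(2) level, assuming a hypothetical reference copy under
 * /snap/core/current/var/lib with a single entry "dpkg"; the sc_quirk_*
 * helpers and most error handling are omitted.
 */
static void sketch_writable_mimic(void)
{
	// 1. Cover the target with a fresh, writable tmpfs.
	if (mount("none", "/var/lib", "tmpfs", MS_NODEV | MS_NOSUID, NULL) < 0) {
		die("cannot mount tmpfs over /var/lib");
	}
	// 2. Recreate the entry inside the tmpfs and bind the (possibly
	// read-only) original from the reference directory onto it.
	if (mkdir("/var/lib/dpkg", 0755) < 0) {
		die("cannot create /var/lib/dpkg");
	}
	if (mount("/snap/core/current/var/lib/dpkg", "/var/lib/dpkg",
		  NULL, MS_BIND, NULL) < 0) {
		die("cannot bind-mount reference entry");
	}
	// The bound entry keeps its original read-only nature while /var/lib
	// itself stays writable for additional content.
}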
static void validate_bpfpath_is_safe(const char *path)
{
	if (path == NULL || strlen(path) == 0 || path[0] != '/') {
		die("valid_bpfpath_is_safe needs an absolute path as input");
	}
	// strtok_r() modifies its first argument, so work on a copy
	char *tokenized SC_CLEANUP(sc_cleanup_string) = NULL;
	tokenized = strdup(path);
	if (tokenized == NULL) {
		die("cannot allocate memory for copy of path");
	}
	// allocate a string large enough to hold path, and initialize it to
	// '/'
	size_t checked_path_size = strlen(path) + 1;
	char *checked_path SC_CLEANUP(sc_cleanup_string) = NULL;
	checked_path = calloc(checked_path_size, 1);
	if (checked_path == NULL) {
		die("cannot allocate memory for checked_path");
	}

	checked_path[0] = '/';
	checked_path[1] = '\0';

	// validate '/'
	validate_path_has_strict_perms(checked_path);

	// strtok_r needs a pointer to keep track of where it is in the
	// string.
	char *buf_saveptr = NULL;

	// reconstruct the path from '/' down to profile_name
	char *buf_token = strtok_r(tokenized, "/", &buf_saveptr);
	while (buf_token != NULL) {
		char *prev SC_CLEANUP(sc_cleanup_string) = NULL;
		prev = strdup(checked_path);	// needed by vsnprintf in sc_must_snprintf
		if (prev == NULL) {
			die("cannot allocate memory for copy of checked_path");
		}
		// append '<buf_token>' if checked_path is '/', otherwise '/<buf_token>'
		if (strlen(checked_path) == 1) {
			sc_must_snprintf(checked_path, checked_path_size,
					 "%s%s", prev, buf_token);
		} else {
			sc_must_snprintf(checked_path, checked_path_size,
					 "%s/%s", prev, buf_token);
		}
		validate_path_has_strict_perms(checked_path);

		buf_token = strtok_r(NULL, "/", &buf_saveptr);
	}
}
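/*
 * validate_path_has_strict_perms() is not included in this excerpt. A sketch
 * of the kind of check it presumably performs, based on the comment in
 * sc_apply_seccomp_bpf() further below (each component from '/' down must be
 * root-owned and not writable by "other"):
 */
static void sketch_validate_path_has_strict_perms(const char *path)
{
	struct stat stat_buf;
	if (stat(path, &stat_buf) < 0) {
		die("cannot stat %s", path);
	}
	if (stat_buf.st_uid != 0) {
		die("%s is not owned by root", path);
	}
	if (stat_buf.st_mode & S_IWOTH) {
		die("%s is writable by others", path);
	}
}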
void sc_cgroup_freezer_join(const char *snap_name, pid_t pid)
{
	// Format the name of the cgroup hierarchy.
	char buf[PATH_MAX] = { 0 };
	sc_must_snprintf(buf, sizeof buf, "snap.%s", snap_name);

	// Open the freezer cgroup directory.
	int cgroup_fd SC_CLEANUP(sc_cleanup_close) = -1;
	cgroup_fd = open(freezer_cgroup_dir,
			 O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (cgroup_fd < 0) {
		die("cannot open freezer cgroup (%s)", freezer_cgroup_dir);
	}
	// Create the freezer hierarchy for the given snap.
	if (mkdirat(cgroup_fd, buf, 0755) < 0 && errno != EEXIST) {
		die("cannot create freezer cgroup hierarchy for snap %s",
		    snap_name);
	}
	// Open the hierarchy directory for the given snap.
	int hierarchy_fd SC_CLEANUP(sc_cleanup_close) = -1;
	hierarchy_fd = openat(cgroup_fd, buf,
			      O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (hierarchy_fd < 0) {
		die("cannot open freezer cgroup hierarchy for snap %s",
		    snap_name);
	}
	// Since we may be running from a setuid but not setgid executable, ensure
	// that the owner and group of the hierarchy directory are root.root.
	if (fchownat(hierarchy_fd, "", 0, 0, AT_EMPTY_PATH) < 0) {
		die("cannot change owner of freezer cgroup hierarchy for snap %s to root.root", snap_name);
	}
	// Open the tasks file.
	int tasks_fd SC_CLEANUP(sc_cleanup_close) = -1;
	tasks_fd = openat(hierarchy_fd, "tasks",
			  O_WRONLY | O_NOFOLLOW | O_CLOEXEC);
	if (tasks_fd < 0) {
		die("cannot open tasks file for freezer cgroup hierarchy for snap %s", snap_name);
	}
	// Write the process (task) number to the tasks file. Linux task IDs are
	// limited to 2^29 so a long int is enough to represent it.
	// See include/linux/threads.h in the kernel source tree for details.
	int n = sc_must_snprintf(buf, sizeof buf, "%ld", (long)pid);
	if (write(tasks_fd, buf, n) < n) {
		die("cannot move process %ld to freezer cgroup hierarchy for snap %s", (long)pid, snap_name);
	}
	debug("moved process %ld to freezer cgroup hierarchy for snap %s",
	      (long)pid, snap_name);
}
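/*
 * Hypothetical usage: move the current process into the freezer hierarchy for
 * a snap named "foo". With freezer_cgroup_dir typically being
 * "/sys/fs/cgroup/freezer", this writes the PID to
 * /sys/fs/cgroup/freezer/snap.foo/tasks, creating snap.foo/ if needed.
 *
 *	sc_cgroup_freezer_join("foo", getpid());
 */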
Example #4
static dev_t find_base_snap_device(const char *base_snap_name,
				   const char *base_snap_rev)
{
	// Find the backing device of the base snap.
	// TODO: add support for "try mode" base snaps that also need
	// consideration of the mie->root component.
	dev_t base_snap_dev = 0;
	char base_squashfs_path[PATH_MAX];
	sc_must_snprintf(base_squashfs_path,
			 sizeof base_squashfs_path, "%s/%s/%s",
			 SNAP_MOUNT_DIR, base_snap_name, base_snap_rev);
	struct sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
	mi = sc_parse_mountinfo(NULL);
	if (mi == NULL) {
		die("cannot parse mountinfo of the current process");
	}
	bool found = false;
	for (struct sc_mountinfo_entry * mie =
	     sc_first_mountinfo_entry(mi); mie != NULL;
	     mie = sc_next_mountinfo_entry(mie)) {
		if (sc_streq(mie->mount_dir, base_squashfs_path)) {
			base_snap_dev = MKDEV(mie->dev_major, mie->dev_minor);
			debug("found base snap filesystem device %d:%d",
			      mie->dev_major, mie->dev_minor);
			// Don't break when found, we are interested in the last
			// entry as this is the "effective" one.
			found = true;
		}
	}
	if (!found) {
		die("cannot find device backing the base snap %s",
		    base_snap_name);
	}
	return base_snap_dev;
}
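/*
 * should_discard_current_ns() (referenced in example #12 below) is not part
 * of this excerpt. A simplified sketch of how the device number returned
 * above could be used: after joining the preserved namespace, compare the
 * device backing "/" with the device backing the current base snap revision.
 */
static bool sketch_should_discard_current_ns(dev_t base_snap_dev)
{
	struct stat root_stat;
	if (stat("/", &root_stat) < 0) {
		die("cannot stat /");
	}
	// A mismatch means the namespace was built from an older base snap
	// revision and should be discarded.
	return root_stat.st_dev != base_snap_dev;
}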
Example #5
static void test_sc_must_snprintf__fail()
{
	if (g_test_subprocess()) {
		char buf[5];
		sc_must_snprintf(buf, sizeof buf, "12345");
		g_test_message("expected sc_must_snprintf not to return");
		g_test_fail();
		return;
	}
	g_test_trap_subprocess(NULL, 0, 0);
	g_test_trap_assert_failed();
	g_test_trap_assert_stderr("cannot format string: 1234\n");
}
Example #6
void sc_setup_quirks(void)
{
	// because /var/lib/snapd is essential let's move it to /tmp/snapd for a sec
	char snapd_tmp[] = "/tmp/snapd.quirks_XXXXXX";
	if (mkdtemp(snapd_tmp) == 0) {
		die("cannot create temporary directory for /var/lib/snapd mount point");
	}
	debug("performing operation: mount --move %s %s", "/var/lib/snapd",
	      snapd_tmp);
	if (mount("/var/lib/snapd", snapd_tmp, NULL, MS_MOVE, NULL)
	    != 0) {
		die("cannot perform operation: mount --move %s %s",
		    "/var/lib/snapd", snapd_tmp);
	}
	// now let's make /var/lib the vanilla /var/lib from the core snap
	char buf[PATH_MAX] = { 0 };
	sc_must_snprintf(buf, sizeof buf, "%s/var/lib",
			 sc_get_inner_core_mount_point());
	sc_quirk_create_writable_mimic("/var/lib", buf,
				       MS_RDONLY | MS_REC | MS_SLAVE | MS_NODEV
				       | MS_NOSUID);
	// now let's move /var/lib/snapd (that was originally there) back
	debug("performing operation: umount %s", "/var/lib/snapd");
	if (umount("/var/lib/snapd") != 0) {
		die("cannot perform operation: umount %s", "/var/lib/snapd");
	}
	debug("performing operation: mount --move %s %s", snapd_tmp,
	      "/var/lib/snapd");
	if (mount(snapd_tmp, "/var/lib/snapd", NULL, MS_MOVE, NULL)
	    != 0) {
		die("cannot perform operation: mount --move %s %s", snapd_tmp,
		    "/var/lib/snapd");
	}
	debug("performing operation: rmdir %s", snapd_tmp);
	if (rmdir(snapd_tmp) != 0) {
		die("cannot perform operation: rmdir %s", snapd_tmp);
	}
	// We are now ready to apply any quirks that relate to /var/lib
	sc_setup_lxd_quirk();
}
Example #7
void sc_discard_preserved_ns_group(struct sc_ns_group *group)
{
	// Remember the current working directory
	int old_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
	old_dir_fd = open(".", O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (old_dir_fd < 0) {
		die("cannot open current directory");
	}
	// Move to the mount namespace directory (/run/snapd/ns)
	if (fchdir(group->dir_fd) < 0) {
		die("cannot move to namespace group directory");
	}
	// Unmount ${group_name}.mnt which holds the preserved namespace
	char mnt_fname[PATH_MAX] = { 0 };
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name,
			 SC_NS_MNT_FILE);
	debug("unmounting preserved mount namespace file %s", mnt_fname);
	if (umount2(mnt_fname, UMOUNT_NOFOLLOW) < 0) {
		switch (errno) {
		case EINVAL:
			// EINVAL is returned when there's nothing to unmount (no bind-mount).
			// Instead of checking for this explicitly (which is always racy) we
			// just unmount and check the return code.
			break;
		case ENOENT:
			// We may be asked to discard a namespace that doesn't yet
			// exist (even the mount point may be absent). We just
			// ignore that error and return gracefully.
			break;
		default:
			die("cannot unmount preserved mount namespace file %s",
			    mnt_fname);
			break;
		}
	}
	// Get back to the original directory
	if (fchdir(old_dir_fd) < 0) {
		die("cannot move back to original directory");
	}
}
int sc_apply_seccomp_bpf(const char *filter_profile)
{
	debug("loading bpf program for security tag %s", filter_profile);

	char profile_path[PATH_MAX] = { 0 };
	sc_must_snprintf(profile_path, sizeof(profile_path), "%s/%s.bin",
			 filter_profile_dir, filter_profile);

	// Wait some time for the security profile to show up. When the system
	// boots, snapd will create security profiles, but a service snap
	// (e.g. network-manager) starts in parallel with snapd, so for such
	// snaps the profiles may not be generated yet.
	long max_wait = 120;
	const char *MAX_PROFILE_WAIT = getenv("SNAP_CONFINE_MAX_PROFILE_WAIT");
	if (MAX_PROFILE_WAIT != NULL) {
		char *endptr = NULL;
		errno = 0;
		long env_max_wait = strtol(MAX_PROFILE_WAIT, &endptr, 10);
		if (errno != 0 || MAX_PROFILE_WAIT == endptr || *endptr != '\0'
		    || env_max_wait <= 0) {
			die("SNAP_CONFINE_MAX_PROFILE_WAIT invalid");
		}
		max_wait = env_max_wait > 0 ? env_max_wait : max_wait;
	}
	if (max_wait > 3600) {
		max_wait = 3600;
	}
	for (long i = 0; i < max_wait; ++i) {
		if (access(profile_path, F_OK) == 0) {
			break;
		}
		sleep(1);
	}

	// validate '/' down to profile_path are root-owned and not
	// 'other' writable to avoid possibility of privilege
	// escalation via bpf program load when paths are incorrectly
	// set on the system.
	validate_bpfpath_is_safe(profile_path);

	// load bpf
	char bpf[MAX_BPF_SIZE + 1] = { 0 };	// +1 so overly long profiles trip the feof() check below
	FILE *fp = fopen(profile_path, "rb");
	if (fp == NULL) {
		die("cannot read %s", profile_path);
	}
	// set 'size' to 1 to get bytes transferred
	size_t num_read = fread(bpf, 1, sizeof(bpf), fp);
	if (ferror(fp) != 0) {
		die("cannot read seccomp profile %s", profile_path);
	} else if (feof(fp) == 0) {
		die("seccomp profile %s exceeds %zu bytes", profile_path,
		    sizeof(bpf));
	}
	fclose(fp);
	debug("read %zu bytes from %s", num_read, profile_path);

	if (sc_streq(bpf, "@unrestricted\n")) {
		return 0;
	}

	uid_t real_uid, effective_uid, saved_uid;
	if (getresuid(&real_uid, &effective_uid, &saved_uid) < 0) {
		die("cannot call getresuid");
	}
	// If we can, raise privileges so that we can load the BPF into the
	// kernel via 'prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, ...)'.
	debug("raising privileges to load seccomp profile");
	if (effective_uid != 0 && saved_uid == 0) {
		if (seteuid(0) != 0) {
			die("seteuid failed");
		}
		if (geteuid() != 0) {
			die("raising privs before seccomp_load did not work");
		}
	}
	// Load filter into the kernel. Importantly we are
	// intentionally *not* setting NO_NEW_PRIVS because it
	// interferes with exec transitions in AppArmor with certain
	// snappy interfaces. Not setting NO_NEW_PRIVS does mean that
	// applications can adjust their sandbox if they have
	// CAP_SYS_ADMIN or, if running on < 4.8 kernels, break out of
	// the seccomp via ptrace. Both CAP_SYS_ADMIN and 'ptrace
	// (trace)' are blocked by AppArmor with typical snappy
	// interfaces.
	struct sock_fprog prog = {
		.len = num_read / sizeof(struct sock_filter),
		.filter = (struct sock_filter *)bpf,
	};
	if (seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, &prog) !=
	    0) {
		if (errno == ENOSYS) {
			debug("kernel doesn't support the seccomp(2) syscall");
		} else if (errno == EINVAL) {
			debug
			    ("kernel may not support the SECCOMP_FILTER_FLAG_LOG flag");
		}

		debug
		    ("falling back to prctl(2) syscall to load seccomp filter");
		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) {
			die("cannot apply seccomp profile");
		}
	}
	// drop privileges again
	debug("dropping privileges after loading seccomp profile");
	if (geteuid() == 0) {
		unsigned real_uid = getuid();
		if (seteuid(real_uid) != 0) {
			die("seteuid failed");
		}
		if (real_uid != 0 && geteuid() == 0) {
			die("dropping privs after seccomp_load did not work");
		}
	}

	return 0;
}
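/*
 * Note on the seccomp() call above: glibc only gained a seccomp(2) wrapper in
 * version 2.30, so on older systems a call like this would typically go
 * through a local wrapper along these lines (a sketch assuming
 * <sys/syscall.h> and <unistd.h>, not necessarily this project's own
 * definition):
 */
static int sketch_seccomp(unsigned int operation, unsigned int flags,
			  void *args)
{
	errno = 0;
	return syscall(__NR_seccomp, operation, flags, args);
}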
Example #9
const char *sc_mount_opt2str(char *buf, size_t buf_size, unsigned long flags)
{
	unsigned long used = 0;
	sc_string_init(buf, buf_size);

#define F(FLAG, TEXT) do {                                         \
    if (flags & (FLAG)) {                                          \
      sc_string_append(buf, buf_size, #TEXT ","); flags ^= (FLAG); \
    }                                                              \
  } while (0)

	F(MS_RDONLY, ro);
	F(MS_NOSUID, nosuid);
	F(MS_NODEV, nodev);
	F(MS_NOEXEC, noexec);
	F(MS_SYNCHRONOUS, sync);
	F(MS_REMOUNT, remount);
	F(MS_MANDLOCK, mand);
	F(MS_DIRSYNC, dirsync);
	F(MS_NOATIME, noatime);
	F(MS_NODIRATIME, nodiratime);
	if (flags & MS_BIND) {
		if (flags & MS_REC) {
			sc_string_append(buf, buf_size, "rbind,");
			used |= MS_REC;
		} else {
			sc_string_append(buf, buf_size, "bind,");
		}
		flags ^= MS_BIND;
	}
	F(MS_MOVE, move);
	// The MS_REC flag is handled separately together with the flags it
	// affects (MS_BIND, MS_PRIVATE, MS_SLAVE, MS_SHARED).
	// XXX: kernel has MS_VERBOSE, glibc has MS_SILENT, both use the same constant
	F(MS_SILENT, silent);
	F(MS_POSIXACL, acl);
	F(MS_UNBINDABLE, unbindable);
	if (flags & MS_PRIVATE) {
		if (flags & MS_REC) {
			sc_string_append(buf, buf_size, "rprivate,");
			used |= MS_REC;
		} else {
			sc_string_append(buf, buf_size, "private,");
		}
		flags ^= MS_PRIVATE;
	}
	if (flags & MS_SLAVE) {
		if (flags & MS_REC) {
			sc_string_append(buf, buf_size, "rslave,");
			used |= MS_REC;
		} else {
			sc_string_append(buf, buf_size, "slave,");
		}
		flags ^= MS_SLAVE;
	}
	if (flags & MS_SHARED) {
		if (flags & MS_REC) {
			sc_string_append(buf, buf_size, "rshared,");
			used |= MS_REC;
		} else {
			sc_string_append(buf, buf_size, "shared,");
		}
		flags ^= MS_SHARED;
	}
	flags ^= used;		// this is just for MS_REC
	F(MS_RELATIME, relatime);
	F(MS_KERNMOUNT, kernmount);
	F(MS_I_VERSION, iversion);
	F(MS_STRICTATIME, strictatime);
#ifndef MS_LAZYTIME
#define MS_LAZYTIME (1<<25)
#endif
	F(MS_LAZYTIME, lazytime);
#ifndef MS_NOSEC
#define MS_NOSEC (1 << 28)
#endif
	F(MS_NOSEC, nosec);
#ifndef MS_BORN
#define MS_BORN (1 << 29)
#endif
	F(MS_BORN, born);
	F(MS_ACTIVE, active);
	F(MS_NOUSER, nouser);
#undef F
	// Render any flags that are unaccounted for.
	if (flags) {
		char of[128] = { 0 };
		sc_must_snprintf(of, sizeof of, "%#lx", flags);
		sc_string_append(buf, buf_size, of);
	}
	// Chop the excess comma from the end.
	size_t len = strnlen(buf, buf_size);
	if (len > 0 && buf[len - 1] == ',') {
		buf[len - 1] = 0;
	}
	return buf;
}
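/*
 * Example use (the expected output follows from the flag table above):
 *
 *	char opts[1000] = { 0 };
 *	sc_mount_opt2str(opts, sizeof opts, MS_RDONLY | MS_BIND | MS_REC);
 *	// opts is now "ro,rbind"
 *	sc_mount_opt2str(opts, sizeof opts, MS_NOSUID | MS_NODEV);
 *	// opts is now "nosuid,nodev"
 */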
Example #10
static void test_sc_must_snprintf()
{
	char buf[5];
	sc_must_snprintf(buf, sizeof buf, "1234");
	g_assert_cmpstr(buf, ==, "1234");
}
Example #11
int sc_create_or_join_ns_group(struct sc_ns_group *group,
			       struct sc_apparmor *apparmor,
			       const char *base_snap_name,
			       const char *snap_name)
{
	// Open the mount namespace file.
	char mnt_fname[PATH_MAX] = { 0 };
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name,
			 SC_NS_MNT_FILE);
	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
	// NOTE: There is no O_EXCL here because the file can be around but
	// doesn't have to be a mounted namespace.
	//
	// If the mounted namespace is discarded with
	// sc_discard_preserved_ns_group() it will revert to a regular file.  If
	// snap-confine is killed for whatever reason after the file is created but
	// before the file is bind-mounted it will also be a regular file.
	mnt_fd = openat(group->dir_fd, mnt_fname,
			O_CREAT | O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
	if (mnt_fd < 0) {
		die("cannot open mount namespace file for namespace group %s",
		    group->name);
	}
	// Check if we got an nsfs-based or procfs file or a regular file. This can
	// be reliably tested because nsfs has a unique filesystem type,
	// NSFS_MAGIC. On older kernels that don't support nsfs yet we can look
	// for PROC_SUPER_MAGIC instead. We verify this with fstatfs() below.
	struct statfs ns_statfs_buf;
	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
		die("cannot perform fstatfs() on the mount namespace file descriptor");
	}
	// Stat the mount namespace as well, this is later used to check if the
	// namespace is used by other processes if we are considering discarding a
	// stale namespace.
	struct stat ns_stat_buf;
	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
		die("cannot perform fstat() on the mount namespace file descriptor");
	}
#ifndef NSFS_MAGIC
// Account for kernel headers old enough to not know about NSFS_MAGIC.
#define NSFS_MAGIC 0x6e736673
#endif
	if (ns_statfs_buf.f_type == NSFS_MAGIC
	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {

		// Inspect and perhaps discard the preserved mount namespace.
		if (sc_inspect_and_maybe_discard_stale_ns
		    (mnt_fd, snap_name, base_snap_name) == EAGAIN) {
			return EAGAIN;
		}
		// Remember the vanilla working directory so that we may attempt to restore it later.
		char *vanilla_cwd SC_CLEANUP(sc_cleanup_string) = NULL;
		vanilla_cwd = get_current_dir_name();
		if (vanilla_cwd == NULL) {
			die("cannot get the current working directory");
		}
		// Move to the mount namespace of the snap we're trying to start.
		debug
		    ("attempting to re-associate the mount namespace with the namespace group %s",
		     group->name);
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot re-associate the mount namespace with namespace group %s", group->name);
		}
		debug
		    ("successfully re-associated the mount namespace with the namespace group %s",
		     group->name);

		// Try to re-locate back to vanilla working directory. This can fail
		// because that directory is no longer present.
		if (chdir(vanilla_cwd) != 0) {
			debug
			    ("cannot remain in %s, moving to the void directory",
			     vanilla_cwd);
			if (chdir(SC_VOID_DIR) != 0) {
				die("cannot change directory to %s",
				    SC_VOID_DIR);
			}
			debug("successfully moved to %s", SC_VOID_DIR);
		}
		return 0;
	}
	debug("initializing new namespace group %s", group->name);
	// Create a new namespace and ask the caller to populate it.
	// For rationale of forking see this:
	// https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
	//
	// The eventfd created here is used to synchronize the child and the parent
	// processes. It effectively tells the child to perform the capture
	// operation.
	group->event_fd = eventfd(0, EFD_CLOEXEC);
	if (group->event_fd < 0) {
		die("cannot create eventfd for mount namespace capture");
	}
	debug("forking support process for mount namespace capture");
	// Store the PID of the "parent" process. This done instead of calls to
	// getppid() because then we can reliably track the PID of the parent even
	// if the child process is re-parented.
	pid_t parent = getpid();
	// Glibc defines pid_t as a signed 32-bit integer. There's no standard way
	// to print PIDs portably so casting to int is the best we can do.
	pid_t pid = fork();
	debug("forked support process has pid %d", (int)pid);
	if (pid < 0) {
		die("cannot fork support process for mount namespace capture");
	}
	if (pid == 0) {
		// This is the child process which will capture the mount namespace.
		//
		// It will do so by bind-mounting the SC_NS_MNT_FILE after the parent
		// process calls unshare() and finishes setting up the namespace
		// completely.
		// Change the hat to a sub-profile that has limited permissions
		// necessary to accomplish the capture of the mount namespace.
		debug
		    ("changing apparmor hat of the support process for mount namespace capture");
		sc_maybe_aa_change_hat(apparmor,
				       "mount-namespace-capture-helper", 0);
		// Configure the child to die as soon as the parent dies. In the odd
		// case where the parent is killed we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the case then
		// we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake
		// us up from eventfd_read() below. In the rare case that the PID numbers
		// overflow and the now-dead parent PID is recycled we will still hang
		// forever on the read from eventfd below.
		debug("ensuring that parent process is still alive");
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has already terminated");
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}
		if (fchdir(group->dir_fd) < 0) {
			die("cannot move process for mount namespace capture to namespace group directory");
		}
		debug
		    ("waiting for eventfd data from the parent process to continue");
		eventfd_t value = 0;
		sc_enable_sanity_timeout();
		if (eventfd_read(group->event_fd, &value) < 0) {
			die("cannot read expected data from eventfd");
		}
		sc_disable_sanity_timeout();
		debug
		    ("capturing mount namespace of process %d in namespace group %s",
		     (int)parent, group->name);
		char src[PATH_MAX] = { 0 };
		char dst[PATH_MAX] = { 0 };
		sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt",
				 (int)parent);
		sc_must_snprintf(dst, sizeof dst, "%s%s", group->name,
				 SC_NS_MNT_FILE);
		if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
			die("cannot bind-mount the mount namespace file %s -> %s", src, dst);
		}
		debug
		    ("successfully captured mount namespace in namespace group %s",
		     group->name);
		exit(0);
	} else {
		group->child = pid;
		// Unshare the mount namespace and set a flag instructing the caller that 
		// the namespace is pristine and needs to be populated now.
		debug("unsharing the mount namespace");
		if (unshare(CLONE_NEWNS) < 0) {
			die("cannot unshare the mount namespace");
		}
		group->should_populate = true;
	}
	return 0;
}
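/*
 * The parent-side half of the handshake is not part of this excerpt. Once the
 * freshly unshared namespace has been fully populated, something along these
 * lines (a sketch, not the actual implementation) would signal the support
 * process through group->event_fd and wait for it to capture the namespace:
 */
static void sketch_preserve_populated_ns_group(struct sc_ns_group *group)
{
	if (!group->should_populate) {
		return;
	}
	// Wake up the child so it bind-mounts /proc/<parent>/ns/mnt onto the
	// ${group_name}.mnt file.
	if (eventfd_write(group->event_fd, 1) < 0) {
		die("cannot signal the mount namespace capture process");
	}
	// Reap the child and make sure the capture succeeded.
	int status = 0;
	if (waitpid(group->child, &status, 0) < 0) {
		die("cannot wait for the mount namespace capture process");
	}
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		die("mount namespace capture process exited abnormally");
	}
}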
Example #12
// The namespace may be stale. To check this we must actually switch into it
// but then we use up our setns call (the kernel misbehaves if we setns twice).
// To work around this we'll fork a child and use it to probe. The child will
// inspect the namespace and send information back via eventfd and then exit
// unconditionally.
static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd,
						 const char *snap_name,
						 const char *base_snap_name)
{
	char base_snap_rev[PATH_MAX] = { 0 };
	char fname[PATH_MAX] = { 0 };
	char mnt_fname[PATH_MAX] = { 0 };
	dev_t base_snap_dev;
	int event_fd SC_CLEANUP(sc_cleanup_close) = -1;

	// Read the revision of the base snap by looking at the current symlink.
	sc_must_snprintf(fname, sizeof fname, "%s/%s/current",
			 SNAP_MOUNT_DIR, base_snap_name);
	if (readlink(fname, base_snap_rev, sizeof base_snap_rev) < 0) {
		die("cannot read revision of base snap %s", fname);
	}
	if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') {
		die("cannot use symbolic link %s - value is too long", fname);
	}
	// Find the device that is backing the current revision of the base snap.
	base_snap_dev = find_base_snap_device(base_snap_name, base_snap_rev);

	// Check if we are running on classic. Do it here because we will always
	// (seemingly) run on a core system once we are inside a mount namespace.
	bool is_classic = is_running_on_classic_distribution();

	// Store the PID of this process. This is done instead of calls to
	// getppid() below because then we can reliably track the PID of the
	// parent even if the child process is re-parented.
	pid_t parent = getpid();

	// Create an eventfd for the communication with the child.
	event_fd = eventfd(0, EFD_CLOEXEC);
	if (event_fd < 0) {
		die("cannot create eventfd for communication with inspection process");
	}
	// Fork a child, it will do the inspection for us.
	pid_t child = fork();
	if (child < 0) {
		die("cannot fork support process for namespace inspection");
	}

	if (child == 0) {
		// This is the child process which will inspect the mount namespace.
		//
		// Configure the child to die as soon as the parent dies. In the odd
		// case where the parent is killed we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the case then
		// we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake
		// us up from eventfd_read() below. In the rare case that the PID
		// numbers overflow and the now-dead parent PID is recycled we will
		// still hang forever on the read from eventfd below.
		debug("ensuring that parent process is still alive");
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has already terminated");
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}

		debug("joining the namespace that we are about to probe");
		// Move to the mount namespace of the snap we're trying to inspect.
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot join the mount namespace in order to inspect it");
		}
		// Check if the namespace needs to be discarded.
		//
		// TODO: enable this for core distributions. This is complex because on
		// core the rootfs is mounted in initrd and is _not_ changed (no
		// pivot_root) and the base snap is again mounted (2nd time) by
		// systemd. This makes us end up in a situation where the outer base
		// snap will never match the rootfs inside the mount namespace.
		bool should_discard =
		    is_classic ? should_discard_current_ns(base_snap_dev) :
		    false;

		// Send this back to the parent: 2 - discard, 1 - keep.
		// Note that we cannot just use 0 and 1 because of the semantics of eventfd(2).
		debug
		    ("sending information about the state of the mount namespace (%s)",
		     should_discard ? "discard" : "keep");
		if (eventfd_write
		    (event_fd,
		     should_discard ? SC_DISCARD_YES : SC_DISCARD_NO) < 0) {
			die("cannot send information about the state of the mount namespace");
		}
		// Exit, we're done.
		debug
		    ("support process for mount namespace inspection is about to finish");
		exit(0);
	}
	// This is back in the parent process.
	//
	// Enable a sanity timeout in case the read blocks for an unbounded amount
	// of time. This ensures we do not hang around while holding the lock.
	// Next, read the value written by the child process.
	sc_enable_sanity_timeout();
	eventfd_t value = 0;
	debug("receiving information about the state of the mount namespace");
	if (eventfd_read(event_fd, &value) < 0) {
		die("cannot receive information about the state of the mount namespace");
	}
	sc_disable_sanity_timeout();

	// Wait for the child process to exit and collect its exit status.
	errno = 0;
	int status = 0;
	if (waitpid(child, &status, 0) < 0) {
		die("cannot wait for the support process for mount namespace inspection");
	}
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		die("support process for mount namespace inspection exited abnormally");
	}
	// If the namespace is up-to-date then we are done.
	if (value == SC_DISCARD_NO) {
		debug("the mount namespace is up-to-date and can be reused");
		return 0;
	}
	// The namespace is stale, let's check if we can discard it.
	debug("the mount namespace is stale and should be discarded");
	if (sc_cgroup_freezer_occupied(snap_name)) {
		// Some processes are still using the namespace so we cannot discard it
		// as that would fracture the view that the set of processes inside
		// have on what is mounted.
		return 0;
	}
	// The namespace is both stale and empty. We can discard it now.
	debug("discarding stale and empty mount namespace");
	sc_must_snprintf(mnt_fname, sizeof mnt_fname,
			 "%s/%s%s", sc_ns_dir, snap_name, SC_NS_MNT_FILE);

	// Use MNT_DETACH as otherwise we get EBUSY.
	if (umount2(mnt_fname, MNT_DETACH | UMOUNT_NOFOLLOW) < 0) {
		die("cannot umount stale mount namespace %s", mnt_fname);
	}
	debug("stale mount namespace discarded");
	return EAGAIN;
}
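/*
 * SC_DISCARD_NO and SC_DISCARD_YES are not defined in this excerpt but are
 * presumably along these lines. Zero cannot be used because eventfd_read()
 * blocks until the counter is non-zero, hence the 1/2 encoding mentioned in
 * the comment above.
 */
enum {
	SC_DISCARD_NO = 1,
	SC_DISCARD_YES = 2,
};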
bool sc_cgroup_freezer_occupied(const char *snap_name)
{
	// Format the name of the cgroup hierarchy.
	char buf[PATH_MAX] = { 0 };
	sc_must_snprintf(buf, sizeof buf, "snap.%s", snap_name);

	// Open the freezer cgroup directory.
	int cgroup_fd SC_CLEANUP(sc_cleanup_close) = -1;
	cgroup_fd = open(freezer_cgroup_dir,
			 O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (cgroup_fd < 0) {
		die("cannot open freezer cgroup (%s)", freezer_cgroup_dir);
	}
	// Open the proc directory.
	int proc_fd SC_CLEANUP(sc_cleanup_close) = -1;
	proc_fd = open("/proc", O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (proc_fd < 0) {
		die("cannot open /proc");
	}
	// Open the hierarchy directory for the given snap.
	int hierarchy_fd SC_CLEANUP(sc_cleanup_close) = -1;
	hierarchy_fd = openat(cgroup_fd, buf,
			      O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (hierarchy_fd < 0) {
		if (errno == ENOENT) {
			return false;
		}
		die("cannot open freezer cgroup hierarchy for snap %s",
		    snap_name);
	}
	// Open the "cgroup.procs" file. Alternatively we could open the "tasks"
	// file and see per-thread data but we don't need that.
	int cgroup_procs_fd SC_CLEANUP(sc_cleanup_close) = -1;
	cgroup_procs_fd = openat(hierarchy_fd, "cgroup.procs",
				 O_RDONLY | O_NOFOLLOW | O_CLOEXEC);
	if (cgroup_procs_fd < 0) {
		die("cannot open cgroup.procs file for freezer cgroup hierarchy for snap %s", snap_name);
	}

	FILE *cgroup_procs SC_CLEANUP(sc_cleanup_file) = NULL;
	cgroup_procs = fdopen(cgroup_procs_fd, "r");
	if (cgroup_procs == NULL) {
		die("cannot convert tasks file descriptor to FILE");
	}
	cgroup_procs_fd = -1;	// cgroup_procs_fd will now be closed by fclose.

	char *line_buf SC_CLEANUP(sc_cleanup_string) = NULL;
	size_t line_buf_size = 0;
	ssize_t num_read;
	struct stat statbuf;
	do {
		num_read = getline(&line_buf, &line_buf_size, cgroup_procs);
		if (num_read < 0 && errno != 0) {
			die("cannot read next PID belonging to snap %s",
			    snap_name);
		}
		if (num_read <= 0) {
			break;
		} else {
			if (line_buf[num_read - 1] == '\n') {
				line_buf[num_read - 1] = '\0';
			} else {
				die("could not find newline in cgroup.procs");
			}
		}
		debug("found process id: %s\n", line_buf);

		if (fstatat(proc_fd, line_buf, &statbuf, AT_SYMLINK_NOFOLLOW) <
		    0) {
			// The process may have died already; skip it rather
			// than reporting a stale PID as an occupant.
			if (errno == ENOENT) {
				continue;
			}
			die("cannot stat /proc/%s", line_buf);
		}
		debug("found process %s belonging to user %d",
		      line_buf, statbuf.st_uid);
		return true;
	} while (num_read > 0);

	return false;
}