Пример #1
0
static void validate_bpfpath_is_safe(const char *path)
{
	if (path == NULL || strlen(path) == 0 || path[0] != '/') {
		die("valid_bpfpath_is_safe needs an absolute path as input");
	}
	// strtok_r() modifies its first argument, so work on a copy
	char *tokenized SC_CLEANUP(sc_cleanup_string) = NULL;
	tokenized = strdup(path);
	if (tokenized == NULL) {
		die("cannot allocate memory for copy of path");
	}
	// allocate a string large enough to hold path, and initialize it to
	// '/'
	size_t checked_path_size = strlen(path) + 1;
	char *checked_path SC_CLEANUP(sc_cleanup_string) = NULL;
	checked_path = calloc(checked_path_size, 1);
	if (checked_path == NULL) {
		die("cannot allocate memory for checked_path");
	}

	checked_path[0] = '/';
	checked_path[1] = '\0';

	// validate '/'
	validate_path_has_strict_perms(checked_path);

	// strtok_r needs a pointer to keep track of where it is in the
	// string.
	char *buf_saveptr = NULL;

	// reconstruct the path from '/' down to profile_name
	char *buf_token = strtok_r(tokenized, "/", &buf_saveptr);
	while (buf_token != NULL) {
		char *prev SC_CLEANUP(sc_cleanup_string) = NULL;
		prev = strdup(checked_path);	// needed by vsnprintf in sc_must_snprintf
		if (prev == NULL) {
			die("cannot allocate memory for copy of checked_path");
		}
		// append '<buf_token>' if checked_path is '/', otherwise '/<buf_token>'
		if (strlen(checked_path) == 1) {
			sc_must_snprintf(checked_path, checked_path_size,
					 "%s%s", prev, buf_token);
		} else {
			sc_must_snprintf(checked_path, checked_path_size,
					 "%s/%s", prev, buf_token);
		}
		validate_path_has_strict_perms(checked_path);

		buf_token = strtok_r(NULL, "/", &buf_saveptr);
	}
}
Пример #2
0
static bool should_discard_current_ns(dev_t base_snap_dev)
{
	// Inspect the namespace and check if we should discard it.
	//
	// The namespace may become "stale" when the rootfs is not the same
	// device we found above. This will happen whenever the base snap is
	// refreshed since the namespace was first created.
	struct sc_mountinfo_entry *mie;
	struct sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;

	mi = sc_parse_mountinfo(NULL);
	if (mi == NULL) {
		die("cannot parse mountinfo of the current process");
	}
	for (mie = sc_first_mountinfo_entry(mi); mie != NULL;
	     mie = sc_next_mountinfo_entry(mie)) {
		if (!sc_streq(mie->mount_dir, "/")) {
			continue;
		}
		// NOTE: we want the initial rootfs just in case overmount
		// was used to do something weird. The initial rootfs was
		// set up by snap-confine and that is the one we want to
		// measure.
		debug("found root filesystem inside the mount namespace %d:%d",
		      mie->dev_major, mie->dev_minor);
		return base_snap_dev != MKDEV(mie->dev_major, mie->dev_minor);
	}
	die("cannot find mount entry of the root filesystem inside snap namespace");
}
Пример #3
0
static dev_t find_base_snap_device(const char *base_snap_name,
				   const char *base_snap_rev)
{
	// Find the backing device of the base snap.
	// TODO: add support for "try mode" base snaps that also need
	// consideration of the mie->root component.
	dev_t base_snap_dev = 0;
	char base_squashfs_path[PATH_MAX];
	sc_must_snprintf(base_squashfs_path,
			 sizeof base_squashfs_path, "%s/%s/%s",
			 SNAP_MOUNT_DIR, base_snap_name, base_snap_rev);
	struct sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
	mi = sc_parse_mountinfo(NULL);
	if (mi == NULL) {
		die("cannot parse mountinfo of the current process");
	}
	bool found = false;
	for (struct sc_mountinfo_entry * mie =
	     sc_first_mountinfo_entry(mi); mie != NULL;
	     mie = sc_next_mountinfo_entry(mie)) {
		if (sc_streq(mie->mount_dir, base_squashfs_path)) {
			base_snap_dev = MKDEV(mie->dev_major, mie->dev_minor);
			debug("found base snap filesystem device %d:%d",
			      mie->dev_major, mie->dev_minor);
			// Don't break when found, we are interested in the last
			// entry as this is the "effective" one.
			found = true;
		}
	}
	if (!found) {
		die("cannot find device backing the base snap %s",
		    base_snap_name);
	}
	return base_snap_dev;
}
Пример #4
0
int sc_nonfatal_mkpath(const char *const path, mode_t mode)
{
	// If asked to create an empty path, return immediately.
	if (strlen(path) == 0) {
		return 0;
	}
	// We're going to use strtok_r, which needs to modify the path, so we'll
	// make a copy of it.
	char *path_copy SC_CLEANUP(sc_cleanup_string) = NULL;
	path_copy = strdup(path);
	if (path_copy == NULL) {
		return -1;
	}
	// Open flags to use while we walk the user data path:
	// - Don't follow symlinks
	// - Don't allow child access to file descriptor
	// - Only open a directory (fail otherwise)
	const int open_flags = O_NOFOLLOW | O_CLOEXEC | O_DIRECTORY;

	// We're going to create each path segment via openat/mkdirat calls instead
	// of mkdir calls, to avoid following symlinks and placing the user data
	// directory somewhere we never intended for it to go. The first step is to
	// get an initial file descriptor.
	int fd SC_CLEANUP(sc_cleanup_close) = AT_FDCWD;
	if (path_copy[0] == '/') {
		fd = open("/", open_flags);
		if (fd < 0) {
			return -1;
		}
	}
	// strtok_r needs a pointer to keep track of where it is in the string.
	char *path_walker = NULL;

	// Initialize tokenizer and obtain first path segment.
	char *path_segment = strtok_r(path_copy, "/", &path_walker);
	while (path_segment) {
		// Try to create the directory.  It's okay if it already existed, but
		// return with error on any other error. Reset errno before attempting
		// this as it may stay stale (errno is not reset if mkdirat(2) returns
		// successfully).
		errno = 0;
		if (mkdirat(fd, path_segment, mode) < 0 && errno != EEXIST) {
			return -1;
		}
		// Open the parent directory we just made (and close the previous one
		// (but not the special value AT_FDCWD) so we can continue down the
		// path.
		int previous_fd = fd;
		fd = openat(fd, path_segment, open_flags);
		if (previous_fd != AT_FDCWD && close(previous_fd) != 0) {
			return -1;
		}
		if (fd < 0) {
			return -1;
		}
		// Obtain the next path segment.
		path_segment = strtok_r(NULL, "/", &path_walker);
	}
	return 0;
}
Пример #5
0
/**
 * Create a writable mimic directory based on reference directory.
 *
 * The mimic directory is a tmpfs populated with bind mounts to the (possibly
 * read only) directories in the reference directory. While all the read-only
 * content stays read-only the actual mimic directory is writable so additional
 * content can be placed there.
 *
 * Flags are forwarded to sc_quirk_mkdir_bind()
 **/
static void sc_quirk_create_writable_mimic(const char *mimic_dir,
					   const char *ref_dir, unsigned flags)
{
	debug("creating writable mimic directory %s based on %s", mimic_dir,
	      ref_dir);
	sc_quirk_setup_tmpfs(mimic_dir);

	// Now copy the ownership and permissions of the mimicked directory
	struct stat stat_buf;
	if (stat(ref_dir, &stat_buf) < 0) {
		die("cannot stat %s", ref_dir);
	}
	if (chown(mimic_dir, stat_buf.st_uid, stat_buf.st_gid) < 0) {
		die("cannot chown for %s", mimic_dir);
	}
	if (chmod(mimic_dir, stat_buf.st_mode) < 0) {
		die("cannot chmod for %s", mimic_dir);
	}

	debug("bind-mounting all the files from the reference directory");
	DIR *dirp SC_CLEANUP(sc_cleanup_closedir) = NULL;
	dirp = opendir(ref_dir);
	if (dirp == NULL) {
		die("cannot open reference directory %s", ref_dir);
	}
	struct dirent *entryp = NULL;
	do {
		char src_name[PATH_MAX * 2] = { 0 };
		char dest_name[PATH_MAX * 2] = { 0 };
		// Set errno to zero, if readdir fails it will not only return null but
		// set errno to a non-zero value. This is how we can differentiate
		// end-of-directory from an actual error.
		errno = 0;
		entryp = readdir(dirp);
		if (entryp == NULL && errno != 0) {
			die("cannot read another directory entry");
		}
		if (entryp == NULL) {
			break;
		}
		if (strcmp(entryp->d_name, ".") == 0
		    || strcmp(entryp->d_name, "..") == 0) {
			continue;
		}
		if (entryp->d_type != DT_DIR && entryp->d_type != DT_REG) {
			die("unsupported entry type of file %s (%d)",
			    entryp->d_name, entryp->d_type);
		}
		sc_must_snprintf(src_name, sizeof src_name, "%s/%s", ref_dir,
				 entryp->d_name);
		sc_must_snprintf(dest_name, sizeof dest_name, "%s/%s",
				 mimic_dir, entryp->d_name);
		sc_quirk_mkdir_bind(src_name, dest_name, flags);
	} while (entryp != NULL);
}
Пример #6
0
struct sc_mountinfo *sc_parse_mountinfo(const char *fname)
{
	struct sc_mountinfo *info = calloc(1, sizeof *info);
	if (info == NULL) {
		return NULL;
	}
	if (fname == NULL) {
		fname = "/proc/self/mountinfo";
	}
	FILE *f SC_CLEANUP(sc_cleanup_file) = NULL;
	f = fopen(fname, "rt");
	if (f == NULL) {
		free(info);
		return NULL;
	}
	char *line SC_CLEANUP(sc_cleanup_string) = NULL;
	size_t line_size = 0;
	struct sc_mountinfo_entry *entry, *last = NULL;
	for (;;) {
		errno = 0;
		if (getline(&line, &line_size, f) == -1) {
			if (errno != 0) {
				sc_free_mountinfo(info);
				return NULL;
			}
			break;
		};
		entry = sc_parse_mountinfo_entry(line);
		if (entry == NULL) {
			sc_free_mountinfo(info);
			return NULL;
		}
		if (last != NULL) {
			last->next = entry;
		} else {
			info->first = entry;
		}
		last = entry;
	}
	return info;
}
Пример #7
0
sc_distro sc_classify_distro(void)
{
	FILE *f SC_CLEANUP(sc_cleanup_file) = fopen(os_release, "r");
	if (f == NULL) {
		return SC_DISTRO_CLASSIC;
	}

	bool is_core = false;
	int core_version = 0;
	char buf[255] = { 0 };

	while (fgets(buf, sizeof buf, f) != NULL) {
		size_t len = strlen(buf);
		if (len > 0 && buf[len - 1] == '\n') {
			buf[len - 1] = '\0';
		}
		if (sc_streq(buf, "ID=\"ubuntu-core\"")
		    || sc_streq(buf, "ID=ubuntu-core")) {
			is_core = true;
		} else if (sc_streq(buf, "VERSION_ID=\"16\"")
			   || sc_streq(buf, "VERSION_ID=16")) {
			core_version = 16;
		} else if (sc_streq(buf, "VARIANT_ID=\"snappy\"")
			   || sc_streq(buf, "VARIANT_ID=snappy")) {
			is_core = true;
		}
	}

	if (!is_core) {
		/* Since classic systems don't have a /meta/snap.yaml file the simple
		   presence of that file qualifies as SC_DISTRO_CORE_OTHER. */
		if (access(meta_snap_yaml, F_OK) == 0) {
			is_core = true;
		}
	}

	if (is_core) {
		if (core_version == 16) {
			return SC_DISTRO_CORE16;
		}
		return SC_DISTRO_CORE_OTHER;
	} else {
		return SC_DISTRO_CLASSIC;
	}
}
Пример #8
0
/**
 * Read /proc/self/mountinfo and check if /run/snapd/ns is a private bind mount.
 *
 * We do this because /run/snapd/ns cannot be shared with any other peers as per:
 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
 **/
static bool sc_is_ns_group_dir_private(void)
{
	struct sc_mountinfo *info SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
	info = sc_parse_mountinfo(NULL);
	if (info == NULL) {
		die("cannot parse /proc/self/mountinfo");
	}
	struct sc_mountinfo_entry *entry = sc_first_mountinfo_entry(info);
	while (entry != NULL) {
		const char *mount_dir = entry->mount_dir;
		const char *optional_fields = entry->optional_fields;
		if (strcmp(mount_dir, sc_ns_dir) == 0
		    && strcmp(optional_fields, "") == 0) {
			// If /run/snapd/ns has no optional fields, we know it is mounted
			// private and there is nothing else to do.
			return true;
		}
		entry = sc_next_mountinfo_entry(entry);
	}
	return false;
}
Пример #9
0
/**
 * Join the preserved mount namespace of a namespace group, or start creating
 * a new one.
 *
 * The mount namespace file "<group->name>SC_NS_MNT_FILE" is opened (and
 * created if missing) relative to group->dir_fd. What happens next depends
 * on the filesystem type of that file:
 *
 * - nsfs or procfs: the file is a preserved mount namespace. It is first
 *   passed to sc_inspect_and_maybe_discard_stale_ns(); if that returns EAGAIN
 *   this function returns EAGAIN as well. Otherwise the process enters the
 *   namespace with setns() and tries to return to its previous working
 *   directory, falling back to SC_VOID_DIR.
 *
 * - anything else: a new namespace has to be created. An eventfd is stored
 *   in group->event_fd and a helper process is forked. The helper waits on
 *   the eventfd and then bind-mounts /proc/<parent>/ns/mnt over the mount
 *   namespace file. The calling process records the helper in group->child,
 *   unshares its mount namespace and sets group->should_populate.
 *
 * Returns 0, or EAGAIN as described above. All other failures are fatal.
 **/
int sc_create_or_join_ns_group(struct sc_ns_group *group,
			       struct sc_apparmor *apparmor,
			       const char *base_snap_name,
			       const char *snap_name)
{
	// Open the mount namespace file.
	char mnt_fname[PATH_MAX] = { 0 };
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name,
			 SC_NS_MNT_FILE);
	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
	// NOTE: There is no O_EXCL here because the file can be around but
	// doesn't have to be a mounted namespace.
	//
	// If the mounted namespace is discarded with
	// sc_discard_preserved_ns_group() it will revert to a regular file.  If
	// snap-confine is killed for whatever reason after the file is created but
	// before the file is bind-mounted it will also be a regular file.
	mnt_fd = openat(group->dir_fd, mnt_fname,
			O_CREAT | O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
	if (mnt_fd < 0) {
		die("cannot open mount namespace file for namespace group %s",
		    group->name);
	}
	// Check if we got an nsfs-based or procfs file or a regular file. This can
	// be reliably tested because nsfs has a unique filesystem type,
	// NSFS_MAGIC. On older kernels that don't support nsfs yet we can look
	// for PROC_SUPER_MAGIC instead.
	// fstatfs() gives us the filesystem type of the opened file.
	struct statfs ns_statfs_buf;
	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
		die("cannot perform fstatfs() on the mount namespace file descriptor");
	}
	// Stat the mount namespace as well, this is later used to check if the
	// namespace is used by other processes if we are considering discarding a
	// stale namespace.
	// NOTE(review): ns_stat_buf is not read anywhere else in this function;
	// confirm whether this fstat() call is still needed.
	struct stat ns_stat_buf;
	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
		die("cannot perform fstat() on the mount namespace file descriptor");
	}
#ifndef NSFS_MAGIC
// Account for kernel headers old enough to not know about NSFS_MAGIC.
#define NSFS_MAGIC 0x6e736673
#endif
	if (ns_statfs_buf.f_type == NSFS_MAGIC
	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {

		// Inspect and perhaps discard the preserved mount namespace.
		// EAGAIN is passed straight back to the caller.
		if (sc_inspect_and_maybe_discard_stale_ns
		    (mnt_fd, snap_name, base_snap_name) == EAGAIN) {
			return EAGAIN;
		}
		// Remember the vanilla working directory so that we may attempt to restore it later.
		char *vanilla_cwd SC_CLEANUP(sc_cleanup_string) = NULL;
		vanilla_cwd = get_current_dir_name();
		if (vanilla_cwd == NULL) {
			die("cannot get the current working directory");
		}
		// Move to the mount namespace of the snap we're trying to start.
		debug
		    ("attempting to re-associate the mount namespace with the namespace group %s",
		     group->name);
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot re-associate the mount namespace with namespace group %s", group->name);
		}
		debug
		    ("successfully re-associated the mount namespace with the namespace group %s",
		     group->name);

		// Try to re-locate back to vanilla working directory. This can fail
		// because that directory is no longer present.
		if (chdir(vanilla_cwd) != 0) {
			debug
			    ("cannot remain in %s, moving to the void directory",
			     vanilla_cwd);
			if (chdir(SC_VOID_DIR) != 0) {
				die("cannot change directory to %s",
				    SC_VOID_DIR);
			}
			debug("successfully moved to %s", SC_VOID_DIR);
		}
		return 0;
	}
	// The file is neither nsfs nor procfs: there is no preserved namespace
	// to join, so a new one has to be created.
	debug("initializing new namespace group %s", group->name);
	// Create a new namespace and ask the caller to populate it.
	// For rationale of forking see this:
	// https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
	//
	// The eventfd created here is used to synchronize the child and the parent
	// processes. It effectively tells the child to perform the capture
	// operation.
	group->event_fd = eventfd(0, EFD_CLOEXEC);
	if (group->event_fd < 0) {
		die("cannot create eventfd for mount namespace capture");
	}
	debug("forking support process for mount namespace capture");
	// Store the PID of the "parent" process. This is done instead of calling
	// getppid() because then we can reliably track the PID of the parent even
	// if the child process is re-parented.
	pid_t parent = getpid();
	pid_t pid = fork();
	// Glibc defines pid as a signed 32bit integer. There's no standard way to
	// print pid's portably so casting to int is the best we can do.
	// NOTE(review): this debug() call runs in both the parent and the child
	// (where it prints 0), and it runs before the fork() error check below.
	// If die() reports errno and debug() can modify it, the reported error
	// may not be the one from fork() - confirm.
	debug("forked support process has pid %d", (int)pid);
	if (pid < 0) {
		die("cannot fork support process for mount namespace capture");
	}
	if (pid == 0) {
		// This is the child process which will capture the mount namespace.
		//
		// It will do so by bind-mounting the SC_NS_MNT_FILE after the parent
		// process calls unshare() and finishes setting up the namespace
		// completely.
		// Change the hat to a sub-profile that has limited permissions
		// necessary to accomplish the capture of the mount namespace.
		debug
		    ("changing apparmor hat of the support process for mount namespace capture");
		sc_maybe_aa_change_hat(apparmor,
				       "mount-namespace-capture-helper", 0);
		// Configure the child to die as soon as the parent dies. In an odd
		// case where the parent is killed then we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the case then
		// we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake
		// us up from eventfd_read() below. In the rare case that the PID numbers
		// overflow and the now-dead parent PID is recycled we will still hang
		// forever on the read from eventfd below.
		debug("ensuring that parent process is still alive");
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				// The parent is gone, there is nothing left to capture.
				debug("parent process has already terminated");
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}
		// Move into the namespace group directory so that the relative
		// destination path used for the bind mount below resolves there.
		if (fchdir(group->dir_fd) < 0) {
			die("cannot move process for mount namespace capture to namespace group directory");
		}
		debug
		    ("waiting for a eventfd data from the parent process to continue");
		eventfd_t value = 0;
		// The blocking read is bracketed by the sanity timeout helpers.
		sc_enable_sanity_timeout();
		if (eventfd_read(group->event_fd, &value) < 0) {
			die("cannot read expected data from eventfd");
		}
		sc_disable_sanity_timeout();
		debug
		    ("capturing mount namespace of process %d in namespace group %s",
		     (int)parent, group->name);
		// Bind-mount the parent's mount namespace over the namespace file.
		char src[PATH_MAX] = { 0 };
		char dst[PATH_MAX] = { 0 };
		sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt",
				 (int)parent);
		sc_must_snprintf(dst, sizeof dst, "%s%s", group->name,
				 SC_NS_MNT_FILE);
		if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
			die("cannot bind-mount the mount namespace file %s -> %s", src, dst);
		}
		debug
		    ("successfully captured mount namespace in namespace group %s",
		     group->name);
		// The helper has done its job and never returns to the caller.
		exit(0);
	} else {
		// This is the parent process; remember the helper's PID.
		group->child = pid;
		// Unshare the mount namespace and set a flag instructing the caller that
		// the namespace is pristine and needs to be populated now.
		debug("unsharing the mount namespace");
		if (unshare(CLONE_NEWNS) < 0) {
			die("cannot unshare the mount namespace");
		}
		group->should_populate = true;
	}
	return 0;
}
/**
 * Check whether the freezer cgroup of a snap still contains a live process.
 *
 * Opens the "snap.<snap_name>" hierarchy below freezer_cgroup_dir and reads
 * its "cgroup.procs" file one PID per line. Each PID is looked up under
 * /proc; PIDs whose /proc entry no longer exists (the process exited after
 * the kernel listed it) are skipped.
 *
 * Returns true as soon as one listed process is found under /proc, false if
 * the hierarchy does not exist or none of the listed processes is alive.
 * Any other failure is fatal (die()).
 **/
bool sc_cgroup_freezer_occupied(const char *snap_name)
{
	// Format the name of the cgroup hierarchy.
	char buf[PATH_MAX] = { 0 };
	sc_must_snprintf(buf, sizeof buf, "snap.%s", snap_name);

	// Open the freezer cgroup directory.
	int cgroup_fd SC_CLEANUP(sc_cleanup_close) = -1;
	cgroup_fd = open(freezer_cgroup_dir,
			 O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (cgroup_fd < 0) {
		die("cannot open freezer cgroup (%s)", freezer_cgroup_dir);
	}
	// Open the proc directory.
	int proc_fd SC_CLEANUP(sc_cleanup_close) = -1;
	proc_fd = open("/proc", O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (proc_fd < 0) {
		die("cannot open /proc");
	}
	// Open the hierarchy directory for the given snap. A missing hierarchy
	// means that no process of the snap was ever placed there.
	int hierarchy_fd SC_CLEANUP(sc_cleanup_close) = -1;
	hierarchy_fd = openat(cgroup_fd, buf,
			      O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (hierarchy_fd < 0) {
		if (errno == ENOENT) {
			return false;
		}
		die("cannot open freezer cgroup hierarchy for snap %s",
		    snap_name);
	}
	// Open the "cgroup.procs" file. Alternatively we could open the "tasks"
	// file and see per-thread data but we don't need that.
	int cgroup_procs_fd SC_CLEANUP(sc_cleanup_close) = -1;
	cgroup_procs_fd = openat(hierarchy_fd, "cgroup.procs",
				 O_RDONLY | O_NOFOLLOW | O_CLOEXEC);
	if (cgroup_procs_fd < 0) {
		die("cannot open cgroup.procs file for freezer cgroup hierarchy for snap %s", snap_name);
	}

	FILE *cgroup_procs SC_CLEANUP(sc_cleanup_file) = NULL;
	cgroup_procs = fdopen(cgroup_procs_fd, "r");
	if (cgroup_procs == NULL) {
		die("cannot convert tasks file descriptor to FILE");
	}
	cgroup_procs_fd = -1;	// cgroup_procs_fd will now be closed by fclose.

	char *line_buf SC_CLEANUP(sc_cleanup_string) = NULL;
	size_t line_buf_size = 0;
	struct stat statbuf;
	for (;;) {
		// getline() returns -1 both at end-of-file and on error, and only
		// an error sets errno. Clear errno before every call so that a
		// stale value (for instance ENOENT left behind by the fstatat()
		// below) is not mistaken for a read error at end-of-file.
		errno = 0;
		ssize_t num_read =
		    getline(&line_buf, &line_buf_size, cgroup_procs);
		if (num_read < 0 && errno != 0) {
			die("cannot read next PID belonging to snap %s",
			    snap_name);
		}
		if (num_read <= 0) {
			break;
		}
		// Every PID line is expected to be newline-terminated; strip the
		// newline so the line can be used as a name under /proc.
		if (line_buf[num_read - 1] != '\n') {
			die("could not find newline in cgroup.procs");
		}
		line_buf[num_read - 1] = '\0';
		debug("found process id: %s\n", line_buf);

		if (fstatat(proc_fd, line_buf, &statbuf, AT_SYMLINK_NOFOLLOW) <
		    0) {
			if (errno != ENOENT) {
				die("cannot stat /proc/%s", line_buf);
			}
			// The process died between being listed and being looked
			// up. It does not occupy the cgroup (and statbuf was not
			// filled in), so move on to the next PID.
			continue;
		}
		debug("found process %s belonging to user %d",
		      line_buf, statbuf.st_uid);
		return true;
	}

	return false;
}