Exemplo n.º 1
0
/**
 * Open the namespace group called group_name.
 *
 * The sc_ns_dir directory is opened first, followed by the per-group lock
 * file (created on demand) relative to that directory. Both descriptors and a
 * private copy of the group name are stored in the returned object.
 *
 * When the directory does not exist and SC_NS_FAIL_GRACEFULLY is set in flags
 * the freshly allocated group is freed and NULL is returned. All other
 * failures are reported by calling die().
 **/
struct sc_ns_group *sc_open_ns_group(const char *group_name,
				     const unsigned flags)
{
	struct sc_ns_group *ns_group = sc_alloc_ns_group();

	// Step 1: the directory that holds all the namespace groups.
	debug("opening namespace group directory %s", sc_ns_dir);
	const int dir_open_flags =
	    O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW;
	ns_group->dir_fd = open(sc_ns_dir, dir_open_flags);
	if (ns_group->dir_fd < 0) {
		// A missing directory is tolerated only if the caller asked for it.
		if (errno == ENOENT && (flags & SC_NS_FAIL_GRACEFULLY) != 0) {
			free(ns_group);
			return NULL;
		}
		die("cannot open directory for namespace group %s", group_name);
	}

	// Step 2: the lock file, named after the group, inside that directory.
	char lock_fname[PATH_MAX];
	must_snprintf(lock_fname, sizeof lock_fname, "%s%s", group_name,
		      SC_NS_LOCK_FILE);
	debug("opening lock file for namespace group %s", group_name);
	const int lock_open_flags = O_CREAT | O_RDWR | O_CLOEXEC | O_NOFOLLOW;
	ns_group->lock_fd =
	    openat(ns_group->dir_fd, lock_fname, lock_open_flags, 0600);
	if (ns_group->lock_fd < 0) {
		die("cannot open lock file for namespace group %s", group_name);
	}

	// Step 3: remember the name; the group keeps its own copy.
	ns_group->name = strdup(group_name);
	if (ns_group->name == NULL) {
		die("cannot duplicate namespace group name %s", group_name);
	}
	return ns_group;
}
Exemplo n.º 2
0
/**
 * Build a writable "mimic" of a reference directory.
 *
 * A tmpfs is set up at mimic_dir and every entry found in ref_dir is then
 * bind mounted to the entry of the same name under mimic_dir. The (possibly
 * read-only) content of the reference directory keeps its properties while
 * the mimic directory itself is writable, so extra content can be placed
 * next to the mirrored entries.
 *
 * Only directories and regular files are supported; any other entry type is
 * reported with die(). The flags argument is passed unchanged to
 * sc_quirk_mkdir_bind().
 **/
static void sc_quirk_create_writable_mimic(const char *mimic_dir,
					   const char *ref_dir, unsigned flags)
{
	debug("creating writable mimic directory %s based on %s", mimic_dir,
	      ref_dir);
	sc_quirk_setup_tmpfs(mimic_dir);
	debug("bind-mounting all the files from the reference directory");

	DIR *ref_dirp __attribute__ ((cleanup(sc_cleanup_closedir))) =
	    opendir(ref_dir);
	if (ref_dirp == NULL) {
		die("cannot open reference directory %s", ref_dir);
	}

	for (;;) {
		// readdir() returns NULL both at the end of the directory and on
		// error. Clearing errno beforehand lets us tell the two apart.
		errno = 0;
		struct dirent *ent = readdir(ref_dirp);
		if (ent == NULL) {
			if (errno != 0) {
				die("cannot read another directory entry");
			}
			break;
		}

		const char *name = ent->d_name;
		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
			continue;
		}
		if (!(ent->d_type == DT_DIR || ent->d_type == DT_REG)) {
			die("unsupported entry type of file %s (%d)",
			    ent->d_name, ent->d_type);
		}

		char src_name[PATH_MAX * 2];
		char dest_name[PATH_MAX * 2];
		must_snprintf(src_name, sizeof src_name, "%s/%s", ref_dir,
			      name);
		must_snprintf(dest_name, sizeof dest_name, "%s/%s", mimic_dir,
			      name);
		sc_quirk_mkdir_bind(src_name, dest_name, flags);
	}
}
Exemplo n.º 3
0
/**
 * Apply the mount profile that snapd prepared for a security tag.
 *
 * The file /var/lib/snapd/mount/$security_tag.fstab is parsed as a fstab(5)
 * file and each entry found there is mounted. A missing profile is not an
 * error; the function simply returns.
 *
 * Only bind mounts with the 'none' filesystem type are accepted. Each mount
 * is done with bind,ro,nodev,nosuid unless the entry carries the `rw`
 * option, in which case the read-only flag is dropped.
 *
 * This function is called with the rootfs being "consistent" so that it is
 * either the core snap on an all-snap system or the core snap + punched holes
 * on a classic system.
 **/
static void sc_setup_mount_profiles(const char *security_tag)
{
	static const char mount_profile_dir[] = "/var/lib/snapd/mount";

	debug("%s: %s", __func__, security_tag);

	char profile_path[PATH_MAX];
	must_snprintf(profile_path, sizeof(profile_path), "%s/%s.fstab",
		      mount_profile_dir, security_tag);

	debug("opening mount profile %s", profile_path);
	FILE *profile __attribute__ ((cleanup(sc_cleanup_endmntent))) =
	    setmntent(profile_path, "r");
	if (profile == NULL) {
		// A profile that does not exist is fine ...
		if (errno == ENOENT) {
			debug("mount profile %s doesn't exist, ignoring",
			      profile_path);
			return;
		}
		// ... but anything else is a genuine failure.
		die("cannot open %s", profile_path);
	}

	for (struct mntent * entry = getmntent(profile); entry != NULL;
	     entry = getmntent(profile)) {
		debug("read mount entry\n"
		      "\tmnt_fsname: %s\n"
		      "\tmnt_dir: %s\n"
		      "\tmnt_type: %s\n"
		      "\tmnt_opts: %s\n"
		      "\tmnt_freq: %d\n"
		      "\tmnt_passno: %d",
		      entry->mnt_fsname, entry->mnt_dir, entry->mnt_type,
		      entry->mnt_opts, entry->mnt_freq, entry->mnt_passno);

		// Start from the most restrictive set and relax on request.
		int mount_flags = MS_BIND | MS_RDONLY | MS_NODEV | MS_NOSUID;
		debug("initial flags are: bind,ro,nodev,nosuid");

		const int type_is_none = (strcmp(entry->mnt_type, "none") == 0);
		if (!type_is_none) {
			die("cannot honor mount profile, only 'none' filesystem type is supported");
		}
		if (hasmntopt(entry, "bind") == NULL) {
			die("cannot honor mount profile, the bind mount flag is mandatory");
		}
		if (hasmntopt(entry, "rw") != NULL) {
			mount_flags &= ~MS_RDONLY;
		}

		const int mount_rc = mount(entry->mnt_fsname, entry->mnt_dir,
					   NULL, mount_flags, NULL);
		if (mount_rc != 0) {
			die("cannot mount %s at %s with options %s",
			    entry->mnt_fsname, entry->mnt_dir,
			    entry->mnt_opts);
		}
	}
}
Exemplo n.º 4
0
/**
 * Apply the quirks that involve /var/lib.
 *
 * The sequence is:
 *  1. move the /var/lib/snapd mount to a temporary directory under /tmp,
 *  2. turn /var/lib into a writable mimic of /var/lib from the inner core
 *     mount point,
 *  3. unmount whatever the mimic placed on /var/lib/snapd and move the
 *     original mount back,
 *  4. remove the temporary directory,
 *  5. run sc_setup_lxd_quirk().
 *
 * Every failing step is reported with die().
 **/
void sc_setup_quirks()
{
	static const char snapd_dir[] = "/var/lib/snapd";
	static const char var_lib_dir[] = "/var/lib";

	// /var/lib/snapd is essential, park it under /tmp for the time being.
	char parking_dir[] = "/tmp/snapd.quirks_XXXXXX";
	if (mkdtemp(parking_dir) == NULL) {
		die("cannot create temporary directory for /var/lib/snapd mount point");
	}
	debug("performing operation: mount --move %s %s", snapd_dir,
	      parking_dir);
	if (mount(snapd_dir, parking_dir, NULL, MS_MOVE, NULL) != 0) {
		die("cannot perform operation: mount --move %s %s",
		    snapd_dir, parking_dir);
	}

	// Replace /var/lib with a mimic of the pristine /var/lib of the core snap.
	char core_var_lib[PATH_MAX];
	must_snprintf(core_var_lib, sizeof core_var_lib, "%s/var/lib",
		      sc_get_inner_core_mount_point());
	const unsigned mimic_flags =
	    MS_RDONLY | MS_REC | MS_SLAVE | MS_NODEV | MS_NOSUID;
	sc_quirk_create_writable_mimic(var_lib_dir, core_var_lib, mimic_flags);

	// Bring the original /var/lib/snapd back to where it used to be.
	debug("performing operation: umount %s", snapd_dir);
	if (umount(snapd_dir) != 0) {
		die("cannot perform operation: umount %s", snapd_dir);
	}
	debug("performing operation: mount --move %s %s", parking_dir,
	      snapd_dir);
	if (mount(parking_dir, snapd_dir, NULL, MS_MOVE, NULL) != 0) {
		die("cannot perform operation: mount --move %s %s", parking_dir,
		    snapd_dir);
	}

	// The parking spot is no longer needed.
	debug("performing operation: rmdir %s", parking_dir);
	if (rmdir(parking_dir) != 0) {
		die("cannot perform operation: rmdir %s", parking_dir);
	}

	// We are now ready to apply any quirks that relate to /var/lib
	sc_setup_lxd_quirk();
}
Exemplo n.º 5
0
// TODO: simplify this, after all it is just a tmpfs
// TODO: fold this into bootstrap
/**
 * Give the calling process a private /tmp directory.
 *
 * A unique base directory named after the real user ID and security_tag is
 * created under /tmp with mkdtemp(). A "tmp" sub-directory with mode 1777 is
 * created inside it and bind mounted over /tmp. The mount is then made
 * private, /tmp is chown()ed to the real user and group of the caller, and
 * both TMPDIR and TEMPDIR are set to "/tmp".
 *
 * The current working directory is changed to "/" around the mount calls and
 * restored afterwards. Every failure is reported with die().
 **/
static void setup_private_mount(const char *security_tag)
{
	uid_t uid = getuid();
	gid_t gid = getgid();
	char tmpdir[MAX_BUF] = { 0 };

	// Create a 0700 base directory, this is the base dir that is
	// protected from other users.
	//
	// Under that basedir, we put a 1777 /tmp dir that is then bind
	// mounted for the applications to use
	//
	// uid_t is an unsigned type, format it as such instead of using %d.
	must_snprintf(tmpdir, sizeof(tmpdir), "/tmp/snap.%u_%s_XXXXXX",
		      (unsigned)uid, security_tag);
	if (mkdtemp(tmpdir) == NULL) {
		die("cannot create temporary directory essential for private /tmp");
	}
	// Append "/tmp" to the base directory in place. Writing past the
	// existing terminator avoids both an overlapping snprintf() and a
	// temporary heap copy of the path.
	size_t base_len = strlen(tmpdir);
	must_snprintf(tmpdir + base_len, sizeof(tmpdir) - base_len, "%s",
		      "/tmp");

	// now we create a 1777 /tmp inside our private dir; the umask is
	// cleared so that the requested mode is applied verbatim
	mode_t old_mask = umask(0);
	if (mkdir(tmpdir, 01777) != 0) {
		die("cannot create temporary directory for private /tmp");
	}
	umask(old_mask);

	// chdir to '/' since the mount won't apply to the current directory
	char *pwd = get_current_dir_name();
	if (pwd == NULL) {
		die("cannot get current working directory");
	}
	if (chdir("/") != 0) {
		die("cannot change directory to '/'");
	}
	// MS_BIND is there from linux 2.4
	if (mount(tmpdir, "/tmp", NULL, MS_BIND, NULL) != 0) {
		die("cannot bind mount private /tmp");
	}
	// MS_PRIVATE needs linux > 2.6.11
	if (mount("none", "/tmp", NULL, MS_PRIVATE, NULL) != 0) {
		die("cannot change sharing on /tmp to make it private");
	}
	// do the chown after the bind mount to avoid potential shenanigans
	if (chown("/tmp/", uid, gid) < 0) {
		die("cannot change ownership of /tmp");
	}
	// chdir to original directory
	if (chdir(pwd) != 0) {
		die("cannot change current working directory to the original directory");
	}
	free(pwd);

	// ensure we set the various TMPDIRs to our newly created tmpdir
	const char *tmpd[] = { "TMPDIR", "TEMPDIR", NULL };
	for (size_t i = 0; tmpd[i] != NULL; i++) {
		if (setenv(tmpd[i], "/tmp", 1) != 0) {
			die("cannot set environment variable '%s'", tmpd[i]);
		}
	}
}
Exemplo n.º 6
0
/**
 * Bootstrap mount namespace.
 *
 * This is a chunk of tricky code that lets us have full control over the
 * layout and direction of propagation of mount events. The documentation below
 * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source
 * tree.
 *
 * As a reminder two definitions are quoted below:
 *
 *  A 'propagation event' is defined as event generated on a vfsmount
 *  that leads to mount or unmount actions in other vfsmounts.
 *
 *  A 'peer group' is defined as a group of vfsmounts that propagate
 *  events to each other.
 *
 * (end of quote).
 *
 * The main idea is to setup a mount namespace that has a root filesystem with
 * vfsmounts and peer groups that, depending on the location, either isolate
 * or share with the rest of the system.
 *
 * The vast majority of the filesystem is shared in one direction. Events from
 * the outside (from the main mount namespace) propagate inside (to namespaces
 * of particular snaps) so things like new snap revisions, mounted drives, etc,
 * just show up as expected but even if a snap is exploited or malicious in
 * nature it cannot affect anything in another namespace where it might cause
 * security or stability issues.
 *
 * Selected directories (today just /media) can be shared in both directions.
 * This allows snaps with sufficient privileges to either create, through the
 * mount system call, additional mount points that are visible by the rest of
 * the system (both the main mount namespace and namespaces of individual
 * snaps) or remove them, through the unmount system call.
 *
 * The following fields of config are used:
 *  - rootfs_dir: directory that is recursively bind mounted to become the
 *    new root filesystem,
 *  - mounts: array of host directories to bind mount into the new root,
 *    terminated by an entry whose path is NULL; each entry carries an
 *    is_bidirectional flag,
 *  - on_classic: when set, sc_mount_nvidia_driver() is called on the scratch
 *    directory before pivot_root.
 *
 * Every failing operation is reported with die().
 **/
static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config)
{
	char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX";
	char src[PATH_MAX];
	char dst[PATH_MAX];
	if (mkdtemp(scratch_dir) == NULL) {
		die("cannot create temporary directory for the root file system");
	}
	// NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new
	// mount namespace and have a private list of mounts.
	debug("scratch directory for constructing namespace: %s", scratch_dir);
	// Make the root filesystem recursively shared. This way propagation events
	// will be shared with main mount namespace.
	debug("performing operation: mount --make-rshared /");
	if (mount("none", "/", NULL, MS_REC | MS_SHARED, NULL) < 0) {
		die("cannot perform operation: mount --make-rshared /");
	}
	// Bind mount the temporary scratch directory for root filesystem over
	// itself so that it is a mount point. This is done so that it can become
	// unbindable as explained below.
	debug("performing operation: mount --bind %s %s", scratch_dir,
	      scratch_dir);
	if (mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL) < 0) {
		die("cannot perform operation: mount --bind %s %s", scratch_dir,
		    scratch_dir);
	}
	// Make the scratch directory unbindable.
	//
	// This is necessary as otherwise a mount loop can occur and the kernel
	// would crash. The term unbindable simply states that it cannot be bind
	// mounted anywhere. When we construct recursive bind mounts below this
	// guarantees that this directory will not be replicated anywhere.
	debug("performing operation: mount --make-unbindable %s", scratch_dir);
	if (mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL) < 0) {
		die("cannot perform operation: mount --make-unbindable %s",
		    scratch_dir);
	}
	// Recursively bind mount desired root filesystem directory over the
	// scratch directory. This puts the initial content into the scratch space
	// and serves as a foundation for all subsequent operations below.
	//
	// The mount is recursive because it can either be applied to the root
	// filesystem of a core system (aka all-snap) or the core snap on a classic
	// system. In the former case we need recursive bind mounts to accurately
	// replicate the state of the root filesystem into the scratch directory.
	debug("performing operation: mount --rbind %s %s", config->rootfs_dir,
	      scratch_dir);
	if (mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND, NULL)
	    < 0) {
		die("cannot perform operation: mount --rbind %s %s",
		    config->rootfs_dir, scratch_dir);
	}
	// Make the scratch directory recursively slave. Mount events that
	// originate there will not be propagated out to any peer group. This
	// effectively detaches us from the original namespace and, coupled with
	// pivot_root below, serves as the foundation of the mount sandbox.
	debug("performing operation: mount --make-rslave %s", scratch_dir);
	if (mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL) < 0) {
		die("cannot perform operation: mount --make-rslave %s",
		    scratch_dir);
	}
	// Bind mount certain directories from the host filesystem to the scratch
	// directory. By default mount events will propagate both into and out of
	// the peer group. This way the running application can alter any global
	// state visible on the host and in other snaps. This can be restricted by
	// disabling the "is_bidirectional" flag as can be seen below.
	for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL;
	     mnt++) {
		must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, mnt->path);
		debug("performing operation: mount --rbind %s %s", mnt->path,
		      dst);
		if (mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL) < 0) {
			die("cannot perform operation: mount --rbind %s %s",
			    mnt->path, dst);
		}
		if (!mnt->is_bidirectional) {
			// Mount events will only propagate inwards to the namespace. This
			// way the running application cannot alter any global state apart
			// from that of its own snap.
			debug("performing operation: mount --make-rslave %s",
			      dst);
			if (mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL) !=
			    0) {
				die("cannot perform operation: mount --make-rslave %s", dst);
			}
		}
	}
	// Since we mounted /etc from the host filesystem to the scratch directory,
	// we may need to put /etc/alternatives from the desired root filesystem
	// (e.g. the core snap) back. This way the behavior of running snaps is not
	// affected by the alternatives directory from the host, if one exists.
	//
	// https://bugs.launchpad.net/snap-confine/+bug/1580018
	const char *etc_alternatives = "/etc/alternatives";
	if (access(etc_alternatives, F_OK) == 0) {
		must_snprintf(src, sizeof src, "%s%s", config->rootfs_dir,
			      etc_alternatives);
		must_snprintf(dst, sizeof dst, "%s%s", scratch_dir,
			      etc_alternatives);
		debug("performing operation: mount --bind %s %s", src, dst);
		if (mount(src, dst, NULL, MS_BIND, NULL) != 0) {
			die("cannot perform operation: mount --bind %s %s", src,
			    dst);
		}
		debug("performing operation: mount --make-slave %s", dst);
		if (mount("none", dst, NULL, MS_SLAVE, NULL) != 0) {
			die("cannot perform operation: mount --make-slave %s",
			    dst);
		}
	}
	// Bind mount the directory where all snaps are mounted. The location of
	// this directory on the host filesystem may not match the location in
	// the desired root filesystem. In the "core" and "ubuntu-core" snaps the
	// directory is always /snap. On the host it is a build-time configuration
	// option stored in SNAP_MOUNT_DIR.
	must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir);
	debug("performing operation: mount --rbind %s %s", SNAP_MOUNT_DIR, dst);
	if (mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC | MS_SLAVE, NULL)
	    < 0) {
		die("cannot perform operation: mount --rbind -o slave %s %s",
		    SNAP_MOUNT_DIR, dst);
	}
	debug("performing operation: mount --make-rslave %s", dst);
	if (mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL) < 0) {
		die("cannot perform operation: mount --make-rslave %s", dst);
	}
	// Create the hostfs directory if one is missing. This directory is a part
	// of packaging now so perhaps this code can be removed later.
	if (access(SC_HOSTFS_DIR, F_OK) != 0) {
		debug("creating missing hostfs directory");
		if (mkdir(SC_HOSTFS_DIR, 0755) != 0) {
			die("cannot perform operation: mkdir %s",
			    SC_HOSTFS_DIR);
		}
	}
	// Make the upcoming "put_old" directory for pivot_root private so that
	// mount events don't propagate to any peer group. In practice pivot root
	// has a number of undocumented requirements and one of them is that the
	// "put_old" directory (the second argument) cannot be shared in any way.
	must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR);
	debug("performing operation: mount --bind %s %s", dst, dst);
	if (mount(dst, dst, NULL, MS_BIND, NULL) < 0) {
		die("cannot perform operation: mount --bind %s %s", dst, dst);
	}
	debug("performing operation: mount --make-private %s", dst);
	if (mount("none", dst, NULL, MS_PRIVATE, NULL) < 0) {
		die("cannot perform operation: mount --make-private %s", dst);
	}
	// On classic mount the nvidia driver. Ideally this would be done in an
	// uniform way after pivot_root but this is good enough and requires fewer
	// code changes, as the nvidia code assumes it has access to the existing
	// pre-pivot filesystem.
	if (config->on_classic) {
		sc_mount_nvidia_driver(scratch_dir);
	}
	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
	//                    pivot_root
	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
	// Use pivot_root to "chroot" into the scratch directory.
	//
	// Q: Why are we using something as esoteric as pivot_root(2)?
	// A: Because this makes apparmor handling easy. Using a normal chroot
	// makes all apparmor rules conditional.  We are either running on an
	// all-snap system where this would-be chroot didn't happen and all the
	// rules see / as the root file system _OR_ we are running on top of a
	// classic distribution and this chroot has now moved all paths to
	// /tmp/snap.rootfs_*.
	//
	// Because we are using unshare(2) with CLONE_NEWNS we can essentially use
	// pivot_root just like chroot but this makes apparmor unaware of the old
	// root so everything works okay.
	//
	// HINT: If you are debugging this and are trying to see why pivot_root
	// happens to return EINVAL with any changes you may be making, please
	// consider applying
	// misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your
	// kernel tree.
	debug("performing operation: pivot_root %s %s", scratch_dir, dst);
	if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) {
		die("cannot perform operation: pivot_root %s %s", scratch_dir,
		    dst);
	}
	// Unmount the self-bind mount over the scratch directory created earlier
	// in the original root filesystem (which is now mounted on SC_HOSTFS_DIR).
	// This way we can remove the temporary directory we created and "clean up"
	// after ourselves nicely.
	must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir);
	debug("performing operation: umount %s", dst);
	if (umount2(dst, 0) < 0) {
		die("cannot perform operation: umount %s", dst);
	}
	// Remove the scratch directory. Note that we are using the path that is
	// based on the old root filesystem as after pivot_root we cannot guarantee
	// what is present at the same location normally. (It is probably an empty
	// /tmp directory that is populated in another place).
	//
	// The path passed to rmdir() must therefore be dst (the hostfs-based
	// path that was just unmounted), not the bare scratch_dir, which would
	// only resolve if /tmp happens to be among the bind-mounted directories.
	debug("performing operation: rmdir %s", dst);
	if (rmdir(dst) < 0) {
		die("cannot perform operation: rmdir %s", dst);
	}
	// Make the old root filesystem recursively slave. This way operations
	// performed in this mount namespace will not propagate to the peer group.
	// This is another essential part of the confinement system.
	debug("performing operation: mount --make-rslave %s", SC_HOSTFS_DIR);
	if (mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL) < 0) {
		die("cannot perform operation: mount --make-rslave %s",
		    SC_HOSTFS_DIR);
	}
	// Detach the redundant hostfs version of sysfs since it shows up in the
	// mount table and software inspecting the mount table may become confused
	// (eg, docker and LP:# 162601).
	must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR);
	debug("performing operation: umount --lazy %s", src);
	if (umount2(src, UMOUNT_NOFOLLOW | MNT_DETACH) < 0) {
		die("cannot perform operation: umount --lazy %s", src);
	}
	// Detach the redundant hostfs version of /dev since it shows up in the
	// mount table and software inspecting the mount table may become confused.
	must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR);
	debug("performing operation: umount --lazy %s", src);
	if (umount2(src, UMOUNT_NOFOLLOW | MNT_DETACH) < 0) {
		die("cannot perform operation: umount --lazy %s", src);
	}
	// Detach the redundant hostfs version of /proc since it shows up in the
	// mount table and software inspecting the mount table may become confused.
	must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR);
	debug("performing operation: umount --lazy %s", src);
	if (umount2(src, UMOUNT_NOFOLLOW | MNT_DETACH) < 0) {
		die("cannot perform operation: umount --lazy %s", src);
	}
}
Exemplo n.º 7
0
void sc_create_or_join_ns_group(struct sc_ns_group *group,
				struct sc_apparmor *apparmor)
{
	// Open the mount namespace file.
	char mnt_fname[PATH_MAX];
	must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name,
		      SC_NS_MNT_FILE);
	int mnt_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1;
	// NOTE: There is no O_EXCL here because the file can be around but
	// doesn't have to be a mounted namespace.
	//
	// If the mounted namespace is discarded with
	// sc_discard_preserved_ns_group() it will revert to a regular file.  If
	// snap-confine is killed for whatever reason after the file is created but
	// before the file is bind-mounted it will also be a regular file.
	mnt_fd =
	    openat(group->dir_fd, mnt_fname,
		   O_CREAT | O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
	if (mnt_fd < 0) {
		die("cannot open mount namespace file for namespace group %s",
		    group->name);
	}
	// Check if we got an nsfs-based file or a regular file. This can be
	// reliably tested because nsfs has an unique filesystem type NSFS_MAGIC.
	// We can just ensure that this is the case thanks to fstatfs.
	struct statfs buf;
	if (fstatfs(mnt_fd, &buf) < 0) {
		die("cannot perform fstatfs() on an mount namespace file descriptor");
	}
#ifndef NSFS_MAGIC
// Account for kernel headers old enough to not know about NSFS_MAGIC.
#define NSFS_MAGIC 0x6e736673
#endif
	if (buf.f_type == NSFS_MAGIC) {
		char *vanilla_cwd __attribute__ ((cleanup(sc_cleanup_string))) =
		    NULL;
		vanilla_cwd = get_current_dir_name();
		if (vanilla_cwd == NULL) {
			die("cannot get the current working directory");
		}
		debug
		    ("attempting to re-associate the mount namespace with the namespace group %s",
		     group->name);
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot re-associate the mount namespace with namespace group %s", group->name);
		}
		debug
		    ("successfully re-associated the mount namespace with the namespace group %s",
		     group->name);
		// Try to re-locate back to vanilla working directory. This can fail
		// because that directory is no longer present.
		if (chdir(vanilla_cwd) != 0) {
			debug
			    ("cannot remain in %s, moving to the void directory",
			     vanilla_cwd);
			if (chdir(SC_VOID_DIR) != 0) {
				die("cannot change directory to %s",
				    SC_VOID_DIR);
			}
			debug("successfully moved to %s", SC_VOID_DIR);
		}
		return;
	}
Exemplo n.º 8
0
/**
 * Build a seccomp filter context from the profile named filter_profile.
 *
 * The profile is read from filter_profile_dir, which is replaced by the value
 * of SNAPPY_LAUNCHER_SECCOMP_PROFILE_DIR when secure_getenv() returns one.
 * Each relevant line of the profile becomes an SCMP_ACT_ALLOW rule; the
 * default action of the context is SCMP_ACT_KILL.
 *
 * Returns the prepared context, or NULL when the preprocessed profile is
 * marked as unrestricted or complain (in which case the context is released).
 * Failures are reported with die().
 **/
scmp_filter_ctx sc_prepare_seccomp_context(const char *filter_profile)
{
	debug("preparing seccomp profile associated with security tag %s",
	      filter_profile);

	// initialize hsearch map
	sc_map_init();

	scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_KILL);
	if (ctx == NULL) {
		errno = ENOMEM;
		die("seccomp_init() failed");
	}
	// Setup native arch and any compatibility archs
	sc_add_seccomp_archs(ctx);

	// Disable NO_NEW_PRIVS because it interferes with exec transitions in
	// AppArmor. Unfortunately this means that security policies must be
	// very careful to not allow the following otherwise apps can escape
	// the sandbox:
	//   - seccomp syscall
	//   - prctl with PR_SET_SECCOMP
	//   - ptrace (trace) in AppArmor
	//   - capability sys_admin in AppArmor
	// Note that with NO_NEW_PRIVS disabled, CAP_SYS_ADMIN is required to
	// change the seccomp sandbox.
	uid_t real_uid, effective_uid, saved_uid;
	if (getresuid(&real_uid, &effective_uid, &saved_uid) != 0) {
		die("could not find user IDs");
	}
	// If running privileged or capable of raising, disable nnp
	const int can_be_root =
	    (real_uid == 0 || effective_uid == 0 || saved_uid == 0);
	if (can_be_root && seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0) != 0) {
		die("Cannot disable nnp");
	}
	// Note that secure_getenv will always return NULL when suid, so
	// SNAPPY_LAUNCHER_SECCOMP_PROFILE_DIR can't be (ab)used in that case.
	char *dir_override =
	    secure_getenv("SNAPPY_LAUNCHER_SECCOMP_PROFILE_DIR");
	if (dir_override != NULL) {
		filter_profile_dir = dir_override;
	}

	char profile_path[512];	// arbitrary path name limit
	must_snprintf(profile_path, sizeof(profile_path), "%s/%s",
		      filter_profile_dir, filter_profile);

	FILE *profile = fopen(profile_path, "r");
	if (profile == NULL) {
		fprintf(stderr, "Can not open %s (%s)\n", profile_path,
			strerror(errno));
		die("aborting");
	}
	// Note, preprocess_filter() die()s on error
	struct preprocess pre;
	preprocess_filter(profile, &pre);

	// FIXME: right now complain mode is the equivalent to unrestricted.
	// We'll want to change this once we seccomp logging is in order.
	if (pre.unrestricted || pre.complain) {
		// No filter at all: drop the context and hand back NULL.
		seccomp_release(ctx);
		ctx = NULL;
	} else {
		char line[SC_MAX_LINE_LENGTH];
		size_t lineno = 0;
		struct seccomp_args sargs;

		while (fgets(line, sizeof(line), profile) != NULL) {
			lineno++;

			// skip policy-irrelevant lines
			if (validate_and_trim_line(line, sizeof(line), lineno)
			    == 0) {
				continue;
			}
			// parse_line() works on a scratch copy so that the
			// original text is still available for error messages.
			char *scratch = strdup(line);
			if (scratch == NULL) {
				die("Out of memory");
			}
			const int parsed = parse_line(scratch, &sargs);
			free(scratch);

			// as this is a syscall whitelist an invalid syscall
			// is ok and the error can be ignored
			if (parsed == PARSE_INVALID_SYSCALL) {
				continue;
			}
			if (parsed != PARSE_OK) {
				die("could not parse line");
			}
			// Try the exact variant first, then fall back to the
			// regular one before giving up.
			int rc = seccomp_rule_add_exact_array(ctx,
							      SCMP_ACT_ALLOW,
							      sargs.syscall_nr,
							      sargs.length,
							      sargs.arg_cmp);
			if (rc != 0) {
				rc = seccomp_rule_add_array(ctx, SCMP_ACT_ALLOW,
							    sargs.syscall_nr,
							    sargs.length,
							    sargs.arg_cmp);
			}
			if (rc != 0) {
				fprintf(stderr,
					"seccomp_rule_add_array failed with %i for '%s'\n",
					rc, line);
				errno = 0;
				die("aborting");
			}
		}
	}

	if (profile != NULL && fclose(profile) != 0) {
		die("could not close seccomp file");
	}
	sc_map_destroy();
	return ctx;
}