Esempio n. 1
0
// Take an exclusive flock() on the lock file that belongs to `scope`.
//
// scope: name of the lock; when NULL an empty string is used for the file
//        name (giving ".lock") and "(global)" is used in log messages.
//
// Every failure visible here is reported through die(). The sanity timeout
// is enabled around the flock() call.
//
// NOTE(review): the remainder of this function (the success branch and the
// return value) is not visible in this excerpt; presumably the open lock file
// descriptor is returned to the caller - confirm against the full source.
int sc_lock(const char *scope)
{
	// Create (if required) and open the lock directory.
	debug("creating lock directory %s (if missing)", sc_lock_dir);
	if (sc_nonfatal_mkpath(sc_lock_dir, 0755) < 0) {
		die("cannot create lock directory %s", sc_lock_dir);
	}
	debug("opening lock directory %s", sc_lock_dir);
	// dir_fd is closed automatically when it goes out of scope.
	int dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
	dir_fd =
	    open(sc_lock_dir, O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW);
	if (dir_fd < 0) {
		die("cannot open lock directory");
	}
	// Construct the name of the lock file. The "a ? : b" form is the GNU
	// conditional-with-omitted-operand extension: it yields `scope` when it
	// is non-NULL and "" otherwise.
	char lock_fname[PATH_MAX];
	sc_must_snprintf(lock_fname, sizeof lock_fname, "%s/%s.lock",
			 sc_lock_dir, scope ? : "");

	// Open the lock file and acquire an exclusive lock.
	//
	// NOTE(review): lock_fname already starts with sc_lock_dir. If that is an
	// absolute path then openat() ignores dir_fd entirely - confirm whether a
	// name relative to dir_fd was intended here.
	debug("opening lock file: %s", lock_fname);
	int lock_fd = openat(dir_fd, lock_fname,
			     O_CREAT | O_RDWR | O_CLOEXEC | O_NOFOLLOW, 0600);
	if (lock_fd < 0) {
		die("cannot open lock file: %s", lock_fname);
	}

	// Bound the time spent waiting for the lock with the sanity timeout.
	sc_enable_sanity_timeout();
	debug("acquiring exclusive lock (scope %s)", scope ? : "(global)");
	if (flock(lock_fd, LOCK_EX) < 0) {
		// Undo the timeout and release the descriptor before bailing out.
		sc_disable_sanity_timeout();
		close(lock_fd);
		die("cannot acquire exclusive lock (scope %s)",
		    scope ? : "(global)");
	} else {
Esempio n. 2
0
// Check that a process which stalls while the sanity timeout is armed fails.
//
// The test runs in two roles. The outer test process re-executes this test
// as a subprocess (allowing it up to five seconds) and asserts that the
// subprocess failed. The subprocess arms the sanity timeout and then sleeps
// for four seconds; presumably the timeout fires before the sleep ends, so
// the "woke up" message and the disable call are not expected to be reached.
static void test_sc_enable_sanity_timeout()
{
	if (!g_test_subprocess()) {
		// Outer role: spawn ourselves as a subprocess and expect failure.
		g_test_trap_subprocess(NULL, 5 * G_USEC_PER_SEC,
				       G_TEST_SUBPROCESS_INHERIT_STDERR);
		g_test_trap_assert_failed();
		return;
	}

	// Subprocess role: stall with the timeout armed.
	sc_enable_sanity_timeout();
	debug("waiting...");
	usleep(4 * G_USEC_PER_SEC);
	debug("woke up");
	sc_disable_sanity_timeout();
}
Esempio n. 3
0
// Acquire the exclusive lock on the mutex file of a namespace group.
//
// group: namespace group whose lock_fd must already be a valid (non-negative)
//        file descriptor; the process dies otherwise.
//
// The blocking flock() call is bracketed by the sanity timeout. Failure to
// obtain the lock is reported through die().
void sc_lock_ns_mutex(struct sc_ns_group *group)
{
	const int mutex_fd = group->lock_fd;

	if (mutex_fd < 0) {
		die("precondition failed: we don't have an open file descriptor for the mutex file");
	}

	debug("acquiring exclusive lock for namespace group %s", group->name);

	sc_enable_sanity_timeout();
	const int rc = flock(mutex_fd, LOCK_EX);
	if (rc < 0) {
		die("cannot acquire exclusive lock for namespace group %s",
		    group->name);
	}
	sc_disable_sanity_timeout();

	debug("acquired exclusive lock for namespace group %s", group->name);
}
Esempio n. 4
0
// Prepare the namespace group directory (sc_ns_dir) for use.
//
// Steps, each of which dies on failure:
//   1. create the directory if it is missing (mode 0755) and open it,
//   2. open (creating if needed) the lock file SC_NS_LOCK_FILE inside it and
//      take an exclusive flock() on it, guarded by the sanity timeout,
//   3. unless sc_is_ns_group_dir_private() reports true, recursively bind
//      mount the directory over itself and mark that mount point private,
//   4. release the lock.
//
// Both file descriptors are closed automatically on scope exit through the
// cleanup attribute.
void sc_initialize_ns_groups()
{
	const int dir_open_flags =
	    O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW;
	const int lock_open_flags = O_CREAT | O_RDWR | O_CLOEXEC | O_NOFOLLOW;

	debug("creating namespace group directory %s", sc_ns_dir);
	if (sc_nonfatal_mkpath(sc_ns_dir, 0755) < 0) {
		die("cannot create namespace group directory %s", sc_ns_dir);
	}

	debug("opening namespace group directory %s", sc_ns_dir);
	int dir_fd __attribute__ ((cleanup(sc_cleanup_close))) =
	    open(sc_ns_dir, dir_open_flags);
	if (dir_fd < 0) {
		die("cannot open namespace group directory");
	}

	debug("opening lock file for group directory");
	int lock_fd __attribute__ ((cleanup(sc_cleanup_close))) =
	    openat(dir_fd, SC_NS_LOCK_FILE, lock_open_flags, 0600);
	if (lock_fd < 0) {
		die("cannot open lock file for namespace group directory");
	}

	// Serialize the initialization against other processes; the sanity
	// timeout bounds the time spent waiting for the lock.
	debug("locking the namespace group directory");
	sc_enable_sanity_timeout();
	if (flock(lock_fd, LOCK_EX) < 0) {
		die("cannot acquire exclusive lock for namespace group directory");
	}
	sc_disable_sanity_timeout();

	if (sc_is_ns_group_dir_private()) {
		debug
		    ("namespace group directory does not require intialization");
	} else {
		// First turn the directory into a mount point of its own...
		debug
		    ("bind mounting the namespace group directory over itself");
		if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL)
		    < 0) {
			die("cannot bind mount namespace group directory over itself");
		}
		// ...then change the propagation of that mount point to private.
		debug
		    ("making the namespace group directory mount point private");
		if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) {
			die("cannot make the namespace group directory mount point private");
		}
	}

	debug("unlocking the namespace group directory");
	if (flock(lock_fd, LOCK_UN) < 0) {
		die("cannot release lock for namespace control directory");
	}
}
Esempio n. 5
0
// Join the preserved mount namespace of a namespace group or, when there is
// none, start creating a fresh one.
//
// group:          namespace group; dir_fd and name are read, and on the
//                 "create" path event_fd, child and should_populate are set.
// apparmor:       passed to sc_maybe_aa_change_hat() in the forked helper.
// base_snap_name: forwarded to sc_inspect_and_maybe_discard_stale_ns().
// snap_name:      forwarded to sc_inspect_and_maybe_discard_stale_ns().
//
// Returns EAGAIN when the stale-namespace inspection returned EAGAIN, and 0
// otherwise. All other failures are reported through die().
//
// Two paths exist:
//  - The mount namespace file is on nsfs/procfs: it is a preserved namespace.
//    After the staleness inspection the process joins it with setns() and
//    tries to return to its previous working directory.
//  - Otherwise: an eventfd is created and a helper process is forked. The
//    helper waits on the eventfd and then bind-mounts the parent's
//    /proc/<pid>/ns/mnt over the mount namespace file. The parent unshares
//    its mount namespace and sets group->should_populate. The write to the
//    eventfd that releases the helper is not performed in this function.
int sc_create_or_join_ns_group(struct sc_ns_group *group,
			       struct sc_apparmor *apparmor,
			       const char *base_snap_name,
			       const char *snap_name)
{
	// Open the mount namespace file.
	char mnt_fname[PATH_MAX] = { 0 };
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name,
			 SC_NS_MNT_FILE);
	// mnt_fd is closed automatically when it goes out of scope.
	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
	// NOTE: There is no O_EXCL here because the file can be around but
	// doesn't have to be a mounted namespace.
	//
	// If the mounted namespace is discarded with
	// sc_discard_preserved_ns_group() it will revert to a regular file.  If
	// snap-confine is killed for whatever reason after the file is created but
	// before the file is bind-mounted it will also be a regular file.
	mnt_fd = openat(group->dir_fd, mnt_fname,
			O_CREAT | O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
	if (mnt_fd < 0) {
		die("cannot open mount namespace file for namespace group %s",
		    group->name);
	}
	// Check if we got an nsfs-based or procfs file or a regular file. This can
	// be reliably tested because nsfs has an unique filesystem type
	// NSFS_MAGIC.  On older kernels that don't support nsfs yet we can look
	// for PROC_SUPER_MAGIC instead.
	// We can just ensure that this is the case thanks to fstatfs.
	struct statfs ns_statfs_buf;
	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
		die("cannot perform fstatfs() on the mount namespace file descriptor");
	}
	// Stat the mount namespace as well, this is later used to check if the
	// namespace is used by other processes if we are considering discarding a
	// stale namespace.
	// NOTE(review): ns_stat_buf is filled in here but not read anywhere else
	// in this function.
	struct stat ns_stat_buf;
	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
		die("cannot perform fstat() on the mount namespace file descriptor");
	}
#ifndef NSFS_MAGIC
// Account for kernel headers old enough to not know about NSFS_MAGIC.
#define NSFS_MAGIC 0x6e736673
#endif
	if (ns_statfs_buf.f_type == NSFS_MAGIC
	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {

		// Inspect and perhaps discard the preserved mount namespace.
		// EAGAIN is passed straight back to the caller.
		if (sc_inspect_and_maybe_discard_stale_ns
		    (mnt_fd, snap_name, base_snap_name) == EAGAIN) {
			return EAGAIN;
		}
		// Remember the vanilla working directory so that we may attempt to restore it later.
		// The string is released automatically via sc_cleanup_string.
		char *vanilla_cwd SC_CLEANUP(sc_cleanup_string) = NULL;
		vanilla_cwd = get_current_dir_name();
		if (vanilla_cwd == NULL) {
			die("cannot get the current working directory");
		}
		// Move to the mount namespace of the snap we're trying to start.
		debug
		    ("attempting to re-associate the mount namespace with the namespace group %s",
		     group->name);
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot re-associate the mount namespace with namespace group %s", group->name);
		}
		debug
		    ("successfully re-associated the mount namespace with the namespace group %s",
		     group->name);

		// Try to re-locate back to vanilla working directory. This can fail
		// because that directory is no longer present.
		if (chdir(vanilla_cwd) != 0) {
			debug
			    ("cannot remain in %s, moving to the void directory",
			     vanilla_cwd);
			if (chdir(SC_VOID_DIR) != 0) {
				die("cannot change directory to %s",
				    SC_VOID_DIR);
			}
			debug("successfully moved to %s", SC_VOID_DIR);
		}
		return 0;
	}
	debug("initializing new namespace group %s", group->name);
	// Create a new namespace and ask the caller to populate it.
	// For rationale of forking see this:
	// https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
	//
	// The eventfd created here is used to synchronize the child and the parent
	// processes. It effectively tells the child to perform the capture
	// operation.
	group->event_fd = eventfd(0, EFD_CLOEXEC);
	if (group->event_fd < 0) {
		die("cannot create eventfd for mount namespace capture");
	}
	debug("forking support process for mount namespace capture");
	// Store the PID of the "parent" process. This is done instead of calls to
	// getppid() because then we can reliably track the PID of the parent even
	// if the child process is re-parented.
	pid_t parent = getpid();
	// Glibc defines pid as a signed 32bit integer. There's no standard way to
	// print pid's portably so this is the best we can do.
	pid_t pid = fork();
	// NOTE(review): this message is emitted before the error check and in both
	// processes, so it prints -1 when fork() failed and 0 in the child.
	debug("forked support process has pid %d", (int)pid);
	if (pid < 0) {
		die("cannot fork support process for mount namespace capture");
	}
	if (pid == 0) {
		// This is the child process which will capture the mount namespace.
		//
		// It will do so by bind-mounting the SC_NS_MNT_FILE after the parent
		// process calls unshare() and finishes setting up the namespace
		// completely.
		// Change the hat to a sub-profile that has limited permissions
		// necessary to accomplish the capture of the mount namespace.
		debug
		    ("changing apparmor hat of the support process for mount namespace capture");
		sc_maybe_aa_change_hat(apparmor,
				       "mount-namespace-capture-helper", 0);
		// Configure the child to die as soon as the parent dies. In an odd
		// case where the parent is killed then we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the case then
		// we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake
		// us up from eventfd_read() below. In the rare case that the PID numbers
		// overflow and the now-dead parent PID is recycled we will still hang
		// forever on the read from eventfd below.
		debug("ensuring that parent process is still alive");
		// Signal 0 performs only the existence/permission check.
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has already terminated");
				// abort() does not return, so there is no fall-through.
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}
		// Move into the namespace group directory so that the relative
		// destination path used for the bind mount below resolves there.
		if (fchdir(group->dir_fd) < 0) {
			die("cannot move process for mount namespace capture to namespace group directory");
		}
		debug
		    ("waiting for a eventfd data from the parent process to continue");
		eventfd_t value = 0;
		// The read blocks until the eventfd is written to; the sanity
		// timeout bounds that wait.
		sc_enable_sanity_timeout();
		if (eventfd_read(group->event_fd, &value) < 0) {
			die("cannot read expected data from eventfd");
		}
		sc_disable_sanity_timeout();
		debug
		    ("capturing mount namespace of process %d in namespace group %s",
		     (int)parent, group->name);
		char src[PATH_MAX] = { 0 };
		char dst[PATH_MAX] = { 0 };
		sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt",
				 (int)parent);
		sc_must_snprintf(dst, sizeof dst, "%s%s", group->name,
				 SC_NS_MNT_FILE);
		// Bind-mounting the parent's namespace file over dst keeps the
		// namespace referenced after the processes using it are gone.
		if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
			die("cannot bind-mount the mount namespace file %s -> %s", src, dst);
		}
		debug
		    ("successfully captured mount namespace in namespace group %s",
		     group->name);
		exit(0);
	} else {
		// Parent process: remember the helper's PID in the group.
		group->child = pid;
		// Unshare the mount namespace and set a flag instructing the caller that
		// the namespace is pristine and needs to be populated now.
		debug("unsharing the mount namespace");
		if (unshare(CLONE_NEWNS) < 0) {
			die("cannot unshare the mount namespace");
		}
		group->should_populate = true;
	}
	return 0;
}
Esempio n. 6
0
// Inspect a preserved mount namespace and discard it when it is stale and
// no longer in use.
//
// The namespace may be stale. To check this we must actually switch into it
// but then we use up our setns call (the kernel misbehaves if we setns twice).
// To work around this we'll fork a child and use it to probe. The child will
// inspect the namespace and send information back via eventfd and then exit
// unconditionally.
//
// mnt_fd:         open descriptor of the preserved mount namespace file; the
//                 child passes it to setns().
// snap_name:      used for the freezer cgroup check and to build the path of
//                 the mount namespace file that gets unmounted.
// base_snap_name: used to locate the "current" symlink of the base snap and
//                 the device backing that revision.
//
// Returns 0 when the namespace is kept (either it is up-to-date, or it is
// stale but sc_cgroup_freezer_occupied() reports it as occupied) and EAGAIN
// after the stale namespace has been unmounted. Other failures are reported
// through die().
static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd,
						 const char *snap_name,
						 const char *base_snap_name)
{
	char base_snap_rev[PATH_MAX] = { 0 };
	char fname[PATH_MAX] = { 0 };
	char mnt_fname[PATH_MAX] = { 0 };
	dev_t base_snap_dev;
	// event_fd is closed automatically when it goes out of scope.
	int event_fd SC_CLEANUP(sc_cleanup_close) = -1;

	// Read the revision of the base snap by looking at the current symlink.
	sc_must_snprintf(fname, sizeof fname, "%s/%s/current",
			 SNAP_MOUNT_DIR, base_snap_name);
	if (readlink(fname, base_snap_rev, sizeof base_snap_rev) < 0) {
		die("cannot read revision of base snap %s", fname);
	}
	// readlink() does not NUL-terminate its output. The buffer was
	// zero-initialized above, so a non-NUL last byte means the link target
	// filled the whole buffer and may have been truncated.
	if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') {
		die("cannot use symbolic link %s - value is too long", fname);
	}
	// Find the device that is backing the current revision of the base snap.
	base_snap_dev = find_base_snap_device(base_snap_name, base_snap_rev);

	// Check if we are running on classic. Do it here because we will always
	// (seemingly) run on a core system once we are inside a mount namespace.
	bool is_classic = is_running_on_classic_distribution();

	// Store the PID of this process. This is done instead of calls to
	// getppid() below because then we can reliably track the PID of the
	// parent even if the child process is re-parented.
	pid_t parent = getpid();

	// Create an eventfd for the communication with the child.
	event_fd = eventfd(0, EFD_CLOEXEC);
	if (event_fd < 0) {
		die("cannot create eventfd for communication with inspection process");
	}
	// Fork a child, it will do the inspection for us.
	pid_t child = fork();
	if (child < 0) {
		die("cannot fork support process for namespace inspection");
	}

	if (child == 0) {
		// This is the child process which will inspect the mount namespace.
		//
		// Configure the child to die as soon as the parent dies. In an odd
		// case where the parent is killed then we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that the parent process is still alive, since it may have
		// died before PR_SET_PDEATHSIG was configured above. In the rare
		// case that the PID numbers overflow and the now-dead parent PID is
		// recycled this check cannot detect that the parent is gone.
		debug("ensuring that parent process is still alive");
		// Signal 0 performs only the existence/permission check.
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has already terminated");
				// abort() does not return, so there is no fall-through.
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}

		debug("joining the namespace that we are about to probe");
		// Move to the mount namespace of the snap we're trying to inspect.
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot join the mount namespace in order to inspect it");
		}
		// Check if the namespace needs to be discarded.
		//
		// TODO: enable this for core distributions. This is complex because on
		// core the rootfs is mounted in initrd and is _not_ changed (no
		// pivot_root) and the base snap is again mounted (2nd time) by
		// systemd. This makes us end up in a situation where the outer base
		// snap will never match the rootfs inside the mount namespace.
		bool should_discard =
		    is_classic ? should_discard_current_ns(base_snap_dev) :
		    false;

		// Send the verdict back to the parent as SC_DISCARD_YES or
		// SC_DISCARD_NO (presumably 2 and 1 respectively - confirm against
		// their definition).
		// Note that we cannot just use 0 and 1 because of the semantics of
		// eventfd(2): a read blocks while the counter is zero.
		debug
		    ("sending information about the state of the mount namespace (%s)",
		     should_discard ? "discard" : "keep");
		if (eventfd_write
		    (event_fd,
		     should_discard ? SC_DISCARD_YES : SC_DISCARD_NO) < 0) {
			die("cannot send information about the state of the mount namespace");
		}
		// Exit, we're done.
		debug
		    ("support process for mount namespace inspection is about to finish");
		exit(0);
	}
	// This is back in the parent process.
	//
	// Enable a sanity timeout in case the read blocks for unbound amount of
	// time. This will ensure we will not hang around while holding the lock.
	// Next, read the value written by the child process.
	sc_enable_sanity_timeout();
	eventfd_t value = 0;
	debug("receiving information about the state of the mount namespace");
	if (eventfd_read(event_fd, &value) < 0) {
		die("cannot receive information about the state of the mount namespace");
	}
	sc_disable_sanity_timeout();

	// Wait for the child process to exit and collect its exit status.
	// NOTE(review): errno is cleared but never read in this function;
	// presumably this keeps die() below from reporting a stale errno -
	// confirm against the implementation of die().
	errno = 0;
	int status = 0;
	if (waitpid(child, &status, 0) < 0) {
		die("cannot wait for the support process for mount namespace inspection");
	}
	// Anything other than a normal exit with status 0 is fatal.
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		die("support process for mount namespace inspection exited abnormally");
	}
	// If the namespace is up-to-date then we are done.
	if (value == SC_DISCARD_NO) {
		debug("the mount namespace is up-to-date and can be reused");
		return 0;
	}
	// The namespace is stale, let's check if we can discard it.
	debug("the mount namespace is stale and should be discarded");
	if (sc_cgroup_freezer_occupied(snap_name)) {
		// Some processes are still using the namespace so we cannot discard it
		// as that would fracture the view that the set of processes inside
		// have on what is mounted.
		return 0;
	}
	// The namespace is both stale and empty. We can discard it now.
	debug("discarding stale and empty mount namespace");
	sc_must_snprintf(mnt_fname, sizeof mnt_fname,
			 "%s/%s%s", sc_ns_dir, snap_name, SC_NS_MNT_FILE);

	// Use MNT_DETACH as otherwise we get EBUSY.
	if (umount2(mnt_fname, MNT_DETACH | UMOUNT_NOFOLLOW) < 0) {
		die("cannot umount stale mount namespace %s", mnt_fname);
	}
	debug("stale mount namespace discarded");
	// Tell the caller that the namespace was discarded.
	return EAGAIN;
}