int sc_lock(const char *scope) { // Create (if required) and open the lock directory. debug("creating lock directory %s (if missing)", sc_lock_dir); if (sc_nonfatal_mkpath(sc_lock_dir, 0755) < 0) { die("cannot create lock directory %s", sc_lock_dir); } debug("opening lock directory %s", sc_lock_dir); int dir_fd SC_CLEANUP(sc_cleanup_close) = -1; dir_fd = open(sc_lock_dir, O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW); if (dir_fd < 0) { die("cannot open lock directory"); } // Construct the name of the lock file. char lock_fname[PATH_MAX]; sc_must_snprintf(lock_fname, sizeof lock_fname, "%s/%s.lock", sc_lock_dir, scope ? : ""); // Open the lock file and acquire an exclusive lock. debug("opening lock file: %s", lock_fname); int lock_fd = openat(dir_fd, lock_fname, O_CREAT | O_RDWR | O_CLOEXEC | O_NOFOLLOW, 0600); if (lock_fd < 0) { die("cannot open lock file: %s", lock_fname); } sc_enable_sanity_timeout(); debug("acquiring exclusive lock (scope %s)", scope ? : "(global)"); if (flock(lock_fd, LOCK_EX) < 0) { sc_disable_sanity_timeout(); close(lock_fd); die("cannot acquire exclusive lock (scope %s)", scope ? : "(global)"); } else {
// Check that the sanity timeout kills a process that overstays it.
//
// The test re-runs itself in a GLib test subprocess. The child arms the
// sanity timeout and then sleeps for longer than the timeout allows; the
// parent asserts that the child did not exit cleanly (i.e. the timeout
// fired and terminated it).
static void test_sc_enable_sanity_timeout()
{
	if (!g_test_subprocess()) {
		// Parent: spawn the subprocess variant of this test and
		// expect it to fail (be killed by the sanity timeout).
		g_test_trap_subprocess(NULL, 5 * G_USEC_PER_SEC,
				       G_TEST_SUBPROCESS_INHERIT_STDERR);
		g_test_trap_assert_failed();
		return;
	}
	// Child: arm the timeout, then sleep past the deadline.
	sc_enable_sanity_timeout();
	debug("waiting...");
	usleep(4 * G_USEC_PER_SEC);
	debug("woke up");
	sc_disable_sanity_timeout();
}
// Acquire the exclusive flock-based mutex guarding a namespace group.
//
// Requires group->lock_fd to already be an open descriptor for the lock
// file; dies if it is not, or if flock() fails. The sanity timeout is armed
// around the (potentially blocking) flock() call so we cannot wait forever.
void sc_lock_ns_mutex(struct sc_ns_group *group)
{
	int fd = group->lock_fd;
	if (fd < 0) {
		die("precondition failed: we don't have an open file descriptor for the mutex file");
	}
	debug("acquiring exclusive lock for namespace group %s", group->name);
	// Arm the sanity timeout so a stuck lock holder cannot block us forever.
	sc_enable_sanity_timeout();
	if (flock(fd, LOCK_EX) < 0) {
		die("cannot acquire exclusive lock for namespace group %s",
		    group->name);
	}
	sc_disable_sanity_timeout();
	debug("acquired exclusive lock for namespace group %s", group->name);
}
void sc_initialize_ns_groups() { debug("creating namespace group directory %s", sc_ns_dir); if (sc_nonfatal_mkpath(sc_ns_dir, 0755) < 0) { die("cannot create namespace group directory %s", sc_ns_dir); } debug("opening namespace group directory %s", sc_ns_dir); int dir_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1; dir_fd = open(sc_ns_dir, O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW); if (dir_fd < 0) { die("cannot open namespace group directory"); } debug("opening lock file for group directory"); int lock_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1; lock_fd = openat(dir_fd, SC_NS_LOCK_FILE, O_CREAT | O_RDWR | O_CLOEXEC | O_NOFOLLOW, 0600); if (lock_fd < 0) { die("cannot open lock file for namespace group directory"); } debug("locking the namespace group directory"); sc_enable_sanity_timeout(); if (flock(lock_fd, LOCK_EX) < 0) { die("cannot acquire exclusive lock for namespace group directory"); } sc_disable_sanity_timeout(); if (!sc_is_ns_group_dir_private()) { debug ("bind mounting the namespace group directory over itself"); if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL) < 0) { die("cannot bind mount namespace group directory over itself"); } debug ("making the namespace group directory mount point private"); if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) { die("cannot make the namespace group directory mount point private"); } } else { debug ("namespace group directory does not require intialization"); } debug("unlocking the namespace group directory"); if (flock(lock_fd, LOCK_UN) < 0) { die("cannot release lock for namespace control directory"); } }
/**
 * Join a preserved mount namespace for the group, or prepare a fresh one.
 *
 * If the group's namespace file is already a bind-mounted namespace
 * (detected via nsfs/procfs filesystem magic) this process setns()-joins it
 * and returns 0. If the preserved namespace is stale and was discarded,
 * EAGAIN is returned and the caller should retry. Otherwise a capture
 * helper child is forked, this process unshares a new mount namespace and
 * group->should_populate is set so the caller populates it; the child later
 * bind-mounts /proc/<parent>/ns/mnt over the namespace file to preserve it
 * (triggered via group->event_fd).
 *
 * Dies on any unrecoverable error.
 **/
int sc_create_or_join_ns_group(struct sc_ns_group *group,
			       struct sc_apparmor *apparmor,
			       const char *base_snap_name,
			       const char *snap_name)
{
	// Open the mount namespace file.
	char mnt_fname[PATH_MAX] = { 0 };
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name,
			 SC_NS_MNT_FILE);
	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
	// NOTE: There is no O_EXCL here because the file can be around but
	// doesn't have to be a mounted namespace.
	//
	// If the mounted namespace is discarded with
	// sc_discard_preserved_ns_group() it will revert to a regular file. If
	// snap-confine is killed for whatever reason after the file is created
	// but before the file is bind-mounted it will also be a regular file.
	mnt_fd = openat(group->dir_fd, mnt_fname,
			O_CREAT | O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
	if (mnt_fd < 0) {
		die("cannot open mount namespace file for namespace group %s",
		    group->name);
	}
	// Check if we got an nsfs-based or procfs file or a regular file. This
	// can be reliably tested because nsfs has an unique filesystem type
	// NSFS_MAGIC. On older kernels that don't support nsfs yet we can look
	// for PROC_SUPER_MAGIC instead.
	struct statfs ns_statfs_buf;
	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
		die("cannot perform fstatfs() on the mount namespace file descriptor");
	}
	// Stat the mount namespace as well; the inspection helper uses this
	// when deciding whether a stale namespace can be discarded.
	struct stat ns_stat_buf;
	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
		die("cannot perform fstat() on the mount namespace file descriptor");
	}
#ifndef NSFS_MAGIC
	// Account for kernel headers old enough to not know about NSFS_MAGIC.
#define NSFS_MAGIC 0x6e736673
#endif
	if (ns_statfs_buf.f_type == NSFS_MAGIC
	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
		// Inspect and perhaps discard the preserved mount namespace.
		if (sc_inspect_and_maybe_discard_stale_ns
		    (mnt_fd, snap_name, base_snap_name) == EAGAIN) {
			// The stale namespace was discarded; ask the caller
			// to retry so a fresh namespace gets created.
			return EAGAIN;
		}
		// Remember the vanilla working directory so that we may
		// attempt to restore it later (setns may invalidate our cwd).
		char *vanilla_cwd SC_CLEANUP(sc_cleanup_string) = NULL;
		vanilla_cwd = get_current_dir_name();
		if (vanilla_cwd == NULL) {
			die("cannot get the current working directory");
		}
		// Move to the mount namespace of the snap we're trying to start.
		debug("attempting to re-associate the mount namespace with the namespace group %s",
		      group->name);
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot re-associate the mount namespace with namespace group %s",
			    group->name);
		}
		debug("successfully re-associated the mount namespace with the namespace group %s",
		      group->name);
		// Try to re-locate back to vanilla working directory. This can
		// fail because that directory is no longer present in the
		// joined namespace; fall back to the void directory.
		if (chdir(vanilla_cwd) != 0) {
			debug("cannot remain in %s, moving to the void directory",
			      vanilla_cwd);
			if (chdir(SC_VOID_DIR) != 0) {
				die("cannot change directory to %s",
				    SC_VOID_DIR);
			}
			debug("successfully moved to %s", SC_VOID_DIR);
		}
		return 0;
	}
	debug("initializing new namespace group %s", group->name);
	// Create a new namespace and ask the caller to populate it.
	// For rationale of forking see this:
	// https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
	//
	// The eventfd created here is used to synchronize the child and the
	// parent processes. It effectively tells the child to perform the
	// capture operation.
	group->event_fd = eventfd(0, EFD_CLOEXEC);
	if (group->event_fd < 0) {
		die("cannot create eventfd for mount namespace capture");
	}
	debug("forking support process for mount namespace capture");
	// Store the PID of the "parent" process. This is done instead of calls
	// to getppid() because then we can reliably track the PID of the
	// parent even if the child process is re-parented.
	pid_t parent = getpid();
	// Glibc defines pid as a signed 32bit integer. There's no standard way
	// to print pid's portably so casting to int is the best we can do.
	pid_t pid = fork();
	debug("forked support process has pid %d", (int)pid);
	if (pid < 0) {
		die("cannot fork support process for mount namespace capture");
	}
	if (pid == 0) {
		// This is the child process which will capture the mount
		// namespace.
		//
		// It will do so by bind-mounting the SC_NS_MNT_FILE after the
		// parent process calls unshare() and finishes setting up the
		// namespace completely.
		// Change the hat to a sub-profile that has limited permissions
		// necessary to accomplish the capture of the mount namespace.
		debug("changing apparmor hat of the support process for mount namespace capture");
		sc_maybe_aa_change_hat(apparmor,
				       "mount-namespace-capture-helper", 0);
		// Configure the child to die as soon as the parent dies. In an
		// odd case where the parent is killed then we don't want to
		// complete our task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the
		// case then we can *almost* reliably rely on the
		// PR_SET_PDEATHSIG signal to wake us up from eventfd_read()
		// below. In the rare case that the PID numbers overflow and
		// the now-dead parent PID is recycled we will still hang
		// forever on the read from eventfd below.
		debug("ensuring that parent process is still alive");
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has already terminated");
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}
		if (fchdir(group->dir_fd) < 0) {
			die("cannot move process for mount namespace capture to namespace group directory");
		}
		// Block until the parent signals (via eventfd) that the new
		// namespace is fully set up and ready to be captured.
		debug("waiting for a eventfd data from the parent process to continue");
		eventfd_t value = 0;
		sc_enable_sanity_timeout();
		if (eventfd_read(group->event_fd, &value) < 0) {
			die("cannot read expected data from eventfd");
		}
		sc_disable_sanity_timeout();
		// Preserve the parent's mount namespace by bind-mounting its
		// /proc/<pid>/ns/mnt entry over the namespace file.
		debug("capturing mount namespace of process %d in namespace group %s",
		      (int)parent, group->name);
		char src[PATH_MAX] = { 0 };
		char dst[PATH_MAX] = { 0 };
		sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt",
				 (int)parent);
		sc_must_snprintf(dst, sizeof dst, "%s%s", group->name,
				 SC_NS_MNT_FILE);
		if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
			die("cannot bind-mount the mount namespace file %s -> %s",
			    src, dst);
		}
		debug("successfully captured mount namespace in namespace group %s",
		      group->name);
		exit(0);
	} else {
		group->child = pid;
		// Unshare the mount namespace and set a flag instructing the
		// caller that the namespace is pristine and needs to be
		// populated now.
		debug("unsharing the mount namespace");
		if (unshare(CLONE_NEWNS) < 0) {
			die("cannot unshare the mount namespace");
		}
		group->should_populate = true;
	}
	return 0;
}
// The namespace may be stale. To check this we must actually switch into it
// but then we use up our setns call (the kernel misbehaves if we setns
// twice). To work around this we'll fork a child and use it to probe. The
// child will inspect the namespace and send information back via eventfd and
// then exit unconditionally.
//
// Returns 0 when the namespace was kept (up-to-date, or stale but still
// occupied) and EAGAIN when the stale namespace was discarded and the caller
// should retry. Dies on any unrecoverable error.
static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd,
						 const char *snap_name,
						 const char *base_snap_name)
{
	char base_snap_rev[PATH_MAX] = { 0 };
	char fname[PATH_MAX] = { 0 };
	char mnt_fname[PATH_MAX] = { 0 };
	dev_t base_snap_dev;
	int event_fd SC_CLEANUP(sc_cleanup_close) = -1;
	// Read the revision of the base snap by looking at the current symlink.
	sc_must_snprintf(fname, sizeof fname, "%s/%s/current", SNAP_MOUNT_DIR,
			 base_snap_name);
	if (readlink(fname, base_snap_rev, sizeof base_snap_rev) < 0) {
		die("cannot read revision of base snap %s", fname);
	}
	// readlink() does not NUL-terminate; the buffer was zero-initialized,
	// so a non-zero final byte means the link target filled (or would
	// overflow) the buffer.
	if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') {
		die("cannot use symbolic link %s - value is too long", fname);
	}
	// Find the device that is backing the current revision of the base
	// snap.
	base_snap_dev = find_base_snap_device(base_snap_name, base_snap_rev);
	// Check if we are running on classic. Do it here because we will
	// always (seemingly) run on a core system once we are inside a mount
	// namespace.
	bool is_classic = is_running_on_classic_distribution();
	// Store the PID of this process. This is done instead of calls to
	// getppid() below because then we can reliably track the PID of the
	// parent even if the child process is re-parented.
	pid_t parent = getpid();
	// Create an eventfd for the communication with the child.
	event_fd = eventfd(0, EFD_CLOEXEC);
	if (event_fd < 0) {
		die("cannot create eventfd for communication with inspection process");
	}
	// Fork a child, it will do the inspection for us.
	pid_t child = fork();
	if (child < 0) {
		die("cannot fork support process for namespace inspection");
	}
	if (child == 0) {
		// This is the child process which will inspect the mount
		// namespace.
		//
		// Configure the child to die as soon as the parent dies. In an
		// odd case where the parent is killed then we don't want to
		// complete our task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the
		// case then we can *almost* reliably rely on the
		// PR_SET_PDEATHSIG signal to wake us up from eventfd_read()
		// below. In the rare case that the PID numbers overflow and
		// the now-dead parent PID is recycled we will still hang
		// forever on the read from eventfd below.
		debug("ensuring that parent process is still alive");
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has already terminated");
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}
		debug("joining the namespace that we are about to probe");
		// Move to the mount namespace of the snap we're trying to
		// inspect. This consumes our single setns() call.
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot join the mount namespace in order to inspect it");
		}
		// Check if the namespace needs to be discarded.
		//
		// TODO: enable this for core distributions. This is complex
		// because on core the rootfs is mounted in initrd and is _not_
		// changed (no pivot_root) and the base snap is again mounted
		// (2nd time) by systemd. This makes us end up in a situation
		// where the outer base snap will never match the rootfs inside
		// the mount namespace.
		bool should_discard =
		    is_classic ? should_discard_current_ns(base_snap_dev) :
		    false;
		// Send this back to the parent: 2 - discard, 1 - keep.
		// Note that we cannot just use 0 and 1 because of the
		// semantics of eventfd(2): writing zero does not wake a
		// reader.
		debug("sending information about the state of the mount namespace (%s)",
		      should_discard ? "discard" : "keep");
		if (eventfd_write
		    (event_fd,
		     should_discard ? SC_DISCARD_YES : SC_DISCARD_NO) < 0) {
			die("cannot send information about the state of the mount namespace");
		}
		// Exit, we're done.
		debug("support process for mount namespace inspection is about to finish");
		exit(0);
	}
	// This is back in the parent process.
	//
	// Enable a sanity timeout in case the read blocks for unbound amount
	// of time. This will ensure we will not hang around while holding the
	// lock. Next, read the value written by the child process.
	sc_enable_sanity_timeout();
	eventfd_t value = 0;
	debug("receiving information about the state of the mount namespace");
	if (eventfd_read(event_fd, &value) < 0) {
		die("cannot receive information about the state of the mount namespace");
	}
	sc_disable_sanity_timeout();
	// Wait for the child process to exit and collect its exit status.
	errno = 0;
	int status = 0;
	if (waitpid(child, &status, 0) < 0) {
		die("cannot wait for the support process for mount namespace inspection");
	}
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		die("support process for mount namespace inspection exited abnormally");
	}
	// If the namespace is up-to-date then we are done.
	if (value == SC_DISCARD_NO) {
		debug("the mount namespace is up-to-date and can be reused");
		return 0;
	}
	// The namespace is stale, let's check if we can discard it.
	debug("the mount namespace is stale and should be discarded");
	if (sc_cgroup_freezer_occupied(snap_name)) {
		// Some processes are still using the namespace so we cannot
		// discard it as that would fracture the view that the set of
		// processes inside have on what is mounted.
		return 0;
	}
	// The namespace is both stale and empty. We can discard it now.
	debug("discarding stale and empty mount namespace");
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s/%s%s", sc_ns_dir,
			 snap_name, SC_NS_MNT_FILE);
	// Use MNT_DETACH as otherwise we get EBUSY.
	if (umount2(mnt_fname, MNT_DETACH | UMOUNT_NOFOLLOW) < 0) {
		die("cannot umount stale mount namespace %s", mnt_fname);
	}
	debug("stale mount namespace discarded");
	return EAGAIN;
}