/** * Create a writable mimic directory based on reference directory. * * The mimic directory is a tmpfs populated with bind mounts to the (possibly * read only) directories in the reference directory. While all the read-only * content stays read-only the actual mimic directory is writable so additional * content can be placed there. * * Flags are forwarded to sc_quirk_mkdir_bind() **/ static void sc_quirk_create_writable_mimic(const char *mimic_dir, const char *ref_dir, unsigned flags) { debug("creating writable mimic directory %s based on %s", mimic_dir, ref_dir); sc_quirk_setup_tmpfs(mimic_dir); // Now copy the ownership and permissions of the mimicked directory struct stat stat_buf; if (stat(ref_dir, &stat_buf) < 0) { die("cannot stat %s", ref_dir); } if (chown(mimic_dir, stat_buf.st_uid, stat_buf.st_gid) < 0) { die("cannot chown for %s", mimic_dir); } if (chmod(mimic_dir, stat_buf.st_mode) < 0) { die("cannot chmod for %s", mimic_dir); } debug("bind-mounting all the files from the reference directory"); DIR *dirp SC_CLEANUP(sc_cleanup_closedir) = NULL; dirp = opendir(ref_dir); if (dirp == NULL) { die("cannot open reference directory %s", ref_dir); } struct dirent *entryp = NULL; do { char src_name[PATH_MAX * 2] = { 0 }; char dest_name[PATH_MAX * 2] = { 0 }; // Set errno to zero, if readdir fails it will not only return null but // set errno to a non-zero value. This is how we can differentiate // end-of-directory from an actual error. 
errno = 0; entryp = readdir(dirp); if (entryp == NULL && errno != 0) { die("cannot read another directory entry"); } if (entryp == NULL) { break; } if (strcmp(entryp->d_name, ".") == 0 || strcmp(entryp->d_name, "..") == 0) { continue; } if (entryp->d_type != DT_DIR && entryp->d_type != DT_REG) { die("unsupported entry type of file %s (%d)", entryp->d_name, entryp->d_type); } sc_must_snprintf(src_name, sizeof src_name, "%s/%s", ref_dir, entryp->d_name); sc_must_snprintf(dest_name, sizeof dest_name, "%s/%s", mimic_dir, entryp->d_name); sc_quirk_mkdir_bind(src_name, dest_name, flags); } while (entryp != NULL); }
static void validate_bpfpath_is_safe(const char *path) { if (path == NULL || strlen(path) == 0 || path[0] != '/') { die("valid_bpfpath_is_safe needs an absolute path as input"); } // strtok_r() modifies its first argument, so work on a copy char *tokenized SC_CLEANUP(sc_cleanup_string) = NULL; tokenized = strdup(path); if (tokenized == NULL) { die("cannot allocate memory for copy of path"); } // allocate a string large enough to hold path, and initialize it to // '/' size_t checked_path_size = strlen(path) + 1; char *checked_path SC_CLEANUP(sc_cleanup_string) = NULL; checked_path = calloc(checked_path_size, 1); if (checked_path == NULL) { die("cannot allocate memory for checked_path"); } checked_path[0] = '/'; checked_path[1] = '\0'; // validate '/' validate_path_has_strict_perms(checked_path); // strtok_r needs a pointer to keep track of where it is in the // string. char *buf_saveptr = NULL; // reconstruct the path from '/' down to profile_name char *buf_token = strtok_r(tokenized, "/", &buf_saveptr); while (buf_token != NULL) { char *prev SC_CLEANUP(sc_cleanup_string) = NULL; prev = strdup(checked_path); // needed by vsnprintf in sc_must_snprintf if (prev == NULL) { die("cannot allocate memory for copy of checked_path"); } // append '<buf_token>' if checked_path is '/', otherwise '/<buf_token>' if (strlen(checked_path) == 1) { sc_must_snprintf(checked_path, checked_path_size, "%s%s", prev, buf_token); } else { sc_must_snprintf(checked_path, checked_path_size, "%s/%s", prev, buf_token); } validate_path_has_strict_perms(checked_path); buf_token = strtok_r(NULL, "/", &buf_saveptr); } }
// Move process |pid| into the freezer cgroup hierarchy of snap |snap_name|,
// creating the per-snap hierarchy ("snap.<name>") under the file-scope
// freezer_cgroup_dir if it does not exist yet. Dies on any failure.
void sc_cgroup_freezer_join(const char *snap_name, pid_t pid)
{
	// Format the name of the cgroup hierarchy.
	char buf[PATH_MAX] = { 0 };
	sc_must_snprintf(buf, sizeof buf, "snap.%s", snap_name);
	// Open the freezer cgroup directory.
	// O_PATH|O_DIRECTORY|O_NOFOLLOW guarantees we traverse a real
	// directory and never follow a planted symlink.
	int cgroup_fd SC_CLEANUP(sc_cleanup_close) = -1;
	cgroup_fd = open(freezer_cgroup_dir,
			 O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (cgroup_fd < 0) {
		die("cannot open freezer cgroup (%s)", freezer_cgroup_dir);
	}
	// Create the freezer hierarchy for the given snap. An existing
	// hierarchy (EEXIST) is fine — this runs on every snap start.
	if (mkdirat(cgroup_fd, buf, 0755) < 0 && errno != EEXIST) {
		die("cannot create freezer cgroup hierarchy for snap %s",
		    snap_name);
	}
	// Open the hierarchy directory for the given snap.
	int hierarchy_fd SC_CLEANUP(sc_cleanup_close) = -1;
	hierarchy_fd = openat(cgroup_fd, buf,
			      O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
	if (hierarchy_fd < 0) {
		die("cannot open freezer cgroup hierarchy for snap %s",
		    snap_name);
	}
	// Since we may be running from a setuid but not setgid executable, ensure
	// that the group and owner of the hierarchy directory is root.root.
	if (fchownat(hierarchy_fd, "", 0, 0, AT_EMPTY_PATH) < 0) {
		die("cannot change owner of freezer cgroup hierarchy for snap %s to root.root", snap_name);
	}
	// Open the tasks file.
	int tasks_fd SC_CLEANUP(sc_cleanup_close) = -1;
	tasks_fd = openat(hierarchy_fd, "tasks",
			  O_WRONLY | O_NOFOLLOW | O_CLOEXEC);
	if (tasks_fd < 0) {
		die("cannot open tasks file for freezer cgroup hierarchy for snap %s", snap_name);
	}
	// Write the process (task) number to the tasks file. Linux task IDs are
	// limited to 2^29 so a long int is enough to represent it.
	// See include/linux/threads.h in the kernel source tree for details.
	// NOTE: buf is deliberately reused here; the hierarchy name it held
	// is no longer needed.
	int n = sc_must_snprintf(buf, sizeof buf, "%ld", (long)pid);
	if (write(tasks_fd, buf, n) < n) {
		die("cannot move process %ld to freezer cgroup hierarchy for snap %s", (long)pid, snap_name);
	}
	debug("moved process %ld to freezer cgroup hierarchy for snap %s",
	      (long)pid, snap_name);
}
static dev_t find_base_snap_device(const char *base_snap_name, const char *base_snap_rev) { // Find the backing device of the base snap. // TODO: add support for "try mode" base snaps that also need // consideration of the mie->root component. dev_t base_snap_dev = 0; char base_squashfs_path[PATH_MAX]; sc_must_snprintf(base_squashfs_path, sizeof base_squashfs_path, "%s/%s/%s", SNAP_MOUNT_DIR, base_snap_name, base_snap_rev); struct sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; mi = sc_parse_mountinfo(NULL); if (mi == NULL) { die("cannot parse mountinfo of the current process"); } bool found = false; for (struct sc_mountinfo_entry * mie = sc_first_mountinfo_entry(mi); mie != NULL; mie = sc_next_mountinfo_entry(mie)) { if (sc_streq(mie->mount_dir, base_squashfs_path)) { base_snap_dev = MKDEV(mie->dev_major, mie->dev_minor); debug("found base snap filesystem device %d:%d", mie->dev_major, mie->dev_minor); // Don't break when found, we are interested in the last // entry as this is the "effective" one. found = true; } } if (!found) { die("cannot find device backing the base snap %s", base_snap_name); } return base_snap_dev; }
// Verify that sc_must_snprintf() dies when the formatted output does not
// fit. Formatting "12345" into a 5-byte buffer needs 6 bytes (including
// the terminating NUL), so the call must not return. The overflowing call
// runs in a glib test subprocess; the parent asserts that the subprocess
// failed and that the truncated string appeared in the stderr message.
static void test_sc_must_snprintf__fail()
{
	if (g_test_subprocess()) {
		char buf[5];
		sc_must_snprintf(buf, sizeof buf, "12345");
		// Only reached if sc_must_snprintf wrongly returned.
		g_test_message("expected sc_must_snprintf not to return");
		g_test_fail();
		return;
	}
	g_test_trap_subprocess(NULL, 0, 0);
	g_test_trap_assert_failed();
	g_test_trap_assert_stderr("cannot format string: 1234\n");
}
void sc_setup_quirks(void) { // because /var/lib/snapd is essential let's move it to /tmp/snapd for a sec char snapd_tmp[] = "/tmp/snapd.quirks_XXXXXX"; if (mkdtemp(snapd_tmp) == 0) { die("cannot create temporary directory for /var/lib/snapd mount point"); } debug("performing operation: mount --move %s %s", "/var/lib/snapd", snapd_tmp); if (mount("/var/lib/snapd", snapd_tmp, NULL, MS_MOVE, NULL) != 0) { die("cannot perform operation: mount --move %s %s", "/var/lib/snapd", snapd_tmp); } // now let's make /var/lib the vanilla /var/lib from the core snap char buf[PATH_MAX] = { 0 }; sc_must_snprintf(buf, sizeof buf, "%s/var/lib", sc_get_inner_core_mount_point()); sc_quirk_create_writable_mimic("/var/lib", buf, MS_RDONLY | MS_REC | MS_SLAVE | MS_NODEV | MS_NOSUID); // now let's move /var/lib/snapd (that was originally there) back debug("performing operation: umount %s", "/var/lib/snapd"); if (umount("/var/lib/snapd") != 0) { die("cannot perform operation: umount %s", "/var/lib/snapd"); } debug("performing operation: mount --move %s %s", snapd_tmp, "/var/lib/snapd"); if (mount(snapd_tmp, "/var/lib/snapd", NULL, MS_MOVE, NULL) != 0) { die("cannot perform operation: mount --move %s %s", snapd_tmp, "/var/lib/snapd"); } debug("performing operation: rmdir %s", snapd_tmp); if (rmdir(snapd_tmp) != 0) { die("cannot perform operation: rmdir %s", snapd_tmp); } // We are now ready to apply any quirks that relate to /var/lib sc_setup_lxd_quirk(); }
// Discard (unmount) the preserved mount namespace file of |group|.
//
// The preserved namespace is a bind-mounted file named
// "${group_name}${SC_NS_MNT_FILE}" inside the namespace group directory;
// unmounting it lets the namespace be released. A file that is absent
// (ENOENT) or not currently mounted (EINVAL) is silently ignored.
// The caller's working directory is saved and restored around the
// operation. Dies on any other failure.
void sc_discard_preserved_ns_group(struct sc_ns_group *group)
{
	// Remember the current working directory
	int old_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
	old_dir_fd = open(".", O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (old_dir_fd < 0) {
		die("cannot open current directory");
	}
	// Move to the mount namespace directory (/run/snapd/ns)
	if (fchdir(group->dir_fd) < 0) {
		die("cannot move to namespace group directory");
	}
	// Unmount ${group_name}.mnt which holds the preserved namespace
	char mnt_fname[PATH_MAX] = { 0 };
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name,
			 SC_NS_MNT_FILE);
	debug("unmounting preserved mount namespace file %s", mnt_fname);
	if (umount2(mnt_fname, UMOUNT_NOFOLLOW) < 0) {
		switch (errno) {
		case EINVAL:
			// EINVAL is returned when there's nothing to unmount (no bind-mount).
			// Instead of checking for this explicitly (which is always racy) we
			// just unmount and check the return code.
			break;
		case ENOENT:
			// We may be asked to discard a namespace that doesn't yet
			// exist (even the mount point may be absent). We just
			// ignore that error and return gracefully.
			break;
		default:
			die("cannot unmount preserved mount namespace file %s",
			    mnt_fname);
			break;
		}
	}
	// Get back to the original directory
	if (fchdir(old_dir_fd) < 0) {
		die("cannot move back to original directory");
	}
}
// Load the compiled seccomp BPF profile for |filter_profile| from
// filter_profile_dir and apply it to the current process. Returns 0 on
// success, including for the special "@unrestricted\n" profile which
// disables filtering entirely; dies on any error. May temporarily raise
// effective UID to root in order to install the filter.
int sc_apply_seccomp_bpf(const char *filter_profile)
{
	debug("loading bpf program for security tag %s", filter_profile);
	char profile_path[PATH_MAX] = { 0 };
	sc_must_snprintf(profile_path, sizeof(profile_path), "%s/%s.bin",
			 filter_profile_dir, filter_profile);
	// Wait some time for the security profile to show up. When
	// the system boots snapd will created security profiles, but
	// a service snap (e.g. network-manager) starts in parallel with
	// snapd so for such snaps, the profiles may not be generated
	// yet
	long max_wait = 120;
	const char *MAX_PROFILE_WAIT = getenv("SNAP_CONFINE_MAX_PROFILE_WAIT");
	if (MAX_PROFILE_WAIT != NULL) {
		// Parse the override with strtol, rejecting range errors,
		// trailing garbage and non-positive values.
		char *endptr = NULL;
		errno = 0;
		long env_max_wait = strtol(MAX_PROFILE_WAIT, &endptr, 10);
		if (errno != 0 || MAX_PROFILE_WAIT == endptr || *endptr != '\0'
		    || env_max_wait <= 0) {
			die("SNAP_CONFINE_MAX_PROFILE_WAIT invalid");
		}
		max_wait = env_max_wait > 0 ? env_max_wait : max_wait;
	}
	// Cap the wait at one hour regardless of the override.
	if (max_wait > 3600) {
		max_wait = 3600;
	}
	// Poll for the profile once per second, up to max_wait seconds.
	for (long i = 0; i < max_wait; ++i) {
		if (access(profile_path, F_OK) == 0) {
			break;
		}
		sleep(1);
	}
	// validate '/' down to profile_path are root-owned and not
	// 'other' writable to avoid possibility of privilege
	// escalation via bpf program load when paths are incorrectly
	// set on the system.
	validate_bpfpath_is_safe(profile_path);
	// load bpf
	char bpf[MAX_BPF_SIZE + 1] = { 0 };	// account for EOF
	FILE *fp = fopen(profile_path, "rb");
	if (fp == NULL) {
		die("cannot read %s", profile_path);
	}
	// set 'size' to 1 to get bytes transferred
	size_t num_read = fread(bpf, 1, sizeof(bpf), fp);
	if (ferror(fp) != 0) {
		die("cannot read seccomp profile %s", profile_path);
	} else if (feof(fp) == 0) {
		// The buffer filled without reaching end-of-file, so the
		// profile is larger than we permit.
		die("seccomp profile %s exceeds %zu bytes", profile_path,
		    sizeof(bpf));
	}
	fclose(fp);
	debug("read %zu bytes from %s", num_read, profile_path);
	// The sentinel profile disables seccomp confinement entirely.
	if (sc_streq(bpf, "@unrestricted\n")) {
		return 0;
	}
	uid_t real_uid, effective_uid, saved_uid;
	if (getresuid(&real_uid, &effective_uid, &saved_uid) < 0) {
		die("cannot call getresuid");
	}
	// If we can, raise privileges so that we can load the BPF into the
	// kernel via 'prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, ...)'.
	debug("raising privileges to load seccomp profile");
	if (effective_uid != 0 && saved_uid == 0) {
		if (seteuid(0) != 0) {
			die("seteuid failed");
		}
		if (geteuid() != 0) {
			die("raising privs before seccomp_load did not work");
		}
	}
	// Load filter into the kernel. Importantly we are
	// intentionally *not* setting NO_NEW_PRIVS because it
	// interferes with exec transitions in AppArmor with certain
	// snappy interfaces. Not setting NO_NEW_PRIVS does mean that
	// applications can adjust their sandbox if they have
	// CAP_SYS_ADMIN or, if running on < 4.8 kernels, break out of
	// the seccomp via ptrace. Both CAP_SYS_ADMIN and 'ptrace
	// (trace)' are blocked by AppArmor with typical snappy
	// interfaces.
	struct sock_fprog prog = {
		.len = num_read / sizeof(struct sock_filter),
		.filter = (struct sock_filter *)bpf,
	};
	// Prefer the seccomp(2) syscall (enables the LOG flag); fall back to
	// the older prctl(2) interface on kernels that lack it.
	if (seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, &prog)
	    != 0) {
		if (errno == ENOSYS) {
			debug("kernel doesn't support the seccomp(2) syscall");
		} else if (errno == EINVAL) {
			debug
			    ("kernel may not support the SECCOMP_FILTER_FLAG_LOG flag");
		}
		debug
		    ("falling back to prctl(2) syscall to load seccomp filter");
		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) {
			die("cannot apply seccomp profile");
		}
	}
	// drop privileges again
	debug("dropping privileges after loading seccomp profile");
	if (geteuid() == 0) {
		unsigned real_uid = getuid();
		if (seteuid(real_uid) != 0) {
			die("seteuid failed");
		}
		if (real_uid != 0 && geteuid() == 0) {
			die("dropping privs after seccomp_load did not work");
		}
	}
	return 0;
}
// Render mount |flags| into |buf| (of |buf_size| bytes) as a comma
// separated, mount(8)-style option string and return |buf|. The flags
// that pair with MS_REC (bind, private, slave, shared) are rendered with
// their recursive "r" spelling; any bits left unaccounted for at the end
// are appended as a hexadecimal number.
const char *sc_mount_opt2str(char *buf, size_t buf_size, unsigned long flags)
{
	// Bits consumed by a combined rendering (currently only MS_REC).
	unsigned long used = 0;
	sc_string_init(buf, buf_size);
	// Append "TEXT," when FLAG is set and clear the bit from |flags| so
	// the leftover-bits rendering at the end stays accurate.
#define F(FLAG, TEXT) do { \
	if (flags & (FLAG)) { \
		sc_string_append(buf, buf_size, #TEXT ","); flags ^= (FLAG); \
	} \
} while (0)
	F(MS_RDONLY, ro);
	F(MS_NOSUID, nosuid);
	F(MS_NODEV, nodev);
	F(MS_NOEXEC, noexec);
	F(MS_SYNCHRONOUS, sync);
	F(MS_REMOUNT, remount);
	F(MS_MANDLOCK, mand);
	F(MS_DIRSYNC, dirsync);
	F(MS_NOATIME, noatime);
	F(MS_NODIRATIME, nodiratime);
	if (flags & MS_BIND) {
		if (flags & MS_REC) {
			sc_string_append(buf, buf_size, "rbind,");
			used |= MS_REC;
		} else {
			sc_string_append(buf, buf_size, "bind,");
		}
		flags ^= MS_BIND;
	}
	F(MS_MOVE, move);
	// The MS_REC flag handled separately by affected flags (MS_BIND,
	// MS_PRIVATE, MS_SLAVE, MS_SHARED)
	// XXX: kernel has MS_VERBOSE, glibc has MS_SILENT, both use the same constant
	F(MS_SILENT, silent);
	F(MS_POSIXACL, acl);
	F(MS_UNBINDABLE, unbindable);
	if (flags & MS_PRIVATE) {
		if (flags & MS_REC) {
			sc_string_append(buf, buf_size, "rprivate,");
			used |= MS_REC;
		} else {
			sc_string_append(buf, buf_size, "private,");
		}
		flags ^= MS_PRIVATE;
	}
	if (flags & MS_SLAVE) {
		if (flags & MS_REC) {
			sc_string_append(buf, buf_size, "rslave,");
			used |= MS_REC;
		} else {
			sc_string_append(buf, buf_size, "slave,");
		}
		flags ^= MS_SLAVE;
	}
	if (flags & MS_SHARED) {
		if (flags & MS_REC) {
			sc_string_append(buf, buf_size, "rshared,");
			used |= MS_REC;
		} else {
			sc_string_append(buf, buf_size, "shared,");
		}
		flags ^= MS_SHARED;
	}
	flags ^= used;		// this is just for MS_REC
	F(MS_RELATIME, relatime);
	F(MS_KERNMOUNT, kernmount);
	F(MS_I_VERSION, iversion);
	F(MS_STRICTATIME, strictatime);
	// Provide fallback definitions for flags that older kernel headers
	// may not declare, so the renderer stays complete everywhere.
#ifndef MS_LAZYTIME
#define MS_LAZYTIME (1<<25)
#endif
	F(MS_LAZYTIME, lazytime);
#ifndef MS_NOSEC
#define MS_NOSEC (1 << 28)
#endif
	F(MS_NOSEC, nosec);
#ifndef MS_BORN
#define MS_BORN (1 << 29)
#endif
	F(MS_BORN, born);
	F(MS_ACTIVE, active);
	F(MS_NOUSER, nouser);
#undef F
	// Render any flags that are unaccounted for.
	if (flags) {
		char of[128] = { 0 };
		sc_must_snprintf(of, sizeof of, "%#lx", flags);
		sc_string_append(buf, buf_size, of);
	}
	// Chop the excess comma from the end.
	size_t len = strnlen(buf, buf_size);
	if (len > 0 && buf[len - 1] == ',') {
		buf[len - 1] = 0;
	}
	return buf;
}
// Happy path: a string that fits exactly (four characters plus the
// terminating NUL in a five byte buffer) is formatted successfully.
static void test_sc_must_snprintf()
{
	char formatted[5];
	sc_must_snprintf(formatted, sizeof formatted, "1234");
	g_assert_cmpstr(formatted, ==, "1234");
}
// Join the preserved mount namespace of |group| when a valid one exists,
// otherwise prepare a fresh one: fork a helper child that will capture the
// new namespace (by bind-mounting /proc/<parent>/ns/mnt once signalled via
// group->event_fd) and unshare the mount namespace in the parent, setting
// group->should_populate so the caller populates it. Returns EAGAIN when a
// stale preserved namespace was discarded, 0 otherwise; dies on error.
int sc_create_or_join_ns_group(struct sc_ns_group *group,
			       struct sc_apparmor *apparmor,
			       const char *base_snap_name,
			       const char *snap_name)
{
	// Open the mount namespace file.
	char mnt_fname[PATH_MAX] = { 0 };
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name,
			 SC_NS_MNT_FILE);
	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
	// NOTE: There is no O_EXCL here because the file can be around but
	// doesn't have to be a mounted namespace.
	//
	// If the mounted namespace is discarded with
	// sc_discard_preserved_ns_group() it will revert to a regular file. If
	// snap-confine is killed for whatever reason after the file is created but
	// before the file is bind-mounted it will also be a regular file.
	mnt_fd = openat(group->dir_fd, mnt_fname,
			O_CREAT | O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
	if (mnt_fd < 0) {
		die("cannot open mount namespace file for namespace group %s",
		    group->name);
	}
	// Check if we got an nsfs-based or procfs file or a regular file. This can
	// be reliably tested because nsfs has an unique filesystem type
	// NSFS_MAGIC. On older kernels that don't support nsfs yet we can look
	// for PROC_SUPER_MAGIC instead.
	// We can just ensure that this is the case thanks to fstatfs.
	struct statfs ns_statfs_buf;
	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
		die("cannot perform fstatfs() on the mount namespace file descriptor");
	}
	// Stat the mount namespace as well, this is later used to check if the
	// namespace is used by other processes if we are considering discarding a
	// stale namespace.
	struct stat ns_stat_buf;
	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
		die("cannot perform fstat() on the mount namespace file descriptor");
	}
#ifndef NSFS_MAGIC
	// Account for kernel headers old enough to not know about NSFS_MAGIC.
#define NSFS_MAGIC 0x6e736673
#endif
	if (ns_statfs_buf.f_type == NSFS_MAGIC
	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
		// Inspect and perhaps discard the preserved mount namespace.
		if (sc_inspect_and_maybe_discard_stale_ns
		    (mnt_fd, snap_name, base_snap_name) == EAGAIN) {
			return EAGAIN;
		}
		// Remember the vanilla working directory so that we may attempt to restore it later.
		char *vanilla_cwd SC_CLEANUP(sc_cleanup_string) = NULL;
		vanilla_cwd = get_current_dir_name();
		if (vanilla_cwd == NULL) {
			die("cannot get the current working directory");
		}
		// Move to the mount namespace of the snap we're trying to start.
		debug
		    ("attempting to re-associate the mount namespace with the namespace group %s",
		     group->name);
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot re-associate the mount namespace with namespace group %s", group->name);
		}
		debug
		    ("successfully re-associated the mount namespace with the namespace group %s",
		     group->name);
		// Try to re-locate back to vanilla working directory. This can fail
		// because that directory is no longer present.
		if (chdir(vanilla_cwd) != 0) {
			debug
			    ("cannot remain in %s, moving to the void directory",
			     vanilla_cwd);
			if (chdir(SC_VOID_DIR) != 0) {
				die("cannot change directory to %s",
				    SC_VOID_DIR);
			}
			debug("successfully moved to %s", SC_VOID_DIR);
		}
		return 0;
	}
	debug("initializing new namespace group %s", group->name);
	// Create a new namespace and ask the caller to populate it.
	// For rationale of forking see this:
	// https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
	//
	// The eventfd created here is used to synchronize the child and the parent
	// processes. It effectively tells the child to perform the capture
	// operation.
	group->event_fd = eventfd(0, EFD_CLOEXEC);
	if (group->event_fd < 0) {
		die("cannot create eventfd for mount namespace capture");
	}
	debug("forking support process for mount namespace capture");
	// Store the PID of the "parent" process. This done instead of calls to
	// getppid() because then we can reliably track the PID of the parent even
	// if the child process is re-parented.
	pid_t parent = getpid();
	// Glibc defines pid as a signed 32bit integer. There's no standard
	// way to print pid's portably so this is the best we can do.
	pid_t pid = fork();
	debug("forked support process has pid %d", (int)pid);
	if (pid < 0) {
		die("cannot fork support process for mount namespace capture");
	}
	if (pid == 0) {
		// This is the child process which will capture the mount namespace.
		//
		// It will do so by bind-mounting the SC_NS_MNT_FILE after the parent
		// process calls unshare() and finishes setting up the namespace
		// completely.
		// Change the hat to a sub-profile that has limited permissions
		// necessary to accomplish the capture of the mount namespace.
		debug
		    ("changing apparmor hat of the support process for mount namespace capture");
		sc_maybe_aa_change_hat(apparmor,
				       "mount-namespace-capture-helper", 0);
		// Configure the child to die as soon as the parent dies. In an odd
		// case where the parent is killed then we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the case then
		// we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake
		// us up from eventfd_read() below. In the rare case that the PID numbers
		// overflow and the now-dead parent PID is recycled we will still hang
		// forever on the read from eventfd below.
		debug("ensuring that parent process is still alive");
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has already terminated");
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}
		if (fchdir(group->dir_fd) < 0) {
			die("cannot move process for mount namespace capture to namespace group directory");
		}
		debug
		    ("waiting for a eventfd data from the parent process to continue");
		eventfd_t value = 0;
		sc_enable_sanity_timeout();
		if (eventfd_read(group->event_fd, &value) < 0) {
			die("cannot read expected data from eventfd");
		}
		sc_disable_sanity_timeout();
		debug
		    ("capturing mount namespace of process %d in namespace group %s",
		     (int)parent, group->name);
		// Bind-mount the parent's mount namespace reference onto the
		// group's .mnt file, preserving the namespace beyond the life
		// of its member processes.
		char src[PATH_MAX] = { 0 };
		char dst[PATH_MAX] = { 0 };
		sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt",
				 (int)parent);
		sc_must_snprintf(dst, sizeof dst, "%s%s", group->name,
				 SC_NS_MNT_FILE);
		if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
			die("cannot bind-mount the mount namespace file %s -> %s", src, dst);
		}
		debug
		    ("successfully captured mount namespace in namespace group %s",
		     group->name);
		exit(0);
	} else {
		group->child = pid;
		// Unshare the mount namespace and set a flag instructing the caller that
		// the namespace is pristine and needs to be populated now.
		debug("unsharing the mount namespace");
		if (unshare(CLONE_NEWNS) < 0) {
			die("cannot unshare the mount namespace");
		}
		group->should_populate = true;
	}
	return 0;
}
// The namespace may be stale. To check this we must actually switch into it
// but then we use up our setns call (the kernel misbehaves if we setns twice).
// To work around this we'll fork a child and use it to probe. The child will
// inspect the namespace and send information back via eventfd and then exit
// unconditionally.
//
// Returns 0 when the preserved namespace is kept (up-to-date or still in
// use) and EAGAIN when it was stale, unoccupied, and has been discarded.
// Dies on any error.
static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd,
						 const char *snap_name,
						 const char *base_snap_name)
{
	char base_snap_rev[PATH_MAX] = { 0 };
	char fname[PATH_MAX] = { 0 };
	char mnt_fname[PATH_MAX] = { 0 };
	dev_t base_snap_dev;
	int event_fd SC_CLEANUP(sc_cleanup_close) = -1;
	// Read the revision of the base snap by looking at the current symlink.
	sc_must_snprintf(fname, sizeof fname, "%s/%s/current", SNAP_MOUNT_DIR,
			 base_snap_name);
	// NOTE: readlink() does not NUL-terminate; base_snap_rev is
	// zero-initialized so the result is a valid string unless the link
	// target filled the entire buffer, which is detected just below.
	if (readlink(fname, base_snap_rev, sizeof base_snap_rev) < 0) {
		die("cannot read revision of base snap %s", fname);
	}
	if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') {
		die("cannot use symbolic link %s - value is too long", fname);
	}
	// Find the device that is backing the current revision of the base snap.
	base_snap_dev = find_base_snap_device(base_snap_name, base_snap_rev);
	// Check if we are running on classic. Do it here because we will always
	// (seemingly) run on a core system once we are inside a mount namespace.
	bool is_classic = is_running_on_classic_distribution();
	// Store the PID of this process. This is done instead of calls to
	// getppid() below because then we can reliably track the PID of the
	// parent even if the child process is re-parented.
	pid_t parent = getpid();
	// Create an eventfd for the communication with the child.
	event_fd = eventfd(0, EFD_CLOEXEC);
	if (event_fd < 0) {
		die("cannot create eventfd for communication with inspection process");
	}
	// Fork a child, it will do the inspection for us.
	pid_t child = fork();
	if (child < 0) {
		die("cannot fork support process for namespace inspection");
	}
	if (child == 0) {
		// This is the child process which will inspect the mount namespace.
		//
		// Configure the child to die as soon as the parent dies. In an odd
		// case where the parent is killed then we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the case then
		// we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake
		// us up from eventfd_read() below. In the rare case that the PID
		// numbers overflow and the now-dead parent PID is recycled we will
		// still hang forever on the read from eventfd below.
		debug("ensuring that parent process is still alive");
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has already terminated");
				abort();
			default:
				die("cannot ensure that parent process is still alive");
				break;
			}
		}
		debug("joining the namespace that we are about to probe");
		// Move to the mount namespace of the snap we're trying to inspect.
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot join the mount namespace in order to inspect it");
		}
		// Check if the namespace needs to be discarded.
		//
		// TODO: enable this for core distributions. This is complex because on
		// core the rootfs is mounted in initrd and is _not_ changed (no
		// pivot_root) and the base snap is again mounted (2nd time) by
		// systemd. This makes us end up in a situation where the outer base
		// snap will never match the rootfs inside the mount namespace.
		bool should_discard =
		    is_classic ? should_discard_current_ns(base_snap_dev) :
		    false;
		// Send this back to the parent: 2 - discard, 1 - keep.
		// Note that we cannot just use 0 and 1 because of the semantics of eventfd(2).
		debug
		    ("sending information about the state of the mount namespace (%s)",
		     should_discard ? "discard" : "keep");
		if (eventfd_write
		    (event_fd,
		     should_discard ? SC_DISCARD_YES : SC_DISCARD_NO) < 0) {
			die("cannot send information about the state of the mount namespace");
		}
		// Exit, we're done.
		debug
		    ("support process for mount namespace inspection is about to finish");
		exit(0);
	}
	// This is back in the parent process.
	//
	// Enable a sanity timeout in case the read blocks for unbound amount of
	// time. This will ensure we will not hang around while holding the lock.
	// Next, read the value written by the child process.
	sc_enable_sanity_timeout();
	eventfd_t value = 0;
	debug("receiving information about the state of the mount namespace");
	if (eventfd_read(event_fd, &value) < 0) {
		die("cannot receive information about the state of the mount namespace");
	}
	sc_disable_sanity_timeout();
	// Wait for the child process to exit and collect its exit status.
	errno = 0;
	int status = 0;
	if (waitpid(child, &status, 0) < 0) {
		die("cannot wait for the support process for mount namespace inspection");
	}
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		die("support process for mount namespace inspection exited abnormally");
	}
	// If the namespace is up-to-date then we are done.
	if (value == SC_DISCARD_NO) {
		debug("the mount namespace is up-to-date and can be reused");
		return 0;
	}
	// The namespace is stale, let's check if we can discard it.
	debug("the mount namespace is stale and should be discarded");
	if (sc_cgroup_freezer_occupied(snap_name)) {
		// Some processes are still using the namespace so we cannot discard it
		// as that would fracture the view that the set of processes inside
		// have on what is mounted.
		return 0;
	}
	// The namespace is both stale and empty. We can discard it now.
	debug("discarding stale and empty mount namespace");
	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s/%s%s", sc_ns_dir,
			 snap_name, SC_NS_MNT_FILE);
	// Use MNT_DETACH as otherwise we get EBUSY.
	if (umount2(mnt_fname, MNT_DETACH | UMOUNT_NOFOLLOW) < 0) {
		die("cannot umount stale mount namespace %s", mnt_fname);
	}
	debug("stale mount namespace discarded");
	return EAGAIN;
}
bool sc_cgroup_freezer_occupied(const char *snap_name) { // Format the name of the cgroup hierarchy. char buf[PATH_MAX] = { 0 }; sc_must_snprintf(buf, sizeof buf, "snap.%s", snap_name); // Open the freezer cgroup directory. int cgroup_fd SC_CLEANUP(sc_cleanup_close) = -1; cgroup_fd = open(freezer_cgroup_dir, O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC); if (cgroup_fd < 0) { die("cannot open freezer cgroup (%s)", freezer_cgroup_dir); } // Open the proc directory. int proc_fd SC_CLEANUP(sc_cleanup_close) = -1; proc_fd = open("/proc", O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC); if (proc_fd < 0) { die("cannot open /proc"); } // Open the hierarchy directory for the given snap. int hierarchy_fd SC_CLEANUP(sc_cleanup_close) = -1; hierarchy_fd = openat(cgroup_fd, buf, O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC); if (hierarchy_fd < 0) { if (errno == ENOENT) { return false; } die("cannot open freezer cgroup hierarchy for snap %s", snap_name); } // Open the "cgroup.procs" file. Alternatively we could open the "tasks" // file and see per-thread data but we don't need that. int cgroup_procs_fd SC_CLEANUP(sc_cleanup_close) = -1; cgroup_procs_fd = openat(hierarchy_fd, "cgroup.procs", O_RDONLY | O_NOFOLLOW | O_CLOEXEC); if (cgroup_procs_fd < 0) { die("cannot open cgroup.procs file for freezer cgroup hierarchy for snap %s", snap_name); } FILE *cgroup_procs SC_CLEANUP(sc_cleanup_file) = NULL; cgroup_procs = fdopen(cgroup_procs_fd, "r"); if (cgroup_procs == NULL) { die("cannot convert tasks file descriptor to FILE"); } cgroup_procs_fd = -1; // cgroup_procs_fd will now be closed by fclose. 
char *line_buf SC_CLEANUP(sc_cleanup_string) = NULL; size_t line_buf_size = 0; ssize_t num_read; struct stat statbuf; do { num_read = getline(&line_buf, &line_buf_size, cgroup_procs); if (num_read < 0 && errno != 0) { die("cannot read next PID belonging to snap %s", snap_name); } if (num_read <= 0) { break; } else { if (line_buf[num_read - 1] == '\n') { line_buf[num_read - 1] = '\0'; } else { die("could not find newline in cgroup.procs"); } } debug("found process id: %s\n", line_buf); if (fstatat(proc_fd, line_buf, &statbuf, AT_SYMLINK_NOFOLLOW) < 0) { // The process may have died already. if (errno != ENOENT) { die("cannot stat /proc/%s", line_buf); } } debug("found process %s belonging to user %d", line_buf, statbuf.st_uid); return true; } while (num_read > 0); return false; }